/src/opus/celt/x86/vq_sse2.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* Copyright (c) 2007-2008 CSIRO |
2 | | Copyright (c) 2007-2009 Xiph.Org Foundation |
3 | | Copyright (c) 2007-2016 Jean-Marc Valin */ |
4 | | /* |
5 | | Redistribution and use in source and binary forms, with or without |
6 | | modification, are permitted provided that the following conditions |
7 | | are met: |
8 | | |
9 | | - Redistributions of source code must retain the above copyright |
10 | | notice, this list of conditions and the following disclaimer. |
11 | | |
12 | | - Redistributions in binary form must reproduce the above copyright |
13 | | notice, this list of conditions and the following disclaimer in the |
14 | | documentation and/or other materials provided with the distribution. |
15 | | |
16 | | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
17 | | ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
18 | | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
19 | | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER |
20 | | OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
21 | | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
22 | | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
23 | | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF |
24 | | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING |
25 | | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
26 | | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
27 | | */ |
28 | | |
29 | | #ifdef HAVE_CONFIG_H |
30 | | #include "config.h" |
31 | | #endif |
32 | | |
33 | | #include <xmmintrin.h> |
34 | | #include <emmintrin.h> |
35 | | #include "celt_lpc.h" |
36 | | #include "stack_alloc.h" |
37 | | #include "mathops.h" |
38 | | #include "vq.h" |
39 | | #include "x86cpu.h" |
40 | | |
41 | | |
42 | | #ifndef FIXED_POINT |
43 | | |
44 | | opus_val16 op_pvq_search_sse2(celt_norm *_X, int *iy, int K, int N, int arch) |
45 | 0 | { |
46 | 0 | int i, j; |
47 | 0 | int pulsesLeft; |
48 | 0 | float xy, yy; |
49 | 0 | VARDECL(celt_norm, y); |
50 | 0 | VARDECL(celt_norm, X); |
51 | 0 | VARDECL(float, signy); |
52 | 0 | __m128 signmask; |
53 | 0 | __m128 sums; |
54 | 0 | __m128i fours; |
55 | 0 | SAVE_STACK; |
56 | |
|
57 | 0 | (void)arch; |
58 | | /* All bits set to zero, except for the sign bit. */ |
59 | 0 | signmask = _mm_set_ps1(-0.f); |
60 | 0 | fours = _mm_set_epi32(4, 4, 4, 4); |
61 | 0 | ALLOC(y, N+3, celt_norm); |
62 | 0 | ALLOC(X, N+3, celt_norm); |
63 | 0 | ALLOC(signy, N+3, float); |
64 | |
|
65 | 0 | OPUS_COPY(X, _X, N); |
66 | 0 | X[N] = X[N+1] = X[N+2] = 0; |
67 | 0 | sums = _mm_setzero_ps(); |
68 | 0 | for (j=0;j<N;j+=4) |
69 | 0 | { |
70 | 0 | __m128 x4, s4; |
71 | 0 | x4 = _mm_loadu_ps(&X[j]); |
72 | 0 | s4 = _mm_cmplt_ps(x4, _mm_setzero_ps()); |
73 | | /* Get rid of the sign */ |
74 | 0 | x4 = _mm_andnot_ps(signmask, x4); |
75 | 0 | sums = _mm_add_ps(sums, x4); |
76 | | /* Clear y and iy in case we don't do the projection. */ |
77 | 0 | _mm_storeu_ps(&y[j], _mm_setzero_ps()); |
78 | 0 | _mm_storeu_si128((__m128i*)(void*)&iy[j], _mm_setzero_si128()); |
79 | 0 | _mm_storeu_ps(&X[j], x4); |
80 | 0 | _mm_storeu_ps(&signy[j], s4); |
81 | 0 | } |
82 | 0 | sums = _mm_add_ps(sums, _mm_shuffle_ps(sums, sums, _MM_SHUFFLE(1, 0, 3, 2))); |
83 | 0 | sums = _mm_add_ps(sums, _mm_shuffle_ps(sums, sums, _MM_SHUFFLE(2, 3, 0, 1))); |
84 | |
|
85 | 0 | xy = yy = 0; |
86 | |
|
87 | 0 | pulsesLeft = K; |
88 | | |
89 | | /* Do a pre-search by projecting on the pyramid */ |
90 | 0 | if (K > (N>>1)) |
91 | 0 | { |
92 | 0 | __m128i pulses_sum; |
93 | 0 | __m128 yy4, xy4; |
94 | 0 | __m128 rcp4; |
95 | 0 | opus_val32 sum = _mm_cvtss_f32(sums); |
96 | | /* If X is too small, just replace it with a pulse at 0 */ |
97 | | /* Prevents infinities and NaNs from causing too many pulses |
98 | | to be allocated. 64 is an approximation of infinity here. */ |
99 | 0 | if (!(sum > EPSILON && sum < 64)) |
100 | 0 | { |
101 | 0 | X[0] = QCONST16(1.f,14); |
102 | 0 | j=1; do |
103 | 0 | X[j]=0; |
104 | 0 | while (++j<N); |
105 | 0 | sums = _mm_set_ps1(1.f); |
106 | 0 | } |
107 | | /* Using K+e with e < 1 guarantees we cannot get more than K pulses. */ |
108 | 0 | rcp4 = _mm_mul_ps(_mm_set_ps1((float)(K+.8)), _mm_rcp_ps(sums)); |
109 | 0 | xy4 = yy4 = _mm_setzero_ps(); |
110 | 0 | pulses_sum = _mm_setzero_si128(); |
111 | 0 | for (j=0;j<N;j+=4) |
112 | 0 | { |
113 | 0 | __m128 rx4, x4, y4; |
114 | 0 | __m128i iy4; |
115 | 0 | x4 = _mm_loadu_ps(&X[j]); |
116 | 0 | rx4 = _mm_mul_ps(x4, rcp4); |
117 | 0 | iy4 = _mm_cvttps_epi32(rx4); |
118 | 0 | pulses_sum = _mm_add_epi32(pulses_sum, iy4); |
119 | 0 | _mm_storeu_si128((__m128i*)(void*)&iy[j], iy4); |
120 | 0 | y4 = _mm_cvtepi32_ps(iy4); |
121 | 0 | xy4 = _mm_add_ps(xy4, _mm_mul_ps(x4, y4)); |
122 | 0 | yy4 = _mm_add_ps(yy4, _mm_mul_ps(y4, y4)); |
123 | | /* double the y[] vector so we don't have to do it in the search loop. */ |
124 | 0 | _mm_storeu_ps(&y[j], _mm_add_ps(y4, y4)); |
125 | 0 | } |
126 | 0 | pulses_sum = _mm_add_epi32(pulses_sum, _mm_shuffle_epi32(pulses_sum, _MM_SHUFFLE(1, 0, 3, 2))); |
127 | 0 | pulses_sum = _mm_add_epi32(pulses_sum, _mm_shuffle_epi32(pulses_sum, _MM_SHUFFLE(2, 3, 0, 1))); |
128 | 0 | pulsesLeft -= _mm_cvtsi128_si32(pulses_sum); |
129 | 0 | xy4 = _mm_add_ps(xy4, _mm_shuffle_ps(xy4, xy4, _MM_SHUFFLE(1, 0, 3, 2))); |
130 | 0 | xy4 = _mm_add_ps(xy4, _mm_shuffle_ps(xy4, xy4, _MM_SHUFFLE(2, 3, 0, 1))); |
131 | 0 | xy = _mm_cvtss_f32(xy4); |
132 | 0 | yy4 = _mm_add_ps(yy4, _mm_shuffle_ps(yy4, yy4, _MM_SHUFFLE(1, 0, 3, 2))); |
133 | 0 | yy4 = _mm_add_ps(yy4, _mm_shuffle_ps(yy4, yy4, _MM_SHUFFLE(2, 3, 0, 1))); |
134 | 0 | yy = _mm_cvtss_f32(yy4); |
135 | 0 | } |
136 | 0 | X[N] = X[N+1] = X[N+2] = -100; |
137 | 0 | y[N] = y[N+1] = y[N+2] = 100; |
138 | 0 | celt_sig_assert(pulsesLeft>=0); |
139 | | |
140 | | /* This should never happen, but just in case it does (e.g. on silence) |
141 | | we fill the first bin with pulses. */ |
142 | 0 | if (pulsesLeft > N+3) |
143 | 0 | { |
144 | 0 | opus_val16 tmp = (opus_val16)pulsesLeft; |
145 | 0 | yy = MAC16_16(yy, tmp, tmp); |
146 | 0 | yy = MAC16_16(yy, tmp, y[0]); |
147 | 0 | iy[0] += pulsesLeft; |
148 | 0 | pulsesLeft=0; |
149 | 0 | } |
150 | |
|
151 | 0 | for (i=0;i<pulsesLeft;i++) |
152 | 0 | { |
153 | 0 | int best_id; |
154 | 0 | __m128 xy4, yy4; |
155 | 0 | __m128 max, max2; |
156 | 0 | __m128i count; |
157 | 0 | __m128i pos; |
158 | | /* The squared magnitude term gets added anyway, so we might as well |
159 | | add it outside the loop */ |
160 | 0 | yy = ADD16(yy, 1); |
161 | 0 | xy4 = _mm_load1_ps(&xy); |
162 | 0 | yy4 = _mm_load1_ps(&yy); |
163 | 0 | max = _mm_setzero_ps(); |
164 | 0 | pos = _mm_setzero_si128(); |
165 | 0 | count = _mm_set_epi32(3, 2, 1, 0); |
166 | 0 | for (j=0;j<N;j+=4) |
167 | 0 | { |
168 | 0 | __m128 x4, y4, r4; |
169 | 0 | x4 = _mm_loadu_ps(&X[j]); |
170 | 0 | y4 = _mm_loadu_ps(&y[j]); |
171 | 0 | x4 = _mm_add_ps(x4, xy4); |
172 | 0 | y4 = _mm_add_ps(y4, yy4); |
173 | 0 | y4 = _mm_rsqrt_ps(y4); |
174 | 0 | r4 = _mm_mul_ps(x4, y4); |
175 | | /* Update the index of the max. */ |
176 | 0 | pos = _mm_max_epi16(pos, _mm_and_si128(count, _mm_castps_si128(_mm_cmpgt_ps(r4, max)))); |
177 | | /* Update the max. */ |
178 | 0 | max = _mm_max_ps(max, r4); |
179 | | /* Update the indices (+4) */ |
180 | 0 | count = _mm_add_epi32(count, fours); |
181 | 0 | } |
182 | | /* Horizontal max */ |
183 | 0 | max2 = _mm_max_ps(max, _mm_shuffle_ps(max, max, _MM_SHUFFLE(1, 0, 3, 2))); |
184 | 0 | max2 = _mm_max_ps(max2, _mm_shuffle_ps(max2, max2, _MM_SHUFFLE(2, 3, 0, 1))); |
185 | | /* Now that max2 contains the max at all positions, look at which value(s) of the |
186 | | partial max is equal to the global max. */ |
187 | 0 | pos = _mm_and_si128(pos, _mm_castps_si128(_mm_cmpeq_ps(max, max2))); |
188 | 0 | pos = _mm_max_epi16(pos, _mm_unpackhi_epi64(pos, pos)); |
189 | 0 | pos = _mm_max_epi16(pos, _mm_shufflelo_epi16(pos, _MM_SHUFFLE(1, 0, 3, 2))); |
190 | 0 | best_id = _mm_cvtsi128_si32(pos); |
191 | | |
192 | | /* Updating the sums of the new pulse(s) */ |
193 | 0 | xy = ADD32(xy, EXTEND32(X[best_id])); |
194 | | /* We're multiplying y[j] by two so we don't have to do it here */ |
195 | 0 | yy = ADD16(yy, y[best_id]); |
196 | | |
197 | | /* Only now that we've made the final choice, update y/iy */ |
198 | | /* Multiplying y[j] by 2 so we don't have to do it everywhere else */ |
199 | 0 | y[best_id] += 2; |
200 | 0 | iy[best_id]++; |
201 | 0 | } |
202 | | |
203 | | /* Put the original sign back */ |
204 | 0 | for (j=0;j<N;j+=4) |
205 | 0 | { |
206 | 0 | __m128i y4; |
207 | 0 | __m128i s4; |
208 | 0 | y4 = _mm_loadu_si128((__m128i*)(void*)&iy[j]); |
209 | 0 | s4 = _mm_castps_si128(_mm_loadu_ps(&signy[j])); |
210 | 0 | y4 = _mm_xor_si128(_mm_add_epi32(y4, s4), s4); |
211 | 0 | _mm_storeu_si128((__m128i*)(void*)&iy[j], y4); |
212 | 0 | } |
213 | 0 | RESTORE_STACK; |
214 | 0 | return yy; |
215 | 0 | } |
216 | | |
217 | | #endif |