/src/opus/celt/x86/pitch_sse.c
Line | Count | Source |
1 | | /* Copyright (c) 2014, Cisco Systems, INC |
2 | | Written by XiangMingZhu WeiZhou MinPeng YanWang |
3 | | |
4 | | Redistribution and use in source and binary forms, with or without |
5 | | modification, are permitted provided that the following conditions |
6 | | are met: |
7 | | |
8 | | - Redistributions of source code must retain the above copyright |
9 | | notice, this list of conditions and the following disclaimer. |
10 | | |
11 | | - Redistributions in binary form must reproduce the above copyright |
12 | | notice, this list of conditions and the following disclaimer in the |
13 | | documentation and/or other materials provided with the distribution. |
14 | | |
15 | | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
16 | | ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
17 | | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
18 | | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER |
19 | | OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
20 | | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
21 | | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
22 | | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF |
23 | | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING |
24 | | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
25 | | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
26 | | */ |
27 | | |
28 | | #ifdef HAVE_CONFIG_H |
29 | | #include "config.h" |
30 | | #endif |
31 | | |
32 | | #include "macros.h" |
33 | | #include "celt_lpc.h" |
34 | | #include "stack_alloc.h" |
35 | | #include "mathops.h" |
36 | | #include "pitch.h" |
37 | | |
38 | | #if defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT) |
39 | | |
40 | | #include <xmmintrin.h> |
41 | | #include "arch.h" |
42 | | |
43 | | void xcorr_kernel_sse(const opus_val16 *x, const opus_val16 *y, opus_val32 sum[4], int len) |
44 | 0 | { |
45 | 0 | int j; |
46 | 0 | __m128 xsum1, xsum2; |
47 | 0 | xsum1 = _mm_loadu_ps(sum); |
48 | 0 | xsum2 = _mm_setzero_ps(); |
49 | |
50 | 0 | for (j = 0; j < len-3; j += 4) |
51 | 0 | { |
52 | 0 | __m128 x0 = _mm_loadu_ps(x+j); |
53 | 0 | __m128 yj = _mm_loadu_ps(y+j); |
54 | 0 | __m128 y3 = _mm_loadu_ps(y+j+3); |
55 | |
56 | 0 | xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0x00),yj)); |
57 | 0 | xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0x55), |
58 | 0 | _mm_shuffle_ps(yj,y3,0x49))); |
59 | 0 | xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0xaa), |
60 | 0 | _mm_shuffle_ps(yj,y3,0x9e))); |
61 | 0 | xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0xff),y3)); |
62 | 0 | } |
63 | 0 | if (j < len) |
64 | 0 | { |
65 | 0 | xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j))); |
66 | 0 | if (++j < len) |
67 | 0 | { |
68 | 0 | xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j))); |
69 | 0 | if (++j < len) |
70 | 0 | { |
71 | 0 | xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j))); |
72 | 0 | } |
73 | 0 | } |
74 | 0 | } |
75 | 0 | _mm_storeu_ps(sum,_mm_add_ps(xsum1,xsum2)); |
76 | 0 | } |
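For reference, the kernel above computes four correlations of x against y at lags 0..3 in a single pass, accumulating into the preloaded sum[]. A plain-C sketch of the same computation (hypothetical name xcorr_kernel_ref, shown only to make the shuffle pattern easier to follow; y needs len+3 readable samples, as in the SSE code):

static void xcorr_kernel_ref(const float *x, const float *y, float sum[4], int len)
{
   int j, k;
   /* sum[] is accumulated into, matching _mm_loadu_ps(sum) above. */
   for (j = 0; j < len; j++)
      for (k = 0; k < 4; k++)
         sum[k] += x[j] * y[j + k];
}

The SSE version produces the same totals but adds them in a different order (two interleaved accumulators xsum1/xsum2), so the last bits of the float result can differ.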
77 | | |
78 | | |
79 | | void dual_inner_prod_sse(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02, |
80 | | int N, opus_val32 *xy1, opus_val32 *xy2) |
81 | 0 | { |
82 | 0 | int i; |
83 | 0 | __m128 xsum1, xsum2; |
84 | 0 | xsum1 = _mm_setzero_ps(); |
85 | 0 | xsum2 = _mm_setzero_ps(); |
86 | 0 | for (i=0;i<N-3;i+=4) |
87 | 0 | { |
88 | 0 | __m128 xi = _mm_loadu_ps(x+i); |
89 | 0 | __m128 y1i = _mm_loadu_ps(y01+i); |
90 | 0 | __m128 y2i = _mm_loadu_ps(y02+i); |
91 | 0 | xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(xi, y1i)); |
92 | 0 | xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(xi, y2i)); |
93 | 0 | } |
94 | | /* Horizontal sum */ |
95 | 0 | xsum1 = _mm_add_ps(xsum1, _mm_movehl_ps(xsum1, xsum1)); |
96 | 0 | xsum1 = _mm_add_ss(xsum1, _mm_shuffle_ps(xsum1, xsum1, 0x55)); |
97 | 0 | _mm_store_ss(xy1, xsum1); |
98 | 0 | xsum2 = _mm_add_ps(xsum2, _mm_movehl_ps(xsum2, xsum2)); |
99 | 0 | xsum2 = _mm_add_ss(xsum2, _mm_shuffle_ps(xsum2, xsum2, 0x55)); |
100 | 0 | _mm_store_ss(xy2, xsum2); |
101 | 0 | for (;i<N;i++) |
102 | 0 | { |
103 | 0 | *xy1 = MAC16_16(*xy1, x[i], y01[i]); |
104 | 0 | *xy2 = MAC16_16(*xy2, x[i], y02[i]); |
105 | 0 | } |
106 | 0 | } |
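The routine above is two dot products sharing the same x. The horizontal-sum step folds the four lanes {s0,s1,s2,s3} of each accumulator into (s0+s2)+(s1+s3) before the scalar tail finishes the last 0..3 elements. A minimal scalar sketch of the result (hypothetical dual_inner_prod_ref; the summation order, and therefore the last float bits, may differ from the SSE path):

static void dual_inner_prod_ref(const float *x, const float *y01, const float *y02,
                                int N, float *xy1, float *xy2)
{
   int i;
   float s1 = 0.f, s2 = 0.f;
   for (i = 0; i < N; i++)
   {
      s1 += x[i] * y01[i];
      s2 += x[i] * y02[i];
   }
   *xy1 = s1;
   *xy2 = s2;
}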
107 | | |
108 | | opus_val32 celt_inner_prod_sse(const opus_val16 *x, const opus_val16 *y, |
109 | | int N) |
110 | 0 | { |
111 | 0 | int i; |
112 | 0 | float xy; |
113 | 0 | __m128 sum; |
114 | 0 | sum = _mm_setzero_ps(); |
115 | | /* FIXME: We should probably go 8-way and use 2 sums. */ |
116 | 0 | for (i=0;i<N-3;i+=4) |
117 | 0 | { |
118 | 0 | __m128 xi = _mm_loadu_ps(x+i); |
119 | 0 | __m128 yi = _mm_loadu_ps(y+i); |
120 | 0 | sum = _mm_add_ps(sum,_mm_mul_ps(xi, yi)); |
121 | 0 | } |
122 | | /* Horizontal sum */ |
123 | 0 | sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum)); |
124 | 0 | sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, 0x55)); |
125 | 0 | _mm_store_ss(&xy, sum); |
126 | 0 | for (;i<N;i++) |
127 | 0 | { |
128 | 0 | xy = MAC16_16(xy, x[i], y[i]); |
129 | 0 | } |
130 | 0 | return xy; |
131 | 0 | } |
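The FIXME above hints at an 8-way version with two partial sums. A hedged sketch of that idea (an assumption about what the comment intends, not upstream code; same horizontal-sum reduction, scalar tail for the last 0..7 samples; opus_val16/opus_val32 are float in this !FIXED_POINT build):

static opus_val32 celt_inner_prod_sse_8way_sketch(const opus_val16 *x, const opus_val16 *y, int N)
{
   int i;
   float xy;
   __m128 sum0 = _mm_setzero_ps();
   __m128 sum1 = _mm_setzero_ps();
   /* Two independent accumulators per iteration hide the add latency. */
   for (i = 0; i < N - 7; i += 8)
   {
      sum0 = _mm_add_ps(sum0, _mm_mul_ps(_mm_loadu_ps(x + i),     _mm_loadu_ps(y + i)));
      sum1 = _mm_add_ps(sum1, _mm_mul_ps(_mm_loadu_ps(x + i + 4), _mm_loadu_ps(y + i + 4)));
   }
   sum0 = _mm_add_ps(sum0, sum1);
   /* Same horizontal sum as in celt_inner_prod_sse above. */
   sum0 = _mm_add_ps(sum0, _mm_movehl_ps(sum0, sum0));
   sum0 = _mm_add_ss(sum0, _mm_shuffle_ps(sum0, sum0, 0x55));
   _mm_store_ss(&xy, sum0);
   for (; i < N; i++)
      xy = MAC16_16(xy, x[i], y[i]);
   return xy;
}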
132 | | |
133 | | void comb_filter_const_sse(opus_val32 *y, opus_val32 *x, int T, int N, |
134 | | opus_val16 g10, opus_val16 g11, opus_val16 g12) |
135 | 0 | { |
136 | 0 | int i; |
137 | 0 | __m128 x0v; |
138 | 0 | __m128 g10v, g11v, g12v; |
139 | 0 | g10v = _mm_load1_ps(&g10); |
140 | 0 | g11v = _mm_load1_ps(&g11); |
141 | 0 | g12v = _mm_load1_ps(&g12); |
142 | 0 | x0v = _mm_loadu_ps(&x[-T-2]); |
143 | 0 | for (i=0;i<N-3;i+=4) |
144 | 0 | { |
145 | 0 | __m128 yi, yi2, x1v, x2v, x3v, x4v; |
146 | 0 | const opus_val32 *xp = &x[i-T-2]; |
147 | 0 | yi = _mm_loadu_ps(x+i); |
148 | 0 | x4v = _mm_loadu_ps(xp+4); |
149 | | #if 0 |
150 | | /* Slower version with all loads */ |
151 | | x1v = _mm_loadu_ps(xp+1); |
152 | | x2v = _mm_loadu_ps(xp+2); |
153 | | x3v = _mm_loadu_ps(xp+3); |
154 | | #else |
155 | 0 | x2v = _mm_shuffle_ps(x0v, x4v, 0x4e); |
156 | 0 | x1v = _mm_shuffle_ps(x0v, x2v, 0x99); |
157 | 0 | x3v = _mm_shuffle_ps(x2v, x4v, 0x99); |
158 | 0 | #endif |
159 | |
160 | 0 | yi = _mm_add_ps(yi, _mm_mul_ps(g10v,x2v)); |
161 | | #if 0 /* Set to 1 to make it bit-exact with the non-SSE version */ |
162 | | yi = _mm_add_ps(yi, _mm_mul_ps(g11v,_mm_add_ps(x3v,x1v))); |
163 | | yi = _mm_add_ps(yi, _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v))); |
164 | | #else |
165 | | /* Use partial sums */ |
166 | 0 | yi2 = _mm_add_ps(_mm_mul_ps(g11v,_mm_add_ps(x3v,x1v)), |
167 | 0 | _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v))); |
168 | 0 | yi = _mm_add_ps(yi, yi2); |
169 | 0 | #endif |
170 | 0 | x0v=x4v; |
171 | 0 | _mm_storeu_ps(y+i, yi); |
172 | 0 | } |
173 | | #ifdef CUSTOM_MODES |
174 | | for (;i<N;i++) |
175 | | { |
176 | | y[i] = x[i] |
177 | | + MULT16_32_Q15(g10,x[i-T]) |
178 | | + MULT16_32_Q15(g11,ADD32(x[i-T+1],x[i-T-1])) |
179 | | + MULT16_32_Q15(g12,ADD32(x[i-T+2],x[i-T-2])); |
180 | | } |
181 | | #endif |
182 | 0 | } |
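In the vector loop above, only x4v is a fresh load per iteration; x1v, x2v and x3v are rebuilt from x0v and x4v with shuffles, e.g. _mm_shuffle_ps(x0v, x4v, 0x4e) yields {xp[2], xp[3], xp[4], xp[5]}, the same data as _mm_loadu_ps(xp+2). For the float build (where MULT16_32_Q15 reduces to a plain multiply), each output sample follows the recurrence in the CUSTOM_MODES tail loop; a scalar sketch (hypothetical comb_filter_const_ref; x must carry T+2 samples of history before x[0]):

static void comb_filter_const_ref(float *y, const float *x, int T, int N,
                                  float g10, float g11, float g12)
{
   int i;
   for (i = 0; i < N; i++)
   {
      y[i] = x[i]
           + g10 * x[i - T]
           + g11 * (x[i - T + 1] + x[i - T - 1])
           + g12 * (x[i - T + 2] + x[i - T - 2]);
   }
}

The SSE loop matches this up to rounding: the partial-sums path re-associates the additions, which is why the #if 0 branch is noted as the bit-exact variant.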
183 | | |
184 | | |
185 | | #endif |