/src/speex/libspeex/ltp_sse.h
Line | Count | Source |
1 | | /* Copyright (C) 2002 Jean-Marc Valin */ |
2 | | /** |
3 | | @file ltp_sse.h |
4 | | @brief Long-Term Prediction functions (SSE version) |
5 | | */ |
6 | | /* |
7 | | Redistribution and use in source and binary forms, with or without |
8 | | modification, are permitted provided that the following conditions |
9 | | are met: |
10 | | |
11 | | - Redistributions of source code must retain the above copyright |
12 | | notice, this list of conditions and the following disclaimer. |
13 | | |
14 | | - Redistributions in binary form must reproduce the above copyright |
15 | | notice, this list of conditions and the following disclaimer in the |
16 | | documentation and/or other materials provided with the distribution. |
17 | | |
18 | | - Neither the name of the Xiph.org Foundation nor the names of its |
19 | | contributors may be used to endorse or promote products derived from |
20 | | this software without specific prior written permission. |
21 | | |
22 | | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
23 | | ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
24 | | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
25 | | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR |
26 | | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
27 | | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
28 | | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
29 | | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF |
30 | | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING |
31 | | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
32 | | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
33 | | */ |
34 | | |
35 | | #include <xmmintrin.h> |
36 | | |
37 | | #define OVERRIDE_INNER_PROD |
/**
 * Inner (dot) product of two float vectors, computed with SSE.
 *
 * Elements are consumed in groups of four and the number of groups is
 * len>>2, so only the first 4*(len>>2) elements contribute to the result.
 * Unaligned loads are used, so neither input needs to be 16-byte aligned.
 *
 * @param a   First input vector.
 * @param b   Second input vector.
 * @param len Number of elements in each vector.
 * @return Sum of a[k]*b[k] over the first 4*(len>>2) elements
 *         (0 when there is no complete group of four).
 */
float inner_prod(const float *a, const float *b, int len)
{
   int i;
   int groups = len>>2;   /* number of complete 4-float groups */
   float ret;
   __m128 sum = _mm_setzero_ps();

   /* Main loop: two groups (8 floats) per iteration.  The bound is i+1 so
      that an iteration only starts when BOTH groups are inside the input;
      bounding on i alone would read 4 floats past the end whenever the
      number of groups is odd. */
   for (i=0;i+1<groups;i+=2)
   {
      sum = _mm_add_ps(sum, _mm_mul_ps(_mm_loadu_ps(a+0), _mm_loadu_ps(b+0)));
      sum = _mm_add_ps(sum, _mm_mul_ps(_mm_loadu_ps(a+4), _mm_loadu_ps(b+4)));
      a += 8;
      b += 8;
   }

   /* One group left over when the number of groups is odd. */
   if (i<groups)
      sum = _mm_add_ps(sum, _mm_mul_ps(_mm_loadu_ps(a), _mm_loadu_ps(b)));

   /* Horizontal add of the four lanes: fold the upper pair onto the lower
      pair, then add lane 1 into lane 0. */
   sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum));
   sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, 0x55));
   _mm_store_ss(&ret, sum);
   return ret;
}
55 | | |
56 | | #define OVERRIDE_PITCH_XCORR |
57 | | void pitch_xcorr(const float *_x, const float *_y, float *corr, int len, int nb_pitch, char *stack) |
58 | 16.2k | { |
59 | 16.2k | int i, offset; |
60 | 16.2k | VARDECL(__m128 *x); |
61 | 16.2k | VARDECL(__m128 *y); |
62 | 16.2k | int N, L; |
63 | 16.2k | N = len>>2; |
64 | 16.2k | L = nb_pitch>>2; |
65 | 16.2k | ALLOC(x, N, __m128); |
66 | 16.2k | ALLOC(y, N+L, __m128); |
67 | 418k | for (i=0;i<N;i++) |
68 | 402k | x[i] = _mm_loadu_ps(_x+(i<<2)); |
69 | 81.3k | for (offset=0;offset<4;offset++) |
70 | 65.0k | { |
71 | 3.64M | for (i=0;i<N+L;i++) |
72 | 3.57M | y[i] = _mm_loadu_ps(_y+(i<<2)+offset); |
73 | 2.03M | for (i=0;i<L;i++) |
74 | 1.97M | { |
75 | 1.97M | int j; |
76 | 1.97M | __m128 sum, *xx, *yy; |
77 | 1.97M | sum = _mm_setzero_ps(); |
78 | 1.97M | yy = y+i; |
79 | 1.97M | xx = x; |
80 | 27.1M | for (j=0;j<N;j+=2) |
81 | 25.1M | { |
82 | 25.1M | sum = _mm_add_ps(sum, _mm_mul_ps(xx[0], yy[0])); |
83 | 25.1M | sum = _mm_add_ps(sum, _mm_mul_ps(xx[1], yy[1])); |
84 | 25.1M | xx += 2; |
85 | 25.1M | yy += 2; |
86 | 25.1M | } |
87 | 1.97M | sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum)); |
88 | 1.97M | sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, 0x55)); |
89 | 1.97M | _mm_store_ss(corr+nb_pitch-1-(i<<2)-offset, sum); |
90 | 1.97M | } |
91 | 65.0k | } |
92 | 16.2k | } |