/src/speex/libspeex/vq_sse.h
Line | Count | Source |
1 | | /* Copyright (C) 2004 Jean-Marc Valin */ |
2 | | /** |
3 | | @file vq_sse.h |
4 | | @brief SSE-optimized vq routine |
5 | | */ |
6 | | /* |
7 | | Redistribution and use in source and binary forms, with or without |
8 | | modification, are permitted provided that the following conditions |
9 | | are met: |
10 | | |
11 | | - Redistributions of source code must retain the above copyright |
12 | | notice, this list of conditions and the following disclaimer. |
13 | | |
14 | | - Redistributions in binary form must reproduce the above copyright |
15 | | notice, this list of conditions and the following disclaimer in the |
16 | | documentation and/or other materials provided with the distribution. |
17 | | |
18 | | - Neither the name of the Xiph.org Foundation nor the names of its |
19 | | contributors may be used to endorse or promote products derived from |
20 | | this software without specific prior written permission. |
21 | | |
22 | | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
23 | | ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
24 | | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
25 | | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR |
26 | | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
27 | | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
28 | | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
29 | | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF |
30 | | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING |
31 | | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
32 | | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
33 | | */ |
34 | | |
35 | | #define OVERRIDE_VQ_NBEST |
36 | | void vq_nbest(spx_word16_t *_in, const __m128 *codebook, int len, int entries, __m128 *E, int N, int *nbest, spx_word32_t *best_dist, char *stack) |
37 | 255k | { |
38 | 255k | int i,j,k,used; |
39 | 255k | VARDECL(float *dist); |
40 | 255k | VARDECL(__m128 *in); |
41 | 255k | __m128 half; |
42 | 255k | used = 0; |
43 | 255k | ALLOC(dist, entries, float); |
44 | 255k | half = _mm_set_ps1(.5f); |
45 | 255k | ALLOC(in, len, __m128); |
46 | 2.07M | for (i=0;i<len;i++) |
47 | 1.81M | in[i] = _mm_set_ps1(_in[i]); |
48 | 6.90M | for (i=0;i<entries>>2;i++) |
49 | 6.65M | { |
50 | 6.65M | __m128 d = _mm_mul_ps(E[i], half); |
51 | 46.0M | for (j=0;j<len;j++) |
52 | 39.3M | d = _mm_sub_ps(d, _mm_mul_ps(in[j], *codebook++)); |
53 | 6.65M | _mm_storeu_ps(dist+4*i, d); |
54 | 6.65M | } |
55 | 26.8M | for (i=0;i<entries;i++) |
56 | 26.6M | { |
57 | 26.6M | if (i<N || dist[i]<best_dist[N-1]) |
58 | 4.49M | { |
59 | 15.4M | for (k=N-1; (k >= 1) && (k > used || dist[i] < best_dist[k-1]); k--) |
60 | 10.9M | { |
61 | 10.9M | best_dist[k]=best_dist[k-1]; |
62 | 10.9M | nbest[k] = nbest[k-1]; |
63 | 10.9M | } |
64 | 4.49M | best_dist[k]=dist[i]; |
65 | 4.49M | nbest[k]=i; |
66 | 4.49M | used++; |
67 | 4.49M | } |
68 | 26.6M | } |
69 | 255k | } |
70 | | |
71 | | |
72 | | |
73 | | |
74 | | #define OVERRIDE_VQ_NBEST_SIGN |
75 | | void vq_nbest_sign(spx_word16_t *_in, const __m128 *codebook, int len, int entries, __m128 *E, int N, int *nbest, spx_word32_t *best_dist, char *stack) |
76 | 36.2k | { |
77 | 36.2k | int i,j,k,used; |
78 | 36.2k | VARDECL(float *dist); |
79 | 36.2k | VARDECL(__m128 *in); |
80 | | |
81 | 36.2k | used = 0; |
82 | 36.2k | ALLOC(dist, entries, float); |
83 | | |
84 | 36.2k | ALLOC(in, len, __m128); |
85 | 326k | for (i=0;i<len;i++) |
86 | 290k | in[i] = _mm_set_ps1(_in[i]); |
87 | 1.19M | for (i=0;i<entries>>2;i++) |
88 | 1.16M | { |
89 | 1.16M | __m128 d = _mm_setzero_ps(); |
90 | 10.4M | for (j=0;j<len;j++) |
91 | 9.28M | d = _mm_add_ps(d, _mm_mul_ps(in[j], *codebook++)); |
92 | 1.16M | _mm_storeu_ps(dist+4*i, d); |
93 | 1.16M | } |
94 | 4.67M | for (i=0;i<entries;i++) |
95 | 4.64M | { |
96 | 4.64M | int sign; |
97 | 4.64M | if (dist[i]>0) |
98 | 2.35M | { |
99 | 2.35M | sign=0; |
100 | 2.35M | dist[i]=-dist[i]; |
101 | 2.35M | } else |
102 | 2.28M | { |
103 | 2.28M | sign=1; |
104 | 2.28M | } |
105 | 4.64M | dist[i] += .5f*((float*)E)[i]; |
106 | 4.64M | if (i<N || dist[i]<best_dist[N-1]) |
107 | 583k | { |
108 | 1.78M | for (k=N-1; (k >= 1) && (k > used || dist[i] < best_dist[k-1]); k--) |
109 | 1.20M | { |
110 | 1.20M | best_dist[k]=best_dist[k-1]; |
111 | 1.20M | nbest[k] = nbest[k-1]; |
112 | 1.20M | } |
113 | 583k | best_dist[k]=dist[i]; |
114 | 583k | nbest[k]=i; |
115 | 583k | used++; |
116 | 583k | if (sign) |
117 | 275k | nbest[k]+=entries; |
118 | 583k | } |
119 | 4.64M | } |
120 | 36.2k | } |