Coverage Report

Created: 2025-11-11 07:47

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/opus/celt/x86/vq_sse2.c
Line
Count
Source
1
/* Copyright (c) 2007-2008 CSIRO
2
   Copyright (c) 2007-2009 Xiph.Org Foundation
3
   Copyright (c) 2007-2016 Jean-Marc Valin */
4
/*
5
   Redistribution and use in source and binary forms, with or without
6
   modification, are permitted provided that the following conditions
7
   are met:
8
9
   - Redistributions of source code must retain the above copyright
10
   notice, this list of conditions and the following disclaimer.
11
12
   - Redistributions in binary form must reproduce the above copyright
13
   notice, this list of conditions and the following disclaimer in the
14
   documentation and/or other materials provided with the distribution.
15
16
   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17
   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
20
   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
21
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
22
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
23
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
24
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
25
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
*/
28
29
#ifdef HAVE_CONFIG_H
30
#include "config.h"
31
#endif
32
33
#include <xmmintrin.h>
34
#include <emmintrin.h>
35
#include "celt_lpc.h"
36
#include "stack_alloc.h"
37
#include "mathops.h"
38
#include "vq.h"
39
#include "x86cpu.h"
40
41
42
#ifndef FIXED_POINT
43
44
opus_val16 op_pvq_search_sse2(celt_norm *_X, int *iy, int K, int N, int arch)
45
34.0M
{
46
34.0M
   int i, j;
47
34.0M
   int pulsesLeft;
48
34.0M
   float xy, yy;
49
34.0M
   VARDECL(celt_norm, y);
50
34.0M
   VARDECL(celt_norm, X);
51
34.0M
   VARDECL(float, signy);
52
34.0M
   __m128 signmask;
53
34.0M
   __m128 sums;
54
34.0M
   __m128i fours;
55
34.0M
   SAVE_STACK;
56
57
34.0M
   (void)arch;
58
   /* All bits set to zero, except for the sign bit. */
59
34.0M
   signmask = _mm_set_ps1(-0.f);
60
34.0M
   fours = _mm_set_epi32(4, 4, 4, 4);
61
34.0M
   ALLOC(y, N+3, celt_norm);
62
34.0M
   ALLOC(X, N+3, celt_norm);
63
34.0M
   ALLOC(signy, N+3, float);
64
65
34.0M
   OPUS_COPY(X, _X, N);
66
34.0M
   X[N] = X[N+1] = X[N+2] = 0;
67
34.0M
   sums = _mm_setzero_ps();
68
108M
   for (j=0;j<N;j+=4)
69
74.6M
   {
70
74.6M
      __m128 x4, s4;
71
74.6M
      x4 = _mm_loadu_ps(&X[j]);
72
74.6M
      s4 = _mm_cmplt_ps(x4, _mm_setzero_ps());
73
      /* Get rid of the sign */
74
74.6M
      x4 = _mm_andnot_ps(signmask, x4);
75
74.6M
      sums = _mm_add_ps(sums, x4);
76
      /* Clear y and iy in case we don't do the projection. */
77
74.6M
      _mm_storeu_ps(&y[j], _mm_setzero_ps());
78
74.6M
      _mm_storeu_si128((__m128i*)(void*)&iy[j], _mm_setzero_si128());
79
74.6M
      _mm_storeu_ps(&X[j], x4);
80
74.6M
      _mm_storeu_ps(&signy[j], s4);
81
74.6M
   }
82
34.0M
   sums = _mm_add_ps(sums, _mm_shuffle_ps(sums, sums, _MM_SHUFFLE(1, 0, 3, 2)));
83
34.0M
   sums = _mm_add_ps(sums, _mm_shuffle_ps(sums, sums, _MM_SHUFFLE(2, 3, 0, 1)));
84
85
34.0M
   xy = yy = 0;
86
87
34.0M
   pulsesLeft = K;
88
89
   /* Do a pre-search by projecting on the pyramid */
90
34.0M
   if (K > (N>>1))
91
24.3M
   {
92
24.3M
      __m128i pulses_sum;
93
24.3M
      __m128 yy4, xy4;
94
24.3M
      __m128 rcp4;
95
24.3M
      opus_val32 sum = _mm_cvtss_f32(sums);
96
      /* If X is too small, just replace it with a pulse at 0 */
97
      /* Prevents infinities and NaNs from causing too many pulses
98
         to be allocated. 64 is an approximation of infinity here. */
99
24.3M
      if (!(sum > EPSILON && sum < 64))
100
920k
      {
101
920k
         X[0] = QCONST16(1.f,14);
102
920k
         j=1; do
103
5.42M
            X[j]=0;
104
5.42M
         while (++j<N);
105
920k
         sums = _mm_set_ps1(1.f);
106
920k
      }
107
      /* Using K+e with e < 1 guarantees we cannot get more than K pulses. */
108
24.3M
      rcp4 = _mm_mul_ps(_mm_set_ps1((float)(K+.8)), _mm_rcp_ps(sums));
109
24.3M
      xy4 = yy4 = _mm_setzero_ps();
110
24.3M
      pulses_sum = _mm_setzero_si128();
111
63.3M
      for (j=0;j<N;j+=4)
112
38.9M
      {
113
38.9M
         __m128 rx4, x4, y4;
114
38.9M
         __m128i iy4;
115
38.9M
         x4 = _mm_loadu_ps(&X[j]);
116
38.9M
         rx4 = _mm_mul_ps(x4, rcp4);
117
38.9M
         iy4 = _mm_cvttps_epi32(rx4);
118
38.9M
         pulses_sum = _mm_add_epi32(pulses_sum, iy4);
119
38.9M
         _mm_storeu_si128((__m128i*)(void*)&iy[j], iy4);
120
38.9M
         y4 = _mm_cvtepi32_ps(iy4);
121
38.9M
         xy4 = _mm_add_ps(xy4, _mm_mul_ps(x4, y4));
122
38.9M
         yy4 = _mm_add_ps(yy4, _mm_mul_ps(y4, y4));
123
         /* double the y[] vector so we don't have to do it in the search loop. */
124
38.9M
         _mm_storeu_ps(&y[j], _mm_add_ps(y4, y4));
125
38.9M
      }
126
24.3M
      pulses_sum = _mm_add_epi32(pulses_sum, _mm_shuffle_epi32(pulses_sum, _MM_SHUFFLE(1, 0, 3, 2)));
127
24.3M
      pulses_sum = _mm_add_epi32(pulses_sum, _mm_shuffle_epi32(pulses_sum, _MM_SHUFFLE(2, 3, 0, 1)));
128
24.3M
      pulsesLeft -= _mm_cvtsi128_si32(pulses_sum);
129
24.3M
      xy4 = _mm_add_ps(xy4, _mm_shuffle_ps(xy4, xy4, _MM_SHUFFLE(1, 0, 3, 2)));
130
24.3M
      xy4 = _mm_add_ps(xy4, _mm_shuffle_ps(xy4, xy4, _MM_SHUFFLE(2, 3, 0, 1)));
131
24.3M
      xy = _mm_cvtss_f32(xy4);
132
24.3M
      yy4 = _mm_add_ps(yy4, _mm_shuffle_ps(yy4, yy4, _MM_SHUFFLE(1, 0, 3, 2)));
133
24.3M
      yy4 = _mm_add_ps(yy4, _mm_shuffle_ps(yy4, yy4, _MM_SHUFFLE(2, 3, 0, 1)));
134
24.3M
      yy = _mm_cvtss_f32(yy4);
135
24.3M
   }
136
34.0M
   X[N] = X[N+1] = X[N+2] = -100;
137
34.0M
   y[N] = y[N+1] = y[N+2] = 100;
138
34.0M
   celt_sig_assert(pulsesLeft>=0);
139
140
   /* This should never happen, but just in case it does (e.g. on silence)
141
      we fill the first bin with pulses. */
142
34.0M
   if (pulsesLeft > N+3)
143
0
   {
144
0
      opus_val16 tmp = (opus_val16)pulsesLeft;
145
0
      yy = MAC16_16(yy, tmp, tmp);
146
0
      yy = MAC16_16(yy, tmp, y[0]);
147
0
      iy[0] += pulsesLeft;
148
0
      pulsesLeft=0;
149
0
   }
150
151
105M
   for (i=0;i<pulsesLeft;i++)
152
71.5M
   {
153
71.5M
      int best_id;
154
71.5M
      __m128 xy4, yy4;
155
71.5M
      __m128 max, max2;
156
71.5M
      __m128i count;
157
71.5M
      __m128i pos;
158
      /* The squared magnitude term gets added anyway, so we might as well
159
         add it outside the loop */
160
71.5M
      yy = ADD16(yy, 1);
161
71.5M
      xy4 = _mm_load1_ps(&xy);
162
71.5M
      yy4 = _mm_load1_ps(&yy);
163
71.5M
      max = _mm_setzero_ps();
164
71.5M
      pos = _mm_setzero_si128();
165
71.5M
      count = _mm_set_epi32(3, 2, 1, 0);
166
310M
      for (j=0;j<N;j+=4)
167
239M
      {
168
239M
         __m128 x4, y4, r4;
169
239M
         x4 = _mm_loadu_ps(&X[j]);
170
239M
         y4 = _mm_loadu_ps(&y[j]);
171
239M
         x4 = _mm_add_ps(x4, xy4);
172
239M
         y4 = _mm_add_ps(y4, yy4);
173
239M
         y4 = _mm_rsqrt_ps(y4);
174
239M
         r4 = _mm_mul_ps(x4, y4);
175
         /* Update the index of the max. */
176
239M
         pos = _mm_max_epi16(pos, _mm_and_si128(count, _mm_castps_si128(_mm_cmpgt_ps(r4, max))));
177
         /* Update the max. */
178
239M
         max = _mm_max_ps(max, r4);
179
         /* Update the indices (+4) */
180
239M
         count = _mm_add_epi32(count, fours);
181
239M
      }
182
      /* Horizontal max */
183
71.5M
      max2 = _mm_max_ps(max, _mm_shuffle_ps(max, max, _MM_SHUFFLE(1, 0, 3, 2)));
184
71.5M
      max2 = _mm_max_ps(max2, _mm_shuffle_ps(max2, max2, _MM_SHUFFLE(2, 3, 0, 1)));
185
      /* Now that max2 contains the max at all positions, look at which value(s) of the
186
         partial max is equal to the global max. */
187
71.5M
      pos = _mm_and_si128(pos, _mm_castps_si128(_mm_cmpeq_ps(max, max2)));
188
71.5M
      pos = _mm_max_epi16(pos, _mm_unpackhi_epi64(pos, pos));
189
71.5M
      pos = _mm_max_epi16(pos, _mm_shufflelo_epi16(pos, _MM_SHUFFLE(1, 0, 3, 2)));
190
71.5M
      best_id = _mm_cvtsi128_si32(pos);
191
192
      /* Updating the sums of the new pulse(s) */
193
71.5M
      xy = ADD32(xy, EXTEND32(X[best_id]));
194
      /* We're multiplying y[j] by two so we don't have to do it here */
195
71.5M
      yy = ADD16(yy, y[best_id]);
196
197
      /* Only now that we've made the final choice, update y/iy */
198
      /* Multiplying y[j] by 2 so we don't have to do it everywhere else */
199
71.5M
      y[best_id] += 2;
200
71.5M
      iy[best_id]++;
201
71.5M
   }
202
203
   /* Put the original sign back */
204
108M
   for (j=0;j<N;j+=4)
205
74.6M
   {
206
74.6M
      __m128i y4;
207
74.6M
      __m128i s4;
208
74.6M
      y4 = _mm_loadu_si128((__m128i*)(void*)&iy[j]);
209
74.6M
      s4 = _mm_castps_si128(_mm_loadu_ps(&signy[j]));
210
74.6M
      y4 = _mm_xor_si128(_mm_add_epi32(y4, s4), s4);
211
74.6M
      _mm_storeu_si128((__m128i*)(void*)&iy[j], y4);
212
74.6M
   }
213
34.0M
   RESTORE_STACK;
214
34.0M
   return yy;
215
34.0M
}
216
217
#endif