Coverage Report

Created: 2025-11-09 07:37

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/opus/celt/x86/pitch_sse4_1.c
Line
Count
Source
1
/* Copyright (c) 2014, Cisco Systems, INC
2
   Written by XiangMingZhu WeiZhou MinPeng YanWang
3
4
   Redistribution and use in source and binary forms, with or without
5
   modification, are permitted provided that the following conditions
6
   are met:
7
8
   - Redistributions of source code must retain the above copyright
9
   notice, this list of conditions and the following disclaimer.
10
11
   - Redistributions in binary form must reproduce the above copyright
12
   notice, this list of conditions and the following disclaimer in the
13
   documentation and/or other materials provided with the distribution.
14
15
   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16
   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
19
   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
23
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
*/
27
28
#ifdef HAVE_CONFIG_H
29
#include "config.h"
30
#endif
31
32
#include <xmmintrin.h>
33
#include <emmintrin.h>
34
35
#include "macros.h"
36
#include "celt_lpc.h"
37
#include "stack_alloc.h"
38
#include "mathops.h"
39
#include "pitch.h"
40
41
#if defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT)
42
#include <smmintrin.h>
43
#include "x86cpu.h"
44
45
opus_val32 celt_inner_prod_sse4_1(const opus_val16 *x, const opus_val16 *y,
46
      int N)
47
10.5M
{
48
10.5M
    opus_int  i, dataSize16;
49
10.5M
    opus_int32 sum;
50
10.5M
    __m128i inVec1_76543210, inVec1_FEDCBA98, acc1;
51
10.5M
    __m128i inVec2_76543210, inVec2_FEDCBA98, acc2;
52
10.5M
    __m128i inVec1_3210, inVec2_3210;
53
54
10.5M
    sum = 0;
55
10.5M
    dataSize16 = N & ~15;
56
57
10.5M
    acc1 = _mm_setzero_si128();
58
10.5M
    acc2 = _mm_setzero_si128();
59
60
59.0M
    for (i=0;i<dataSize16;i+=16) {
61
48.5M
        inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0]));
62
48.5M
        inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0]));
63
64
48.5M
        inVec1_FEDCBA98 = _mm_loadu_si128((__m128i *)(&x[i + 8]));
65
48.5M
        inVec2_FEDCBA98 = _mm_loadu_si128((__m128i *)(&y[i + 8]));
66
67
48.5M
        inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210);
68
48.5M
        inVec1_FEDCBA98 = _mm_madd_epi16(inVec1_FEDCBA98, inVec2_FEDCBA98);
69
70
48.5M
        acc1 = _mm_add_epi32(acc1, inVec1_76543210);
71
48.5M
        acc2 = _mm_add_epi32(acc2, inVec1_FEDCBA98);
72
48.5M
    }
73
74
10.5M
    acc1 = _mm_add_epi32(acc1, acc2);
75
76
10.5M
    if (N - i >= 8)
77
9.75M
    {
78
9.75M
        inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0]));
79
9.75M
        inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0]));
80
81
9.75M
        inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210);
82
83
9.75M
        acc1 = _mm_add_epi32(acc1, inVec1_76543210);
84
9.75M
        i += 8;
85
9.75M
    }
86
87
10.5M
    if (N - i >= 4)
88
1.17M
    {
89
1.17M
        inVec1_3210 = OP_CVTEPI16_EPI32_M64(&x[i + 0]);
90
1.17M
        inVec2_3210 = OP_CVTEPI16_EPI32_M64(&y[i + 0]);
91
92
1.17M
        inVec1_3210 = _mm_mullo_epi32(inVec1_3210, inVec2_3210);
93
94
1.17M
        acc1 = _mm_add_epi32(acc1, inVec1_3210);
95
1.17M
        i += 4;
96
1.17M
    }
97
98
10.5M
    acc1 = _mm_add_epi32(acc1, _mm_unpackhi_epi64(acc1, acc1));
99
10.5M
    acc1 = _mm_add_epi32(acc1, _mm_shufflelo_epi16(acc1, 0x0E));
100
101
10.5M
    sum += _mm_cvtsi128_si32(acc1);
102
103
12.1M
    for (;i<N;i++)
104
1.59M
    {
105
1.59M
        sum = silk_SMLABB(sum, x[i], y[i]);
106
1.59M
    }
107
108
10.5M
    return sum;
109
10.5M
}
110
111
/* Accumulates four consecutive cross-correlation lags of x against y:
 *
 *   sum[k] += x[0]*y[k] + x[1]*y[k+1] + ... + x[len-1]*y[k+len-1],  k = 0..3
 *
 * x   : 16-bit samples; only x[0..len-1] is read.
 * y   : 16-bit samples; the vector loads reach up to y[len+2], so at least
 *       len + 3 elements must be readable (assuming OP_CVTEPI16_EPI32_M64
 *       reads exactly four 16-bit values -- it is defined elsewhere).
 * sum : four 32-bit accumulators, read and updated in place.
 * len : number of samples; asserted to be >= 3.
 *
 * All partial sums live in 32-bit lanes that wrap on overflow. When
 * OPUS_CHECK_ASM is defined the result is compared against xcorr_kernel_c().
 */
void xcorr_kernel_sse4_1(const opus_val16 * x, const opus_val16 * y, opus_val32 sum[ 4 ], int len)
{
    int j;

    __m128i vecX, vecX0, vecX1, vecX2, vecX3;
    __m128i vecY0, vecY1, vecY2, vecY3;
    __m128i sum0, sum1, sum2, sum3, vecSum;
    __m128i initSum;

#ifdef OPUS_CHECK_ASM
    /* Reference result computed from a copy of the incoming accumulators. */
    opus_val32 sum_c[4];
    for (j=0;j<4;j++) {
      sum_c[j] = sum[j];
    }
    xcorr_kernel_c(x, y, sum_c, len);
#endif

    celt_assert(len >= 3);

    sum0 = _mm_setzero_si128();
    sum1 = _mm_setzero_si128();
    sum2 = _mm_setzero_si128();
    sum3 = _mm_setzero_si128();

    /* Main loop: 8 samples of x against four shifted windows of y.
       sumK collects the partial products for lag K. */
    for (j=0;j<(len-7);j+=8)
    {
        vecX = _mm_loadu_si128((const __m128i *)(&x[j + 0]));
        vecY0 = _mm_loadu_si128((const __m128i *)(&y[j + 0]));
        vecY1 = _mm_loadu_si128((const __m128i *)(&y[j + 1]));
        vecY2 = _mm_loadu_si128((const __m128i *)(&y[j + 2]));
        vecY3 = _mm_loadu_si128((const __m128i *)(&y[j + 3]));

        sum0 = _mm_add_epi32(sum0, _mm_madd_epi16(vecX, vecY0));
        sum1 = _mm_add_epi32(sum1, _mm_madd_epi16(vecX, vecY1));
        sum2 = _mm_add_epi32(sum2, _mm_madd_epi16(vecX, vecY2));
        sum3 = _mm_add_epi32(sum3, _mm_madd_epi16(vecX, vecY3));
    }

    /* Reduce each per-lag accumulator to a single value in lane 0. */
    sum0 = _mm_add_epi32(sum0, _mm_unpackhi_epi64( sum0, sum0));
    sum0 = _mm_add_epi32(sum0, _mm_shufflelo_epi16( sum0, 0x0E));

    sum1 = _mm_add_epi32(sum1, _mm_unpackhi_epi64( sum1, sum1));
    sum1 = _mm_add_epi32(sum1, _mm_shufflelo_epi16( sum1, 0x0E));

    sum2 = _mm_add_epi32(sum2, _mm_unpackhi_epi64( sum2, sum2));
    sum2 = _mm_add_epi32(sum2, _mm_shufflelo_epi16( sum2, 0x0E));

    sum3 = _mm_add_epi32(sum3, _mm_unpackhi_epi64( sum3, sum3));
    sum3 = _mm_add_epi32(sum3, _mm_shufflelo_epi16( sum3, 0x0E));

    /* Pack the four scalars so that lane k holds the sum for lag k. */
    vecSum = _mm_unpacklo_epi64(_mm_unpacklo_epi32(sum0, sum1),
          _mm_unpacklo_epi32(sum2, sum3));

    /* 4 samples at a time: broadcast each x sample across all lanes and
       multiply it with the four y values it pairs with (one per lag).
       From here on sum0..sum3 are reused as scratch registers. */
    for (;j<(len-3);j+=4)
    {
        vecX = OP_CVTEPI16_EPI32_M64(&x[j + 0]);
        vecX0 = _mm_shuffle_epi32(vecX, 0x00);
        vecX1 = _mm_shuffle_epi32(vecX, 0x55);
        vecX2 = _mm_shuffle_epi32(vecX, 0xaa);
        vecX3 = _mm_shuffle_epi32(vecX, 0xff);

        vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]);
        vecY1 = OP_CVTEPI16_EPI32_M64(&y[j + 1]);
        vecY2 = OP_CVTEPI16_EPI32_M64(&y[j + 2]);
        vecY3 = OP_CVTEPI16_EPI32_M64(&y[j + 3]);

        sum0 = _mm_mullo_epi32(vecX0, vecY0);
        sum1 = _mm_mullo_epi32(vecX1, vecY1);
        sum2 = _mm_mullo_epi32(vecX2, vecY2);
        sum3 = _mm_mullo_epi32(vecX3, vecY3);

        sum0 = _mm_add_epi32(sum0, sum1);
        sum2 = _mm_add_epi32(sum2, sum3);
        vecSum = _mm_add_epi32(vecSum, sum0);
        vecSum = _mm_add_epi32(vecSum, sum2);
    }

    /* Last 0..3 samples. Each x sample is broadcast from a scalar read
       instead of being extracted from a four-sample load that ends at
       x[len-1]: such a load starts at x[len-4], which lies before the
       buffer when len == 3. The scalar read only ever touches x[0..len-1]
       and produces the same sign-extended 32-bit lanes. */
    for (;j<len;j++)
    {
        vecX0 = _mm_set1_epi32(x[j]);
        vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]);
        vecSum = _mm_add_epi32(vecSum, _mm_mullo_epi32(vecX0, vecY0));
    }

    /* Add the new contributions to the caller's running sums. */
    initSum = _mm_loadu_si128((__m128i *)(&sum[0]));
    initSum = _mm_add_epi32(initSum, vecSum);
    _mm_storeu_si128((__m128i *)sum, initSum);

#ifdef OPUS_CHECK_ASM
    celt_assert(!memcmp(sum_c, sum, sizeof(sum_c)));
#endif
}
240
#endif