Coverage Report

Created: 2025-11-16 07:23

/src/opus/celt/x86/pitch_sse4_1.c
Line |  Count | Source
   1 |        | /* Copyright (c) 2014, Cisco Systems, INC
   2 |        |    Written by XiangMingZhu WeiZhou MinPeng YanWang
   3 |        |
   4 |        |    Redistribution and use in source and binary forms, with or without
   5 |        |    modification, are permitted provided that the following conditions
   6 |        |    are met:
   7 |        |
   8 |        |    - Redistributions of source code must retain the above copyright
   9 |        |    notice, this list of conditions and the following disclaimer.
  10 |        |
  11 |        |    - Redistributions in binary form must reproduce the above copyright
  12 |        |    notice, this list of conditions and the following disclaimer in the
  13 |        |    documentation and/or other materials provided with the distribution.
  14 |        |
  15 |        |    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  16 |        |    ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  17 |        |    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  18 |        |    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
  19 |        |    OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  20 |        |    EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  21 |        |    PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  22 |        |    PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  23 |        |    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  24 |        |    NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  25 |        |    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26 |        | */
  27 |        |
  28 |        | #ifdef HAVE_CONFIG_H
  29 |        | #include "config.h"
  30 |        | #endif
  31 |        |
  32 |        | #include <xmmintrin.h>
  33 |        | #include <emmintrin.h>
  34 |        |
  35 |        | #include "macros.h"
  36 |        | #include "celt_lpc.h"
  37 |        | #include "stack_alloc.h"
  38 |        | #include "mathops.h"
  39 |        | #include "pitch.h"
  40 |        |
  41 |        | #if defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT)
  42 |        | #include <smmintrin.h>
  43 |        | #include "x86cpu.h"
  44 |        |
  45 |        | opus_val32 celt_inner_prod_sse4_1(const opus_val16 *x, const opus_val16 *y,
  46 |        |       int N)
  47 |   838M | {
  48 |   838M |     opus_int  i, dataSize16;
  49 |   838M |     opus_int32 sum;
  50 |   838M |     __m128i inVec1_76543210, inVec1_FEDCBA98, acc1;
  51 |   838M |     __m128i inVec2_76543210, inVec2_FEDCBA98, acc2;
  52 |   838M |     __m128i inVec1_3210, inVec2_3210;
  53 |        |
  54 |   838M |     sum = 0;
  55 |   838M |     dataSize16 = N & ~15;
  56 |        |
  57 |   838M |     acc1 = _mm_setzero_si128();
  58 |   838M |     acc2 = _mm_setzero_si128();
  59 |        |
  60 |  5.71G |     for (i=0;i<dataSize16;i+=16) {
  61 |  4.87G |         inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0]));
  62 |  4.87G |         inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0]));
  63 |        |
  64 |  4.87G |         inVec1_FEDCBA98 = _mm_loadu_si128((__m128i *)(&x[i + 8]));
  65 |  4.87G |         inVec2_FEDCBA98 = _mm_loadu_si128((__m128i *)(&y[i + 8]));
  66 |        |
  67 |  4.87G |         inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210);
  68 |  4.87G |         inVec1_FEDCBA98 = _mm_madd_epi16(inVec1_FEDCBA98, inVec2_FEDCBA98);
  69 |        |
  70 |  4.87G |         acc1 = _mm_add_epi32(acc1, inVec1_76543210);
  71 |  4.87G |         acc2 = _mm_add_epi32(acc2, inVec1_FEDCBA98);
  72 |  4.87G |     }
  73 |        |
  74 |   838M |     acc1 = _mm_add_epi32(acc1, acc2);
  75 |        |
  76 |   838M |     if (N - i >= 8)
  77 |   703M |     {
  78 |   703M |         inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0]));
  79 |   703M |         inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0]));
  80 |        |
  81 |   703M |         inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210);
  82 |        |
  83 |   703M |         acc1 = _mm_add_epi32(acc1, inVec1_76543210);
  84 |   703M |         i += 8;
  85 |   703M |     }
  86 |        |
  87 |   838M |     if (N - i >= 4)
  88 |   226M |     {
  89 |   226M |         inVec1_3210 = OP_CVTEPI16_EPI32_M64(&x[i + 0]);
  90 |   226M |         inVec2_3210 = OP_CVTEPI16_EPI32_M64(&y[i + 0]);
  91 |        |
  92 |   226M |         inVec1_3210 = _mm_mullo_epi32(inVec1_3210, inVec2_3210);
  93 |        |
  94 |   226M |         acc1 = _mm_add_epi32(acc1, inVec1_3210);
  95 |   226M |         i += 4;
  96 |   226M |     }
  97 |        |
  98 |   838M |     acc1 = _mm_add_epi32(acc1, _mm_unpackhi_epi64(acc1, acc1));
  99 |   838M |     acc1 = _mm_add_epi32(acc1, _mm_shufflelo_epi16(acc1, 0x0E));
 100 |        |
 101 |   838M |     sum += _mm_cvtsi128_si32(acc1);
 102 |        |
 103 |  1.34G |     for (;i<N;i++)
 104 |   506M |     {
 105 |   506M |         sum = silk_SMLABB(sum, x[i], y[i]);
 106 |   506M |     }
 107 |        |
 108 |   838M |     return sum;
 109 |   838M | }
 110 |        |
 111 |        | void xcorr_kernel_sse4_1(const opus_val16 * x, const opus_val16 * y, opus_val32 sum[ 4 ], int len)
 112 |  1.38G | {
 113 |  1.38G |     int j;
 114 |        |
 115 |  1.38G |     __m128i vecX, vecX0, vecX1, vecX2, vecX3;
 116 |  1.38G |     __m128i vecY0, vecY1, vecY2, vecY3;
 117 |  1.38G |     __m128i sum0, sum1, sum2, sum3, vecSum;
 118 |  1.38G |     __m128i initSum;
 119 |        |
 120 |  1.38G | #ifdef OPUS_CHECK_ASM
 121 |  1.38G |     opus_val32 sum_c[4];
 122 |  6.94G |     for (j=0;j<4;j++) {
 123 |  5.55G |       sum_c[j] = sum[j];
 124 |  5.55G |     }
 125 |  1.38G |     xcorr_kernel_c(x, y, sum_c, len);
 126 |  1.38G | #endif
 127 |        |
 128 |  1.38G |     celt_assert(len >= 3);
 129 |        |
 130 |  1.38G |     sum0 = _mm_setzero_si128();
 131 |  1.38G |     sum1 = _mm_setzero_si128();
 132 |  1.38G |     sum2 = _mm_setzero_si128();
 133 |  1.38G |     sum3 = _mm_setzero_si128();
 134 |        |
 135 |  20.2G |     for (j=0;j<(len-7);j+=8)
 136 |  18.8G |     {
 137 |  18.8G |         vecX = _mm_loadu_si128((__m128i *)(&x[j + 0]));
 138 |  18.8G |         vecY0 = _mm_loadu_si128((__m128i *)(&y[j + 0]));
 139 |  18.8G |         vecY1 = _mm_loadu_si128((__m128i *)(&y[j + 1]));
 140 |  18.8G |         vecY2 = _mm_loadu_si128((__m128i *)(&y[j + 2]));
 141 |  18.8G |         vecY3 = _mm_loadu_si128((__m128i *)(&y[j + 3]));
 142 |        |
 143 |  18.8G |         sum0 = _mm_add_epi32(sum0, _mm_madd_epi16(vecX, vecY0));
 144 |  18.8G |         sum1 = _mm_add_epi32(sum1, _mm_madd_epi16(vecX, vecY1));
 145 |  18.8G |         sum2 = _mm_add_epi32(sum2, _mm_madd_epi16(vecX, vecY2));
 146 |  18.8G |         sum3 = _mm_add_epi32(sum3, _mm_madd_epi16(vecX, vecY3));
 147 |  18.8G |     }
 148 |        |
 149 |  1.38G |     sum0 = _mm_add_epi32(sum0, _mm_unpackhi_epi64( sum0, sum0));
 150 |  1.38G |     sum0 = _mm_add_epi32(sum0, _mm_shufflelo_epi16( sum0, 0x0E));
 151 |        |
 152 |  1.38G |     sum1 = _mm_add_epi32(sum1, _mm_unpackhi_epi64( sum1, sum1));
 153 |  1.38G |     sum1 = _mm_add_epi32(sum1, _mm_shufflelo_epi16( sum1, 0x0E));
 154 |        |
 155 |  1.38G |     sum2 = _mm_add_epi32(sum2, _mm_unpackhi_epi64( sum2, sum2));
 156 |  1.38G |     sum2 = _mm_add_epi32(sum2, _mm_shufflelo_epi16( sum2, 0x0E));
 157 |        |
 158 |  1.38G |     sum3 = _mm_add_epi32(sum3, _mm_unpackhi_epi64( sum3, sum3));
 159 |  1.38G |     sum3 = _mm_add_epi32(sum3, _mm_shufflelo_epi16( sum3, 0x0E));
 160 |        |
 161 |  1.38G |     vecSum = _mm_unpacklo_epi64(_mm_unpacklo_epi32(sum0, sum1),
 162 |  1.38G |           _mm_unpacklo_epi32(sum2, sum3));
 163 |        |
 164 |  1.85G |     for (;j<(len-3);j+=4)
 165 |   463M |     {
 166 |   463M |         vecX = OP_CVTEPI16_EPI32_M64(&x[j + 0]);
 167 |   463M |         vecX0 = _mm_shuffle_epi32(vecX, 0x00);
 168 |   463M |         vecX1 = _mm_shuffle_epi32(vecX, 0x55);
 169 |   463M |         vecX2 = _mm_shuffle_epi32(vecX, 0xaa);
 170 |   463M |         vecX3 = _mm_shuffle_epi32(vecX, 0xff);
 171 |        |
 172 |   463M |         vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]);
 173 |   463M |         vecY1 = OP_CVTEPI16_EPI32_M64(&y[j + 1]);
 174 |   463M |         vecY2 = OP_CVTEPI16_EPI32_M64(&y[j + 2]);
 175 |   463M |         vecY3 = OP_CVTEPI16_EPI32_M64(&y[j + 3]);
 176 |        |
 177 |   463M |         sum0 = _mm_mullo_epi32(vecX0, vecY0);
 178 |   463M |         sum1 = _mm_mullo_epi32(vecX1, vecY1);
 179 |   463M |         sum2 = _mm_mullo_epi32(vecX2, vecY2);
 180 |   463M |         sum3 = _mm_mullo_epi32(vecX3, vecY3);
 181 |        |
 182 |   463M |         sum0 = _mm_add_epi32(sum0, sum1);
 183 |   463M |         sum2 = _mm_add_epi32(sum2, sum3);
 184 |   463M |         vecSum = _mm_add_epi32(vecSum, sum0);
 185 |   463M |         vecSum = _mm_add_epi32(vecSum, sum2);
 186 |   463M |     }
 187 |        |
 188 |  1.38G |     vecX = OP_CVTEPI16_EPI32_M64(&x[len - 4]);
 189 |  1.38G |     if (len - j == 3)
 190 |      0 |     {
 191 |      0 |         vecX0 = _mm_shuffle_epi32(vecX, 0x55);
 192 |      0 |         vecX1 = _mm_shuffle_epi32(vecX, 0xaa);
 193 |      0 |         vecX2 = _mm_shuffle_epi32(vecX, 0xff);
 194 |        |
 195 |      0 |         vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]);
 196 |      0 |         vecY1 = OP_CVTEPI16_EPI32_M64(&y[j + 1]);
 197 |      0 |         vecY2 = OP_CVTEPI16_EPI32_M64(&y[j + 2]);
 198 |        |
 199 |      0 |         sum0 = _mm_mullo_epi32(vecX0, vecY0);
 200 |      0 |         sum1 = _mm_mullo_epi32(vecX1, vecY1);
 201 |      0 |         sum2 = _mm_mullo_epi32(vecX2, vecY2);
 202 |        |
 203 |      0 |         vecSum = _mm_add_epi32(vecSum, sum0);
 204 |      0 |         vecSum = _mm_add_epi32(vecSum, sum1);
 205 |      0 |         vecSum = _mm_add_epi32(vecSum, sum2);
 206 |      0 |     }
 207 |  1.38G |     else if (len - j == 2)
 208 |   184M |     {
 209 |   184M |         vecX0 = _mm_shuffle_epi32(vecX, 0xaa);
 210 |   184M |         vecX1 = _mm_shuffle_epi32(vecX, 0xff);
 211 |        |
 212 |   184M |         vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]);
 213 |   184M |         vecY1 = OP_CVTEPI16_EPI32_M64(&y[j + 1]);
 214 |        |
 215 |   184M |         sum0 = _mm_mullo_epi32(vecX0, vecY0);
 216 |   184M |         sum1 = _mm_mullo_epi32(vecX1, vecY1);
 217 |        |
 218 |   184M |         vecSum = _mm_add_epi32(vecSum, sum0);
 219 |   184M |         vecSum = _mm_add_epi32(vecSum, sum1);
 220 |   184M |     }
 221 |  1.20G |     else if (len - j == 1)
 222 |      0 |     {
 223 |      0 |         vecX0 = _mm_shuffle_epi32(vecX, 0xff);
 224 |        |
 225 |      0 |         vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]);
 226 |        |
 227 |      0 |         sum0 = _mm_mullo_epi32(vecX0, vecY0);
 228 |        |
 229 |      0 |         vecSum = _mm_add_epi32(vecSum, sum0);
 230 |      0 |     }
 231 |        |
 232 |  1.38G |     initSum = _mm_loadu_si128((__m128i *)(&sum[0]));
 233 |  1.38G |     initSum = _mm_add_epi32(initSum, vecSum);
 234 |  1.38G |     _mm_storeu_si128((__m128i *)sum, initSum);
 235 |        |
 236 |  1.38G | #ifdef OPUS_CHECK_ASM
 237 |  1.38G |     celt_assert(!memcmp(sum_c, sum, sizeof(sum_c)));
 238 |  1.38G | #endif
 239 |  1.38G | }
 240 |        | #endif
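
Note: the two kernels in this listing compute a plain inner product and a 4-lag cross-correlation accumulation; the zero-count branches above (the len - j == 3 and len - j == 1 tails) simply were not exercised by the call sites in this run. For orientation only, a minimal scalar sketch of the equivalent arithmetic is shown below. The reference_* helpers are hypothetical names introduced here, use plain integer arithmetic, and ignore the fixed-point saturation semantics of silk_SMLABB; they are not the Opus reference implementations.

/* Scalar sketch of the arithmetic performed by the SSE4.1 kernels above. */
#include <stdint.h>

/* sum over i of x[i]*y[i], as accumulated by celt_inner_prod_sse4_1(). */
static int32_t reference_inner_prod(const int16_t *x, const int16_t *y, int N)
{
    int32_t sum = 0;
    for (int i = 0; i < N; i++)
        sum += (int32_t)x[i] * y[i];
    return sum;
}

/* sum[k] += sum over j of x[j]*y[j+k] for k = 0..3, as accumulated by
 * xcorr_kernel_sse4_1(); y must have at least len + 3 valid entries. */
static void reference_xcorr_kernel(const int16_t *x, const int16_t *y,
                                   int32_t sum[4], int len)
{
    for (int j = 0; j < len; j++)
        for (int k = 0; k < 4; k++)
            sum[k] += (int32_t)x[j] * y[j + k];
}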