Coverage Report

Created: 2023-09-25 08:12

/src/opus/celt/x86/pitch_sse4_1.c
Line
Count
Source (jump to first uncovered line)
1
/* Copyright (c) 2014, Cisco Systems, INC
2
   Written by XiangMingZhu WeiZhou MinPeng YanWang
3
4
   Redistribution and use in source and binary forms, with or without
5
   modification, are permitted provided that the following conditions
6
   are met:
7
8
   - Redistributions of source code must retain the above copyright
9
   notice, this list of conditions and the following disclaimer.
10
11
   - Redistributions in binary form must reproduce the above copyright
12
   notice, this list of conditions and the following disclaimer in the
13
   documentation and/or other materials provided with the distribution.
14
15
   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16
   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
19
   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
23
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
*/
27
28
#ifdef HAVE_CONFIG_H
29
#include "config.h"
30
#endif
31
32
#include <xmmintrin.h>
33
#include <emmintrin.h>
34
35
#include "macros.h"
36
#include "celt_lpc.h"
37
#include "stack_alloc.h"
38
#include "mathops.h"
39
#include "pitch.h"
40
41
#if defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT)
42
#include <smmintrin.h>
43
#include "x86cpu.h"
44
45
opus_val32 celt_inner_prod_sse4_1(const opus_val16 *x, const opus_val16 *y,
46
      int N)
47
549M
{
48
549M
    opus_int  i, dataSize16;
49
549M
    opus_int32 sum;
50
549M
    __m128i inVec1_76543210, inVec1_FEDCBA98, acc1;
51
549M
    __m128i inVec2_76543210, inVec2_FEDCBA98, acc2;
52
549M
    __m128i inVec1_3210, inVec2_3210;
53
54
549M
    sum = 0;
55
549M
    dataSize16 = N & ~15;
56
57
549M
    acc1 = _mm_setzero_si128();
58
549M
    acc2 = _mm_setzero_si128();
59
60
2.39G
    for (i=0;i<dataSize16;i+=16) {
61
1.84G
        inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0]));
62
1.84G
        inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0]));
63
64
1.84G
        inVec1_FEDCBA98 = _mm_loadu_si128((__m128i *)(&x[i + 8]));
65
1.84G
        inVec2_FEDCBA98 = _mm_loadu_si128((__m128i *)(&y[i + 8]));
66
67
1.84G
        inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210);
68
1.84G
        inVec1_FEDCBA98 = _mm_madd_epi16(inVec1_FEDCBA98, inVec2_FEDCBA98);
69
70
1.84G
        acc1 = _mm_add_epi32(acc1, inVec1_76543210);
71
1.84G
        acc2 = _mm_add_epi32(acc2, inVec1_FEDCBA98);
72
1.84G
    }
73
74
549M
    acc1 = _mm_add_epi32(acc1, acc2);
75
76
549M
    if (N - i >= 8)
77
311M
    {
78
311M
        inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0]));
79
311M
        inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0]));
80
81
311M
        inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210);
82
83
311M
        acc1 = _mm_add_epi32(acc1, inVec1_76543210);
84
311M
        i += 8;
85
311M
    }
86
87
549M
    if (N - i >= 4)
88
177M
    {
89
177M
        inVec1_3210 = OP_CVTEPI16_EPI32_M64(&x[i + 0]);
90
177M
        inVec2_3210 = OP_CVTEPI16_EPI32_M64(&y[i + 0]);
91
92
177M
        inVec1_3210 = _mm_mullo_epi32(inVec1_3210, inVec2_3210);
93
94
177M
        acc1 = _mm_add_epi32(acc1, inVec1_3210);
95
177M
        i += 4;
96
177M
    }
97
98
549M
    acc1 = _mm_add_epi32(acc1, _mm_unpackhi_epi64(acc1, acc1));
99
549M
    acc1 = _mm_add_epi32(acc1, _mm_shufflelo_epi16(acc1, 0x0E));
100
101
549M
    sum += _mm_cvtsi128_si32(acc1);
102
103
911M
    for (;i<N;i++)
104
362M
    {
105
362M
        sum = silk_SMLABB(sum, x[i], y[i]);
106
362M
    }
107
108
549M
    return sum;
109
549M
}
110
111
/* SSE4.1 cross-correlation kernel (fixed-point build).
 *
 * For k = 0..3, adds to sum[k] the sum over j in [0, len) of
 * x[j] * y[j + k]. All accumulation is done in 32-bit integer lanes.
 *
 * x:   len input samples.
 * y:   input samples; the loads below touch y[0] .. y[len + 2].
 * sum: four running totals, read and updated in place.
 * len: number of x samples; asserted to be at least 3.
 *
 * Samples are processed 8 at a time, then (at most once) 4 at a time, and
 * the remaining 0..3 samples one by one. The per-sample tail broadcasts
 * x[j] from a scalar instead of issuing a 64-bit load at &x[len - 4]:
 * that load starts at x[-1] when len == 3, i.e. before the start of the
 * buffer.
 *
 * When OPUS_CHECK_ASM is defined, the result is compared against
 * xcorr_kernel_c() run on a copy of the incoming sums.
 */
void xcorr_kernel_sse4_1(const opus_val16 * x, const opus_val16 * y, opus_val32 sum[ 4 ], int len)
{
    int j;

    __m128i vecX, vecX0, vecX1, vecX2, vecX3;
    __m128i vecY0, vecY1, vecY2, vecY3;
    __m128i sum0, sum1, sum2, sum3, vecSum;
    __m128i initSum;

#ifdef OPUS_CHECK_ASM
    opus_val32 sum_c[4];
    for (j=0;j<4;j++) {
      sum_c[j] = sum[j];
    }
    xcorr_kernel_c(x, y, sum_c, len);
#endif

    celt_assert(len >= 3);

    sum0 = _mm_setzero_si128();
    sum1 = _mm_setzero_si128();
    sum2 = _mm_setzero_si128();
    sum3 = _mm_setzero_si128();

    /* 8 samples per pass: one x vector against four y vectors, each shifted
       by one more sample. sumK collects partial sums for lag K. */
    for (j=0;j<(len-7);j+=8)
    {
        vecX = _mm_loadu_si128((const __m128i *)(&x[j + 0]));
        vecY0 = _mm_loadu_si128((const __m128i *)(&y[j + 0]));
        vecY1 = _mm_loadu_si128((const __m128i *)(&y[j + 1]));
        vecY2 = _mm_loadu_si128((const __m128i *)(&y[j + 2]));
        vecY3 = _mm_loadu_si128((const __m128i *)(&y[j + 3]));

        sum0 = _mm_add_epi32(sum0, _mm_madd_epi16(vecX, vecY0));
        sum1 = _mm_add_epi32(sum1, _mm_madd_epi16(vecX, vecY1));
        sum2 = _mm_add_epi32(sum2, _mm_madd_epi16(vecX, vecY2));
        sum3 = _mm_add_epi32(sum3, _mm_madd_epi16(vecX, vecY3));
    }

    /* Reduce each accumulator so that its total sits in lane 0. */
    sum0 = _mm_add_epi32(sum0, _mm_unpackhi_epi64( sum0, sum0));
    sum0 = _mm_add_epi32(sum0, _mm_shufflelo_epi16( sum0, 0x0E));

    sum1 = _mm_add_epi32(sum1, _mm_unpackhi_epi64( sum1, sum1));
    sum1 = _mm_add_epi32(sum1, _mm_shufflelo_epi16( sum1, 0x0E));

    sum2 = _mm_add_epi32(sum2, _mm_unpackhi_epi64( sum2, sum2));
    sum2 = _mm_add_epi32(sum2, _mm_shufflelo_epi16( sum2, 0x0E));

    sum3 = _mm_add_epi32(sum3, _mm_unpackhi_epi64( sum3, sum3));
    sum3 = _mm_add_epi32(sum3, _mm_shufflelo_epi16( sum3, 0x0E));

    /* Pack the four lane-0 totals into one vector: lane k holds lag k. */
    vecSum = _mm_unpacklo_epi64(_mm_unpacklo_epi32(sum0, sum1),
          _mm_unpacklo_epi32(sum2, sum3));

    /* 4 samples per pass. Each x sample is broadcast to all lanes and
       multiplied by the four y samples starting at its own position, so
       lane k of every product already belongs to lag k. sum0..sum3 are
       reused as scratch here; their totals are already in vecSum. */
    for (;j<(len-3);j+=4)
    {
        vecX = OP_CVTEPI16_EPI32_M64(&x[j + 0]);
        vecX0 = _mm_shuffle_epi32(vecX, 0x00);
        vecX1 = _mm_shuffle_epi32(vecX, 0x55);
        vecX2 = _mm_shuffle_epi32(vecX, 0xaa);
        vecX3 = _mm_shuffle_epi32(vecX, 0xff);

        vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]);
        vecY1 = OP_CVTEPI16_EPI32_M64(&y[j + 1]);
        vecY2 = OP_CVTEPI16_EPI32_M64(&y[j + 2]);
        vecY3 = OP_CVTEPI16_EPI32_M64(&y[j + 3]);

        sum0 = _mm_mullo_epi32(vecX0, vecY0);
        sum1 = _mm_mullo_epi32(vecX1, vecY1);
        sum2 = _mm_mullo_epi32(vecX2, vecY2);
        sum3 = _mm_mullo_epi32(vecX3, vecY3);

        sum0 = _mm_add_epi32(sum0, sum1);
        sum2 = _mm_add_epi32(sum2, sum3);
        vecSum = _mm_add_epi32(vecSum, sum0);
        vecSum = _mm_add_epi32(vecSum, sum2);
    }

    /* Remaining 0..3 samples. x[j] is read as a scalar (sign-extended by
       the usual integer promotion) and broadcast, so nothing outside
       x[0..len-1] is touched. The y load covers y[j] .. y[j + 3]. */
    for (;j<len;j++)
    {
        vecX0 = _mm_set1_epi32(x[j]);
        vecY0 = OP_CVTEPI16_EPI32_M64(&y[j]);

        vecSum = _mm_add_epi32(vecSum, _mm_mullo_epi32(vecX0, vecY0));
    }

    /* Add the new contributions to the caller's running totals. */
    initSum = _mm_loadu_si128((const __m128i *)(&sum[0]));
    initSum = _mm_add_epi32(initSum, vecSum);
    _mm_storeu_si128((__m128i *)sum, initSum);

#ifdef OPUS_CHECK_ASM
    celt_assert(!memcmp(sum_c, sum, sizeof(sum_c)));
#endif
}
240
#endif