/src/opus/celt/x86/pitch_avx.c
/* Copyright (c) 2023 Amazon */
/*
   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:

   - Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.

   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the distribution.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif


#include <immintrin.h>
#include "x86cpu.h"
#include "pitch.h"

#if defined(OPUS_X86_MAY_HAVE_AVX2) && !defined(FIXED_POINT)
/* Like the "regular" xcorr_kernel(), but computes 8 results at a time. */
static void xcorr_kernel_avx(const float *x, const float *y, float sum[8], int len)
{
   __m256 xsum0, xsum1, xsum2, xsum3, xsum4, xsum5, xsum6, xsum7;
   __m256 x0;
   int i;
   xsum7 = xsum6 = xsum5 = xsum4 = xsum3 = xsum2 = xsum1 = xsum0 = _mm256_setzero_ps();
   /* Compute 8 inner products using partial sums. */
   for (i=0;i<len-7;i+=8)
   {
      x0 = _mm256_loadu_ps(x+i);
      xsum0 = _mm256_fmadd_ps(x0, _mm256_loadu_ps(y+i  ), xsum0);
      xsum1 = _mm256_fmadd_ps(x0, _mm256_loadu_ps(y+i+1), xsum1);
      xsum2 = _mm256_fmadd_ps(x0, _mm256_loadu_ps(y+i+2), xsum2);
      xsum3 = _mm256_fmadd_ps(x0, _mm256_loadu_ps(y+i+3), xsum3);
      xsum4 = _mm256_fmadd_ps(x0, _mm256_loadu_ps(y+i+4), xsum4);
      xsum5 = _mm256_fmadd_ps(x0, _mm256_loadu_ps(y+i+5), xsum5);
      xsum6 = _mm256_fmadd_ps(x0, _mm256_loadu_ps(y+i+6), xsum6);
      xsum7 = _mm256_fmadd_ps(x0, _mm256_loadu_ps(y+i+7), xsum7);
   }
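   /* Fewer than 8 samples remain when len is not a multiple of 8. The offset
      into the mask table below leaves exactly len-i words of -1 (followed by
      zeros) in m, so _mm256_maskload_ps() reads only the valid tail samples
      and keeps the masked-out lanes at zero. */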
   if (i != len) {
      static const int mask[15] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0};
      __m256i m;
      m = _mm256_loadu_si256((__m256i*)(void*)(mask + 7+i-len));
      x0 = _mm256_maskload_ps(x+i, m);
      xsum0 = _mm256_fmadd_ps(x0, _mm256_maskload_ps(y+i  , m), xsum0);
      xsum1 = _mm256_fmadd_ps(x0, _mm256_maskload_ps(y+i+1, m), xsum1);
      xsum2 = _mm256_fmadd_ps(x0, _mm256_maskload_ps(y+i+2, m), xsum2);
      xsum3 = _mm256_fmadd_ps(x0, _mm256_maskload_ps(y+i+3, m), xsum3);
      xsum4 = _mm256_fmadd_ps(x0, _mm256_maskload_ps(y+i+4, m), xsum4);
      xsum5 = _mm256_fmadd_ps(x0, _mm256_maskload_ps(y+i+5, m), xsum5);
      xsum6 = _mm256_fmadd_ps(x0, _mm256_maskload_ps(y+i+6, m), xsum6);
      xsum7 = _mm256_fmadd_ps(x0, _mm256_maskload_ps(y+i+7, m), xsum7);
   }
   /* 8 horizontal adds. */
   /* Compute [0 4] [1 5] [2 6] [3 7] */
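   /* In _mm256_permute2f128_ps(), the immediate 2<<4 selects the low 128-bit
      halves of both accumulators and 1|(3<<4) selects the high halves, so each
      _mm256_add_ps() leaves the four partial sums of one correlation in the low
      lane and those of another in the high lane. */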
   xsum0 = _mm256_add_ps(_mm256_permute2f128_ps(xsum0, xsum4, 2<<4), _mm256_permute2f128_ps(xsum0, xsum4, 1 | (3<<4)));
   xsum1 = _mm256_add_ps(_mm256_permute2f128_ps(xsum1, xsum5, 2<<4), _mm256_permute2f128_ps(xsum1, xsum5, 1 | (3<<4)));
   xsum2 = _mm256_add_ps(_mm256_permute2f128_ps(xsum2, xsum6, 2<<4), _mm256_permute2f128_ps(xsum2, xsum6, 1 | (3<<4)));
   xsum3 = _mm256_add_ps(_mm256_permute2f128_ps(xsum3, xsum7, 2<<4), _mm256_permute2f128_ps(xsum3, xsum7, 1 | (3<<4)));
   /* Compute [0 1 4 5] [2 3 6 7] */
   xsum0 = _mm256_hadd_ps(xsum0, xsum1);
   xsum1 = _mm256_hadd_ps(xsum2, xsum3);
   /* Compute [0 1 2 3 4 5 6 7] */
   xsum0 = _mm256_hadd_ps(xsum0, xsum1);
   _mm256_storeu_ps(sum, xsum0);
}
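
/* Illustrative reference only (not part of the original file): a plain-C
   sketch of what xcorr_kernel_avx() above computes, i.e. the inner product
   of x with y delayed by 0..7 samples, sum[k] = sum_i x[i]*y[i+k].
   The name xcorr_kernel_ref is hypothetical; the block is disabled so the
   file still compiles unchanged. */
#if 0
static void xcorr_kernel_ref(const float *x, const float *y, float sum[8], int len)
{
   int i, k;
   for (k=0;k<8;k++)
   {
      float acc = 0;
      for (i=0;i<len;i++)
         acc += x[i]*y[i+k];
      sum[k] = acc;
   }
}
#endif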

void celt_pitch_xcorr_avx2(const float *_x, const float *_y, float *xcorr, int len, int max_pitch, int arch)
{
   int i;
   celt_assert(max_pitch>0);
   (void)arch;
   for (i=0;i<max_pitch-7;i+=8)
   {
      xcorr_kernel_avx(_x, _y+i, &xcorr[i], len);
   }
   for (;i<max_pitch;i++)
   {
      xcorr[i] = celt_inner_prod(_x, _y+i, len, arch);
   }
}
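
/* Illustrative reference only (not part of the original file): the scalar
   loop that celt_pitch_xcorr_avx2() accelerates. Each xcorr[i] is the
   correlation of _x with _y delayed by i samples; the AVX2 kernel above
   produces 8 such lags per call, and any leftover lags (max_pitch not a
   multiple of 8) fall back to celt_inner_prod(). The name
   celt_pitch_xcorr_ref is hypothetical and the block is disabled. */
#if 0
static void celt_pitch_xcorr_ref(const float *_x, const float *_y, float *xcorr, int len, int max_pitch)
{
   int i, j;
   for (i=0;i<max_pitch;i++)
   {
      float acc = 0;
      for (j=0;j<len;j++)
         acc += _x[j]*_y[j+i];
      xcorr[i] = acc;
   }
}
#endif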

#endif