/src/vvenc/source/Lib/CommonLib/x86/AffineGradientSearchX86.h
Line | Count | Source |
1 | | /* ----------------------------------------------------------------------------- |
2 | | The copyright in this software is being made available under the Clear BSD |
3 | | License, included below. No patent rights, trademark rights and/or |
4 | | other Intellectual Property Rights other than the copyrights concerning |
5 | | the Software are granted under this license. |
6 | | |
7 | | The Clear BSD License |
8 | | |
9 | | Copyright (c) 2019-2026, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors. |
10 | | All rights reserved. |
11 | | |
12 | | Redistribution and use in source and binary forms, with or without modification, |
13 | | are permitted (subject to the limitations in the disclaimer below) provided that |
14 | | the following conditions are met: |
15 | | |
16 | | * Redistributions of source code must retain the above copyright notice, |
17 | | this list of conditions and the following disclaimer. |
18 | | |
19 | | * Redistributions in binary form must reproduce the above copyright |
20 | | notice, this list of conditions and the following disclaimer in the |
21 | | documentation and/or other materials provided with the distribution. |
22 | | |
23 | | * Neither the name of the copyright holder nor the names of its |
24 | | contributors may be used to endorse or promote products derived from this |
25 | | software without specific prior written permission. |
26 | | |
27 | | NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY |
28 | | THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND |
29 | | CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
30 | | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A |
31 | | PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR |
32 | | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
33 | | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
34 | | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR |
35 | | BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER |
36 | | IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
37 | | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
38 | | POSSIBILITY OF SUCH DAMAGE. |
39 | | |
40 | | |
41 | | ------------------------------------------------------------------------------------------- */ |
42 | | /** |
43 | | * \file |
44 | | * \brief Implementation of AffineGradientSearch class |
45 | | */ |
46 | | //#define USE_AVX2 |
47 | | // ==================================================================================================================== |
48 | | // Includes |
49 | | // ==================================================================================================================== |
50 | | |
51 | | #include "CommonDefX86.h" |
52 | | #include "../AffineGradientSearch.h" |
53 | | |
54 | | //! \ingroup CommonLib |
55 | | //! \{ |
56 | | |
57 | | #if defined(TARGET_SIMD_X86) && ENABLE_SIMD_OPT_AFFINE_ME |
58 | | |
59 | | namespace vvenc { |
60 | | |
template<X86_VEXT vext>
static void simdHorizontalSobelFilter(Pel* const pPred, const int predStride, Pel *const pDerivate, const int derivateBufStride, const int width, const int height)
{
  // Horizontal Sobel gradient of the prediction block pPred (stride
  // predStride) written to pDerivate (stride derivateBufStride), computed
  // 8 samples per SSE iteration.  width must be a multiple of 8.
  CHECK( width % 8, "Invalid size!" );

  // pPred is 10-bit

  // Sobel kernel applied per interior output sample:
  // -1 0 1
  // -2 0 2
  // -1 0 1
  //
  // sum( sobel ) = 8, i.e. 4-bit extension
  // (so with 10-bit input the result still fits a signed 16-bit lane)

  for( int y = 1; y < ( height - 1 ); y++ )
  {
    int x = 1;
    // Main loop: full 16-byte stores.  It deliberately stops one vector
    // early (x < width - 8) so the tail below can avoid writing past the
    // last interior column.
    for( ; x < ( width - 8 ); x += 8 )
    {
      // (right column - left column), middle row weighted by 2 via the shift
      __m128i acc = _mm_loadu_si128( ( const __m128i* ) &pPred[y * predStride + x - 1] );
      acc = _mm_sub_epi16( _mm_loadu_si128( ( const __m128i* ) &pPred[y * predStride + x + 1] ), acc );
      acc = _mm_slli_epi16( acc, 1 );
      acc = _mm_sub_epi16( acc, _mm_loadu_si128( ( const __m128i* ) &pPred[( y - 1 ) * predStride + x - 1] ) );
      acc = _mm_add_epi16( acc, _mm_loadu_si128( ( const __m128i* ) &pPred[( y - 1 ) * predStride + x + 1] ) );
      acc = _mm_sub_epi16( acc, _mm_loadu_si128( ( const __m128i* ) &pPred[( y + 1 ) * predStride + x - 1] ) );
      acc = _mm_add_epi16( acc, _mm_loadu_si128( ( const __m128i* ) &pPred[( y + 1 ) * predStride + x + 1] ) );

      _mm_storeu_si128( ( __m128i* ) &pDerivate[y * derivateBufStride + x], acc );
    }

    // Tail (x == width - 7 here): compute 8 samples but store only the 6
    // interior ones (x .. x+5 == width-2) as a 64-bit store plus a 32-bit
    // store of the upper half; the border columns are padded below.
    __m128i acc = _mm_loadu_si128( ( const __m128i* ) &pPred[y * predStride + x - 1] );
    acc = _mm_sub_epi16( _mm_loadu_si128( ( const __m128i* ) &pPred[y * predStride + x + 1] ), acc );
    acc = _mm_slli_epi16( acc, 1 );
    acc = _mm_sub_epi16( acc, _mm_loadu_si128( ( const __m128i* ) &pPred[( y - 1 ) * predStride + x - 1] ) );
    acc = _mm_add_epi16( acc, _mm_loadu_si128( ( const __m128i* ) &pPred[( y - 1 ) * predStride + x + 1] ) );
    acc = _mm_sub_epi16( acc, _mm_loadu_si128( ( const __m128i* ) &pPred[( y + 1 ) * predStride + x - 1] ) );
    acc = _mm_add_epi16( acc, _mm_loadu_si128( ( const __m128i* ) &pPred[( y + 1 ) * predStride + x + 1] ) );

    _vv_storel_epi64( ( __m128i* ) &pDerivate[y * derivateBufStride + x], acc );
    _mm_storeu_si32 ( &pDerivate[y * derivateBufStride + x + 4], _mm_unpackhi_epi64( acc, acc ) );

    // Pad the left/right border columns by replicating the adjacent interior sample.
    pDerivate[y * derivateBufStride] = pDerivate[y * derivateBufStride + 1];
    pDerivate[y * derivateBufStride + (width - 1)] = pDerivate[y * derivateBufStride + (width - 2)];
  }

  // Pad the top and bottom rows by replicating the adjacent interior row.
  memcpy( pDerivate, pDerivate + derivateBufStride, width * sizeof( pDerivate[ 0 ] ) );
  memcpy( pDerivate + ( height - 1 ) * derivateBufStride, pDerivate + ( height - 2 ) * derivateBufStride, width * sizeof( pDerivate[ 0 ] ) );
}
108 | | |
template<X86_VEXT vext>
static void simdVerticalSobelFilter(Pel* const pPred, const int predStride, Pel *const pDerivate, const int derivateBufStride, const int width, const int height)
{
  // Vertical Sobel gradient of the prediction block pPred (stride
  // predStride) written to pDerivate (stride derivateBufStride), computed
  // 8 samples per SSE iteration.  width must be a multiple of 8.
  CHECK( width % 8, "Invalid size!" );

  // pPred is 10-bit

  // Sobel kernel applied per interior output sample:
  // -1 -2 -1
  //  0  0  0
  //  1  2  1
  //
  // sum( sobel ) = 8, i.e. 4-bit extension
  // (so with 10-bit input the result still fits a signed 16-bit lane)

  for( int y = 1; y < ( height - 1 ); y++ )
  {
    int x = 1;
    // Main loop: full 16-byte stores; stops one vector early so the tail
    // below never writes past the last interior column.
    for( ; x < ( width - 8 ); x += 8 )
    {
      // (bottom row - top row), middle column weighted by 2 via the shift
      __m128i acc = _mm_loadu_si128( ( const __m128i* ) &pPred[( y - 1 ) * predStride + x] );
      acc = _mm_sub_epi16( _mm_loadu_si128( ( const __m128i* ) &pPred[( y + 1 ) * predStride + x] ), acc );
      acc = _mm_slli_epi16( acc, 1 );
      acc = _mm_sub_epi16( acc, _mm_loadu_si128( ( const __m128i* ) &pPred[( y - 1 ) * predStride + x - 1] ) );
      acc = _mm_sub_epi16( acc, _mm_loadu_si128( ( const __m128i* ) &pPred[( y - 1 ) * predStride + x + 1] ) );
      acc = _mm_add_epi16( acc, _mm_loadu_si128( ( const __m128i* ) &pPred[( y + 1 ) * predStride + x - 1] ) );
      acc = _mm_add_epi16( acc, _mm_loadu_si128( ( const __m128i* ) &pPred[( y + 1 ) * predStride + x + 1] ) );

      _mm_storeu_si128( ( __m128i* ) &pDerivate[y * derivateBufStride + x], acc );
    }

    // Tail (x == width - 7 here): compute 8 samples but store only the 6
    // interior ones (x .. x+5 == width-2); border columns are padded below.
    __m128i acc = _mm_loadu_si128( ( const __m128i* ) &pPred[( y - 1 ) * predStride + x] );
    acc = _mm_sub_epi16( _mm_loadu_si128( ( const __m128i* ) &pPred[( y + 1 ) * predStride + x] ), acc );
    acc = _mm_slli_epi16( acc, 1 );
    acc = _mm_sub_epi16( acc, _mm_loadu_si128( ( const __m128i* ) &pPred[( y - 1 ) * predStride + x - 1] ) );
    acc = _mm_sub_epi16( acc, _mm_loadu_si128( ( const __m128i* ) &pPred[( y - 1 ) * predStride + x + 1] ) );
    acc = _mm_add_epi16( acc, _mm_loadu_si128( ( const __m128i* ) &pPred[( y + 1 ) * predStride + x - 1] ) );
    acc = _mm_add_epi16( acc, _mm_loadu_si128( ( const __m128i* ) &pPred[( y + 1 ) * predStride + x + 1] ) );

    _vv_storel_epi64( ( __m128i* ) &pDerivate[y * derivateBufStride + x], acc );
    _mm_storeu_si32 ( &pDerivate[y * derivateBufStride + x + 4], _mm_unpackhi_epi64( acc, acc ) );

    // Pad the left/right border columns by replicating the adjacent interior sample.
    pDerivate[y * derivateBufStride] = pDerivate[y * derivateBufStride + 1];
    pDerivate[y * derivateBufStride + (width - 1)] = pDerivate[y * derivateBufStride + (width - 2)];
  }

  // Pad the top and bottom rows by replicating the adjacent interior row.
  memcpy( pDerivate, pDerivate + derivateBufStride, width * sizeof( pDerivate[ 0 ] ) );
  memcpy( pDerivate + (height - 1) * derivateBufStride, pDerivate + (height - 2) * derivateBufStride, width * sizeof( pDerivate[ 0 ] ) );
}
156 | | |
157 | | |
158 | | |
// Multiply-accumulate over 8 pixels (two rows of 4 int32 lanes):
// computes sum(x[i]*y[i]) over all 8 pairs as 64-bit products, adds the
// previously accumulated value loaded from loadLocation, and leaves the
// total in the low 64 bits of inter3.  Because _mm_mul_epi32 multiplies
// only the even 32-bit lanes, x1/x2 and y1/y2 supply the even lanes while
// tmp0..tmp3 are the same vectors pre-shifted right by 4 bytes to expose
// the odd lanes.  inter0..inter3 are clobbered scratch registers.
#define CALC_EQUAL_COEFF_8PXLS(x1,x2,y1,y2,tmp0,tmp1,tmp2,tmp3,inter0,inter1,inter2,inter3,loadLocation) \
{ \
  inter0 = _mm_mul_epi32(x1, y1); \
  inter1 = _mm_mul_epi32(tmp0, tmp2); \
  inter2 = _mm_mul_epi32(x2, y2); \
  inter3 = _mm_mul_epi32(tmp1, tmp3); \
  inter2 = _mm_add_epi64(inter0, inter2); \
  inter3 = _mm_add_epi64(inter1, inter3); \
  inter0 = _vv_loadl_epi64(loadLocation); \
  inter3 = _mm_add_epi64(inter2, inter3); \
  inter1 = _mm_srli_si128(inter3, 8); \
  inter3 = _mm_add_epi64(inter1, inter3); \
  inter3 = _mm_add_epi64(inter0, inter3); \
}
173 | | |
template<X86_VEXT vext, bool b6Param>
static void simdEqualCoeffComputer(Pel* const pResidue, const int residueStride, Pel **const ppDerivate, const int derivateBufStride, const int width, const int height, int64_t(*pEqualCoeff)[7])
{
  // Accumulates the normal-equation system for the affine gradient search:
  // for the n affine basis coefficients (n = 6 for the 6-parameter model,
  // n = 4 for the 4-parameter model) it sums c_col*c_row into
  // pEqualCoeff[col+1][row] (symmetric matrix) and c_col*(residue<<3) into
  // pEqualCoeff[col+1][n], processing 2 rows x 4 columns of pixels per
  // iteration.  ppDerivate[0]/[1] are the horizontal/vertical gradients.
  __m128i mmFour;
  __m128i mmTmp[4];
  __m128i mmIntermediate[4];
  __m128i mmIndxK, mmIndxJ;  // horizontal / vertical position terms
  __m128i mmResidue[2];
  __m128i mmC[12];           // basis coefficients for both pixel rows

  // Add directly to indexes to get new index
  mmFour = _mm_set1_epi32(4);
  // mmIndxJ advances by 4 every 4th row (and mmIndxK by 4 every 4th
  // column), i.e. the position terms appear to be quantized to 4x4
  // sub-block granularity -- NOTE(review): matches the scalar fallback?
  mmIndxJ = _mm_set1_epi32(-2);  // becomes +2 on the first iteration

  static constexpr int n = b6Param ? 6 : 4;
  // Start offsets are pre-decremented so the "+= stride<<1" / "+= 4"
  // updates at the top of each loop yield the first valid position.
  int idx1 = -2 * derivateBufStride - 4;  // gradient row j
  int idx2 = - derivateBufStride - 4;     // gradient row j+1
  int resIdx1 = -2 * residueStride - 4;   // residue row j
  int resIdx2 = - residueStride - 4;      // residue row j+1

  for (int j = 0; j < height; j += 2)
  {
    if (!(j & 3))
      mmIndxJ = _mm_add_epi32(mmIndxJ, mmFour);
    mmIndxK = _mm_set1_epi32(-2);
    idx1 += (derivateBufStride << 1);
    idx2 += (derivateBufStride << 1);
    resIdx1 += (residueStride << 1);
    resIdx2 += (residueStride << 1);

    for (int k = 0; k < width; k += 4)
    {
      idx1 += 4;
      idx2 += 4;
      resIdx1 += 4;
      resIdx2 += 4;
      mmIndxK = _mm_add_epi32(mmIndxK, mmFour);

      if (b6Param)
      {
        // 6-parameter basis: { gx, x*gx, gy, x*gy, y*gx, y*gy }
        // mmC[0-5] for iC[0-5] of 1st row of pixels
        mmC[0] = _mm_cvtepi16_epi32(_vv_loadl_epi64((const __m128i*)&ppDerivate[0][idx1]));
        mmC[2] = _mm_cvtepi16_epi32(_vv_loadl_epi64((const __m128i*)&ppDerivate[1][idx1]));
        mmC[1] = _mm_mullo_epi32(mmIndxK, mmC[0]);
        mmC[3] = _mm_mullo_epi32(mmIndxK, mmC[2]);
        mmC[4] = _mm_mullo_epi32(mmIndxJ, mmC[0]);
        mmC[5] = _mm_mullo_epi32(mmIndxJ, mmC[2]);

        // mmC[6-11] for iC[0-5] of 2nd row of pixels
        mmC[6] = _mm_cvtepi16_epi32(_vv_loadl_epi64((const __m128i*)&ppDerivate[0][idx2]));
        mmC[8] = _mm_cvtepi16_epi32(_vv_loadl_epi64((const __m128i*)&ppDerivate[1][idx2]));
        mmC[7] = _mm_mullo_epi32(mmIndxK, mmC[6]);
        mmC[9] = _mm_mullo_epi32(mmIndxK, mmC[8]);
        mmC[10] = _mm_mullo_epi32(mmIndxJ, mmC[6]);
        mmC[11] = _mm_mullo_epi32(mmIndxJ, mmC[8]);
      }
      else
      {
        // 4-parameter basis: { gx, x*gx + y*gy, gy, y*gx - x*gy }
        // mmC[0-3] for iC[0-3] of 1st row of pixels
        mmC[0] = _mm_cvtepi16_epi32(_vv_loadl_epi64((const __m128i*)&ppDerivate[0][idx1]));
        mmC[2] = _mm_cvtepi16_epi32(_vv_loadl_epi64((const __m128i*)&ppDerivate[1][idx1]));
        mmC[1] = _mm_mullo_epi32(mmIndxK, mmC[0]);
        mmC[3] = _mm_mullo_epi32(mmIndxJ, mmC[0]);
        mmTmp[0] = _mm_mullo_epi32(mmIndxJ, mmC[2]);
        mmTmp[1] = _mm_mullo_epi32(mmIndxK, mmC[2]);
        mmC[1] = _mm_add_epi32(mmC[1], mmTmp[0]);
        mmC[3] = _mm_sub_epi32(mmC[3], mmTmp[1]);

        // mmC[4-7] for iC[0-3] of 2nd row of pixels
        mmC[4] = _mm_cvtepi16_epi32(_vv_loadl_epi64((const __m128i*)&ppDerivate[0][idx2]));
        mmC[6] = _mm_cvtepi16_epi32(_vv_loadl_epi64((const __m128i*)&ppDerivate[1][idx2]));
        mmC[5] = _mm_mullo_epi32(mmIndxK, mmC[4]);
        mmC[7] = _mm_mullo_epi32(mmIndxJ, mmC[4]);
        mmTmp[2] = _mm_mullo_epi32(mmIndxJ, mmC[6]);
        mmTmp[3] = _mm_mullo_epi32(mmIndxK, mmC[6]);
        mmC[5] = _mm_add_epi32(mmC[5], mmTmp[2]);
        mmC[7] = _mm_sub_epi32(mmC[7], mmTmp[3]);
      }

      // Residue, widened to 32 bit and scaled by 8 (the Sobel gain)
      mmResidue[0] = _vv_loadl_epi64((const __m128i*)&pResidue[resIdx1]);
      mmResidue[1] = _vv_loadl_epi64((const __m128i*)&pResidue[resIdx2]);
      mmResidue[0] = _mm_cvtepi16_epi32(mmResidue[0]);
      mmResidue[1] = _mm_cvtepi16_epi32(mmResidue[1]);
      mmResidue[0] = _mm_slli_epi32(mmResidue[0], 3);
      mmResidue[1] = _mm_slli_epi32(mmResidue[1], 3);

      // Calculation of coefficient matrix: upper triangle plus diagonal is
      // computed; each off-diagonal sum is mirrored to keep it symmetric.
      for (int col = 0; col < n; col++)
      {
        mmTmp[0] = _mm_srli_si128(mmC[0 + col], 4);
        mmTmp[1] = _mm_srli_si128(mmC[n + col], 4);
        CALC_EQUAL_COEFF_8PXLS(mmC[0 + col], mmC[n + col], mmC[0 + col], mmC[n + col], mmTmp[0], mmTmp[1], mmTmp[0], mmTmp[1], mmIntermediate[0], mmIntermediate[1], mmIntermediate[2], mmIntermediate[3], (const __m128i*)&pEqualCoeff[col + 1][col]);
        _vv_storel_epi64((__m128i*)&pEqualCoeff[col + 1][col], mmIntermediate[3]);

        for (int row = col + 1; row < n; row++)
        {
          mmTmp[2] = _mm_srli_si128(mmC[0 + row], 4);
          mmTmp[3] = _mm_srli_si128(mmC[n + row], 4);
          CALC_EQUAL_COEFF_8PXLS(mmC[0 + col], mmC[n + col], mmC[0 + row], mmC[n + row], mmTmp[0], mmTmp[1], mmTmp[2], mmTmp[3], mmIntermediate[0], mmIntermediate[1], mmIntermediate[2], mmIntermediate[3], (const __m128i*)&pEqualCoeff[col + 1][row]);
          _vv_storel_epi64((__m128i*)&pEqualCoeff[col + 1][row], mmIntermediate[3]);
          _vv_storel_epi64((__m128i*)&pEqualCoeff[row + 1][col], mmIntermediate[3]);
        }

        // Right-hand side: sum( c_col * residue*8 )
        mmTmp[2] = _mm_srli_si128(mmResidue[0], 4);
        mmTmp[3] = _mm_srli_si128(mmResidue[1], 4);
        CALC_EQUAL_COEFF_8PXLS(mmC[0 + col], mmC[n + col], mmResidue[0], mmResidue[1], mmTmp[0], mmTmp[1], mmTmp[2], mmTmp[3], mmIntermediate[0], mmIntermediate[1], mmIntermediate[2], mmIntermediate[3], (const __m128i*)&pEqualCoeff[col + 1][n]);
        _vv_storel_epi64((__m128i*)&pEqualCoeff[col + 1][n], mmIntermediate[3]);
      }
    }

    // Rewind the horizontal advance before stepping to the next row pair.
    idx1 -= (width);
    idx2 -= (width);
    resIdx1 -= (width);
    resIdx2 -= (width);
  }
}
292 | | |
293 | | #if USE_AVX2 |
294 | | |
// AVX2 variant of CALC_EQUAL_COEFF_8PXLS operating on 256-bit registers
// (two rows of 8 int32 lanes = 16 pixels per invocation).  Products are
// summed per 128-bit lane (note _mm256_srli_si256 shifts within each lane),
// then both lane totals plus the previous accumulator from loadLocation are
// combined into the low 64 bits of the 128-bit result register res.
// inter0..inter3 are clobbered scratch registers.
#define CALC_EQUAL_COEFF_8PXLS_AVX2(x1,x2,y1,y2,tmp0,tmp1,tmp2,tmp3,inter0,inter1,inter2,inter3,res,loadLocation) \
{ \
  inter0 = _mm256_mul_epi32(x1, y1); \
  inter1 = _mm256_mul_epi32(tmp0, tmp2); \
  inter2 = _mm256_mul_epi32(x2, y2); \
  inter3 = _mm256_mul_epi32(tmp1, tmp3); \
  inter2 = _mm256_add_epi64(inter0, inter2); \
  inter3 = _mm256_add_epi64(inter1, inter3); \
  res = _vv_loadl_epi64(loadLocation); \
  inter3 = _mm256_add_epi64(inter2, inter3); \
  inter1 = _mm256_srli_si256(inter3, 8); \
  inter3 = _mm256_add_epi64(inter1, inter3); \
  res = _mm_add_epi64(res, _mm256_castsi256_si128(inter3)); \
  res = _mm_add_epi64(res, _mm256_extracti128_si256(inter3, 1)); \
}
310 | | |
template<bool b6Param>
static void simdEqualCoeffComputer_avx2(Pel* const pResidue, const int residueStride, Pel **const ppDerivate, const int derivateBufStride, const int width, const int height, int64_t(*pEqualCoeff)[7])
{
  // AVX2 version of simdEqualCoeffComputer: accumulates the symmetric
  // normal-equation matrix and right-hand side for the affine gradient
  // search, processing 2 rows x 8 columns of pixels per iteration
  // (n = 6 basis coefficients for the 6-parameter model, n = 4 otherwise).
  __m256i mmFour;
  __m256i mmTmp[4];
  __m256i mmIntermediate[4];
  __m256i mmIndxK, mmIndxJ;  // horizontal / vertical position terms
  __m256i mmResidue[2];
  __m256i mmC[12];           // basis coefficients for both pixel rows
  __m128i mmRes;

  // Add directly to indexes to get new index
  mmFour = _mm256_set1_epi32(4);
  mmIndxJ = _mm256_set1_epi32(-2);  // advances by 4 every 4th row

  static constexpr int n = b6Param ? 6 : 4;
  // Start offsets are pre-decremented so the "+= stride<<1" / "+= 8"
  // updates at the top of each loop yield the first valid position.
  int idx1 = -2 * derivateBufStride - 8;  // gradient row j
  int idx2 = - derivateBufStride - 8;     // gradient row j+1
  int resIdx1 = -2 * residueStride - 8;   // residue row j
  int resIdx2 = - residueStride - 8;      // residue row j+1

  for (int j = 0; j < height; j += 2)
  {
    if (!(j & 3))
      mmIndxJ = _mm256_add_epi32(mmIndxJ, mmFour);
    // Low 128-bit lane covers columns k..k+3, high lane k+4..k+7; starting
    // at -6/-2 so that after the two +4 steps below the lanes hold the same
    // per-4-column values as the SSE path (2 and 6 for the first 8 columns).
    mmIndxK = _mm256_inserti128_si256( _mm256_castsi128_si256( _mm_set1_epi32( -6 ) ), _mm_set1_epi32( -2 ), 1 );
    idx1 += (derivateBufStride << 1);
    idx2 += (derivateBufStride << 1);
    resIdx1 += (residueStride << 1);
    resIdx2 += (residueStride << 1);

    for (int k = 0; k < width; k += 8)
    {
      idx1 += 8;
      idx2 += 8;
      resIdx1 += 8;
      resIdx2 += 8;
      // advance by 8 columns (two adds of 4)
      mmIndxK = _mm256_add_epi32(mmIndxK, mmFour);
      mmIndxK = _mm256_add_epi32(mmIndxK, mmFour);

      if (b6Param)
      {
        // 6-parameter basis: { gx, x*gx, gy, x*gy, y*gx, y*gy }
        // mmC[0-5] for iC[0-5] of 1st row of pixels
        mmC[0] = _mm256_cvtepi16_epi32(_mm_loadu_si128((const __m128i*)&ppDerivate[0][idx1]));
        mmC[2] = _mm256_cvtepi16_epi32(_mm_loadu_si128((const __m128i*)&ppDerivate[1][idx1]));
        mmC[1] = _mm256_mullo_epi32(mmIndxK, mmC[0]);
        mmC[3] = _mm256_mullo_epi32(mmIndxK, mmC[2]);
        mmC[4] = _mm256_mullo_epi32(mmIndxJ, mmC[0]);
        mmC[5] = _mm256_mullo_epi32(mmIndxJ, mmC[2]);

        // mmC[6-11] for iC[0-5] of 2nd row of pixels
        mmC[6] = _mm256_cvtepi16_epi32(_mm_loadu_si128((const __m128i*)&ppDerivate[0][idx2]));
        mmC[8] = _mm256_cvtepi16_epi32(_mm_loadu_si128((const __m128i*)&ppDerivate[1][idx2]));
        mmC[7] = _mm256_mullo_epi32(mmIndxK, mmC[6]);
        mmC[9] = _mm256_mullo_epi32(mmIndxK, mmC[8]);
        mmC[10] = _mm256_mullo_epi32(mmIndxJ, mmC[6]);
        mmC[11] = _mm256_mullo_epi32(mmIndxJ, mmC[8]);
      }
      else
      {
        // 4-parameter basis: { gx, x*gx + y*gy, gy, y*gx - x*gy }
        // mmC[0-3] for iC[0-3] of 1st row of pixels
        mmC[0] = _mm256_cvtepi16_epi32(_mm_loadu_si128((const __m128i*)&ppDerivate[0][idx1]));
        mmC[2] = _mm256_cvtepi16_epi32(_mm_loadu_si128((const __m128i*)&ppDerivate[1][idx1]));
        mmC[1] = _mm256_mullo_epi32(mmIndxK, mmC[0]);
        mmC[3] = _mm256_mullo_epi32(mmIndxJ, mmC[0]);
        mmTmp[0] = _mm256_mullo_epi32(mmIndxJ, mmC[2]);
        mmTmp[1] = _mm256_mullo_epi32(mmIndxK, mmC[2]);
        mmC[1] = _mm256_add_epi32(mmC[1], mmTmp[0]);
        mmC[3] = _mm256_sub_epi32(mmC[3], mmTmp[1]);

        // mmC[4-7] for iC[0-3] of 2nd row of pixels
        mmC[4] = _mm256_cvtepi16_epi32(_mm_loadu_si128((const __m128i*)&ppDerivate[0][idx2]));
        mmC[6] = _mm256_cvtepi16_epi32(_mm_loadu_si128((const __m128i*)&ppDerivate[1][idx2]));
        mmC[5] = _mm256_mullo_epi32(mmIndxK, mmC[4]);
        mmC[7] = _mm256_mullo_epi32(mmIndxJ, mmC[4]);
        mmTmp[2] = _mm256_mullo_epi32(mmIndxJ, mmC[6]);
        mmTmp[3] = _mm256_mullo_epi32(mmIndxK, mmC[6]);
        mmC[5] = _mm256_add_epi32(mmC[5], mmTmp[2]);
        mmC[7] = _mm256_sub_epi32(mmC[7], mmTmp[3]);
      }

      // Residue, widened to 32 bit and scaled by 8 (the Sobel gain)
      mmResidue[0] = _mm256_cvtepi16_epi32(_mm_loadu_si128((const __m128i*)&pResidue[resIdx1]));
      mmResidue[1] = _mm256_cvtepi16_epi32(_mm_loadu_si128((const __m128i*)&pResidue[resIdx2]));
      mmResidue[0] = _mm256_slli_epi32(mmResidue[0], 3);
      mmResidue[1] = _mm256_slli_epi32(mmResidue[1], 3);

      // Calculation of coefficient matrix: upper triangle plus diagonal is
      // computed; each off-diagonal sum is mirrored to keep it symmetric.
      for (int col = 0; col < n; col++)
      {
        mmTmp[0] = _mm256_srli_si256(mmC[0 + col], 4);
        mmTmp[1] = _mm256_srli_si256(mmC[n + col], 4);
        CALC_EQUAL_COEFF_8PXLS_AVX2(mmC[0 + col], mmC[n + col], mmC[0 + col], mmC[n + col], mmTmp[0], mmTmp[1], mmTmp[0], mmTmp[1], mmIntermediate[0], mmIntermediate[1], mmIntermediate[2], mmIntermediate[3], mmRes, (const __m128i*)&pEqualCoeff[col + 1][col]);
        _vv_storel_epi64((__m128i*)&pEqualCoeff[col + 1][col], mmRes);

        for (int row = col + 1; row < n; row++)
        {
          mmTmp[2] = _mm256_srli_si256(mmC[0 + row], 4);
          mmTmp[3] = _mm256_srli_si256(mmC[n + row], 4);
          CALC_EQUAL_COEFF_8PXLS_AVX2(mmC[0 + col], mmC[n + col], mmC[0 + row], mmC[n + row], mmTmp[0], mmTmp[1], mmTmp[2], mmTmp[3], mmIntermediate[0], mmIntermediate[1], mmIntermediate[2], mmIntermediate[3], mmRes, (const __m128i*)&pEqualCoeff[col + 1][row]);
          _vv_storel_epi64((__m128i*)&pEqualCoeff[col + 1][row], mmRes);
          _vv_storel_epi64((__m128i*)&pEqualCoeff[row + 1][col], mmRes);
        }

        // Right-hand side: sum( c_col * residue*8 )
        mmTmp[2] = _mm256_srli_si256(mmResidue[0], 4);
        mmTmp[3] = _mm256_srli_si256(mmResidue[1], 4);
        CALC_EQUAL_COEFF_8PXLS_AVX2(mmC[0 + col], mmC[n + col], mmResidue[0], mmResidue[1], mmTmp[0], mmTmp[1], mmTmp[2], mmTmp[3], mmIntermediate[0], mmIntermediate[1], mmIntermediate[2], mmIntermediate[3], mmRes, (const __m128i*)&pEqualCoeff[col + 1][n]);
        _vv_storel_epi64((__m128i*)&pEqualCoeff[col + 1][n], mmRes);
      }
    }

    // Rewind the horizontal advance before stepping to the next row pair.
    idx1 -= (width);
    idx2 -= (width);
    resIdx1 -= (width);
    resIdx2 -= (width);
  }
}
428 | | #endif |
429 | | |
template <X86_VEXT vext>
void AffineGradientSearch::_initAffineGradientSearchX86()
{
  // Installs the SIMD kernels into the dispatch members.  This header is
  // included once per SIMD-level translation unit (SSE4.1 / AVX2 -- see the
  // explicit instantiation below); USE_AVX2 is defined only in the AVX2
  // build, which therefore installs the 256-bit coefficient computer.
  m_HorizontalSobelFilter = simdHorizontalSobelFilter<vext>;
  m_VerticalSobelFilter = simdVerticalSobelFilter<vext>;
#if USE_AVX2
  m_EqualCoeffComputer[0] = simdEqualCoeffComputer_avx2<false>;  // 4-parameter affine model
  m_EqualCoeffComputer[1] = simdEqualCoeffComputer_avx2<true>;   // 6-parameter affine model
#else
  m_EqualCoeffComputer[0] = simdEqualCoeffComputer<vext, false>; // 4-parameter affine model
  m_EqualCoeffComputer[1] = simdEqualCoeffComputer<vext, true>;  // 6-parameter affine model
#endif
}
443 | | |
444 | | template void AffineGradientSearch::_initAffineGradientSearchX86<SIMDX86>(); |
445 | | |
446 | | } |
447 | | |
448 | | #endif //#ifdef TARGET_SIMD_X86 |
449 | | //! \} |