/src/vvenc/source/Lib/CommonLib/x86/RdCostX86.h
Line | Count | Source |
1 | | /* ----------------------------------------------------------------------------- |
2 | | The copyright in this software is being made available under the Clear BSD |
3 | | License, included below. No patent rights, trademark rights and/or |
4 | | other Intellectual Property Rights other than the copyrights concerning |
5 | | the Software are granted under this license. |
6 | | |
7 | | The Clear BSD License |
8 | | |
9 | | Copyright (c) 2019-2026, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors. |
10 | | All rights reserved. |
11 | | |
12 | | Redistribution and use in source and binary forms, with or without modification, |
13 | | are permitted (subject to the limitations in the disclaimer below) provided that |
14 | | the following conditions are met: |
15 | | |
16 | | * Redistributions of source code must retain the above copyright notice, |
17 | | this list of conditions and the following disclaimer. |
18 | | |
19 | | * Redistributions in binary form must reproduce the above copyright |
20 | | notice, this list of conditions and the following disclaimer in the |
21 | | documentation and/or other materials provided with the distribution. |
22 | | |
23 | | * Neither the name of the copyright holder nor the names of its |
24 | | contributors may be used to endorse or promote products derived from this |
25 | | software without specific prior written permission. |
26 | | |
27 | | NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY |
28 | | THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND |
29 | | CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
30 | | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A |
31 | | PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR |
32 | | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
33 | | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
34 | | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR |
35 | | BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER |
36 | | IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
37 | | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
38 | | POSSIBILITY OF SUCH DAMAGE. |
39 | | |
40 | | |
41 | | ------------------------------------------------------------------------------------------- */ |
42 | | /** \file RdCostX86.h |
43 | | \brief RD cost computation class, SIMD version |
44 | | */ |
45 | | |
46 | | #pragma once |
47 | | |
48 | | #include "CommonDefX86.h" |
49 | | #include "Rom.h" |
50 | | #include "RdCost.h" |
51 | | |
52 | | #include <math.h> |
53 | | |
54 | | #if defined(TARGET_SIMD_X86) && ENABLE_SIMD_OPT_DIST |
55 | | |
56 | | //! \ingroup CommonLib |
57 | | //! \{ |
58 | | |
59 | | namespace vvenc { |
60 | | |
61 | | typedef Pel Torg; |
62 | | typedef Pel Tcur; |
63 | | |
64 | | template<X86_VEXT vext > |
65 | | Distortion RdCost::xGetSSE_SIMD( const DistParam &rcDtParam ) |
66 | 0 | { |
67 | 0 | const Torg* pSrc1 = (const Torg*)rcDtParam.org.buf; |
68 | 0 | const Tcur* pSrc2 = (const Tcur*)rcDtParam.cur.buf; |
69 | 0 | int iRows = rcDtParam.org.height; |
70 | 0 | int iCols = rcDtParam.org.width; |
71 | 0 | const int iStrideSrc1 = rcDtParam.org.stride; |
72 | 0 | const int iStrideSrc2 = rcDtParam.cur.stride; |
73 | |
74 | 0 | const uint32_t uiShift = DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth) << 1; |
75 | 0 | Distortion uiRet = 0; |
76 | |
77 | 0 | if( vext >= AVX2 && ( iCols & 15 ) == 0 ) |
78 | 0 | { |
79 | | #ifdef USE_AVX2 |
80 | | __m256i Sum = _mm256_setzero_si256(); |
81 | 0 | for( int iY = 0; iY < iRows; iY++ ) |
82 | 0 | { |
83 | 0 | for( int iX = 0; iX < iCols; iX+=16 ) |
84 | 0 | { |
85 | 0 | __m256i Src1 = ( _mm256_lddqu_si256( ( __m256i* )( &pSrc1[iX] ) ) ); |
86 | 0 | __m256i Src2 = ( _mm256_lddqu_si256( ( __m256i* )( &pSrc2[iX] ) ) ); |
87 | 0 | __m256i Diff = _mm256_sub_epi16( Src1, Src2 ); |
88 | 0 | __m256i Res = _mm256_madd_epi16( Diff, Diff ); |
89 | 0 | Sum = _mm256_add_epi32( Sum, Res ); |
90 | 0 | } |
91 | 0 | pSrc1 += iStrideSrc1; |
92 | 0 | pSrc2 += iStrideSrc2; |
93 | 0 | } |
94 | | Sum = _mm256_hadd_epi32( Sum, Sum ); |
95 | | Sum = _mm256_hadd_epi32( Sum, Sum ); |
96 | | uiRet = ( _mm_cvtsi128_si32( _mm256_castsi256_si128( Sum ) ) + _mm_cvtsi128_si32( _mm256_castsi256_si128( _mm256_permute2x128_si256( Sum, Sum, 0x11 ) ) ) ) >> uiShift; |
97 | | #endif |
98 | 0 | } |
99 | 0 | else if( ( iCols & 7 ) == 0 ) |
100 | 0 | { |
101 | 0 | __m128i Sum = _mm_setzero_si128(); |
102 | 0 | for( int iY = 0; iY < iRows; iY++ ) |
103 | 0 | { |
104 | 0 | for( int iX = 0; iX < iCols; iX += 8 ) |
105 | 0 | { |
106 | 0 | __m128i Src1 = ( sizeof( Torg ) > 1 ) ? ( _mm_loadu_si128 ( ( const __m128i* )( &pSrc1[iX] ) ) ) : ( _mm_unpacklo_epi8( _vv_loadl_epi64( ( const __m128i* )( &pSrc1[iX] ) ), _mm_setzero_si128() ) ); |
107 | 0 | __m128i Src2 = ( sizeof( Tcur ) > 1 ) ? ( _mm_loadu_si128( ( const __m128i* )( &pSrc2[iX] ) ) ) : ( _mm_unpacklo_epi8( _vv_loadl_epi64( ( const __m128i* )( &pSrc2[iX] ) ), _mm_setzero_si128() ) ); |
108 | 0 | __m128i Diff = _mm_sub_epi16( Src1, Src2 ); |
109 | 0 | __m128i Res = _mm_madd_epi16( Diff, Diff ); |
110 | 0 | Sum = _mm_add_epi32( Sum, Res ); |
111 | 0 | } |
112 | 0 | pSrc1 += iStrideSrc1; |
113 | 0 | pSrc2 += iStrideSrc2; |
114 | 0 | } |
115 | 0 | Sum = _mm_hadd_epi32( Sum, Sum ); |
116 | 0 | Sum = _mm_hadd_epi32( Sum, Sum ); |
117 | 0 | uiRet = _mm_cvtsi128_si32( Sum )>>uiShift; |
118 | 0 | } |
119 | 0 | else |
120 | 0 | { |
121 | 0 | __m128i Sum = _mm_setzero_si128(); |
122 | 0 | for( int iY = 0; iY < iRows; iY++ ) |
123 | 0 | { |
124 | 0 | for( int iX = 0; iX < iCols; iX += 4 ) |
125 | 0 | { |
126 | 0 | __m128i Src1 = ( sizeof( Torg ) > 1 ) ? ( _vv_loadl_epi64( ( const __m128i* )&pSrc1[iX] ) ) : ( _mm_unpacklo_epi8( _mm_cvtsi32_si128( *(const int*)&pSrc1[iX] ), _mm_setzero_si128() ) ); |
127 | 0 | __m128i Src2 = ( sizeof( Tcur ) > 1 ) ? ( _vv_loadl_epi64( ( const __m128i* )&pSrc2[iX] ) ) : ( _mm_unpacklo_epi8( _mm_cvtsi32_si128( *(const int*)&pSrc2[iX] ), _mm_setzero_si128() ) ); |
128 | 0 | __m128i Diff = _mm_sub_epi16( Src1, Src2 ); |
129 | 0 | __m128i Res = _mm_madd_epi16( Diff, Diff ); |
130 | 0 | Sum = _mm_add_epi32( Sum, Res ); |
131 | 0 | } |
132 | 0 | pSrc1 += iStrideSrc1; |
133 | 0 | pSrc2 += iStrideSrc2; |
134 | 0 | } |
135 | 0 | Sum = _mm_hadd_epi32( Sum, Sum ); |
136 | 0 | uiRet = _mm_cvtsi128_si32( Sum )>>uiShift; |
137 | 0 | } |
138 | |
139 | 0 | return uiRet; |
140 | 0 | } |
Unexecuted instantiation: unsigned long vvenc::RdCost::xGetSSE_SIMD<(vvenc::x86_simd::X86_VEXT)1>(vvenc::DistParam const&)
Unexecuted instantiation: unsigned long vvenc::RdCost::xGetSSE_SIMD<(vvenc::x86_simd::X86_VEXT)4>(vvenc::DistParam const&)
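For reference, both vector paths above compute a plain sum of squared differences with a single precision-adjustment shift at the end. A minimal scalar sketch of the same computation (hypothetical helper; it assumes Pel is a 16-bit sample type, as the loads above do):

  static Distortion xGetSSE_scalar( const DistParam& rcDtParam )
  {
    const Pel* pSrc1 = rcDtParam.org.buf;
    const Pel* pSrc2 = rcDtParam.cur.buf;
    const uint32_t uiShift = DISTORTION_PRECISION_ADJUSTMENT( rcDtParam.bitDepth ) << 1;
    Distortion uiSum = 0;
    for( int iY = 0; iY < rcDtParam.org.height; iY++ )
    {
      for( int iX = 0; iX < rcDtParam.org.width; iX++ )
      {
        const int iDiff = pSrc1[iX] - pSrc2[iX];
        uiSum += iDiff * iDiff;   // _mm_madd_epi16 above fuses two of these products per lane pair
      }
      pSrc1 += rcDtParam.org.stride;
      pSrc2 += rcDtParam.cur.stride;
    }
    return uiSum >> uiShift;      // same final precision adjustment as the SIMD code
  }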
141 | | |
142 | | |
143 | | template<int iWidth, X86_VEXT vext > |
144 | | Distortion RdCost::xGetSSE_NxN_SIMD( const DistParam &rcDtParam ) |
145 | 0 | { |
146 | 0 | const Torg* pSrc1 = (const Torg*)rcDtParam.org.buf; |
147 | 0 | const Tcur* pSrc2 = (const Tcur*)rcDtParam.cur.buf; |
148 | 0 | int iRows = rcDtParam.org.height; |
149 | 0 | const int iStrideSrc1 = rcDtParam.org.stride; |
150 | 0 | const int iStrideSrc2 = rcDtParam.cur.stride; |
151 | |
152 | 0 | const uint32_t uiShift = DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth) << 1; |
153 | 0 | Distortion uiRet = 0; |
154 | |
155 | 0 | if( 4 == iWidth ) |
156 | 0 | { |
157 | 0 | __m128i Sum = _mm_setzero_si128(); |
158 | 0 | for( int iY = 0; iY < iRows; iY++ ) |
159 | 0 | { |
160 | 0 | __m128i Src1 = ( sizeof( Torg ) > 1 ) ? ( _vv_loadl_epi64( ( const __m128i* )pSrc1 ) ) : ( _mm_unpacklo_epi8( _mm_cvtsi32_si128( *(const int*)pSrc1 ), _mm_setzero_si128() ) ); |
161 | 0 | __m128i Src2 = ( sizeof( Tcur ) > 1 ) ? ( _vv_loadl_epi64( ( const __m128i* )pSrc2 ) ) : ( _mm_unpacklo_epi8( _mm_cvtsi32_si128( *(const int*)pSrc2 ), _mm_setzero_si128() ) ); |
162 | 0 | pSrc1 += iStrideSrc1; |
163 | 0 | pSrc2 += iStrideSrc2; |
164 | 0 | __m128i Diff = _mm_sub_epi16( Src1, Src2 ); |
165 | 0 | __m128i Res = _mm_madd_epi16( Diff, Diff ); |
166 | 0 | Sum = _mm_add_epi32( Sum, Res ); |
167 | 0 | } |
168 | 0 | Sum = _mm_hadd_epi32( Sum, Sum ); |
169 | 0 | uiRet = _mm_cvtsi128_si32( Sum )>>uiShift; |
170 | 0 | } |
171 | 0 | else |
172 | 0 | { |
173 | 0 | if( vext >= AVX2 && iWidth >= 16 ) |
174 | 0 | { |
175 | | #ifdef USE_AVX2 |
176 | | __m256i Sum = _mm256_setzero_si256(); |
177 | 0 | for( int iY = 0; iY < iRows; iY++ ) |
178 | 0 | { |
179 | 0 | for( int iX = 0; iX < iWidth; iX+=16 ) |
180 | 0 | { |
181 | 0 | __m256i Src1 = ( sizeof( Torg ) > 1 ) ? ( _mm256_lddqu_si256( ( __m256i* )( &pSrc1[iX] ) ) ) : ( _mm256_unpacklo_epi8( _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_loadu_si128( ( __m128i* )( &pSrc1[iX] ) ) ), 0xD8 ), _mm256_setzero_si256() ) ); |
182 | 0 | __m256i Src2 = ( sizeof( Tcur ) > 1 ) ? ( _mm256_lddqu_si256( ( __m256i* )( &pSrc2[iX] ) ) ) : ( _mm256_unpacklo_epi8( _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_loadu_si128( ( __m128i* )( &pSrc2[iX] ) ) ), 0xD8 ), _mm256_setzero_si256() ) ); |
183 | 0 | __m256i Diff = _mm256_sub_epi16( Src1, Src2 ); |
184 | 0 | __m256i Res = _mm256_madd_epi16( Diff, Diff ); |
185 | 0 | Sum = _mm256_add_epi32( Sum, Res ); |
186 | 0 | } |
187 | 0 | pSrc1 += iStrideSrc1; |
188 | 0 | pSrc2 += iStrideSrc2; |
189 | 0 | } |
190 | | |
191 | | __m256i vzero = _mm256_setzero_si256(); |
192 | | Sum = _mm256_add_epi64( _mm256_unpacklo_epi32( Sum, vzero ), _mm256_unpackhi_epi32( Sum, vzero )); |
193 | | Sum = _mm256_add_epi64( Sum, _mm256_permute4x64_epi64( Sum, 14 ) ); |
194 | | Sum = _mm256_add_epi64( Sum, _mm256_permute4x64_epi64( Sum, 1 ) ); |
195 | | uiRet = _mm_cvtsi128_si64( _mm256_castsi256_si128( Sum ))>>uiShift; |
196 | | #endif |
197 | 0 | } |
198 | 0 | else |
199 | 0 | { |
200 | 0 | __m128i Sum = _mm_setzero_si128(); |
201 | 0 | for( int iY = 0; iY < iRows; iY++ ) |
202 | 0 | { |
203 | 0 | for( int iX = 0; iX < iWidth; iX+=8 ) |
204 | 0 | { |
205 | 0 | __m128i Src1 = ( sizeof( Torg ) > 1 ) ? ( _mm_loadu_si128( ( const __m128i* )( &pSrc1[iX] ) ) ) : ( _mm_unpacklo_epi8( _vv_loadl_epi64( ( const __m128i* )( &pSrc1[iX] ) ), _mm_setzero_si128() ) ); |
206 | 0 | __m128i Src2 = ( sizeof( Tcur ) > 1 ) ? ( _mm_loadu_si128( ( const __m128i* )( &pSrc2[iX] ) ) ) : ( _mm_unpacklo_epi8( _vv_loadl_epi64( ( const __m128i* )( &pSrc2[iX] ) ), _mm_setzero_si128() ) ); |
207 | 0 | __m128i Diff = _mm_sub_epi16( Src1, Src2 ); |
208 | 0 | __m128i Res = _mm_madd_epi16( Diff, Diff ); |
209 | 0 | Sum = _mm_add_epi32( Sum, Res ); |
210 | 0 | } |
211 | 0 | pSrc1 += iStrideSrc1; |
212 | 0 | pSrc2 += iStrideSrc2; |
213 | 0 | } |
214 | |
215 | 0 | __m128i vzero = _mm_setzero_si128(); |
216 | 0 | Sum = _mm_add_epi64( _mm_unpacklo_epi32( Sum, vzero ), _mm_unpackhi_epi32( Sum, vzero )); |
217 | 0 | uiRet = (_mm_cvtsi128_si64( Sum ) + _mm_extract_epi64( Sum, 1 ))>>uiShift; |
218 | 0 | } |
219 | 0 | } |
220 | |
221 | 0 | return uiRet; |
222 | 0 | } |
Unexecuted instantiation: unsigned long vvenc::RdCost::xGetSSE_NxN_SIMD<4, (vvenc::x86_simd::X86_VEXT)1>(vvenc::DistParam const&)
Unexecuted instantiation: unsigned long vvenc::RdCost::xGetSSE_NxN_SIMD<8, (vvenc::x86_simd::X86_VEXT)1>(vvenc::DistParam const&)
Unexecuted instantiation: unsigned long vvenc::RdCost::xGetSSE_NxN_SIMD<16, (vvenc::x86_simd::X86_VEXT)1>(vvenc::DistParam const&)
Unexecuted instantiation: unsigned long vvenc::RdCost::xGetSSE_NxN_SIMD<32, (vvenc::x86_simd::X86_VEXT)1>(vvenc::DistParam const&)
Unexecuted instantiation: unsigned long vvenc::RdCost::xGetSSE_NxN_SIMD<64, (vvenc::x86_simd::X86_VEXT)1>(vvenc::DistParam const&)
Unexecuted instantiation: unsigned long vvenc::RdCost::xGetSSE_NxN_SIMD<128, (vvenc::x86_simd::X86_VEXT)1>(vvenc::DistParam const&)
Unexecuted instantiation: unsigned long vvenc::RdCost::xGetSSE_NxN_SIMD<4, (vvenc::x86_simd::X86_VEXT)4>(vvenc::DistParam const&)
Unexecuted instantiation: unsigned long vvenc::RdCost::xGetSSE_NxN_SIMD<8, (vvenc::x86_simd::X86_VEXT)4>(vvenc::DistParam const&)
Unexecuted instantiation: unsigned long vvenc::RdCost::xGetSSE_NxN_SIMD<16, (vvenc::x86_simd::X86_VEXT)4>(vvenc::DistParam const&)
Unexecuted instantiation: unsigned long vvenc::RdCost::xGetSSE_NxN_SIMD<32, (vvenc::x86_simd::X86_VEXT)4>(vvenc::DistParam const&)
Unexecuted instantiation: unsigned long vvenc::RdCost::xGetSSE_NxN_SIMD<64, (vvenc::x86_simd::X86_VEXT)4>(vvenc::DistParam const&)
Unexecuted instantiation: unsigned long vvenc::RdCost::xGetSSE_NxN_SIMD<128, (vvenc::x86_simd::X86_VEXT)4>(vvenc::DistParam const&)
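Note the reduction at the end of both branches: unlike xGetSSE_SIMD, this variant widens the packed 32-bit partial sums to 64 bit so that the final reduction is carried out in 64 bit. A sketch of the SSE step in isolation (hypothetical helper; _mm_extract_epi64 needs SSE4.1 on x86-64):

  static inline uint64_t xHorizontalSum32to64( __m128i vSum32 )
  {
    const __m128i vzero = _mm_setzero_si128();
    // zero-extend the four non-negative 32-bit lanes to 64 bit and add pairwise
    const __m128i vSum64 = _mm_add_epi64( _mm_unpacklo_epi32( vSum32, vzero ),
                                          _mm_unpackhi_epi32( vSum32, vzero ) );
    return (uint64_t)_mm_cvtsi128_si64( vSum64 ) + (uint64_t)_mm_extract_epi64( vSum64, 1 );
  }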
223 | | |
224 | | template< X86_VEXT vext > |
225 | | Distortion RdCost::xGetSAD_SIMD( const DistParam &rcDtParam ) |
226 | 0 | { |
227 | 0 | if( rcDtParam.org.width < 4 ) |
228 | 0 | return RdCost::xGetSAD( rcDtParam ); |
229 | | |
230 | 0 | const short* pSrc1 = (const short*)rcDtParam.org.buf; |
231 | 0 | const short* pSrc2 = (const short*)rcDtParam.cur.buf; |
232 | 0 | int iRows = rcDtParam.org.height; |
233 | 0 | int iCols = rcDtParam.org.width; |
234 | 0 | int iSubShift = rcDtParam.subShift; |
235 | 0 | int iSubStep = ( 1 << iSubShift ); |
236 | 0 | const int iStrideSrc1 = rcDtParam.org.stride * iSubStep; |
237 | 0 | const int iStrideSrc2 = rcDtParam.cur.stride * iSubStep; |
238 | |
239 | 0 | uint32_t uiSum = 0; |
240 | 0 | if( vext >= AVX2 && ( iCols & 15 ) == 0 ) |
241 | 0 | { |
242 | | #ifdef USE_AVX2 |
243 | | // process widths that are a multiple of 16 |
244 | | __m256i vzero = _mm256_setzero_si256(); |
245 | | __m256i vsum32 = vzero; |
246 | 0 | for( int iY = 0; iY < iRows; iY+=iSubStep ) |
247 | 0 | { |
248 | 0 | __m256i vsum16 = vzero; |
249 | 0 | for( int iX = 0; iX < iCols; iX+=16 ) |
250 | 0 | { |
251 | 0 | __m256i vsrc1 = _mm256_lddqu_si256( ( __m256i* )( &pSrc1[iX] ) ); |
252 | 0 | __m256i vsrc2 = _mm256_lddqu_si256( ( __m256i* )( &pSrc2[iX] ) ); |
253 | 0 | vsum16 = _mm256_add_epi16( vsum16, _mm256_abs_epi16( _mm256_sub_epi16( vsrc1, vsrc2 ) ) ); |
254 | 0 | } |
255 | 0 | __m256i vsumtemp = _mm256_add_epi32( _mm256_unpacklo_epi16( vsum16, vzero ), _mm256_unpackhi_epi16( vsum16, vzero ) ); |
256 | 0 | vsum32 = _mm256_add_epi32( vsum32, vsumtemp ); |
257 | 0 | pSrc1 += iStrideSrc1; |
258 | 0 | pSrc2 += iStrideSrc2; |
259 | 0 | } |
260 | | vsum32 = _mm256_hadd_epi32( vsum32, vzero ); |
261 | | vsum32 = _mm256_hadd_epi32( vsum32, vzero ); |
262 | | uiSum = _mm_cvtsi128_si32( _mm256_castsi256_si128( vsum32 ) ) + _mm_cvtsi128_si32( _mm256_castsi256_si128( _mm256_permute2x128_si256( vsum32, vsum32, 0x11 ) ) ); |
263 | | #endif |
264 | 0 | } |
265 | 0 | else if( ( iCols & 7 ) == 0 ) |
266 | 0 | { |
267 | | // process each row in steps of 8 columns |
268 | 0 | __m128i vzero = _mm_setzero_si128(); |
269 | 0 | __m128i vsum32 = vzero; |
270 | 0 | for( int iY = 0; iY < iRows; iY+=iSubStep ) |
271 | 0 | { |
272 | 0 | __m128i vsum16 = vzero; |
273 | 0 | for( int iX = 0; iX < iCols; iX+=8 ) |
274 | 0 | { |
275 | 0 | __m128i vsrc1 = _mm_loadu_si128( ( const __m128i* )( &pSrc1[iX] ) ); |
276 | 0 | __m128i vsrc2 = _mm_loadu_si128( ( const __m128i* )( &pSrc2[iX] ) ); |
277 | 0 | vsum16 = _mm_add_epi16( vsum16, _mm_abs_epi16( _mm_sub_epi16( vsrc1, vsrc2 ) ) ); |
278 | 0 | } |
279 | 0 | __m128i vsumtemp = _mm_add_epi32( _mm_unpacklo_epi16( vsum16, vzero ), _mm_unpackhi_epi16( vsum16, vzero ) ); |
280 | 0 | vsum32 = _mm_add_epi32( vsum32, vsumtemp ); |
281 | 0 | pSrc1 += iStrideSrc1; |
282 | 0 | pSrc2 += iStrideSrc2; |
283 | 0 | } |
284 | 0 | vsum32 = _mm_hadd_epi32( vsum32, vzero ); |
285 | 0 | vsum32 = _mm_hadd_epi32( vsum32, vzero ); |
286 | 0 | uiSum = _mm_cvtsi128_si32( vsum32 ); |
287 | 0 | } |
288 | 0 | else |
289 | 0 | { |
290 | | // process each row in steps of 4 columns |
291 | 0 | CHECK( ( iCols & 3 ) != 0, "Not divisible by 4: " << iCols ); |
292 | 0 | __m128i vzero = _mm_setzero_si128(); |
293 | 0 | __m128i vsum32 = vzero; |
294 | 0 | for( int iY = 0; iY < iRows; iY += iSubStep ) |
295 | 0 | { |
296 | 0 | __m128i vsum16 = vzero; |
297 | 0 | for( int iX = 0; iX < iCols; iX+=4 ) |
298 | 0 | { |
299 | 0 | __m128i vsrc1 = _vv_loadl_epi64( ( const __m128i* )&pSrc1[iX] ); |
300 | 0 | __m128i vsrc2 = _vv_loadl_epi64( ( const __m128i* )&pSrc2[iX] ); |
301 | 0 | vsum16 = _mm_add_epi16( vsum16, _mm_abs_epi16( _mm_sub_epi16( vsrc1, vsrc2 ) ) ); |
302 | 0 | } |
303 | 0 | __m128i vsumtemp = _mm_add_epi32( _mm_unpacklo_epi16( vsum16, vzero ), _mm_unpackhi_epi16( vsum16, vzero ) ); |
304 | 0 | vsum32 = _mm_add_epi32( vsum32, vsumtemp ); |
305 | 0 | pSrc1 += iStrideSrc1; |
306 | 0 | pSrc2 += iStrideSrc2; |
307 | 0 | } |
308 | 0 | vsum32 = _mm_hadd_epi32( vsum32, vzero ); |
309 | 0 | vsum32 = _mm_hadd_epi32( vsum32, vzero ); |
310 | 0 | uiSum = _mm_cvtsi128_si32( vsum32 ); |
311 | 0 | } |
312 | | |
313 | 0 | uiSum <<= iSubShift; |
314 | 0 | return uiSum >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth); |
315 | 0 | } |
Unexecuted instantiation: unsigned long vvenc::RdCost::xGetSAD_SIMD<(vvenc::x86_simd::X86_VEXT)1>(vvenc::DistParam const&)
Unexecuted instantiation: unsigned long vvenc::RdCost::xGetSAD_SIMD<(vvenc::x86_simd::X86_VEXT)4>(vvenc::DistParam const&)
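The subShift handling is easiest to follow in scalar form: only every (1 << subShift)-th row is visited, and the sum is scaled back up before the precision adjustment. A minimal sketch under the same assumptions as above (hypothetical helper; Pel is a 16-bit sample type):

  static Distortion xGetSAD_scalar( const DistParam& rcDtParam )
  {
    const Pel* pSrc1 = rcDtParam.org.buf;
    const Pel* pSrc2 = rcDtParam.cur.buf;
    const int  iSubStep = 1 << rcDtParam.subShift;    // row subsampling factor
    uint32_t   uiSum    = 0;
    for( int iY = 0; iY < rcDtParam.org.height; iY += iSubStep )
    {
      for( int iX = 0; iX < rcDtParam.org.width; iX++ )
      {
        const int iDiff = pSrc1[iX] - pSrc2[iX];
        uiSum += iDiff < 0 ? -iDiff : iDiff;
      }
      pSrc1 += rcDtParam.org.stride * iSubStep;       // skip the subsampled rows
      pSrc2 += rcDtParam.cur.stride * iSubStep;
    }
    uiSum <<= rcDtParam.subShift;                     // compensate for the skipped rows
    return uiSum >> DISTORTION_PRECISION_ADJUSTMENT( rcDtParam.bitDepth );
  }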
316 | | |
317 | | |
318 | | template< int iWidth, X86_VEXT vext > |
319 | | Distortion RdCost::xGetSAD_NxN_SIMD( const DistParam &rcDtParam ) |
320 | 0 | { |
321 | | // assert( rcDtParam.iCols == iWidth); |
322 | 0 | const short* pSrc1 = (const short*)rcDtParam.org.buf; |
323 | 0 | const short* pSrc2 = (const short*)rcDtParam.cur.buf; |
324 | 0 | int iRows = rcDtParam.org.height; |
325 | 0 | int iSubShift = rcDtParam.subShift; |
326 | 0 | int iSubStep = ( 1 << iSubShift ); |
327 | 0 | const int iStrideSrc1 = rcDtParam.org.stride * iSubStep; |
328 | 0 | const int iStrideSrc2 = rcDtParam.cur.stride * iSubStep; |
329 | |
330 | 0 | uint32_t uiSum = 0; |
331 | |
332 | 0 | if( iWidth == 4 ) |
333 | 0 | { |
334 | 0 | if( iRows == 4 && iSubShift == 0 ) |
335 | 0 | { |
336 | 0 | __m128i vzero = _mm_setzero_si128(); |
337 | 0 | __m128i vsrc1 = _mm_or_si128( _vv_loadl_epi64( ( const __m128i* )pSrc1 ), _mm_slli_si128( _vv_loadl_epi64( ( const __m128i* )( &pSrc1[iStrideSrc1] ) ), 8 ) ); |
338 | 0 | __m128i vsrc2 = _mm_or_si128( _vv_loadl_epi64( ( const __m128i* )pSrc2 ), _mm_slli_si128( _vv_loadl_epi64( ( const __m128i* )( &pSrc2[iStrideSrc2] ) ), 8 ) ); |
339 | 0 | __m128i vsum = _mm_cvtepi16_epi32( _mm_hadd_epi16( _mm_abs_epi16( _mm_sub_epi16( vsrc1, vsrc2 ) ), vzero ) ); |
340 | |
341 | 0 | vsrc1 = _mm_or_si128( _vv_loadl_epi64( ( const __m128i* )( &pSrc1[2 * iStrideSrc1] ) ), _mm_slli_si128( _vv_loadl_epi64( ( const __m128i* )( &pSrc1[3 * iStrideSrc1] ) ), 8 ) ); |
342 | 0 | vsrc2 = _mm_or_si128( _vv_loadl_epi64( ( const __m128i* )( &pSrc2[2 * iStrideSrc2] ) ), _mm_slli_si128( _vv_loadl_epi64( ( const __m128i* )( &pSrc2[3 * iStrideSrc2] ) ), 8 ) ); |
343 | 0 | vsum = _mm_add_epi32( vsum, _mm_cvtepi16_epi32( _mm_hadd_epi16( _mm_abs_epi16( _mm_sub_epi16( vsrc1, vsrc2 ) ), vzero ) ) ); |
344 | 0 | vsum = _mm_hadd_epi32( vsum, vzero ); |
345 | 0 | vsum = _mm_hadd_epi32( vsum, vzero ); |
346 | |
347 | 0 | uiSum = _mm_cvtsi128_si32( vsum ); |
348 | 0 | } |
349 | 0 | else |
350 | 0 | { |
351 | 0 | __m128i vone = _mm_set1_epi16( 1 ); |
352 | 0 | __m128i vsum32 = _mm_setzero_si128(); |
353 | 0 | for( int iY = 0; iY < iRows; iY += iSubStep ) |
354 | 0 | { |
355 | 0 | __m128i vsrc1 = _mm_cvtepi16_epi32( _vv_loadl_epi64( ( const __m128i* )pSrc1 ) ); |
356 | 0 | __m128i vsrc2 = _mm_cvtepi16_epi32( _vv_loadl_epi64( ( const __m128i* )pSrc2 ) ); |
357 | 0 | vsum32 = _mm_add_epi32( vsum32, _mm_abs_epi32( _mm_sub_epi32( vsrc1, vsrc2 ) ) ); |
358 | |
359 | 0 | pSrc1 += iStrideSrc1; |
360 | 0 | pSrc2 += iStrideSrc2; |
361 | 0 | } |
362 | 0 | vsum32 = _mm_hadd_epi32( vsum32, vone ); |
363 | 0 | vsum32 = _mm_hadd_epi32( vsum32, vone ); |
364 | 0 | uiSum = _mm_cvtsi128_si32( vsum32 ); |
365 | 0 | } |
366 | 0 | } |
367 | 0 | else |
368 | 0 | { |
369 | | #ifdef USE_AVX2 |
370 | 0 | if( vext >= AVX2 && iWidth >= 16 ) |
371 | 0 | { |
372 | 0 | static constexpr bool earlyExitAllowed = iWidth >= 64; |
373 | | // process widths that are a multiple of 16 |
374 | 0 | __m256i vone = _mm256_set1_epi16( 1 ); |
375 | 0 | __m256i vsum32 = _mm256_setzero_si256(); |
376 | | |
377 | | int checkExit = 3; |
378 | | |
379 | 0 | for( int iY = 0; iY < iRows; iY+=iSubStep ) |
380 | 0 | { |
381 | 0 | __m256i vsrc1 = _mm256_loadu_si256( ( __m256i* )( pSrc1 ) ); |
382 | 0 | __m256i vsrc2 = _mm256_loadu_si256( ( __m256i* )( pSrc2 ) ); |
383 | 0 | __m256i vsum16 = _mm256_abs_epi16( _mm256_sub_epi16( vsrc1, vsrc2 ) ); |
384 | |
385 | 0 | for( int iX = 16; iX < iWidth; iX+=16 ) |
386 | 0 | { |
387 | 0 | vsrc1 = _mm256_loadu_si256( ( __m256i* )( &pSrc1[iX] ) ); |
388 | 0 | vsrc2 = _mm256_loadu_si256( ( __m256i* )( &pSrc2[iX] ) ); |
389 | 0 | vsum16 = _mm256_add_epi16( vsum16, _mm256_abs_epi16( _mm256_sub_epi16( vsrc1, vsrc2 ) ) ); |
390 | 0 | } |
391 | |
392 | 0 | __m256i vsumtemp = _mm256_madd_epi16( vsum16, vone ); |
393 | 0 | if( earlyExitAllowed ) vsum32 = _mm256_hadd_epi32( vsum32, vsumtemp ); |
394 | 0 | else vsum32 = _mm256_add_epi32 ( vsum32, vsumtemp ); |
395 | |
396 | 0 | pSrc1 += iStrideSrc1; |
397 | 0 | pSrc2 += iStrideSrc2; |
398 | |
399 | 0 | if( earlyExitAllowed && checkExit == 0 ) |
400 | 0 | { |
401 | 0 | Distortion distTemp = _mm256_extract_epi32( vsum32, 0 ) + _mm256_extract_epi32( vsum32, 4 ); |
402 | 0 | distTemp <<= iSubShift; |
403 | 0 | distTemp >>= DISTORTION_PRECISION_ADJUSTMENT( rcDtParam.bitDepth ); |
404 | 0 | if( distTemp > rcDtParam.maximumDistortionForEarlyExit ) return distTemp; |
405 | 0 | checkExit = 3; |
406 | 0 | } |
407 | 0 | else if( earlyExitAllowed ) |
408 | 0 | { |
409 | 0 | checkExit--; |
410 | 0 | } |
411 | 0 | } |
412 | | |
413 | 0 | __m128i |
414 | 0 | xsum32 = _mm_add_epi32( _mm256_castsi256_si128( vsum32 ), _mm256_extracti128_si256( vsum32, 1 ) ); |
415 | 0 | xsum32 = _mm_hadd_epi32( xsum32, xsum32 ); |
416 | 0 | xsum32 = _mm_hadd_epi32( xsum32, xsum32 ); |
417 | 0 | uiSum = _mm_cvtsi128_si32( xsum32 ); |
418 | 0 | } |
419 | 0 | else |
420 | 0 | #endif |
421 | 0 | if( iRows == 16 && ( iWidth == 16 || iWidth == 8 ) && iSubShift == 1 && rcDtParam.bitDepth <= 10 ) |
422 | 0 | { |
423 | 0 | static constexpr bool isWdt16 = iWidth >= 16; |
424 | |
425 | 0 | __m128i vone = _mm_set1_epi16( 1 ); |
426 | 0 | __m128i vsum32 = _mm_setzero_si128(); |
427 | |
428 | 0 | for( int i = 0; i < 2; i++ ) |
429 | 0 | { |
430 | | //0 |
431 | 0 | __m128i vsrc1 = _mm_loadu_si128( (const __m128i*)(pSrc1) ); |
432 | 0 | __m128i vsrc2 = _mm_loadu_si128( (const __m128i*)(pSrc2) ); |
433 | |
434 | 0 | __m128i vsum16 = _mm_abs_epi16( _mm_sub_epi16( vsrc1, vsrc2 ) ); |
435 | |
436 | 0 | if( isWdt16 ) |
437 | 0 | { |
438 | 0 | vsrc1 = _mm_loadu_si128( (const __m128i*)(pSrc1 + 8) ); |
439 | 0 | vsrc2 = _mm_loadu_si128( (const __m128i*)(pSrc2 + 8) ); |
440 | |
441 | 0 | vsum16 = _mm_add_epi16( vsum16, _mm_abs_epi16( _mm_sub_epi16( vsrc1, vsrc2 ) ) ); |
442 | 0 | } |
443 | |
444 | 0 | pSrc1 += iStrideSrc1; pSrc2 += iStrideSrc2; |
445 | | |
446 | | // 1 |
447 | 0 | vsrc1 = _mm_loadu_si128( (const __m128i*)(pSrc1) ); |
448 | 0 | vsrc2 = _mm_loadu_si128( (const __m128i*)(pSrc2) ); |
449 | |
450 | 0 | vsum16 = _mm_add_epi16( vsum16, _mm_abs_epi16( _mm_sub_epi16( vsrc1, vsrc2 ) ) ); |
451 | |
452 | 0 | if( isWdt16 ) |
453 | 0 | { |
454 | 0 | vsrc1 = _mm_loadu_si128( (const __m128i*)(pSrc1 + 8) ); |
455 | 0 | vsrc2 = _mm_loadu_si128( (const __m128i*)(pSrc2 + 8) ); |
456 | |
457 | 0 | vsum16 = _mm_add_epi16( vsum16, _mm_abs_epi16( _mm_sub_epi16( vsrc1, vsrc2 ) ) ); |
458 | 0 | } |
459 | |
460 | 0 | pSrc1 += iStrideSrc1; pSrc2 += iStrideSrc2; |
461 | | |
462 | | // 2 |
463 | 0 | vsrc1 = _mm_loadu_si128( (const __m128i*)(pSrc1) ); |
464 | 0 | vsrc2 = _mm_loadu_si128( (const __m128i*)(pSrc2) ); |
465 | |
466 | 0 | vsum16 = _mm_add_epi16( vsum16, _mm_abs_epi16( _mm_sub_epi16( vsrc1, vsrc2 ) ) ); |
467 | |
468 | 0 | if( isWdt16 ) |
469 | 0 | { |
470 | 0 | vsrc1 = _mm_loadu_si128( (const __m128i*)(pSrc1 + 8) ); |
471 | 0 | vsrc2 = _mm_loadu_si128( (const __m128i*)(pSrc2 + 8) ); |
472 | |
473 | 0 | vsum16 = _mm_add_epi16( vsum16, _mm_abs_epi16( _mm_sub_epi16( vsrc1, vsrc2 ) ) ); |
474 | 0 | } |
475 | |
476 | 0 | pSrc1 += iStrideSrc1; pSrc2 += iStrideSrc2; |
477 | | |
478 | | // 3 |
479 | 0 | vsrc1 = _mm_loadu_si128( (const __m128i*)(pSrc1) ); |
480 | 0 | vsrc2 = _mm_loadu_si128( (const __m128i*)(pSrc2) ); |
481 | |
482 | 0 | vsum16 = _mm_add_epi16( vsum16, _mm_abs_epi16( _mm_sub_epi16( vsrc1, vsrc2 ) ) ); |
483 | |
484 | 0 | if( isWdt16 ) |
485 | 0 | { |
486 | 0 | vsrc1 = _mm_loadu_si128( (const __m128i*)(pSrc1 + 8) ); |
487 | 0 | vsrc2 = _mm_loadu_si128( (const __m128i*)(pSrc2 + 8) ); |
488 | |
489 | 0 | vsum16 = _mm_add_epi16( vsum16, _mm_abs_epi16( _mm_sub_epi16( vsrc1, vsrc2 ) ) ); |
490 | 0 | } |
491 | |
492 | 0 | pSrc1 += iStrideSrc1; pSrc2 += iStrideSrc2; |
493 | |
494 | 0 | vsum32 = _mm_add_epi32( vsum32, _mm_madd_epi16( vsum16, vone ) ); |
495 | 0 | } |
496 | |
497 | 0 | vsum32 = _mm_hadd_epi32( vsum32, vone ); |
498 | 0 | vsum32 = _mm_hadd_epi32( vsum32, vone ); |
499 | 0 | uiSum = _mm_cvtsi128_si32( vsum32 ); |
500 | |
501 | 0 | uiSum <<= 1; |
502 | 0 | return uiSum >> DISTORTION_PRECISION_ADJUSTMENT( rcDtParam.bitDepth ); |
503 | 0 | } |
504 | 0 | else |
505 | 0 | { |
506 | 0 | static constexpr bool earlyExitAllowed = iWidth >= 64; |
507 | | |
508 | | // for widths that are a multiple of 8 |
509 | 0 | __m128i vone = _mm_set1_epi16( 1 ); |
510 | 0 | __m128i vsum32 = _mm_setzero_si128(); |
511 | |
512 | 0 | int checkExit = 3; |
513 | |
514 | 0 | for( int iY = 0; iY < iRows; iY+=iSubStep ) |
515 | 0 | { |
516 | 0 | __m128i vsrc1 = _mm_loadu_si128( ( const __m128i* )( pSrc1 ) ); |
517 | 0 | __m128i vsrc2 = _mm_loadu_si128( ( const __m128i* )( pSrc2 ) ); |
518 | 0 | __m128i vsum16 = _mm_abs_epi16( _mm_sub_epi16( vsrc1, vsrc2 ) ); |
519 | |
520 | 0 | if( iWidth >= 16 ) |
521 | 0 | { |
522 | 0 | vsrc1 = _mm_loadu_si128( ( const __m128i* )( &pSrc1[8] ) ); |
523 | 0 | vsrc2 = _mm_loadu_si128( ( const __m128i* )( &pSrc2[8] ) ); |
524 | 0 | vsum16 = _mm_add_epi16( vsum16, _mm_abs_epi16( _mm_sub_epi16( vsrc1, vsrc2 ) ) ); |
525 | |
526 | 0 | for( int iX = 16; iX < iWidth; iX += 16 ) |
527 | 0 | { |
528 | 0 | vsrc1 = _mm_loadu_si128( ( const __m128i* )( &pSrc1[iX] ) ); |
529 | 0 | vsrc2 = _mm_loadu_si128( ( const __m128i* )( &pSrc2[iX] ) ); |
530 | 0 | vsum16 = _mm_add_epi16( vsum16, _mm_abs_epi16( _mm_sub_epi16( vsrc1, vsrc2 ) ) ); |
531 | | |
532 | 0 | vsrc1 = _mm_loadu_si128( ( const __m128i* )( &pSrc1[iX + 8] ) ); |
533 | 0 | vsrc2 = _mm_loadu_si128( ( const __m128i* )( &pSrc2[iX + 8] ) ); |
534 | 0 | vsum16 = _mm_add_epi16( vsum16, _mm_abs_epi16( _mm_sub_epi16( vsrc1, vsrc2 ) ) ); |
535 | 0 | } |
536 | 0 | } |
537 | |
538 | 0 | __m128i vsumtemp = _mm_madd_epi16( vsum16, vone ); |
539 | 0 | if( earlyExitAllowed ) vsum32 = _mm_hadd_epi32( vsum32, vsumtemp ); |
540 | 0 | else vsum32 = _mm_add_epi32 ( vsum32, vsumtemp ); |
541 | |
542 | 0 | pSrc1 += iStrideSrc1; |
543 | 0 | pSrc2 += iStrideSrc2; |
544 | |
545 | 0 | if( earlyExitAllowed && checkExit == 0 ) |
546 | 0 | { |
547 | 0 | Distortion distTemp = _mm_cvtsi128_si32( vsum32 ); |
548 | 0 | distTemp <<= iSubShift; |
549 | 0 | distTemp >>= DISTORTION_PRECISION_ADJUSTMENT( rcDtParam.bitDepth ); |
550 | 0 | if( distTemp > rcDtParam.maximumDistortionForEarlyExit ) return distTemp; |
551 | 0 | checkExit = 3; |
552 | 0 | } |
553 | 0 | else if( earlyExitAllowed ) |
554 | 0 | { |
555 | 0 | checkExit--; |
556 | 0 | } |
557 | 0 | } |
558 | 0 | vsum32 = _mm_hadd_epi32( vsum32, vone ); |
559 | 0 | vsum32 = _mm_hadd_epi32( vsum32, vone ); |
560 | 0 | uiSum = _mm_cvtsi128_si32( vsum32 ); |
561 | 0 | } |
562 | 0 | } |
563 | | |
564 | 0 | uiSum <<= iSubShift; |
565 | 0 | return uiSum >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth); |
566 | 0 | } |
Unexecuted instantiation: unsigned long vvenc::RdCost::xGetSAD_NxN_SIMD<4, (vvenc::x86_simd::X86_VEXT)1>(vvenc::DistParam const&)
Unexecuted instantiation: unsigned long vvenc::RdCost::xGetSAD_NxN_SIMD<8, (vvenc::x86_simd::X86_VEXT)1>(vvenc::DistParam const&)
Unexecuted instantiation: unsigned long vvenc::RdCost::xGetSAD_NxN_SIMD<16, (vvenc::x86_simd::X86_VEXT)1>(vvenc::DistParam const&)
Unexecuted instantiation: unsigned long vvenc::RdCost::xGetSAD_NxN_SIMD<32, (vvenc::x86_simd::X86_VEXT)1>(vvenc::DistParam const&)
Unexecuted instantiation: unsigned long vvenc::RdCost::xGetSAD_NxN_SIMD<64, (vvenc::x86_simd::X86_VEXT)1>(vvenc::DistParam const&)
Unexecuted instantiation: unsigned long vvenc::RdCost::xGetSAD_NxN_SIMD<128, (vvenc::x86_simd::X86_VEXT)1>(vvenc::DistParam const&)
Unexecuted instantiation: unsigned long vvenc::RdCost::xGetSAD_NxN_SIMD<4, (vvenc::x86_simd::X86_VEXT)4>(vvenc::DistParam const&)
Unexecuted instantiation: unsigned long vvenc::RdCost::xGetSAD_NxN_SIMD<8, (vvenc::x86_simd::X86_VEXT)4>(vvenc::DistParam const&)
Unexecuted instantiation: unsigned long vvenc::RdCost::xGetSAD_NxN_SIMD<16, (vvenc::x86_simd::X86_VEXT)4>(vvenc::DistParam const&)
Unexecuted instantiation: unsigned long vvenc::RdCost::xGetSAD_NxN_SIMD<32, (vvenc::x86_simd::X86_VEXT)4>(vvenc::DistParam const&)
Unexecuted instantiation: unsigned long vvenc::RdCost::xGetSAD_NxN_SIMD<64, (vvenc::x86_simd::X86_VEXT)4>(vvenc::DistParam const&)
Unexecuted instantiation: unsigned long vvenc::RdCost::xGetSAD_NxN_SIMD<128, (vvenc::x86_simd::X86_VEXT)4>(vvenc::DistParam const&)
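The earlyExitAllowed branches above poll the running SAD every fourth processed row (checkExit counts down from 3) and abandon a candidate as soon as its partial distortion already exceeds the caller's bound. Stripped of the SIMD details, the policy looks roughly like this (scalar sketch, hypothetical helper):

  static Distortion xGetSAD_earlyExit_scalar( const DistParam& rcDtParam )
  {
    const Pel* pSrc1 = rcDtParam.org.buf;
    const Pel* pSrc2 = rcDtParam.cur.buf;
    const int  iSubStep  = 1 << rcDtParam.subShift;
    uint32_t   uiSum     = 0;
    int        checkExit = 3;
    for( int iY = 0; iY < rcDtParam.org.height; iY += iSubStep )
    {
      for( int iX = 0; iX < rcDtParam.org.width; iX++ )
      {
        const int iDiff = pSrc1[iX] - pSrc2[iX];
        uiSum += iDiff < 0 ? -iDiff : iDiff;
      }
      pSrc1 += rcDtParam.org.stride * iSubStep;
      pSrc2 += rcDtParam.cur.stride * iSubStep;
      if( checkExit == 0 )
      {
        // scale the partial sum into the final distortion domain before testing the bound
        Distortion distTemp = uiSum;
        distTemp <<= rcDtParam.subShift;
        distTemp >>= DISTORTION_PRECISION_ADJUSTMENT( rcDtParam.bitDepth );
        if( distTemp > rcDtParam.maximumDistortionForEarlyExit ) return distTemp;
        checkExit = 3;                                // re-arm: test again four rows later
      }
      else checkExit--;
    }
    uiSum <<= rcDtParam.subShift;
    return uiSum >> DISTORTION_PRECISION_ADJUSTMENT( rcDtParam.bitDepth );
  }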
567 | | |
568 | | static uint32_t xCalcHAD4x4_SSE( const Torg *piOrg, const Tcur *piCur, const int iStrideOrg, const int iStrideCur ) |
569 | 0 | { |
570 | 0 | __m128i r0 = ( sizeof( Torg ) > 1 ) ? ( _vv_loadl_epi64( ( const __m128i* )&piOrg[0] ) ) : ( _mm_unpacklo_epi8( _mm_cvtsi32_si128( *(const int*)&piOrg[0] ), _mm_setzero_si128() ) ); |
571 | 0 | __m128i r1 = ( sizeof( Torg ) > 1 ) ? ( _vv_loadl_epi64( ( const __m128i* )&piOrg[iStrideOrg] ) ) : ( _mm_unpacklo_epi8( _mm_cvtsi32_si128( *(const int*)&piOrg[iStrideOrg] ), _mm_setzero_si128() ) ); |
572 | 0 | __m128i r2 = ( sizeof( Torg ) > 1 ) ? ( _vv_loadl_epi64( ( const __m128i* )&piOrg[2 * iStrideOrg] ) ) : ( _mm_unpacklo_epi8( _mm_cvtsi32_si128( *(const int*)&piOrg[2 * iStrideOrg] ), _mm_setzero_si128() ) ); |
573 | 0 | __m128i r3 = ( sizeof( Torg ) > 1 ) ? ( _vv_loadl_epi64( ( const __m128i* )&piOrg[3 * iStrideOrg] ) ) : ( _mm_unpacklo_epi8( _mm_cvtsi32_si128( *(const int*)&piOrg[3 * iStrideOrg] ), _mm_setzero_si128() ) ); |
574 | 0 | __m128i r4 = ( sizeof( Tcur ) > 1 ) ? ( _vv_loadl_epi64( ( const __m128i* )&piCur[0] ) ) : ( _mm_unpacklo_epi8( _mm_cvtsi32_si128( *(const int*)&piCur[0] ), _mm_setzero_si128() ) ); |
575 | 0 | __m128i r5 = ( sizeof( Tcur ) > 1 ) ? ( _vv_loadl_epi64( ( const __m128i* )&piCur[iStrideCur] ) ) : ( _mm_unpacklo_epi8( _mm_cvtsi32_si128( *(const int*)&piCur[iStrideCur] ), _mm_setzero_si128() ) ); |
576 | 0 | __m128i r6 = ( sizeof( Tcur ) > 1 ) ? ( _vv_loadl_epi64( ( const __m128i* )&piCur[2 * iStrideCur] ) ) : ( _mm_unpacklo_epi8( _mm_cvtsi32_si128( *(const int*)&piCur[2 * iStrideCur] ), _mm_setzero_si128() ) ); |
577 | 0 | __m128i r7 = ( sizeof( Tcur ) > 1 ) ? ( _vv_loadl_epi64( ( const __m128i* )&piCur[3 * iStrideCur] ) ) : ( _mm_unpacklo_epi8( _mm_cvtsi32_si128( *(const int*)&piCur[3 * iStrideCur] ), _mm_setzero_si128() ) ); |
578 | |
579 | 0 | r0 = _mm_sub_epi16( r0, r4 ); |
580 | 0 | r1 = _mm_sub_epi16( r1, r5 ); |
581 | 0 | r2 = _mm_sub_epi16( r2, r6 ); |
582 | 0 | r3 = _mm_sub_epi16( r3, r7 ); |
583 | | |
584 | | // first stage |
585 | 0 | r4 = r0; |
586 | 0 | r5 = r1; |
587 | |
588 | 0 | r0 = _mm_add_epi16( r0, r3 ); |
589 | 0 | r1 = _mm_add_epi16( r1, r2 ); |
590 | |
591 | 0 | r4 = _mm_sub_epi16( r4, r3 ); |
592 | 0 | r5 = _mm_sub_epi16( r5, r2 ); |
593 | |
594 | 0 | r2 = r0; |
595 | 0 | r3 = r4; |
596 | |
597 | 0 | r0 = _mm_add_epi16( r0, r1 ); |
598 | 0 | r2 = _mm_sub_epi16( r2, r1 ); |
599 | 0 | r3 = _mm_sub_epi16( r3, r5 ); |
600 | 0 | r5 = _mm_add_epi16( r5, r4 ); |
601 | | |
602 | | // shuffle - flip matrix for vertical transform |
603 | 0 | r0 = _mm_unpacklo_epi16( r0, r5 ); |
604 | 0 | r2 = _mm_unpacklo_epi16( r2, r3 ); |
605 | |
606 | 0 | r3 = r0; |
607 | 0 | r0 = _mm_unpacklo_epi32( r0, r2 ); |
608 | 0 | r3 = _mm_unpackhi_epi32( r3, r2 ); |
609 | |
610 | 0 | r1 = r0; |
611 | 0 | r2 = r3; |
612 | 0 | r1 = _mm_srli_si128( r1, 8 ); |
613 | 0 | r3 = _mm_srli_si128( r3, 8 ); |
614 | | |
615 | | // second stage |
616 | 0 | r4 = r0; |
617 | 0 | r5 = r1; |
618 | |
619 | 0 | r0 = _mm_add_epi16( r0, r3 ); |
620 | 0 | r1 = _mm_add_epi16( r1, r2 ); |
621 | |
622 | 0 | r4 = _mm_sub_epi16( r4, r3 ); |
623 | 0 | r5 = _mm_sub_epi16( r5, r2 ); |
624 | |
625 | 0 | r2 = r0; |
626 | 0 | r3 = r4; |
627 | |
628 | 0 | r0 = _mm_add_epi16( r0, r1 ); |
629 | 0 | r2 = _mm_sub_epi16( r2, r1 ); |
630 | 0 | r3 = _mm_sub_epi16( r3, r5 ); |
631 | 0 | r5 = _mm_add_epi16( r5, r4 ); |
632 | | |
633 | | // abs |
634 | 0 | __m128i Sum = _mm_abs_epi16( r0 ); |
635 | 0 | uint32_t absDc = _mm_cvtsi128_si32( Sum ) & 0x0000ffff; |
636 | 0 | Sum = _mm_add_epi16( Sum, _mm_abs_epi16( r2 ) ); |
637 | 0 | Sum = _mm_add_epi16( Sum, _mm_abs_epi16( r3 ) ); |
638 | 0 | Sum = _mm_add_epi16( Sum, _mm_abs_epi16( r5 ) ); |
639 | |
640 | 0 | __m128i iZero = _mm_set1_epi16( 0 ); |
641 | 0 | Sum = _mm_unpacklo_epi16( Sum, iZero ); |
642 | 0 | Sum = _mm_hadd_epi32( Sum, Sum ); |
643 | 0 | Sum = _mm_hadd_epi32( Sum, Sum ); |
644 | |
645 | 0 | uint32_t sad = _mm_cvtsi128_si32( Sum ); |
646 | | |
647 | 0 | sad -= absDc; |
648 | 0 | sad += absDc >> 2; |
649 | 0 | sad = ( ( sad + 1 ) >> 1 ); |
650 | |
651 | 0 | return sad; |
652 | 0 | } |
Unexecuted instantiation: RdCost_sse41.cpp:vvenc::xCalcHAD4x4_SSE(short const*, short const*, int, int)
Unexecuted instantiation: RdCost_avx2.cpp:vvenc::xCalcHAD4x4_SSE(short const*, short const*, int, int)
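In scalar terms, the function forms the 4x4 residual, applies a horizontal and a vertical 4-point Hadamard transform, sums the absolute coefficients with the DC term down-weighted to a quarter, and halves the total with rounding. A compact reference sketch (hypothetical helper; any correct Hadamard factorization yields the same coefficient set up to ordering):

  static uint32_t xCalcHAD4x4_scalar( const Pel* piOrg, const Pel* piCur, const int iStrideOrg, const int iStrideCur )
  {
    int d[16], m[16];
    for( int y = 0; y < 4; y++ )
      for( int x = 0; x < 4; x++ )
        d[4 * y + x] = piOrg[y * iStrideOrg + x] - piCur[y * iStrideCur + x];

    for( int i = 0; i < 4; i++ )                      // horizontal transform
    {
      const int a = d[4*i+0] + d[4*i+3], b = d[4*i+1] + d[4*i+2];
      const int c = d[4*i+0] - d[4*i+3], e = d[4*i+1] - d[4*i+2];
      m[4*i+0] = a + b;  m[4*i+1] = c + e;  m[4*i+2] = a - b;  m[4*i+3] = c - e;
    }
    for( int i = 0; i < 4; i++ )                      // vertical transform
    {
      const int a = m[i] + m[i+12], b = m[i+4] + m[i+8];
      const int c = m[i] - m[i+12], e = m[i+4] - m[i+8];
      d[i] = a + b;  d[i+4] = c + e;  d[i+8] = a - b;  d[i+12] = c - e;
    }

    uint32_t sad = 0;
    for( int i = 0; i < 16; i++ ) sad += d[i] < 0 ? -d[i] : d[i];
    const uint32_t absDc = d[0] < 0 ? -d[0] : d[0];   // d[0] is the DC coefficient
    sad -= absDc;
    sad += absDc >> 2;                                // weight DC by 1/4, as above
    return ( sad + 1 ) >> 1;                          // same rounding normalization
  }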
653 | | |
654 | | // working for bit depths of up to 12 bit (16-bit intermediates stay exact) |
655 | | static uint32_t xCalcHAD8x8_SSE( const Torg *piOrg, const Tcur *piCur, const int iStrideOrg, const int iStrideCur, const int iBitDepth ) |
656 | 0 | { |
657 | 0 | __m128i m1[2][8], m2[2][8]; |
658 | |
659 | 0 | CHECK( iBitDepth > 10, "Only bit-depths of up to 10 bits supported!" ); |
660 | |
661 | 0 | for( int k = 0; k < 8; k++ ) |
662 | 0 | { |
663 | 0 | __m128i r0 = ( sizeof( Torg ) > 1 ) ? ( _mm_loadu_si128( ( __m128i* )piOrg ) ) : ( _mm_unpacklo_epi8( _vv_loadl_epi64( ( const __m128i* )piOrg ), _mm_setzero_si128() ) ); |
664 | 0 | __m128i r1 = ( sizeof( Tcur ) > 1 ) ? ( _mm_loadu_si128( ( __m128i* )piCur ) ) : ( _mm_unpacklo_epi8( _vv_loadl_epi64( ( const __m128i* )piCur ), _mm_setzero_si128() ) ); |
665 | 0 | m2[0][k] = _mm_sub_epi16( r0, r1 ); // 11bit |
666 | 0 | piCur += iStrideCur; |
667 | 0 | piOrg += iStrideOrg; |
668 | 0 | } |
669 | | |
670 | | //horizontal |
671 | 0 | m1[0][0] = _mm_add_epi16( m2[0][0], m2[0][4] ); |
672 | 0 | m1[0][1] = _mm_add_epi16( m2[0][1], m2[0][5] ); |
673 | 0 | m1[0][2] = _mm_add_epi16( m2[0][2], m2[0][6] ); |
674 | 0 | m1[0][3] = _mm_add_epi16( m2[0][3], m2[0][7] ); |
675 | 0 | m1[0][4] = _mm_sub_epi16( m2[0][0], m2[0][4] ); |
676 | 0 | m1[0][5] = _mm_sub_epi16( m2[0][1], m2[0][5] ); |
677 | 0 | m1[0][6] = _mm_sub_epi16( m2[0][2], m2[0][6] ); |
678 | 0 | m1[0][7] = _mm_sub_epi16( m2[0][3], m2[0][7] ); // 12 bit |
679 | |
680 | 0 | m2[0][0] = _mm_add_epi16( m1[0][0], m1[0][2] ); |
681 | 0 | m2[0][1] = _mm_add_epi16( m1[0][1], m1[0][3] ); |
682 | 0 | m2[0][2] = _mm_sub_epi16( m1[0][0], m1[0][2] ); |
683 | 0 | m2[0][3] = _mm_sub_epi16( m1[0][1], m1[0][3] ); |
684 | 0 | m2[0][4] = _mm_add_epi16( m1[0][4], m1[0][6] ); |
685 | 0 | m2[0][5] = _mm_add_epi16( m1[0][5], m1[0][7] ); |
686 | 0 | m2[0][6] = _mm_sub_epi16( m1[0][4], m1[0][6] ); |
687 | 0 | m2[0][7] = _mm_sub_epi16( m1[0][5], m1[0][7] ); // 13 bit |
688 | |
689 | 0 | m1[0][0] = _mm_add_epi16( m2[0][0], m2[0][1] ); |
690 | 0 | m1[0][1] = _mm_sub_epi16( m2[0][0], m2[0][1] ); |
691 | 0 | m1[0][2] = _mm_add_epi16( m2[0][2], m2[0][3] ); |
692 | 0 | m1[0][3] = _mm_sub_epi16( m2[0][2], m2[0][3] ); |
693 | 0 | m1[0][4] = _mm_add_epi16( m2[0][4], m2[0][5] ); |
694 | 0 | m1[0][5] = _mm_sub_epi16( m2[0][4], m2[0][5] ); |
695 | 0 | m1[0][6] = _mm_add_epi16( m2[0][6], m2[0][7] ); |
696 | 0 | m1[0][7] = _mm_sub_epi16( m2[0][6], m2[0][7] ); // 14 bit |
697 | |
698 | 0 | m2[0][0] = _mm_unpacklo_epi16( m1[0][0], m1[0][1] ); |
699 | 0 | m2[0][1] = _mm_unpacklo_epi16( m1[0][2], m1[0][3] ); |
700 | 0 | m2[0][2] = _mm_unpackhi_epi16( m1[0][0], m1[0][1] ); |
701 | 0 | m2[0][3] = _mm_unpackhi_epi16( m1[0][2], m1[0][3] ); |
702 | 0 | m2[0][4] = _mm_unpacklo_epi16( m1[0][4], m1[0][5] ); |
703 | 0 | m2[0][5] = _mm_unpacklo_epi16( m1[0][6], m1[0][7] ); |
704 | 0 | m2[0][6] = _mm_unpackhi_epi16( m1[0][4], m1[0][5] ); |
705 | 0 | m2[0][7] = _mm_unpackhi_epi16( m1[0][6], m1[0][7] ); |
706 | |
707 | 0 | m1[0][0] = _mm_unpacklo_epi32( m2[0][0], m2[0][1] ); |
708 | 0 | m1[0][1] = _mm_unpackhi_epi32( m2[0][0], m2[0][1] ); |
709 | 0 | m1[0][2] = _mm_unpacklo_epi32( m2[0][2], m2[0][3] ); |
710 | 0 | m1[0][3] = _mm_unpackhi_epi32( m2[0][2], m2[0][3] ); |
711 | 0 | m1[0][4] = _mm_unpacklo_epi32( m2[0][4], m2[0][5] ); |
712 | 0 | m1[0][5] = _mm_unpackhi_epi32( m2[0][4], m2[0][5] ); |
713 | 0 | m1[0][6] = _mm_unpacklo_epi32( m2[0][6], m2[0][7] ); |
714 | 0 | m1[0][7] = _mm_unpackhi_epi32( m2[0][6], m2[0][7] ); |
715 | | |
716 | 0 | m1[1][0] = _mm_cvtepi16_epi32( _mm_srli_si128( m1[0][0], 8 ) ); |
717 | 0 | m1[0][0] = _mm_cvtepi16_epi32( m1[0][0] ); |
718 | 0 | m1[1][1] = _mm_cvtepi16_epi32( _mm_srli_si128( m1[0][1], 8 ) ); |
719 | 0 | m1[0][1] = _mm_cvtepi16_epi32( m1[0][1] ); |
720 | 0 | m1[1][2] = _mm_cvtepi16_epi32( _mm_srli_si128( m1[0][2], 8 ) ); |
721 | 0 | m1[0][2] = _mm_cvtepi16_epi32( m1[0][2] ); |
722 | 0 | m1[1][3] = _mm_cvtepi16_epi32( _mm_srli_si128( m1[0][3], 8 ) ); |
723 | 0 | m1[0][3] = _mm_cvtepi16_epi32( m1[0][3] ); |
724 | 0 | m1[1][4] = _mm_cvtepi16_epi32( _mm_srli_si128( m1[0][4], 8 ) ); |
725 | 0 | m1[0][4] = _mm_cvtepi16_epi32( m1[0][4] ); |
726 | 0 | m1[1][5] = _mm_cvtepi16_epi32( _mm_srli_si128( m1[0][5], 8 ) ); |
727 | 0 | m1[0][5] = _mm_cvtepi16_epi32( m1[0][5] ); |
728 | 0 | m1[1][6] = _mm_cvtepi16_epi32( _mm_srli_si128( m1[0][6], 8 ) ); |
729 | 0 | m1[0][6] = _mm_cvtepi16_epi32( m1[0][6] ); |
730 | 0 | m1[1][7] = _mm_cvtepi16_epi32( _mm_srli_si128( m1[0][7], 8 ) ); |
731 | 0 | m1[0][7] = _mm_cvtepi16_epi32( m1[0][7] ); |
732 | |
733 | 0 | for( int i = 0; i < 8; i++ ) |
734 | 0 | { |
735 | 0 | int ii = i % 4; |
736 | 0 | int ij = i >> 2; |
737 | |
738 | 0 | m2[0][i] = m1[ij][ii ]; |
739 | 0 | m2[1][i] = m1[ij][ii + 4]; |
740 | 0 | } |
741 | |
742 | 0 | for( int i = 0; i < 2; i++ ) |
743 | 0 | { |
744 | 0 | m1[i][0] = _mm_add_epi32( m2[i][0], m2[i][4] ); |
745 | 0 | m1[i][1] = _mm_add_epi32( m2[i][1], m2[i][5] ); |
746 | 0 | m1[i][2] = _mm_add_epi32( m2[i][2], m2[i][6] ); |
747 | 0 | m1[i][3] = _mm_add_epi32( m2[i][3], m2[i][7] ); |
748 | 0 | m1[i][4] = _mm_sub_epi32( m2[i][0], m2[i][4] ); |
749 | 0 | m1[i][5] = _mm_sub_epi32( m2[i][1], m2[i][5] ); |
750 | 0 | m1[i][6] = _mm_sub_epi32( m2[i][2], m2[i][6] ); |
751 | 0 | m1[i][7] = _mm_sub_epi32( m2[i][3], m2[i][7] ); |
752 | |
753 | 0 | m2[i][0] = _mm_add_epi32( m1[i][0], m1[i][2] ); |
754 | 0 | m2[i][1] = _mm_add_epi32( m1[i][1], m1[i][3] ); |
755 | 0 | m2[i][2] = _mm_sub_epi32( m1[i][0], m1[i][2] ); |
756 | 0 | m2[i][3] = _mm_sub_epi32( m1[i][1], m1[i][3] ); |
757 | 0 | m2[i][4] = _mm_add_epi32( m1[i][4], m1[i][6] ); |
758 | 0 | m2[i][5] = _mm_add_epi32( m1[i][5], m1[i][7] ); |
759 | 0 | m2[i][6] = _mm_sub_epi32( m1[i][4], m1[i][6] ); |
760 | 0 | m2[i][7] = _mm_sub_epi32( m1[i][5], m1[i][7] ); |
761 | |
762 | 0 | m1[i][0] = _mm_abs_epi32( _mm_add_epi32( m2[i][0], m2[i][1] ) ); |
763 | 0 | m1[i][1] = _mm_abs_epi32( _mm_sub_epi32( m2[i][0], m2[i][1] ) ); |
764 | 0 | m1[i][2] = _mm_abs_epi32( _mm_add_epi32( m2[i][2], m2[i][3] ) ); |
765 | 0 | m1[i][3] = _mm_abs_epi32( _mm_sub_epi32( m2[i][2], m2[i][3] ) ); |
766 | 0 | m1[i][4] = _mm_abs_epi32( _mm_add_epi32( m2[i][4], m2[i][5] ) ); |
767 | 0 | m1[i][5] = _mm_abs_epi32( _mm_sub_epi32( m2[i][4], m2[i][5] ) ); |
768 | 0 | m1[i][6] = _mm_abs_epi32( _mm_add_epi32( m2[i][6], m2[i][7] ) ); |
769 | 0 | m1[i][7] = _mm_abs_epi32( _mm_sub_epi32( m2[i][6], m2[i][7] ) ); |
770 | 0 | } |
771 | 0 | m2[0][0] = m1[0][0]; |
772 | 0 | for( int i = 0; i < 8; i++ ) |
773 | 0 | { |
774 | 0 | m1[0][i] = _mm_add_epi32( m1[0][i], m1[1][i] ); |
775 | 0 | } |
776 | |
777 | 0 | m1[0][0] = _mm_add_epi32( m1[0][0], m1[0][1] ); |
778 | 0 | m1[0][2] = _mm_add_epi32( m1[0][2], m1[0][3] ); |
779 | 0 | m1[0][4] = _mm_add_epi32( m1[0][4], m1[0][5] ); |
780 | 0 | m1[0][6] = _mm_add_epi32( m1[0][6], m1[0][7] ); |
781 | |
782 | 0 | m1[0][0] = _mm_add_epi32( m1[0][0], m1[0][2] ); |
783 | 0 | m1[0][4] = _mm_add_epi32( m1[0][4], m1[0][6] ); |
784 | 0 | __m128i iSum = _mm_add_epi32( m1[0][0], m1[0][4] ); |
785 | |
786 | 0 | iSum = _mm_hadd_epi32( iSum, iSum ); |
787 | 0 | iSum = _mm_hadd_epi32( iSum, iSum ); |
788 | |
789 | 0 | uint32_t sad = _mm_cvtsi128_si32( iSum ); |
790 | 0 | uint32_t absDc = _mm_cvtsi128_si32( m2[0][0] ); |
791 | 0 | sad -= absDc; |
792 | 0 | sad += absDc >> 2; |
793 | 0 | sad = ( ( sad + 2 ) >> 2 ); |
794 | |
795 | 0 | return sad; |
796 | 0 | } |
Unexecuted instantiation: RdCost_sse41.cpp:vvenc::xCalcHAD8x8_SSE(short const*, short const*, int, int, int)
Unexecuted instantiation: RdCost_avx2.cpp:vvenc::xCalcHAD8x8_SSE(short const*, short const*, int, int, int)
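A quick check of the headroom annotated above ("11bit" through "14 bit"): the residual of a b-bit signal needs b + 1 bits, and each of the three butterfly stages of the 16-bit horizontal pass adds one bit, so the epi16 lanes stay exact as long as b + 1 + 3 <= 16, i.e. b <= 12 (hence the comment before the function). The vertical pass then continues in 32-bit lanes, and the CHECK further restricts callers to the bit depths of at most 10 that are actually used here.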
797 | | |
798 | | |
799 | | // working for bit depths of up to 12 bit (16-bit intermediates stay exact) |
800 | | static uint32_t xCalcHAD16x16_fast_SSE( const Torg *piOrg, const Tcur *piCur, const int iStrideOrg, const int iStrideCur, const int iBitDepth ) |
801 | 0 | { |
802 | 0 | __m128i m1[2][8], m2[2][8]; |
803 | |
804 | 0 | CHECK( iBitDepth > 10, "Only bit-depths of up to 10 bits supported!" ); |
805 | |
806 | 0 | for( int k = 0; k < 8; k++ ) |
807 | 0 | { |
808 | 0 | __m128i r0 = _mm_loadu_si128( ( __m128i* )piOrg ); |
809 | 0 | __m128i r1 = _mm_loadu_si128( ( __m128i* )piCur ); |
810 | 0 | __m128i r2 = _mm_loadu_si128( ( __m128i* )( piOrg + iStrideOrg ) ); |
811 | 0 | __m128i r3 = _mm_loadu_si128( ( __m128i* )( piCur + iStrideCur ) ); |
812 | |
813 | 0 | r0 = _mm_add_epi16( r0, r2 ); |
814 | 0 | r1 = _mm_add_epi16( r1, r3 ); |
815 | |
816 | 0 | r2 = _mm_loadu_si128( ( __m128i* )( piOrg + 8 ) ); |
817 | 0 | r3 = _mm_loadu_si128( ( __m128i* )( piCur + 8 ) ); |
818 | 0 | __m128i r4 = _mm_loadu_si128( ( __m128i* )( piOrg + iStrideOrg + 8 ) ); |
819 | 0 | __m128i r5 = _mm_loadu_si128( ( __m128i* )( piCur + iStrideCur + 8 ) ); |
820 | |
821 | 0 | r2 = _mm_add_epi16( r2, r4 ); |
822 | 0 | r3 = _mm_add_epi16( r3, r5 ); |
823 | |
824 | 0 | r0 = _mm_hadd_epi16( r0, r2 ); |
825 | 0 | r1 = _mm_hadd_epi16( r1, r3 ); |
826 | |
827 | 0 | r0 = _mm_add_epi16( r0, _mm_set1_epi16( 2 ) ); |
828 | 0 | r1 = _mm_add_epi16( r1, _mm_set1_epi16( 2 ) ); |
829 | 0 | r0 = _mm_srai_epi16( r0, 2 ); |
830 | 0 | r1 = _mm_srai_epi16( r1, 2 ); |
831 | |
832 | 0 | m2[0][k] = _mm_sub_epi16( r0, r1 ); // 11bit |
833 | | //m2[1][k] = _mm_cvtepi16_epi32( _mm_srli_si128( m2[0][k], 8 ) ); |
834 | | //m2[0][k] = _mm_cvtepi16_epi32( m2[0][k] ); |
835 | 0 | piCur += iStrideCur * 2; |
836 | 0 | piOrg += iStrideOrg * 2; |
837 | 0 | } |
838 | | |
839 | | //horizontal |
840 | 0 | m1[0][0] = _mm_add_epi16( m2[0][0], m2[0][4] ); |
841 | 0 | m1[0][1] = _mm_add_epi16( m2[0][1], m2[0][5] ); |
842 | 0 | m1[0][2] = _mm_add_epi16( m2[0][2], m2[0][6] ); |
843 | 0 | m1[0][3] = _mm_add_epi16( m2[0][3], m2[0][7] ); |
844 | 0 | m1[0][4] = _mm_sub_epi16( m2[0][0], m2[0][4] ); |
845 | 0 | m1[0][5] = _mm_sub_epi16( m2[0][1], m2[0][5] ); |
846 | 0 | m1[0][6] = _mm_sub_epi16( m2[0][2], m2[0][6] ); |
847 | 0 | m1[0][7] = _mm_sub_epi16( m2[0][3], m2[0][7] ); // 12 bit |
848 | |
849 | 0 | m2[0][0] = _mm_add_epi16( m1[0][0], m1[0][2] ); |
850 | 0 | m2[0][1] = _mm_add_epi16( m1[0][1], m1[0][3] ); |
851 | 0 | m2[0][2] = _mm_sub_epi16( m1[0][0], m1[0][2] ); |
852 | 0 | m2[0][3] = _mm_sub_epi16( m1[0][1], m1[0][3] ); |
853 | 0 | m2[0][4] = _mm_add_epi16( m1[0][4], m1[0][6] ); |
854 | 0 | m2[0][5] = _mm_add_epi16( m1[0][5], m1[0][7] ); |
855 | 0 | m2[0][6] = _mm_sub_epi16( m1[0][4], m1[0][6] ); |
856 | 0 | m2[0][7] = _mm_sub_epi16( m1[0][5], m1[0][7] ); // 13 bit |
857 | |
858 | 0 | m1[0][0] = _mm_add_epi16( m2[0][0], m2[0][1] ); |
859 | 0 | m1[0][1] = _mm_sub_epi16( m2[0][0], m2[0][1] ); |
860 | 0 | m1[0][2] = _mm_add_epi16( m2[0][2], m2[0][3] ); |
861 | 0 | m1[0][3] = _mm_sub_epi16( m2[0][2], m2[0][3] ); |
862 | 0 | m1[0][4] = _mm_add_epi16( m2[0][4], m2[0][5] ); |
863 | 0 | m1[0][5] = _mm_sub_epi16( m2[0][4], m2[0][5] ); |
864 | 0 | m1[0][6] = _mm_add_epi16( m2[0][6], m2[0][7] ); |
865 | 0 | m1[0][7] = _mm_sub_epi16( m2[0][6], m2[0][7] ); // 14 bit |
866 | |
867 | 0 | m2[0][0] = _mm_unpacklo_epi16( m1[0][0], m1[0][1] ); |
868 | 0 | m2[0][1] = _mm_unpacklo_epi16( m1[0][2], m1[0][3] ); |
869 | 0 | m2[0][2] = _mm_unpackhi_epi16( m1[0][0], m1[0][1] ); |
870 | 0 | m2[0][3] = _mm_unpackhi_epi16( m1[0][2], m1[0][3] ); |
871 | 0 | m2[0][4] = _mm_unpacklo_epi16( m1[0][4], m1[0][5] ); |
872 | 0 | m2[0][5] = _mm_unpacklo_epi16( m1[0][6], m1[0][7] ); |
873 | 0 | m2[0][6] = _mm_unpackhi_epi16( m1[0][4], m1[0][5] ); |
874 | 0 | m2[0][7] = _mm_unpackhi_epi16( m1[0][6], m1[0][7] ); |
875 | |
876 | 0 | m1[0][0] = _mm_unpacklo_epi32( m2[0][0], m2[0][1] ); |
877 | 0 | m1[0][1] = _mm_unpackhi_epi32( m2[0][0], m2[0][1] ); |
878 | 0 | m1[0][2] = _mm_unpacklo_epi32( m2[0][2], m2[0][3] ); |
879 | 0 | m1[0][3] = _mm_unpackhi_epi32( m2[0][2], m2[0][3] ); |
880 | 0 | m1[0][4] = _mm_unpacklo_epi32( m2[0][4], m2[0][5] ); |
881 | 0 | m1[0][5] = _mm_unpackhi_epi32( m2[0][4], m2[0][5] ); |
882 | 0 | m1[0][6] = _mm_unpacklo_epi32( m2[0][6], m2[0][7] ); |
883 | 0 | m1[0][7] = _mm_unpackhi_epi32( m2[0][6], m2[0][7] ); |
884 | | |
885 | 0 | m1[1][0] = _mm_cvtepi16_epi32( _mm_srli_si128( m1[0][0], 8 ) ); |
886 | 0 | m1[0][0] = _mm_cvtepi16_epi32( m1[0][0] ); |
887 | 0 | m1[1][1] = _mm_cvtepi16_epi32( _mm_srli_si128( m1[0][1], 8 ) ); |
888 | 0 | m1[0][1] = _mm_cvtepi16_epi32( m1[0][1] ); |
889 | 0 | m1[1][2] = _mm_cvtepi16_epi32( _mm_srli_si128( m1[0][2], 8 ) ); |
890 | 0 | m1[0][2] = _mm_cvtepi16_epi32( m1[0][2] ); |
891 | 0 | m1[1][3] = _mm_cvtepi16_epi32( _mm_srli_si128( m1[0][3], 8 ) ); |
892 | 0 | m1[0][3] = _mm_cvtepi16_epi32( m1[0][3] ); |
893 | 0 | m1[1][4] = _mm_cvtepi16_epi32( _mm_srli_si128( m1[0][4], 8 ) ); |
894 | 0 | m1[0][4] = _mm_cvtepi16_epi32( m1[0][4] ); |
895 | 0 | m1[1][5] = _mm_cvtepi16_epi32( _mm_srli_si128( m1[0][5], 8 ) ); |
896 | 0 | m1[0][5] = _mm_cvtepi16_epi32( m1[0][5] ); |
897 | 0 | m1[1][6] = _mm_cvtepi16_epi32( _mm_srli_si128( m1[0][6], 8 ) ); |
898 | 0 | m1[0][6] = _mm_cvtepi16_epi32( m1[0][6] ); |
899 | 0 | m1[1][7] = _mm_cvtepi16_epi32( _mm_srli_si128( m1[0][7], 8 ) ); |
900 | 0 | m1[0][7] = _mm_cvtepi16_epi32( m1[0][7] ); |
901 | |
902 | 0 | for( int i = 0; i < 8; i++ ) |
903 | 0 | { |
904 | 0 | int ii = i % 4; |
905 | 0 | int ij = i >> 2; |
906 | |
907 | 0 | m2[0][i] = m1[ij][ii ]; |
908 | 0 | m2[1][i] = m1[ij][ii + 4]; |
909 | 0 | } |
910 | |
911 | 0 | for( int i = 0; i < 2; i++ ) |
912 | 0 | { |
913 | 0 | m1[i][0] = _mm_add_epi32( m2[i][0], m2[i][4] ); |
914 | 0 | m1[i][1] = _mm_add_epi32( m2[i][1], m2[i][5] ); |
915 | 0 | m1[i][2] = _mm_add_epi32( m2[i][2], m2[i][6] ); |
916 | 0 | m1[i][3] = _mm_add_epi32( m2[i][3], m2[i][7] ); |
917 | 0 | m1[i][4] = _mm_sub_epi32( m2[i][0], m2[i][4] ); |
918 | 0 | m1[i][5] = _mm_sub_epi32( m2[i][1], m2[i][5] ); |
919 | 0 | m1[i][6] = _mm_sub_epi32( m2[i][2], m2[i][6] ); |
920 | 0 | m1[i][7] = _mm_sub_epi32( m2[i][3], m2[i][7] ); |
921 | |
922 | 0 | m2[i][0] = _mm_add_epi32( m1[i][0], m1[i][2] ); |
923 | 0 | m2[i][1] = _mm_add_epi32( m1[i][1], m1[i][3] ); |
924 | 0 | m2[i][2] = _mm_sub_epi32( m1[i][0], m1[i][2] ); |
925 | 0 | m2[i][3] = _mm_sub_epi32( m1[i][1], m1[i][3] ); |
926 | 0 | m2[i][4] = _mm_add_epi32( m1[i][4], m1[i][6] ); |
927 | 0 | m2[i][5] = _mm_add_epi32( m1[i][5], m1[i][7] ); |
928 | 0 | m2[i][6] = _mm_sub_epi32( m1[i][4], m1[i][6] ); |
929 | 0 | m2[i][7] = _mm_sub_epi32( m1[i][5], m1[i][7] ); |
930 | |
931 | 0 | m1[i][0] = _mm_abs_epi32( _mm_add_epi32( m2[i][0], m2[i][1] ) ); |
932 | 0 | m1[i][1] = _mm_abs_epi32( _mm_sub_epi32( m2[i][0], m2[i][1] ) ); |
933 | 0 | m1[i][2] = _mm_abs_epi32( _mm_add_epi32( m2[i][2], m2[i][3] ) ); |
934 | 0 | m1[i][3] = _mm_abs_epi32( _mm_sub_epi32( m2[i][2], m2[i][3] ) ); |
935 | 0 | m1[i][4] = _mm_abs_epi32( _mm_add_epi32( m2[i][4], m2[i][5] ) ); |
936 | 0 | m1[i][5] = _mm_abs_epi32( _mm_sub_epi32( m2[i][4], m2[i][5] ) ); |
937 | 0 | m1[i][6] = _mm_abs_epi32( _mm_add_epi32( m2[i][6], m2[i][7] ) ); |
938 | 0 | m1[i][7] = _mm_abs_epi32( _mm_sub_epi32( m2[i][6], m2[i][7] ) ); |
939 | 0 | } |
940 | 0 | m2[0][0] = m1[0][0]; |
941 | 0 | for( int i = 0; i < 8; i++ ) |
942 | 0 | { |
943 | 0 | m1[0][i] = _mm_add_epi32( m1[0][i], m1[1][i] ); |
944 | 0 | } |
945 | |
946 | 0 | m1[0][0] = _mm_add_epi32( m1[0][0], m1[0][1] ); |
947 | 0 | m1[0][2] = _mm_add_epi32( m1[0][2], m1[0][3] ); |
948 | 0 | m1[0][4] = _mm_add_epi32( m1[0][4], m1[0][5] ); |
949 | 0 | m1[0][6] = _mm_add_epi32( m1[0][6], m1[0][7] ); |
950 | |
951 | 0 | m1[0][0] = _mm_add_epi32( m1[0][0], m1[0][2] ); |
952 | 0 | m1[0][4] = _mm_add_epi32( m1[0][4], m1[0][6] ); |
953 | 0 | __m128i iSum = _mm_add_epi32( m1[0][0], m1[0][4] ); |
954 | |
955 | 0 | iSum = _mm_hadd_epi32( iSum, iSum ); |
956 | 0 | iSum = _mm_hadd_epi32( iSum, iSum ); |
957 | |
958 | 0 | uint32_t sad = _mm_cvtsi128_si32( iSum ); |
959 | 0 | uint32_t absDc = _mm_cvtsi128_si32( m2[0][0] ); |
960 | 0 | sad -= absDc; |
961 | 0 | sad += absDc >> 2; |
962 | 0 | sad = ( ( sad + 2 ) >> 2 ); |
963 | |
964 | 0 | return ( sad << 2 ); |
965 | 0 | } |
Unexecuted instantiation: RdCost_sse41.cpp:vvenc::xCalcHAD16x16_fast_SSE(short const*, short const*, int, int, int)
Unexecuted instantiation: RdCost_avx2.cpp:vvenc::xCalcHAD16x16_fast_SSE(short const*, short const*, int, int, int)
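The _fast suffix refers to the preprocessing loop at the top of the function: every 2x2 quad of the 16x16 block is averaged into a single sample, (a + b + c + d + 2) >> 2, so the 8x8 Hadamard runs on a half-resolution proxy of the residual; the final sad << 2 rescales the result so it remains comparable to a full-resolution 16x16 SATD.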
966 | | |
967 | | |
968 | | // working for bit depths of up to 12 bit (16-bit intermediates stay exact) |
969 | | static uint32_t xCalcHAD16x8_SSE( const Torg *piOrg, const Tcur *piCur, const int iStrideOrg, const int iStrideCur, const int iBitDepth ) |
970 | 0 | { |
971 | 0 | __m128i m1[16][2][2], m2[16][2][2]; |
972 | 0 | __m128i iSum = _mm_setzero_si128(); |
973 | |
974 | 0 | for( int l = 0; l < 2; l++ ) |
975 | 0 | { |
976 | 0 | const Torg *piOrgPtr = piOrg + l*8; |
977 | 0 | const Tcur *piCurPtr = piCur + l*8; |
978 | 0 | for( int k = 0; k < 8; k++ ) |
979 | 0 | { |
980 | 0 | __m128i r0 = _mm_loadu_si128( (__m128i*) piOrgPtr ); |
981 | 0 | __m128i r1 = _mm_loadu_si128( (__m128i*) piCurPtr ); |
982 | 0 | m2[k][l][0] = _mm_sub_epi16( r0, r1 ); |
983 | 0 | m2[k][l][1] = _mm_cvtepi16_epi32( _mm_srli_si128( m2[k][l][0], 8 ) ); |
984 | 0 | m2[k][l][0] = _mm_cvtepi16_epi32( m2[k][l][0] ); |
985 | 0 | piCurPtr += iStrideCur; |
986 | 0 | piOrgPtr += iStrideOrg; |
987 | 0 | } |
988 | |
989 | 0 | for( int i = 0; i < 2; i++ ) |
990 | 0 | { |
991 | | //vertical |
992 | 0 | m1[0][l][i] = _mm_add_epi32( m2[0][l][i], m2[4][l][i] ); |
993 | 0 | m1[1][l][i] = _mm_add_epi32( m2[1][l][i], m2[5][l][i] ); |
994 | 0 | m1[2][l][i] = _mm_add_epi32( m2[2][l][i], m2[6][l][i] ); |
995 | 0 | m1[3][l][i] = _mm_add_epi32( m2[3][l][i], m2[7][l][i] ); |
996 | 0 | m1[4][l][i] = _mm_sub_epi32( m2[0][l][i], m2[4][l][i] ); |
997 | 0 | m1[5][l][i] = _mm_sub_epi32( m2[1][l][i], m2[5][l][i] ); |
998 | 0 | m1[6][l][i] = _mm_sub_epi32( m2[2][l][i], m2[6][l][i] ); |
999 | 0 | m1[7][l][i] = _mm_sub_epi32( m2[3][l][i], m2[7][l][i] ); |
1000 | |
1001 | 0 | m2[0][l][i] = _mm_add_epi32( m1[0][l][i], m1[2][l][i] ); |
1002 | 0 | m2[1][l][i] = _mm_add_epi32( m1[1][l][i], m1[3][l][i] ); |
1003 | 0 | m2[2][l][i] = _mm_sub_epi32( m1[0][l][i], m1[2][l][i] ); |
1004 | 0 | m2[3][l][i] = _mm_sub_epi32( m1[1][l][i], m1[3][l][i] ); |
1005 | 0 | m2[4][l][i] = _mm_add_epi32( m1[4][l][i], m1[6][l][i] ); |
1006 | 0 | m2[5][l][i] = _mm_add_epi32( m1[5][l][i], m1[7][l][i] ); |
1007 | 0 | m2[6][l][i] = _mm_sub_epi32( m1[4][l][i], m1[6][l][i] ); |
1008 | 0 | m2[7][l][i] = _mm_sub_epi32( m1[5][l][i], m1[7][l][i] ); |
1009 | |
1010 | 0 | m1[0][l][i] = _mm_add_epi32( m2[0][l][i], m2[1][l][i] ); |
1011 | 0 | m1[1][l][i] = _mm_sub_epi32( m2[0][l][i], m2[1][l][i] ); |
1012 | 0 | m1[2][l][i] = _mm_add_epi32( m2[2][l][i], m2[3][l][i] ); |
1013 | 0 | m1[3][l][i] = _mm_sub_epi32( m2[2][l][i], m2[3][l][i] ); |
1014 | 0 | m1[4][l][i] = _mm_add_epi32( m2[4][l][i], m2[5][l][i] ); |
1015 | 0 | m1[5][l][i] = _mm_sub_epi32( m2[4][l][i], m2[5][l][i] ); |
1016 | 0 | m1[6][l][i] = _mm_add_epi32( m2[6][l][i], m2[7][l][i] ); |
1017 | 0 | m1[7][l][i] = _mm_sub_epi32( m2[6][l][i], m2[7][l][i] ); |
1018 | 0 | } |
1019 | 0 | } |
1020 | | |
1021 | | // 4 x 8x4 blocks |
1022 | | // 0 1 |
1023 | | // 2 3 |
1024 | 0 | uint32_t absDc = 0; |
1025 | | |
1026 | | // transpose and do horizontal in two steps |
1027 | 0 | for( int l = 0; l < 2; l++ ) |
1028 | 0 | { |
1029 | 0 | int off = l * 4; |
1030 | |
1031 | 0 | __m128i n1[16]; |
1032 | 0 | __m128i n2[16]; |
1033 | |
1034 | 0 | m2[0][0][0] = _mm_unpacklo_epi32( m1[0 + off][0][0], m1[1 + off][0][0] ); |
1035 | 0 | m2[1][0][0] = _mm_unpacklo_epi32( m1[2 + off][0][0], m1[3 + off][0][0] ); |
1036 | 0 | m2[2][0][0] = _mm_unpackhi_epi32( m1[0 + off][0][0], m1[1 + off][0][0] ); |
1037 | 0 | m2[3][0][0] = _mm_unpackhi_epi32( m1[2 + off][0][0], m1[3 + off][0][0] ); |
1038 | |
1039 | 0 | m2[0][0][1] = _mm_unpacklo_epi32( m1[0 + off][0][1], m1[1 + off][0][1] ); |
1040 | 0 | m2[1][0][1] = _mm_unpacklo_epi32( m1[2 + off][0][1], m1[3 + off][0][1] ); |
1041 | 0 | m2[2][0][1] = _mm_unpackhi_epi32( m1[0 + off][0][1], m1[1 + off][0][1] ); |
1042 | 0 | m2[3][0][1] = _mm_unpackhi_epi32( m1[2 + off][0][1], m1[3 + off][0][1] ); |
1043 | |
1044 | 0 | n1[0] = _mm_unpacklo_epi64( m2[0][0][0], m2[1][0][0] ); |
1045 | 0 | n1[1] = _mm_unpackhi_epi64( m2[0][0][0], m2[1][0][0] ); |
1046 | 0 | n1[2] = _mm_unpacklo_epi64( m2[2][0][0], m2[3][0][0] ); |
1047 | 0 | n1[3] = _mm_unpackhi_epi64( m2[2][0][0], m2[3][0][0] ); |
1048 | 0 | n1[4] = _mm_unpacklo_epi64( m2[0][0][1], m2[1][0][1] ); |
1049 | 0 | n1[5] = _mm_unpackhi_epi64( m2[0][0][1], m2[1][0][1] ); |
1050 | 0 | n1[6] = _mm_unpacklo_epi64( m2[2][0][1], m2[3][0][1] ); |
1051 | 0 | n1[7] = _mm_unpackhi_epi64( m2[2][0][1], m2[3][0][1] ); |
1052 | | |
1053 | | // transpose 8x4 -> 4x8, block 1(3) |
1054 | 0 | m2[8+0][0][0] = _mm_unpacklo_epi32( m1[0 + off][1][0], m1[1 + off][1][0] ); |
1055 | 0 | m2[8+1][0][0] = _mm_unpacklo_epi32( m1[2 + off][1][0], m1[3 + off][1][0] ); |
1056 | 0 | m2[8+2][0][0] = _mm_unpackhi_epi32( m1[0 + off][1][0], m1[1 + off][1][0] ); |
1057 | 0 | m2[8+3][0][0] = _mm_unpackhi_epi32( m1[2 + off][1][0], m1[3 + off][1][0] ); |
1058 | |
1059 | 0 | m2[8+0][0][1] = _mm_unpacklo_epi32( m1[0 + off][1][1], m1[1 + off][1][1] ); |
1060 | 0 | m2[8+1][0][1] = _mm_unpacklo_epi32( m1[2 + off][1][1], m1[3 + off][1][1] ); |
1061 | 0 | m2[8+2][0][1] = _mm_unpackhi_epi32( m1[0 + off][1][1], m1[1 + off][1][1] ); |
1062 | 0 | m2[8+3][0][1] = _mm_unpackhi_epi32( m1[2 + off][1][1], m1[3 + off][1][1] ); |
1063 | |
1064 | 0 | n1[8+0] = _mm_unpacklo_epi64( m2[8+0][0][0], m2[8+1][0][0] ); |
1065 | 0 | n1[8+1] = _mm_unpackhi_epi64( m2[8+0][0][0], m2[8+1][0][0] ); |
1066 | 0 | n1[8+2] = _mm_unpacklo_epi64( m2[8+2][0][0], m2[8+3][0][0] ); |
1067 | 0 | n1[8+3] = _mm_unpackhi_epi64( m2[8+2][0][0], m2[8+3][0][0] ); |
1068 | 0 | n1[8+4] = _mm_unpacklo_epi64( m2[8+0][0][1], m2[8+1][0][1] ); |
1069 | 0 | n1[8+5] = _mm_unpackhi_epi64( m2[8+0][0][1], m2[8+1][0][1] ); |
1070 | 0 | n1[8+6] = _mm_unpacklo_epi64( m2[8+2][0][1], m2[8+3][0][1] ); |
1071 | 0 | n1[8+7] = _mm_unpackhi_epi64( m2[8+2][0][1], m2[8+3][0][1] ); |
1072 | |
1073 | 0 | n2[0] = _mm_add_epi32( n1[0], n1[8] ); |
1074 | 0 | n2[1] = _mm_add_epi32( n1[1], n1[9] ); |
1075 | 0 | n2[2] = _mm_add_epi32( n1[2], n1[10] ); |
1076 | 0 | n2[3] = _mm_add_epi32( n1[3], n1[11] ); |
1077 | 0 | n2[4] = _mm_add_epi32( n1[4], n1[12] ); |
1078 | 0 | n2[5] = _mm_add_epi32( n1[5], n1[13] ); |
1079 | 0 | n2[6] = _mm_add_epi32( n1[6], n1[14] ); |
1080 | 0 | n2[7] = _mm_add_epi32( n1[7], n1[15] ); |
1081 | 0 | n2[8] = _mm_sub_epi32( n1[0], n1[8] ); |
1082 | 0 | n2[9] = _mm_sub_epi32( n1[1], n1[9] ); |
1083 | 0 | n2[10] = _mm_sub_epi32( n1[2], n1[10] ); |
1084 | 0 | n2[11] = _mm_sub_epi32( n1[3], n1[11] ); |
1085 | 0 | n2[12] = _mm_sub_epi32( n1[4], n1[12] ); |
1086 | 0 | n2[13] = _mm_sub_epi32( n1[5], n1[13] ); |
1087 | 0 | n2[14] = _mm_sub_epi32( n1[6], n1[14] ); |
1088 | 0 | n2[15] = _mm_sub_epi32( n1[7], n1[15] ); |
1089 | |
1090 | 0 | n1[0] = _mm_add_epi32( n2[0], n2[4] ); |
1091 | 0 | n1[1] = _mm_add_epi32( n2[1], n2[5] ); |
1092 | 0 | n1[2] = _mm_add_epi32( n2[2], n2[6] ); |
1093 | 0 | n1[3] = _mm_add_epi32( n2[3], n2[7] ); |
1094 | 0 | n1[4] = _mm_sub_epi32( n2[0], n2[4] ); |
1095 | 0 | n1[5] = _mm_sub_epi32( n2[1], n2[5] ); |
1096 | 0 | n1[6] = _mm_sub_epi32( n2[2], n2[6] ); |
1097 | 0 | n1[7] = _mm_sub_epi32( n2[3], n2[7] ); |
1098 | 0 | n1[8] = _mm_add_epi32( n2[8], n2[12] ); |
1099 | 0 | n1[9] = _mm_add_epi32( n2[9], n2[13] ); |
1100 | 0 | n1[10] = _mm_add_epi32( n2[10], n2[14] ); |
1101 | 0 | n1[11] = _mm_add_epi32( n2[11], n2[15] ); |
1102 | 0 | n1[12] = _mm_sub_epi32( n2[8], n2[12] ); |
1103 | 0 | n1[13] = _mm_sub_epi32( n2[9], n2[13] ); |
1104 | 0 | n1[14] = _mm_sub_epi32( n2[10], n2[14] ); |
1105 | 0 | n1[15] = _mm_sub_epi32( n2[11], n2[15] ); |
1106 | |
1107 | 0 | n2[0] = _mm_add_epi32( n1[0], n1[2] ); |
1108 | 0 | n2[1] = _mm_add_epi32( n1[1], n1[3] ); |
1109 | 0 | n2[2] = _mm_sub_epi32( n1[0], n1[2] ); |
1110 | 0 | n2[3] = _mm_sub_epi32( n1[1], n1[3] ); |
1111 | 0 | n2[4] = _mm_add_epi32( n1[4], n1[6] ); |
1112 | 0 | n2[5] = _mm_add_epi32( n1[5], n1[7] ); |
1113 | 0 | n2[6] = _mm_sub_epi32( n1[4], n1[6] ); |
1114 | 0 | n2[7] = _mm_sub_epi32( n1[5], n1[7] ); |
1115 | 0 | n2[8] = _mm_add_epi32( n1[8], n1[10] ); |
1116 | 0 | n2[9] = _mm_add_epi32( n1[9], n1[11] ); |
1117 | 0 | n2[10] = _mm_sub_epi32( n1[8], n1[10] ); |
1118 | 0 | n2[11] = _mm_sub_epi32( n1[9], n1[11] ); |
1119 | 0 | n2[12] = _mm_add_epi32( n1[12], n1[14] ); |
1120 | 0 | n2[13] = _mm_add_epi32( n1[13], n1[15] ); |
1121 | 0 | n2[14] = _mm_sub_epi32( n1[12], n1[14] ); |
1122 | 0 | n2[15] = _mm_sub_epi32( n1[13], n1[15] ); |
1123 | |
1124 | 0 | n1[0] = _mm_abs_epi32( _mm_add_epi32( n2[0], n2[1] ) ); |
1125 | 0 | n1[1] = _mm_abs_epi32( _mm_sub_epi32( n2[0], n2[1] ) ); |
1126 | 0 | n1[2] = _mm_abs_epi32( _mm_add_epi32( n2[2], n2[3] ) ); |
1127 | 0 | n1[3] = _mm_abs_epi32( _mm_sub_epi32( n2[2], n2[3] ) ); |
1128 | 0 | n1[4] = _mm_abs_epi32( _mm_add_epi32( n2[4], n2[5] ) ); |
1129 | 0 | n1[5] = _mm_abs_epi32( _mm_sub_epi32( n2[4], n2[5] ) ); |
1130 | 0 | n1[6] = _mm_abs_epi32( _mm_add_epi32( n2[6], n2[7] ) ); |
1131 | 0 | n1[7] = _mm_abs_epi32( _mm_sub_epi32( n2[6], n2[7] ) ); |
1132 | 0 | n1[8] = _mm_abs_epi32( _mm_add_epi32( n2[8], n2[9] ) ); |
1133 | 0 | n1[9] = _mm_abs_epi32( _mm_sub_epi32( n2[8], n2[9] ) ); |
1134 | 0 | n1[10] = _mm_abs_epi32( _mm_add_epi32( n2[10], n2[11] ) ); |
1135 | 0 | n1[11] = _mm_abs_epi32( _mm_sub_epi32( n2[10], n2[11] ) ); |
1136 | 0 | n1[12] = _mm_abs_epi32( _mm_add_epi32( n2[12], n2[13] ) ); |
1137 | 0 | n1[13] = _mm_abs_epi32( _mm_sub_epi32( n2[12], n2[13] ) ); |
1138 | 0 | n1[14] = _mm_abs_epi32( _mm_add_epi32( n2[14], n2[15] ) ); |
1139 | 0 | n1[15] = _mm_abs_epi32( _mm_sub_epi32( n2[14], n2[15] ) ); |
1140 | | |
1141 | 0 | if (l == 0) |
1142 | 0 | absDc = _mm_cvtsi128_si32( n1[0] ); |
1143 | | |
1144 | | // sum up |
1145 | 0 | n1[0] = _mm_add_epi32( n1[0], n1[1] ); |
1146 | 0 | n1[2] = _mm_add_epi32( n1[2], n1[3] ); |
1147 | 0 | n1[4] = _mm_add_epi32( n1[4], n1[5] ); |
1148 | 0 | n1[6] = _mm_add_epi32( n1[6], n1[7] ); |
1149 | 0 | n1[8] = _mm_add_epi32( n1[8], n1[9] ); |
1150 | 0 | n1[10] = _mm_add_epi32( n1[10], n1[11] ); |
1151 | 0 | n1[12] = _mm_add_epi32( n1[12], n1[13] ); |
1152 | 0 | n1[14] = _mm_add_epi32( n1[14], n1[15] ); |
1153 | |
1154 | 0 | n1[0] = _mm_add_epi32( n1[0], n1[2] ); |
1155 | 0 | n1[4] = _mm_add_epi32( n1[4], n1[6] ); |
1156 | 0 | n1[8] = _mm_add_epi32( n1[8], n1[10] ); |
1157 | 0 | n1[12] = _mm_add_epi32( n1[12], n1[14] ); |
1158 | |
1159 | 0 | n1[0] = _mm_add_epi32( n1[0], n1[4] ); |
1160 | 0 | n1[8] = _mm_add_epi32( n1[8], n1[12] ); |
1161 | |
1162 | 0 | n1[0] = _mm_add_epi32( n1[0], n1[8] ); |
1163 | 0 | iSum = _mm_add_epi32( iSum, n1[0] ); |
1164 | 0 | } |
1165 | |
1166 | 0 | iSum = _mm_hadd_epi32( iSum, iSum ); |
1167 | 0 | iSum = _mm_hadd_epi32( iSum, iSum ); |
1168 | |
1169 | 0 | uint32_t sad = _mm_cvtsi128_si32( iSum ); |
1170 | 0 | sad -= absDc; |
1171 | 0 | sad += absDc >> 2; |
1172 | 0 | sad = (uint32_t)(sad / sqrt(16.0 * 8) * 2); |
1173 | |
1174 | 0 | return sad; |
1175 | 0 | }
Unexecuted instantiation: RdCost_sse41.cpp:vvenc::xCalcHAD16x8_SSE(short const*, short const*, int, int, int)
Unexecuted instantiation: RdCost_avx2.cpp:vvenc::xCalcHAD16x8_SSE(short const*, short const*, int, int, int)
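// The epilogue above (subtract absDc, re-add absDc >> 2, scale by 2/sqrt(width*height))
// recurs in the 8x16, 8x4 and 4x8 kernels below: the DC coefficient is re-weighted to
// one quarter before normalization. A scalar sketch of that post-processing; the helper
// name hadNormalize is illustrative, not part of the vvenc source:

#include <cmath>
#include <cstdint>

static uint32_t hadNormalize( uint32_t sumAbs, uint32_t absDc, int width, int height )
{
  sumAbs -= absDc;        // drop the DC term at full weight
  sumAbs += absDc >> 2;   // re-add it at quarter weight
  return (uint32_t)( sumAbs / std::sqrt( (double)width * height ) * 2.0 );
}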
1176 | | |
1177 | | |
1178 | | // works for bit depths up to 12 bit
1179 | | static uint32_t xCalcHAD8x16_SSE( const Torg *piOrg, const Tcur *piCur, const int iStrideOrg, const int iStrideCur, const int iBitDepth ) |
1180 | 0 | { |
1181 | 0 | __m128i m1[2][16], m2[2][16]; |
1182 | 0 | __m128i iSum = _mm_setzero_si128(); |
1183 | |
1184 | 0 | for( int k = 0; k < 16; k++ ) |
1185 | 0 | { |
1186 | 0 | __m128i r0 = _mm_loadu_si128( (__m128i*)piOrg );
1187 | 0 | __m128i r1 = _mm_loadu_si128( (__m128i*)piCur );
1188 | 0 | m1[0][k] = _mm_sub_epi16( r0, r1 ); |
1189 | 0 | m1[1][k] = _mm_cvtepi16_epi32( _mm_srli_si128( m1[0][k], 8 ) ); |
1190 | 0 | m1[0][k] = _mm_cvtepi16_epi32( m1[0][k] ); |
1191 | 0 | piCur += iStrideCur; |
1192 | 0 | piOrg += iStrideOrg; |
1193 | 0 | } |
1194 | |
1195 | 0 | for( int i = 0; i < 2; i++ ) |
1196 | 0 | { |
1197 | | // vertical |
1198 | 0 | m2[i][ 0] = _mm_add_epi32( m1[i][ 0], m1[i][ 8] ); |
1199 | 0 | m2[i][ 1] = _mm_add_epi32( m1[i][ 1], m1[i][ 9] ); |
1200 | 0 | m2[i][ 2] = _mm_add_epi32( m1[i][ 2], m1[i][10] ); |
1201 | 0 | m2[i][ 3] = _mm_add_epi32( m1[i][ 3], m1[i][11] ); |
1202 | 0 | m2[i][ 4] = _mm_add_epi32( m1[i][ 4], m1[i][12] ); |
1203 | 0 | m2[i][ 5] = _mm_add_epi32( m1[i][ 5], m1[i][13] ); |
1204 | 0 | m2[i][ 6] = _mm_add_epi32( m1[i][ 6], m1[i][14] ); |
1205 | 0 | m2[i][ 7] = _mm_add_epi32( m1[i][ 7], m1[i][15] ); |
1206 | 0 | m2[i][ 8] = _mm_sub_epi32( m1[i][ 0], m1[i][ 8] ); |
1207 | 0 | m2[i][ 9] = _mm_sub_epi32( m1[i][ 1], m1[i][ 9] ); |
1208 | 0 | m2[i][10] = _mm_sub_epi32( m1[i][ 2], m1[i][10] ); |
1209 | 0 | m2[i][11] = _mm_sub_epi32( m1[i][ 3], m1[i][11] ); |
1210 | 0 | m2[i][12] = _mm_sub_epi32( m1[i][ 4], m1[i][12] ); |
1211 | 0 | m2[i][13] = _mm_sub_epi32( m1[i][ 5], m1[i][13] ); |
1212 | 0 | m2[i][14] = _mm_sub_epi32( m1[i][ 6], m1[i][14] ); |
1213 | 0 | m2[i][15] = _mm_sub_epi32( m1[i][ 7], m1[i][15] ); |
1214 | |
1215 | 0 | m1[i][ 0] = _mm_add_epi32( m2[i][ 0], m2[i][ 4] ); |
1216 | 0 | m1[i][ 1] = _mm_add_epi32( m2[i][ 1], m2[i][ 5] ); |
1217 | 0 | m1[i][ 2] = _mm_add_epi32( m2[i][ 2], m2[i][ 6] ); |
1218 | 0 | m1[i][ 3] = _mm_add_epi32( m2[i][ 3], m2[i][ 7] ); |
1219 | 0 | m1[i][ 4] = _mm_sub_epi32( m2[i][ 0], m2[i][ 4] ); |
1220 | 0 | m1[i][ 5] = _mm_sub_epi32( m2[i][ 1], m2[i][ 5] ); |
1221 | 0 | m1[i][ 6] = _mm_sub_epi32( m2[i][ 2], m2[i][ 6] ); |
1222 | 0 | m1[i][ 7] = _mm_sub_epi32( m2[i][ 3], m2[i][ 7] ); |
1223 | 0 | m1[i][ 8] = _mm_add_epi32( m2[i][ 8], m2[i][12] ); |
1224 | 0 | m1[i][ 9] = _mm_add_epi32( m2[i][ 9], m2[i][13] ); |
1225 | 0 | m1[i][10] = _mm_add_epi32( m2[i][10], m2[i][14] ); |
1226 | 0 | m1[i][11] = _mm_add_epi32( m2[i][11], m2[i][15] ); |
1227 | 0 | m1[i][12] = _mm_sub_epi32( m2[i][ 8], m2[i][12] ); |
1228 | 0 | m1[i][13] = _mm_sub_epi32( m2[i][ 9], m2[i][13] ); |
1229 | 0 | m1[i][14] = _mm_sub_epi32( m2[i][10], m2[i][14] ); |
1230 | 0 | m1[i][15] = _mm_sub_epi32( m2[i][11], m2[i][15] ); |
1231 | |
1232 | 0 | m2[i][ 0] = _mm_add_epi32( m1[i][ 0], m1[i][ 2] ); |
1233 | 0 | m2[i][ 1] = _mm_add_epi32( m1[i][ 1], m1[i][ 3] ); |
1234 | 0 | m2[i][ 2] = _mm_sub_epi32( m1[i][ 0], m1[i][ 2] ); |
1235 | 0 | m2[i][ 3] = _mm_sub_epi32( m1[i][ 1], m1[i][ 3] ); |
1236 | 0 | m2[i][ 4] = _mm_add_epi32( m1[i][ 4], m1[i][ 6] ); |
1237 | 0 | m2[i][ 5] = _mm_add_epi32( m1[i][ 5], m1[i][ 7] ); |
1238 | 0 | m2[i][ 6] = _mm_sub_epi32( m1[i][ 4], m1[i][ 6] ); |
1239 | 0 | m2[i][ 7] = _mm_sub_epi32( m1[i][ 5], m1[i][ 7] ); |
1240 | 0 | m2[i][ 8] = _mm_add_epi32( m1[i][ 8], m1[i][10] ); |
1241 | 0 | m2[i][ 9] = _mm_add_epi32( m1[i][ 9], m1[i][11] ); |
1242 | 0 | m2[i][10] = _mm_sub_epi32( m1[i][ 8], m1[i][10] ); |
1243 | 0 | m2[i][11] = _mm_sub_epi32( m1[i][ 9], m1[i][11] ); |
1244 | 0 | m2[i][12] = _mm_add_epi32( m1[i][12], m1[i][14] ); |
1245 | 0 | m2[i][13] = _mm_add_epi32( m1[i][13], m1[i][15] ); |
1246 | 0 | m2[i][14] = _mm_sub_epi32( m1[i][12], m1[i][14] ); |
1247 | 0 | m2[i][15] = _mm_sub_epi32( m1[i][13], m1[i][15] ); |
1248 | |
1249 | 0 | m1[i][ 0] = _mm_add_epi32( m2[i][ 0], m2[i][ 1] ); |
1250 | 0 | m1[i][ 1] = _mm_sub_epi32( m2[i][ 0], m2[i][ 1] ); |
1251 | 0 | m1[i][ 2] = _mm_add_epi32( m2[i][ 2], m2[i][ 3] ); |
1252 | 0 | m1[i][ 3] = _mm_sub_epi32( m2[i][ 2], m2[i][ 3] ); |
1253 | 0 | m1[i][ 4] = _mm_add_epi32( m2[i][ 4], m2[i][ 5] ); |
1254 | 0 | m1[i][ 5] = _mm_sub_epi32( m2[i][ 4], m2[i][ 5] ); |
1255 | 0 | m1[i][ 6] = _mm_add_epi32( m2[i][ 6], m2[i][ 7] ); |
1256 | 0 | m1[i][ 7] = _mm_sub_epi32( m2[i][ 6], m2[i][ 7] ); |
1257 | 0 | m1[i][ 8] = _mm_add_epi32( m2[i][ 8], m2[i][ 9] ); |
1258 | 0 | m1[i][ 9] = _mm_sub_epi32( m2[i][ 8], m2[i][ 9] ); |
1259 | 0 | m1[i][10] = _mm_add_epi32( m2[i][10], m2[i][11] ); |
1260 | 0 | m1[i][11] = _mm_sub_epi32( m2[i][10], m2[i][11] ); |
1261 | 0 | m1[i][12] = _mm_add_epi32( m2[i][12], m2[i][13] ); |
1262 | 0 | m1[i][13] = _mm_sub_epi32( m2[i][12], m2[i][13] ); |
1263 | 0 | m1[i][14] = _mm_add_epi32( m2[i][14], m2[i][15] ); |
1264 | 0 | m1[i][15] = _mm_sub_epi32( m2[i][14], m2[i][15] ); |
1265 | 0 | } |
1266 | | |
1267 | | // process horizontal in two steps ( 2 x 8x8 blocks ) |
1268 | |
1269 | 0 | for( int l = 0; l < 4; l++ ) |
1270 | 0 | { |
1271 | 0 | int off = l * 4; |
1272 | |
1273 | 0 | for( int i = 0; i < 2; i++ ) |
1274 | 0 | { |
1275 | | // transpose 4x4 |
1276 | 0 | m2[i][0 + off] = _mm_unpacklo_epi32( m1[i][0 + off], m1[i][1 + off] ); |
1277 | 0 | m2[i][1 + off] = _mm_unpackhi_epi32( m1[i][0 + off], m1[i][1 + off] ); |
1278 | 0 | m2[i][2 + off] = _mm_unpacklo_epi32( m1[i][2 + off], m1[i][3 + off] ); |
1279 | 0 | m2[i][3 + off] = _mm_unpackhi_epi32( m1[i][2 + off], m1[i][3 + off] ); |
1280 | |
1281 | 0 | m1[i][0 + off] = _mm_unpacklo_epi64( m2[i][0 + off], m2[i][2 + off] ); |
1282 | 0 | m1[i][1 + off] = _mm_unpackhi_epi64( m2[i][0 + off], m2[i][2 + off] ); |
1283 | 0 | m1[i][2 + off] = _mm_unpacklo_epi64( m2[i][1 + off], m2[i][3 + off] ); |
1284 | 0 | m1[i][3 + off] = _mm_unpackhi_epi64( m2[i][1 + off], m2[i][3 + off] ); |
1285 | 0 | } |
1286 | 0 | } |
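// NOTE: each pass of the loop above is the standard SSE 4x4 transpose of 32-bit lanes:
// _mm_unpacklo/_mm_unpackhi_epi32 interleave adjacent row pairs, and
// _mm_unpacklo/_mm_unpackhi_epi64 then reassemble them, so rows 0..3 of the 4x4 tile
// at offset 'off' come out as its columns.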
1287 | |
1288 | 0 | uint32_t absDc = 0; |
1289 | |
1290 | 0 | for( int l = 0; l < 2; l++ ) |
1291 | 0 | { |
1292 | 0 | int off = l * 8; |
1293 | |
1294 | 0 | __m128i n1[2][8]; |
1295 | 0 | __m128i n2[2][8]; |
1296 | |
1297 | 0 | for( int i = 0; i < 8; i++ ) |
1298 | 0 | { |
1299 | 0 | int ii = i % 4; |
1300 | 0 | int ij = i >> 2; |
1301 | |
1302 | 0 | n2[0][i] = m1[ij][off + ii ]; |
1303 | 0 | n2[1][i] = m1[ij][off + ii + 4]; |
1304 | 0 | } |
1305 | |
1306 | 0 | for( int i = 0; i < 2; i++ ) |
1307 | 0 | { |
1308 | 0 | n1[i][0] = _mm_add_epi32( n2[i][0], n2[i][4] ); |
1309 | 0 | n1[i][1] = _mm_add_epi32( n2[i][1], n2[i][5] ); |
1310 | 0 | n1[i][2] = _mm_add_epi32( n2[i][2], n2[i][6] ); |
1311 | 0 | n1[i][3] = _mm_add_epi32( n2[i][3], n2[i][7] ); |
1312 | 0 | n1[i][4] = _mm_sub_epi32( n2[i][0], n2[i][4] ); |
1313 | 0 | n1[i][5] = _mm_sub_epi32( n2[i][1], n2[i][5] ); |
1314 | 0 | n1[i][6] = _mm_sub_epi32( n2[i][2], n2[i][6] ); |
1315 | 0 | n1[i][7] = _mm_sub_epi32( n2[i][3], n2[i][7] ); |
1316 | |
1317 | 0 | n2[i][0] = _mm_add_epi32( n1[i][0], n1[i][2] ); |
1318 | 0 | n2[i][1] = _mm_add_epi32( n1[i][1], n1[i][3] ); |
1319 | 0 | n2[i][2] = _mm_sub_epi32( n1[i][0], n1[i][2] ); |
1320 | 0 | n2[i][3] = _mm_sub_epi32( n1[i][1], n1[i][3] ); |
1321 | 0 | n2[i][4] = _mm_add_epi32( n1[i][4], n1[i][6] ); |
1322 | 0 | n2[i][5] = _mm_add_epi32( n1[i][5], n1[i][7] ); |
1323 | 0 | n2[i][6] = _mm_sub_epi32( n1[i][4], n1[i][6] ); |
1324 | 0 | n2[i][7] = _mm_sub_epi32( n1[i][5], n1[i][7] ); |
1325 | |
1326 | 0 | n1[i][0] = _mm_abs_epi32( _mm_add_epi32( n2[i][0], n2[i][1] ) ); |
1327 | 0 | n1[i][1] = _mm_abs_epi32( _mm_sub_epi32( n2[i][0], n2[i][1] ) ); |
1328 | 0 | n1[i][2] = _mm_abs_epi32( _mm_add_epi32( n2[i][2], n2[i][3] ) ); |
1329 | 0 | n1[i][3] = _mm_abs_epi32( _mm_sub_epi32( n2[i][2], n2[i][3] ) ); |
1330 | 0 | n1[i][4] = _mm_abs_epi32( _mm_add_epi32( n2[i][4], n2[i][5] ) ); |
1331 | 0 | n1[i][5] = _mm_abs_epi32( _mm_sub_epi32( n2[i][4], n2[i][5] ) ); |
1332 | 0 | n1[i][6] = _mm_abs_epi32( _mm_add_epi32( n2[i][6], n2[i][7] ) ); |
1333 | 0 | n1[i][7] = _mm_abs_epi32( _mm_sub_epi32( n2[i][6], n2[i][7] ) ); |
1334 | | |
1335 | 0 | if ( l + i == 0 ) |
1336 | 0 | absDc = _mm_cvtsi128_si32( n1[i][0] ); |
1337 | 0 | } |
1338 | |
1339 | 0 | for( int i = 0; i < 8; i++ ) |
1340 | 0 | { |
1341 | 0 | n2[0][i] = _mm_add_epi32( n1[0][i], n1[1][i] ); |
1342 | 0 | } |
1343 | |
1344 | 0 | n2[0][0] = _mm_add_epi32( n2[0][0], n2[0][1] ); |
1345 | 0 | n2[0][2] = _mm_add_epi32( n2[0][2], n2[0][3] ); |
1346 | 0 | n2[0][4] = _mm_add_epi32( n2[0][4], n2[0][5] ); |
1347 | 0 | n2[0][6] = _mm_add_epi32( n2[0][6], n2[0][7] ); |
1348 | |
1349 | 0 | n2[0][0] = _mm_add_epi32( n2[0][0], n2[0][2] ); |
1350 | 0 | n2[0][4] = _mm_add_epi32( n2[0][4], n2[0][6] ); |
1351 | 0 | iSum = _mm_add_epi32( iSum, _mm_add_epi32( n2[0][0], n2[0][4] ) ); |
1352 | 0 | } |
1353 | |
1354 | 0 | iSum = _mm_hadd_epi32( iSum, iSum ); |
1355 | 0 | iSum = _mm_hadd_epi32( iSum, iSum ); |
1356 | |
1357 | 0 | uint32_t sad = _mm_cvtsi128_si32( iSum ); |
1358 | 0 | sad -= absDc; |
1359 | 0 | sad += absDc >> 2; |
1360 | 0 | sad = (uint32_t)(sad / sqrt(16.0 * 8) * 2); |
1361 | |
1362 | 0 | return sad; |
1363 | 0 | }
Unexecuted instantiation: RdCost_sse41.cpp:vvenc::xCalcHAD8x16_SSE(short const*, short const*, int, int, int)
Unexecuted instantiation: RdCost_avx2.cpp:vvenc::xCalcHAD8x16_SSE(short const*, short const*, int, int, int)
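// The add/sub cascades above are the butterfly layers of a 16-point (vertical) and
// 8-point (horizontal) Hadamard transform, kept entirely in SSE registers. As a
// reference, a scalar in-place 8-point Hadamard butterfly looks like this
// (illustrative sketch, not part of the vvenc source):

#include <cstdint>

static void hadamard8( int32_t v[8] )
{
  for( int span = 4; span >= 1; span >>= 1 )   // three butterfly layers
  {
    for( int i = 0; i < 8; i += 2 * span )
    {
      for( int j = i; j < i + span; j++ )
      {
        const int32_t a = v[j], b = v[j + span];
        v[j]        = a + b;                   // sum lane
        v[j + span] = a - b;                   // difference lane
      }
    }
  }
}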
1364 | | |
1365 | | |
1366 | | template< typename Torg, typename Tcur > |
1367 | | static uint32_t xCalcHAD8x4_SSE( const Torg *piOrg, const Tcur *piCur, const int iStrideOrg, const int iStrideCur, const int iBitDepth ) |
1368 | 0 | { |
1369 | 0 | __m128i m1[8], m2[8]; |
1370 | 0 | __m128i vzero = _mm_setzero_si128(); |
1371 | |
1372 | 0 | for( int k = 0; k < 4; k++ ) |
1373 | 0 | { |
1374 | 0 | __m128i r0 = (sizeof( Torg ) > 1) ? (_mm_loadu_si128 ( (__m128i*)piOrg )) : (_mm_unpacklo_epi8( _vv_loadl_epi64( (const __m128i*)piOrg ), _mm_setzero_si128() )); |
1375 | 0 | __m128i r1 = (sizeof( Tcur ) > 1) ? (_mm_loadu_si128( (__m128i*)piCur )) : (_mm_unpacklo_epi8( _vv_loadl_epi64( (const __m128i*)piCur ), _mm_setzero_si128() ));
1376 | 0 | m1[k] = _mm_sub_epi16( r0, r1 ); |
1377 | 0 | piCur += iStrideCur; |
1378 | 0 | piOrg += iStrideOrg; |
1379 | 0 | } |
1380 | | |
1381 | | //vertical |
1382 | 0 | m2[0] = _mm_add_epi16( m1[0], m1[2] ); |
1383 | 0 | m2[1] = _mm_add_epi16( m1[1], m1[3] ); |
1384 | 0 | m2[2] = _mm_sub_epi16( m1[0], m1[2] ); |
1385 | 0 | m2[3] = _mm_sub_epi16( m1[1], m1[3] ); |
1386 | |
1387 | 0 | m1[0] = _mm_add_epi16( m2[0], m2[1] ); |
1388 | 0 | m1[1] = _mm_sub_epi16( m2[0], m2[1] ); |
1389 | 0 | m1[2] = _mm_add_epi16( m2[2], m2[3] ); |
1390 | 0 | m1[3] = _mm_sub_epi16( m2[2], m2[3] ); |
1391 | | |
1392 | | // transpose, partially |
1393 | 0 | { |
1394 | 0 | m2[0] = _mm_unpacklo_epi16( m1[0], m1[1] ); |
1395 | 0 | m2[1] = _mm_unpacklo_epi16( m1[2], m1[3] ); |
1396 | 0 | m2[2] = _mm_unpackhi_epi16( m1[0], m1[1] ); |
1397 | 0 | m2[3] = _mm_unpackhi_epi16( m1[2], m1[3] ); |
1398 | |
1399 | 0 | m1[0] = _mm_unpacklo_epi32( m2[0], m2[1] ); |
1400 | 0 | m1[1] = _mm_unpackhi_epi32( m2[0], m2[1] ); |
1401 | 0 | m1[2] = _mm_unpacklo_epi32( m2[2], m2[3] ); |
1402 | 0 | m1[3] = _mm_unpackhi_epi32( m2[2], m2[3] ); |
1403 | 0 | } |
1404 | | |
1405 | | // horizontal |
1406 | 0 | if( iBitDepth >= 10 /*sizeof( Torg ) > 1 || sizeof( Tcur ) > 1*/ ) |
1407 | 0 | { |
1408 | | // finish transpose |
1409 | 0 | m2[0] = _mm_unpacklo_epi64( m1[0], vzero ); |
1410 | 0 | m2[1] = _mm_unpackhi_epi64( m1[0], vzero ); |
1411 | 0 | m2[2] = _mm_unpacklo_epi64( m1[1], vzero ); |
1412 | 0 | m2[3] = _mm_unpackhi_epi64( m1[1], vzero ); |
1413 | 0 | m2[4] = _mm_unpacklo_epi64( m1[2], vzero ); |
1414 | 0 | m2[5] = _mm_unpackhi_epi64( m1[2], vzero ); |
1415 | 0 | m2[6] = _mm_unpacklo_epi64( m1[3], vzero ); |
1416 | 0 | m2[7] = _mm_unpackhi_epi64( m1[3], vzero ); |
1417 | |
1418 | 0 | for( int i = 0; i < 8; i++ ) |
1419 | 0 | { |
1420 | 0 | m2[i] = _mm_cvtepi16_epi32( m2[i] ); |
1421 | 0 | } |
1422 | |
1423 | 0 | m1[0] = _mm_add_epi32( m2[0], m2[4] ); |
1424 | 0 | m1[1] = _mm_add_epi32( m2[1], m2[5] ); |
1425 | 0 | m1[2] = _mm_add_epi32( m2[2], m2[6] ); |
1426 | 0 | m1[3] = _mm_add_epi32( m2[3], m2[7] ); |
1427 | 0 | m1[4] = _mm_sub_epi32( m2[0], m2[4] ); |
1428 | 0 | m1[5] = _mm_sub_epi32( m2[1], m2[5] ); |
1429 | 0 | m1[6] = _mm_sub_epi32( m2[2], m2[6] ); |
1430 | 0 | m1[7] = _mm_sub_epi32( m2[3], m2[7] ); |
1431 | |
1432 | 0 | m2[0] = _mm_add_epi32( m1[0], m1[2] ); |
1433 | 0 | m2[1] = _mm_add_epi32( m1[1], m1[3] ); |
1434 | 0 | m2[2] = _mm_sub_epi32( m1[0], m1[2] ); |
1435 | 0 | m2[3] = _mm_sub_epi32( m1[1], m1[3] ); |
1436 | 0 | m2[4] = _mm_add_epi32( m1[4], m1[6] ); |
1437 | 0 | m2[5] = _mm_add_epi32( m1[5], m1[7] ); |
1438 | 0 | m2[6] = _mm_sub_epi32( m1[4], m1[6] ); |
1439 | 0 | m2[7] = _mm_sub_epi32( m1[5], m1[7] ); |
1440 | |
1441 | 0 | m1[0] = _mm_abs_epi32( _mm_add_epi32( m2[0], m2[1] ) ); |
1442 | 0 | m1[1] = _mm_abs_epi32( _mm_sub_epi32( m2[0], m2[1] ) ); |
1443 | 0 | m1[2] = _mm_abs_epi32( _mm_add_epi32( m2[2], m2[3] ) ); |
1444 | 0 | m1[3] = _mm_abs_epi32( _mm_sub_epi32( m2[2], m2[3] ) ); |
1445 | 0 | m1[4] = _mm_abs_epi32( _mm_add_epi32( m2[4], m2[5] ) ); |
1446 | 0 | m1[5] = _mm_abs_epi32( _mm_sub_epi32( m2[4], m2[5] ) ); |
1447 | 0 | m1[6] = _mm_abs_epi32( _mm_add_epi32( m2[6], m2[7] ) ); |
1448 | 0 | m1[7] = _mm_abs_epi32( _mm_sub_epi32( m2[6], m2[7] ) ); |
1449 | 0 | } |
1450 | 0 | else |
1451 | 0 | { |
1452 | 0 | m2[0] = _mm_add_epi16( m1[0], m1[2] ); |
1453 | 0 | m2[1] = _mm_add_epi16( m1[1], m1[3] ); |
1454 | 0 | m2[2] = _mm_sub_epi16( m1[0], m1[2] ); |
1455 | 0 | m2[3] = _mm_sub_epi16( m1[1], m1[3] ); |
1456 | |
1457 | 0 | m1[0] = _mm_add_epi16( m2[0], m2[1] ); |
1458 | 0 | m1[1] = _mm_sub_epi16( m2[0], m2[1] ); |
1459 | 0 | m1[2] = _mm_add_epi16( m2[2], m2[3] ); |
1460 | 0 | m1[3] = _mm_sub_epi16( m2[2], m2[3] ); |
1461 | | |
1462 | | // finish transpose |
1463 | 0 | m2[0] = _mm_unpacklo_epi64( m1[0], vzero ); |
1464 | 0 | m2[1] = _mm_unpackhi_epi64( m1[0], vzero ); |
1465 | 0 | m2[2] = _mm_unpacklo_epi64( m1[1], vzero ); |
1466 | 0 | m2[3] = _mm_unpackhi_epi64( m1[1], vzero ); |
1467 | 0 | m2[4] = _mm_unpacklo_epi64( m1[2], vzero ); |
1468 | 0 | m2[5] = _mm_unpackhi_epi64( m1[2], vzero ); |
1469 | 0 | m2[6] = _mm_unpacklo_epi64( m1[3], vzero ); |
1470 | 0 | m2[7] = _mm_unpackhi_epi64( m1[3], vzero ); |
1471 | |
1472 | 0 | m1[0] = _mm_abs_epi16( _mm_add_epi16( m2[0], m2[1] ) ); |
1473 | 0 | m1[1] = _mm_abs_epi16( _mm_sub_epi16( m2[0], m2[1] ) ); |
1474 | 0 | m1[2] = _mm_abs_epi16( _mm_add_epi16( m2[2], m2[3] ) ); |
1475 | 0 | m1[3] = _mm_abs_epi16( _mm_sub_epi16( m2[2], m2[3] ) ); |
1476 | 0 | m1[4] = _mm_abs_epi16( _mm_add_epi16( m2[4], m2[5] ) ); |
1477 | 0 | m1[5] = _mm_abs_epi16( _mm_sub_epi16( m2[4], m2[5] ) ); |
1478 | 0 | m1[6] = _mm_abs_epi16( _mm_add_epi16( m2[6], m2[7] ) ); |
1479 | 0 | m1[7] = _mm_abs_epi16( _mm_sub_epi16( m2[6], m2[7] ) ); |
1480 | |
1481 | 0 | for( int i = 0; i < 8; i++ ) |
1482 | 0 | { |
1483 | 0 | m1[i] = _mm_unpacklo_epi16( m1[i], vzero ); |
1484 | 0 | } |
1485 | 0 | } |
1486 | | |
1487 | 0 | uint32_t absDc = _mm_cvtsi128_si32( m1[0] ); |
1488 | |
1489 | 0 | m1[0] = _mm_add_epi32( m1[0], m1[1] ); |
1490 | 0 | m1[1] = _mm_add_epi32( m1[2], m1[3] ); |
1491 | 0 | m1[2] = _mm_add_epi32( m1[4], m1[5] ); |
1492 | 0 | m1[3] = _mm_add_epi32( m1[6], m1[7] ); |
1493 | |
1494 | 0 | m1[0] = _mm_add_epi32( m1[0], m1[1] ); |
1495 | 0 | m1[1] = _mm_add_epi32( m1[2], m1[3] ); |
1496 | |
1497 | 0 | __m128i iSum = _mm_add_epi32( m1[0], m1[1] ); |
1498 | |
1499 | 0 | iSum = _mm_hadd_epi32( iSum, iSum ); |
1500 | 0 | iSum = _mm_hadd_epi32( iSum, iSum ); |
1501 | |
1502 | 0 | uint32_t sad = _mm_cvtsi128_si32( iSum ); |
1503 | 0 | sad -= absDc; |
1504 | 0 | sad += absDc >> 2; |
1505 | 0 | sad = (uint32_t)(sad / sqrt(4.0 * 8) * 2); |
1506 | 0 | return sad; |
1507 | 0 | }
Unexecuted instantiation: RdCost_sse41.cpp:unsigned int vvenc::xCalcHAD8x4_SSE<short, short>(short const*, short const*, int, int, int)
Unexecuted instantiation: RdCost_avx2.cpp:unsigned int vvenc::xCalcHAD8x4_SSE<short, short>(short const*, short const*, int, int, int)
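// The iBitDepth >= 10 branch above exists because a pixel difference needs
// bitDepth+1 bits and every add/sub butterfly layer can add one more bit; for an
// 8x4 block there are 2 vertical + 3 horizontal layers. A rough compile-time check
// of that headroom argument (illustrative only, not part of the vvenc source):

constexpr int worstCaseBits( int bitDepth, int addSubLayers )
{
  return ( bitDepth + 1 ) + addSubLayers;   // difference range + butterfly growth
}
static_assert( worstCaseBits(  8, 5 ) <= 15, "8-bit samples fit signed 16-bit lanes" );
static_assert( worstCaseBits( 10, 5 ) >  15, "10-bit samples need 32-bit lanes" );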
1508 | | |
1509 | | static uint32_t xCalcHAD4x8_SSE( const Torg *piOrg, const Tcur *piCur, const int iStrideOrg, const int iStrideCur, const int iBitDepth ) |
1510 | 0 | { |
1511 | 0 | __m128i m1[8], m2[8]; |
1512 | |
1513 | 0 | for( int k = 0; k < 8; k++ ) |
1514 | 0 | { |
1515 | 0 | __m128i r0 = (sizeof( Torg ) > 1) ? (_vv_loadl_epi64( (__m128i*)piOrg )) : (_mm_cvtsi32_si128( *(const int*)piOrg )); |
1516 | 0 | __m128i r1 = (sizeof( Tcur ) > 1) ? (_vv_loadl_epi64( (__m128i*)piCur )) : (_mm_cvtsi32_si128( *(const int*)piCur )); |
1517 | 0 | m2[k] = _mm_sub_epi16( r0, r1 ); |
1518 | 0 | piCur += iStrideCur; |
1519 | 0 | piOrg += iStrideOrg; |
1520 | 0 | } |
1521 | | |
1522 | | |
1523 | | // vertical |
1524 | |
1525 | 0 | m1[0] = _mm_add_epi16( m2[0], m2[4] ); |
1526 | 0 | m1[1] = _mm_add_epi16( m2[1], m2[5] ); |
1527 | 0 | m1[2] = _mm_add_epi16( m2[2], m2[6] ); |
1528 | 0 | m1[3] = _mm_add_epi16( m2[3], m2[7] ); |
1529 | 0 | m1[4] = _mm_sub_epi16( m2[0], m2[4] ); |
1530 | 0 | m1[5] = _mm_sub_epi16( m2[1], m2[5] ); |
1531 | 0 | m1[6] = _mm_sub_epi16( m2[2], m2[6] ); |
1532 | 0 | m1[7] = _mm_sub_epi16( m2[3], m2[7] ); |
1533 | |
1534 | 0 | m2[0] = _mm_add_epi16( m1[0], m1[2] ); |
1535 | 0 | m2[1] = _mm_add_epi16( m1[1], m1[3] ); |
1536 | 0 | m2[2] = _mm_sub_epi16( m1[0], m1[2] ); |
1537 | 0 | m2[3] = _mm_sub_epi16( m1[1], m1[3] ); |
1538 | 0 | m2[4] = _mm_add_epi16( m1[4], m1[6] ); |
1539 | 0 | m2[5] = _mm_add_epi16( m1[5], m1[7] ); |
1540 | 0 | m2[6] = _mm_sub_epi16( m1[4], m1[6] ); |
1541 | 0 | m2[7] = _mm_sub_epi16( m1[5], m1[7] ); |
1542 | |
1543 | 0 | m1[0] = _mm_add_epi16( m2[0], m2[1] ); |
1544 | 0 | m1[1] = _mm_sub_epi16( m2[0], m2[1] ); |
1545 | 0 | m1[2] = _mm_add_epi16( m2[2], m2[3] ); |
1546 | 0 | m1[3] = _mm_sub_epi16( m2[2], m2[3] ); |
1547 | 0 | m1[4] = _mm_add_epi16( m2[4], m2[5] ); |
1548 | 0 | m1[5] = _mm_sub_epi16( m2[4], m2[5] ); |
1549 | 0 | m1[6] = _mm_add_epi16( m2[6], m2[7] ); |
1550 | 0 | m1[7] = _mm_sub_epi16( m2[6], m2[7] ); |
1551 | | |
1552 | | |
1553 | | // horizontal |
1554 | | // transpose |
1555 | 0 | { |
1556 | 0 | m2[0] = _mm_unpacklo_epi16( m1[0], m1[1] ); |
1557 | 0 | m2[1] = _mm_unpacklo_epi16( m1[2], m1[3] ); |
1558 | 0 | m2[2] = _mm_unpacklo_epi16( m1[4], m1[5] ); |
1559 | 0 | m2[3] = _mm_unpacklo_epi16( m1[6], m1[7] ); |
1560 | |
1561 | 0 | m1[0] = _mm_unpacklo_epi32( m2[0], m2[1] ); |
1562 | 0 | m1[1] = _mm_unpackhi_epi32( m2[0], m2[1] ); |
1563 | 0 | m1[2] = _mm_unpacklo_epi32( m2[2], m2[3] ); |
1564 | 0 | m1[3] = _mm_unpackhi_epi32( m2[2], m2[3] ); |
1565 | |
1566 | 0 | m2[0] = _mm_unpacklo_epi64( m1[0], m1[2] ); |
1567 | 0 | m2[1] = _mm_unpackhi_epi64( m1[0], m1[2] ); |
1568 | 0 | m2[2] = _mm_unpacklo_epi64( m1[1], m1[3] ); |
1569 | 0 | m2[3] = _mm_unpackhi_epi64( m1[1], m1[3] ); |
1570 | 0 | } |
1571 | |
1572 | 0 | uint32_t absDc = 0; |
1573 | |
1574 | 0 | if( iBitDepth >= 10 /*sizeof( Torg ) > 1 || sizeof( Tcur ) > 1*/ ) |
1575 | 0 | { |
1576 | 0 | __m128i n1[4][2]; |
1577 | 0 | __m128i n2[4][2]; |
1578 | |
1579 | 0 | for( int i = 0; i < 4; i++ ) |
1580 | 0 | { |
1581 | 0 | n1[i][0] = _mm_cvtepi16_epi32( m2[i] ); |
1582 | 0 | n1[i][1] = _mm_cvtepi16_epi32( _mm_shuffle_epi32( m2[i], 0xEE ) ); |
1583 | 0 | } |
1584 | |
1585 | 0 | for( int i = 0; i < 2; i++ ) |
1586 | 0 | { |
1587 | 0 | n2[0][i] = _mm_add_epi32( n1[0][i], n1[2][i] ); |
1588 | 0 | n2[1][i] = _mm_add_epi32( n1[1][i], n1[3][i] ); |
1589 | 0 | n2[2][i] = _mm_sub_epi32( n1[0][i], n1[2][i] ); |
1590 | 0 | n2[3][i] = _mm_sub_epi32( n1[1][i], n1[3][i] ); |
1591 | |
1592 | 0 | n1[0][i] = _mm_abs_epi32( _mm_add_epi32( n2[0][i], n2[1][i] ) ); |
1593 | 0 | n1[1][i] = _mm_abs_epi32( _mm_sub_epi32( n2[0][i], n2[1][i] ) ); |
1594 | 0 | n1[2][i] = _mm_abs_epi32( _mm_add_epi32( n2[2][i], n2[3][i] ) ); |
1595 | 0 | n1[3][i] = _mm_abs_epi32( _mm_sub_epi32( n2[2][i], n2[3][i] ) ); |
1596 | 0 | } |
1597 | 0 | for( int i = 0; i < 4; i++ ) |
1598 | 0 | { |
1599 | 0 | m1[i] = _mm_add_epi32( n1[i][0], n1[i][1] ); |
1600 | 0 | } |
1601 | |
1602 | 0 | absDc = _mm_cvtsi128_si32( n1[0][0] ); |
1603 | 0 | } |
1604 | 0 | else |
1605 | 0 | { |
1606 | 0 | m1[0] = _mm_add_epi16( m2[0], m2[2] ); |
1607 | 0 | m1[1] = _mm_add_epi16( m2[1], m2[3] ); |
1608 | 0 | m1[2] = _mm_sub_epi16( m2[0], m2[2] ); |
1609 | 0 | m1[3] = _mm_sub_epi16( m2[1], m2[3] ); |
1610 | |
1611 | 0 | m2[0] = _mm_abs_epi16( _mm_add_epi16( m1[0], m1[1] ) ); |
1612 | 0 | m2[1] = _mm_abs_epi16( _mm_sub_epi16( m1[0], m1[1] ) ); |
1613 | 0 | m2[2] = _mm_abs_epi16( _mm_add_epi16( m1[2], m1[3] ) ); |
1614 | 0 | m2[3] = _mm_abs_epi16( _mm_sub_epi16( m1[2], m1[3] ) ); |
1615 | |
1616 | 0 | __m128i ma1, ma2; |
1617 | 0 | __m128i vzero = _mm_setzero_si128(); |
1618 | |
1619 | 0 | for( int i = 0; i < 4; i++ ) |
1620 | 0 | { |
1621 | 0 | ma1 = _mm_unpacklo_epi16( m2[i], vzero ); |
1622 | 0 | ma2 = _mm_unpackhi_epi16( m2[i], vzero ); |
1623 | 0 | m1[i] = _mm_add_epi32( ma1, ma2 ); |
1624 | 0 | } |
1625 | |
1626 | 0 | absDc = _mm_cvtsi128_si32( m2[0] ) & 0x0000ffff; |
1627 | 0 | } |
1628 | |
1629 | 0 | m1[0] = _mm_add_epi32( m1[0], m1[1] ); |
1630 | 0 | m1[2] = _mm_add_epi32( m1[2], m1[3] ); |
1631 | |
1632 | 0 | __m128i iSum = _mm_add_epi32( m1[0], m1[2] ); |
1633 | |
1634 | 0 | iSum = _mm_hadd_epi32( iSum, iSum ); |
1635 | 0 | iSum = _mm_hadd_epi32( iSum, iSum ); |
1636 | |
1637 | 0 | uint32_t sad = _mm_cvtsi128_si32( iSum ); |
1638 | | |
1639 | 0 | sad -= absDc; |
1640 | 0 | sad += absDc >> 2; |
1641 | 0 | sad = (uint32_t)(sad / sqrt(4.0 * 8) * 2); |
1642 | |
1643 | 0 | return sad; |
1644 | 0 | }
Unexecuted instantiation: RdCost_sse41.cpp:vvenc::xCalcHAD4x8_SSE(short const*, short const*, int, int, int)
Unexecuted instantiation: RdCost_avx2.cpp:vvenc::xCalcHAD4x8_SSE(short const*, short const*, int, int, int)
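// In the 16-bit branch above the absolute DC value occupies only the lowest
// 16-bit lane of m2[0], while _mm_cvtsi128_si32 reads the lowest 32 bits (two
// lanes), hence the & 0x0000ffff mask; the 32-bit branch reads a whole lane and
// needs none. A minimal sketch of the same extraction (illustrative helper):

#include <emmintrin.h>
#include <cstdint>

static uint32_t lowLane16( __m128i v )
{
  return (uint32_t)_mm_cvtsi128_si32( v ) & 0x0000ffffu;  // keep 16-bit lane 0 only
}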
1645 | | |
1646 | | static uint32_t xCalcHAD32x32_fast_AVX2( const Torg *piOrg, const Tcur *piCur, const int iStrideOrg, const int iStrideCur, const int iBitDepth ) |
1647 | 0 | { |
1648 | 0 | uint32_t sad = 0; |
1649 | |
1650 | 0 | #ifdef USE_AVX2 |
1651 | 0 | const int iLoops = 2; |
1652 | 0 | __m256i m1[2][8], m2[2][8]; |
1653 | |
1654 | 0 | CHECK( iBitDepth > 10, "Only bitdepths up to 10 supported!" ); |
1655 | |
1656 | 0 | for( int l = 0; l < iLoops; l++ ) |
1657 | 0 | { |
1658 | 0 | for( int k = 0; k < 8; k++ ) |
1659 | 0 | { |
1660 | 0 | __m256i r0 = _mm256_loadu_si256( ( __m256i* ) piOrg ); |
1661 | 0 | __m256i r1 = _mm256_loadu_si256( ( __m256i* ) piCur ); |
1662 | 0 | __m256i r2 = _mm256_loadu_si256( ( __m256i* ) ( piOrg + iStrideOrg ) ); |
1663 | 0 | __m256i r3 = _mm256_loadu_si256( ( __m256i* ) ( piCur + iStrideCur ) ); |
1664 | |
1665 | 0 | r0 = _mm256_add_epi16( r0, r2 ); |
1666 | 0 | r1 = _mm256_add_epi16( r1, r3 ); |
1667 | |
1668 | 0 | __m256i r4 = _mm256_loadu_si256( ( __m256i* ) ( piOrg + 16 ) ); |
1669 | 0 | __m256i r5 = _mm256_loadu_si256( ( __m256i* ) ( piCur + 16 ) ); |
1670 | 0 | r2 = _mm256_loadu_si256( ( __m256i* ) ( piOrg + iStrideOrg + 16 ) ); |
1671 | 0 | r3 = _mm256_loadu_si256( ( __m256i* ) ( piCur + iStrideCur + 16 ) ); |
1672 | |
1673 | 0 | r2 = _mm256_add_epi16( r4, r2 ); |
1674 | 0 | r3 = _mm256_add_epi16( r5, r3 ); |
1675 | |
1676 | 0 | r0 = _mm256_hadd_epi16( r0, r2 ); |
1677 | 0 | r1 = _mm256_hadd_epi16( r1, r3 ); |
1678 | |
1679 | 0 | r0 = _mm256_add_epi16( r0, _mm256_set1_epi16( 2 ) ); |
1680 | 0 | r1 = _mm256_add_epi16( r1, _mm256_set1_epi16( 2 ) ); |
1681 | |
1682 | 0 | r0 = _mm256_srai_epi16( r0, 2 ); |
1683 | 0 | r1 = _mm256_srai_epi16( r1, 2 ); |
1684 | |
1685 | 0 | m2[0][k] = _mm256_permute4x64_epi64( _mm256_sub_epi16( r0, r1 ), 0 + ( 2 << 2 ) + ( 1 << 4 ) + ( 3 << 6 ) ); // 11 bit |
1686 | | //m2[1][k] = _mm256_cvtepi16_epi32( _mm256_extracti128_si256( m2[0][k], 1 ) ); |
1687 | | //m2[0][k] = _mm256_cvtepi16_epi32( _mm256_castsi256_si128( m2[0][k] ) ); |
1688 | 0 | piCur += iStrideCur * 2; |
1689 | 0 | piOrg += iStrideOrg * 2; |
1690 | 0 | } |
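// NOTE: _mm256_hadd_epi16 pairs values within each 128-bit lane, so after the two
// horizontal adds the four averaged 8-column groups sit in 64-bit quarters in the
// order q0,q2,q1,q3; the _mm256_permute4x64_epi64 selector 0 + (2 << 2) + (1 << 4)
// + (3 << 6) above restores raster order q0,q1,q2,q3.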
1691 | |
1692 | 0 | m1[0][0] = _mm256_add_epi16( m2[0][0], m2[0][4] ); |
1693 | 0 | m1[0][1] = _mm256_add_epi16( m2[0][1], m2[0][5] ); |
1694 | 0 | m1[0][2] = _mm256_add_epi16( m2[0][2], m2[0][6] ); |
1695 | 0 | m1[0][3] = _mm256_add_epi16( m2[0][3], m2[0][7] ); |
1696 | 0 | m1[0][4] = _mm256_sub_epi16( m2[0][0], m2[0][4] ); |
1697 | 0 | m1[0][5] = _mm256_sub_epi16( m2[0][1], m2[0][5] ); |
1698 | 0 | m1[0][6] = _mm256_sub_epi16( m2[0][2], m2[0][6] ); |
1699 | 0 | m1[0][7] = _mm256_sub_epi16( m2[0][3], m2[0][7] ); // 12 bit |
1700 | |
1701 | 0 | m2[0][0] = _mm256_add_epi16( m1[0][0], m1[0][2] ); |
1702 | 0 | m2[0][1] = _mm256_add_epi16( m1[0][1], m1[0][3] ); |
1703 | 0 | m2[0][2] = _mm256_sub_epi16( m1[0][0], m1[0][2] ); |
1704 | 0 | m2[0][3] = _mm256_sub_epi16( m1[0][1], m1[0][3] ); |
1705 | 0 | m2[0][4] = _mm256_add_epi16( m1[0][4], m1[0][6] ); |
1706 | 0 | m2[0][5] = _mm256_add_epi16( m1[0][5], m1[0][7] ); |
1707 | 0 | m2[0][6] = _mm256_sub_epi16( m1[0][4], m1[0][6] ); |
1708 | 0 | m2[0][7] = _mm256_sub_epi16( m1[0][5], m1[0][7] ); // 13 bit |
1709 | |
1710 | 0 | m1[0][0] = _mm256_add_epi16( m2[0][0], m2[0][1] ); |
1711 | 0 | m1[0][1] = _mm256_sub_epi16( m2[0][0], m2[0][1] ); |
1712 | 0 | m1[0][2] = _mm256_add_epi16( m2[0][2], m2[0][3] ); |
1713 | 0 | m1[0][3] = _mm256_sub_epi16( m2[0][2], m2[0][3] ); |
1714 | 0 | m1[0][4] = _mm256_add_epi16( m2[0][4], m2[0][5] ); |
1715 | 0 | m1[0][5] = _mm256_sub_epi16( m2[0][4], m2[0][5] ); |
1716 | 0 | m1[0][6] = _mm256_add_epi16( m2[0][6], m2[0][7] ); |
1717 | 0 | m1[0][7] = _mm256_sub_epi16( m2[0][6], m2[0][7] ); // 14 bit |
1718 | | |
1719 | | // transpose |
1720 | | // 8x8 |
1721 | 0 | m2[0][0] = _mm256_unpacklo_epi16( m1[0][0], m1[0][1] ); |
1722 | 0 | m2[0][1] = _mm256_unpacklo_epi16( m1[0][2], m1[0][3] ); |
1723 | 0 | m2[0][2] = _mm256_unpacklo_epi16( m1[0][4], m1[0][5] ); |
1724 | 0 | m2[0][3] = _mm256_unpacklo_epi16( m1[0][6], m1[0][7] ); |
1725 | 0 | m2[0][4] = _mm256_unpackhi_epi16( m1[0][0], m1[0][1] ); |
1726 | 0 | m2[0][5] = _mm256_unpackhi_epi16( m1[0][2], m1[0][3] ); |
1727 | 0 | m2[0][6] = _mm256_unpackhi_epi16( m1[0][4], m1[0][5] ); |
1728 | 0 | m2[0][7] = _mm256_unpackhi_epi16( m1[0][6], m1[0][7] ); |
1729 | |
1730 | 0 | m1[0][0] = _mm256_unpacklo_epi32( m2[0][0], m2[0][1] ); |
1731 | 0 | m1[0][1] = _mm256_unpackhi_epi32( m2[0][0], m2[0][1] ); |
1732 | 0 | m1[0][2] = _mm256_unpacklo_epi32( m2[0][2], m2[0][3] ); |
1733 | 0 | m1[0][3] = _mm256_unpackhi_epi32( m2[0][2], m2[0][3] ); |
1734 | 0 | m1[0][4] = _mm256_unpacklo_epi32( m2[0][4], m2[0][5] ); |
1735 | 0 | m1[0][5] = _mm256_unpackhi_epi32( m2[0][4], m2[0][5] ); |
1736 | 0 | m1[0][6] = _mm256_unpacklo_epi32( m2[0][6], m2[0][7] ); |
1737 | 0 | m1[0][7] = _mm256_unpackhi_epi32( m2[0][6], m2[0][7] ); |
1738 | |
1739 | 0 | m2[0][0] = _mm256_unpacklo_epi64( m1[0][0], m1[0][2] ); |
1740 | 0 | m2[0][1] = _mm256_unpackhi_epi64( m1[0][0], m1[0][2] ); |
1741 | 0 | m2[0][2] = _mm256_unpacklo_epi64( m1[0][1], m1[0][3] ); |
1742 | 0 | m2[0][3] = _mm256_unpackhi_epi64( m1[0][1], m1[0][3] ); |
1743 | 0 | m2[0][4] = _mm256_unpacklo_epi64( m1[0][4], m1[0][6] ); |
1744 | 0 | m2[0][5] = _mm256_unpackhi_epi64( m1[0][4], m1[0][6] ); |
1745 | 0 | m2[0][6] = _mm256_unpacklo_epi64( m1[0][5], m1[0][7] ); |
1746 | 0 | m2[0][7] = _mm256_unpackhi_epi64( m1[0][5], m1[0][7] ); |
1747 | |
1748 | 0 | __m256i vzero = _mm256_setzero_si256(); |
1749 | 0 | __m256i vtmp; |
1750 | |
1751 | 0 | #define UNPACKX(x) \ |
1752 | 0 | vtmp = _mm256_cmpgt_epi16( vzero, m2[0][x] ); \ |
1753 | 0 | m1[0][x] = _mm256_unpacklo_epi16( m2[0][x], vtmp ); \ |
1754 | 0 | m1[1][x] = _mm256_unpackhi_epi16( m2[0][x], vtmp ); |
1755 | |
1756 | 0 | UNPACKX( 0 ); |
1757 | 0 | UNPACKX( 1 ); |
1758 | 0 | UNPACKX( 2 ); |
1759 | 0 | UNPACKX( 3 ); |
1760 | 0 | UNPACKX( 4 ); |
1761 | 0 | UNPACKX( 5 ); |
1762 | 0 | UNPACKX( 6 ); |
1763 | 0 | UNPACKX( 7 ); |
1764 | |
1765 | 0 | #undef UNPACKX |
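// NOTE: UNPACKX is a manual sign extension from 16- to 32-bit lanes:
// _mm256_cmpgt_epi16( vzero, x ) yields 0xFFFF for negative lanes and 0x0000
// otherwise, and interleaving x with that mask via unpacklo/unpackhi is
// equivalent per lane to (int32_t)(int16_t)lane, without leaving the registers.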
1766 | |
1767 | 0 | for( int i = 0; i < 2; i++ ) |
1768 | 0 | { |
1769 | 0 | m2[i][0] = _mm256_add_epi32( m1[i][0], m1[i][4] ); |
1770 | 0 | m2[i][1] = _mm256_add_epi32( m1[i][1], m1[i][5] ); |
1771 | 0 | m2[i][2] = _mm256_add_epi32( m1[i][2], m1[i][6] ); |
1772 | 0 | m2[i][3] = _mm256_add_epi32( m1[i][3], m1[i][7] ); |
1773 | 0 | m2[i][4] = _mm256_sub_epi32( m1[i][0], m1[i][4] ); |
1774 | 0 | m2[i][5] = _mm256_sub_epi32( m1[i][1], m1[i][5] ); |
1775 | 0 | m2[i][6] = _mm256_sub_epi32( m1[i][2], m1[i][6] ); |
1776 | 0 | m2[i][7] = _mm256_sub_epi32( m1[i][3], m1[i][7] ); |
1777 | |
1778 | 0 | m1[i][0] = _mm256_add_epi32( m2[i][0], m2[i][2] ); |
1779 | 0 | m1[i][1] = _mm256_add_epi32( m2[i][1], m2[i][3] ); |
1780 | 0 | m1[i][2] = _mm256_sub_epi32( m2[i][0], m2[i][2] ); |
1781 | 0 | m1[i][3] = _mm256_sub_epi32( m2[i][1], m2[i][3] ); |
1782 | 0 | m1[i][4] = _mm256_add_epi32( m2[i][4], m2[i][6] ); |
1783 | 0 | m1[i][5] = _mm256_add_epi32( m2[i][5], m2[i][7] ); |
1784 | 0 | m1[i][6] = _mm256_sub_epi32( m2[i][4], m2[i][6] ); |
1785 | 0 | m1[i][7] = _mm256_sub_epi32( m2[i][5], m2[i][7] ); |
1786 | |
1787 | 0 | m2[i][0] = _mm256_abs_epi32( _mm256_add_epi32( m1[i][0], m1[i][1] ) ); |
1788 | 0 | m2[i][1] = _mm256_abs_epi32( _mm256_sub_epi32( m1[i][0], m1[i][1] ) ); |
1789 | 0 | m2[i][2] = _mm256_abs_epi32( _mm256_add_epi32( m1[i][2], m1[i][3] ) ); |
1790 | 0 | m2[i][3] = _mm256_abs_epi32( _mm256_sub_epi32( m1[i][2], m1[i][3] ) ); |
1791 | 0 | m2[i][4] = _mm256_abs_epi32( _mm256_add_epi32( m1[i][4], m1[i][5] ) ); |
1792 | 0 | m2[i][5] = _mm256_abs_epi32( _mm256_sub_epi32( m1[i][4], m1[i][5] ) ); |
1793 | 0 | m2[i][6] = _mm256_abs_epi32( _mm256_add_epi32( m1[i][6], m1[i][7] ) ); |
1794 | 0 | m2[i][7] = _mm256_abs_epi32( _mm256_sub_epi32( m1[i][6], m1[i][7] ) ); |
1795 | 0 | } |
1796 | |
1797 | 0 | uint32_t absDc0 = _mm_cvtsi128_si32( _mm256_castsi256_si128( m2[0][0] ) ); |
1798 | 0 | uint32_t absDc1 = _mm_cvtsi128_si32( _mm256_castsi256_si128( _mm256_permute2x128_si256( m2[0][0], m2[0][0], 0x11 ) ) ); |
1799 | |
1800 | 0 | for( int i = 0; i < 8; i++ ) |
1801 | 0 | { |
1802 | 0 | m1[0][i] = _mm256_add_epi32( m2[0][i], m2[1][i] ); |
1803 | 0 | } |
1804 | |
1805 | 0 | m1[0][0] = _mm256_add_epi32( m1[0][0], m1[0][1] ); |
1806 | 0 | m1[0][2] = _mm256_add_epi32( m1[0][2], m1[0][3] ); |
1807 | 0 | m1[0][4] = _mm256_add_epi32( m1[0][4], m1[0][5] ); |
1808 | 0 | m1[0][6] = _mm256_add_epi32( m1[0][6], m1[0][7] ); |
1809 | |
1810 | 0 | m1[0][0] = _mm256_add_epi32( m1[0][0], m1[0][2] ); |
1811 | 0 | m1[0][4] = _mm256_add_epi32( m1[0][4], m1[0][6] ); |
1812 | |
1813 | 0 | __m256i iSum = _mm256_add_epi32( m1[0][0], m1[0][4] ); |
1814 | |
1815 | 0 | iSum = _mm256_hadd_epi32( iSum, iSum ); |
1816 | 0 | iSum = _mm256_hadd_epi32( iSum, iSum ); |
1817 | |
1818 | 0 | uint32_t tmp; |
1819 | 0 | tmp = _mm_cvtsi128_si32( _mm256_castsi256_si128( iSum ) ); |
1820 | | // 16x16 block is done by adding together 4 8x8 SATDs |
1821 | 0 | tmp -= absDc0; |
1822 | 0 | tmp += absDc0 >> 2; |
1823 | 0 | tmp = ( ( tmp + 2 ) >> 2 ); |
1824 | 0 | sad += tmp; |
1825 | |
1826 | 0 | tmp = _mm_cvtsi128_si32( _mm256_castsi256_si128( _mm256_permute2x128_si256( iSum, iSum, 0x11 ) ) ); |
1827 | | // 16x16 block is done by adding together 4 8x8 SATDs |
1828 | 0 | tmp -= absDc1; |
1829 | 0 | tmp += absDc1 >> 2; |
1830 | 0 | tmp = ( ( tmp + 2 ) >> 2 ); |
1831 | 0 | sad += tmp; |
1832 | 0 | } |
1833 | |
1834 | 0 | #endif |
1835 | 0 | return ( sad << 2 ); |
1836 | 0 | }
Unexecuted instantiation: RdCost_sse41.cpp:vvenc::xCalcHAD32x32_fast_AVX2(short const*, short const*, int, int, int)
Unexecuted instantiation: RdCost_avx2.cpp:vvenc::xCalcHAD32x32_fast_AVX2(short const*, short const*, int, int, int)
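// The "fast" 32x32 kernel never transforms the full block: each 2x2 quad of org
// and cur is first reduced to its rounded average (the add/hadd/+2/>>2 sequence
// above), the resulting 16x16 proxy is measured like a 16x16 HAD, and the cost is
// scaled back with "sad << 2". A scalar sketch of the downsample step (names are
// illustrative, not vvenc API):

#include <cstdint>

static int16_t avg2x2( const int16_t* p, int stride, int x, int y )
{
  // rounded average of a 2x2 pixel quad, mirroring (add, hadd, +2, >>2) above
  return (int16_t)( ( p[ y * stride + x ]         + p[ y * stride + x + 1 ]
                    + p[ ( y + 1 ) * stride + x ] + p[ ( y + 1 ) * stride + x + 1 ] + 2 ) >> 2 );
}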
1837 | | |
1838 | | static uint32_t xCalcHAD16x16_AVX2( const Torg *piOrg, const Tcur *piCur, const int iStrideOrg, const int iStrideCur, const int iBitDepth ) |
1839 | 0 | { |
1840 | 0 | uint32_t sad = 0; |
1841 | |
1842 | 0 | #ifdef USE_AVX2 |
1843 | 0 | const int iLoops = 2; |
1844 | 0 | __m256i m1[2][8], m2[2][8]; |
1845 | |
1846 | 0 | CHECK( iBitDepth > 10, "Only bitdepths up to 10 supported!" ); |
1847 | |
1848 | 0 | for( int l = 0; l < iLoops; l++ ) |
1849 | 0 | { |
1850 | 0 | for( int k = 0; k < 8; k++ ) |
1851 | 0 | { |
1852 | 0 | __m256i r0 = _mm256_loadu_si256( ( __m256i* ) piOrg ); |
1853 | 0 | __m256i r1 = _mm256_loadu_si256( ( __m256i* ) piCur ); |
1854 | 0 | m2[0][k] = _mm256_sub_epi16( r0, r1 ); // 11 bit |
1855 | | //m2[1][k] = _mm256_cvtepi16_epi32( _mm256_extracti128_si256( m2[0][k], 1 ) ); |
1856 | | //m2[0][k] = _mm256_cvtepi16_epi32( _mm256_castsi256_si128( m2[0][k] ) ); |
1857 | 0 | piCur += iStrideCur; |
1858 | 0 | piOrg += iStrideOrg; |
1859 | 0 | } |
1860 | |
1861 | 0 | m1[0][0] = _mm256_add_epi16( m2[0][0], m2[0][4] ); |
1862 | 0 | m1[0][1] = _mm256_add_epi16( m2[0][1], m2[0][5] ); |
1863 | 0 | m1[0][2] = _mm256_add_epi16( m2[0][2], m2[0][6] ); |
1864 | 0 | m1[0][3] = _mm256_add_epi16( m2[0][3], m2[0][7] ); |
1865 | 0 | m1[0][4] = _mm256_sub_epi16( m2[0][0], m2[0][4] ); |
1866 | 0 | m1[0][5] = _mm256_sub_epi16( m2[0][1], m2[0][5] ); |
1867 | 0 | m1[0][6] = _mm256_sub_epi16( m2[0][2], m2[0][6] ); |
1868 | 0 | m1[0][7] = _mm256_sub_epi16( m2[0][3], m2[0][7] ); // 12 bit |
1869 | |
1870 | 0 | m2[0][0] = _mm256_add_epi16( m1[0][0], m1[0][2] ); |
1871 | 0 | m2[0][1] = _mm256_add_epi16( m1[0][1], m1[0][3] ); |
1872 | 0 | m2[0][2] = _mm256_sub_epi16( m1[0][0], m1[0][2] ); |
1873 | 0 | m2[0][3] = _mm256_sub_epi16( m1[0][1], m1[0][3] ); |
1874 | 0 | m2[0][4] = _mm256_add_epi16( m1[0][4], m1[0][6] ); |
1875 | 0 | m2[0][5] = _mm256_add_epi16( m1[0][5], m1[0][7] ); |
1876 | 0 | m2[0][6] = _mm256_sub_epi16( m1[0][4], m1[0][6] ); |
1877 | 0 | m2[0][7] = _mm256_sub_epi16( m1[0][5], m1[0][7] ); // 13 bit |
1878 | |
1879 | 0 | m1[0][0] = _mm256_add_epi16( m2[0][0], m2[0][1] ); |
1880 | 0 | m1[0][1] = _mm256_sub_epi16( m2[0][0], m2[0][1] ); |
1881 | 0 | m1[0][2] = _mm256_add_epi16( m2[0][2], m2[0][3] ); |
1882 | 0 | m1[0][3] = _mm256_sub_epi16( m2[0][2], m2[0][3] ); |
1883 | 0 | m1[0][4] = _mm256_add_epi16( m2[0][4], m2[0][5] ); |
1884 | 0 | m1[0][5] = _mm256_sub_epi16( m2[0][4], m2[0][5] ); |
1885 | 0 | m1[0][6] = _mm256_add_epi16( m2[0][6], m2[0][7] ); |
1886 | 0 | m1[0][7] = _mm256_sub_epi16( m2[0][6], m2[0][7] ); // 14 bit |
1887 | | |
1888 | | // transpose |
1889 | | // 8x8 |
1890 | 0 | m2[0][0] = _mm256_unpacklo_epi16( m1[0][0], m1[0][1] ); |
1891 | 0 | m2[0][1] = _mm256_unpacklo_epi16( m1[0][2], m1[0][3] ); |
1892 | 0 | m2[0][2] = _mm256_unpacklo_epi16( m1[0][4], m1[0][5] ); |
1893 | 0 | m2[0][3] = _mm256_unpacklo_epi16( m1[0][6], m1[0][7] ); |
1894 | 0 | m2[0][4] = _mm256_unpackhi_epi16( m1[0][0], m1[0][1] ); |
1895 | 0 | m2[0][5] = _mm256_unpackhi_epi16( m1[0][2], m1[0][3] ); |
1896 | 0 | m2[0][6] = _mm256_unpackhi_epi16( m1[0][4], m1[0][5] ); |
1897 | 0 | m2[0][7] = _mm256_unpackhi_epi16( m1[0][6], m1[0][7] ); |
1898 | |
1899 | 0 | m1[0][0] = _mm256_unpacklo_epi32( m2[0][0], m2[0][1] ); |
1900 | 0 | m1[0][1] = _mm256_unpackhi_epi32( m2[0][0], m2[0][1] ); |
1901 | 0 | m1[0][2] = _mm256_unpacklo_epi32( m2[0][2], m2[0][3] ); |
1902 | 0 | m1[0][3] = _mm256_unpackhi_epi32( m2[0][2], m2[0][3] ); |
1903 | 0 | m1[0][4] = _mm256_unpacklo_epi32( m2[0][4], m2[0][5] ); |
1904 | 0 | m1[0][5] = _mm256_unpackhi_epi32( m2[0][4], m2[0][5] ); |
1905 | 0 | m1[0][6] = _mm256_unpacklo_epi32( m2[0][6], m2[0][7] ); |
1906 | 0 | m1[0][7] = _mm256_unpackhi_epi32( m2[0][6], m2[0][7] ); |
1907 | |
1908 | 0 | m2[0][0] = _mm256_unpacklo_epi64( m1[0][0], m1[0][2] ); |
1909 | 0 | m2[0][1] = _mm256_unpackhi_epi64( m1[0][0], m1[0][2] ); |
1910 | 0 | m2[0][2] = _mm256_unpacklo_epi64( m1[0][1], m1[0][3] ); |
1911 | 0 | m2[0][3] = _mm256_unpackhi_epi64( m1[0][1], m1[0][3] ); |
1912 | 0 | m2[0][4] = _mm256_unpacklo_epi64( m1[0][4], m1[0][6] ); |
1913 | 0 | m2[0][5] = _mm256_unpackhi_epi64( m1[0][4], m1[0][6] ); |
1914 | 0 | m2[0][6] = _mm256_unpacklo_epi64( m1[0][5], m1[0][7] ); |
1915 | 0 | m2[0][7] = _mm256_unpackhi_epi64( m1[0][5], m1[0][7] ); |
1916 | |
1917 | 0 | __m256i vzero = _mm256_setzero_si256(); |
1918 | 0 | __m256i vtmp; |
1919 | |
1920 | 0 | #define UNPACKX(x) \ |
1921 | 0 | vtmp = _mm256_cmpgt_epi16( vzero, m2[0][x] ); \ |
1922 | 0 | m1[0][x] = _mm256_unpacklo_epi16( m2[0][x], vtmp ); \ |
1923 | 0 | m1[1][x] = _mm256_unpackhi_epi16( m2[0][x], vtmp ); |
1924 | |
1925 | 0 | UNPACKX( 0 ); |
1926 | 0 | UNPACKX( 1 ); |
1927 | 0 | UNPACKX( 2 ); |
1928 | 0 | UNPACKX( 3 ); |
1929 | 0 | UNPACKX( 4 ); |
1930 | 0 | UNPACKX( 5 ); |
1931 | 0 | UNPACKX( 6 ); |
1932 | 0 | UNPACKX( 7 ); |
1933 | |
1934 | 0 | #undef UNPACKX |
1935 | |
1936 | 0 | for( int i = 0; i < 2; i++ ) |
1937 | 0 | { |
1938 | 0 | m2[i][0] = _mm256_add_epi32( m1[i][0], m1[i][4] ); |
1939 | 0 | m2[i][1] = _mm256_add_epi32( m1[i][1], m1[i][5] ); |
1940 | 0 | m2[i][2] = _mm256_add_epi32( m1[i][2], m1[i][6] ); |
1941 | 0 | m2[i][3] = _mm256_add_epi32( m1[i][3], m1[i][7] ); |
1942 | 0 | m2[i][4] = _mm256_sub_epi32( m1[i][0], m1[i][4] ); |
1943 | 0 | m2[i][5] = _mm256_sub_epi32( m1[i][1], m1[i][5] ); |
1944 | 0 | m2[i][6] = _mm256_sub_epi32( m1[i][2], m1[i][6] ); |
1945 | 0 | m2[i][7] = _mm256_sub_epi32( m1[i][3], m1[i][7] ); |
1946 | |
1947 | 0 | m1[i][0] = _mm256_add_epi32( m2[i][0], m2[i][2] ); |
1948 | 0 | m1[i][1] = _mm256_add_epi32( m2[i][1], m2[i][3] ); |
1949 | 0 | m1[i][2] = _mm256_sub_epi32( m2[i][0], m2[i][2] ); |
1950 | 0 | m1[i][3] = _mm256_sub_epi32( m2[i][1], m2[i][3] ); |
1951 | 0 | m1[i][4] = _mm256_add_epi32( m2[i][4], m2[i][6] ); |
1952 | 0 | m1[i][5] = _mm256_add_epi32( m2[i][5], m2[i][7] ); |
1953 | 0 | m1[i][6] = _mm256_sub_epi32( m2[i][4], m2[i][6] ); |
1954 | 0 | m1[i][7] = _mm256_sub_epi32( m2[i][5], m2[i][7] ); |
1955 | |
1956 | 0 | m2[i][0] = _mm256_abs_epi32( _mm256_add_epi32( m1[i][0], m1[i][1] ) ); |
1957 | 0 | m2[i][1] = _mm256_abs_epi32( _mm256_sub_epi32( m1[i][0], m1[i][1] ) ); |
1958 | 0 | m2[i][2] = _mm256_abs_epi32( _mm256_add_epi32( m1[i][2], m1[i][3] ) ); |
1959 | 0 | m2[i][3] = _mm256_abs_epi32( _mm256_sub_epi32( m1[i][2], m1[i][3] ) ); |
1960 | 0 | m2[i][4] = _mm256_abs_epi32( _mm256_add_epi32( m1[i][4], m1[i][5] ) ); |
1961 | 0 | m2[i][5] = _mm256_abs_epi32( _mm256_sub_epi32( m1[i][4], m1[i][5] ) ); |
1962 | 0 | m2[i][6] = _mm256_abs_epi32( _mm256_add_epi32( m1[i][6], m1[i][7] ) ); |
1963 | 0 | m2[i][7] = _mm256_abs_epi32( _mm256_sub_epi32( m1[i][6], m1[i][7] ) ); |
1964 | 0 | } |
1965 | |
1966 | 0 | uint32_t absDc0 = _mm_cvtsi128_si32( _mm256_castsi256_si128( m2[0][0] ) ); |
1967 | 0 | uint32_t absDc1 = _mm_cvtsi128_si32( _mm256_castsi256_si128( _mm256_permute2x128_si256( m2[0][0], m2[0][0], 0x11 ) ) ); |
1968 | |
1969 | 0 | for( int i = 0; i < 8; i++ ) |
1970 | 0 | { |
1971 | 0 | m1[0][i] = _mm256_add_epi32( m2[0][i], m2[1][i] ); |
1972 | 0 | } |
1973 | |
1974 | 0 | m1[0][0] = _mm256_add_epi32( m1[0][0], m1[0][1] ); |
1975 | 0 | m1[0][2] = _mm256_add_epi32( m1[0][2], m1[0][3] ); |
1976 | 0 | m1[0][4] = _mm256_add_epi32( m1[0][4], m1[0][5] ); |
1977 | 0 | m1[0][6] = _mm256_add_epi32( m1[0][6], m1[0][7] ); |
1978 | |
1979 | 0 | m1[0][0] = _mm256_add_epi32( m1[0][0], m1[0][2] ); |
1980 | 0 | m1[0][4] = _mm256_add_epi32( m1[0][4], m1[0][6] ); |
1981 | |
1982 | 0 | __m256i iSum = _mm256_add_epi32( m1[0][0], m1[0][4] ); |
1983 | |
1984 | 0 | iSum = _mm256_hadd_epi32( iSum, iSum ); |
1985 | 0 | iSum = _mm256_hadd_epi32( iSum, iSum ); |
1986 | |
1987 | 0 | uint32_t tmp; |
1988 | 0 | tmp = _mm_cvtsi128_si32( _mm256_castsi256_si128( iSum ) ); |
1989 | | // 16x16 block is done by adding together 4 8x8 SATDs |
1990 | 0 | tmp -= absDc0; |
1991 | 0 | tmp += absDc0 >> 2; |
1992 | 0 | tmp = ( ( tmp + 2 ) >> 2 ); |
1993 | 0 | sad += tmp; |
1994 | |
1995 | 0 | tmp = _mm_cvtsi128_si32( _mm256_castsi256_si128( _mm256_permute2x128_si256( iSum, iSum, 0x11 ) ) ); |
1996 | | // 16x16 block is done by adding together 4 8x8 SATDs |
1997 | 0 | tmp -= absDc1; |
1998 | 0 | tmp += absDc1 >> 2; |
1999 | 0 | tmp = ( ( tmp + 2 ) >> 2 ); |
2000 | 0 | sad += tmp; |
2001 | 0 | } |
2002 | |
2003 | 0 | #endif |
2004 | 0 | return ( sad ); |
2005 | 0 | }
Unexecuted instantiation: RdCost_sse41.cpp:vvenc::xCalcHAD16x16_AVX2(short const*, short const*, int, int, int)
Unexecuted instantiation: RdCost_avx2.cpp:vvenc::xCalcHAD16x16_AVX2(short const*, short const*, int, int, int)
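// Each __m256i above holds one transform row for two 8x8 sub-blocks side by side
// (left tile in the low 128-bit lane, right tile in the high lane), so iSum carries
// two independent 8x8 SATDs that are read out separately; the permute2x128 with
// selector 0x11 used above is equivalent to extracting the high lane. A minimal
// sketch of that readout (illustrative helper, assumes AVX2):

#include <immintrin.h>
#include <cstdint>

static void splitLaneSums( __m256i iSum, uint32_t& lo, uint32_t& hi )
{
  lo = (uint32_t)_mm_cvtsi128_si32( _mm256_castsi256_si128( iSum ) );      // low 128-bit lane
  hi = (uint32_t)_mm_cvtsi128_si32( _mm256_extracti128_si256( iSum, 1 ) ); // high 128-bit lane
}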
2006 | | |
2007 | | static uint32_t xCalcHAD16x8_AVX2( const Torg *piOrg, const Tcur *piCur, const int iStrideOrg, const int iStrideCur, const int iBitDepth ) |
2008 | 0 | { |
2009 | 0 | uint32_t sad = 0; |
2010 | |
2011 | 0 | #ifdef USE_AVX2 |
2012 | 0 | __m256i m1[16], m2[16]; |
2013 | |
2014 | 0 | CHECK( iBitDepth > 10, "Only bitdepths up to 10 supported!" ); |
2015 | |
2016 | 0 | { |
2017 | 0 | for( int k = 0; k < 8; k++ ) |
2018 | 0 | { |
2019 | 0 | __m256i r0 = _mm256_lddqu_si256( (__m256i*)piOrg ); |
2020 | 0 | __m256i r1 = _mm256_lddqu_si256( (__m256i*)piCur ); |
2021 | 0 | m1[k] = _mm256_sub_epi16( r0, r1 ); // 11 bit |
2022 | | //m1[k+8] = _mm256_cvtepi16_epi32( _mm256_extracti128_si256( m1[k], 1 ) ); |
2023 | | //m1[k] = _mm256_cvtepi16_epi32( _mm256_castsi256_si128 ( m1[k] ) ); |
2024 | 0 | piCur += iStrideCur; |
2025 | 0 | piOrg += iStrideOrg; |
2026 | 0 | } |
2027 | | |
2028 | | // vertical, first 8x8 |
2029 | | #if 0 |
2030 | | m2[0] = _mm256_add_epi32( m1[0], m1[4] ); |
2031 | | m2[1] = _mm256_add_epi32( m1[1], m1[5] ); |
2032 | | m2[2] = _mm256_add_epi32( m1[2], m1[6] ); |
2033 | | m2[3] = _mm256_add_epi32( m1[3], m1[7] ); |
2034 | | m2[4] = _mm256_sub_epi32( m1[0], m1[4] ); |
2035 | | m2[5] = _mm256_sub_epi32( m1[1], m1[5] ); |
2036 | | m2[6] = _mm256_sub_epi32( m1[2], m1[6] ); |
2037 | | m2[7] = _mm256_sub_epi32( m1[3], m1[7] ); |
2038 | | |
2039 | | m1[0] = _mm256_add_epi32( m2[0], m2[2] ); |
2040 | | m1[1] = _mm256_add_epi32( m2[1], m2[3] ); |
2041 | | m1[2] = _mm256_sub_epi32( m2[0], m2[2] ); |
2042 | | m1[3] = _mm256_sub_epi32( m2[1], m2[3] ); |
2043 | | m1[4] = _mm256_add_epi32( m2[4], m2[6] ); |
2044 | | m1[5] = _mm256_add_epi32( m2[5], m2[7] ); |
2045 | | m1[6] = _mm256_sub_epi32( m2[4], m2[6] ); |
2046 | | m1[7] = _mm256_sub_epi32( m2[5], m2[7] ); |
2047 | | |
2048 | | m2[0] = _mm256_add_epi32( m1[0], m1[1] ); |
2049 | | m2[1] = _mm256_sub_epi32( m1[0], m1[1] ); |
2050 | | m2[2] = _mm256_add_epi32( m1[2], m1[3] ); |
2051 | | m2[3] = _mm256_sub_epi32( m1[2], m1[3] ); |
2052 | | m2[4] = _mm256_add_epi32( m1[4], m1[5] ); |
2053 | | m2[5] = _mm256_sub_epi32( m1[4], m1[5] ); |
2054 | | m2[6] = _mm256_add_epi32( m1[6], m1[7] ); |
2055 | | m2[7] = _mm256_sub_epi32( m1[6], m1[7] ); |
2056 | | |
2057 | | // vertical, second 8x8 |
2058 | | m2[8+0] = _mm256_add_epi32( m1[8+0], m1[8+4] ); |
2059 | | m2[8+1] = _mm256_add_epi32( m1[8+1], m1[8+5] ); |
2060 | | m2[8+2] = _mm256_add_epi32( m1[8+2], m1[8+6] ); |
2061 | | m2[8+3] = _mm256_add_epi32( m1[8+3], m1[8+7] ); |
2062 | | m2[8+4] = _mm256_sub_epi32( m1[8+0], m1[8+4] ); |
2063 | | m2[8+5] = _mm256_sub_epi32( m1[8+1], m1[8+5] ); |
2064 | | m2[8+6] = _mm256_sub_epi32( m1[8+2], m1[8+6] ); |
2065 | | m2[8+7] = _mm256_sub_epi32( m1[8+3], m1[8+7] ); |
2066 | | |
2067 | | m1[8+0] = _mm256_add_epi32( m2[8+0], m2[8+2] ); |
2068 | | m1[8+1] = _mm256_add_epi32( m2[8+1], m2[8+3] ); |
2069 | | m1[8+2] = _mm256_sub_epi32( m2[8+0], m2[8+2] ); |
2070 | | m1[8+3] = _mm256_sub_epi32( m2[8+1], m2[8+3] ); |
2071 | | m1[8+4] = _mm256_add_epi32( m2[8+4], m2[8+6] ); |
2072 | | m1[8+5] = _mm256_add_epi32( m2[8+5], m2[8+7] ); |
2073 | | m1[8+6] = _mm256_sub_epi32( m2[8+4], m2[8+6] ); |
2074 | | m1[8+7] = _mm256_sub_epi32( m2[8+5], m2[8+7] ); |
2075 | | |
2076 | | m2[8+0] = _mm256_add_epi32( m1[8+0], m1[8+1] ); |
2077 | | m2[8+1] = _mm256_sub_epi32( m1[8+0], m1[8+1] ); |
2078 | | m2[8+2] = _mm256_add_epi32( m1[8+2], m1[8+3] ); |
2079 | | m2[8+3] = _mm256_sub_epi32( m1[8+2], m1[8+3] ); |
2080 | | m2[8+4] = _mm256_add_epi32( m1[8+4], m1[8+5] ); |
2081 | | m2[8+5] = _mm256_sub_epi32( m1[8+4], m1[8+5] ); |
2082 | | m2[8+6] = _mm256_add_epi32( m1[8+6], m1[8+7] ); |
2083 | | m2[8+7] = _mm256_sub_epi32( m1[8+6], m1[8+7] ); |
2084 | | |
2085 | | // transpose |
2086 | | constexpr int perm_unpacklo_epi128 = ( 0 << 0 ) + ( 2 << 4 ); |
2087 | | constexpr int perm_unpackhi_epi128 = ( 1 << 0 ) + ( 3 << 4 ); |
2088 | | |
2089 | | m1[0] = _mm256_unpacklo_epi32( m2[0], m2[1] ); |
2090 | | m1[1] = _mm256_unpacklo_epi32( m2[2], m2[3] ); |
2091 | | m1[2] = _mm256_unpacklo_epi32( m2[4], m2[5] ); |
2092 | | m1[3] = _mm256_unpacklo_epi32( m2[6], m2[7] ); |
2093 | | m1[4] = _mm256_unpackhi_epi32( m2[0], m2[1] ); |
2094 | | m1[5] = _mm256_unpackhi_epi32( m2[2], m2[3] ); |
2095 | | m1[6] = _mm256_unpackhi_epi32( m2[4], m2[5] ); |
2096 | | m1[7] = _mm256_unpackhi_epi32( m2[6], m2[7] ); |
2097 | | |
2098 | | m2[0] = _mm256_unpacklo_epi64( m1[0], m1[1] ); |
2099 | | m2[1] = _mm256_unpackhi_epi64( m1[0], m1[1] ); |
2100 | | m2[2] = _mm256_unpacklo_epi64( m1[2], m1[3] ); |
2101 | | m2[3] = _mm256_unpackhi_epi64( m1[2], m1[3] ); |
2102 | | m2[4] = _mm256_unpacklo_epi64( m1[4], m1[5] ); |
2103 | | m2[5] = _mm256_unpackhi_epi64( m1[4], m1[5] ); |
2104 | | m2[6] = _mm256_unpacklo_epi64( m1[6], m1[7] ); |
2105 | | m2[7] = _mm256_unpackhi_epi64( m1[6], m1[7] ); |
2106 | | |
2107 | | m1[0] = _mm256_permute2x128_si256( m2[0], m2[2], perm_unpacklo_epi128 ); |
2108 | | m1[1] = _mm256_permute2x128_si256( m2[0], m2[2], perm_unpackhi_epi128 ); |
2109 | | m1[2] = _mm256_permute2x128_si256( m2[1], m2[3], perm_unpacklo_epi128 ); |
2110 | | m1[3] = _mm256_permute2x128_si256( m2[1], m2[3], perm_unpackhi_epi128 ); |
2111 | | m1[4] = _mm256_permute2x128_si256( m2[4], m2[6], perm_unpacklo_epi128 ); |
2112 | | m1[5] = _mm256_permute2x128_si256( m2[4], m2[6], perm_unpackhi_epi128 ); |
2113 | | m1[6] = _mm256_permute2x128_si256( m2[5], m2[7], perm_unpacklo_epi128 ); |
2114 | | m1[7] = _mm256_permute2x128_si256( m2[5], m2[7], perm_unpackhi_epi128 ); |
2115 | | |
2116 | | m1[8+0] = _mm256_unpacklo_epi32( m2[8+0], m2[8+1] ); |
2117 | | m1[8+1] = _mm256_unpacklo_epi32( m2[8+2], m2[8+3] ); |
2118 | | m1[8+2] = _mm256_unpacklo_epi32( m2[8+4], m2[8+5] ); |
2119 | | m1[8+3] = _mm256_unpacklo_epi32( m2[8+6], m2[8+7] ); |
2120 | | m1[8+4] = _mm256_unpackhi_epi32( m2[8+0], m2[8+1] ); |
2121 | | m1[8+5] = _mm256_unpackhi_epi32( m2[8+2], m2[8+3] ); |
2122 | | m1[8+6] = _mm256_unpackhi_epi32( m2[8+4], m2[8+5] ); |
2123 | | m1[8+7] = _mm256_unpackhi_epi32( m2[8+6], m2[8+7] ); |
2124 | | |
2125 | | m2[8+0] = _mm256_unpacklo_epi64( m1[8+0], m1[8+1] ); |
2126 | | m2[8+1] = _mm256_unpackhi_epi64( m1[8+0], m1[8+1] ); |
2127 | | m2[8+2] = _mm256_unpacklo_epi64( m1[8+2], m1[8+3] ); |
2128 | | m2[8+3] = _mm256_unpackhi_epi64( m1[8+2], m1[8+3] ); |
2129 | | m2[8+4] = _mm256_unpacklo_epi64( m1[8+4], m1[8+5] ); |
2130 | | m2[8+5] = _mm256_unpackhi_epi64( m1[8+4], m1[8+5] ); |
2131 | | m2[8+6] = _mm256_unpacklo_epi64( m1[8+6], m1[8+7] ); |
2132 | | m2[8+7] = _mm256_unpackhi_epi64( m1[8+6], m1[8+7] ); |
2133 | | |
2134 | | m1[8+0] = _mm256_permute2x128_si256( m2[8+0], m2[8+2], perm_unpacklo_epi128 ); |
2135 | | m1[8+1] = _mm256_permute2x128_si256( m2[8+0], m2[8+2], perm_unpackhi_epi128 ); |
2136 | | m1[8+2] = _mm256_permute2x128_si256( m2[8+1], m2[8+3], perm_unpacklo_epi128 ); |
2137 | | m1[8+3] = _mm256_permute2x128_si256( m2[8+1], m2[8+3], perm_unpackhi_epi128 ); |
2138 | | m1[8+4] = _mm256_permute2x128_si256( m2[8+4], m2[8+6], perm_unpacklo_epi128 ); |
2139 | | m1[8+5] = _mm256_permute2x128_si256( m2[8+4], m2[8+6], perm_unpackhi_epi128 ); |
2140 | | m1[8+6] = _mm256_permute2x128_si256( m2[8+5], m2[8+7], perm_unpacklo_epi128 ); |
2141 | | m1[8+7] = _mm256_permute2x128_si256( m2[8+5], m2[8+7], perm_unpackhi_epi128 ); |
2142 | | #else |
2143 | 0 | m2[0] = _mm256_add_epi16( m1[0], m1[4] ); |
2144 | 0 | m2[1] = _mm256_add_epi16( m1[1], m1[5] ); |
2145 | 0 | m2[2] = _mm256_add_epi16( m1[2], m1[6] ); |
2146 | 0 | m2[3] = _mm256_add_epi16( m1[3], m1[7] ); |
2147 | 0 | m2[4] = _mm256_sub_epi16( m1[0], m1[4] ); |
2148 | 0 | m2[5] = _mm256_sub_epi16( m1[1], m1[5] ); |
2149 | 0 | m2[6] = _mm256_sub_epi16( m1[2], m1[6] ); |
2150 | 0 | m2[7] = _mm256_sub_epi16( m1[3], m1[7] ); // 12 bit |
2151 | |
2152 | 0 | m1[0] = _mm256_add_epi16( m2[0], m2[2] ); |
2153 | 0 | m1[1] = _mm256_add_epi16( m2[1], m2[3] ); |
2154 | 0 | m1[2] = _mm256_sub_epi16( m2[0], m2[2] ); |
2155 | 0 | m1[3] = _mm256_sub_epi16( m2[1], m2[3] ); |
2156 | 0 | m1[4] = _mm256_add_epi16( m2[4], m2[6] ); |
2157 | 0 | m1[5] = _mm256_add_epi16( m2[5], m2[7] ); |
2158 | 0 | m1[6] = _mm256_sub_epi16( m2[4], m2[6] ); |
2159 | 0 | m1[7] = _mm256_sub_epi16( m2[5], m2[7] ); // 13 bit |
2160 | |
2161 | 0 | m2[0] = _mm256_add_epi16( m1[0], m1[1] ); |
2162 | 0 | m2[1] = _mm256_sub_epi16( m1[0], m1[1] ); |
2163 | 0 | m2[2] = _mm256_add_epi16( m1[2], m1[3] ); |
2164 | 0 | m2[3] = _mm256_sub_epi16( m1[2], m1[3] ); |
2165 | 0 | m2[4] = _mm256_add_epi16( m1[4], m1[5] ); |
2166 | 0 | m2[5] = _mm256_sub_epi16( m1[4], m1[5] ); |
2167 | 0 | m2[6] = _mm256_add_epi16( m1[6], m1[7] ); |
2168 | 0 | m2[7] = _mm256_sub_epi16( m1[6], m1[7] ); // 14 bit |
2169 | |
2170 | 0 | m1[0] = _mm256_unpacklo_epi16( m2[0], m2[1] ); |
2171 | 0 | m1[1] = _mm256_unpacklo_epi16( m2[2], m2[3] ); |
2172 | 0 | m1[2] = _mm256_unpacklo_epi16( m2[4], m2[5] ); |
2173 | 0 | m1[3] = _mm256_unpacklo_epi16( m2[6], m2[7] ); |
2174 | 0 | m1[4] = _mm256_unpackhi_epi16( m2[0], m2[1] ); |
2175 | 0 | m1[5] = _mm256_unpackhi_epi16( m2[2], m2[3] ); |
2176 | 0 | m1[6] = _mm256_unpackhi_epi16( m2[4], m2[5] ); |
2177 | 0 | m1[7] = _mm256_unpackhi_epi16( m2[6], m2[7] ); |
2178 | |
2179 | 0 | m2[0] = _mm256_unpacklo_epi32( m1[0], m1[1] ); |
2180 | 0 | m2[1] = _mm256_unpackhi_epi32( m1[0], m1[1] ); |
2181 | 0 | m2[2] = _mm256_unpacklo_epi32( m1[2], m1[3] ); |
2182 | 0 | m2[3] = _mm256_unpackhi_epi32( m1[2], m1[3] ); |
2183 | 0 | m2[4] = _mm256_unpacklo_epi32( m1[4], m1[5] ); |
2184 | 0 | m2[5] = _mm256_unpackhi_epi32( m1[4], m1[5] ); |
2185 | 0 | m2[6] = _mm256_unpacklo_epi32( m1[6], m1[7] ); |
2186 | 0 | m2[7] = _mm256_unpackhi_epi32( m1[6], m1[7] ); |
2187 | |
2188 | 0 | m1[0] = _mm256_unpacklo_epi64( m2[0], m2[2] ); |
2189 | 0 | m1[1] = _mm256_unpackhi_epi64( m2[0], m2[2] ); |
2190 | 0 | m1[2] = _mm256_unpacklo_epi64( m2[1], m2[3] ); |
2191 | 0 | m1[3] = _mm256_unpackhi_epi64( m2[1], m2[3] ); |
2192 | 0 | m1[4] = _mm256_unpacklo_epi64( m2[4], m2[6] ); |
2193 | 0 | m1[5] = _mm256_unpackhi_epi64( m2[4], m2[6] ); |
2194 | 0 | m1[6] = _mm256_unpacklo_epi64( m2[5], m2[7] ); |
2195 | 0 | m1[7] = _mm256_unpackhi_epi64( m2[5], m2[7] ); |
2196 | | |
2197 | 0 | for( int k = 0; k < 8; k++ ) |
2198 | 0 | { |
2199 | 0 | m1[k+8] = _mm256_cvtepi16_epi32( _mm256_extracti128_si256( m1[k], 1 ) ); |
2200 | 0 | m1[k] = _mm256_cvtepi16_epi32( _mm256_castsi256_si128 ( m1[k] ) ); |
2201 | 0 | } |
2202 | 0 | #endif |
2203 | | |
2204 | | // horizontal |
2205 | 0 | { |
2206 | 0 | m2[ 0] = _mm256_add_epi32( m1[0], m1[ 8] ); |
2207 | 0 | m2[ 1] = _mm256_add_epi32( m1[1], m1[ 9] ); |
2208 | 0 | m2[ 2] = _mm256_add_epi32( m1[2], m1[10] ); |
2209 | 0 | m2[ 3] = _mm256_add_epi32( m1[3], m1[11] ); |
2210 | 0 | m2[ 4] = _mm256_add_epi32( m1[4], m1[12] ); |
2211 | 0 | m2[ 5] = _mm256_add_epi32( m1[5], m1[13] ); |
2212 | 0 | m2[ 6] = _mm256_add_epi32( m1[6], m1[14] ); |
2213 | 0 | m2[ 7] = _mm256_add_epi32( m1[7], m1[15] ); |
2214 | 0 | m2[ 8] = _mm256_sub_epi32( m1[0], m1[ 8] ); |
2215 | 0 | m2[ 9] = _mm256_sub_epi32( m1[1], m1[ 9] ); |
2216 | 0 | m2[10] = _mm256_sub_epi32( m1[2], m1[10] ); |
2217 | 0 | m2[11] = _mm256_sub_epi32( m1[3], m1[11] ); |
2218 | 0 | m2[12] = _mm256_sub_epi32( m1[4], m1[12] ); |
2219 | 0 | m2[13] = _mm256_sub_epi32( m1[5], m1[13] ); |
2220 | 0 | m2[14] = _mm256_sub_epi32( m1[6], m1[14] ); |
2221 | 0 | m2[15] = _mm256_sub_epi32( m1[7], m1[15] ); |
2222 | |
2223 | 0 | m1[ 0] = _mm256_add_epi32( m2[ 0], m2[ 4] ); |
2224 | 0 | m1[ 1] = _mm256_add_epi32( m2[ 1], m2[ 5] ); |
2225 | 0 | m1[ 2] = _mm256_add_epi32( m2[ 2], m2[ 6] ); |
2226 | 0 | m1[ 3] = _mm256_add_epi32( m2[ 3], m2[ 7] ); |
2227 | 0 | m1[ 4] = _mm256_sub_epi32( m2[ 0], m2[ 4] ); |
2228 | 0 | m1[ 5] = _mm256_sub_epi32( m2[ 1], m2[ 5] ); |
2229 | 0 | m1[ 6] = _mm256_sub_epi32( m2[ 2], m2[ 6] ); |
2230 | 0 | m1[ 7] = _mm256_sub_epi32( m2[ 3], m2[ 7] ); |
2231 | 0 | m1[ 8] = _mm256_add_epi32( m2[ 8], m2[12] ); |
2232 | 0 | m1[ 9] = _mm256_add_epi32( m2[ 9], m2[13] ); |
2233 | 0 | m1[10] = _mm256_add_epi32( m2[10], m2[14] ); |
2234 | 0 | m1[11] = _mm256_add_epi32( m2[11], m2[15] ); |
2235 | 0 | m1[12] = _mm256_sub_epi32( m2[ 8], m2[12] ); |
2236 | 0 | m1[13] = _mm256_sub_epi32( m2[ 9], m2[13] ); |
2237 | 0 | m1[14] = _mm256_sub_epi32( m2[10], m2[14] ); |
2238 | 0 | m1[15] = _mm256_sub_epi32( m2[11], m2[15] ); |
2239 | |
2240 | 0 | m2[ 0] = _mm256_add_epi32( m1[ 0], m1[ 2] ); |
2241 | 0 | m2[ 1] = _mm256_add_epi32( m1[ 1], m1[ 3] ); |
2242 | 0 | m2[ 2] = _mm256_sub_epi32( m1[ 0], m1[ 2] ); |
2243 | 0 | m2[ 3] = _mm256_sub_epi32( m1[ 1], m1[ 3] ); |
2244 | 0 | m2[ 4] = _mm256_add_epi32( m1[ 4], m1[ 6] ); |
2245 | 0 | m2[ 5] = _mm256_add_epi32( m1[ 5], m1[ 7] ); |
2246 | 0 | m2[ 6] = _mm256_sub_epi32( m1[ 4], m1[ 6] ); |
2247 | 0 | m2[ 7] = _mm256_sub_epi32( m1[ 5], m1[ 7] ); |
2248 | 0 | m2[ 8] = _mm256_add_epi32( m1[ 8], m1[10] ); |
2249 | 0 | m2[ 9] = _mm256_add_epi32( m1[ 9], m1[11] ); |
2250 | 0 | m2[10] = _mm256_sub_epi32( m1[ 8], m1[10] ); |
2251 | 0 | m2[11] = _mm256_sub_epi32( m1[ 9], m1[11] ); |
2252 | 0 | m2[12] = _mm256_add_epi32( m1[12], m1[14] ); |
2253 | 0 | m2[13] = _mm256_add_epi32( m1[13], m1[15] ); |
2254 | 0 | m2[14] = _mm256_sub_epi32( m1[12], m1[14] ); |
2255 | 0 | m2[15] = _mm256_sub_epi32( m1[13], m1[15] ); |
2256 | |
2257 | 0 | m1[ 0] = _mm256_abs_epi32( _mm256_add_epi32( m2[ 0], m2[ 1] ) ); |
2258 | 0 | m1[ 1] = _mm256_abs_epi32( _mm256_sub_epi32( m2[ 0], m2[ 1] ) ); |
2259 | 0 | m1[ 2] = _mm256_abs_epi32( _mm256_add_epi32( m2[ 2], m2[ 3] ) ); |
2260 | 0 | m1[ 3] = _mm256_abs_epi32( _mm256_sub_epi32( m2[ 2], m2[ 3] ) ); |
2261 | 0 | m1[ 4] = _mm256_abs_epi32( _mm256_add_epi32( m2[ 4], m2[ 5] ) ); |
2262 | 0 | m1[ 5] = _mm256_abs_epi32( _mm256_sub_epi32( m2[ 4], m2[ 5] ) ); |
2263 | 0 | m1[ 6] = _mm256_abs_epi32( _mm256_add_epi32( m2[ 6], m2[ 7] ) ); |
2264 | 0 | m1[ 7] = _mm256_abs_epi32( _mm256_sub_epi32( m2[ 6], m2[ 7] ) ); |
2265 | 0 | m1[ 8] = _mm256_abs_epi32( _mm256_add_epi32( m2[ 8], m2[ 9] ) ); |
2266 | 0 | m1[ 9] = _mm256_abs_epi32( _mm256_sub_epi32( m2[ 8], m2[ 9] ) ); |
2267 | 0 | m1[10] = _mm256_abs_epi32( _mm256_add_epi32( m2[10], m2[11] ) ); |
2268 | 0 | m1[11] = _mm256_abs_epi32( _mm256_sub_epi32( m2[10], m2[11] ) ); |
2269 | 0 | m1[12] = _mm256_abs_epi32( _mm256_add_epi32( m2[12], m2[13] ) ); |
2270 | 0 | m1[13] = _mm256_abs_epi32( _mm256_sub_epi32( m2[12], m2[13] ) ); |
2271 | 0 | m1[14] = _mm256_abs_epi32( _mm256_add_epi32( m2[14], m2[15] ) ); |
2272 | 0 | m1[15] = _mm256_abs_epi32( _mm256_sub_epi32( m2[14], m2[15] ) ); |
2273 | 0 | } |
2274 | |
2275 | 0 | uint32_t absDc = _mm_cvtsi128_si32( _mm256_castsi256_si128( m1[0] ) ); |
2276 | | |
2277 | | // sum up |
2278 | 0 | m1[ 0] = _mm256_add_epi32( m1[ 0], m1[ 1] ); |
2279 | 0 | m1[ 2] = _mm256_add_epi32( m1[ 2], m1[ 3] ); |
2280 | 0 | m1[ 4] = _mm256_add_epi32( m1[ 4], m1[ 5] ); |
2281 | 0 | m1[ 6] = _mm256_add_epi32( m1[ 6], m1[ 7] ); |
2282 | 0 | m1[ 8] = _mm256_add_epi32( m1[ 8], m1[ 9] ); |
2283 | 0 | m1[10] = _mm256_add_epi32( m1[10], m1[11] ); |
2284 | 0 | m1[12] = _mm256_add_epi32( m1[12], m1[13] ); |
2285 | 0 | m1[14] = _mm256_add_epi32( m1[14], m1[15] ); |
2286 | |
2287 | 0 | m1[ 0] = _mm256_add_epi32( m1[ 0], m1[ 2] ); |
2288 | 0 | m1[ 4] = _mm256_add_epi32( m1[ 4], m1[ 6] ); |
2289 | 0 | m1[ 8] = _mm256_add_epi32( m1[ 8], m1[10] ); |
2290 | 0 | m1[12] = _mm256_add_epi32( m1[12], m1[14] ); |
2291 | |
2292 | 0 | m1[0] = _mm256_add_epi32( m1[0], m1[ 4] ); |
2293 | 0 | m1[8] = _mm256_add_epi32( m1[8], m1[12] ); |
2294 | |
2295 | 0 | __m256i iSum = _mm256_add_epi32( m1[0], m1[8] ); |
2296 | 0 | iSum = _mm256_hadd_epi32( iSum, iSum ); |
2297 | 0 | iSum = _mm256_hadd_epi32( iSum, iSum ); |
2298 | 0 | iSum = _mm256_add_epi32( iSum, _mm256_permute2x128_si256( iSum, iSum, 0x11 ) ); |
2299 | |
2300 | 0 | sad = _mm_cvtsi128_si32( _mm256_castsi256_si128( iSum ) ); |
2301 | | |
2302 | 0 | sad -= absDc; |
2303 | 0 | sad += absDc >> 2; |
2304 | 0 | sad = (uint32_t)(sad / sqrt(16.0 * 8) * 2); |
2305 | 0 | } |
2306 | |
2307 | 0 | #endif //USE_AVX2 |
2308 | |
2309 | 0 | return (sad); |
2310 | 0 | }
Unexecuted instantiation: RdCost_sse41.cpp:vvenc::xCalcHAD16x8_AVX2(short const*, short const*, int, int, int)
Unexecuted instantiation: RdCost_avx2.cpp:vvenc::xCalcHAD16x8_AVX2(short const*, short const*, int, int, int)
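The final three statements of the function (source lines 2302-2304) re-weight the DC term and normalise the 16x8 Hadamard sum. A scalar restatement of that epilogue (a sketch; satd stands for the horizontally reduced coefficient sum, absDc for the absolute DC value extracted above, and the helper name is hypothetical):

#include <cmath>
#include <cstdint>

// Sketch of the 16x8 HAD epilogue: keep the DC coefficient at quarter
// weight and rescale the absolute-coefficient sum by 2 / sqrt(16 * 8).
static uint32_t had16x8Epilogue( uint32_t satd, uint32_t absDc )
{
  satd -= absDc;       // drop the full-weight DC contribution
  satd += absDc >> 2;  // re-add it at quarter weight
  return (uint32_t)( satd / std::sqrt( 16.0 * 8 ) * 2 );
}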
2311 | | |
2312 | | static uint32_t xCalcHAD8x16_AVX2( const Pel* piOrg, const Pel* piCur, const int iStrideOrg, const int iStrideCur, const int iBitDepth ) |
2313 | 0 | { |
2314 | 0 | uint32_t sad = 0; |
2315 | |
2316 | 0 | #ifdef USE_AVX2 |
2317 | 0 | __m256i m1[16], m2[16]; |
2318 | |
2319 | 0 | { |
2320 | 0 | { |
2321 | 0 | for( int k = 0; k < 16; k++ ) |
2322 | 0 | { |
2323 | 0 | __m256i r0 = _mm256_cvtepi16_epi32( _mm_loadu_si128( (__m128i*)piOrg ) ); |
2324 | 0 | __m256i r1 = _mm256_cvtepi16_epi32( _mm_loadu_si128( (__m128i*)piCur ) ); |
2325 | 0 | m1[k] = _mm256_sub_epi32( r0, r1 ); |
2326 | 0 | piCur += iStrideCur; |
2327 | 0 | piOrg += iStrideOrg; |
2328 | 0 | } |
2329 | 0 | } |
2330 | | |
2331 | | // vertical |
2332 | |
2333 | 0 | m2[ 0] = _mm256_add_epi32( m1[0], m1[ 8] ); |
2334 | 0 | m2[ 1] = _mm256_add_epi32( m1[1], m1[ 9] ); |
2335 | 0 | m2[ 2] = _mm256_add_epi32( m1[2], m1[10] ); |
2336 | 0 | m2[ 3] = _mm256_add_epi32( m1[3], m1[11] ); |
2337 | 0 | m2[ 4] = _mm256_add_epi32( m1[4], m1[12] ); |
2338 | 0 | m2[ 5] = _mm256_add_epi32( m1[5], m1[13] ); |
2339 | 0 | m2[ 6] = _mm256_add_epi32( m1[6], m1[14] ); |
2340 | 0 | m2[ 7] = _mm256_add_epi32( m1[7], m1[15] ); |
2341 | 0 | m2[ 8] = _mm256_sub_epi32( m1[0], m1[ 8] ); |
2342 | 0 | m2[ 9] = _mm256_sub_epi32( m1[1], m1[ 9] ); |
2343 | 0 | m2[10] = _mm256_sub_epi32( m1[2], m1[10] ); |
2344 | 0 | m2[11] = _mm256_sub_epi32( m1[3], m1[11] ); |
2345 | 0 | m2[12] = _mm256_sub_epi32( m1[4], m1[12] ); |
2346 | 0 | m2[13] = _mm256_sub_epi32( m1[5], m1[13] ); |
2347 | 0 | m2[14] = _mm256_sub_epi32( m1[6], m1[14] ); |
2348 | 0 | m2[15] = _mm256_sub_epi32( m1[7], m1[15] ); |
2349 | |
2350 | 0 | m1[ 0] = _mm256_add_epi32( m2[ 0], m2[ 4] ); |
2351 | 0 | m1[ 1] = _mm256_add_epi32( m2[ 1], m2[ 5] ); |
2352 | 0 | m1[ 2] = _mm256_add_epi32( m2[ 2], m2[ 6] ); |
2353 | 0 | m1[ 3] = _mm256_add_epi32( m2[ 3], m2[ 7] ); |
2354 | 0 | m1[ 4] = _mm256_sub_epi32( m2[ 0], m2[ 4] ); |
2355 | 0 | m1[ 5] = _mm256_sub_epi32( m2[ 1], m2[ 5] ); |
2356 | 0 | m1[ 6] = _mm256_sub_epi32( m2[ 2], m2[ 6] ); |
2357 | 0 | m1[ 7] = _mm256_sub_epi32( m2[ 3], m2[ 7] ); |
2358 | 0 | m1[ 8] = _mm256_add_epi32( m2[ 8], m2[12] ); |
2359 | 0 | m1[ 9] = _mm256_add_epi32( m2[ 9], m2[13] ); |
2360 | 0 | m1[10] = _mm256_add_epi32( m2[10], m2[14] ); |
2361 | 0 | m1[11] = _mm256_add_epi32( m2[11], m2[15] ); |
2362 | 0 | m1[12] = _mm256_sub_epi32( m2[ 8], m2[12] ); |
2363 | 0 | m1[13] = _mm256_sub_epi32( m2[ 9], m2[13] ); |
2364 | 0 | m1[14] = _mm256_sub_epi32( m2[10], m2[14] ); |
2365 | 0 | m1[15] = _mm256_sub_epi32( m2[11], m2[15] ); |
2366 | |
2367 | 0 | m2[ 0] = _mm256_add_epi32( m1[ 0], m1[ 2] ); |
2368 | 0 | m2[ 1] = _mm256_add_epi32( m1[ 1], m1[ 3] ); |
2369 | 0 | m2[ 2] = _mm256_sub_epi32( m1[ 0], m1[ 2] ); |
2370 | 0 | m2[ 3] = _mm256_sub_epi32( m1[ 1], m1[ 3] ); |
2371 | 0 | m2[ 4] = _mm256_add_epi32( m1[ 4], m1[ 6] ); |
2372 | 0 | m2[ 5] = _mm256_add_epi32( m1[ 5], m1[ 7] ); |
2373 | 0 | m2[ 6] = _mm256_sub_epi32( m1[ 4], m1[ 6] ); |
2374 | 0 | m2[ 7] = _mm256_sub_epi32( m1[ 5], m1[ 7] ); |
2375 | 0 | m2[ 8] = _mm256_add_epi32( m1[ 8], m1[10] ); |
2376 | 0 | m2[ 9] = _mm256_add_epi32( m1[ 9], m1[11] ); |
2377 | 0 | m2[10] = _mm256_sub_epi32( m1[ 8], m1[10] ); |
2378 | 0 | m2[11] = _mm256_sub_epi32( m1[ 9], m1[11] ); |
2379 | 0 | m2[12] = _mm256_add_epi32( m1[12], m1[14] ); |
2380 | 0 | m2[13] = _mm256_add_epi32( m1[13], m1[15] ); |
2381 | 0 | m2[14] = _mm256_sub_epi32( m1[12], m1[14] ); |
2382 | 0 | m2[15] = _mm256_sub_epi32( m1[13], m1[15] ); |
2383 | |
2384 | 0 | m1[ 0] = _mm256_add_epi32( m2[ 0], m2[ 1] ); |
2385 | 0 | m1[ 1] = _mm256_sub_epi32( m2[ 0], m2[ 1] ); |
2386 | 0 | m1[ 2] = _mm256_add_epi32( m2[ 2], m2[ 3] ); |
2387 | 0 | m1[ 3] = _mm256_sub_epi32( m2[ 2], m2[ 3] ); |
2388 | 0 | m1[ 4] = _mm256_add_epi32( m2[ 4], m2[ 5] ); |
2389 | 0 | m1[ 5] = _mm256_sub_epi32( m2[ 4], m2[ 5] ); |
2390 | 0 | m1[ 6] = _mm256_add_epi32( m2[ 6], m2[ 7] ); |
2391 | 0 | m1[ 7] = _mm256_sub_epi32( m2[ 6], m2[ 7] ); |
2392 | 0 | m1[ 8] = _mm256_add_epi32( m2[ 8], m2[ 9] ); |
2393 | 0 | m1[ 9] = _mm256_sub_epi32( m2[ 8], m2[ 9] ); |
2394 | 0 | m1[10] = _mm256_add_epi32( m2[10], m2[11] ); |
2395 | 0 | m1[11] = _mm256_sub_epi32( m2[10], m2[11] ); |
2396 | 0 | m1[12] = _mm256_add_epi32( m2[12], m2[13] ); |
2397 | 0 | m1[13] = _mm256_sub_epi32( m2[12], m2[13] ); |
2398 | 0 | m1[14] = _mm256_add_epi32( m2[14], m2[15] ); |
2399 | 0 | m1[15] = _mm256_sub_epi32( m2[14], m2[15] ); |
2400 | | |
2401 | | // transpose |
2402 | 0 | #define perm_unpacklo_epi128 ( ( 0 << 0 ) + ( 2 << 4 ) ) |
2403 | 0 | #define perm_unpackhi_epi128 ( ( 1 << 0 ) + ( 3 << 4 ) ) |
2404 | | |
2405 | | // 1. 8x8 |
2406 | 0 | m2[0] = _mm256_unpacklo_epi32( m1[0], m1[1] ); |
2407 | 0 | m2[1] = _mm256_unpacklo_epi32( m1[2], m1[3] ); |
2408 | 0 | m2[2] = _mm256_unpacklo_epi32( m1[4], m1[5] ); |
2409 | 0 | m2[3] = _mm256_unpacklo_epi32( m1[6], m1[7] ); |
2410 | 0 | m2[4] = _mm256_unpackhi_epi32( m1[0], m1[1] ); |
2411 | 0 | m2[5] = _mm256_unpackhi_epi32( m1[2], m1[3] ); |
2412 | 0 | m2[6] = _mm256_unpackhi_epi32( m1[4], m1[5] ); |
2413 | 0 | m2[7] = _mm256_unpackhi_epi32( m1[6], m1[7] ); |
2414 | |
2415 | 0 | m1[0] = _mm256_unpacklo_epi64( m2[0], m2[1] ); |
2416 | 0 | m1[1] = _mm256_unpackhi_epi64( m2[0], m2[1] ); |
2417 | 0 | m1[2] = _mm256_unpacklo_epi64( m2[2], m2[3] ); |
2418 | 0 | m1[3] = _mm256_unpackhi_epi64( m2[2], m2[3] ); |
2419 | 0 | m1[4] = _mm256_unpacklo_epi64( m2[4], m2[5] ); |
2420 | 0 | m1[5] = _mm256_unpackhi_epi64( m2[4], m2[5] ); |
2421 | 0 | m1[6] = _mm256_unpacklo_epi64( m2[6], m2[7] ); |
2422 | 0 | m1[7] = _mm256_unpackhi_epi64( m2[6], m2[7] ); |
2423 | |
2424 | 0 | m2[0] = _mm256_permute2x128_si256( m1[0], m1[2], perm_unpacklo_epi128 ); |
2425 | 0 | m2[1] = _mm256_permute2x128_si256( m1[0], m1[2], perm_unpackhi_epi128 ); |
2426 | 0 | m2[2] = _mm256_permute2x128_si256( m1[1], m1[3], perm_unpacklo_epi128 ); |
2427 | 0 | m2[3] = _mm256_permute2x128_si256( m1[1], m1[3], perm_unpackhi_epi128 ); |
2428 | 0 | m2[4] = _mm256_permute2x128_si256( m1[4], m1[6], perm_unpacklo_epi128 ); |
2429 | 0 | m2[5] = _mm256_permute2x128_si256( m1[4], m1[6], perm_unpackhi_epi128 ); |
2430 | 0 | m2[6] = _mm256_permute2x128_si256( m1[5], m1[7], perm_unpacklo_epi128 ); |
2431 | 0 | m2[7] = _mm256_permute2x128_si256( m1[5], m1[7], perm_unpackhi_epi128 ); |
2432 | | |
2433 | | // 2. 8x8 |
2434 | 0 | m2[0+8] = _mm256_unpacklo_epi32( m1[0+8], m1[1+8] ); |
2435 | 0 | m2[1+8] = _mm256_unpacklo_epi32( m1[2+8], m1[3+8] ); |
2436 | 0 | m2[2+8] = _mm256_unpacklo_epi32( m1[4+8], m1[5+8] ); |
2437 | 0 | m2[3+8] = _mm256_unpacklo_epi32( m1[6+8], m1[7+8] ); |
2438 | 0 | m2[4+8] = _mm256_unpackhi_epi32( m1[0+8], m1[1+8] ); |
2439 | 0 | m2[5+8] = _mm256_unpackhi_epi32( m1[2+8], m1[3+8] ); |
2440 | 0 | m2[6+8] = _mm256_unpackhi_epi32( m1[4+8], m1[5+8] ); |
2441 | 0 | m2[7+8] = _mm256_unpackhi_epi32( m1[6+8], m1[7+8] ); |
2442 | |
2443 | 0 | m1[0+8] = _mm256_unpacklo_epi64( m2[0+8], m2[1+8] ); |
2444 | 0 | m1[1+8] = _mm256_unpackhi_epi64( m2[0+8], m2[1+8] ); |
2445 | 0 | m1[2+8] = _mm256_unpacklo_epi64( m2[2+8], m2[3+8] ); |
2446 | 0 | m1[3+8] = _mm256_unpackhi_epi64( m2[2+8], m2[3+8] ); |
2447 | 0 | m1[4+8] = _mm256_unpacklo_epi64( m2[4+8], m2[5+8] ); |
2448 | 0 | m1[5+8] = _mm256_unpackhi_epi64( m2[4+8], m2[5+8] ); |
2449 | 0 | m1[6+8] = _mm256_unpacklo_epi64( m2[6+8], m2[7+8] ); |
2450 | 0 | m1[7+8] = _mm256_unpackhi_epi64( m2[6+8], m2[7+8] ); |
2451 | |
2452 | 0 | m2[0+8] = _mm256_permute2x128_si256( m1[0+8], m1[2+8], perm_unpacklo_epi128 ); |
2453 | 0 | m2[1+8] = _mm256_permute2x128_si256( m1[0+8], m1[2+8], perm_unpackhi_epi128 ); |
2454 | 0 | m2[2+8] = _mm256_permute2x128_si256( m1[1+8], m1[3+8], perm_unpacklo_epi128 ); |
2455 | 0 | m2[3+8] = _mm256_permute2x128_si256( m1[1+8], m1[3+8], perm_unpackhi_epi128 ); |
2456 | 0 | m2[4+8] = _mm256_permute2x128_si256( m1[4+8], m1[6+8], perm_unpacklo_epi128 ); |
2457 | 0 | m2[5+8] = _mm256_permute2x128_si256( m1[4+8], m1[6+8], perm_unpackhi_epi128 ); |
2458 | 0 | m2[6+8] = _mm256_permute2x128_si256( m1[5+8], m1[7+8], perm_unpacklo_epi128 ); |
2459 | 0 | m2[7+8] = _mm256_permute2x128_si256( m1[5+8], m1[7+8], perm_unpackhi_epi128 ); |
2460 | | |
2461 | 0 | #undef perm_unpacklo_epi128 |
2462 | 0 | #undef perm_unpackhi_epi128 |
2463 | | |
2464 | | // horizontal |
2465 | 0 | m1[0] = _mm256_add_epi32( m2[0], m2[4] ); |
2466 | 0 | m1[1] = _mm256_add_epi32( m2[1], m2[5] ); |
2467 | 0 | m1[2] = _mm256_add_epi32( m2[2], m2[6] ); |
2468 | 0 | m1[3] = _mm256_add_epi32( m2[3], m2[7] ); |
2469 | 0 | m1[4] = _mm256_sub_epi32( m2[0], m2[4] ); |
2470 | 0 | m1[5] = _mm256_sub_epi32( m2[1], m2[5] ); |
2471 | 0 | m1[6] = _mm256_sub_epi32( m2[2], m2[6] ); |
2472 | 0 | m1[7] = _mm256_sub_epi32( m2[3], m2[7] ); |
2473 | |
2474 | 0 | m2[0] = _mm256_add_epi32( m1[0], m1[2] ); |
2475 | 0 | m2[1] = _mm256_add_epi32( m1[1], m1[3] ); |
2476 | 0 | m2[2] = _mm256_sub_epi32( m1[0], m1[2] ); |
2477 | 0 | m2[3] = _mm256_sub_epi32( m1[1], m1[3] ); |
2478 | 0 | m2[4] = _mm256_add_epi32( m1[4], m1[6] ); |
2479 | 0 | m2[5] = _mm256_add_epi32( m1[5], m1[7] ); |
2480 | 0 | m2[6] = _mm256_sub_epi32( m1[4], m1[6] ); |
2481 | 0 | m2[7] = _mm256_sub_epi32( m1[5], m1[7] ); |
2482 | |
2483 | 0 | m1[0] = _mm256_abs_epi32( _mm256_add_epi32( m2[0], m2[1] ) ); |
2484 | 0 | m1[1] = _mm256_abs_epi32( _mm256_sub_epi32( m2[0], m2[1] ) ); |
2485 | 0 | m1[2] = _mm256_abs_epi32( _mm256_add_epi32( m2[2], m2[3] ) ); |
2486 | 0 | m1[3] = _mm256_abs_epi32( _mm256_sub_epi32( m2[2], m2[3] ) ); |
2487 | 0 | m1[4] = _mm256_abs_epi32( _mm256_add_epi32( m2[4], m2[5] ) ); |
2488 | 0 | m1[5] = _mm256_abs_epi32( _mm256_sub_epi32( m2[4], m2[5] ) ); |
2489 | 0 | m1[6] = _mm256_abs_epi32( _mm256_add_epi32( m2[6], m2[7] ) ); |
2490 | 0 | m1[7] = _mm256_abs_epi32( _mm256_sub_epi32( m2[6], m2[7] ) ); |
2491 | |
2492 | 0 | int absDc = _mm_cvtsi128_si32( _mm256_castsi256_si128( m1[0] ) ); |
2493 | |
2494 | 0 | m1[0 + 8] = _mm256_add_epi32( m2[0 + 8], m2[4 + 8] ); |
2495 | 0 | m1[1 + 8] = _mm256_add_epi32( m2[1 + 8], m2[5 + 8] ); |
2496 | 0 | m1[2 + 8] = _mm256_add_epi32( m2[2 + 8], m2[6 + 8] ); |
2497 | 0 | m1[3 + 8] = _mm256_add_epi32( m2[3 + 8], m2[7 + 8] ); |
2498 | 0 | m1[4 + 8] = _mm256_sub_epi32( m2[0 + 8], m2[4 + 8] ); |
2499 | 0 | m1[5 + 8] = _mm256_sub_epi32( m2[1 + 8], m2[5 + 8] ); |
2500 | 0 | m1[6 + 8] = _mm256_sub_epi32( m2[2 + 8], m2[6 + 8] ); |
2501 | 0 | m1[7 + 8] = _mm256_sub_epi32( m2[3 + 8], m2[7 + 8] ); |
2502 | |
2503 | 0 | m2[0 + 8] = _mm256_add_epi32( m1[0 + 8], m1[2 + 8] ); |
2504 | 0 | m2[1 + 8] = _mm256_add_epi32( m1[1 + 8], m1[3 + 8] ); |
2505 | 0 | m2[2 + 8] = _mm256_sub_epi32( m1[0 + 8], m1[2 + 8] ); |
2506 | 0 | m2[3 + 8] = _mm256_sub_epi32( m1[1 + 8], m1[3 + 8] ); |
2507 | 0 | m2[4 + 8] = _mm256_add_epi32( m1[4 + 8], m1[6 + 8] ); |
2508 | 0 | m2[5 + 8] = _mm256_add_epi32( m1[5 + 8], m1[7 + 8] ); |
2509 | 0 | m2[6 + 8] = _mm256_sub_epi32( m1[4 + 8], m1[6 + 8] ); |
2510 | 0 | m2[7 + 8] = _mm256_sub_epi32( m1[5 + 8], m1[7 + 8] ); |
2511 | |
2512 | 0 | m1[0 + 8] = _mm256_abs_epi32( _mm256_add_epi32( m2[0 + 8], m2[1 + 8] ) ); |
2513 | 0 | m1[1 + 8] = _mm256_abs_epi32( _mm256_sub_epi32( m2[0 + 8], m2[1 + 8] ) ); |
2514 | 0 | m1[2 + 8] = _mm256_abs_epi32( _mm256_add_epi32( m2[2 + 8], m2[3 + 8] ) ); |
2515 | 0 | m1[3 + 8] = _mm256_abs_epi32( _mm256_sub_epi32( m2[2 + 8], m2[3 + 8] ) ); |
2516 | 0 | m1[4 + 8] = _mm256_abs_epi32( _mm256_add_epi32( m2[4 + 8], m2[5 + 8] ) ); |
2517 | 0 | m1[5 + 8] = _mm256_abs_epi32( _mm256_sub_epi32( m2[4 + 8], m2[5 + 8] ) ); |
2518 | 0 | m1[6 + 8] = _mm256_abs_epi32( _mm256_add_epi32( m2[6 + 8], m2[7 + 8] ) ); |
2519 | 0 | m1[7 + 8] = _mm256_abs_epi32( _mm256_sub_epi32( m2[6 + 8], m2[7 + 8] ) ); |
2520 | | |
2521 | | // sum up |
2522 | 0 | m1[0] = _mm256_add_epi32( m1[0], m1[1] ); |
2523 | 0 | m1[1] = _mm256_add_epi32( m1[2], m1[3] ); |
2524 | 0 | m1[2] = _mm256_add_epi32( m1[4], m1[5] ); |
2525 | 0 | m1[3] = _mm256_add_epi32( m1[6], m1[7] ); |
2526 | 0 | m1[4] = _mm256_add_epi32( m1[8], m1[9] ); |
2527 | 0 | m1[5] = _mm256_add_epi32( m1[10], m1[11] ); |
2528 | 0 | m1[6] = _mm256_add_epi32( m1[12], m1[13] ); |
2529 | 0 | m1[7] = _mm256_add_epi32( m1[14], m1[15] ); |
2530 | | |
2531 | | // sum up |
2532 | 0 | m1[ 0] = _mm256_add_epi32( m1[ 0], m1[ 1] ); |
2533 | 0 | m1[ 1] = _mm256_add_epi32( m1[ 2], m1[ 3] ); |
2534 | 0 | m1[ 2] = _mm256_add_epi32( m1[ 4], m1[ 5] ); |
2535 | 0 | m1[ 3] = _mm256_add_epi32( m1[ 6], m1[ 7] ); |
2536 | |
2537 | 0 | m1[ 0] = _mm256_add_epi32( m1[ 0], m1[ 1] ); |
2538 | 0 | m1[ 1] = _mm256_add_epi32( m1[ 2], m1[ 3] ); |
2539 | |
2540 | 0 | __m256i iSum = _mm256_add_epi32( m1[0], m1[1] ); |
2541 | |
2542 | 0 | iSum = _mm256_hadd_epi32( iSum, iSum ); |
2543 | 0 | iSum = _mm256_hadd_epi32( iSum, iSum ); |
2544 | 0 | iSum = _mm256_add_epi32( iSum, _mm256_permute2x128_si256( iSum, iSum, 0x11 ) ); |
2545 | |
2546 | 0 | int sad2 = _mm_cvtsi128_si32( _mm256_castsi256_si128( iSum ) ); |
2547 | 0 | sad2 -= absDc; |
2548 | 0 | sad2 += absDc >> 2; |
2549 | 0 | sad = (uint32_t)(sad2 / sqrt(16.0 * 8) * 2); |
2550 | 0 | } |
2551 | |
2552 | 0 | #endif //USE_AVX2 |
2553 | |
2554 | 0 | return (sad); |
2555 | 0 | }
Unexecuted instantiation: RdCost_sse41.cpp:vvenc::xCalcHAD8x16_AVX2(short const*, short const*, int, int, int)
Unexecuted instantiation: RdCost_avx2.cpp:vvenc::xCalcHAD8x16_AVX2(short const*, short const*, int, int, int)
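The transpose in this function proceeds in three stages: unpacklo/unpackhi_epi32 interleaves row pairs, unpacklo/unpackhi_epi64 interleaves pairs of pairs, and _mm256_permute2x128_si256 finally crosses the 128-bit lanes; its selectors ( 0 << 0 ) + ( 2 << 4 ) = 0x20 and ( 1 << 0 ) + ( 3 << 4 ) = 0x31 act as a 128-bit unpacklo and unpackhi. The same cascade as a self-contained 8x8 transpose of 32-bit values (a minimal sketch; the helper name is hypothetical and not part of this file):

#include <immintrin.h>

// Transpose an 8x8 matrix of 32-bit ints, one row per __m256i, using the
// unpack32 / unpack64 / permute2x128 cascade from the function above.
static void transpose8x8_epi32( __m256i r[8] )
{
  __m256i t[8], u[8];
  t[0] = _mm256_unpacklo_epi32( r[0], r[1] );
  t[1] = _mm256_unpackhi_epi32( r[0], r[1] );
  t[2] = _mm256_unpacklo_epi32( r[2], r[3] );
  t[3] = _mm256_unpackhi_epi32( r[2], r[3] );
  t[4] = _mm256_unpacklo_epi32( r[4], r[5] );
  t[5] = _mm256_unpackhi_epi32( r[4], r[5] );
  t[6] = _mm256_unpacklo_epi32( r[6], r[7] );
  t[7] = _mm256_unpackhi_epi32( r[6], r[7] );

  u[0] = _mm256_unpacklo_epi64( t[0], t[2] );
  u[1] = _mm256_unpackhi_epi64( t[0], t[2] );
  u[2] = _mm256_unpacklo_epi64( t[1], t[3] );
  u[3] = _mm256_unpackhi_epi64( t[1], t[3] );
  u[4] = _mm256_unpacklo_epi64( t[4], t[6] );
  u[5] = _mm256_unpackhi_epi64( t[4], t[6] );
  u[6] = _mm256_unpacklo_epi64( t[5], t[7] );
  u[7] = _mm256_unpackhi_epi64( t[5], t[7] );

  r[0] = _mm256_permute2x128_si256( u[0], u[4], 0x20 ); // pair low lanes
  r[1] = _mm256_permute2x128_si256( u[1], u[5], 0x20 );
  r[2] = _mm256_permute2x128_si256( u[2], u[6], 0x20 );
  r[3] = _mm256_permute2x128_si256( u[3], u[7], 0x20 );
  r[4] = _mm256_permute2x128_si256( u[0], u[4], 0x31 ); // pair high lanes
  r[5] = _mm256_permute2x128_si256( u[1], u[5], 0x31 );
  r[6] = _mm256_permute2x128_si256( u[2], u[6], 0x31 );
  r[7] = _mm256_permute2x128_si256( u[3], u[7], 0x31 );
}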
2556 | | |
2557 | | template<X86_VEXT vext > |
2558 | | Distortion RdCost::xGetHAD2SADs_SIMD( const DistParam &rcDtParam ) |
2559 | 0 | { |
2560 | 0 | Distortion distHad = xGetHADs_SIMD<vext, false>( rcDtParam ); |
2561 | 0 | Distortion distSad = 0; |
2562 | |
2563 | 0 | { |
2564 | 0 | const short* pSrc1 = (const short*)rcDtParam.org.buf; |
2565 | 0 | const short* pSrc2 = (const short*)rcDtParam.cur.buf; |
2566 | 0 | const int iStrideSrc1 = rcDtParam.org.stride<<2; |
2567 | 0 | const int iStrideSrc2 = rcDtParam.cur.stride<<2; |
2568 | 0 | const int iRows = rcDtParam.org.height>>2; |
2569 | 0 | const int iCols = rcDtParam.org.width<<2; |
2570 | |
2571 | 0 | uint32_t uiSum = 0; |
2572 | 0 | CHECKD( (rcDtParam.org.width != rcDtParam.org.stride) || (rcDtParam.cur.stride != rcDtParam.org.stride) , "this function assumes compact, aligned buffering");
2573 | |
2574 | | #ifdef USE_AVX2 |
2575 | 0 | if( vext >= AVX2 ) |
2576 | 0 | { |
2577 | | // Do for widths that are a multiple of 16
2578 | 0 | __m256i vone = _mm256_set1_epi16( 1 ); |
2579 | 0 | __m256i vsum32 = _mm256_setzero_si256(); |
2580 | 0 | for( int iY = 0; iY < iRows; iY++ ) |
2581 | 0 | { |
2582 | 0 | __m256i vsum16 = _mm256_setzero_si256(); |
2583 | 0 | for( int iX = 0; iX < iCols; iX+=16 ) |
2584 | 0 | { |
2585 | 0 | __m256i vsrc1 = _mm256_load_si256( ( __m256i* )( &pSrc1[iX] ) ); |
2586 | 0 | __m256i vsrc2 = _mm256_load_si256( ( __m256i* )( &pSrc2[iX] ) ); |
2587 | 0 | vsum16 = _mm256_add_epi16( vsum16, _mm256_abs_epi16( _mm256_sub_epi16( vsrc1, vsrc2 ) ) ); |
2588 | 0 | } |
2589 | 0 | __m256i vsumtemp = _mm256_madd_epi16( vsum16, vone ); |
2590 | 0 | vsum32 = _mm256_add_epi32( vsum32, vsumtemp ); |
2591 | 0 | pSrc1 += iStrideSrc1; |
2592 | 0 | pSrc2 += iStrideSrc2; |
2593 | 0 | } |
2594 | 0 | vsum32 = _mm256_hadd_epi32( vsum32, vone ); |
2595 | 0 | vsum32 = _mm256_hadd_epi32( vsum32, vone ); |
2596 | 0 | uiSum = _mm_cvtsi128_si32( _mm256_castsi256_si128( vsum32 ) ) + _mm_cvtsi128_si32( _mm256_extracti128_si256( vsum32, 1 ) ); |
2597 | 0 | } |
2598 | 0 | else |
2599 | 0 | #endif |
2600 | 0 | { |
2601 | | // For widths that are a multiple of 8
2602 | 0 | __m128i vone = _mm_set1_epi16( 1 ); |
2603 | 0 | __m128i vsum32 = _mm_setzero_si128(); |
2604 | 0 | for( int iY = 0; iY < iRows; iY++ ) |
2605 | 0 | { |
2606 | 0 | __m128i vsum16 = _mm_setzero_si128(); |
2607 | 0 | for( int iX = 0; iX < iCols; iX+=8 ) |
2608 | 0 | { |
2609 | 0 | __m128i vsrc1 = _mm_load_si128( ( const __m128i* )( &pSrc1[iX] ) ); |
2610 | 0 | __m128i vsrc2 = _mm_load_si128( ( const __m128i* )( &pSrc2[iX] ) ); |
2611 | 0 | vsum16 = _mm_add_epi16( vsum16, _mm_abs_epi16( _mm_sub_epi16( vsrc1, vsrc2 ) ) ); |
2612 | 0 | } |
2613 | 0 | __m128i vsumtemp = _mm_madd_epi16( vsum16, vone ); |
2614 | 0 | vsum32 = _mm_add_epi32( vsum32, vsumtemp ); |
2615 | 0 | pSrc1 += iStrideSrc1; |
2616 | 0 | pSrc2 += iStrideSrc2; |
2617 | 0 | } |
2618 | 0 | vsum32 = _mm_hadd_epi32( vsum32, vone ); |
2619 | 0 | vsum32 = _mm_hadd_epi32( vsum32, vone ); |
2620 | 0 | uiSum = _mm_cvtsi128_si32( vsum32 ); |
2621 | 0 | } |
2622 | 0 | distSad = uiSum >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth); |
2623 | 0 | } |
2624 | | |
2625 | 0 | return std::min( distHad, 2*distSad); |
2626 | 0 | }
Unexecuted instantiation: unsigned long vvenc::RdCost::xGetHAD2SADs_SIMD<(vvenc::x86_simd::X86_VEXT)1>(vvenc::DistParam const&)
Unexecuted instantiation: unsigned long vvenc::RdCost::xGetHAD2SADs_SIMD<(vvenc::x86_simd::X86_VEXT)4>(vvenc::DistParam const&)
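Both width branches above keep the running SAD in 16-bit lanes and widen only once per row via _mm256_madd_epi16 / _mm_madd_epi16 with a vector of ones, which fuses each pair of adjacent 16-bit lanes into one 32-bit partial sum. A minimal self-contained sketch of the pattern (hypothetical helper; unlike the code above it widens only once at the end, so it assumes the per-lane 16-bit sums cannot overflow):

#include <immintrin.h>
#include <cstdint>

// SAD over n (a multiple of 8) 16-bit samples: accumulate absolute
// differences in 16-bit lanes, widen with madd(x, 1), reduce horizontally.
static uint32_t sad16( const int16_t* a, const int16_t* b, int n )
{
  __m128i acc16 = _mm_setzero_si128();
  for( int i = 0; i < n; i += 8 )
  {
    __m128i va = _mm_loadu_si128( (const __m128i*)( a + i ) );
    __m128i vb = _mm_loadu_si128( (const __m128i*)( b + i ) );
    acc16 = _mm_add_epi16( acc16, _mm_abs_epi16( _mm_sub_epi16( va, vb ) ) );
  }
  __m128i acc32 = _mm_madd_epi16( acc16, _mm_set1_epi16( 1 ) ); // widen to 32 bit
  acc32 = _mm_hadd_epi32( acc32, acc32 );                       // horizontal reduce
  acc32 = _mm_hadd_epi32( acc32, acc32 );
  return (uint32_t)_mm_cvtsi128_si32( acc32 );
}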
2627 | | |
2628 | | template<X86_VEXT vext> |
2629 | | Distortion RdCost::xGetSADwMask_SIMD(const DistParam &rcDtParam) |
2630 | 0 | { |
2631 | 0 | if (rcDtParam.org.width < 4 || rcDtParam.bitDepth > 10 || rcDtParam.applyWeight) |
2632 | 0 | return RdCost::xGetSADwMask(rcDtParam); |
2633 | | |
2634 | 0 | const short *src1 = (const short *) rcDtParam.org.buf; |
2635 | 0 | const short *src2 = (const short *) rcDtParam.cur.buf; |
2636 | 0 | const short *weightMask = (const short *) rcDtParam.mask; |
2637 | 0 | int rows = rcDtParam.org.height; |
2638 | 0 | int cols = rcDtParam.org.width; |
2639 | 0 | int subShift = rcDtParam.subShift; |
2640 | 0 | int subStep = (1 << subShift); |
2641 | 0 | const int strideSrc1 = rcDtParam.org.stride * subStep; |
2642 | 0 | const int strideSrc2 = rcDtParam.cur.stride * subStep; |
2643 | 0 | const int strideMask = rcDtParam.maskStride * subStep; |
2644 | |
2645 | 0 | Distortion sum = 0; |
2646 | 0 | if (vext >= AVX2 && (cols & 15) == 0) |
2647 | 0 | { |
2648 | | #if defined( USE_AVX2 ) |
2649 | | // Do for widths that are a multiple of 16
2650 | | __m256i vzero = _mm256_setzero_si256(); |
2651 | | __m256i vsum32 = vzero; |
2652 | 0 | for (int y = 0; y < rows; y += subStep) |
2653 | 0 | { |
2654 | 0 | for (int x = 0; x < cols; x += 16) |
2655 | 0 | { |
2656 | 0 | __m256i vsrc1 = _mm256_loadu_si256((__m256i *) (&src1[x])); |
2657 | 0 | __m256i vsrc2 = _mm256_loadu_si256((__m256i *) (&src2[x])); |
2658 | 0 | __m256i vmask; |
2659 | | |
2660 | 0 | if (rcDtParam.stepX == -1) |
2661 | 0 | { |
2662 | 0 | vmask = _mm256_loadu_si256((__m256i *) ((&weightMask[x]) - (x << 1) - (16 - 1))); |
2663 | 0 | const __m256i shuffle_mask = _mm256_set_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 1, 0, 3, 2, |
2664 | 0 | 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); |
2665 | 0 | vmask = _mm256_shuffle_epi8(vmask, shuffle_mask); |
2666 | 0 | vmask = _mm256_permute4x64_epi64(vmask, _MM_SHUFFLE(1, 0, 3, 2)); |
2667 | 0 | } |
2668 | 0 | else |
2669 | 0 | { |
2670 | 0 | vmask = _mm256_loadu_si256( ( __m256i * ) ( &weightMask[x] ) ); |
2671 | 0 | } |
2672 | |
2673 | 0 | vsum32 = _mm256_add_epi32(vsum32, _mm256_madd_epi16(vmask, _mm256_abs_epi16(_mm256_sub_epi16(vsrc1, vsrc2)))); |
2674 | 0 | } |
2675 | 0 | src1 += strideSrc1; |
2676 | 0 | src2 += strideSrc2; |
2677 | 0 | weightMask += strideMask; |
2678 | 0 | } |
2679 | | vsum32 = _mm256_hadd_epi32(vsum32, vzero); |
2680 | | vsum32 = _mm256_hadd_epi32(vsum32, vzero); |
2681 | | sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(vsum32)) |
2682 | | + _mm_cvtsi128_si32(_mm256_castsi256_si128(_mm256_permute2x128_si256(vsum32, vsum32, 0x11))); |
2683 | | #endif |
2684 | 0 | } |
2685 | 0 | else |
2686 | 0 | { |
2687 | | // Do the remaining widths in steps of 8
2688 | 0 | __m128i vzero = _mm_setzero_si128(); |
2689 | 0 | __m128i vsum32 = vzero; |
2690 | 0 | for (int y = 0; y < rows; y += subStep) |
2691 | 0 | { |
2692 | 0 | for (int x = 0; x < cols; x += 8) |
2693 | 0 | { |
2694 | 0 | __m128i vsrc1 = _mm_loadu_si128((const __m128i *) (&src1[x])); |
2695 | 0 | __m128i vsrc2 = _mm_loadu_si128((const __m128i *) (&src2[x])); |
2696 | 0 | __m128i vmask; |
2697 | 0 | if (rcDtParam.stepX == -1) |
2698 | 0 | { |
2699 | 0 | vmask = _mm_loadu_si128((__m128i *) ((&weightMask[x]) - (x << 1) - (8 - 1))); |
2700 | 0 | const __m128i shuffle_mask = _mm_set_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); |
2701 | 0 | vmask = _mm_shuffle_epi8(vmask, shuffle_mask); |
2702 | 0 | } |
2703 | 0 | else |
2704 | 0 | { |
2705 | 0 | vmask = _mm_loadu_si128((const __m128i *) (&weightMask[x])); |
2706 | 0 | } |
2707 | 0 | vsum32 = _mm_add_epi32(vsum32, _mm_madd_epi16(vmask, _mm_abs_epi16(_mm_sub_epi16(vsrc1, vsrc2)))); |
2708 | 0 | } |
2709 | 0 | src1 += strideSrc1; |
2710 | 0 | src2 += strideSrc2; |
2711 | 0 | weightMask += strideMask; |
2712 | 0 | } |
2713 | 0 | vsum32 = _mm_hadd_epi32(vsum32, vzero); |
2714 | 0 | vsum32 = _mm_hadd_epi32(vsum32, vzero); |
2715 | 0 | sum = _mm_cvtsi128_si32(vsum32); |
2716 | 0 | } |
2717 | 0 | sum <<= subShift; |
2718 | 0 | return sum >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth); |
2719 | 0 | }
Unexecuted instantiation: unsigned long vvenc::RdCost::xGetSADwMask_SIMD<(vvenc::x86_simd::X86_VEXT)1>(vvenc::DistParam const&)
Unexecuted instantiation: unsigned long vvenc::RdCost::xGetSADwMask_SIMD<(vvenc::x86_simd::X86_VEXT)4>(vvenc::DistParam const&)
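When stepX == -1 the weight mask is applied mirrored. The address arithmetic works out as follows: &weightMask[x] - (x << 1) - (16 - 1) equals weightMask - x - 15, so the 16 loaded samples are weightMask[-x-15] .. weightMask[-x], and the byte shuffle plus the 64-bit lane permute reverse their order, leaving output lane j holding weightMask[-(x+j)]. A scalar restatement of that branch (a sketch; it assumes mask points into the interior of a buffer so the negative offsets remain valid, as the caller arranges here):

#include <cstdint>
#include <cstdlib>

// Sketch of the mirrored-mask SAD computed by the stepX == -1 branch:
// column x is weighted with the mask sample at the mirrored offset -x.
static int64_t maskedSadMirrored( const int16_t* src1, const int16_t* src2,
                                  const int16_t* mask, int cols )
{
  int64_t acc = 0;
  for( int x = 0; x < cols; x++ )
    acc += (int64_t)mask[-x] * std::abs( src1[x] - src2[x] );
  return acc;
}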
2720 | | |
2721 | | template<X86_VEXT vext, bool fastHad> |
2722 | | Distortion RdCost::xGetHADs_SIMD( const DistParam &rcDtParam ) |
2723 | 0 | { |
2724 | 0 | const Pel* piOrg = rcDtParam.org.buf; |
2725 | 0 | const Pel* piCur = rcDtParam.cur.buf; |
2726 | 0 | const int iRows = rcDtParam.org.height; |
2727 | 0 | const int iCols = rcDtParam.org.width; |
2728 | 0 | const int iStrideCur = rcDtParam.cur.stride; |
2729 | 0 | const int iStrideOrg = rcDtParam.org.stride; |
2730 | 0 | const int iBitDepth = rcDtParam.bitDepth; |
2731 | |
2732 | 0 | int x, y; |
2733 | 0 | Distortion uiSum = 0; |
2734 | |
2735 | 0 | if( iCols > iRows && ( iCols & 15 ) == 0 && ( iRows & 7 ) == 0 ) |
2736 | 0 | { |
2737 | 0 | for( y = 0; y < iRows; y += 8 ) |
2738 | 0 | { |
2739 | 0 | for( x = 0; x < iCols; x += 16 ) |
2740 | 0 | { |
2741 | 0 | if( vext >= AVX2 ) |
2742 | 0 | uiSum += xCalcHAD16x8_AVX2( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur, iBitDepth ); |
2743 | 0 | else |
2744 | 0 | uiSum += xCalcHAD16x8_SSE( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur, iBitDepth ); |
2745 | 0 | } |
2746 | 0 | piOrg += 8*iStrideOrg; |
2747 | 0 | piCur += 8*iStrideCur; |
2748 | 0 | } |
2749 | 0 | } |
2750 | 0 | else if( iCols < iRows && ( iRows & 15 ) == 0 && ( iCols & 7 ) == 0 ) |
2751 | 0 | { |
2752 | 0 | for( y = 0; y < iRows; y += 16 ) |
2753 | 0 | { |
2754 | 0 | for( x = 0; x < iCols; x += 8 ) |
2755 | 0 | { |
2756 | 0 | if( vext >= AVX2 ) |
2757 | 0 | uiSum += xCalcHAD8x16_AVX2( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur, iBitDepth ); |
2758 | 0 | else |
2759 | 0 | uiSum += xCalcHAD8x16_SSE( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur, iBitDepth ); |
2760 | 0 | } |
2761 | 0 | piOrg += 16*iStrideOrg; |
2762 | 0 | piCur += 16*iStrideCur; |
2763 | 0 | } |
2764 | 0 | } |
2765 | 0 | else if( iCols > iRows && ( iCols & 7 ) == 0 && ( iRows & 3 ) == 0 ) |
2766 | 0 | { |
2767 | 0 | for( y = 0; y < iRows; y += 4 ) |
2768 | 0 | { |
2769 | 0 | for( x = 0; x < iCols; x += 8 ) |
2770 | 0 | { |
2771 | 0 | uiSum += xCalcHAD8x4_SSE( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur, iBitDepth ); |
2772 | 0 | } |
2773 | 0 | piOrg += 4*iStrideOrg; |
2774 | 0 | piCur += 4*iStrideCur; |
2775 | 0 | } |
2776 | 0 | } |
2777 | 0 | else if( iCols < iRows && ( iRows & 7 ) == 0 && ( iCols & 3 ) == 0 ) |
2778 | 0 | { |
2779 | 0 | for( y = 0; y < iRows; y += 8 ) |
2780 | 0 | { |
2781 | 0 | for( x = 0; x < iCols; x += 4 ) |
2782 | 0 | { |
2783 | 0 | uiSum += xCalcHAD4x8_SSE( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur, iBitDepth ); |
2784 | 0 | } |
2785 | 0 | piOrg += 8*iStrideOrg; |
2786 | 0 | piCur += 8*iStrideCur; |
2787 | 0 | } |
2788 | 0 | } |
2789 | 0 | else if( fastHad && vext >= AVX2 && ( ( ( iRows | iCols ) & 31 ) == 0 ) && ( iRows == iCols ) ) |
2790 | 0 | { |
2791 | 0 | for( y = 0; y < iRows; y += 32 ) |
2792 | 0 | { |
2793 | 0 | for( x = 0; x < iCols; x += 32 ) |
2794 | 0 | { |
2795 | 0 | uiSum += xCalcHAD32x32_fast_AVX2( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur, iBitDepth ); |
2796 | 0 | } |
2797 | 0 | piOrg += 32 * iStrideOrg; |
2798 | 0 | piCur += 32 * iStrideCur; |
2799 | 0 | } |
2800 | 0 | } |
2801 | 0 | else if( fastHad && ( ( ( iRows | iCols ) & 31 ) == 0 ) && ( iRows == iCols ) ) |
2802 | 0 | { |
2803 | 0 | for( y = 0; y < iRows; y += 16 ) |
2804 | 0 | { |
2805 | 0 | for( x = 0; x < iCols; x += 16 ) |
2806 | 0 | { |
2807 | 0 | uiSum += xCalcHAD16x16_fast_SSE( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur, iBitDepth ); |
2808 | 0 | } |
2809 | 0 | piOrg += 16 * iStrideOrg; |
2810 | 0 | piCur += 16 * iStrideCur; |
2811 | 0 | } |
2812 | 0 | } |
2813 | 0 | else if( vext >= AVX2 && ( ( ( iRows | iCols ) & 15 ) == 0 ) && ( iRows == iCols ) ) |
2814 | 0 | { |
2815 | 0 | for( y = 0; y < iRows; y += 16 ) |
2816 | 0 | { |
2817 | 0 | for( x = 0; x < iCols; x += 16 ) |
2818 | 0 | { |
2819 | 0 | uiSum += xCalcHAD16x16_AVX2( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur, iBitDepth ); |
2820 | 0 | } |
2821 | 0 | piOrg += 16*iStrideOrg; |
2822 | 0 | piCur += 16*iStrideCur; |
2823 | 0 | } |
2824 | 0 | } |
2825 | 0 | else if( ( ( ( iRows | iCols ) & 7 ) == 0 ) && ( iRows == iCols ) ) |
2826 | 0 | { |
2827 | 0 | for( y = 0; y<iRows; y += 8 ) |
2828 | 0 | { |
2829 | 0 | for( x = 0; x < iCols; x += 8 ) |
2830 | 0 | { |
2831 | 0 | uiSum += xCalcHAD8x8_SSE( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur, iBitDepth ); |
2832 | 0 | } |
2833 | 0 | piOrg += 8*iStrideOrg; |
2834 | 0 | piCur += 8*iStrideCur; |
2835 | 0 | } |
2836 | 0 | } |
2837 | 0 | else if( ( iRows % 4 == 0 ) && ( iCols % 4 == 0 ) ) |
2838 | 0 | { |
2839 | 0 | for( y = 0; y < iRows; y += 4 ) |
2840 | 0 | { |
2841 | 0 | for( x = 0; x < iCols; x += 4 ) |
2842 | 0 | { |
2843 | 0 | uiSum += xCalcHAD4x4_SSE( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur ); |
2844 | 0 | } |
2845 | 0 | piOrg += 4*iStrideOrg; |
2846 | 0 | piCur += 4*iStrideCur; |
2847 | 0 | } |
2848 | 0 | } |
2849 | 0 | else if( ( iRows % 2 == 0 ) && ( iCols % 2 == 0 ) ) |
2850 | 0 | { |
2851 | 0 | for( y = 0; y < iRows; y += 2 ) |
2852 | 0 | { |
2853 | 0 | for( x = 0; x < iCols; x += 2 ) |
2854 | 0 | { |
2855 | 0 | uiSum += xCalcHADs2x2( &piOrg[x], &piCur[x], iStrideOrg, iStrideCur ); |
2856 | 0 | } |
2857 | 0 | piOrg += 2*iStrideOrg; |
2858 | 0 | piCur += 2*iStrideCur; |
2859 | 0 | } |
2860 | 0 | } |
2861 | 0 | else |
2862 | 0 | { |
2863 | 0 | THROW( "Unsupported size" ); |
2864 | 0 | } |
2865 | | |
2866 | 0 | return uiSum >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth); |
2867 | 0 | }
Unexecuted instantiation: unsigned long vvenc::RdCost::xGetHADs_SIMD<(vvenc::x86_simd::X86_VEXT)1, false>(vvenc::DistParam const&)
Unexecuted instantiation: unsigned long vvenc::RdCost::xGetHADs_SIMD<(vvenc::x86_simd::X86_VEXT)1, true>(vvenc::DistParam const&)
Unexecuted instantiation: unsigned long vvenc::RdCost::xGetHADs_SIMD<(vvenc::x86_simd::X86_VEXT)4, false>(vvenc::DistParam const&)
Unexecuted instantiation: unsigned long vvenc::RdCost::xGetHADs_SIMD<(vvenc::x86_simd::X86_VEXT)4, true>(vvenc::DistParam const&)
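The dispatcher above always picks the largest kernel whose footprint tiles the block: wide blocks map to 16x8 or 8x4 kernels, tall blocks to 8x16 or 4x8, square blocks to 16x16, 8x8, 4x4 or 2x2, and the fastHad variants handle squares that are a multiple of 32 with the subsampled 32x32 (AVX2) or 16x16 (SSE) kernels. As a worked example, a 32x16 block takes the first branch (iCols > iRows, 32 is a multiple of 16, 16 is a multiple of 8) and is covered by four xCalcHAD16x8 calls, two per 8-row stripe; their SATD values are summed and the total is right-shifted by DISTORTION_PRECISION_ADJUSTMENT(bitDepth).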
2868 | | |
2869 | | inline Distortion getWeightedMSE_SIMD(const Pel org, const Pel cur, const int64_t fixedPTweight, unsigned uiShift) |
2870 | 0 | { |
2871 | 0 | const Intermediate_Int iTemp = org - cur; |
2872 | 0 | return Intermediate_Int((fixedPTweight*(iTemp*iTemp) + (1 << 15)) >> uiShift); |
2873 | 0 | } |
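getWeightedMSE_SIMD is a fixed-point weighted squared error: the weight is in Q16 format, so the result is (w * d^2 + 2^15) >> uiShift, i.e. w * d^2 / 2^16 rounded to nearest, with the distortion-precision shift folded into uiShift by the callers below. As a worked example (assuming DISTORTION_PRECISION_ADJUSTMENT(b) is b - 8, as used elsewhere in vvenc): at bitDepth 10 the callers use uiShift = 16 + 2 * 2 = 20, so a difference d = 4 with weight w = 3 << 16 yields (3 * 65536 * 16 + 32768) >> 20 = 3, the Q16 scaling and the 10-bit squared-error rescaling (a factor of 4^2 = 16) both cancelling out.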
2874 | | |
2875 | | template<X86_VEXT vext, int csx> |
2876 | | static Distortion lumaWeightedSSE_SIMD( const DistParam& rcDtParam, ChromaFormat chmFmt, const uint32_t* lumaWeights ) |
2877 | 0 | { |
2878 | 0 | int iRows = rcDtParam.org.height; |
2879 | 0 | const Pel* piOrg = rcDtParam.org.buf; |
2880 | 0 | const Pel* piCur = rcDtParam.cur.buf; |
2881 | 0 | const int iCols = rcDtParam.org.width; |
2882 | 0 | const int iStrideCur = rcDtParam.cur.stride; |
2883 | 0 | const int iStrideOrg = rcDtParam.org.stride; |
2884 | 0 | const Pel* piOrgLuma = rcDtParam.orgLuma->buf; |
2885 | 0 | const int iStrideOrgLuma = rcDtParam.orgLuma->stride; |
2886 | |
2887 | 0 | Distortion uiSum = 0; |
2888 | 0 | const uint32_t uiShift = 16 + (DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth) << 1); |
2889 | 0 | const __m128i vShift = _mm_cvtsi32_si128(uiShift); |
2890 | |
2891 | 0 | const ComponentID compId = rcDtParam.compID; |
2892 | 0 | const size_t cShiftY = getComponentScaleY(compId, chmFmt); |
2893 | | |
2894 | 0 | if( ( iCols & 7 ) == 0 ) |
2895 | 0 | { |
2896 | 0 | const __m128i xoffs = _mm_set1_epi64x( 1 << 15 ); |
2897 | 0 | __m128i xsum = _mm_setzero_si128(); |
2898 | |
2899 | 0 | for( ; iRows != 0; iRows-- ) |
2900 | 0 | { |
2901 | 0 | for (int n = 0; n < iCols; n += 8 ) |
2902 | 0 | { |
2903 | 0 | const int o = n<<csx; |
2904 | | |
2905 | 0 | __m128i xorg = _mm_loadu_si128( ( const __m128i* ) &piOrg[n] ); |
2906 | 0 | __m128i xcur = _mm_loadu_si128( ( const __m128i* ) &piCur[n] ); |
2907 | | |
2908 | 0 | xcur = _mm_sub_epi16 ( xorg, xcur ); |
2909 | |
2910 | 0 | const __m128i |
2911 | 0 | xmlo = _mm_mullo_epi16 ( xcur, xcur ), |
2912 | 0 | xmhi = _mm_mulhi_epi16 ( xcur, xcur ); |
2913 | |
2914 | 0 | __m128i |
2915 | 0 | xwgt = _mm_setr_epi32 ( lumaWeights[piOrgLuma[o+(0<<csx)]], |
2916 | 0 | lumaWeights[piOrgLuma[o+(1<<csx)]], |
2917 | 0 | lumaWeights[piOrgLuma[o+(2<<csx)]], |
2918 | 0 | lumaWeights[piOrgLuma[o+(3<<csx)]] ); |
2919 | | |
2920 | 0 | __m128i |
2921 | 0 | xmul = _mm_unpacklo_epi16( xmlo, xmhi ); |
2922 | 0 | __m128i |
2923 | 0 | xtmp = _mm_mul_epi32 ( xmul, xwgt ); |
2924 | 0 | xtmp = _mm_add_epi64 ( xtmp, xoffs ); |
2925 | 0 | xtmp = _mm_srl_epi64 ( xtmp, vShift ); |
2926 | 0 | xsum = _mm_add_epi64 ( xsum, xtmp ); |
2927 | |
2928 | 0 | xwgt = _mm_shuffle_epi32 ( xwgt, 1 + 0 + 48 + 128 ); |
2929 | 0 | xmul = _mm_shuffle_epi32 ( xmul, 1 + 0 + 48 + 128 ); |
2930 | 0 | xtmp = _mm_mul_epi32 ( xmul, xwgt ); |
2931 | 0 | xtmp = _mm_add_epi64 ( xtmp, xoffs ); |
2932 | 0 | xtmp = _mm_srl_epi64 ( xtmp, vShift ); |
2933 | 0 | xsum = _mm_add_epi64 ( xsum, xtmp ); |
2934 | | |
2935 | 0 | xwgt = _mm_setr_epi32 ( lumaWeights[piOrgLuma[o+(4<<csx)]], |
2936 | 0 | lumaWeights[piOrgLuma[o+(5<<csx)]], |
2937 | 0 | lumaWeights[piOrgLuma[o+(6<<csx)]], |
2938 | 0 | lumaWeights[piOrgLuma[o+(7<<csx)]] ); |
2939 | |
2940 | 0 | xmul = _mm_unpackhi_epi16( xmlo, xmhi ); |
2941 | 0 | xtmp = _mm_mul_epi32 ( xmul, xwgt ); |
2942 | 0 | xtmp = _mm_add_epi64 ( xtmp, xoffs ); |
2943 | 0 | xtmp = _mm_srl_epi64 ( xtmp, vShift ); |
2944 | 0 | xsum = _mm_add_epi64 ( xsum, xtmp ); |
2945 | |
2946 | 0 | xwgt = _mm_shuffle_epi32 ( xwgt, 1 + 0 + 48 + 128 ); |
2947 | 0 | xmul = _mm_shuffle_epi32 ( xmul, 1 + 0 + 48 + 128 ); |
2948 | 0 | xtmp = _mm_mul_epi32 ( xmul, xwgt ); |
2949 | 0 | xtmp = _mm_add_epi64 ( xtmp, xoffs ); |
2950 | 0 | xtmp = _mm_srl_epi64 ( xtmp, vShift ); |
2951 | 0 | xsum = _mm_add_epi64 ( xsum, xtmp ); |
2952 | | |
2953 | | //uiSum += getWeightedMSE_SIMD( piOrg[n ], piCur[n ], lumaWeights[piOrgLuma[(n )<<csx]], uiShift ); |
2954 | | //uiSum += getWeightedMSE_SIMD( piOrg[n+1], piCur[n+1], lumaWeights[piOrgLuma[(n+1)<<csx]], uiShift ); |
2955 | 0 | } |
2956 | |
2957 | 0 | piOrg += iStrideOrg; |
2958 | 0 | piCur += iStrideCur; |
2959 | 0 | piOrgLuma += iStrideOrgLuma<<cShiftY; |
2960 | 0 | } |
2961 | |
2962 | 0 | uiSum += _mm_extract_epi64( xsum, 0 ); |
2963 | 0 | uiSum += _mm_extract_epi64( xsum, 1 ); |
2964 | |
2965 | 0 | return uiSum; |
2966 | 0 | } |
2967 | 0 | else |
2968 | 0 | if( ( iCols & 3 ) == 0 ) |
2969 | 0 | { |
2970 | 0 | const __m128i xoffs = _mm_set1_epi64x( 1 << 15 ); |
2971 | 0 | __m128i xsum = _mm_setzero_si128(); |
2972 | |
2973 | 0 | for( ; iRows != 0; iRows-- ) |
2974 | 0 | { |
2975 | 0 | for (int n = 0; n < iCols; n += 4 ) |
2976 | 0 | { |
2977 | 0 | const int o = n<<csx; |
2978 | | |
2979 | 0 | __m128i xorg = _vv_loadl_epi64( ( const __m128i* ) &piOrg[n] ); |
2980 | 0 | __m128i xcur = _vv_loadl_epi64( ( const __m128i* ) &piCur[n] ); |
2981 | | |
2982 | 0 | xcur = _mm_sub_epi16 ( xorg, xcur ); |
2983 | |
2984 | 0 | const __m128i |
2985 | 0 | xmlo = _mm_mullo_epi16 ( xcur, xcur ), |
2986 | 0 | xmhi = _mm_mulhi_epi16 ( xcur, xcur ); |
2987 | |
2988 | 0 | __m128i |
2989 | 0 | xwgt = _mm_setr_epi32 ( lumaWeights[piOrgLuma[o+(0<<csx)]], |
2990 | 0 | lumaWeights[piOrgLuma[o+(1<<csx)]], |
2991 | 0 | lumaWeights[piOrgLuma[o+(2<<csx)]], |
2992 | 0 | lumaWeights[piOrgLuma[o+(3<<csx)]] ); |
2993 | | |
2994 | 0 | __m128i |
2995 | 0 | xmul = _mm_unpacklo_epi16( xmlo, xmhi ); |
2996 | 0 | __m128i |
2997 | 0 | xtmp = _mm_mul_epi32 ( xmul, xwgt ); |
2998 | 0 | xtmp = _mm_add_epi64 ( xtmp, xoffs ); |
2999 | 0 | xtmp = _mm_srl_epi64 ( xtmp, vShift ); |
3000 | 0 | xsum = _mm_add_epi64 ( xsum, xtmp ); |
3001 | |
3002 | 0 | xwgt = _mm_shuffle_epi32 ( xwgt, 1 + 0 + 48 + 128 ); |
3003 | 0 | xmul = _mm_shuffle_epi32 ( xmul, 1 + 0 + 48 + 128 ); |
3004 | 0 | xtmp = _mm_mul_epi32 ( xmul, xwgt ); |
3005 | 0 | xtmp = _mm_add_epi64 ( xtmp, xoffs ); |
3006 | 0 | xtmp = _mm_srl_epi64 ( xtmp, vShift ); |
3007 | 0 | xsum = _mm_add_epi64 ( xsum, xtmp ); |
3008 | | |
3009 | | //uiSum += getWeightedMSE_SIMD( piOrg[n ], piCur[n ], lumaWeights[piOrgLuma[(n )<<csx]], uiShift ); |
3010 | | //uiSum += getWeightedMSE_SIMD( piOrg[n+1], piCur[n+1], lumaWeights[piOrgLuma[(n+1)<<csx]], uiShift ); |
3011 | 0 | } |
3012 | |
3013 | 0 | piOrg += iStrideOrg; |
3014 | 0 | piCur += iStrideCur; |
3015 | 0 | piOrgLuma += iStrideOrgLuma<<cShiftY; |
3016 | 0 | } |
3017 | |
3018 | 0 | uiSum += _mm_extract_epi64( xsum, 0 ); |
3019 | 0 | uiSum += _mm_extract_epi64( xsum, 1 ); |
3020 | |
3021 | 0 | return uiSum; |
3022 | 0 | } |
3023 | 0 | else |
3024 | 0 | if( ( iCols & 1 ) == 0 ) |
3025 | 0 | { |
3026 | 0 | for( ; iRows != 0; iRows-- ) |
3027 | 0 | { |
3028 | 0 | for (int n = 0; n < iCols; n+=2 ) |
3029 | 0 | { |
3030 | 0 | uiSum += getWeightedMSE_SIMD( piOrg[n ], piCur[n ], lumaWeights[piOrgLuma[(n )<<csx]], uiShift ); |
3031 | 0 | uiSum += getWeightedMSE_SIMD( piOrg[n+1], piCur[n+1], lumaWeights[piOrgLuma[(n+1)<<csx]], uiShift ); |
3032 | 0 | } |
3033 | |
3034 | 0 | piOrg += iStrideOrg; |
3035 | 0 | piCur += iStrideCur; |
3036 | 0 | piOrgLuma += iStrideOrgLuma<<cShiftY; |
3037 | 0 | } |
3038 | |
3039 | 0 | return uiSum; |
3040 | 0 | } |
3041 | 0 | else |
3042 | 0 | { |
3043 | 0 | for( ; iRows != 0; iRows-- ) |
3044 | 0 | { |
3045 | 0 | for (int n = 0; n < iCols; n++ ) |
3046 | 0 | { |
3047 | 0 | uiSum += getWeightedMSE_SIMD( piOrg[n ], piCur[n ], lumaWeights[piOrgLuma[(n )<<csx]], uiShift ); |
3048 | 0 | } |
3049 | |
3050 | 0 | piOrg += iStrideOrg; |
3051 | 0 | piCur += iStrideCur; |
3052 | 0 | piOrgLuma += iStrideOrgLuma<<cShiftY; |
3053 | 0 | } |
3054 | |
3055 | 0 | return uiSum; |
3056 | 0 | } |
3057 | | |
3058 | 0 | return 0; |
3059 | 0 | }
Unexecuted instantiation: RdCost_sse41.cpp:unsigned long vvenc::lumaWeightedSSE_SIMD<(vvenc::x86_simd::X86_VEXT)1, 0>(vvenc::DistParam const&, vvencChromaFormat, unsigned int const*)
Unexecuted instantiation: RdCost_sse41.cpp:unsigned long vvenc::lumaWeightedSSE_SIMD<(vvenc::x86_simd::X86_VEXT)1, 1>(vvenc::DistParam const&, vvencChromaFormat, unsigned int const*)
Unexecuted instantiation: RdCost_avx2.cpp:unsigned long vvenc::lumaWeightedSSE_SIMD<(vvenc::x86_simd::X86_VEXT)4, 0>(vvenc::DistParam const&, vvencChromaFormat, unsigned int const*)
Unexecuted instantiation: RdCost_avx2.cpp:unsigned long vvenc::lumaWeightedSSE_SIMD<(vvenc::x86_simd::X86_VEXT)4, 1>(vvenc::DistParam const&, vvencChromaFormat, unsigned int const*)
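SSE offers no 16x16 -> 32-bit widening multiply, so the squares above are assembled from _mm_mullo_epi16 / _mm_mulhi_epi16 pairs and re-interleaved with unpacklo/unpackhi_epi16; this is valid because the square of a 16-bit value always fits the signed 32-bit product whose two halves those instructions return. A minimal sketch of the widening step for the low four lanes (hypothetical helper):

#include <immintrin.h>

// Widen the squares of the low four 16-bit lanes to 32 bits using the
// mullo/mulhi + unpack pattern from the functions above.
static __m128i squareLo4_epi32( __m128i d )
{
  __m128i lo = _mm_mullo_epi16( d, d ); // low 16 bits of each d*d
  __m128i hi = _mm_mulhi_epi16( d, d ); // high 16 bits of each d*d
  return _mm_unpacklo_epi16( lo, hi );  // four full 32-bit squares
}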
3060 | | |
3061 | | template<X86_VEXT vext> |
3062 | | static Distortion fixWeightedSSE_SIMD( const DistParam& rcDtParam, uint32_t fixedPTweight ) |
3063 | 0 | { |
3064 | 0 | int iRows = rcDtParam.org.height; |
3065 | 0 | const Pel* piOrg = rcDtParam.org.buf; |
3066 | 0 | const Pel* piCur = rcDtParam.cur.buf; |
3067 | 0 | const int iCols = rcDtParam.org.width; |
3068 | 0 | const int iStrideCur = rcDtParam.cur.stride; |
3069 | 0 | const int iStrideOrg = rcDtParam.org.stride; |
3070 | |
3071 | 0 | Distortion uiSum = 0; |
3072 | 0 | const uint32_t uiShift = 16 + ( DISTORTION_PRECISION_ADJUSTMENT( rcDtParam.bitDepth ) << 1 ); |
3073 | 0 | const __m128i vShift = _mm_cvtsi32_si128(uiShift); |
3074 | |
3075 | 0 | if( ( iCols & 3 ) == 0 ) |
3076 | 0 | { |
3077 | 0 | const __m128i xfxdw = _mm_set1_epi32 ( fixedPTweight ); |
3078 | 0 | const __m128i xoffs = _mm_set1_epi64x( 1 << 15 ); |
3079 | 0 | __m128i xsum = _mm_setzero_si128(); |
3080 | |
3081 | 0 | for( ; iRows != 0; iRows-- ) |
3082 | 0 | { |
3083 | 0 | for( int n = 0; n < iCols; n += 4 ) |
3084 | 0 | { |
3085 | 0 | __m128i xorg = _vv_loadl_epi64( ( const __m128i* ) &piOrg[n] ); |
3086 | 0 | __m128i xcur = _vv_loadl_epi64( ( const __m128i* ) &piCur[n] ); |
3087 | |
3088 | 0 | xcur = _mm_sub_epi16 ( xorg, xcur ); |
3089 | |
3090 | 0 | const __m128i |
3091 | 0 | xmlo = _mm_mullo_epi16 ( xcur, xcur ), |
3092 | 0 | xmhi = _mm_mulhi_epi16 ( xcur, xcur ); |
3093 | |
3094 | 0 | __m128i |
3095 | 0 | xmul = _mm_unpacklo_epi16( xmlo, xmhi ); |
3096 | 0 | __m128i |
3097 | 0 | xtmp = _mm_mul_epi32 ( xmul, xfxdw ); |
3098 | 0 | xtmp = _mm_add_epi64 ( xtmp, xoffs ); |
3099 | 0 | xtmp = _mm_srl_epi64 ( xtmp, vShift ); |
3100 | 0 | xsum = _mm_add_epi64 ( xsum, xtmp ); |
3101 | |
3102 | 0 | xmul = _mm_shuffle_epi32 ( xmul, 1 + 0 + 48 + 128 ); |
3103 | 0 | xtmp = _mm_mul_epi32 ( xmul, xfxdw ); |
3104 | 0 | xtmp = _mm_add_epi64 ( xtmp, xoffs ); |
3105 | 0 | xtmp = _mm_srl_epi64 ( xtmp, vShift ); |
3106 | 0 | xsum = _mm_add_epi64 ( xsum, xtmp ); |
3107 | 0 | } |
3108 | 0 | piOrg += iStrideOrg; |
3109 | 0 | piCur += iStrideCur; |
3110 | 0 | } |
3111 | |
3112 | 0 | uiSum += _mm_extract_epi64( xsum, 0 ); |
3113 | 0 | uiSum += _mm_extract_epi64( xsum, 1 ); |
3114 | |
3115 | 0 | return uiSum; |
3116 | 0 | } |
3117 | 0 | else if( iCols == 2 ) |
3118 | 0 | { |
3119 | 0 | for( ; iRows != 0; iRows-- ) |
3120 | 0 | { |
3121 | 0 | for( int n = 0; n < iCols; n += 2 ) |
3122 | 0 | { |
3123 | 0 | uiSum += getWeightedMSE_SIMD( piOrg[n ], piCur[n ], fixedPTweight, uiShift ); |
3124 | 0 | uiSum += getWeightedMSE_SIMD( piOrg[n + 1], piCur[n + 1], fixedPTweight, uiShift ); |
3125 | 0 | } |
3126 | 0 | piOrg += iStrideOrg; |
3127 | 0 | piCur += iStrideCur; |
3128 | 0 | } |
3129 | |
3130 | 0 | return uiSum; |
3131 | 0 | } |
3132 | 0 | else |
3133 | 0 | { |
3134 | 0 | for( ; iRows != 0; iRows-- ) |
3135 | 0 | { |
3136 | 0 | for( int n = 0; n < iCols; n++ ) |
3137 | 0 | { |
3138 | 0 | uiSum += getWeightedMSE_SIMD( piOrg[n], piCur[n], fixedPTweight, uiShift ); |
3139 | 0 | } |
3140 | 0 | piOrg += iStrideOrg; |
3141 | 0 | piCur += iStrideCur; |
3142 | 0 | } |
3143 | |
3144 | 0 | return uiSum; |
3145 | 0 | } |
3146 | | |
3147 | 0 | return 0; |
3148 | 0 | }
Unexecuted instantiation: RdCost_sse41.cpp:unsigned long vvenc::fixWeightedSSE_SIMD<(vvenc::x86_simd::X86_VEXT)1>(vvenc::DistParam const&, unsigned int)
Unexecuted instantiation: RdCost_avx2.cpp:unsigned long vvenc::fixWeightedSSE_SIMD<(vvenc::x86_simd::X86_VEXT)4>(vvenc::DistParam const&, unsigned int)
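_mm_mul_epi32 multiplies only the even 32-bit lanes (0 and 2) into two 64-bit products, so the code above processes every vector twice: once directly and once after _mm_shuffle_epi32( v, 1 + 0 + 48 + 128 ). That constant is 0xB1 = _MM_SHUFFLE(2, 3, 0, 1), which swaps each even/odd lane pair and thereby moves lanes 1 and 3 into the even slots. A sketch of the resulting full four-lane 32x32 -> 64-bit multiply-accumulate (hypothetical helper):

#include <immintrin.h>

// Accumulate all four 32-bit lane products of a and b into two 64-bit
// sums: even lanes first, odd lanes after a 0xB1 pair swap.
static __m128i mac4_epi32to64( __m128i acc, __m128i a, __m128i b )
{
  acc = _mm_add_epi64( acc, _mm_mul_epi32( a, b ) );  // lanes 0 and 2
  a   = _mm_shuffle_epi32( a, 0xB1 );                 // swap lane pairs
  b   = _mm_shuffle_epi32( b, 0xB1 );
  return _mm_add_epi64( acc, _mm_mul_epi32( a, b ) ); // lanes 1 and 3
}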
3149 | | |
3150 | | |
3151 | | template <X86_VEXT vext, bool isCalCentrePos> |
3152 | 0 | void xGetSADX5_8xN_SIMDImp(const DistParam& rcDtParam, Distortion* cost) { |
3153 | 0 | int i; |
3154 | 0 | const Pel* piOrg = rcDtParam.org.buf; |
3155 | 0 | const Pel* piCur = rcDtParam.cur.buf - 4; |
3156 | 0 | int height = rcDtParam.org.height; |
3157 | 0 | int iSubShift = rcDtParam.subShift; |
3158 | 0 | int iSubStep = (1 << iSubShift); |
3159 | 0 | ptrdiff_t iStrideCur = rcDtParam.cur.stride * iSubStep; |
3160 | 0 | ptrdiff_t iStrideOrg = rcDtParam.org.stride * iSubStep; |
3161 | |
3162 | 0 | __m128i sum0 = _mm_setzero_si128(); |
3163 | 0 | __m128i sum1 = _mm_setzero_si128(); |
3164 | 0 | __m128i sum2 = _mm_setzero_si128(); |
3165 | 0 | __m128i sum3 = _mm_setzero_si128(); |
3166 | 0 | __m128i sum4 = _mm_setzero_si128(); |
3167 | |
3168 | 0 | __m128i vone = _mm_set1_epi16(1); |
3169 | 0 | for (i = 0; i < height; i += iSubStep) { |
3170 | 0 | __m128i s0 = _mm_loadu_si128((__m128i*)piOrg); |
3171 | 0 | __m128i s1 = _mm_loadu_si128((__m128i*)piCur); |
3172 | 0 | __m128i s2 = _vv_loadl_epi64((__m128i*)(piOrg + 8)); |
3173 | 0 | __m128i s3 = _vv_loadl_epi64((__m128i*)(piCur + 8)); |
3174 | |
3175 | 0 | __m128i org0, org1, org2, org3, org4; |
3176 | 0 | org0 = s0; |
3177 | 0 | org1 = _mm_alignr_epi8(s2, s0, 2); |
3178 | 0 | if (isCalCentrePos) org2 = _mm_alignr_epi8(s2, s0, 4); |
3179 | 0 | org3 = _mm_alignr_epi8(s2, s0, 6); |
3180 | 0 | org4 = _mm_alignr_epi8(s2, s0, 8); |
3181 | |
3182 | 0 | __m128i cur0, cur1, cur2, cur3, cur4; |
3183 | 0 | cur4 = s1; |
3184 | 0 | cur0 = _mm_alignr_epi8(s3, s1, 8); |
3185 | 0 | cur1 = _mm_alignr_epi8(s3, s1, 6); |
3186 | 0 | if (isCalCentrePos) cur2 = _mm_alignr_epi8(s3, s1, 4); |
3187 | 0 | cur3 = _mm_alignr_epi8(s3, s1, 2); |
3188 | |
3189 | 0 | __m128i diff0, diff1, diff2, diff3, diff4; |
3190 | 0 | diff0 = _mm_sub_epi16(org0, cur0); |
3191 | 0 | diff1 = _mm_sub_epi16(org1, cur1); |
3192 | 0 | if (isCalCentrePos) diff2 = _mm_sub_epi16(org2, cur2); |
3193 | 0 | diff3 = _mm_sub_epi16(org3, cur3); |
3194 | 0 | diff4 = _mm_sub_epi16(org4, cur4); |
3195 | |
3196 | 0 | diff0 = _mm_abs_epi16(diff0); |
3197 | 0 | diff1 = _mm_abs_epi16(diff1); |
3198 | 0 | if (isCalCentrePos) diff2 = _mm_abs_epi16(diff2); |
3199 | 0 | diff3 = _mm_abs_epi16(diff3); |
3200 | 0 | diff4 = _mm_abs_epi16(diff4); |
3201 | |
3202 | 0 | sum0 = _mm_add_epi16(sum0, diff0); |
3203 | 0 | sum1 = _mm_add_epi16(sum1, diff1); |
3204 | 0 | if (isCalCentrePos) sum2 = _mm_add_epi32(sum2, diff2); |
3205 | 0 | sum3 = _mm_add_epi16(sum3, diff3); |
3206 | 0 | sum4 = _mm_add_epi16(sum4, diff4); |
3207 | |
3208 | 0 | piOrg += iStrideOrg; |
3209 | 0 | piCur += iStrideCur; |
3210 | 0 | } |
3211 | |
3212 | 0 | sum0 = _mm_madd_epi16( sum0, vone ); |
3213 | 0 | sum1 = _mm_madd_epi16( sum1, vone ); |
3214 | 0 | if( isCalCentrePos ) sum2 = _mm_madd_epi16( sum2, vone ); |
3215 | 0 | sum3 = _mm_madd_epi16( sum3, vone ); |
3216 | 0 | sum4 = _mm_madd_epi16( sum4, vone ); |
3217 | |
3218 | 0 | sum0 = _mm_hadd_epi32(sum0, sum1); |
3219 | 0 | sum3 = _mm_hadd_epi32(sum3, sum4); |
3220 | 0 | if (isCalCentrePos) sum2 = _mm_hadd_epi32(sum2, sum2); |
3221 | |
3222 | 0 | sum0 = _mm_hadd_epi32(sum0, sum3); |
3223 | 0 | if (isCalCentrePos) sum2 = _mm_hadd_epi32(sum2, sum2); |
3224 | |
3225 | 0 | const __m128i vSubShift = _mm_cvtsi32_si128(iSubShift); |
3226 | 0 | sum0 = _mm_sll_epi32(sum0, vSubShift); |
3227 | 0 | if (isCalCentrePos) sum2 = _mm_sll_epi32(sum2, vSubShift); |
3228 | |
3229 | 0 | sum0 = _mm_srl_epi32(sum0, _mm_cvtsi32_si128(1 + (DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth)))); |
3230 | 0 | if (isCalCentrePos) sum2 = _mm_srl_epi32(sum2, _mm_cvtsi32_si128(1 + (DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth)))); |
3231 | |
3232 | 0 | _mm_storeu_si128( ( __m128i* ) &cost[0], _mm_unpacklo_epi32( sum0, _mm_setzero_si128() ) ); |
3233 | 0 | if (isCalCentrePos) cost[2] = (_mm_cvtsi128_si32(sum2)); |
3234 | 0 | _mm_storeu_si128( ( __m128i* ) &cost[3], _mm_unpackhi_epi32( sum0, _mm_setzero_si128() ) ); |
3235 | 0 | }
Unexecuted instantiation: void vvenc::xGetSADX5_8xN_SIMDImp<(vvenc::x86_simd::X86_VEXT)1, true>(vvenc::DistParam const&, unsigned long*)
Unexecuted instantiation: void vvenc::xGetSADX5_8xN_SIMDImp<(vvenc::x86_simd::X86_VEXT)1, false>(vvenc::DistParam const&, unsigned long*)
Unexecuted instantiation: void vvenc::xGetSADX5_8xN_SIMDImp<(vvenc::x86_simd::X86_VEXT)4, true>(vvenc::DistParam const&, unsigned long*)
Unexecuted instantiation: void vvenc::xGetSADX5_8xN_SIMDImp<(vvenc::x86_simd::X86_VEXT)4, false>(vvenc::DistParam const&, unsigned long*)
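piCur enters the function pre-shifted by -4 samples, and _mm_alignr_epi8( hi, lo, 2*k ) then materialises a row window shifted by k samples, so the org0..org4 windows paired with the reversed cur4..cur0 windows change the relative org/cur displacement by two samples per candidate: the five costs correspond to five horizontal positions spaced two samples apart, with the centre (cost[2]) computed only when isCalCentrePos is set. A sketch of the shifted-window construction (hypothetical helper; alignr needs an immediate byte count, hence the switch):

#include <immintrin.h>
#include <cstdint>

// Build the window p[k .. k+7] from two loads of a 16-bit row, where
// lo = p[0..7] and hi = p[8..15]; alignr by 2*k bytes shifts k samples.
static __m128i shiftedWindow( const int16_t* p, int k )
{
  __m128i lo = _mm_loadu_si128( (const __m128i*)p );
  __m128i hi = _mm_loadu_si128( (const __m128i*)( p + 8 ) );
  switch( k )
  {
    case 0:  return lo;
    case 1:  return _mm_alignr_epi8( hi, lo, 2 );
    case 2:  return _mm_alignr_epi8( hi, lo, 4 );
    case 3:  return _mm_alignr_epi8( hi, lo, 6 );
    default: return _mm_alignr_epi8( hi, lo, 8 );
  }
}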
3236 | | |
3237 | | template <X86_VEXT vext> |
3238 | 0 | void RdCost::xGetSADX5_8xN_SIMD(const DistParam& rcDtParam, Distortion* cost, bool isCalCentrePos) { |
3239 | 0 | if( rcDtParam.bitDepth > 10 ){ |
3240 | 0 | RdCost::xGetSAD8X5( rcDtParam, cost, isCalCentrePos ); |
3241 | 0 | return; |
3242 | 0 | } |
3243 | | |
3244 | 0 | if (isCalCentrePos) |
3245 | 0 | xGetSADX5_8xN_SIMDImp<vext, true>(rcDtParam, cost); |
3246 | 0 | else |
3247 | 0 | xGetSADX5_8xN_SIMDImp<vext, false>(rcDtParam, cost); |
3248 | 0 | }
Unexecuted instantiation: void vvenc::RdCost::xGetSADX5_8xN_SIMD<(vvenc::x86_simd::X86_VEXT)1>(vvenc::DistParam const&, unsigned long*, bool)
Unexecuted instantiation: void vvenc::RdCost::xGetSADX5_8xN_SIMD<(vvenc::x86_simd::X86_VEXT)4>(vvenc::DistParam const&, unsigned long*, bool)
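The bitDepth > 10 guard protects the 16-bit accumulators in the implementation above: each lane gains one absolute difference per processed row, and with 10-bit samples even 64 rows stay below the 16-bit limit (64 * 1023 = 65,472 < 65,535), while at 12 bit seventeen rows already overflow (17 * 4095 = 69,615); this is presumably why the scalar RdCost::xGetSAD8X5 handles the high-bit-depth case.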
3249 | | |
3250 | | template <X86_VEXT vext, bool isCalCentrePos> |
3251 | 0 | void xGetSADX5_16xN_SIMDImp_X86(const DistParam& rcDtParam, Distortion* cost) { |
3252 | 0 | int i, j; |
3253 | 0 | const Pel* piOrg = rcDtParam.org.buf; |
3254 | 0 | const Pel* piCur = rcDtParam.cur.buf - 4; |
3255 | 0 | int height = rcDtParam.org.height; |
3256 | 0 | int iSubShift = rcDtParam.subShift; |
3257 | 0 | int iSubStep = (1 << iSubShift); |
3258 | 0 | ptrdiff_t iStrideCur = rcDtParam.cur.stride * iSubStep; |
3259 | 0 | ptrdiff_t iStrideOrg = rcDtParam.org.stride * iSubStep; |
3260 | |
3261 | | # ifdef USE_AVX2 |
3262 | 0 | if (vext >= AVX2) { |
3263 | | // the sum of 8 unsigned 10-bit ints (0-1023) needs at most 3 + 10 bits, i.e. it fits into 16 bits
3264 | | |
3265 | | __m256i sum0 = _mm256_setzero_si256(); |
3266 | | __m256i sum1 = _mm256_setzero_si256(); |
3267 | | __m256i sum2 = _mm256_setzero_si256(); |
3268 | | __m256i sum3 = _mm256_setzero_si256(); |
3269 | | __m256i sum4 = _mm256_setzero_si256(); |
3270 | | |
3271 | | __m256i vone = _mm256_set1_epi16(1); |
3272 | | |
3273 | 0 | for (int i = 0; i < ( height >> 3 ); i++) { |
3274 | 0 | __m256i s0 = _mm256_loadu_si256((__m256i*)piOrg); |
3275 | 0 | __m256i s1 = _mm256_loadu_si256((__m256i*)piCur); |
3276 | 0 | __m256i s2 = _mm256_castsi128_si256(_vv_loadl_epi64((__m128i*)(piOrg + 16))); |
3277 | 0 | __m256i s3 = _mm256_castsi128_si256(_vv_loadl_epi64((__m128i*)(piCur + 16))); |
3278 | 0 | s2 = _mm256_permute2x128_si256(s0, s2, 0x21); |
3279 | 0 | s3 = _mm256_permute2x128_si256(s1, s3, 0x21); |
3280 | |
3281 | 0 | piOrg += iStrideOrg; |
3282 | 0 | piCur += iStrideCur; |
3283 | |
3284 | 0 | __m256i org0, org1, org2, org3, org4; |
3285 | 0 | org0 = s0; |
3286 | 0 | org1 = _mm256_alignr_epi8(s2, s0, 2); |
3287 | 0 | if (isCalCentrePos) org2 = _mm256_alignr_epi8(s2, s0, 4); |
3288 | 0 | org3 = _mm256_alignr_epi8(s2, s0, 6); |
3289 | 0 | org4 = _mm256_alignr_epi8(s2, s0, 8); |
3290 | |
3291 | 0 | __m256i cur0, cur1, cur2, cur3, cur4; |
3292 | 0 | cur4 = s1; |
3293 | 0 | cur0 = _mm256_alignr_epi8(s3, s1, 8); |
3294 | 0 | cur1 = _mm256_alignr_epi8(s3, s1, 6); |
3295 | 0 | if (isCalCentrePos) cur2 = _mm256_alignr_epi8(s3, s1, 4); |
3296 | 0 | cur3 = _mm256_alignr_epi8(s3, s1, 2); |
3297 | |
3298 | 0 | __m256i diff0, diff1, diff2, diff3, diff4; |
3299 | 0 | diff0 = _mm256_sub_epi16(org0, cur0); |
3300 | 0 | diff1 = _mm256_sub_epi16(org1, cur1); |
3301 | 0 | if (isCalCentrePos) diff2 = _mm256_sub_epi16(org2, cur2); |
3302 | 0 | diff3 = _mm256_sub_epi16(org3, cur3); |
3303 | 0 | diff4 = _mm256_sub_epi16(org4, cur4); |
3304 | |
3305 | 0 | diff0 = _mm256_abs_epi16( diff0 ); |
3306 | 0 | diff1 = _mm256_abs_epi16( diff1 ); |
3307 | 0 | if( isCalCentrePos ) diff2 = _mm256_abs_epi16( diff2 ); |
3308 | 0 | diff3 = _mm256_abs_epi16( diff3 ); |
3309 | 0 | diff4 = _mm256_abs_epi16( diff4 ); |
3310 | |
3311 | 0 | sum0 = _mm256_add_epi16( diff0, sum0 ); |
3312 | 0 | sum1 = _mm256_add_epi16( diff1, sum1 ); |
3313 | 0 | if( isCalCentrePos ) sum2 = _mm256_add_epi16( diff2, sum2 ); |
3314 | 0 | sum3 = _mm256_add_epi16( diff3, sum3 ); |
3315 | 0 | sum4 = _mm256_add_epi16( diff4, sum4 ); |
3316 | |
3317 | 0 | s0 = _mm256_loadu_si256((__m256i*)piOrg); |
3318 | 0 | s1 = _mm256_loadu_si256((__m256i*)piCur); |
3319 | 0 | s2 = _mm256_castsi128_si256(_vv_loadl_epi64((__m128i*)(piOrg + 16))); |
3320 | 0 | s3 = _mm256_castsi128_si256(_vv_loadl_epi64((__m128i*)(piCur + 16))); |
3321 | 0 | s2 = _mm256_permute2x128_si256(s0, s2, 0x21); |
3322 | 0 | s3 = _mm256_permute2x128_si256(s1, s3, 0x21); |
3323 | |
3324 | 0 | piOrg += iStrideOrg; |
3325 | 0 | piCur += iStrideCur; |
3326 | |
3327 | 0 | org0 = s0; |
3328 | 0 | org1 = _mm256_alignr_epi8(s2, s0, 2); |
3329 | 0 | if (isCalCentrePos) org2 = _mm256_alignr_epi8(s2, s0, 4); |
3330 | 0 | org3 = _mm256_alignr_epi8(s2, s0, 6); |
3331 | 0 | org4 = _mm256_alignr_epi8(s2, s0, 8); |
3332 | |
3333 | 0 | cur4 = s1; |
3334 | 0 | cur0 = _mm256_alignr_epi8(s3, s1, 8); |
3335 | 0 | cur1 = _mm256_alignr_epi8(s3, s1, 6); |
3336 | 0 | if (isCalCentrePos) cur2 = _mm256_alignr_epi8(s3, s1, 4); |
3337 | 0 | cur3 = _mm256_alignr_epi8(s3, s1, 2); |
3338 | |
3339 | 0 | diff0 = _mm256_sub_epi16(org0, cur0); |
3340 | 0 | diff1 = _mm256_sub_epi16(org1, cur1); |
3341 | 0 | if (isCalCentrePos) diff2 = _mm256_sub_epi16(org2, cur2); |
3342 | 0 | diff3 = _mm256_sub_epi16(org3, cur3); |
3343 | 0 | diff4 = _mm256_sub_epi16(org4, cur4); |
3344 | |
3345 | 0 | diff0 = _mm256_abs_epi16(diff0); |
3346 | 0 | diff1 = _mm256_abs_epi16(diff1); |
3347 | 0 | if (isCalCentrePos) diff2 = _mm256_abs_epi16(diff2); |
3348 | 0 | diff3 = _mm256_abs_epi16(diff3); |
3349 | 0 | diff4 = _mm256_abs_epi16(diff4); |
3350 | |
3351 | 0 | sum0 = _mm256_add_epi16(diff0, sum0); |
3352 | 0 | sum1 = _mm256_add_epi16(diff1, sum1); |
3353 | 0 | if (isCalCentrePos) sum2 = _mm256_add_epi16(diff2, sum2); |
3354 | 0 | sum3 = _mm256_add_epi16(diff3, sum3); |
3355 | 0 | sum4 = _mm256_add_epi16(diff4, sum4); |
3356 | |
3357 | 0 | s0 = _mm256_loadu_si256((__m256i*)piOrg); |
3358 | 0 | s1 = _mm256_loadu_si256((__m256i*)piCur); |
3359 | 0 | s2 = _mm256_castsi128_si256(_vv_loadl_epi64((__m128i*)(piOrg + 16))); |
3360 | 0 | s3 = _mm256_castsi128_si256(_vv_loadl_epi64((__m128i*)(piCur + 16))); |
3361 | 0 | s2 = _mm256_permute2x128_si256(s0, s2, 0x21); |
3362 | 0 | s3 = _mm256_permute2x128_si256(s1, s3, 0x21); |
3363 | |
3364 | 0 | piOrg += iStrideOrg; |
3365 | 0 | piCur += iStrideCur; |
3366 | |
3367 | 0 | org0 = s0; |
3368 | 0 | org1 = _mm256_alignr_epi8(s2, s0, 2); |
3369 | 0 | if (isCalCentrePos) org2 = _mm256_alignr_epi8(s2, s0, 4); |
3370 | 0 | org3 = _mm256_alignr_epi8(s2, s0, 6); |
3371 | 0 | org4 = _mm256_alignr_epi8(s2, s0, 8); |
3372 | |
3373 | 0 | cur4 = s1; |
3374 | 0 | cur0 = _mm256_alignr_epi8(s3, s1, 8); |
3375 | 0 | cur1 = _mm256_alignr_epi8(s3, s1, 6); |
3376 | 0 | if (isCalCentrePos) cur2 = _mm256_alignr_epi8(s3, s1, 4); |
3377 | 0 | cur3 = _mm256_alignr_epi8(s3, s1, 2); |
3378 | |
3379 | 0 | diff0 = _mm256_sub_epi16(org0, cur0); |
3380 | 0 | diff1 = _mm256_sub_epi16(org1, cur1); |
3381 | 0 | if (isCalCentrePos) diff2 = _mm256_sub_epi16(org2, cur2); |
3382 | 0 | diff3 = _mm256_sub_epi16(org3, cur3); |
3383 | 0 | diff4 = _mm256_sub_epi16(org4, cur4); |
3384 | |
3385 | 0 | diff0 = _mm256_abs_epi16(diff0); |
3386 | 0 | diff1 = _mm256_abs_epi16(diff1); |
3387 | 0 | if (isCalCentrePos) diff2 = _mm256_abs_epi16(diff2); |
3388 | 0 | diff3 = _mm256_abs_epi16(diff3); |
3389 | 0 | diff4 = _mm256_abs_epi16(diff4); |
3390 | |
3391 | 0 | sum0 = _mm256_add_epi16( diff0, sum0 ); |
3392 | 0 | sum1 = _mm256_add_epi16( diff1, sum1 ); |
3393 | 0 | if( isCalCentrePos ) sum2 = _mm256_add_epi16( diff2, sum2 ); |
3394 | 0 | sum3 = _mm256_add_epi16( diff3, sum3 ); |
3395 | 0 | sum4 = _mm256_add_epi16( diff4, sum4 ); |
3396 | |
3397 | 0 | s0 = _mm256_loadu_si256((__m256i*)piOrg); |
3398 | 0 | s1 = _mm256_loadu_si256((__m256i*)piCur); |
3399 | 0 | s2 = _mm256_castsi128_si256(_vv_loadl_epi64((__m128i*)(piOrg + 16))); |
3400 | 0 | s3 = _mm256_castsi128_si256(_vv_loadl_epi64((__m128i*)(piCur + 16))); |
3401 | 0 | s2 = _mm256_permute2x128_si256(s0, s2, 0x21); |
3402 | 0 | s3 = _mm256_permute2x128_si256(s1, s3, 0x21); |
3403 | |
3404 | 0 | piOrg += iStrideOrg; |
3405 | 0 | piCur += iStrideCur; |
3406 | |
3407 | 0 | org0 = s0; |
3408 | 0 | org1 = _mm256_alignr_epi8(s2, s0, 2); |
3409 | 0 | if (isCalCentrePos) org2 = _mm256_alignr_epi8(s2, s0, 4); |
3410 | 0 | org3 = _mm256_alignr_epi8(s2, s0, 6); |
3411 | 0 | org4 = _mm256_alignr_epi8(s2, s0, 8); |
3412 | |
3413 | 0 | cur4 = s1; |
3414 | 0 | cur0 = _mm256_alignr_epi8(s3, s1, 8); |
3415 | 0 | cur1 = _mm256_alignr_epi8(s3, s1, 6); |
3416 | 0 | if (isCalCentrePos) cur2 = _mm256_alignr_epi8(s3, s1, 4); |
3417 | 0 | cur3 = _mm256_alignr_epi8(s3, s1, 2); |
3418 | |
3419 | 0 | diff0 = _mm256_sub_epi16(org0, cur0); |
3420 | 0 | diff1 = _mm256_sub_epi16(org1, cur1); |
3421 | 0 | if (isCalCentrePos) diff2 = _mm256_sub_epi16(org2, cur2); |
3422 | 0 | diff3 = _mm256_sub_epi16(org3, cur3); |
3423 | 0 | diff4 = _mm256_sub_epi16(org4, cur4); |
3424 | |
3425 | 0 | diff0 = _mm256_abs_epi16(diff0); |
3426 | 0 | diff1 = _mm256_abs_epi16(diff1); |
3427 | 0 | if (isCalCentrePos) diff2 = _mm256_abs_epi16(diff2); |
3428 | 0 | diff3 = _mm256_abs_epi16(diff3); |
3429 | 0 | diff4 = _mm256_abs_epi16(diff4); |
3430 | |
3431 | 0 | sum0 = _mm256_add_epi16(diff0, sum0); |
3432 | 0 | sum1 = _mm256_add_epi16(diff1, sum1); |
3433 | 0 | if (isCalCentrePos) sum2 = _mm256_add_epi16(diff2, sum2); |
3434 | 0 | sum3 = _mm256_add_epi16(diff3, sum3); |
3435 | 0 | sum4 = _mm256_add_epi16(diff4, sum4); |
3436 | 0 | } |
3437 | | |
3438 | | sum0 = _mm256_madd_epi16( sum0, vone ); |
3439 | | sum1 = _mm256_madd_epi16( sum1, vone ); |
3440 | 0 | if( isCalCentrePos ) sum2 = _mm256_madd_epi16( sum2, vone ); |
3441 | | sum3 = _mm256_madd_epi16( sum3, vone ); |
3442 | | sum4 = _mm256_madd_epi16( sum4, vone ); |
3443 | | |
3444 | | sum0 = _mm256_hadd_epi32(sum0, sum1); |
3445 | | sum3 = _mm256_hadd_epi32(sum3, sum4); |
3446 | 0 | if (isCalCentrePos) sum2 = _mm256_hadd_epi32(sum2, sum2); |
3447 | | |
3448 | | sum0 = _mm256_hadd_epi32(sum0, sum3); |
3449 | 0 | if (isCalCentrePos) sum2 = _mm256_hadd_epi32(sum2, sum2); |
3450 | | |
3451 | | __m128i sum0134 = _mm_add_epi32(_mm256_castsi256_si128(sum0), _mm256_extracti128_si256(sum0, 1)); |
3452 | | |
3453 | | sum0134 = _mm_sll_epi32(sum0134, _mm_cvtsi32_si128(iSubShift)); |
3454 | | |
3455 | 0 | sum0134 = _mm_srl_epi32(sum0134, _mm_cvtsi32_si128(1 + (DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth)))); |
3456 | |
3457 | 0 | _mm_storeu_si128( ( __m128i* ) &cost[0], _mm_unpacklo_epi32( sum0134, _mm_setzero_si128() ) ); |
3458 | 0 | if (isCalCentrePos) { |
3459 | 0 | int tmp = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum2)) + _mm256_extract_epi32(sum2, 4); |
3460 | 0 | tmp <<= iSubShift; |
3461 | 0 | tmp >>= (1 + (DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth))); |
3462 | 0 | cost[2] = tmp; |
3463 | 0 | } |
3464 | 0 | _mm_storeu_si128( ( __m128i* ) &cost[3], _mm_unpackhi_epi32( sum0134, _mm_setzero_si128() ) ); |
3465 | 0 | } |
3466 | 0 | else |
3467 | 0 | # endif |
3468 | 0 | { |
3469 |  |     // sum of 16 unsigned 10-bit ints (0-1023) can maximally be 4 + 10 = 14 bits (16 * 1023 = 16368), i.e. fits into 16 bit
3470 | |
3471 | 0 | __m128i sum0 = _mm_setzero_si128(); |
3472 | 0 | __m128i sum1 = _mm_setzero_si128(); |
3473 | 0 | __m128i sum2 = _mm_setzero_si128(); |
3474 | 0 | __m128i sum3 = _mm_setzero_si128(); |
3475 | 0 | __m128i sum4 = _mm_setzero_si128(); |
3476 | |
3477 | 0 | __m128i vone = _mm_set1_epi16(1); |
3478 | 0 | for (i = 0; i < height; i += iSubStep) { |
3479 | 0 | for (j = 0; j < 16; j += 8) { |
3480 | 0 | __m128i s0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(piOrg + j + 0)); |
3481 | 0 | __m128i s1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(piCur + j + 0)); |
3482 | 0 | __m128i s2 = _vv_loadl_epi64(reinterpret_cast<const __m128i*>(piOrg + j + 8)); |
3483 | 0 | __m128i s3 = _vv_loadl_epi64(reinterpret_cast<const __m128i*>(piCur + j + 8)); |
3484 | |
3485 | 0 | __m128i org0, org1, org2, org3, org4; |
3486 | 0 | org0 = s0; |
3487 | 0 | org1 = _mm_alignr_epi8(s2, s0, 2); |
3488 | 0 | if (isCalCentrePos) org2 = _mm_alignr_epi8(s2, s0, 4); |
3489 | 0 | org3 = _mm_alignr_epi8(s2, s0, 6); |
3490 | 0 | org4 = _mm_alignr_epi8(s2, s0, 8); |
3491 | |
3492 | 0 | __m128i cur0, cur1, cur2, cur3, cur4; |
3493 | 0 | cur4 = s1; |
3494 | 0 | cur0 = _mm_alignr_epi8(s3, s1, 8); |
3495 | 0 | cur1 = _mm_alignr_epi8(s3, s1, 6); |
3496 | 0 | if (isCalCentrePos) cur2 = _mm_alignr_epi8(s3, s1, 4); |
3497 | 0 | cur3 = _mm_alignr_epi8(s3, s1, 2); |
3498 | |
3499 | 0 | __m128i diff0, diff1, diff2, diff3, diff4; |
3500 | 0 | diff0 = _mm_sub_epi16(org0, cur0); |
3501 | 0 | diff1 = _mm_sub_epi16(org1, cur1); |
3502 | 0 | if (isCalCentrePos) diff2 = _mm_sub_epi16(org2, cur2); |
3503 | 0 | diff3 = _mm_sub_epi16(org3, cur3); |
3504 | 0 | diff4 = _mm_sub_epi16(org4, cur4); |
3505 | |
3506 | 0 | diff0 = _mm_abs_epi16(diff0); |
3507 | 0 | diff1 = _mm_abs_epi16(diff1); |
3508 | 0 | if (isCalCentrePos) diff2 = _mm_abs_epi16(diff2); |
3509 | 0 | diff3 = _mm_abs_epi16(diff3); |
3510 | 0 | diff4 = _mm_abs_epi16(diff4); |
3511 | |
3512 | 0 | sum0 = _mm_add_epi16(sum0, diff0); |
3513 | 0 | sum1 = _mm_add_epi16(sum1, diff1); |
3514 | 0 | if (isCalCentrePos) sum2 = _mm_add_epi16(sum2, diff2); |
3515 | 0 | sum3 = _mm_add_epi16(sum3, diff3); |
3516 | 0 | sum4 = _mm_add_epi16(sum4, diff4); |
3517 | 0 | } |
3518 | |
3519 | 0 | piOrg += iStrideOrg; |
3520 | 0 | piCur += iStrideCur; |
3521 | 0 | } |
3522 | |
3523 | 0 | sum0 = _mm_madd_epi16( sum0, vone ); |
3524 | 0 | sum1 = _mm_madd_epi16( sum1, vone ); |
3525 | 0 | if( isCalCentrePos ) sum2 = _mm_madd_epi16( sum2, vone ); |
3526 | 0 | sum3 = _mm_madd_epi16( sum3, vone ); |
3527 | 0 | sum4 = _mm_madd_epi16( sum4, vone ); |
3528 | |
3529 | 0 | sum0 = _mm_hadd_epi32(sum0, sum1); |
3530 | 0 | sum3 = _mm_hadd_epi32(sum3, sum4); |
3531 | 0 | if (isCalCentrePos) sum2 = _mm_hadd_epi32(sum2, sum2); |
3532 | |
3533 | 0 | sum0 = _mm_hadd_epi32(sum0, sum3); |
3534 | 0 | if (isCalCentrePos) sum2 = _mm_hadd_epi32(sum2, sum2); |
3535 | |
3536 | 0 | const __m128i vSubShift = _mm_cvtsi32_si128(iSubShift); |
3537 | 0 | sum0 = _mm_sll_epi32(sum0, vSubShift); |
3538 | 0 | if (isCalCentrePos) sum2 = _mm_sll_epi32(sum2, vSubShift); |
3539 | |
3540 | 0 | sum0 = _mm_srl_epi32(sum0, _mm_cvtsi32_si128(1 + (DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth)))); |
3541 | 0 | if (isCalCentrePos) sum2 = _mm_srl_epi32(sum2, _mm_cvtsi32_si128(1 + (DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth)))); |
3542 | |
3543 | 0 | _mm_storeu_si128( ( __m128i* ) &cost[0], _mm_unpacklo_epi32( sum0, _mm_setzero_si128() ) ); |
3544 | 0 | if (isCalCentrePos) cost[2] = (_mm_cvtsi128_si32(sum2)); |
3545 | 0 | _mm_storeu_si128( ( __m128i* ) &cost[3], _mm_unpackhi_epi32( sum0, _mm_setzero_si128() ) ); |
3546 | 0 | } |
3547 | 0 | } 
Unexecuted instantiation: void vvenc::xGetSADX5_16xN_SIMDImp_X86<(vvenc::x86_simd::X86_VEXT)1, true>(vvenc::DistParam const&, unsigned long*)
Unexecuted instantiation: void vvenc::xGetSADX5_16xN_SIMDImp_X86<(vvenc::x86_simd::X86_VEXT)1, false>(vvenc::DistParam const&, unsigned long*)
Unexecuted instantiation: void vvenc::xGetSADX5_16xN_SIMDImp_X86<(vvenc::x86_simd::X86_VEXT)4, true>(vvenc::DistParam const&, unsigned long*)
Unexecuted instantiation: void vvenc::xGetSADX5_16xN_SIMDImp_X86<(vvenc::x86_simd::X86_VEXT)4, false>(vvenc::DistParam const&, unsigned long*)
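For orientation, here is a scalar model of what the 16xN X5 kernel above computes, reconstructed from the vector code itself: the alignr byte offsets 2, 4, 6, 8 correspond to sample shifts of 1..4, so cost[k] is the SAD between org shifted by k and cur shifted by 4 - k. The function name and the dpa lambda standing in for DISTORTION_PRECISION_ADJUSTMENT are illustrative assumptions, not vvenc API, and the strides are assumed to be pre-scaled by the row sub-sampling step, as in the kernel.

    // Scalar sketch of the X5 SAD: cost[k] accumulates |org[j+k] - cur[j+4-k]|
    // over a 16-wide block; like the kernel, it reads 4 samples past the block,
    // and cost[2] (the centre position) is only produced on request.
    #include <cstdint>
    #include <cstdlib>

    static void sadX5_16xN_scalarModel(const int16_t* org, int strideOrg,
                                       const int16_t* cur, int strideCur,
                                       int height, int subShift, int bitDepth,
                                       bool isCalCentrePos, uint64_t cost[5])
    {
      auto dpa = [](int bd) { return bd - 8; };          // assumed definition
      uint64_t acc[5] = { 0, 0, 0, 0, 0 };
      for (int i = 0; i < height; i += (1 << subShift))  // skip sub-sampled rows
      {
        for (int j = 0; j < 16; j++)
          for (int k = 0; k < 5; k++)
            if (k != 2 || isCalCentrePos)
              acc[k] += std::abs(org[j + k] - cur[j + 4 - k]);
        org += strideOrg;                                // strides assumed pre-scaled
        cur += strideCur;
      }
      for (int k = 0; k < 5; k++)                        // compensate sub-sampling,
        cost[k] = (acc[k] << subShift) >> (1 + dpa(bitDepth)); // normalise precision
    }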
3548 | | |
3549 | | template <X86_VEXT vext> |
3550 | 0 | void RdCost::xGetSADX5_16xN_SIMD_X86(const DistParam& rcDtParam, Distortion* cost, bool isCalCentrePos) { |
3551 | 0 | if( rcDtParam.bitDepth > 10 ){ |
3552 | 0 | RdCost::xGetSAD16X5( rcDtParam, cost, isCalCentrePos ); |
3553 | 0 | return; |
3554 | 0 | } |
3555 | | |
3556 | 0 | if (isCalCentrePos) |
3557 | 0 | xGetSADX5_16xN_SIMDImp_X86<vext, true>(rcDtParam, cost); |
3558 | 0 | else |
3559 | 0 | xGetSADX5_16xN_SIMDImp_X86<vext, false>(rcDtParam, cost); |
3560 | 0 | } 
Unexecuted instantiation: void vvenc::RdCost::xGetSADX5_16xN_SIMD_X86<(vvenc::x86_simd::X86_VEXT)1>(vvenc::DistParam const&, unsigned long*, bool)
Unexecuted instantiation: void vvenc::RdCost::xGetSADX5_16xN_SIMD_X86<(vvenc::x86_simd::X86_VEXT)4>(vvenc::DistParam const&, unsigned long*, bool)
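The bitDepth > 10 guard above exists because the vector kernels accumulate absolute differences in 16-bit lanes before the widening madd (see the comment at source line 3469). A quick headroom check makes the cutoff plausible; treating 16 as the number of rows a lane absorbs before widening is an assumption for this sketch, not a figure taken from the code.

    // Worst-case 16-bit lane accumulation per bit depth (illustrative numbers).
    #include <cstdio>

    int main()
    {
      const long rows = 16;                  // assumed rows a lane absorbs before madd widens
      const int  depths[] = { 10, 12 };
      for (int bd : depths)
      {
        const long maxDiff = (1L << bd) - 1; // largest possible |org - cur|
        const long maxSum  = rows * maxDiff; // worst-case lane value
        std::printf("%d-bit: %ld -> %s\n", bd, maxSum,
                    maxSum <= 32767 ? "fits a signed 16-bit lane"
                                    : "overflows, hence the scalar fallback");
      }                                      // 10-bit: 16368 fits; 12-bit: 65520 does not
      return 0;
    }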
3561 | | |
3562 | | template <X86_VEXT vext> |
3563 | | void RdCost::_initRdCostX86() |
3564 | 0 | { |
3565 | | /* SIMD SSE implementation shifts the final sum instead of every addend |
3566 | | * resulting in slightly different result compared to the scalar impl. */ |
3567 | |
|
3568 | 0 | m_afpDistortFunc[0][DF_SSE ] = xGetSSE_SIMD<vext>; |
3569 | | //m_afpDistortFunc[0][DF_SSE2 ] = xGetSSE_SIMD<vext>; |
3570 | 0 | m_afpDistortFunc[0][DF_SSE4 ] = xGetSSE_NxN_SIMD<4, vext>; |
3571 | 0 | m_afpDistortFunc[0][DF_SSE8 ] = xGetSSE_NxN_SIMD<8, vext>; |
3572 | 0 | m_afpDistortFunc[0][DF_SSE16 ] = xGetSSE_NxN_SIMD<16, vext>; |
3573 | 0 | m_afpDistortFunc[0][DF_SSE32 ] = xGetSSE_NxN_SIMD<32, vext>; |
3574 | 0 | m_afpDistortFunc[0][DF_SSE64 ] = xGetSSE_NxN_SIMD<64, vext>; |
3575 | 0 | m_afpDistortFunc[0][DF_SSE128] = xGetSSE_NxN_SIMD<128, vext>; |
3576 | |
3577 | 0 | m_afpDistortFunc[0][DF_SAD ] = xGetSAD_SIMD<vext>; |
3578 | | //m_afpDistortFunc[0][DF_SAD2 ] = xGetSAD_SIMD<vext>; |
3579 | 0 | m_afpDistortFunc[0][DF_SAD4 ] = xGetSAD_NxN_SIMD<4, vext>; |
3580 | 0 | m_afpDistortFunc[0][DF_SAD8 ] = xGetSAD_NxN_SIMD<8, vext>; |
3581 | 0 | m_afpDistortFunc[0][DF_SAD16 ] = xGetSAD_NxN_SIMD<16, vext>; |
3582 | 0 | m_afpDistortFunc[0][DF_SAD32 ] = xGetSAD_NxN_SIMD<32, vext>; |
3583 | 0 | m_afpDistortFunc[0][DF_SAD64 ] = xGetSAD_NxN_SIMD<64, vext>; |
3584 | 0 | m_afpDistortFunc[0][DF_SAD128] = xGetSAD_NxN_SIMD<128, vext>; |
3585 | |
3586 | 0 | m_afpDistortFunc[0][DF_HAD] = RdCost::xGetHADs_SIMD<vext, false>; |
3587 | 0 | m_afpDistortFunc[0][DF_HAD2] = RdCost::xGetHADs_SIMD<vext, false>; |
3588 | 0 | m_afpDistortFunc[0][DF_HAD4] = RdCost::xGetHADs_SIMD<vext, false>; |
3589 | 0 | m_afpDistortFunc[0][DF_HAD8] = RdCost::xGetHADs_SIMD<vext, false>; |
3590 | 0 | m_afpDistortFunc[0][DF_HAD16] = RdCost::xGetHADs_SIMD<vext, false>; |
3591 | 0 | m_afpDistortFunc[0][DF_HAD32] = RdCost::xGetHADs_SIMD<vext, false>; |
3592 | 0 | m_afpDistortFunc[0][DF_HAD64] = RdCost::xGetHADs_SIMD<vext, false>; |
3593 | 0 | m_afpDistortFunc[0][DF_HAD128] = RdCost::xGetHADs_SIMD<vext, false>; |
3594 | |
3595 | 0 | m_afpDistortFunc[0][DF_HAD_fast] = RdCost::xGetHADs_SIMD<vext, true>; |
3596 | 0 | m_afpDistortFunc[0][DF_HAD2_fast] = RdCost::xGetHADs_SIMD<vext, true>; |
3597 | 0 | m_afpDistortFunc[0][DF_HAD4_fast] = RdCost::xGetHADs_SIMD<vext, true>; |
3598 | 0 | m_afpDistortFunc[0][DF_HAD8_fast] = RdCost::xGetHADs_SIMD<vext, true>; |
3599 | 0 | m_afpDistortFunc[0][DF_HAD16_fast] = RdCost::xGetHADs_SIMD<vext, true>; |
3600 | 0 | m_afpDistortFunc[0][DF_HAD32_fast] = RdCost::xGetHADs_SIMD<vext, true>; |
3601 | 0 | m_afpDistortFunc[0][DF_HAD64_fast] = RdCost::xGetHADs_SIMD<vext, true>; |
3602 | 0 | m_afpDistortFunc[0][DF_HAD128_fast] = RdCost::xGetHADs_SIMD<vext, true>; |
3603 | |
3604 | 0 | m_afpDistortFunc[0][DF_HAD_2SAD ] = RdCost::xGetHAD2SADs_SIMD<vext>; |
3605 | 0 | m_afpDistortFunc[0][DF_SAD_WITH_MASK] = xGetSADwMask_SIMD<vext>; |
3606 | |
3607 | 0 | m_wtdPredPtr[0] = lumaWeightedSSE_SIMD<vext, 0>; |
3608 | 0 | m_wtdPredPtr[1] = lumaWeightedSSE_SIMD<vext, 1>; |
3609 | 0 | m_fxdWtdPredPtr = fixWeightedSSE_SIMD <vext>; |
3610 | |
3611 | 0 | m_afpDistortFuncX5[0] = xGetSADX5_8xN_SIMD <vext>; |
3612 | 0 | m_afpDistortFuncX5[1] = xGetSADX5_16xN_SIMD_X86<vext>; |
3613 | 0 | } 
Unexecuted instantiation: void vvenc::RdCost::_initRdCostX86<(vvenc::x86_simd::X86_VEXT)1>()
Unexecuted instantiation: void vvenc::RdCost::_initRdCostX86<(vvenc::x86_simd::X86_VEXT)4>()
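The comment at source lines 3565-3566 is easy to verify with small numbers: shifting every addend discards each addend's low bits individually, while shifting the final sum lets those bits carry into the result. A minimal standalone demonstration, not vvenc code:

    // Shift order matters: per-addend shifting vs. shifting the final sum.
    #include <cassert>

    int main()
    {
      const unsigned shift = 1;
      const unsigned a = 9, b = 9;                            // e.g. two squared diffs of 3
      const unsigned perAddend = (a >> shift) + (b >> shift); // scalar-style: 4 + 4 = 8
      const unsigned finalSum  = (a + b) >> shift;            // SIMD-style: 18 >> 1 = 9
      assert(perAddend == 8 && finalSum == 9);                // differ by the carried low bits
      return 0;
    }

Since flooring each addend can only lose bits relative to flooring the total once, the SIMD result is never smaller than the scalar one.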
3614 | | |
3615 | | template void RdCost::_initRdCostX86<SIMDX86>(); |
3616 | | |
3617 | | } // namespace vvenc |
3618 | | |
3619 | | //! \} |
3620 | | |
3621 | | #endif // TARGET_SIMD_X86 |
3622 | | |