/src/vvenc/source/Lib/CommonLib/x86/FGAX86.h
Line | Count | Source |
1 | | /* ----------------------------------------------------------------------------- |
2 | | The copyright in this software is being made available under the Clear BSD |
3 | | License, included below. No patent rights, trademark rights and/or |
4 | | other Intellectual Property Rights other than the copyrights concerning |
5 | | the Software are granted under this license. |
6 | | |
7 | | The Clear BSD License |
8 | | |
9 | | Copyright (c) 2019-2026, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors. |
10 | | All rights reserved. |
11 | | |
12 | | Redistribution and use in source and binary forms, with or without modification, |
13 | | are permitted (subject to the limitations in the disclaimer below) provided that |
14 | | the following conditions are met: |
15 | | |
16 | | * Redistributions of source code must retain the above copyright notice, |
17 | | this list of conditions and the following disclaimer. |
18 | | |
19 | | * Redistributions in binary form must reproduce the above copyright |
20 | | notice, this list of conditions and the following disclaimer in the |
21 | | documentation and/or other materials provided with the distribution. |
22 | | |
23 | | * Neither the name of the copyright holder nor the names of its |
24 | | contributors may be used to endorse or promote products derived from this |
25 | | software without specific prior written permission. |
26 | | |
27 | | NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY |
28 | | THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND |
29 | | CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
30 | | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A |
31 | | PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR |
32 | | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
33 | | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
34 | | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR |
35 | | BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER |
36 | | IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
37 | | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
38 | | POSSIBILITY OF SUCH DAMAGE. |
39 | | |
40 | | |
41 | | ------------------------------------------------------------------------------------------- */ |
42 | | /** \file FGAX86.h |
43 | | \brief SIMD for FilmGrainAnalyse |
44 | | */ |
45 | | |
46 | | //! \ingroup CommonLib |
47 | | //! \{ |
48 | | |
49 | | //#include "CommonLib/CommonDef.h" |
50 | | #include "CommonDefX86.h" |
51 | | #include "SEIFilmGrainAnalyzer.h" |
52 | | |
53 | | |
54 | | //#ifdef TARGET_SIMD_X86 |
55 | | #if defined(TARGET_SIMD_X86) && ENABLE_SIMD_OPT_FGA |
56 | | |
57 | | //! \ingroup CommonLib |
58 | | //! \{ |
59 | | |
60 | | namespace vvenc { |
61 | | #ifdef USE_AVX2 |
62 | | /* ----------------------------------------------------------------------------- |
63 | | atan2 aproximation taken from: |
64 | | https://mazzo.li/posts/vectorized-atan2.html |
65 | | ------------------------------------------------------------------------------------------- */ |
// Polynomial approximation of atan(x) on [-1, 1], 8 floats at a time.
// Odd minimax polynomial: x * (a1 + a3*x^2 + ... + a11*x^10), evaluated
// with Horner's scheme (separate mul + add, no FMA, to keep results
// bit-identical across targets).
// See https://mazzo.li/posts/vectorized-atan2.html for the derivation.
inline __m256 atan_avx_approximation(__m256 x) {
  // Coefficients a11, a9, a7, a5, a3, a1 in Horner evaluation order.
  static const float kCoeff[6] = { -0.01172120f,  0.05265332f, -0.11643287f,
                                    0.19354346f, -0.33262347f,  0.99997726f };
  const __m256 xSq  = _mm256_mul_ps(x, x);
  __m256       poly = _mm256_set1_ps(kCoeff[0]);
  for (int i = 1; i < 6; ++i)
  {
    poly = _mm256_add_ps(_mm256_mul_ps(xSq, poly), _mm256_set1_ps(kCoeff[i]));
  }
  // Multiply the even polynomial in x^2 by x to restore the odd function.
  return _mm256_mul_ps(x, poly);
}
87 | | #endif |
// Polynomial approximation of atan(x) on [-1, 1], 4 floats at a time
// (SSE variant of the AVX2 overload above).
// Odd minimax polynomial: x * (a1 + a3*x^2 + ... + a11*x^10), evaluated
// with Horner's scheme (separate mul + add, no FMA, to keep results
// bit-identical across targets).
// See https://mazzo.li/posts/vectorized-atan2.html for the derivation.
inline __m128 atan_avx_approximation(__m128 x) {
  // Coefficients a11, a9, a7, a5, a3, a1 in Horner evaluation order.
  static const float kCoeff[6] = { -0.01172120f,  0.05265332f, -0.11643287f,
                                    0.19354346f, -0.33262347f,  0.99997726f };
  const __m128 xSq  = _mm_mul_ps(x, x);
  __m128       poly = _mm_set1_ps(kCoeff[0]);
  for (int i = 1; i < 6; ++i)
  {
    poly = _mm_add_ps(_mm_mul_ps(xSq, poly), _mm_set1_ps(kCoeff[i]));
  }
  // Multiply the even polynomial in x^2 by x to restore the odd function.
  return _mm_mul_ps(x, poly);
}
109 | | |
// =============================================================================
// gradient_SIMD: SIMD Sobel-style gradient of the luma plane.
//
// Pass 1: for every pixel, accumulate a 3x1 smoothed row difference into
//         AccGxBuf (vertical gradient Gy-like term, rows below minus rows
//         above) and the column difference into AccGyBuf (horizontal term,
//         right column minus left column), both with saturating 16-bit math.
// Pass 2: writes the gradient magnitude (|Gx| + |Gy|) >> 1, clipped to the
//         bit-depth maximum, back into buff1's luma plane.
// Pass 3: approximates atan2(Gy, Gx), quantizes the angle into one of the
//         four directions {0, 45, 90, 135} degrees and stores it in buff2's
//         luma plane.
// Finally the processed plane border is extended by `padding` pixels.
//
// buff1     : in: luma source / out: gradient magnitude
// buff2     : out: quantized gradient orientation (luma only)
// AccGxBuf  : scratch, accumulated "up/down" kernel result
// AccGyBuf  : scratch, accumulated "left/right" kernel result
// width/height : processed area in pixels
// bitDepth  : sample bit depth, used for the magnitude clipping bound
// compID    : component whose border is extended at the end
//
// The AVX2 path (16 pixels/iteration) is used only when vext >= AVX2 AND the
// width is a multiple of 16; otherwise the SSE path (8 pixels/iteration)
// runs. NOTE(review): the SSE path assumes width is a multiple of 8 —
// TODO confirm callers guarantee this.
// =============================================================================
template<X86_VEXT vext>
void gradient_SIMD (PelStorage *buff1, PelStorage *buff2,
                    PelStorage *AccGxBuf, PelStorage *AccGyBuf,
                    unsigned int width, unsigned int height,
                    unsigned int bitDepth, ComponentID compID)
{
  // buff1 - magnitude; buff2 - orientation (Only luma in buff2)
  const unsigned int convWidthS=CONV_WIDTH_S;
  const int maxClpRange = (1 << bitDepth) - 1;  // clipping bound for the magnitude
  const int padding = convWidthS / 2;           // border size for the final extension
  Pel* p_buf1;
  Pel* p_buf1_up;
  Pel* p_buf1_down;
  int stride = buff1->Y().stride;
  Pel* p_ACC = AccGxBuf->Y().buf;               // up/down accumulator rows (width-strided)
  Pel* p_ACC_Y = AccGyBuf->Y().buf;             // left/right accumulator rows (width-strided)

  int res16 = width & 0xf;                      // non-zero -> width not a multiple of 16, fall back to SSE

  // avoid compiler warnings
  // v0_* hold the current 8-pixel group, vold_* the previous group so the
  // "left neighbour" of lane 0 can be recovered across loop iterations.
  __m128i v0_mid = _mm_set1_epi16 (0);
  __m128i vold_down = _mm_set1_epi16 (0);
  __m128i vold_up = _mm_set1_epi16 (0);
  __m128i v0_down = _mm_set1_epi16 (0);
  __m128i v0_up = _mm_set1_epi16 (0);
  __m128i vold_mid = _mm_set1_epi16 (0);

  for (int y = 0; y < height; y++)
  {
    p_buf1=buff1->Y().buf + y*stride;
    // Vertical border handling: replicate the edge row so the 3-row window
    // stays inside the buffer.
    if (y==0)
    {
      p_buf1_up = p_buf1;                        // top edge: "up" row is the row itself
      p_buf1_down = p_buf1_up+stride;
    }
    else if (y==height-1)
    {
      p_buf1_down = p_buf1;                      // bottom edge: "down" row is the row itself
      p_buf1_up = p_buf1_down - stride;
    }
    else
    {
      p_buf1_up = p_buf1 - stride; //starts at 1 now
      p_buf1_down = p_buf1+stride;
    }
    if( vext >= AVX2 && !res16)
    {
#ifdef USE_AVX2
      // These shadow the outer __m128i variables of the same names.
      __m256i v0_up;
      __m256i v0_down;
      __m256i v0_mid;
      int x;
      for (x=0; x < width-16; x+=16)
      {
        if (x==0)
        {
          // First group of a row: the left neighbour of pixel 0 does not
          // exist; it is synthesized by replicating the leftmost sample.
          v0_up = _mm256_lddqu_si256((const __m256i *)(p_buf1_up));
          __m256i vr_up = _mm256_lddqu_si256((const __m256i *)(p_buf1_up+1));
          v0_down = _mm256_lddqu_si256((const __m256i *)(p_buf1_down));
          __m256i vr_down = _mm256_lddqu_si256((const __m256i *)(p_buf1_down+1));

          // Build the "left neighbour" vector: shift left within each
          // 128-bit lane; the lowest element of each lane is missing and is
          // patched in via the permute/shift/insert sequence below.
          __m256i vl_up = _mm256_slli_si256 (v0_up,2);
          __m256i tmp = _mm256_permute4x64_epi64 (v0_up,0x10);
          tmp = _mm256_bsrli_epi128(tmp,6);
          tmp = _mm256_inserti128_si256(tmp,_mm256_castsi256_si128 (v0_up),0);
          vl_up = _mm256_blend_epi16(vl_up,tmp,1);  // lane 0 element 0 := replicated border sample
          __m256i vm_up = _mm256_slli_epi16 (v0_up,1); // middle *2
          __m256i acc_up = _mm256_adds_epi16 (vm_up,vl_up);   // saturating 1-2-1 row sum (up)
          acc_up = _mm256_adds_epi16 (acc_up,vr_up);

          __m256i vl_down = _mm256_slli_si256 (v0_down,2);    // same left-neighbour construction
          tmp = _mm256_permute4x64_epi64 (v0_down,0x10);
          tmp = _mm256_bsrli_epi128(tmp,6);
          tmp = _mm256_inserti128_si256(tmp,_mm256_castsi256_si128 (v0_down),0);
          vl_down = _mm256_blend_epi16(vl_down,tmp,1);
          __m256i vm_down = _mm256_slli_epi16 (v0_down,1); // middle *2
          __m256i acc_down = _mm256_adds_epi16 (vm_down,vl_down); // saturating 1-2-1 row sum (down)
          acc_down = _mm256_adds_epi16 (acc_down,vr_down);

          // Vertical gradient term: (down row sum) - (up row sum)
          __m256i acc = _mm256_subs_epi16 (acc_down,acc_up);
          _mm256_storeu_si256((__m256i *)&p_ACC[0], acc);

          // mid
          v0_mid = _mm256_lddqu_si256((const __m256i *)(p_buf1));
          __m256i vr_mid = _mm256_lddqu_si256((const __m256i *)(p_buf1+1));
          __m256i vl_mid = _mm256_slli_si256 (v0_mid,2);      // left neighbour for the middle row
          tmp = _mm256_permute4x64_epi64 (v0_mid,0x10);
          tmp = _mm256_bsrli_epi128(tmp,6);
          tmp = _mm256_inserti128_si256(tmp,_mm256_castsi256_si128 (v0_mid),0);
          vl_mid = _mm256_blend_epi16(vl_mid,tmp,1);

          // Horizontal gradient term: 1-2-1 weighted right column minus
          // 1-2-1 weighted left column.
          __m256i acc_right = _mm256_adds_epi16 (vr_up,vr_down);
          vr_mid = _mm256_slli_epi16 (vr_mid,1); // middle *2
          acc_right = _mm256_adds_epi16 (acc_right,vr_mid);

          __m256i acc_left = _mm256_adds_epi16 (vl_up,vl_down);
          vl_mid = _mm256_slli_epi16 (vl_mid,1); // middle *2
          acc_left = _mm256_adds_epi16 (acc_left,vl_mid);
          acc = _mm256_subs_epi16 (acc_right,acc_left);
          _mm256_storeu_si256((__m256i *)&p_ACC_Y[x], acc);
        }
        else
        {
          // Interior groups: left/right neighbours can be loaded directly
          // with unaligned loads at x-1 / x+1.
          v0_up = _mm256_lddqu_si256((const __m256i *)(p_buf1_up+x));
          __m256i vr_up = _mm256_lddqu_si256((const __m256i *)(p_buf1_up+x+1));
          __m256i vl_up = _mm256_lddqu_si256((const __m256i *)(p_buf1_up+x-1));

          v0_down = _mm256_lddqu_si256((const __m256i *)(p_buf1_down+x));
          __m256i vr_down = _mm256_lddqu_si256((const __m256i *)(p_buf1_down+x+1));
          __m256i vl_down = _mm256_lddqu_si256((const __m256i *)(p_buf1_down+x-1));

          __m256i vm_up = _mm256_slli_epi16 (v0_up,1); // middle *2
          __m256i acc_up = _mm256_adds_epi16 (vm_up,vl_up);
          acc_up = _mm256_adds_epi16 (acc_up,vr_up);

          __m256i vm_down = _mm256_slli_epi16 (v0_down,1); // middle *2
          __m256i acc_down = _mm256_adds_epi16 (vm_down,vl_down);
          acc_down = _mm256_adds_epi16 (acc_down,vr_down);

          __m256i acc = _mm256_subs_epi16 (acc_down,acc_up);
          _mm256_storeu_si256((__m256i *)&p_ACC[x], acc);

          // mid
          v0_mid = _mm256_lddqu_si256((const __m256i *)(p_buf1+x));
          __m256i vr_mid = _mm256_lddqu_si256((const __m256i *)(p_buf1+x+1));
          __m256i vl_mid = _mm256_lddqu_si256((const __m256i *)(p_buf1+x-1));

          __m256i acc_right = _mm256_adds_epi16 (vr_up,vr_down);
          vr_mid = _mm256_slli_epi16 (vr_mid,1); // middle *2
          acc_right = _mm256_adds_epi16 (acc_right,vr_mid);

          __m256i acc_left = _mm256_adds_epi16 (vl_up,vl_down);
          vl_mid = _mm256_slli_epi16 (vl_mid,1); // middle *2
          acc_left = _mm256_adds_epi16 (acc_left,vl_mid);
          acc = _mm256_subs_epi16 (acc_right,acc_left);
          _mm256_storeu_si256((__m256i *)&p_ACC_Y[x], acc);
        }
      } //for x
      // last column group: the right neighbour of the last pixel does not
      // exist; it is synthesized by replicating the rightmost sample of each
      // 128-bit lane (insert element 8 at position 7, keep element 15).
      {
        v0_up = _mm256_lddqu_si256((const __m256i *)(p_buf1_up+x));
        __m256i vl_up = _mm256_lddqu_si256((const __m256i *)(p_buf1_up+x-1));

        v0_down = _mm256_lddqu_si256((const __m256i *)(p_buf1_down+x));
        __m256i vl_down = _mm256_lddqu_si256((const __m256i *)(p_buf1_down+x-1));

        __m256i vr_up = _mm256_srli_si256 (v0_up,2); // topmost element of each lane is missing
        vr_up = _mm256_insert_epi16 (vr_up,_mm256_extract_epi16 (v0_up,8), 7);
        vr_up = _mm256_insert_epi16 (vr_up,_mm256_extract_epi16 (v0_up,15), 15); // replicate right border

        __m256i vr_down = _mm256_srli_si256 (v0_down,2); // topmost element of each lane is missing
        vr_down = _mm256_insert_epi16 (vr_down,_mm256_extract_epi16 (v0_down,8), 7);
        vr_down = _mm256_insert_epi16 (vr_down,_mm256_extract_epi16 (v0_down,15), 15);

        __m256i vm_up = _mm256_slli_epi16 (v0_up,1); // middle *2
        __m256i acc_up = _mm256_adds_epi16 (vm_up,vl_up);
        acc_up = _mm256_adds_epi16 (acc_up,vr_up);

        __m256i vm_down = _mm256_slli_epi16 (v0_down,1); // middle *2
        __m256i acc_down = _mm256_adds_epi16 (vm_down,vl_down);
        acc_down = _mm256_adds_epi16 (acc_down,vr_down);

        __m256i acc = _mm256_subs_epi16 (acc_down,acc_up);
        _mm256_storeu_si256((__m256i *)&p_ACC[x], acc);

        // mid
        v0_mid = _mm256_lddqu_si256((const __m256i *)(p_buf1+x));
        __m256i vl_mid = _mm256_lddqu_si256((const __m256i *)(p_buf1+x-1));

        __m256i vr_mid = _mm256_srli_si256 (v0_mid,2); // topmost element of each lane is missing
        vr_mid = _mm256_insert_epi16 (vr_mid,_mm256_extract_epi16 (v0_mid,8), 7);
        vr_mid = _mm256_insert_epi16 (vr_mid,_mm256_extract_epi16 (v0_mid,15), 15);

        __m256i acc_right = _mm256_adds_epi16 (vr_up,vr_down);
        vr_mid = _mm256_slli_epi16 (vr_mid,1); // middle *2
        acc_right = _mm256_adds_epi16 (acc_right,vr_mid);

        __m256i acc_left = _mm256_adds_epi16 (vl_up,vl_down);
        vl_mid = _mm256_slli_epi16 (vl_mid,1); // middle *2
        acc_left = _mm256_adds_epi16 (acc_left,vl_mid);
        acc = _mm256_subs_epi16 (acc_right,acc_left);
        _mm256_storeu_si256((__m256i *)&p_ACC_Y[x], acc);
      }
#endif
    } //AVX2
    else
    {
      // SSE path: 8 pixels per iteration. v1_* prefetch the NEXT group so
      // the right neighbour of lane 7 is available; vold_* keep the previous
      // group for the left neighbour of lane 0.
      __m128i v1_up;
      __m128i v1_down;
      __m128i v1_mid;
      int x;
      for (x=0; x < width-8; x+=8)
      {
        if (x==0)
        {
          v0_up = _mm_loadu_si128((const __m128i*)(p_buf1_up));
          v1_up = _mm_loadu_si128((const __m128i*)(p_buf1_up+8));
          v0_down = _mm_loadu_si128((const __m128i*)(p_buf1_down));
          v1_down = _mm_loadu_si128((const __m128i*)(p_buf1_down+8));
          v0_mid = _mm_loadu_si128((const __m128i*)(p_buf1));
          v1_mid = _mm_loadu_si128((const __m128i*)(p_buf1+8));

          // Left border: element 0 is replicated from v0 itself.
          __m128i vl_up = _mm_slli_si128 (v0_up,2); // lowest element is missing
          vl_up = _mm_blend_epi16 (vl_up,v0_up,1);
          __m128i vr_up = _mm_srli_si128 (v0_up,2); // topmost element is missing, take it from v1
          vr_up = _mm_blend_epi16 (vr_up,_mm_slli_si128 (v1_up,14),0x80);

          __m128i vl_down = _mm_slli_si128 (v0_down,2); // lowest element is missing
          vl_down = _mm_blend_epi16 (vl_down,v0_down,1);
          __m128i vr_down = _mm_srli_si128 (v0_down,2); // topmost element is missing, take it from v1
          vr_down = _mm_blend_epi16 (vr_down,_mm_slli_si128 (v1_down,14),0x80);

          __m128i vl_mid = _mm_slli_si128 (v0_mid,2); // lowest element is missing
          vl_mid = _mm_blend_epi16 (vl_mid,v0_mid,1);
          __m128i vr_mid = _mm_srli_si128 (v0_mid,2); // topmost element is missing, take it from v1
          vr_mid = _mm_blend_epi16 (vr_mid,_mm_slli_si128 (v1_mid,14),0x80);

          __m128i vm_up = _mm_slli_epi16 (v0_up,1); // middle *2
          __m128i acc_up = _mm_adds_epi16 (vm_up,vl_up);   // saturating 1-2-1 row sum (up)
          acc_up = _mm_adds_epi16 (acc_up,vr_up);

          __m128i vm_down = _mm_slli_epi16 (v0_down,1); // middle *2
          __m128i acc_down = _mm_adds_epi16 (vm_down,vl_down); // saturating 1-2-1 row sum (down)
          acc_down = _mm_adds_epi16 (acc_down,vr_down);

          __m128i acc = _mm_subs_epi16 (acc_down,acc_up);  // vertical gradient term
          _mm_storeu_si128((__m128i*)&p_ACC[x], acc);

          __m128i acc_right = _mm_adds_epi16 (vr_up,vr_down);
          vr_mid = _mm_slli_epi16 (vr_mid,1); // middle *2
          acc_right = _mm_adds_epi16 (acc_right,vr_mid);

          __m128i acc_left = _mm_adds_epi16 (vl_up,vl_down);
          vl_mid = _mm_slli_epi16 (vl_mid,1); // middle *2
          acc_left = _mm_adds_epi16 (acc_left,vl_mid);
          acc = _mm_subs_epi16 (acc_right,acc_left);       // horizontal gradient term
          _mm_storeu_si128((__m128i*)&p_ACC_Y[x], acc);
        }
        else
        {
          // v0_* already hold the current group (rotated in at loop end);
          // only the next group needs to be loaded.
          v1_up = _mm_loadu_si128((const __m128i*)(p_buf1_up+x+8));
          v1_down = _mm_loadu_si128((const __m128i*)(p_buf1_down+x+8));
          v1_mid = _mm_loadu_si128((const __m128i*)(p_buf1+x+8));

          __m128i vl_up = _mm_slli_si128 (v0_up,2); // lowest element is missing, take it from vold
          vl_up = _mm_blend_epi16 (vl_up,_mm_srli_si128 (vold_up,14),1);
          __m128i vr_up = _mm_srli_si128 (v0_up,2); // topmost element is missing, take it from v1
          vr_up = _mm_blend_epi16 (vr_up,_mm_slli_si128 (v1_up,14),0x80);

          __m128i vl_down = _mm_slli_si128 (v0_down,2); // lowest element is missing, take it from vold
          vl_down = _mm_blend_epi16 (vl_down,_mm_srli_si128 (vold_down,14),1);
          __m128i vr_down = _mm_srli_si128 (v0_down,2); // topmost element is missing, take it from v1
          vr_down = _mm_blend_epi16 (vr_down,_mm_slli_si128 (v1_down,14),0x80);

          __m128i vl_mid = _mm_slli_si128 (v0_mid,2); // lowest element is missing, take it from vold
          vl_mid = _mm_blend_epi16 (vl_mid,_mm_srli_si128 (vold_mid,14),1);
          __m128i vr_mid = _mm_srli_si128 (v0_mid,2); // topmost element is missing, take it from v1
          vr_mid = _mm_blend_epi16 (vr_mid,_mm_slli_si128 (v1_mid,14),0x80);

          __m128i vm_up = _mm_slli_epi16 (v0_up,1); // middle *2
          __m128i acc_up = _mm_adds_epi16 (vm_up,vl_up);
          acc_up = _mm_adds_epi16 (acc_up,vr_up);

          __m128i vm_down = _mm_slli_epi16 (v0_down,1); // middle *2
          __m128i acc_down = _mm_adds_epi16 (vm_down,vl_down);
          acc_down = _mm_adds_epi16 (acc_down,vr_down);
          __m128i acc = _mm_subs_epi16 (acc_down,acc_up);
          _mm_storeu_si128((__m128i*)&p_ACC[x], acc);

          __m128i acc_right = _mm_adds_epi16 (vr_up,vr_down);
          vr_mid = _mm_slli_epi16 (vr_mid,1); // middle *2
          acc_right = _mm_adds_epi16 (acc_right,vr_mid);

          __m128i acc_left = _mm_adds_epi16 (vl_up,vl_down);
          vl_mid = _mm_slli_epi16 (vl_mid,1); // middle *2
          acc_left = _mm_adds_epi16 (acc_left,vl_mid);
          acc = _mm_subs_epi16 (acc_right,acc_left);
          _mm_storeu_si128((__m128i*)&p_ACC_Y[x], acc);
        }
        // Rotate the pipeline: current becomes previous, next becomes current.
        vold_up = v0_up;
        vold_down = v0_down;
        vold_mid = v0_mid;
        v0_up = v1_up;
        v0_down = v1_down;
        v0_mid = v1_mid;
      } //for x
      // last column group: right border replicated from v0's own top element.
      {
        __m128i vl_up = _mm_slli_si128 (v0_up,2); // lowest element is missing, take it from vold
        vl_up = _mm_blend_epi16 (vl_up,_mm_srli_si128 (vold_up,14),1);
        __m128i vr_up = _mm_srli_si128 (v0_up,2); // topmost element is missing, take it from v0
        vr_up = _mm_blend_epi16 (vr_up,v0_up,0x80);

        __m128i vl_down = _mm_slli_si128 (v0_down,2); // lowest element is missing, take it from vold
        vl_down = _mm_blend_epi16 (vl_down,_mm_srli_si128 (vold_down,14),1);
        __m128i vr_down = _mm_srli_si128 (v0_down,2); // topmost element is missing, take it from v0
        vr_down = _mm_blend_epi16(vr_down,v0_down,0x80);

        __m128i vl_mid = _mm_slli_si128 (v0_mid,2); // lowest element is missing, take it from vold
        vl_mid = _mm_blend_epi16 (vl_mid,_mm_srli_si128 (vold_mid,14),1);
        __m128i vr_mid = _mm_srli_si128 (v0_mid,2); // topmost element is missing, take it from v0
        vr_mid = _mm_blend_epi16 (vr_mid,v0_mid,0x80);

        __m128i vm_up = _mm_slli_epi16 (v0_up,1); // middle *2
        __m128i acc_up = _mm_adds_epi16 (vm_up,vl_up);
        acc_up = _mm_adds_epi16 (acc_up,vr_up);

        __m128i vm_down = _mm_slli_epi16 (v0_down,1); // middle *2
        __m128i acc_down = _mm_adds_epi16 (vm_down,vl_down);
        acc_down = _mm_adds_epi16 (acc_down,vr_down);

        __m128i acc = _mm_subs_epi16 (acc_down,acc_up);
        _mm_storeu_si128((__m128i*)&p_ACC[x], acc);

        __m128i acc_right = _mm_adds_epi16 (vr_up,vr_down);
        vr_mid = _mm_slli_epi16 (vr_mid,1); // middle *2
        acc_right = _mm_adds_epi16 (acc_right,vr_mid);

        __m128i acc_left = _mm_adds_epi16 (vl_up,vl_down);
        vl_mid = _mm_slli_epi16 (vl_mid,1); // middle *2
        acc_left = _mm_adds_epi16 (acc_left,vl_mid);
        acc = _mm_subs_epi16 (acc_right,acc_left);
        _mm_storeu_si128((__m128i*)&p_ACC_Y[x], acc);
      }
    }
    // Accumulator buffers are densely packed: advance by width, not stride.
    p_ACC+=width;
    p_ACC_Y+=width;
  } // y

  // magnitude: buff1.Y <- clip((|Gx| + |Gy|) >> 1, 0, maxClpRange)
  p_ACC = AccGxBuf->Y().buf;
  p_ACC_Y = AccGyBuf->Y().buf;

  for (int y = 0; y < height; y++)
  {
    p_buf1=buff1->Y().buf + y*stride;

    if( vext >= AVX2 && !res16)
    {
#ifdef USE_AVX2
      int x;
      __m256i vbdmax = _mm256_set1_epi16 ( maxClpRange);

      for (x=0; x < width; x+=16)
      {
        __m256i GX = _mm256_loadu_si256((const __m256i*)&p_ACC[x]);
        __m256i GY = _mm256_loadu_si256((const __m256i*)&p_ACC_Y[x]);
        GX = _mm256_abs_epi16(GX);
        GY = _mm256_abs_epi16(GY);
        GX = _mm256_add_epi16(GX,GY);   // |Gx| + |Gy| (L1 norm instead of sqrt)
        GX = _mm256_srli_epi16(GX,1);   // halve to stay in range
        GX = _mm256_min_epi16 (GX,vbdmax);
        _mm256_storeu_si256((__m256i*)&p_buf1[x], GX);
      }
#endif
    } //AVX2
    else
    {
      int x;
      __m128i vbdmax = _mm_set1_epi16 ( maxClpRange);

      for (x=0; x < width; x+=8)
      {
        __m128i GX = _mm_loadu_si128((const __m128i*)&p_ACC[x]);
        __m128i GY = _mm_loadu_si128((const __m128i*)&p_ACC_Y[x]);
        GX = _mm_abs_epi16(GX);
        GY = _mm_abs_epi16(GY);
        GX = _mm_add_epi16(GX,GY);
        GX = _mm_srli_epi16(GX,1);
        GX = _mm_min_epi16 (GX,vbdmax);
        _mm_storeu_si128((__m128i*)&p_buf1[x], GX);
      }
    }
    p_ACC+=width;
    p_ACC_Y+=width;
    // NOTE(review): this increment is dead — p_buf1 is recomputed from
    // y*stride at the top of each iteration.
    p_buf1+=stride;
  }

  // Orientation quantization: approximate atan2(Gy, Gx) per pixel and map
  // the absolute angle into one of four direction classes:
  //   <= pi/8 or >= 7pi/8          -> 0 degrees
  //   (pi/8, 3pi/8]                -> 45 degrees
  //   (3pi/8, 5pi/8]               -> 90 degrees
  //   (5pi/8, 7pi/8]               -> 135 degrees
  // Loop through each pixel
  Pel* pX = AccGxBuf->Y().buf;
  Pel* pY = AccGyBuf->Y().buf;
  int strideX = AccGxBuf->Y().stride;
  int strideY = AccGyBuf->Y().stride;

  Pel* pQD = buff2->Y().buf;

  for (int y = 0; y < height; y++)
  {
    if( vext >= AVX2 && !res16)
    {
#ifdef USE_AVX2
      // Store pi and pi/2 as constants
      const __m256 pi = _mm256_set1_ps((float)PI);
      const __m256 pi_2 = _mm256_set1_ps((float)PI_2);
      const __m256 vpi_8 = _mm256_set1_ps((float)pi_8);
      const __m256 vpi_3_8 = _mm256_set1_ps((float)pi_3_8);
      const __m256 vpi_5_8 = _mm256_set1_ps((float)pi_5_8);
      const __m256 vpi_7_8 = _mm256_set1_ps((float)pi_7_8);

      const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));;
      const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));

      for (int x = 0; x < width; x+=16)
      {
        // 8 pixels per inner step: 16-bit gradients widened to 32-bit floats.
        for (int n=0; n<16;n+=8)
        {
          __m128i Ix = _mm_loadu_si128((const __m128i*)&pX[x+n]);
          __m128i Iy = _mm_loadu_si128((const __m128i*)&pY[x+n]);

          __m256 vx = _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(Ix));
          __m256 vy = _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(Iy));
          // atan2 via octant reduction: if |y| > |x|, swap the ratio so the
          // atan argument stays in [-1, 1], then correct with pi/2 - atan.
          __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(vy, abs_mask),_mm256_and_ps(vx, abs_mask),_CMP_GT_OS);
          __m256 atan_input = _mm256_div_ps(_mm256_blendv_ps(vy, vx, swap_mask),_mm256_blendv_ps(vx, vy, swap_mask));
          __m256 result = atan_avx_approximation(atan_input);

          result = _mm256_blendv_ps(result,_mm256_sub_ps(_mm256_or_ps(pi_2, _mm256_and_ps(atan_input, sign_mask)),result),swap_mask);
          // Quadrant correction for vx < 0: add +/-pi depending on sign of vy.
          __m256 x_sign_mask = _mm256_castsi256_ps(_mm256_srai_epi32(_mm256_castps_si256(vx), 31));
          result = _mm256_add_ps(_mm256_and_ps(_mm256_xor_ps(pi, _mm256_and_ps(sign_mask, vy)),x_sign_mask),result);

          // take abs value
          result = _mm256_andnot_ps(sign_mask,result);
          // compare: one all-ones mask per direction class
          __m256 QD0 = _mm256_cmp_ps (result,vpi_8,_CMP_LE_OS);
          QD0 = _mm256_or_ps(QD0,_mm256_cmp_ps (result,vpi_7_8,_CMP_GE_OS));
          __m256 QD90 = _mm256_cmp_ps (result,vpi_3_8,_CMP_GT_OS);
          QD90 = _mm256_and_ps(QD90,_mm256_cmp_ps (result,vpi_5_8,_CMP_LE_OS));
          __m256 QD45 = _mm256_cmp_ps (result,vpi_8,_CMP_GT_OS);
          QD45 = _mm256_and_ps(QD45,_mm256_cmp_ps (result,vpi_3_8,_CMP_LE_OS));
          __m256 QD135 = _mm256_cmp_ps (result,vpi_5_8,_CMP_GT_OS);
          QD135 = _mm256_and_ps(QD135,_mm256_cmp_ps (result,vpi_7_8,_CMP_LE_OS));
          // Dy > 0
          // NOTE(review): intended as a conditional swap of the 45/135
          // classes when vy < 0, but the second xor reads the already
          // updated QD45 — confirm against the scalar reference code.
          __m256 Neg = _mm256_cmp_ps (vy,_mm256_set1_ps(0.0),_CMP_LT_OS);
          QD45 = _mm256_xor_ps(QD45,_mm256_and_ps(Neg,QD135));
          QD135 = _mm256_xor_ps(QD135,_mm256_and_ps(Neg,QD45));

          // Select the direction value by successive blends (0 is default).
          __m256 FQD = _mm256_set1_ps(0.0);
          FQD = _mm256_blendv_ps(FQD,_mm256_set1_ps(90.0),QD90);
          FQD = _mm256_blendv_ps(FQD,_mm256_set1_ps(45.0),QD45);
          FQD = _mm256_blendv_ps(FQD,_mm256_set1_ps(135.0),QD135);
          // integer 32 bit
          __m256i QD0I = _mm256_cvtps_epi32(FQD);
          // integer 16 bit
          QD0I = _mm256_packus_epi32(QD0I,QD0I);
          QD0I = _mm256_permute4x64_epi64(QD0I,0x8);  // gather the two packed lanes
          _mm_storeu_si128((__m128i*)&pQD[x+n], _mm256_castsi256_si128(QD0I));
        }
      }
#endif
    }
    else //SSE
    {
      // Store pi and pi/2 as constants
      const __m128 pi = _mm_set1_ps((float)PI);
      const __m128 pi_2 = _mm_set1_ps((float)PI_2);
      const __m128 vpi_8 = _mm_set1_ps((float)pi_8);
      const __m128 vpi_3_8 = _mm_set1_ps((float)pi_3_8);
      const __m128 vpi_5_8 = _mm_set1_ps((float)pi_5_8);
      const __m128 vpi_7_8 = _mm_set1_ps((float)pi_7_8);

      const __m128 abs_mask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF));;
      const __m128 sign_mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));

      for (int x = 0; x < width; x+=8)
      {
        // 4 pixels per inner step (same algorithm as the AVX2 path above).
        for (int n=0; n<8;n+=4)
        {
          __m128i Ix = _mm_loadu_si128((const __m128i*)&pX[x+n]);
          __m128i Iy = _mm_loadu_si128((const __m128i*)&pY[x+n]);

          __m128 vx = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(Ix));
          __m128 vy = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(Iy));
          __m128 swap_mask = _mm_cmpgt_ps(_mm_and_ps(vy, abs_mask),_mm_and_ps(vx, abs_mask));
          __m128 atan_input = _mm_div_ps(_mm_blendv_ps(vy, vx, swap_mask),_mm_blendv_ps(vx, vy, swap_mask));
          __m128 result = atan_avx_approximation(atan_input);

          result = _mm_blendv_ps(result,_mm_sub_ps(_mm_or_ps(pi_2, _mm_and_ps(atan_input, sign_mask)),result),swap_mask);
          __m128 x_sign_mask = _mm_castsi128_ps(_mm_srai_epi32(_mm_castps_si128(vx), 31));
          result = _mm_add_ps(_mm_and_ps(_mm_xor_ps(pi, _mm_and_ps(sign_mask, vy)),x_sign_mask),result);

          // take abs value
          result = _mm_andnot_ps(sign_mask,result);
          // compare
          __m128 QD0 = _mm_cmple_ps (result,vpi_8);
          QD0 = _mm_or_ps(QD0,_mm_cmpge_ps (result,vpi_7_8));
          __m128 QD90 = _mm_cmpgt_ps (result,vpi_3_8);
          QD90 = _mm_and_ps(QD90,_mm_cmple_ps (result,vpi_5_8));
          __m128 QD45 = _mm_cmpgt_ps (result,vpi_8);
          QD45 = _mm_and_ps(QD45,_mm_cmple_ps (result,vpi_3_8));
          __m128 QD135 = _mm_cmpgt_ps (result,vpi_5_8);
          QD135 = _mm_and_ps(QD135,_mm_cmple_ps (result,vpi_7_8));
          // Dy > 0
          // NOTE(review): same asymmetric xor-swap as in the AVX2 path —
          // the second xor reads the already updated QD45; confirm intent.
          __m128 Neg = _mm_cmplt_ps (vy,_mm_set1_ps(0.0));
          QD45 = _mm_xor_ps(QD45,_mm_and_ps(Neg,QD135));
          QD135 = _mm_xor_ps(QD135,_mm_and_ps(Neg,QD45));

          __m128 FQD = _mm_set1_ps(0.0);
          FQD = _mm_blendv_ps(FQD,_mm_set1_ps(90.0),QD90);
          FQD = _mm_blendv_ps(FQD,_mm_set1_ps(45.0),QD45);
          FQD = _mm_blendv_ps(FQD,_mm_set1_ps(135.0),QD135);
          // integer 32 bit
          __m128i QD0I = _mm_cvtps_epi32(FQD);
          // integer 16 bit
          QD0I = _mm_packus_epi32(QD0I,QD0I);
          _mm_storeu_si64((__m128i*)&pQD[x+n],QD0I);  // store the low 4 results
        }
      }
    }
    pX+=strideX;
    pY+=strideY;
    pQD+=buff2->Y().stride;;
  }

  buff1->get(compID).extendBorderPel(padding, padding); // extend border for the next steps
}
623 | | |
// Morphological 3x3 dilation of the marker value `Value` over the plane of
// `buff`, applied recursively until `iter == numIter` passes are done.
// An output pixel becomes `Value` if any pixel in its 3x3 neighborhood equals
// `Value`; otherwise it keeps its input value. Top/bottom borders are handled
// by row replication, left/right borders by replicating the edge pixel.
//
// @param buff     in/out plane storage; the dilated result is copied back.
// @param Wbuf     scratch storage receiving the dilated rows (bufs[0]).
// @param bitDepth NOTE(review): unused by this implementation.
// @param compID   component whose width/height are processed.
// @param numIter  total number of dilation passes requested.
// @param iter     current pass index (recursion counter).
// @param Value    the "strong" marker value that is being dilated.
// @return number of passes performed (== numIter).
//
// NOTE(review): stride and base pointers are always taken from the luma plane
// (buff->Y(), Wbuf->Y()) while width/height come from `compID` — this is only
// consistent when compID is the luma component; confirm against callers.
template<X86_VEXT vext>
int dilation_SIMD ( PelStorage *buff,
                    PelStorage *Wbuf,
                    unsigned int bitDepth,
                    ComponentID compID,
                    int numIter,
                    int iter,
                    Pel Value)
{
  // Recursion end: all requested passes have been performed.
  if ( iter == numIter )
  {
    return iter;
  }
  unsigned int width = buff->get(compID).width,
               height = buff->get(compID).height; // Width and Height of current frame
  unsigned int windowSize = KERNELSIZE;
  unsigned int padding = windowSize / 2;   // border margin needed by the 3x3 window

  // Seed the scratch buffer with the current plane content.
  Wbuf->bufs[0].copyFrom( buff->get(compID) );

  Pel* p_buf;        // current row
  Pel* p_buf_up;     // row above (clamped at the top border)
  Pel* p_buf_down;   // row below (clamped at the bottom border)
  int stride = buff->Y().stride;
  Pel* p_tmpBuf = Wbuf->Y().buf;

  // Non-zero when the width is not a multiple of 16; in that case the AVX2
  // path below is skipped and the SSE path is used instead.
  int res16 = width & 0xf;

  // avoid compiler warnings
  __m128i v0_mid = _mm_set1_epi16 (0);
  __m128i vold_down = _mm_set1_epi16 (0);
  __m128i vold_up = _mm_set1_epi16 (0);
  __m128i v0_down = _mm_set1_epi16 (0);
  __m128i v0_up = _mm_set1_epi16 (0);
  __m128i vold_mid = _mm_set1_epi16 (0);

  for (int y = 0; y < height; y++)
  {
    p_buf=buff->Y().buf + y*stride;
    if (y==0)
    {
      // Top border: the current row doubles as its own "up" neighbor.
      p_buf_up = p_buf;
      p_buf_down = p_buf_up+stride;
    }
    else if (y==height-1)
    {
      // Bottom border: the current row doubles as its own "down" neighbor.
      p_buf_down = p_buf;
      p_buf_up = p_buf_down - stride;
    }
    else
    {
      p_buf_up = p_buf - stride; //starts at 1 now
      p_buf_down = p_buf+stride;
    }
    // NOTE(review): if USE_AVX2 is not defined but vext >= AVX2 and the width
    // is a multiple of 16, this branch body is empty and the row would not be
    // processed — presumably vext >= AVX2 never holds in such builds; confirm.
    if( vext >= AVX2 && !res16)
    {
#ifdef USE_AVX2
      // These shadow the outer __m128i variables of the same name on purpose:
      // the AVX2 path keeps its own 256-bit working set.
      __m256i v0_up;
      __m256i v0_down;
      __m256i v0_mid;
      __m256i vstrong = _mm256_set1_epi16 (Value);

      int x;   // declared outside the loop: the last-column block reuses it
      for (x=0; x < width-16; x+=16)
      {
        if (x==0)
        {
          // First 16 pixels of the row: the left neighbor vector must be
          // synthesized (no pixel exists left of column 0).
          v0_up = _mm256_lddqu_si256((const __m256i *)(p_buf_up));
          __m256i vr_up = _mm256_lddqu_si256((const __m256i *)(p_buf_up+1));
          v0_down = _mm256_lddqu_si256((const __m256i *)(p_buf_down));
          __m256i vr_down = _mm256_lddqu_si256((const __m256i *)(p_buf_down+1));

          // Build the left-neighbor vector: per-lane shift left by one pel;
          // the lowest element of each 128-bit lane is missing and is fixed
          // up via tmp (element 0 replicates the left border pixel, element 8
          // receives element 7 across the lane boundary).
          __m256i vl_up = _mm256_slli_si256 (v0_up,2);
          __m256i tmp = _mm256_permute4x64_epi64 (v0_up,0x10);
          tmp = _mm256_bsrli_epi128(tmp,6);
          tmp = _mm256_inserti128_si256(tmp,_mm256_castsi256_si128 (v0_up),0);
          vl_up = _mm256_blend_epi16(vl_up,tmp,1);

          // Same left-neighbor construction for the row below.
          __m256i vl_down = _mm256_slli_si256 (v0_down,2);
          tmp = _mm256_permute4x64_epi64 (v0_down,0x10);
          tmp = _mm256_bsrli_epi128(tmp,6);
          tmp = _mm256_inserti128_si256(tmp,_mm256_castsi256_si128 (v0_down),0);
          vl_down = _mm256_blend_epi16(vl_down,tmp,1);

          // mid
          v0_mid = _mm256_lddqu_si256((const __m256i *)(p_buf));
          __m256i vr_mid = _mm256_lddqu_si256((const __m256i *)(p_buf+1));
          __m256i vl_mid = _mm256_slli_si256 (v0_mid,2);
          tmp = _mm256_permute4x64_epi64 (v0_mid,0x10);
          tmp = _mm256_bsrli_epi128(tmp,6);
          tmp = _mm256_inserti128_si256(tmp,_mm256_castsi256_si128 (v0_mid),0);
          vl_mid = _mm256_blend_epi16(vl_mid,tmp,1);

          // A pixel is dilated if ANY of the nine neighborhood samples
          // equals the marker value: OR together the nine equality masks.
          __m256i v_mask = _mm256_cmpeq_epi16(vl_up,vstrong);
          v_mask = _mm256_or_si256(_mm256_cmpeq_epi16(v0_up,vstrong),v_mask);
          v_mask = _mm256_or_si256(_mm256_cmpeq_epi16(vr_up,vstrong),v_mask);
          v_mask = _mm256_or_si256(_mm256_cmpeq_epi16(vl_mid,vstrong),v_mask);
          v_mask = _mm256_or_si256(_mm256_cmpeq_epi16(v0_mid,vstrong),v_mask);
          v_mask = _mm256_or_si256(_mm256_cmpeq_epi16(vr_mid,vstrong),v_mask);
          v_mask = _mm256_or_si256(_mm256_cmpeq_epi16(vl_down,vstrong),v_mask);
          v_mask = _mm256_or_si256(_mm256_cmpeq_epi16(v0_down,vstrong),v_mask);
          v_mask = _mm256_or_si256(_mm256_cmpeq_epi16(vr_down,vstrong),v_mask);

          // Keep the original pixel where the mask is clear, Value elsewhere.
          __m256i vres = _mm256_blendv_epi8(v0_mid,vstrong,v_mask);
          _mm256_storeu_si256((__m256i*)&p_tmpBuf[x], vres);
        }
        else
        {
          // Interior columns: all three neighbor vectors can be loaded
          // directly with unaligned loads at x-1 / x / x+1.
          v0_up = _mm256_lddqu_si256((const __m256i *)(p_buf_up+x));
          __m256i vr_up = _mm256_lddqu_si256((const __m256i *)(p_buf_up+x+1));
          __m256i vl_up = _mm256_lddqu_si256((const __m256i *)(p_buf_up+x-1));

          v0_down = _mm256_lddqu_si256((const __m256i *)(p_buf_down+x));
          __m256i vr_down = _mm256_lddqu_si256((const __m256i *)(p_buf_down+x+1));
          __m256i vl_down = _mm256_lddqu_si256((const __m256i *)(p_buf_down+x-1));

          // mid
          v0_mid = _mm256_lddqu_si256((const __m256i *)(p_buf+x));
          __m256i vr_mid = _mm256_lddqu_si256((const __m256i *)(p_buf+x+1));
          __m256i vl_mid = _mm256_lddqu_si256((const __m256i *)(p_buf+x-1));

          // OR of the nine equality masks (see comment in the x==0 branch).
          __m256i v_mask = _mm256_cmpeq_epi16(vl_up,vstrong);
          v_mask = _mm256_or_si256(_mm256_cmpeq_epi16(v0_up,vstrong),v_mask);
          v_mask = _mm256_or_si256(_mm256_cmpeq_epi16(vr_up,vstrong),v_mask);
          v_mask = _mm256_or_si256(_mm256_cmpeq_epi16(vl_mid,vstrong),v_mask);
          v_mask = _mm256_or_si256(_mm256_cmpeq_epi16(v0_mid,vstrong),v_mask);
          v_mask = _mm256_or_si256(_mm256_cmpeq_epi16(vr_mid,vstrong),v_mask);
          v_mask = _mm256_or_si256(_mm256_cmpeq_epi16(vl_down,vstrong),v_mask);
          v_mask = _mm256_or_si256(_mm256_cmpeq_epi16(v0_down,vstrong),v_mask);
          v_mask = _mm256_or_si256(_mm256_cmpeq_epi16(vr_down,vstrong),v_mask);

          __m256i vres = _mm256_blendv_epi8(v0_mid,vstrong,v_mask);
          _mm256_storeu_si256((__m256i*)&p_tmpBuf[x], vres);
        }
      } //for x
      // last column (final 16 pixels): the right neighbor must be
      // synthesized by replicating the right border pixel.
      // NOTE(review): the vl loads below read one pel left of x; for
      // width == 16 this means x == 0 and the read lands in the left
      // padding area — relies on extendBorderPel having been applied.
      {
        v0_up = _mm256_lddqu_si256((const __m256i *)(p_buf_up+x));
        __m256i vl_up = _mm256_lddqu_si256((const __m256i *)(p_buf_up+x-1));

        v0_down = _mm256_lddqu_si256((const __m256i *)(p_buf_down+x));
        __m256i vl_down = _mm256_lddqu_si256((const __m256i *)(p_buf_down+x-1));

        // Right neighbor: per-lane shift right by one pel; element 7 gets
        // element 8 from across the lane boundary, element 15 replicates
        // the right border pixel.
        __m256i vr_up = _mm256_srli_si256 (v0_up,2);
        vr_up = _mm256_insert_epi16 (vr_up,_mm256_extract_epi16 (v0_up,8), 7);
        vr_up = _mm256_insert_epi16 (vr_up,_mm256_extract_epi16 (v0_up,15), 15);

        __m256i vr_down = _mm256_srli_si256 (v0_down,2);
        vr_down = _mm256_insert_epi16 (vr_down,_mm256_extract_epi16 (v0_down,8), 7);
        vr_down = _mm256_insert_epi16 (vr_down,_mm256_extract_epi16 (v0_down,15), 15);

        // mid
        v0_mid = _mm256_lddqu_si256((const __m256i *)(p_buf+x));
        __m256i vl_mid = _mm256_lddqu_si256((const __m256i *)(p_buf+x-1));
        __m256i vr_mid = _mm256_srli_si256 (v0_mid,2);
        vr_mid = _mm256_insert_epi16 (vr_mid,_mm256_extract_epi16 (v0_mid,8), 7);
        vr_mid = _mm256_insert_epi16 (vr_mid,_mm256_extract_epi16 (v0_mid,15), 15);

        // OR of the nine equality masks, as above.
        __m256i v_mask = _mm256_cmpeq_epi16(vl_up,vstrong);
        v_mask = _mm256_or_si256(_mm256_cmpeq_epi16(v0_up,vstrong),v_mask);
        v_mask = _mm256_or_si256(_mm256_cmpeq_epi16(vr_up,vstrong),v_mask);
        v_mask = _mm256_or_si256(_mm256_cmpeq_epi16(vl_mid,vstrong),v_mask);
        v_mask = _mm256_or_si256(_mm256_cmpeq_epi16(v0_mid,vstrong),v_mask);
        v_mask = _mm256_or_si256(_mm256_cmpeq_epi16(vr_mid,vstrong),v_mask);
        v_mask = _mm256_or_si256(_mm256_cmpeq_epi16(vl_down,vstrong),v_mask);
        v_mask = _mm256_or_si256(_mm256_cmpeq_epi16(v0_down,vstrong),v_mask);
        v_mask = _mm256_or_si256(_mm256_cmpeq_epi16(vr_down,vstrong),v_mask);

        __m256i vres = _mm256_blendv_epi8(v0_mid,vstrong,v_mask);
        _mm256_storeu_si256((__m256i*)&p_tmpBuf[x], vres);
      }
#endif
    } //AVX2
    else
    {
      // SSE path (also taken when the width is not a multiple of 16).
      // Software-pipelined: v1 is the look-ahead load at x+8, v0 the current
      // vector, vold the previous one; they are rotated at the end of each
      // iteration so each 8-pel group is loaded only once.
      // NOTE(review): the loop plus the last-column block assume width is a
      // multiple of 8 and at least 16 — for width == 8 the loop never runs
      // and the trailing block would operate on the zero-initialized v0/vold.
      __m128i v1_up;
      __m128i v1_down;
      __m128i v1_mid;
      __m128i vstrong = _mm_set1_epi16 (Value);
      int x;   // reused by the last-column block below
      for (x=0; x < width-8; x+=8)
      {
        if (x==0)
        {
          // Prime the pipeline: load current (v0) and look-ahead (v1)
          // vectors for all three rows.
          v0_up = _mm_loadu_si128((const __m128i*)(p_buf_up));
          v1_up = _mm_loadu_si128((const __m128i*)(p_buf_up+8));
          v0_down = _mm_loadu_si128((const __m128i*)(p_buf_down));
          v1_down = _mm_loadu_si128((const __m128i*)(p_buf_down+8));
          v0_mid = _mm_loadu_si128((const __m128i*)(p_buf));
          v1_mid = _mm_loadu_si128((const __m128i*)(p_buf+8));

          // Left neighbor: shift left one pel; element 0 replicates the
          // left border pixel (taken from v0 itself).
          __m128i vl_up = _mm_slli_si128 (v0_up,2);
          vl_up = _mm_blend_epi16 (vl_up,v0_up,1);
          // Right neighbor: shift right one pel; the topmost element is
          // missing and is fetched from v1.
          __m128i vr_up = _mm_srli_si128 (v0_up,2);
          vr_up = _mm_blend_epi16 (vr_up,_mm_slli_si128 (v1_up,14),0x80);

          __m128i vl_down = _mm_slli_si128 (v0_down,2);
          vl_down = _mm_blend_epi16 (vl_down,v0_down,1);
          __m128i vr_down = _mm_srli_si128 (v0_down,2);
          vr_down = _mm_blend_epi16 (vr_down,_mm_slli_si128 (v1_down,14),0x80);

          __m128i vl_mid = _mm_slli_si128 (v0_mid,2);
          vl_mid = _mm_blend_epi16 (vl_mid,v0_mid,1);
          __m128i vr_mid = _mm_srli_si128 (v0_mid,2);
          vr_mid = _mm_blend_epi16 (vr_mid,_mm_slli_si128 (v1_mid,14),0x80);

          // OR of the nine neighborhood equality masks.
          __m128i v_mask = _mm_cmpeq_epi16(vl_up,vstrong);
          v_mask = _mm_or_si128(_mm_cmpeq_epi16(v0_up,vstrong),v_mask);
          v_mask = _mm_or_si128(_mm_cmpeq_epi16(vr_up,vstrong),v_mask);
          v_mask = _mm_or_si128(_mm_cmpeq_epi16(vl_mid,vstrong),v_mask);
          v_mask = _mm_or_si128(_mm_cmpeq_epi16(v0_mid,vstrong),v_mask);
          v_mask = _mm_or_si128(_mm_cmpeq_epi16(vr_mid,vstrong),v_mask);
          v_mask = _mm_or_si128(_mm_cmpeq_epi16(vl_down,vstrong),v_mask);
          v_mask = _mm_or_si128(_mm_cmpeq_epi16(v0_down,vstrong),v_mask);
          v_mask = _mm_or_si128(_mm_cmpeq_epi16(vr_down,vstrong),v_mask);

          // Keep original pixel where mask is clear, Value elsewhere.
          __m128i vres = _mm_blendv_epi8(v0_mid,vstrong,v_mask);
          _mm_storeu_si128((__m128i*)&p_tmpBuf[x], vres);
        } //x==0
        else
        {
          // Only the look-ahead vectors need loading; v0/vold were rotated
          // in from the previous iteration.
          v1_up = _mm_loadu_si128((const __m128i*)(p_buf_up+x+8));
          v1_down = _mm_loadu_si128((const __m128i*)(p_buf_down+x+8));
          v1_mid = _mm_loadu_si128((const __m128i*)(p_buf+x+8));

          // Left neighbor: lowest element comes from the top of vold.
          __m128i vl_up = _mm_slli_si128 (v0_up,2);
          vl_up = _mm_blend_epi16 (vl_up,_mm_srli_si128 (vold_up,14),1);
          // Right neighbor: topmost element comes from the bottom of v1.
          __m128i vr_up = _mm_srli_si128 (v0_up,2);
          vr_up = _mm_blend_epi16 (vr_up,_mm_slli_si128 (v1_up,14),0x80);

          __m128i vl_down = _mm_slli_si128 (v0_down,2);
          vl_down = _mm_blend_epi16 (vl_down,_mm_srli_si128 (vold_down,14),1);
          __m128i vr_down = _mm_srli_si128 (v0_down,2);
          vr_down = _mm_blend_epi16 (vr_down,_mm_slli_si128 (v1_down,14),0x80);

          __m128i vl_mid = _mm_slli_si128 (v0_mid,2);
          vl_mid = _mm_blend_epi16 (vl_mid,_mm_srli_si128 (vold_mid,14),1);
          __m128i vr_mid = _mm_srli_si128 (v0_mid,2);
          vr_mid = _mm_blend_epi16 (vr_mid,_mm_slli_si128 (v1_mid,14),0x80);

          // OR of the nine neighborhood equality masks.
          __m128i v_mask = _mm_cmpeq_epi16(vl_up,vstrong);
          v_mask = _mm_or_si128(_mm_cmpeq_epi16(v0_up,vstrong),v_mask);
          v_mask = _mm_or_si128(_mm_cmpeq_epi16(vr_up,vstrong),v_mask);
          v_mask = _mm_or_si128(_mm_cmpeq_epi16(vl_mid,vstrong),v_mask);
          v_mask = _mm_or_si128(_mm_cmpeq_epi16(v0_mid,vstrong),v_mask);
          v_mask = _mm_or_si128(_mm_cmpeq_epi16(vr_mid,vstrong),v_mask);
          v_mask = _mm_or_si128(_mm_cmpeq_epi16(vl_down,vstrong),v_mask);
          v_mask = _mm_or_si128(_mm_cmpeq_epi16(v0_down,vstrong),v_mask);
          v_mask = _mm_or_si128(_mm_cmpeq_epi16(vr_down,vstrong),v_mask);

          __m128i vres = _mm_blendv_epi8(v0_mid,vstrong,v_mask);
          _mm_storeu_si128((__m128i*)&p_tmpBuf[x], vres);
        }
        // Rotate the pipeline registers for the next iteration.
        vold_up = v0_up;
        vold_down = v0_down;
        vold_mid = v0_mid;
        v0_up = v1_up;
        v0_down = v1_down;
        v0_mid = v1_mid;
      } //for x
      // last column (final 8 pixels): right neighbor replicates the right
      // border pixel instead of reading past the row end.
      {
        __m128i vl_up = _mm_slli_si128 (v0_up,2);
        vl_up = _mm_blend_epi16 (vl_up,_mm_srli_si128 (vold_up,14),1);
        __m128i vr_up = _mm_srli_si128 (v0_up,2);
        vr_up = _mm_blend_epi16 (vr_up,v0_up,0x80);

        __m128i vl_down = _mm_slli_si128 (v0_down,2);
        vl_down = _mm_blend_epi16 (vl_down,_mm_srli_si128 (vold_down,14),1);
        __m128i vr_down = _mm_srli_si128 (v0_down,2);
        vr_down = _mm_blend_epi16(vr_down,v0_down,0x80);

        __m128i vl_mid = _mm_slli_si128 (v0_mid,2);
        vl_mid = _mm_blend_epi16 (vl_mid,_mm_srli_si128 (vold_mid,14),1);
        __m128i vr_mid = _mm_srli_si128 (v0_mid,2);
        vr_mid = _mm_blend_epi16 (vr_mid,v0_mid,0x80);

        // OR of the nine neighborhood equality masks.
        __m128i v_mask = _mm_cmpeq_epi16(vl_up,vstrong);
        v_mask = _mm_or_si128(_mm_cmpeq_epi16(v0_up,vstrong),v_mask);
        v_mask = _mm_or_si128(_mm_cmpeq_epi16(vr_up,vstrong),v_mask);
        v_mask = _mm_or_si128(_mm_cmpeq_epi16(vl_mid,vstrong),v_mask);
        v_mask = _mm_or_si128(_mm_cmpeq_epi16(v0_mid,vstrong),v_mask);
        v_mask = _mm_or_si128(_mm_cmpeq_epi16(vr_mid,vstrong),v_mask);
        v_mask = _mm_or_si128(_mm_cmpeq_epi16(vl_down,vstrong),v_mask);
        v_mask = _mm_or_si128(_mm_cmpeq_epi16(v0_down,vstrong),v_mask);
        v_mask = _mm_or_si128(_mm_cmpeq_epi16(vr_down,vstrong),v_mask);

        __m128i vres = _mm_blendv_epi8(v0_mid,vstrong,v_mask);
        _mm_storeu_si128((__m128i*)&p_tmpBuf[x], vres);
      }
    } //!AVX
    // Advance the output pointer to the next scratch row.
    // NOTE(review): p_tmpBuf was initialized from Wbuf->Y() but advances by
    // the compID stride — consistent only for the luma component.
    p_tmpBuf+=Wbuf->get(compID).stride;
  }//y

  // Re-extend the border padding, then copy the dilated result back.
  buff->get(compID).extendBorderPel( padding, padding );
  buff->get(compID).copyFrom( Wbuf->bufs[0] );

  iter++;

  // Tail-recurse for the remaining dilation passes.
  iter = dilation_SIMD<vext> ( buff,Wbuf,
                               bitDepth,
                               compID,
                               numIter,
                               iter,
                               Value);

  return iter;
}
932 | | |
933 | | template<X86_VEXT vext> |
934 | | double calcVarSse( const Pel* org, const ptrdiff_t origStride, const int w, const int h ); |
935 | | |
936 | | |
937 | | template<X86_VEXT vext> |
938 | | int calcMeanSSE ( const Pel* org, const ptrdiff_t origStride, const int w, const int h ) |
939 | 0 | { |
940 | 0 | int avg; |
941 | | // calculate average |
942 | 0 | __m128i xavg32 = _mm_setzero_si128(); |
943 | 0 | __m128i xavg16 = _mm_setzero_si128(); |
944 | 0 | const __m128i xone = _mm_set1_epi16( 1 ); |
945 | 0 | for( int y1 = 0; y1 < h; y1++ ) |
946 | 0 | { |
947 | 0 | xavg16 = _mm_setzero_si128(); |
948 | 0 | for( int x1 = 0; x1 < w; x1 += 8 ) |
949 | 0 | { |
950 | 0 | xavg16 = _mm_add_epi16( xavg16, _mm_loadu_si128( ( const __m128i* ) ( org + x1 + y1 * origStride ) ) ); |
951 | 0 | } |
952 | 0 | xavg32 = _mm_add_epi32( xavg32, _mm_madd_epi16( xone, xavg16 ) ); |
953 | 0 | } |
954 | |
|
955 | 0 | xavg32 = _mm_hadd_epi32( xavg32, xavg32 ); |
956 | 0 | xavg32 = _mm_hadd_epi32( xavg32, xavg32 ); |
957 | 0 | xavg32 = _mm_shuffle_epi32( xavg32, 0 ); |
958 | 0 | avg = _mm_extract_epi32 (xavg32, 0); |
959 | 0 | return avg; |
960 | 0 | } Unexecuted instantiation: int vvenc::calcMeanSSE<(vvenc::x86_simd::X86_VEXT)1>(short const*, long, int, int) Unexecuted instantiation: int vvenc::calcMeanSSE<(vvenc::x86_simd::X86_VEXT)4>(short const*, long, int, int) |
961 | | |
962 | | #if ENABLE_SIMD_OPT_FGA |
963 | | template<X86_VEXT vext> |
964 | | void Canny::_initFGACannyX86() |
965 | 0 | { |
966 | 0 | gradient = gradient_SIMD<vext>; |
967 | 0 | } Unexecuted instantiation: void vvenc::Canny::_initFGACannyX86<(vvenc::x86_simd::X86_VEXT)1>() Unexecuted instantiation: void vvenc::Canny::_initFGACannyX86<(vvenc::x86_simd::X86_VEXT)4>() |
968 | | template void Canny::_initFGACannyX86<SIMDX86>(); |
969 | | |
970 | | template<X86_VEXT vext> |
971 | | void Morph::_initFGAMorphX86() |
972 | 0 | { |
973 | 0 | dilation = dilation_SIMD<vext>; |
974 | 0 | } Unexecuted instantiation: void vvenc::Morph::_initFGAMorphX86<(vvenc::x86_simd::X86_VEXT)1>() Unexecuted instantiation: void vvenc::Morph::_initFGAMorphX86<(vvenc::x86_simd::X86_VEXT)4>() |
975 | | template void Morph::_initFGAMorphX86<SIMDX86>(); |
976 | | |
977 | | |
// Installs the SIMD variance/mean hooks of the film-grain analyzer.
// NOTE(review): the body is guarded by ENABLE_SIMD_OPT_MCTF although the
// enclosing section is guarded by ENABLE_SIMD_OPT_FGA — presumably because
// calcVarSse is shared with the MCTF SIMD code; confirm the macro is the
// intended one.
template<X86_VEXT vext>
void FGAnalyzer::_initFGAnalyzerX86()
{
#if ENABLE_SIMD_OPT_MCTF
  calcVar = calcVarSse<vext>;
  calcMean = calcMeanSSE<vext>;
#endif
}
986 | | |
987 | | template void FGAnalyzer::_initFGAnalyzerX86<SIMDX86>(); |
988 | | #endif |
989 | | |
990 | | } // namespace vvenc |
991 | | |
992 | | //! \} |
993 | | |
994 | | #endif // TARGET_SIMD_X86 |
995 | | //! \} |