Coverage Report

Created: 2026-04-01 07:49

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/vvenc/source/Lib/CommonLib/x86/FGAX86.h
Line
Count
Source
1
/* -----------------------------------------------------------------------------
2
The copyright in this software is being made available under the Clear BSD
3
License, included below. No patent rights, trademark rights and/or 
4
other Intellectual Property Rights other than the copyrights concerning 
5
the Software are granted under this license.
6
7
The Clear BSD License
8
9
Copyright (c) 2019-2026, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors.
10
All rights reserved.
11
12
Redistribution and use in source and binary forms, with or without modification,
13
are permitted (subject to the limitations in the disclaimer below) provided that
14
the following conditions are met:
15
16
     * Redistributions of source code must retain the above copyright notice,
17
     this list of conditions and the following disclaimer.
18
19
     * Redistributions in binary form must reproduce the above copyright
20
     notice, this list of conditions and the following disclaimer in the
21
     documentation and/or other materials provided with the distribution.
22
23
     * Neither the name of the copyright holder nor the names of its
24
     contributors may be used to endorse or promote products derived from this
25
     software without specific prior written permission.
26
27
NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
28
THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
29
CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
30
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
31
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
32
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
33
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
34
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
35
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
36
IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38
POSSIBILITY OF SUCH DAMAGE.
39
40
41
------------------------------------------------------------------------------------------- */
42
/** \file     FGAX86.h
43
    \brief    SIMD for FilmGrainAnalyse
44
*/
45
46
//! \ingroup CommonLib
47
//! \{
48
49
//#include "CommonLib/CommonDef.h"
50
#include "CommonDefX86.h"
51
#include "SEIFilmGrainAnalyzer.h"
52
53
54
//#ifdef TARGET_SIMD_X86
55
#if defined(TARGET_SIMD_X86)  && ENABLE_SIMD_OPT_FGA
56
57
//! \ingroup CommonLib
58
//! \{
59
60
namespace vvenc {
61
#ifdef USE_AVX2
62
/* -----------------------------------------------------------------------------
63
atan2 aproximation taken from:
64
https://mazzo.li/posts/vectorized-atan2.html
65
------------------------------------------------------------------------------------------- */
66
0
inline __m256 atan_avx_approximation(__m256 x) {
67
  // Store the coefficients -- `_mm256_set1_ps` creates a vector
68
  // with the same value in every element.
69
0
  __m256 a1  = _mm256_set1_ps( 0.99997726f);
70
0
  __m256 a3  = _mm256_set1_ps(-0.33262347f);
71
0
  __m256 a5  = _mm256_set1_ps( 0.19354346f);
72
0
  __m256 a7  = _mm256_set1_ps(-0.11643287f);
73
0
  __m256 a9  = _mm256_set1_ps( 0.05265332f);
74
0
  __m256 a11 = _mm256_set1_ps(-0.01172120f);
75
  // Compute the polynomial on an 8-vector with FMA.
76
0
  __m256 x_sq = _mm256_mul_ps(x, x);
77
0
  __m256 result;
78
0
  result = a11;
79
0
  result = _mm256_add_ps(_mm256_mul_ps(x_sq, result), a9);
80
0
  result = _mm256_add_ps(_mm256_mul_ps(x_sq, result), a7);
81
0
  result = _mm256_add_ps(_mm256_mul_ps(x_sq, result), a5);
82
0
  result = _mm256_add_ps(_mm256_mul_ps(x_sq, result), a3);
83
0
  result = _mm256_add_ps(_mm256_mul_ps(x_sq, result), a1);
84
0
  result = _mm256_mul_ps(x, result);
85
0
  return result;
86
0
}
87
#endif
88
0
inline __m128 atan_avx_approximation(__m128 x) {
89
  // Store the coefficients -- `_mm256_set1_ps` creates a vector
90
  // with the same value in every element.
91
0
  __m128 a1  = _mm_set1_ps( 0.99997726f);
92
0
  __m128 a3  = _mm_set1_ps(-0.33262347f);
93
0
  __m128 a5  = _mm_set1_ps( 0.19354346f);
94
0
  __m128 a7  = _mm_set1_ps(-0.11643287f);
95
0
  __m128 a9  = _mm_set1_ps( 0.05265332f);
96
0
  __m128 a11 = _mm_set1_ps(-0.01172120f);
97
  // Compute the polynomial on an 8-vector with FMA.
98
0
  __m128 x_sq = _mm_mul_ps(x, x);
99
0
  __m128 result;
100
0
  result = a11;
101
0
  result = _mm_add_ps(_mm_mul_ps(x_sq, result), a9);
102
0
  result = _mm_add_ps(_mm_mul_ps(x_sq, result), a7);
103
0
  result = _mm_add_ps(_mm_mul_ps(x_sq, result), a5);
104
0
  result = _mm_add_ps(_mm_mul_ps(x_sq, result), a3);
105
0
  result = _mm_add_ps(_mm_mul_ps(x_sq, result), a1);
106
0
  result = _mm_mul_ps(x, result);
107
0
  return result;
108
0
}
109
110
template<X86_VEXT vext>
111
void gradient_SIMD (PelStorage *buff1, PelStorage *buff2,
112
                    PelStorage *AccGxBuf, PelStorage *AccGyBuf,
113
                    unsigned int width, unsigned int height,
114
                    unsigned int bitDepth, ComponentID compID)
115
0
{
116
  // buff1 - magnitude; buff2 - orientation (Only luma in buff2)
117
0
  const unsigned int convWidthS=CONV_WIDTH_S;
118
0
  const int maxClpRange = (1 << bitDepth) - 1;
119
0
  const int padding     = convWidthS / 2;
120
0
  Pel* p_buf1;
121
0
  Pel* p_buf1_up;
122
0
  Pel* p_buf1_down;
123
0
  int stride  = buff1->Y().stride;
124
0
  Pel* p_ACC = AccGxBuf->Y().buf;
125
0
  Pel* p_ACC_Y = AccGyBuf->Y().buf;
126
127
0
  int res16 = width & 0xf;
128
  // avoid compiler warnings
129
0
  __m128i v0_mid =  _mm_set1_epi16 (0);
130
0
  __m128i vold_down =  _mm_set1_epi16 (0);
131
0
  __m128i vold_up =  _mm_set1_epi16 (0);
132
0
  __m128i v0_down =  _mm_set1_epi16 (0);
133
0
  __m128i v0_up =  _mm_set1_epi16 (0);
134
0
  __m128i vold_mid =  _mm_set1_epi16 (0);
135
136
0
  for (int y = 0; y < height; y++)
137
0
  {
138
0
    p_buf1=buff1->Y().buf + y*stride;
139
0
    if (y==0)
140
0
    {
141
0
      p_buf1_up = p_buf1;
142
0
      p_buf1_down = p_buf1_up+stride;
143
0
    }
144
0
    else if (y==height-1)
145
0
    {
146
0
      p_buf1_down = p_buf1;
147
0
      p_buf1_up = p_buf1_down - stride;
148
0
    }
149
0
    else
150
0
    {
151
0
      p_buf1_up = p_buf1 - stride;  //starts at 1 now
152
0
      p_buf1_down = p_buf1+stride;
153
0
    }
154
0
    if( vext >= AVX2 && !res16)
155
0
    {
156
#ifdef USE_AVX2
157
      __m256i  v0_up;
158
      __m256i  v0_down;
159
      __m256i  v0_mid;
160
      int x;
161
0
      for (x=0; x < width-16; x+=16)
162
0
      {
163
0
        if (x==0)
164
0
        {
165
0
          v0_up = _mm256_lddqu_si256((const __m256i *)(p_buf1_up));
166
0
          __m256i  vr_up = _mm256_lddqu_si256((const __m256i *)(p_buf1_up+1));
167
0
          v0_down = _mm256_lddqu_si256((const __m256i *)(p_buf1_down));
168
0
          __m256i  vr_down = _mm256_lddqu_si256((const __m256i *)(p_buf1_down+1));
169
170
          __m256i  vl_up = _mm256_slli_si256 (v0_up,2);  // jeweils der unterste fehlt, aus vold holen
171
          __m256i  tmp = _mm256_permute4x64_epi64 (v0_up,0x10);
172
          tmp = _mm256_bsrli_epi128(tmp,6);
173
          tmp = _mm256_inserti128_si256(tmp,_mm256_castsi256_si128 (v0_up),0);
174
          vl_up = _mm256_blend_epi16(vl_up,tmp,1);
175
          __m256i  vm_up = _mm256_slli_epi16 (v0_up,1);  // middle *2
176
          __m256i  acc_up = _mm256_adds_epi16 (vm_up,vl_up);
177
          acc_up = _mm256_adds_epi16 (acc_up,vr_up);
178
179
          __m256i  vl_down = _mm256_slli_si256 (v0_down,2);  // jeweils der unterste fehlt, aus vold holen
180
          tmp = _mm256_permute4x64_epi64 (v0_down,0x10);
181
          tmp = _mm256_bsrli_epi128(tmp,6);
182
          tmp = _mm256_inserti128_si256(tmp,_mm256_castsi256_si128 (v0_down),0);
183
          vl_down = _mm256_blend_epi16(vl_down,tmp,1);
184
          __m256i  vm_down = _mm256_slli_epi16 (v0_down,1);  // middle *2
185
          __m256i  acc_down = _mm256_adds_epi16 (vm_down,vl_down);
186
          acc_down = _mm256_adds_epi16 (acc_down,vr_down);
187
188
          __m256i  acc = _mm256_subs_epi16 (acc_down,acc_up);
189
          _mm256_storeu_si256((__m256i *)&p_ACC[0], acc);
190
191
          // mid
192
          v0_mid = _mm256_lddqu_si256((const __m256i *)(p_buf1));
193
          __m256i  vr_mid = _mm256_lddqu_si256((const __m256i *)(p_buf1+1));
194
          __m256i  vl_mid = _mm256_slli_si256 (v0_mid,2);  // jeweils der unterste fehlt, aus vold holen
195
          tmp = _mm256_permute4x64_epi64 (v0_mid,0x10);
196
          tmp = _mm256_bsrli_epi128(tmp,6);
197
          tmp = _mm256_inserti128_si256(tmp,_mm256_castsi256_si128 (v0_mid),0);
198
          vl_mid = _mm256_blend_epi16(vl_mid,tmp,1);
199
200
          __m256i acc_right = _mm256_adds_epi16 (vr_up,vr_down);
201
          vr_mid = _mm256_slli_epi16 (vr_mid,1);  // middle *2
202
          acc_right = _mm256_adds_epi16 (acc_right,vr_mid);
203
204
          __m256i acc_left = _mm256_adds_epi16 (vl_up,vl_down);
205
          vl_mid = _mm256_slli_epi16 (vl_mid,1);  // middle *2
206
          acc_left = _mm256_adds_epi16 (acc_left,vl_mid);
207
          acc = _mm256_subs_epi16 (acc_right,acc_left);
208
          _mm256_storeu_si256((__m256i *)&p_ACC_Y[x], acc);
209
        }
210
0
        else
211
0
        {
212
0
          v0_up = _mm256_lddqu_si256((const __m256i *)(p_buf1_up+x));
213
0
          __m256i  vr_up = _mm256_lddqu_si256((const __m256i *)(p_buf1_up+x+1));
214
0
          __m256i  vl_up = _mm256_lddqu_si256((const __m256i *)(p_buf1_up+x-1));
215
216
0
          v0_down = _mm256_lddqu_si256((const __m256i *)(p_buf1_down+x));
217
0
          __m256i  vr_down = _mm256_lddqu_si256((const __m256i *)(p_buf1_down+x+1));
218
0
          __m256i  vl_down = _mm256_lddqu_si256((const __m256i *)(p_buf1_down+x-1));
219
220
0
          __m256i  vm_up = _mm256_slli_epi16 (v0_up,1);  // middle *2
221
0
          __m256i  acc_up = _mm256_adds_epi16 (vm_up,vl_up);
222
0
          acc_up = _mm256_adds_epi16 (acc_up,vr_up);
223
224
0
          __m256i  vm_down = _mm256_slli_epi16 (v0_down,1);  // middle *2
225
0
          __m256i  acc_down = _mm256_adds_epi16 (vm_down,vl_down);
226
0
          acc_down = _mm256_adds_epi16 (acc_down,vr_down);
227
228
0
          __m256i  acc = _mm256_subs_epi16 (acc_down,acc_up);
229
0
          _mm256_storeu_si256((__m256i *)&p_ACC[x], acc);
230
231
          // mid
232
0
          v0_mid = _mm256_lddqu_si256((const __m256i *)(p_buf1+x));
233
0
          __m256i  vr_mid = _mm256_lddqu_si256((const __m256i *)(p_buf1+x+1));
234
0
          __m256i  vl_mid = _mm256_lddqu_si256((const __m256i *)(p_buf1+x-1));
235
236
0
          __m256i acc_right = _mm256_adds_epi16 (vr_up,vr_down);
237
0
          vr_mid = _mm256_slli_epi16 (vr_mid,1);  // middle *2
238
0
          acc_right = _mm256_adds_epi16 (acc_right,vr_mid);
239
240
0
          __m256i acc_left = _mm256_adds_epi16 (vl_up,vl_down);
241
0
          vl_mid = _mm256_slli_epi16 (vl_mid,1);  // middle *2
242
0
          acc_left = _mm256_adds_epi16 (acc_left,vl_mid);
243
0
          acc = _mm256_subs_epi16 (acc_right,acc_left);
244
0
          _mm256_storeu_si256((__m256i *)&p_ACC_Y[x], acc);
245
0
        }
246
0
      }  //for x
247
      // last collum
248
      {
249
        v0_up = _mm256_lddqu_si256((const __m256i *)(p_buf1_up+x));
250
        __m256i  vl_up = _mm256_lddqu_si256((const __m256i *)(p_buf1_up+x-1));
251
252
        v0_down = _mm256_lddqu_si256((const __m256i *)(p_buf1_down+x));
253
        __m256i  vl_down = _mm256_lddqu_si256((const __m256i *)(p_buf1_down+x-1));
254
255
        __m256i  vr_up = _mm256_srli_si256 (v0_up,2);  // jeweils der oberste fehlt
256
        vr_up = _mm256_insert_epi16 (vr_up,_mm256_extract_epi16 (v0_up,8), 7);
257
        vr_up = _mm256_insert_epi16 (vr_up,_mm256_extract_epi16 (v0_up,15), 15);
258
259
        __m256i  vr_down = _mm256_srli_si256 (v0_down,2);  // jeweils der oberste fehlt
260
        vr_down = _mm256_insert_epi16 (vr_down,_mm256_extract_epi16 (v0_down,8), 7);
261
        vr_down = _mm256_insert_epi16 (vr_down,_mm256_extract_epi16 (v0_down,15), 15);
262
263
        __m256i  vm_up = _mm256_slli_epi16 (v0_up,1);  // middle *2
264
        __m256i  acc_up = _mm256_adds_epi16 (vm_up,vl_up);
265
        acc_up = _mm256_adds_epi16 (acc_up,vr_up);
266
267
        __m256i  vm_down = _mm256_slli_epi16 (v0_down,1);  // middle *2
268
        __m256i  acc_down = _mm256_adds_epi16 (vm_down,vl_down);
269
        acc_down = _mm256_adds_epi16 (acc_down,vr_down);
270
271
        __m256i  acc = _mm256_subs_epi16 (acc_down,acc_up);
272
        _mm256_storeu_si256((__m256i *)&p_ACC[x], acc);
273
274
        // mid
275
        v0_mid = _mm256_lddqu_si256((const __m256i *)(p_buf1+x));
276
        __m256i  vl_mid = _mm256_lddqu_si256((const __m256i *)(p_buf1+x-1));
277
278
        __m256i  vr_mid = _mm256_srli_si256 (v0_mid,2);  // jeweils der oberste fehlt
279
        vr_mid = _mm256_insert_epi16 (vr_mid,_mm256_extract_epi16 (v0_mid,8), 7);
280
        vr_mid = _mm256_insert_epi16 (vr_mid,_mm256_extract_epi16 (v0_mid,15), 15);
281
282
        __m256i acc_right = _mm256_adds_epi16 (vr_up,vr_down);
283
        vr_mid = _mm256_slli_epi16 (vr_mid,1);  // middle *2
284
        acc_right = _mm256_adds_epi16 (acc_right,vr_mid);
285
286
        __m256i acc_left = _mm256_adds_epi16 (vl_up,vl_down);
287
        vl_mid = _mm256_slli_epi16 (vl_mid,1);  // middle *2
288
        acc_left = _mm256_adds_epi16 (acc_left,vl_mid);
289
        acc = _mm256_subs_epi16 (acc_right,acc_left);
290
        _mm256_storeu_si256((__m256i *)&p_ACC_Y[x], acc);
291
      }
292
#endif
293
0
    }  //AVX2
294
0
    else
295
0
    {
296
0
      __m128i v1_up;
297
0
      __m128i v1_down;
298
0
      __m128i v1_mid;
299
0
      int x;
300
0
      for (x=0; x < width-8; x+=8)
301
0
      {
302
0
        if (x==0)
303
0
        {
304
0
          v0_up = _mm_loadu_si128((const __m128i*)(p_buf1_up));
305
0
          v1_up = _mm_loadu_si128((const __m128i*)(p_buf1_up+8));
306
0
          v0_down = _mm_loadu_si128((const __m128i*)(p_buf1_down));
307
0
          v1_down = _mm_loadu_si128((const __m128i*)(p_buf1_down+8));
308
0
          v0_mid = _mm_loadu_si128((const __m128i*)(p_buf1));
309
0
          v1_mid = _mm_loadu_si128((const __m128i*)(p_buf1+8));
310
311
0
          __m128i vl_up = _mm_slli_si128 (v0_up,2);  // der unterste fehlt, aus vold holen
312
0
          vl_up = _mm_blend_epi16 (vl_up,v0_up,1);
313
0
          __m128i vr_up = _mm_srli_si128 (v0_up,2);  // der oberste fehlt, aus v1 holen
314
0
          vr_up = _mm_blend_epi16 (vr_up,_mm_slli_si128 (v1_up,14),0x80);
315
316
0
          __m128i vl_down = _mm_slli_si128 (v0_down,2);  // der unterste fehlt,
317
0
          vl_down = _mm_blend_epi16 (vl_down,v0_down,1);
318
0
          __m128i vr_down = _mm_srli_si128 (v0_down,2);  // der oberste fehlt, aus v1 holen
319
0
          vr_down = _mm_blend_epi16 (vr_down,_mm_slli_si128 (v1_down,14),0x80);
320
321
0
          __m128i vl_mid = _mm_slli_si128 (v0_mid,2);  // der unterste fehlt,
322
0
          vl_mid = _mm_blend_epi16 (vl_mid,v0_mid,1);
323
0
          __m128i vr_mid = _mm_srli_si128 (v0_mid,2);  // der oberste fehlt, aus v1 holen
324
0
          vr_mid = _mm_blend_epi16 (vr_mid,_mm_slli_si128 (v1_mid,14),0x80);
325
326
0
          __m128i vm_up = _mm_slli_epi16 (v0_up,1);  // middle *2
327
0
          __m128i acc_up = _mm_adds_epi16 (vm_up,vl_up);
328
0
          acc_up = _mm_adds_epi16 (acc_up,vr_up);
329
330
0
          __m128i vm_down = _mm_slli_epi16 (v0_down,1);  // middle *2
331
0
          __m128i acc_down = _mm_adds_epi16 (vm_down,vl_down);
332
0
          acc_down = _mm_adds_epi16 (acc_down,vr_down);
333
334
0
          __m128i acc = _mm_subs_epi16 (acc_down,acc_up);
335
0
          _mm_storeu_si128((__m128i*)&p_ACC[x], acc);
336
337
0
          __m128i acc_right = _mm_adds_epi16 (vr_up,vr_down);
338
0
          vr_mid = _mm_slli_epi16 (vr_mid,1);  // middle *2
339
0
          acc_right = _mm_adds_epi16 (acc_right,vr_mid);
340
341
0
          __m128i acc_left = _mm_adds_epi16 (vl_up,vl_down);
342
0
          vl_mid = _mm_slli_epi16 (vl_mid,1);  // middle *2
343
0
          acc_left = _mm_adds_epi16 (acc_left,vl_mid);
344
0
          acc = _mm_subs_epi16 (acc_right,acc_left);
345
0
          _mm_storeu_si128((__m128i*)&p_ACC_Y[x], acc);
346
0
        }
347
0
        else
348
0
        {
349
0
          v1_up = _mm_loadu_si128((const __m128i*)(p_buf1_up+x+8));
350
0
          v1_down = _mm_loadu_si128((const __m128i*)(p_buf1_down+x+8));
351
0
          v1_mid = _mm_loadu_si128((const __m128i*)(p_buf1+x+8));
352
353
0
          __m128i vl_up = _mm_slli_si128 (v0_up,2);  // der unterste fehlt, aus vold holen
354
0
          vl_up = _mm_blend_epi16 (vl_up,_mm_srli_si128 (vold_up,14),1);
355
0
          __m128i vr_up = _mm_srli_si128 (v0_up,2);  // der oberste fehlt, aus v1 holen
356
0
          vr_up = _mm_blend_epi16 (vr_up,_mm_slli_si128 (v1_up,14),0x80);
357
358
0
          __m128i vl_down = _mm_slli_si128 (v0_down,2);  // der unterste fehlt, aus vold holen
359
0
          vl_down = _mm_blend_epi16 (vl_down,_mm_srli_si128 (vold_down,14),1);
360
0
          __m128i vr_down = _mm_srli_si128 (v0_down,2);  // der oberste fehlt, aus v1 holen
361
0
          vr_down = _mm_blend_epi16 (vr_down,_mm_slli_si128 (v1_down,14),0x80);
362
363
0
          __m128i vl_mid = _mm_slli_si128 (v0_mid,2);  // der unterste fehlt, aus vold holen
364
0
          vl_mid = _mm_blend_epi16 (vl_mid,_mm_srli_si128 (vold_mid,14),1);
365
0
          __m128i vr_mid = _mm_srli_si128 (v0_mid,2);  // der oberste fehlt, aus v1 holen
366
0
          vr_mid = _mm_blend_epi16 (vr_mid,_mm_slli_si128 (v1_mid,14),0x80);
367
368
0
          __m128i vm_up = _mm_slli_epi16 (v0_up,1);  // middle *2
369
0
          __m128i acc_up = _mm_adds_epi16 (vm_up,vl_up);
370
0
          acc_up = _mm_adds_epi16 (acc_up,vr_up);
371
372
0
          __m128i vm_down = _mm_slli_epi16 (v0_down,1);  // middle *2
373
0
          __m128i acc_down = _mm_adds_epi16 (vm_down,vl_down);
374
0
          acc_down = _mm_adds_epi16 (acc_down,vr_down);
375
0
          __m128i acc = _mm_subs_epi16 (acc_down,acc_up);
376
0
          _mm_storeu_si128((__m128i*)&p_ACC[x], acc);
377
378
0
          __m128i acc_right = _mm_adds_epi16 (vr_up,vr_down);
379
0
          vr_mid = _mm_slli_epi16 (vr_mid,1);  // middle *2
380
0
          acc_right = _mm_adds_epi16 (acc_right,vr_mid);
381
382
0
          __m128i acc_left = _mm_adds_epi16 (vl_up,vl_down);
383
0
          vl_mid = _mm_slli_epi16 (vl_mid,1);  // middle *2
384
0
          acc_left = _mm_adds_epi16 (acc_left,vl_mid);
385
0
          acc = _mm_subs_epi16 (acc_right,acc_left);
386
0
          _mm_storeu_si128((__m128i*)&p_ACC_Y[x], acc);
387
0
        }
388
0
        vold_up = v0_up;
389
0
        vold_down = v0_down;
390
0
        vold_mid = v0_mid;
391
0
        v0_up = v1_up;
392
0
        v0_down = v1_down;
393
0
        v0_mid = v1_mid;
394
0
      }  //for x
395
      // last collum
396
0
      {
397
0
        __m128i vl_up = _mm_slli_si128 (v0_up,2);  // der unterste fehlt, aus vold holen
398
0
        vl_up = _mm_blend_epi16 (vl_up,_mm_srli_si128 (vold_up,14),1);
399
0
        __m128i vr_up = _mm_srli_si128 (v0_up,2);  // der oberste fehlt, aus v0 holen
400
0
        vr_up = _mm_blend_epi16 (vr_up,v0_up,0x80);
401
402
0
        __m128i vl_down = _mm_slli_si128 (v0_down,2);  // der unterste fehlt, aus vold holen
403
0
        vl_down = _mm_blend_epi16 (vl_down,_mm_srli_si128 (vold_down,14),1);
404
0
        __m128i vr_down = _mm_srli_si128 (v0_down,2);  // der oberste fehlt, aus v0 holen
405
0
        vr_down = _mm_blend_epi16(vr_down,v0_down,0x80);
406
407
0
        __m128i vl_mid = _mm_slli_si128 (v0_mid,2);  // der unterste fehlt, aus vold holen
408
0
        vl_mid = _mm_blend_epi16 (vl_mid,_mm_srli_si128 (vold_mid,14),1);
409
0
        __m128i vr_mid = _mm_srli_si128 (v0_mid,2);  // der oberste fehlt, aus v0 holen
410
0
        vr_mid = _mm_blend_epi16 (vr_mid,v0_mid,0x80);
411
412
0
        __m128i vm_up = _mm_slli_epi16 (v0_up,1);  // middle *2
413
0
        __m128i acc_up = _mm_adds_epi16 (vm_up,vl_up);
414
0
        acc_up = _mm_adds_epi16 (acc_up,vr_up);
415
416
0
        __m128i vm_down = _mm_slli_epi16 (v0_down,1);  // middle *2
417
0
        __m128i acc_down = _mm_adds_epi16 (vm_down,vl_down);
418
0
        acc_down = _mm_adds_epi16 (acc_down,vr_down);
419
420
0
        __m128i acc = _mm_subs_epi16 (acc_down,acc_up);
421
0
        _mm_storeu_si128((__m128i*)&p_ACC[x], acc);
422
423
0
        __m128i acc_right = _mm_adds_epi16 (vr_up,vr_down);
424
0
        vr_mid = _mm_slli_epi16 (vr_mid,1);  // middle *2
425
0
        acc_right = _mm_adds_epi16 (acc_right,vr_mid);
426
427
0
        __m128i acc_left = _mm_adds_epi16 (vl_up,vl_down);
428
0
        vl_mid = _mm_slli_epi16 (vl_mid,1);  // middle *2
429
0
        acc_left = _mm_adds_epi16 (acc_left,vl_mid);
430
0
        acc = _mm_subs_epi16 (acc_right,acc_left);
431
0
        _mm_storeu_si128((__m128i*)&p_ACC_Y[x], acc);
432
0
      }
433
0
    }
434
0
    p_ACC+=width;
435
0
    p_ACC_Y+=width;
436
0
  } // y
437
438
  // magnitude
439
0
  p_ACC = AccGxBuf->Y().buf;
440
0
  p_ACC_Y = AccGyBuf->Y().buf;
441
442
0
  for (int y = 0; y < height; y++)
443
0
  {
444
0
    p_buf1=buff1->Y().buf + y*stride;
445
446
0
    if( vext >= AVX2 && !res16)
447
0
    {
448
#ifdef USE_AVX2
449
      int x;
450
      __m256i vbdmax  = _mm256_set1_epi16   ( maxClpRange);
451
452
0
      for (x=0; x < width; x+=16)
453
0
      {
454
0
        __m256i GX = _mm256_loadu_si256((const __m256i*)&p_ACC[x]);
455
0
        __m256i GY = _mm256_loadu_si256((const __m256i*)&p_ACC_Y[x]);
456
0
        GX = _mm256_abs_epi16(GX);
457
0
        GY = _mm256_abs_epi16(GY);
458
0
        GX = _mm256_add_epi16(GX,GY);
459
0
        GX = _mm256_srli_epi16(GX,1);
460
0
        GX = _mm256_min_epi16 (GX,vbdmax);
461
0
        _mm256_storeu_si256((__m256i*)&p_buf1[x], GX);
462
0
      }
463
#endif
464
0
    }  //AVX2
465
0
    else
466
0
    {
467
0
      int x;
468
0
      __m128i vbdmax  = _mm_set1_epi16   ( maxClpRange);
469
470
0
      for (x=0; x < width; x+=8)
471
0
      {
472
0
        __m128i GX = _mm_loadu_si128((const __m128i*)&p_ACC[x]);
473
0
        __m128i GY = _mm_loadu_si128((const __m128i*)&p_ACC_Y[x]);
474
0
        GX = _mm_abs_epi16(GX);
475
0
        GY = _mm_abs_epi16(GY);
476
0
        GX = _mm_add_epi16(GX,GY);
477
0
        GX = _mm_srli_epi16(GX,1);
478
0
        GX = _mm_min_epi16 (GX,vbdmax);
479
0
        _mm_storeu_si128((__m128i*)&p_buf1[x], GX);
480
0
      }
481
0
    }
482
0
    p_ACC+=width;
483
0
    p_ACC_Y+=width;
484
0
    p_buf1+=stride;
485
0
  }
486
487
  // Loop through each pixel
488
0
  Pel* pX = AccGxBuf->Y().buf;
489
0
  Pel* pY = AccGyBuf->Y().buf;
490
0
  int strideX = AccGxBuf->Y().stride;
491
0
  int strideY = AccGyBuf->Y().stride;
492
493
0
  Pel* pQD = buff2->Y().buf;
494
495
0
  for (int y = 0; y < height; y++)
496
0
  {
497
0
    if( vext >= AVX2 && !res16)
498
0
    {
499
#ifdef USE_AVX2
500
      // Store pi and pi/2 as constants
501
      const __m256 pi = _mm256_set1_ps((float)PI);
502
      const __m256 pi_2 = _mm256_set1_ps((float)PI_2);
503
      const __m256 vpi_8 = _mm256_set1_ps((float)pi_8);
504
      const __m256 vpi_3_8 = _mm256_set1_ps((float)pi_3_8);
505
      const __m256 vpi_5_8 = _mm256_set1_ps((float)pi_5_8);
506
      const __m256 vpi_7_8 = _mm256_set1_ps((float)pi_7_8);
507
508
      const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));;
509
      const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
510
511
0
      for (int x = 0; x < width; x+=16)
512
0
      {
513
0
        for (int n=0; n<16;n+=8)
514
0
        {
515
0
          __m128i Ix = _mm_loadu_si128((const __m128i*)&pX[x+n]);
516
0
          __m128i Iy = _mm_loadu_si128((const __m128i*)&pY[x+n]);
517
518
0
          __m256 vx = _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(Ix));
519
0
          __m256 vy = _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(Iy));
520
0
          __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(vy, abs_mask),_mm256_and_ps(vx, abs_mask),_CMP_GT_OS);
521
0
          __m256 atan_input = _mm256_div_ps(_mm256_blendv_ps(vy, vx, swap_mask),_mm256_blendv_ps(vx, vy, swap_mask));
522
0
          __m256 result = atan_avx_approximation(atan_input);
523
524
0
          result = _mm256_blendv_ps(result,_mm256_sub_ps(_mm256_or_ps(pi_2, _mm256_and_ps(atan_input, sign_mask)),result),swap_mask);
525
0
          __m256 x_sign_mask = _mm256_castsi256_ps(_mm256_srai_epi32(_mm256_castps_si256(vx), 31));
526
0
          result = _mm256_add_ps(_mm256_and_ps(_mm256_xor_ps(pi, _mm256_and_ps(sign_mask, vy)),x_sign_mask),result);
527
528
          // take abs value
529
0
          result = _mm256_andnot_ps(sign_mask,result);
530
          // compare
531
0
          __m256 QD0 = _mm256_cmp_ps (result,vpi_8,_CMP_LE_OS);
532
0
          QD0 = _mm256_or_ps(QD0,_mm256_cmp_ps (result,vpi_7_8,_CMP_GE_OS));
533
0
          __m256 QD90 = _mm256_cmp_ps (result,vpi_3_8,_CMP_GT_OS);
534
0
          QD90 = _mm256_and_ps(QD90,_mm256_cmp_ps (result,vpi_5_8,_CMP_LE_OS));
535
0
          __m256 QD45 = _mm256_cmp_ps (result,vpi_8,_CMP_GT_OS);
536
0
          QD45 = _mm256_and_ps(QD45,_mm256_cmp_ps (result,vpi_3_8,_CMP_LE_OS));
537
0
          __m256 QD135 = _mm256_cmp_ps (result,vpi_5_8,_CMP_GT_OS);
538
0
          QD135 = _mm256_and_ps(QD135,_mm256_cmp_ps (result,vpi_7_8,_CMP_LE_OS));
539
          // Dy > 0
540
0
          __m256 Neg = _mm256_cmp_ps (vy,_mm256_set1_ps(0.0),_CMP_LT_OS);
541
0
          QD45 = _mm256_xor_ps(QD45,_mm256_and_ps(Neg,QD135));
542
0
          QD135 = _mm256_xor_ps(QD135,_mm256_and_ps(Neg,QD45));
543
544
0
          __m256 FQD = _mm256_set1_ps(0.0);
545
0
          FQD =  _mm256_blendv_ps(FQD,_mm256_set1_ps(90.0),QD90);
546
0
          FQD =  _mm256_blendv_ps(FQD,_mm256_set1_ps(45.0),QD45);
547
0
          FQD =  _mm256_blendv_ps(FQD,_mm256_set1_ps(135.0),QD135);
548
          // integer 32 bit
549
0
          __m256i QD0I = _mm256_cvtps_epi32(FQD);
550
          // integer 16 bit
551
0
          QD0I = _mm256_packus_epi32(QD0I,QD0I);
552
0
          QD0I = _mm256_permute4x64_epi64(QD0I,0x8);
553
0
          _mm_storeu_si128((__m128i*)&pQD[x+n], _mm256_castsi256_si128(QD0I));
554
0
        }
555
0
      }
556
#endif
557
0
    }
558
0
    else      //SSE
559
0
    {
560
      // Store pi and pi/2 as constants
561
0
      const __m128 pi = _mm_set1_ps((float)PI);
562
0
      const __m128 pi_2 = _mm_set1_ps((float)PI_2);
563
0
      const __m128 vpi_8 = _mm_set1_ps((float)pi_8);
564
0
      const __m128 vpi_3_8 = _mm_set1_ps((float)pi_3_8);
565
0
      const __m128 vpi_5_8 = _mm_set1_ps((float)pi_5_8);
566
0
      const __m128 vpi_7_8 = _mm_set1_ps((float)pi_7_8);
567
568
0
      const __m128 abs_mask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF));;
569
0
      const __m128 sign_mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
570
571
0
      for (int x = 0; x < width; x+=8)
572
0
      {
573
0
        for (int n=0; n<8;n+=4)
574
0
        {
575
0
          __m128i Ix = _mm_loadu_si128((const __m128i*)&pX[x+n]);
576
0
          __m128i Iy = _mm_loadu_si128((const __m128i*)&pY[x+n]);
577
578
0
          __m128 vx = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(Ix));
579
0
          __m128 vy = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(Iy));
580
0
          __m128 swap_mask = _mm_cmpgt_ps(_mm_and_ps(vy, abs_mask),_mm_and_ps(vx, abs_mask));
581
0
          __m128 atan_input = _mm_div_ps(_mm_blendv_ps(vy, vx, swap_mask),_mm_blendv_ps(vx, vy, swap_mask));
582
0
          __m128 result = atan_avx_approximation(atan_input);
583
584
0
          result = _mm_blendv_ps(result,_mm_sub_ps(_mm_or_ps(pi_2, _mm_and_ps(atan_input, sign_mask)),result),swap_mask);
585
0
          __m128 x_sign_mask = _mm_castsi128_ps(_mm_srai_epi32(_mm_castps_si128(vx), 31));
586
0
          result = _mm_add_ps(_mm_and_ps(_mm_xor_ps(pi, _mm_and_ps(sign_mask, vy)),x_sign_mask),result);
587
588
          // take abs value
589
0
          result = _mm_andnot_ps(sign_mask,result);
590
          // compare
591
0
          __m128 QD0 = _mm_cmple_ps (result,vpi_8);
592
0
          QD0 = _mm_or_ps(QD0,_mm_cmpge_ps (result,vpi_7_8));
593
0
          __m128 QD90 = _mm_cmpgt_ps (result,vpi_3_8);
594
0
          QD90 = _mm_and_ps(QD90,_mm_cmple_ps (result,vpi_5_8));
595
0
          __m128 QD45 = _mm_cmpgt_ps (result,vpi_8);
596
0
          QD45 = _mm_and_ps(QD45,_mm_cmple_ps (result,vpi_3_8));
597
0
          __m128 QD135 = _mm_cmpgt_ps (result,vpi_5_8);
598
0
          QD135 = _mm_and_ps(QD135,_mm_cmple_ps (result,vpi_7_8));
599
          // Dy > 0
600
0
          __m128 Neg = _mm_cmplt_ps (vy,_mm_set1_ps(0.0));
601
0
          QD45 = _mm_xor_ps(QD45,_mm_and_ps(Neg,QD135));
602
0
          QD135 = _mm_xor_ps(QD135,_mm_and_ps(Neg,QD45));
603
604
0
          __m128 FQD = _mm_set1_ps(0.0);
605
0
          FQD =  _mm_blendv_ps(FQD,_mm_set1_ps(90.0),QD90);
606
0
          FQD =  _mm_blendv_ps(FQD,_mm_set1_ps(45.0),QD45);
607
0
          FQD =  _mm_blendv_ps(FQD,_mm_set1_ps(135.0),QD135);
608
          // integer 32 bit
609
0
          __m128i QD0I = _mm_cvtps_epi32(FQD);
610
          // integer 16 bit
611
0
          QD0I = _mm_packus_epi32(QD0I,QD0I);
612
0
          _mm_storeu_si64((__m128i*)&pQD[x+n],QD0I);
613
0
        }
614
0
      }
615
0
    }
616
0
    pX+=strideX;
617
0
    pY+=strideY;
618
0
    pQD+=buff2->Y().stride;;
619
0
  }
620
621
0
  buff1->get(compID).extendBorderPel(padding, padding);   // extend border for the next steps
622
0
}
Unexecuted instantiation: void vvenc::gradient_SIMD<(vvenc::x86_simd::X86_VEXT)1>(vvenc::PelStorage*, vvenc::PelStorage*, vvenc::PelStorage*, vvenc::PelStorage*, unsigned int, unsigned int, unsigned int, vvenc::ComponentID)
Unexecuted instantiation: void vvenc::gradient_SIMD<(vvenc::x86_simd::X86_VEXT)4>(vvenc::PelStorage*, vvenc::PelStorage*, vvenc::PelStorage*, vvenc::PelStorage*, unsigned int, unsigned int, unsigned int, vvenc::ComponentID)
623
624
// 3x3 morphological dilation of a marker map, SIMD-accelerated (SSE4.1 / AVX2).
// For every sample the output is 'Value' if any sample of its 3x3 neighbourhood
// (borders replicated) already equals 'Value'; otherwise the input sample is
// kept.  The function recurses until 'iter' reaches 'numIter' and returns the
// number of passes performed.  'bitDepth' is not used in this implementation.
//
// NOTE(review): the row pointers and stride are always taken from the luma
// plane (buff->Y()/Wbuf->Y()) while width/height and the output stride come
// from 'compID' -- presumably this is only ever called with the luma
// component; confirm against the callers.
// NOTE(review): the SSE path assumes width is a multiple of 8 and > 8; the
// AVX2 path assumes width is a multiple of 16 and > 16 (otherwise the
// last-column code reads before the start of the row) -- confirm callers
// guarantee this.
template<X86_VEXT vext>
int dilation_SIMD ( PelStorage *buff,     // in/out: plane holding the marker map, dilated in place
                    PelStorage *Wbuf,     // scratch storage; receives the dilated rows
                    unsigned int bitDepth,
                    ComponentID compID,   // selects the plane for width/height/output stride
                    int numIter,          // total number of dilation passes requested
                    int iter,             // index of the current pass
                    Pel Value)            // marker value being dilated
{
  // Recursion anchor: all requested passes done.
  if ( iter == numIter )
  {
    return iter;
  }
  unsigned int width      = buff->get(compID).width,
               height     = buff->get(compID).height;   // Width and Height of current frame
  unsigned int windowSize = KERNELSIZE;
  unsigned int padding    = windowSize / 2;

  Wbuf->bufs[0].copyFrom( buff->get(compID) );

  Pel* p_buf;        // current row
  Pel* p_buf_up;     // row above (replicated at the top border)
  Pel* p_buf_down;   // row below (replicated at the bottom border)
  int stride  = buff->Y().stride;
  Pel* p_tmpBuf = Wbuf->Y().buf;

  // Non-zero when width is not a multiple of 16 -> AVX2 path cannot be used.
  int res16 = width & 0xf;

  // avoid compiler warnings
  __m128i v0_mid =  _mm_set1_epi16 (0);
  __m128i vold_down =  _mm_set1_epi16 (0);
  __m128i vold_up =  _mm_set1_epi16 (0);
  __m128i v0_down =  _mm_set1_epi16 (0);
  __m128i v0_up =  _mm_set1_epi16 (0);
  __m128i vold_mid =  _mm_set1_epi16 (0);

  for (int y = 0; y < height; y++)
  {
    p_buf=buff->Y().buf + y*stride;
    // Vertical border handling: rows outside the frame are replicated.
    if (y==0)
    {
      p_buf_up = p_buf;
      p_buf_down = p_buf_up+stride;
    }
    else if (y==height-1)
    {
      p_buf_down = p_buf;
      p_buf_up = p_buf_down - stride;
    }
    else
    {
      p_buf_up = p_buf - stride;  //starts at 1 now
      p_buf_down = p_buf+stride;
    }
    if( vext >= AVX2 && !res16)
    {
#ifdef USE_AVX2
      __m256i  v0_up;
      __m256i  v0_down;
      __m256i  v0_mid;
      __m256i vstrong =  _mm256_set1_epi16 (Value);

      int x;
      for (x=0; x < width-16; x+=16)
      {
        if (x==0)
        {
          // First 16 columns: the left neighbour of column 0 does not exist,
          // so the left-shifted vectors must replicate element 0 and carry
          // element 7 across the 128-bit lane boundary (element 8).
          v0_up = _mm256_lddqu_si256((const __m256i *)(p_buf_up));
          __m256i  vr_up = _mm256_lddqu_si256((const __m256i *)(p_buf_up+1));
          v0_down = _mm256_lddqu_si256((const __m256i *)(p_buf_down));
          __m256i  vr_down = _mm256_lddqu_si256((const __m256i *)(p_buf_down+1));

          __m256i  vl_up = _mm256_slli_si256 (v0_up,2);  // lowest element of each lane is missing; patch it below
          __m256i  tmp = _mm256_permute4x64_epi64 (v0_up,0x10);
          tmp = _mm256_bsrli_epi128(tmp,6);
          tmp = _mm256_inserti128_si256(tmp,_mm256_castsi256_si128 (v0_up),0);
          vl_up = _mm256_blend_epi16(vl_up,tmp,1);   // element 0 = border replica, element 8 = v0_up[7]

          __m256i  vl_down = _mm256_slli_si256 (v0_down,2);  // lowest element of each lane is missing; patch it below
          tmp = _mm256_permute4x64_epi64 (v0_down,0x10);
          tmp = _mm256_bsrli_epi128(tmp,6);
          tmp = _mm256_inserti128_si256(tmp,_mm256_castsi256_si128 (v0_down),0);
          vl_down = _mm256_blend_epi16(vl_down,tmp,1);

          // mid
          v0_mid = _mm256_lddqu_si256((const __m256i *)(p_buf));
          __m256i  vr_mid = _mm256_lddqu_si256((const __m256i *)(p_buf+1));
          __m256i  vl_mid = _mm256_slli_si256 (v0_mid,2);  // lowest element of each lane is missing; patch it below
          tmp = _mm256_permute4x64_epi64 (v0_mid,0x10);
          tmp = _mm256_bsrli_epi128(tmp,6);
          tmp = _mm256_inserti128_si256(tmp,_mm256_castsi256_si128 (v0_mid),0);
          vl_mid = _mm256_blend_epi16(vl_mid,tmp,1);

          // OR together the "equals Value" masks of all nine neighbours.
          __m256i v_mask = _mm256_cmpeq_epi16(vl_up,vstrong);
          v_mask = _mm256_or_si256(_mm256_cmpeq_epi16(v0_up,vstrong),v_mask);
          v_mask = _mm256_or_si256(_mm256_cmpeq_epi16(vr_up,vstrong),v_mask);
          v_mask = _mm256_or_si256(_mm256_cmpeq_epi16(vl_mid,vstrong),v_mask);
          v_mask = _mm256_or_si256(_mm256_cmpeq_epi16(v0_mid,vstrong),v_mask);
          v_mask = _mm256_or_si256(_mm256_cmpeq_epi16(vr_mid,vstrong),v_mask);
          v_mask = _mm256_or_si256(_mm256_cmpeq_epi16(vl_down,vstrong),v_mask);
          v_mask = _mm256_or_si256(_mm256_cmpeq_epi16(v0_down,vstrong),v_mask);
          v_mask = _mm256_or_si256(_mm256_cmpeq_epi16(vr_down,vstrong),v_mask);

          // Where any neighbour matched, write Value; otherwise keep the centre sample.
          __m256i vres = _mm256_blendv_epi8(v0_mid,vstrong,v_mask);
          _mm256_storeu_si256((__m256i*)&p_tmpBuf[x], vres);
        }
        else
        {
          // Interior columns: neighbours can be loaded directly (unaligned).
          v0_up = _mm256_lddqu_si256((const __m256i *)(p_buf_up+x));
          __m256i  vr_up = _mm256_lddqu_si256((const __m256i *)(p_buf_up+x+1));
          __m256i  vl_up = _mm256_lddqu_si256((const __m256i *)(p_buf_up+x-1));

          v0_down = _mm256_lddqu_si256((const __m256i *)(p_buf_down+x));
          __m256i  vr_down = _mm256_lddqu_si256((const __m256i *)(p_buf_down+x+1));
          __m256i  vl_down = _mm256_lddqu_si256((const __m256i *)(p_buf_down+x-1));

          // mid
          v0_mid = _mm256_lddqu_si256((const __m256i *)(p_buf+x));
          __m256i  vr_mid = _mm256_lddqu_si256((const __m256i *)(p_buf+x+1));
          __m256i  vl_mid = _mm256_lddqu_si256((const __m256i *)(p_buf+x-1));

          __m256i v_mask = _mm256_cmpeq_epi16(vl_up,vstrong);
          v_mask = _mm256_or_si256(_mm256_cmpeq_epi16(v0_up,vstrong),v_mask);
          v_mask = _mm256_or_si256(_mm256_cmpeq_epi16(vr_up,vstrong),v_mask);
          v_mask = _mm256_or_si256(_mm256_cmpeq_epi16(vl_mid,vstrong),v_mask);
          v_mask = _mm256_or_si256(_mm256_cmpeq_epi16(v0_mid,vstrong),v_mask);
          v_mask = _mm256_or_si256(_mm256_cmpeq_epi16(vr_mid,vstrong),v_mask);
          v_mask = _mm256_or_si256(_mm256_cmpeq_epi16(vl_down,vstrong),v_mask);
          v_mask = _mm256_or_si256(_mm256_cmpeq_epi16(v0_down,vstrong),v_mask);
          v_mask = _mm256_or_si256(_mm256_cmpeq_epi16(vr_down,vstrong),v_mask);

          __m256i vres = _mm256_blendv_epi8(v0_mid,vstrong,v_mask);
          _mm256_storeu_si256((__m256i*)&p_tmpBuf[x], vres);
         }
      }  //for x
      // last column group: the right neighbour of the last sample does not
      // exist, so the right-shifted vectors replicate the last element and
      // carry element 8 back into element 7 across the lane boundary.
      {
        v0_up = _mm256_lddqu_si256((const __m256i *)(p_buf_up+x));
        __m256i  vl_up = _mm256_lddqu_si256((const __m256i *)(p_buf_up+x-1));

        v0_down = _mm256_lddqu_si256((const __m256i *)(p_buf_down+x));
        __m256i  vl_down = _mm256_lddqu_si256((const __m256i *)(p_buf_down+x-1));

        __m256i  vr_up = _mm256_srli_si256 (v0_up,2);  // topmost element of each lane is missing; patch it below
        vr_up = _mm256_insert_epi16 (vr_up,_mm256_extract_epi16 (v0_up,8), 7);
        vr_up = _mm256_insert_epi16 (vr_up,_mm256_extract_epi16 (v0_up,15), 15);

        __m256i  vr_down = _mm256_srli_si256 (v0_down,2);  // topmost element of each lane is missing; patch it below
        vr_down = _mm256_insert_epi16 (vr_down,_mm256_extract_epi16 (v0_down,8), 7);
        vr_down = _mm256_insert_epi16 (vr_down,_mm256_extract_epi16 (v0_down,15), 15);

        // mid
        v0_mid = _mm256_lddqu_si256((const __m256i *)(p_buf+x));
        __m256i  vl_mid = _mm256_lddqu_si256((const __m256i *)(p_buf+x-1));
        __m256i  vr_mid = _mm256_srli_si256 (v0_mid,2);  // topmost element of each lane is missing; patch it below
        vr_mid = _mm256_insert_epi16 (vr_mid,_mm256_extract_epi16 (v0_mid,8), 7);
        vr_mid = _mm256_insert_epi16 (vr_mid,_mm256_extract_epi16 (v0_mid,15), 15);

        __m256i v_mask = _mm256_cmpeq_epi16(vl_up,vstrong);
        v_mask = _mm256_or_si256(_mm256_cmpeq_epi16(v0_up,vstrong),v_mask);
        v_mask = _mm256_or_si256(_mm256_cmpeq_epi16(vr_up,vstrong),v_mask);
        v_mask = _mm256_or_si256(_mm256_cmpeq_epi16(vl_mid,vstrong),v_mask);
        v_mask = _mm256_or_si256(_mm256_cmpeq_epi16(v0_mid,vstrong),v_mask);
        v_mask = _mm256_or_si256(_mm256_cmpeq_epi16(vr_mid,vstrong),v_mask);
        v_mask = _mm256_or_si256(_mm256_cmpeq_epi16(vl_down,vstrong),v_mask);
        v_mask = _mm256_or_si256(_mm256_cmpeq_epi16(v0_down,vstrong),v_mask);
        v_mask = _mm256_or_si256(_mm256_cmpeq_epi16(vr_down,vstrong),v_mask);

        __m256i vres = _mm256_blendv_epi8(v0_mid,vstrong,v_mask);
        _mm256_storeu_si256((__m256i*)&p_tmpBuf[x], vres);
      }
#endif
    }  //AVX2
    else
    {
      // SSE fallback: process 8 columns at a time, keeping the previous group
      // in vold_* so the left neighbour is available without reloading.
      __m128i v1_up;
      __m128i v1_down;
      __m128i v1_mid;
      __m128i vstrong =  _mm_set1_epi16 (Value);
      int x;
      for (x=0; x < width-8; x+=8)
      {
        if (x==0)
        {
          v0_up = _mm_loadu_si128((const __m128i*)(p_buf_up));
          v1_up = _mm_loadu_si128((const __m128i*)(p_buf_up+8));
          v0_down = _mm_loadu_si128((const __m128i*)(p_buf_down));
          v1_down = _mm_loadu_si128((const __m128i*)(p_buf_down+8));
          v0_mid = _mm_loadu_si128((const __m128i*)(p_buf));
          v1_mid = _mm_loadu_si128((const __m128i*)(p_buf+8));

          __m128i vl_up = _mm_slli_si128 (v0_up,2);  // lowest element missing: replicate the left border sample
          vl_up = _mm_blend_epi16 (vl_up,v0_up,1);
          __m128i vr_up = _mm_srli_si128 (v0_up,2);  // topmost element missing: fetch it from v1
          vr_up = _mm_blend_epi16 (vr_up,_mm_slli_si128 (v1_up,14),0x80);

          __m128i vl_down = _mm_slli_si128 (v0_down,2);  // lowest element missing: replicate the left border sample
          vl_down = _mm_blend_epi16 (vl_down,v0_down,1);
          __m128i vr_down = _mm_srli_si128 (v0_down,2);  // topmost element missing: fetch it from v1
          vr_down = _mm_blend_epi16 (vr_down,_mm_slli_si128 (v1_down,14),0x80);

          __m128i vl_mid = _mm_slli_si128 (v0_mid,2);  // lowest element missing: replicate the left border sample
          vl_mid = _mm_blend_epi16 (vl_mid,v0_mid,1);
          __m128i vr_mid = _mm_srli_si128 (v0_mid,2);  // topmost element missing: fetch it from v1
          vr_mid = _mm_blend_epi16 (vr_mid,_mm_slli_si128 (v1_mid,14),0x80);

          // OR together the "equals Value" masks of all nine neighbours.
          __m128i v_mask = _mm_cmpeq_epi16(vl_up,vstrong);
          v_mask = _mm_or_si128(_mm_cmpeq_epi16(v0_up,vstrong),v_mask);
          v_mask = _mm_or_si128(_mm_cmpeq_epi16(vr_up,vstrong),v_mask);
          v_mask = _mm_or_si128(_mm_cmpeq_epi16(vl_mid,vstrong),v_mask);
          v_mask = _mm_or_si128(_mm_cmpeq_epi16(v0_mid,vstrong),v_mask);
          v_mask = _mm_or_si128(_mm_cmpeq_epi16(vr_mid,vstrong),v_mask);
          v_mask = _mm_or_si128(_mm_cmpeq_epi16(vl_down,vstrong),v_mask);
          v_mask = _mm_or_si128(_mm_cmpeq_epi16(v0_down,vstrong),v_mask);
          v_mask = _mm_or_si128(_mm_cmpeq_epi16(vr_down,vstrong),v_mask);

          // Where any neighbour matched, write Value; otherwise keep the centre sample.
          __m128i vres = _mm_blendv_epi8(v0_mid,vstrong,v_mask);
          _mm_storeu_si128((__m128i*)&p_tmpBuf[x], vres);
        }  //x==0
        else
        {
          v1_up = _mm_loadu_si128((const __m128i*)(p_buf_up+x+8));
          v1_down = _mm_loadu_si128((const __m128i*)(p_buf_down+x+8));
          v1_mid = _mm_loadu_si128((const __m128i*)(p_buf+x+8));

          __m128i vl_up = _mm_slli_si128 (v0_up,2);  // lowest element missing: fetch it from vold
          vl_up = _mm_blend_epi16 (vl_up,_mm_srli_si128 (vold_up,14),1);
          __m128i vr_up = _mm_srli_si128 (v0_up,2);  // topmost element missing: fetch it from v1
          vr_up = _mm_blend_epi16 (vr_up,_mm_slli_si128 (v1_up,14),0x80);

          __m128i vl_down = _mm_slli_si128 (v0_down,2);  // lowest element missing: fetch it from vold
          vl_down = _mm_blend_epi16 (vl_down,_mm_srli_si128 (vold_down,14),1);
          __m128i vr_down = _mm_srli_si128 (v0_down,2);  // topmost element missing: fetch it from v1
          vr_down = _mm_blend_epi16 (vr_down,_mm_slli_si128 (v1_down,14),0x80);

          __m128i vl_mid = _mm_slli_si128 (v0_mid,2);  // lowest element missing: fetch it from vold
          vl_mid = _mm_blend_epi16 (vl_mid,_mm_srli_si128 (vold_mid,14),1);
          __m128i vr_mid = _mm_srli_si128 (v0_mid,2);  // topmost element missing: fetch it from v1
          vr_mid = _mm_blend_epi16 (vr_mid,_mm_slli_si128 (v1_mid,14),0x80);

          __m128i v_mask = _mm_cmpeq_epi16(vl_up,vstrong);
          v_mask = _mm_or_si128(_mm_cmpeq_epi16(v0_up,vstrong),v_mask);
          v_mask = _mm_or_si128(_mm_cmpeq_epi16(vr_up,vstrong),v_mask);
          v_mask = _mm_or_si128(_mm_cmpeq_epi16(vl_mid,vstrong),v_mask);
          v_mask = _mm_or_si128(_mm_cmpeq_epi16(v0_mid,vstrong),v_mask);
          v_mask = _mm_or_si128(_mm_cmpeq_epi16(vr_mid,vstrong),v_mask);
          v_mask = _mm_or_si128(_mm_cmpeq_epi16(vl_down,vstrong),v_mask);
          v_mask = _mm_or_si128(_mm_cmpeq_epi16(v0_down,vstrong),v_mask);
          v_mask = _mm_or_si128(_mm_cmpeq_epi16(vr_down,vstrong),v_mask);

          __m128i vres = _mm_blendv_epi8(v0_mid,vstrong,v_mask);
          _mm_storeu_si128((__m128i*)&p_tmpBuf[x], vres);
        }
        // Slide the window: current group becomes the previous one,
        // the prefetched next group becomes the current one.
        vold_up = v0_up;
        vold_down = v0_down;
        vold_mid = v0_mid;
        v0_up = v1_up;
        v0_down = v1_down;
        v0_mid = v1_mid;
      }  //for x
      // last column group: replicate the last sample as right neighbour
      {
        __m128i vl_up = _mm_slli_si128 (v0_up,2);  // lowest element missing: fetch it from vold
        vl_up = _mm_blend_epi16 (vl_up,_mm_srli_si128 (vold_up,14),1);
        __m128i vr_up = _mm_srli_si128 (v0_up,2);  // topmost element missing: replicate from v0
        vr_up = _mm_blend_epi16 (vr_up,v0_up,0x80);

        __m128i vl_down = _mm_slli_si128 (v0_down,2);  // lowest element missing: fetch it from vold
        vl_down = _mm_blend_epi16 (vl_down,_mm_srli_si128 (vold_down,14),1);
        __m128i vr_down = _mm_srli_si128 (v0_down,2);  // topmost element missing: replicate from v0
        vr_down = _mm_blend_epi16(vr_down,v0_down,0x80);

        __m128i vl_mid = _mm_slli_si128 (v0_mid,2);  // lowest element missing: fetch it from vold
        vl_mid = _mm_blend_epi16 (vl_mid,_mm_srli_si128 (vold_mid,14),1);
        __m128i vr_mid = _mm_srli_si128 (v0_mid,2);  // topmost element missing: replicate from v0
        vr_mid = _mm_blend_epi16 (vr_mid,v0_mid,0x80);

        __m128i v_mask = _mm_cmpeq_epi16(vl_up,vstrong);
        v_mask = _mm_or_si128(_mm_cmpeq_epi16(v0_up,vstrong),v_mask);
        v_mask = _mm_or_si128(_mm_cmpeq_epi16(vr_up,vstrong),v_mask);
        v_mask = _mm_or_si128(_mm_cmpeq_epi16(vl_mid,vstrong),v_mask);
        v_mask = _mm_or_si128(_mm_cmpeq_epi16(v0_mid,vstrong),v_mask);
        v_mask = _mm_or_si128(_mm_cmpeq_epi16(vr_mid,vstrong),v_mask);
        v_mask = _mm_or_si128(_mm_cmpeq_epi16(vl_down,vstrong),v_mask);
        v_mask = _mm_or_si128(_mm_cmpeq_epi16(v0_down,vstrong),v_mask);
        v_mask = _mm_or_si128(_mm_cmpeq_epi16(vr_down,vstrong),v_mask);

        __m128i vres = _mm_blendv_epi8(v0_mid,vstrong,v_mask);
        _mm_storeu_si128((__m128i*)&p_tmpBuf[x], vres);
      }
    }  //!AVX
    p_tmpBuf+=Wbuf->get(compID).stride;
  }//y

  // NOTE(review): the border of buff is extended from its PRE-dilation content
  // before the interior is replaced by the dilated result from Wbuf -- confirm
  // this ordering is intentional.
  buff->get(compID).extendBorderPel( padding, padding );
  buff->get(compID).copyFrom( Wbuf->bufs[0] );

  iter++;

  // Tail recursion: run the next pass until numIter passes are done.
  iter = dilation_SIMD<vext> ( buff,Wbuf,
                    bitDepth,
                    compID,
                    numIter,
                    iter,
                    Value);

  return iter;
}
Unexecuted instantiation: int vvenc::dilation_SIMD<(vvenc::x86_simd::X86_VEXT)1>(vvenc::PelStorage*, vvenc::PelStorage*, unsigned int, vvenc::ComponentID, int, int, short)
Unexecuted instantiation: int vvenc::dilation_SIMD<(vvenc::x86_simd::X86_VEXT)4>(vvenc::PelStorage*, vvenc::PelStorage*, unsigned int, vvenc::ComponentID, int, int, short)
932
933
// Forward declaration: SIMD variance computation over a w x h block of samples
// (the definition is not visible in this part of the file).
template<X86_VEXT vext>
double calcVarSse( const Pel* org, const ptrdiff_t origStride, const int w, const int h );
935
936
937
template<X86_VEXT vext>
938
int calcMeanSSE ( const Pel* org, const ptrdiff_t origStride, const int w, const int h )
939
0
{
940
0
  int avg;
941
  // calculate average
942
0
  __m128i xavg32 = _mm_setzero_si128();
943
0
  __m128i xavg16 = _mm_setzero_si128();
944
0
  const __m128i xone = _mm_set1_epi16( 1 );
945
0
  for( int y1 = 0; y1 < h; y1++ )
946
0
  {
947
0
    xavg16 = _mm_setzero_si128();
948
0
    for( int x1 = 0; x1 < w; x1 += 8 )
949
0
    {
950
0
      xavg16 = _mm_add_epi16( xavg16, _mm_loadu_si128( ( const __m128i* ) ( org + x1 + y1 * origStride ) ) );
951
0
    }
952
0
    xavg32 = _mm_add_epi32( xavg32, _mm_madd_epi16( xone, xavg16 ) );
953
0
  }
954
955
0
  xavg32 = _mm_hadd_epi32( xavg32, xavg32 );
956
0
  xavg32 = _mm_hadd_epi32( xavg32, xavg32 );
957
0
  xavg32 = _mm_shuffle_epi32( xavg32, 0 );
958
0
  avg = _mm_extract_epi32 (xavg32, 0);
959
0
  return avg;
960
0
}
Unexecuted instantiation: int vvenc::calcMeanSSE<(vvenc::x86_simd::X86_VEXT)1>(short const*, long, int, int)
Unexecuted instantiation: int vvenc::calcMeanSSE<(vvenc::x86_simd::X86_VEXT)4>(short const*, long, int, int)
961
962
#if ENABLE_SIMD_OPT_FGA
963
// Install the SIMD implementation of the gradient computation into the Canny
// detector's function pointer; 'vext' selects the vector-extension level
// (SSE4.1 vs AVX2) baked into the instantiation.
template<X86_VEXT vext>
void Canny::_initFGACannyX86()
{
  gradient  = gradient_SIMD<vext>;
}
Unexecuted instantiation: void vvenc::Canny::_initFGACannyX86<(vvenc::x86_simd::X86_VEXT)1>()
Unexecuted instantiation: void vvenc::Canny::_initFGACannyX86<(vvenc::x86_simd::X86_VEXT)4>()
968
template void Canny::_initFGACannyX86<SIMDX86>();
969
970
// Install the SIMD implementation of the morphological dilation into the
// Morph object's function pointer; 'vext' selects the vector-extension level
// (SSE4.1 vs AVX2) baked into the instantiation.
template<X86_VEXT vext>
void Morph::_initFGAMorphX86()
{
  dilation  = dilation_SIMD<vext>;
}
Unexecuted instantiation: void vvenc::Morph::_initFGAMorphX86<(vvenc::x86_simd::X86_VEXT)1>()
Unexecuted instantiation: void vvenc::Morph::_initFGAMorphX86<(vvenc::x86_simd::X86_VEXT)4>()
975
template void Morph::_initFGAMorphX86<SIMDX86>();
976
977
978
// Install the SIMD variance/mean helpers into the film-grain analyzer's
// function pointers.
// NOTE(review): the body is guarded by ENABLE_SIMD_OPT_MCTF even though this
// file's SIMD init is compiled under ENABLE_SIMD_OPT_FGA -- verify that the
// guard macro is the intended one.
template<X86_VEXT vext>
void FGAnalyzer::_initFGAnalyzerX86()
{
#if  ENABLE_SIMD_OPT_MCTF
  calcVar  = calcVarSse<vext>;
  calcMean = calcMeanSSE<vext>;
#endif
}
Unexecuted instantiation: void vvenc::FGAnalyzer::_initFGAnalyzerX86<(vvenc::x86_simd::X86_VEXT)1>()
Unexecuted instantiation: void vvenc::FGAnalyzer::_initFGAnalyzerX86<(vvenc::x86_simd::X86_VEXT)4>()
986
987
template void FGAnalyzer::_initFGAnalyzerX86<SIMDX86>();
988
#endif
989
990
} // namespace vvenc
991
992
//! \}
993
994
#endif // TARGET_SIMD_X86
995
//! \}