Coverage Report

Created: 2026-04-01 07:49

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/vvenc/source/Lib/CommonLib/x86/IntraPredX86.h
Line
Count
Source
1
/* -----------------------------------------------------------------------------
2
The copyright in this software is being made available under the Clear BSD
3
License, included below. No patent rights, trademark rights and/or 
4
other Intellectual Property Rights other than the copyrights concerning 
5
the Software are granted under this license.
6
7
The Clear BSD License
8
9
Copyright (c) 2019-2026, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors.
10
All rights reserved.
11
12
Redistribution and use in source and binary forms, with or without modification,
13
are permitted (subject to the limitations in the disclaimer below) provided that
14
the following conditions are met:
15
16
     * Redistributions of source code must retain the above copyright notice,
17
     this list of conditions and the following disclaimer.
18
19
     * Redistributions in binary form must reproduce the above copyright
20
     notice, this list of conditions and the following disclaimer in the
21
     documentation and/or other materials provided with the distribution.
22
23
     * Neither the name of the copyright holder nor the names of its
24
     contributors may be used to endorse or promote products derived from this
25
     software without specific prior written permission.
26
27
NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
28
THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
29
CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
30
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
31
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
32
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
33
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
34
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
35
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
36
IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38
POSSIBILITY OF SUCH DAMAGE.
39
40
41
------------------------------------------------------------------------------------------- */
42
/** \file     IntraPredX86.h
43
    \brief    SIMD for IntraPrediction
44
*/
45
46
#pragma once
47
48
#include "CommonDefX86.h"
49
#include "Rom.h"
50
#include "IntraPrediction.h"
51
#include "InterpolationFilter.h"
52
53
#include "Unit.h"
54
55
#if ENABLE_SIMD_OPT_INTRAPRED
56
#ifdef TARGET_SIMD_X86
57
//! \ingroup CommonLib
58
//! \{
59
60
namespace vvenc {
61
62
//#define USE_AVX2
63
template< X86_VEXT vext >
64
void IntraPredAngleChroma_SIMD(int16_t* pDst,const ptrdiff_t dstStride,int16_t* pBorder,int width,int height,int deltaPos,int intraPredAngle)
65
0
{
66
0
  int deltaInt;
67
0
  int deltaFract;
68
0
  int refMainIndex;
69
0
  __m128i voffset = _mm_set1_epi16(16);
70
0
  if( width >= 8 )
71
0
  {
72
0
    if( vext >= AVX2 )
73
0
    {
74
#ifdef USE_AVX2
75
0
      if (( width & 15 ) == 0 )
76
0
      {
77
0
       int deltaInt;
78
0
        int deltaFract;
79
0
        int refMainIndex;
80
81
        __m256i voffset = _mm256_set1_epi16(16);
82
0
        for (int k=0; k<height; k++) {
83
84
0
          deltaInt   = deltaPos >> 5;
85
0
          deltaFract = deltaPos & (32 - 1);
86
87
0
          __m256i vfract = _mm256_set1_epi16(deltaFract);
88
0
          __m256i v32minfract = _mm256_set1_epi16(32-deltaFract);
89
          // Do linear filtering
90
0
          for (int l=0; l<width; l+=16) {
91
0
            refMainIndex   = l+ deltaInt+1;
92
0
            __m256i vpred0 = _mm256_lddqu_si256((__m256i*)&pBorder[refMainIndex]);
93
0
            __m256i vpred1 = _mm256_lddqu_si256((__m256i*)&pBorder[refMainIndex+1]);
94
0
            vpred0 = _mm256_mullo_epi16(v32minfract, vpred0);
95
0
            vpred1 = _mm256_mullo_epi16(vfract, vpred1);
96
0
            __m256i vpred = _mm256_srli_epi16(_mm256_add_epi16(_mm256_add_epi16(vpred0, vpred1), voffset), 5);
97
0
            _mm256_storeu_si256((__m256i*)&pDst[l], vpred);
98
0
          }
99
0
          pDst+=dstStride;
100
0
          deltaPos += intraPredAngle;
101
0
        }
102
0
      }
103
0
      else // width==8
104
0
      {
105
0
        for (int k=0; k<height; k++)
106
0
        {
107
0
          deltaInt   = deltaPos >> 5;
108
0
          deltaFract = deltaPos & (32 - 1);
109
110
0
          __m128i vfract = _mm_set1_epi16(deltaFract);
111
0
          __m128i v32minfract = _mm_set1_epi16(32-deltaFract);
112
          // Do linear filtering
113
0
          for (int l=0; l<width; l+=8) {
114
0
            refMainIndex        = l+ deltaInt+1;
115
0
            __m128i vpred0 = _mm_loadu_si128((__m128i*)&pBorder[refMainIndex]);
116
0
            __m128i vpred1 = _mm_loadu_si128((__m128i*)&pBorder[refMainIndex+1]);
117
0
            vpred0 = _mm_mullo_epi16(v32minfract, vpred0);
118
0
            vpred1 = _mm_mullo_epi16(vfract, vpred1);
119
0
            __m128i vpred = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(vpred0, vpred1), voffset), 5);
120
0
            _mm_storeu_si128((__m128i*)&pDst[l], vpred);
121
0
          }
122
0
          deltaPos += intraPredAngle;
123
124
0
          pDst+=dstStride;
125
0
        }
126
127
0
      }
128
#endif
129
0
    }  //AVX2
130
0
    else
131
0
    {
132
0
      for (int k=0; k<height; k++) {
133
0
        deltaInt   = deltaPos >> 5;
134
0
        deltaFract = deltaPos & (32 - 1);
135
136
0
        __m128i vfract = _mm_set1_epi16(deltaFract);
137
0
        __m128i v32minfract = _mm_set1_epi16(32-deltaFract);
138
        // Do linear filtering
139
0
        for (int l=0; l<width; l+=8) {
140
0
          refMainIndex        = l+ deltaInt+1;
141
0
          __m128i vpred0 = _mm_loadu_si128((__m128i*)&pBorder[refMainIndex]);
142
0
          __m128i vpred1 = _mm_loadu_si128((__m128i*)&pBorder[refMainIndex+1]);
143
0
          vpred0 = _mm_mullo_epi16(v32minfract, vpred0);
144
0
          vpred1 = _mm_mullo_epi16(vfract, vpred1);
145
0
          __m128i vpred = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(vpred0, vpred1), voffset), 5);
146
0
          _mm_storeu_si128((__m128i*)&pDst[l], vpred);
147
0
        }
148
0
        deltaPos += intraPredAngle;
149
0
        pDst+=dstStride;
150
0
      }
151
0
    }
152
0
  }
153
0
  else if( width == 4 )
154
0
  {
155
0
    for (int k=0; k<height; k++) {
156
0
      deltaInt   = deltaPos >> 5;
157
0
      deltaFract = deltaPos & (32 - 1);
158
159
0
      __m128i vfract = _mm_set1_epi16(deltaFract);
160
0
      __m128i v32minfract = _mm_set1_epi16(32-deltaFract);
161
      // Do linear filtering
162
0
      refMainIndex        = deltaInt+1;
163
0
      __m128i vpred0 = _mm_loadu_si128((__m128i*)&pBorder[refMainIndex]);
164
0
      __m128i vpred1 = _mm_loadu_si128((__m128i*)&pBorder[refMainIndex+1]);
165
0
      vpred0 = _mm_mullo_epi16(v32minfract, vpred0);
166
0
      vpred1 = _mm_mullo_epi16(vfract, vpred1);
167
0
      __m128i vpred = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(vpred0, vpred1), voffset), 5);
168
0
      _vv_storel_epi64( ( __m128i * )(pDst ), vpred);
169
0
      deltaPos += intraPredAngle;
170
0
      pDst+=dstStride;
171
0
    }
172
0
  }
173
0
  else
174
0
  {
175
0
    for (int y = 0; y<height; y++)
176
0
    {
177
0
      const int deltaInt   = deltaPos >> 5;
178
0
      const int deltaFract = deltaPos & (32 - 1);
179
180
      // Do linear filtering
181
0
      const Pel* pRM = pBorder + deltaInt + 1;
182
0
      int lastRefMainPel = *pRM++;
183
184
0
      for( int x = 0; x < 2; pRM++, x++ )
185
0
      {
186
0
        int thisRefMainPel = *pRM;
187
0
        pDst[x + 0] = ( Pel ) ( ( ( 32 - deltaFract )*lastRefMainPel + deltaFract*thisRefMainPel + 16 ) >> 5 );
188
0
        lastRefMainPel = thisRefMainPel;
189
0
      }
190
0
      deltaPos += intraPredAngle;
191
0
      pDst += dstStride;
192
0
    }
193
0
  }
194
#if USE_AVX2
195
196
  _mm256_zeroupper();
197
#endif
198
0
}
Unexecuted instantiation: void vvenc::IntraPredAngleChroma_SIMD<(vvenc::x86_simd::X86_VEXT)1>(short*, long, short*, int, int, int, int)
Unexecuted instantiation: void vvenc::IntraPredAngleChroma_SIMD<(vvenc::x86_simd::X86_VEXT)4>(short*, long, short*, int, int, int, int)
199
200
template< X86_VEXT vext >
201
void IntraPredAngleLumaCore_SIMD(int16_t* pDstBuf,const ptrdiff_t dstStride,int16_t* refMain,int width,int height,int deltaPos,int intraPredAngle,const TFilterCoeff *ff_unused,const bool useCubicFilter,const ClpRng& clpRng)
202
0
{
203
  
204
0
  int16_t* pDst;
205
0
  if( width >= 8 )
206
0
  {
207
208
0
    if( vext >= AVX2 )
209
0
    {
210
#ifdef USE_AVX2
211
      __m256i shflmask1= _mm256_set_epi8(0xd, 0xc, 0xb, 0xa,0x9, 0x8, 0x7, 0x6,   0xb, 0xa, 0x9, 0x8,0x7, 0x6, 0x5, 0x4,
212
          0x9, 0x8, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2,     0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0 );
213
      __m256i offset = _mm256_set1_epi32( 32 );
214
215
0
      if (( width & 15 ) == 0 )
216
0
      {
217
0
        __m256i vbdmin = _mm256_set1_epi16( clpRng.min() );
218
0
        __m256i vbdmax = _mm256_set1_epi16( clpRng.max() );
219
220
0
        for (int y = 0; y<height; y++ )
221
0
        {
222
0
          int deltaInt   = deltaPos >> 5;
223
0
          int deltaFract = deltaPos & (32 - 1);
224
          
225
0
          const TFilterCoeff      intraSmoothingFilter[4] = {TFilterCoeff(16 - (deltaFract >> 1)), TFilterCoeff(32 - (deltaFract >> 1)), TFilterCoeff(16 + (deltaFract >> 1)), TFilterCoeff(deltaFract >> 1)};
226
0
          const TFilterCoeff *f = useCubicFilter ? InterpolationFilter::getChromaFilterTable(deltaFract) : intraSmoothingFilter;
227
228
0
          int refMainIndex   = deltaInt + 1;
229
0
          pDst=&pDstBuf[y*dstStride];
230
//          __m128i tmp = _vv_loadl_epi64( ( __m128i const * )&ff[deltaFract<<2] );   //load 4 16 bit filter coeffs
231
0
          __m128i tmp = _vv_loadl_epi64( ( __m128i const * )f );   //load 4 16 bit filter coeffs
232
0
          tmp = _mm_shuffle_epi32(tmp,0x44);
233
0
          __m256i coeff = _mm256_broadcastsi128_si256(tmp);
234
0
          for( int x = 0; x < width; x+=16)
235
0
          {
236
0
            __m256i src1 = _mm256_broadcastsi128_si256( _mm_loadu_si128( ( const __m128i* ) &refMain[refMainIndex     - 1] ) );
237
0
            __m256i src2 = _mm256_broadcastsi128_si256( _mm_loadu_si128( ( const __m128i* ) &refMain[refMainIndex + 4 - 1] ) );
238
0
            src1 = _mm256_shuffle_epi8(src1,shflmask1);                 // -1 0 1 2  0 1 2 3 1 2 3 4  2 3 4 5
239
0
            src2 = _mm256_shuffle_epi8(src2,shflmask1);                 // 3 4 5 6  4 5 6 7  5 6 7 8 6 7 8 9
240
241
0
            src1 = _mm256_madd_epi16 (src1, coeff);
242
0
            src2 = _mm256_madd_epi16 (src2, coeff);
243
244
0
            __m256i  sum  = _mm256_hadd_epi32( src1, src2 );
245
0
            sum = _mm256_permute4x64_epi64(sum,0xD8);
246
247
0
            sum = _mm256_add_epi32( sum, offset );
248
0
            sum = _mm256_srai_epi32( sum, 6 );
249
250
0
            refMainIndex+=8;
251
252
0
            src1 = _mm256_broadcastsi128_si256( _mm_loadu_si128( ( __m128i const* ) &refMain[refMainIndex     - 1] ) );
253
0
            src2 = _mm256_broadcastsi128_si256( _mm_loadu_si128( ( __m128i const* ) &refMain[refMainIndex + 4 - 1] ) );
254
255
0
            src1 = _mm256_shuffle_epi8(src1,shflmask1);                 // -1 0 1 2  0 1 2 3 1 2 3 4  2 3 4 5
256
0
            src2 = _mm256_shuffle_epi8(src2,shflmask1);                 // 3 4 5 6  4 5 6 7  5 6 7 8 6 7 8 9
257
0
            src1 = _mm256_madd_epi16 (src1, coeff);
258
0
            src2 = _mm256_madd_epi16 (src2, coeff);
259
260
0
            __m256i  sum1  = _mm256_hadd_epi32( src1, src2 );
261
0
            sum1 = _mm256_permute4x64_epi64(sum1,0xD8);
262
263
0
            sum1 = _mm256_add_epi32( sum1, offset );
264
0
            sum1 = _mm256_srai_epi32( sum1, 6 );
265
0
            __m256i
266
0
            src0 = _mm256_packs_epi32( sum, sum1 );
267
268
0
            src0 = _mm256_permute4x64_epi64(src0,0xD8);
269
270
0
            refMainIndex+=8;
271
272
0
            if (useCubicFilter)
273
0
              src0 = _mm256_min_epi16( vbdmax, _mm256_max_epi16( vbdmin, src0 ) );
274
275
0
            _mm256_storeu_si256( ( __m256i * )(pDst + x), src0);
276
0
          }
277
0
          deltaPos += intraPredAngle;
278
0
        }
279
0
      }
280
0
      else // width =8
281
0
      {
282
0
        __m128i vbdmin = _mm_set1_epi16( clpRng.min() );
283
0
        __m128i vbdmax = _mm_set1_epi16( clpRng.max() );
284
285
0
        for (int y = 0; y<height; y++ )
286
0
        {
287
0
          int deltaInt   = deltaPos >> 5;
288
0
          int deltaFract = deltaPos & (32 - 1);
289
290
0
          const TFilterCoeff      intraSmoothingFilter[4] = {TFilterCoeff(16 - (deltaFract >> 1)), TFilterCoeff(32 - (deltaFract >> 1)), TFilterCoeff(16 + (deltaFract >> 1)), TFilterCoeff(deltaFract >> 1)};
291
0
          const TFilterCoeff *f = useCubicFilter ? InterpolationFilter::getChromaFilterTable(deltaFract) : intraSmoothingFilter;
292
293
0
          int refMainIndex   = deltaInt + 1;
294
0
          pDst=&pDstBuf[y*dstStride];
295
//          __m128i tmp = _vv_loadl_epi64( ( __m128i const * )&ff[deltaFract<<2] );   //load 4 16 bit filter coeffs
296
0
          __m128i tmp = _mm_loadu_si64( f );   //load 4 16 bit filter coeffs
297
0
          tmp = _mm_shuffle_epi32(tmp,0x44);
298
0
          __m256i coeff = _mm256_broadcastsi128_si256(tmp);
299
0
          __m256i src1 = _mm256_broadcastsi128_si256( _mm_loadu_si128( ( __m128i const* ) &refMain[refMainIndex     - 1] ) );
300
0
          __m256i src2 = _mm256_broadcastsi128_si256( _mm_loadu_si128( ( __m128i const* ) &refMain[refMainIndex + 4 - 1] ) );
301
0
          src1 = _mm256_shuffle_epi8(src1,shflmask1);                 // -1 0 1 2  0 1 2 3 1 2 3 4  2 3 4 5
302
0
          src2 = _mm256_shuffle_epi8(src2,shflmask1);                 // 3 4 5 6  4 5 6 7  5 6 7 8 6 7 8 9
303
304
0
          src1 = _mm256_madd_epi16 (src1, coeff);
305
0
          src2 = _mm256_madd_epi16 (src2, coeff);
306
307
0
          __m256i  sum  = _mm256_hadd_epi32( src1, src2 );
308
0
          sum = _mm256_permute4x64_epi64(sum,0xD8);
309
310
0
          sum = _mm256_add_epi32( sum, offset );
311
0
          sum = _mm256_srai_epi32( sum, 6 );
312
0
          __m256i
313
0
          src0 = _mm256_permute4x64_epi64( _mm256_packs_epi32( sum, sum ), 0x88  );
314
0
          __m128i dest128 = _mm256_castsi256_si128( src0);
315
316
0
          if (useCubicFilter)
317
0
            dest128 = _mm_min_epi16( vbdmax, _mm_max_epi16( vbdmin, dest128 ) );
318
319
0
          _mm_storeu_si128( ( __m128i * )(pDst), dest128);
320
0
          deltaPos += intraPredAngle;
321
0
        }
322
0
      }
323
#endif
324
0
    }
325
0
    else
326
0
    {
327
0
      __m128i shflmask1= _mm_set_epi8( 0x9, 0x8, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2,   0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0 );
328
0
      __m128i shflmask2= _mm_set_epi8( 0xd, 0xc, 0xb, 0xa, 0x9, 0x8, 0x7, 0x6,   0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4 );
329
0
      __m128i offset = _mm_set1_epi32( 32 );
330
0
      __m128i vbdmin = _mm_set1_epi16( clpRng.min() );
331
0
      __m128i vbdmax = _mm_set1_epi16( clpRng.max() );
332
333
0
      for (int y = 0; y<height; y++ )
334
0
      {
335
0
        int deltaInt   = deltaPos >> 5;
336
0
        int deltaFract = deltaPos & (32 - 1);
337
338
0
        const TFilterCoeff      intraSmoothingFilter[4] = {TFilterCoeff(16 - (deltaFract >> 1)), TFilterCoeff(32 - (deltaFract >> 1)), TFilterCoeff(16 + (deltaFract >> 1)), TFilterCoeff(deltaFract >> 1)};
339
0
        const TFilterCoeff *f = useCubicFilter ? InterpolationFilter::getChromaFilterTable(deltaFract) : intraSmoothingFilter;
340
341
0
        int refMainIndex   = deltaInt + 1;
342
0
        pDst=&pDstBuf[y*dstStride];
343
0
        __m128i coeff = _mm_loadu_si64( f );   //load 4 16 bit filter coeffs
344
//        __m128i coeff = _vv_loadl_epi64( ( __m128i const * )&ff[deltaFract<<2] );   //load 4 16 bit filter coeffs
345
0
        coeff = _mm_shuffle_epi32(coeff,0x44);
346
0
        for( int x = 0; x < width; x+=8)
347
0
        {
348
0
          __m128i src0 = _mm_loadu_si128( ( __m128i const * )&refMain[refMainIndex - 1] );   //load 8 16 bit reference Pels   -1 0 1 2 3 4 5 6
349
0
          __m128i src1 = _mm_shuffle_epi8(src0,shflmask1);                  // -1 0 1 2  0 1 2 3
350
0
          __m128i src2 = _mm_shuffle_epi8(src0,shflmask2);                  // 1 2 3 4  2 3 4 5
351
0
          src0 = _mm_madd_epi16( coeff,src1 );
352
0
          src1 = _mm_madd_epi16( coeff,src2 );
353
0
          __m128i sum  = _mm_hadd_epi32( src0, src1 );
354
0
          sum = _mm_add_epi32( sum, offset );
355
0
          sum = _mm_srai_epi32( sum, 6 );
356
357
0
          refMainIndex+=4;
358
0
          src0 = _mm_loadu_si128( ( __m128i const * )&refMain[refMainIndex - 1] );   //load 8 16 bit reference Pels   -1 0 1 2 3 4 5 6
359
0
          src1 = _mm_shuffle_epi8(src0,shflmask1);                                // -1 0 1 2  0 1 2 3
360
0
          src2 = _mm_shuffle_epi8(src0,shflmask2);
361
362
          // 1 2 3 4  2 3 4 5
363
0
          src0 = _mm_madd_epi16( coeff,src1 );
364
0
          src1 = _mm_madd_epi16( coeff,src2 );
365
366
0
          __m128i sum1  = _mm_hadd_epi32( src0, src1 );
367
0
          sum1 = _mm_add_epi32( sum1, offset );
368
0
          sum1 = _mm_srai_epi32( sum1, 6 );
369
0
          src0 = _mm_packs_epi32( sum, sum1 );
370
371
0
          refMainIndex+=4;
372
0
          if (useCubicFilter)
373
0
            src0 = _mm_min_epi16( vbdmax, _mm_max_epi16( vbdmin, src0 ) );
374
375
0
          _mm_storeu_si128( ( __m128i * )(pDst + x), src0);
376
377
0
        }
378
0
        deltaPos += intraPredAngle;
379
0
      }
380
0
    }
381
0
  }
382
0
  else if( width == 4 )
383
0
  {
384
0
    __m128i shflmask1= _mm_set_epi8( 0x9, 0x8, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2,   0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0 );
385
0
    __m128i shflmask2= _mm_set_epi8( 0xd, 0xc, 0xb, 0xa, 0x9, 0x8, 0x7, 0x6,   0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4 );
386
0
    __m128i offset = _mm_set1_epi32( 32 );
387
0
    __m128i vbdmin = _mm_set1_epi16( clpRng.min() );
388
0
    __m128i vbdmax = _mm_set1_epi16( clpRng.max() );
389
390
0
    for (int y = 0; y<height; y++ )
391
0
    {
392
0
      int deltaInt   = deltaPos >> 5;
393
0
      int deltaFract = deltaPos & (32 - 1);
394
 
395
0
      const TFilterCoeff      intraSmoothingFilter[4] = {TFilterCoeff(16 - (deltaFract >> 1)), TFilterCoeff(32 - (deltaFract >> 1)), TFilterCoeff(16 + (deltaFract >> 1)), TFilterCoeff(deltaFract >> 1)};
396
0
      const TFilterCoeff *f = useCubicFilter ? InterpolationFilter::getChromaFilterTable(deltaFract) : intraSmoothingFilter;
397
   
398
0
      int refMainIndex   = deltaInt + 1;
399
0
      pDst=&pDstBuf[y*dstStride];
400
0
      __m128i coeff = _mm_loadu_si64( f );   //load 4 16 bit filter coeffs
401
//      __m128i coeff = _vv_loadl_epi64( ( __m128i const * )&ff[deltaFract<<2] );   //load 4 16 bit filter coeffs
402
0
      coeff = _mm_shuffle_epi32(coeff,0x44);
403
0
      {
404
0
        __m128i src0 = _mm_loadu_si128( ( __m128i const * )&refMain[refMainIndex - 1] );   //load 8 16 bit reference Pels   -1 0 1 2 3 4 5 6
405
0
        __m128i src1 = _mm_shuffle_epi8(src0,shflmask1);                  // -1 0 1 2  0 1 2 3
406
0
        __m128i src2 = _mm_shuffle_epi8(src0,shflmask2);                  // 1 2 3 4  2 3 4 5
407
0
        src0 = _mm_madd_epi16( coeff,src1 );
408
0
        src1 = _mm_madd_epi16( coeff,src2 );
409
0
        __m128i sum  = _mm_hadd_epi32( src0, src1 );
410
0
        sum = _mm_add_epi32( sum, offset );
411
0
        sum = _mm_srai_epi32( sum, 6 );
412
413
0
        src0 = _mm_packs_epi32( sum, sum );
414
415
0
        refMainIndex+=4;
416
417
0
        if (useCubicFilter)
418
0
          src0 = _mm_min_epi16( vbdmax, _mm_max_epi16( vbdmin, src0 ) );
419
420
0
        _vv_storel_epi64( ( __m128i * )(pDst ), src0);
421
422
0
      }
423
0
      deltaPos += intraPredAngle;
424
0
    }
425
0
  }
426
0
  else
427
0
  {
428
0
    THROW( "Unsupported size in IntraPredAngleCore_SIMD" );
429
0
  }
430
#if USE_AVX2
431
0
  _mm256_zeroupper();
432
0
#endif
433
0
}
Unexecuted instantiation: void vvenc::IntraPredAngleLumaCore_SIMD<(vvenc::x86_simd::X86_VEXT)1>(short*, long, short*, int, int, int, int, short const*, bool, vvenc::ClpRng const&)
Unexecuted instantiation: void vvenc::IntraPredAngleLumaCore_SIMD<(vvenc::x86_simd::X86_VEXT)4>(short*, long, short*, int, int, int, int, short const*, bool, vvenc::ClpRng const&)
434
435
template< X86_VEXT vext, int W >
436
void  IntraPredSampleFilter_SIMD(PelBuf& dstBuf, const CPelBuf& Src)
437
0
{
438
0
  const int iWidth  = dstBuf.width;
439
0
  const int iHeight = dstBuf.height;
440
0
  Pel* pDst         = dstBuf.buf;
441
0
  const ptrdiff_t  dstStride=dstBuf.stride;
442
443
0
  const Pel* ptrSrc = Src.buf;
444
0
  const ptrdiff_t  srcStride=Src.stride;
445
446
0
  const int scale = ((floorLog2(iWidth * iHeight) - 2) >> 2);
447
0
  CHECK(scale < 0 || scale > 31, "PDPC: scale < 0 || scale > 2");
448
449
#if USE_AVX2
450
0
  if( W > 8 )
451
0
  {
452
0
    __m256i tmplo,tmphi;
453
0
    __m256i w32 = _mm256_set_epi32(32,32,32,32,32,32,32,32);
454
0
    __m256i wl16,wl16start;
455
456
    wl16start = _mm256_set_epi16(0,0,0,0,0,0,0,0,0,0,0,0,0,2,8,32);
457
    
458
0
    if (scale==1)
459
0
    {
460
0
      wl16start = _mm256_set_epi16(0,0,0,0,0,0,0,0,0,0,1,2,4,8,16,32);
461
0
    }
462
0
    else if (scale==2)
463
0
    {
464
0
      wl16start = _mm256_set_epi16(0,0,0,0,1,1,2,2,4,4,8,8,16,16,32,32);
465
0
    }
466
    
467
0
    for (int y = 0; y < iHeight; y++)
468
0
    {
469
0
      int wT = 32 >> std::min(31, ((y << 1) >> scale));
470
471
0
      __m256i wt16 = _mm256_set_epi16(wT,wT,wT,wT,wT,wT,wT,wT,wT,wT,wT,wT,wT,wT,wT,wT);
472
0
      __m256i x16left = _mm256_broadcastw_epi16(_mm_loadu_si128 ((__m128i const *) (ptrSrc+((y+1)+srcStride))));
473
0
      if (wT)
474
0
      {
475
0
        for (int x = 0; x < iWidth; x+=16)
476
0
        {
477
0
          if (x==0)
478
0
          {
479
0
            wl16=wl16start;
480
481
0
            __m256i x16top = _mm256_loadu_si256((__m256i *) (ptrSrc+x+1)); // load top
482
0
            __m256i x16dst = _mm256_loadu_si256((const __m256i *) (pDst+y*dstStride+x)); // load dst
483
484
0
            tmphi = _mm256_sub_epi16(x16left,x16dst);
485
0
            tmplo = _mm256_mullo_epi16(tmphi,wl16);  //wL * left-val
486
0
            tmphi = _mm256_mulhi_epi16(tmphi,wl16);  //wL * left-val
487
0
            __m256i leftlo = _mm256_unpacklo_epi16(tmplo,tmphi);
488
0
            __m256i lefthi = _mm256_unpackhi_epi16(tmplo,tmphi);
489
490
0
            x16top = _mm256_sub_epi16(x16top,x16dst);
491
0
            tmplo = _mm256_mullo_epi16(x16top,wt16);    // wT*top-val
492
0
            tmphi = _mm256_mulhi_epi16(x16top,wt16);    // wT*top-val
493
0
            __m256i toplo = _mm256_unpacklo_epi16(tmplo,tmphi);
494
0
            __m256i tophi = _mm256_unpackhi_epi16(tmplo,tmphi);
495
496
0
            __m256i dstlo = _mm256_add_epi32(leftlo,toplo);
497
0
            __m256i dsthi = _mm256_add_epi32(lefthi,tophi);
498
0
            dstlo = _mm256_add_epi32(dstlo,w32);
499
0
            dsthi = _mm256_add_epi32(dsthi,w32);
500
501
0
            dstlo =  _mm256_srai_epi32(dstlo,6);
502
0
            dsthi =  _mm256_srai_epi32(dsthi,6);
503
504
0
            dstlo =  _mm256_packs_epi32(dstlo,dsthi);
505
0
            dstlo =  _mm256_permute4x64_epi64 ( dstlo, ( 0 << 0 ) + ( 1 << 2 ) + ( 2 << 4 ) + ( 3 << 6 ) );
506
507
0
            dstlo = _mm256_adds_epi16(dstlo,x16dst);
508
0
            _mm256_storeu_si256( ( __m256i * )(pDst+y*dstStride+x), dstlo );
509
0
          }
510
0
          else
511
0
          {
512
0
            __m256i x16top = _mm256_loadu_si256((__m256i *) (ptrSrc+x+1)); // load top
513
0
            __m256i x16dst = _mm256_loadu_si256((const __m256i *) (pDst+y*dstStride+x)); // load dst
514
515
0
            x16top = _mm256_sub_epi16(x16top,x16dst);
516
0
            tmplo = _mm256_mullo_epi16(x16top,wt16);    // wT*top-val
517
0
            tmphi = _mm256_mulhi_epi16(x16top,wt16);    // wT*top-val
518
0
            __m256i toplo = _mm256_unpacklo_epi16(tmplo,tmphi);
519
0
            __m256i tophi = _mm256_unpackhi_epi16(tmplo,tmphi);
520
521
0
            __m256i dstlo = _mm256_add_epi32(toplo,w32);
522
0
            __m256i dsthi = _mm256_add_epi32(tophi,w32);
523
524
0
            dstlo =  _mm256_srai_epi32(dstlo,6);
525
0
            dsthi =  _mm256_srai_epi32(dsthi,6);
526
527
0
            dstlo =  _mm256_packs_epi32(dstlo,dsthi);
528
0
            dstlo =  _mm256_permute4x64_epi64 ( dstlo, ( 0 << 0 ) + ( 1 << 2 ) + ( 2 << 4 ) + ( 3 << 6 ) );
529
530
0
            dstlo = _mm256_adds_epi16(dstlo,x16dst);
531
0
            _mm256_storeu_si256( ( __m256i * )(pDst+y*dstStride+x), dstlo );
532
0
          }
533
0
        }  // for x
534
0
      }
535
0
      else
536
0
      { // wT =0
537
0
        wl16=wl16start;
538
539
0
        __m256i x16dst = _mm256_loadu_si256((const __m256i *) (pDst+y*dstStride)); // load dst
540
541
0
        tmphi = _mm256_sub_epi16(x16left,x16dst);
542
0
        tmplo = _mm256_mullo_epi16(tmphi,wl16);  //wL * left-val
543
0
        tmphi = _mm256_mulhi_epi16(tmphi,wl16);  //wL * left-val
544
0
        __m256i leftlo = _mm256_unpacklo_epi16(tmplo,tmphi);
545
0
        __m256i lefthi = _mm256_unpackhi_epi16(tmplo,tmphi);
546
547
0
        __m256i dstlo = _mm256_add_epi32(leftlo,w32);
548
0
        __m256i dsthi = _mm256_add_epi32(lefthi,w32);
549
550
0
        dstlo =  _mm256_srai_epi32(dstlo,6);
551
0
        dsthi =  _mm256_srai_epi32(dsthi,6);
552
553
0
        dstlo =  _mm256_packs_epi32(dstlo,dsthi);
554
0
        dstlo =  _mm256_permute4x64_epi64 ( dstlo, ( 0 << 0 ) + ( 1 << 2 ) + ( 2 << 4 ) + ( 3 << 6 ) );
555
556
0
        dstlo = _mm256_adds_epi16(dstlo,x16dst);
557
0
        _mm256_storeu_si256( ( __m256i * )(pDst+y*dstStride), dstlo );
558
0
      }
559
0
    }
560
0
  }
561
0
  else
562
0
#endif
563
0
  {
564
0
    __m128i tmplo8,tmphi8;
565
0
    __m128i w32_8 = _mm_set_epi32(32,32,32,32);
566
0
    __m128i wl8start,wl8start2;
567
0
    CHECK(scale < 0 || scale > 2, "PDPC: scale < 0 || scale > 2");
568
569
0
    wl8start = _mm_set_epi16(0,0,0,0,0,2,8,32);
570
0
    wl8start2 = _mm_set_epi16(0,0,0,0,0,0,0,0);
571
    
572
0
    if (scale==1)
573
0
    {
574
0
      wl8start = _mm_set_epi16(0,0,1,2,4,8,16,32);
575
0
      wl8start2 = _mm_set_epi16(0,0,0,0,0,0,0,0);
576
0
    }
577
0
    else if (scale==2)
578
0
    {
579
0
      wl8start = _mm_set_epi16(4,4,8,8,16,16,32,32);
580
0
      wl8start2 = _mm_set_epi16(0,0,0,0,1,1,2,2);
581
0
    }
582
583
0
    __m128i wl8 = wl8start;
584
0
    for (int y = 0; y < iHeight; y++)
585
0
    {
586
0
      int wT = 32 >> std::min(31, ((y << 1) >> scale));
587
588
0
      __m128i wt8 = _mm_set_epi16(wT,wT,wT,wT,wT,wT,wT,wT);
589
0
      __m128i x8left;
590
591
592
0
      if ( W == 4 )
593
0
      {
594
0
        x8left = _mm_loadu_si64 ((__m128i const *) (ptrSrc+((y+1)+srcStride)));
595
0
      }
596
0
      else if ( W == 2 )
597
0
      {
598
0
        x8left = _mm_loadu_si32 ((__m128i const *) (ptrSrc+((y+1)+srcStride)));
599
0
      }
600
0
      else
601
0
      {
602
0
        x8left = _mm_loadu_si128 ((__m128i const *) (ptrSrc+((y+1)+srcStride)));
603
0
      }
604
0
      x8left =_mm_shufflelo_epi16(x8left,0);
605
0
      x8left =_mm_shuffle_epi32(x8left,0);
606
607
608
0
      if (wT)
609
0
      {
610
0
        for (int x = 0; x < iWidth; x+=8)
611
0
        {
612
0
          if (x>8)
613
0
          {
614
0
            __m128i x8top;
615
0
            __m128i x8dst;
616
617
0
            if ( W == 4 )
618
0
            {
619
0
              x8top =  _mm_loadu_si64((__m128i *) (ptrSrc+x+1)); // load top
620
0
              x8dst =  _mm_loadu_si64((const __m128i *) (pDst+y*dstStride+x)); // load dst
621
0
            }
622
0
            else if ( W == 2 )
623
0
            {
624
0
              x8top =  _mm_loadu_si32((__m128i *) (ptrSrc+x+1)); // load top
625
0
              x8dst =  _mm_loadu_si32((const __m128i *) (pDst+y*dstStride+x)); // load dst
626
0
            }
627
0
            else
628
0
            {
629
0
              x8top =  _mm_loadu_si128((__m128i *) (ptrSrc+x+1)); // load top
630
0
              x8dst =  _mm_loadu_si128((const __m128i *) (pDst+y*dstStride+x)); // load dst
631
0
            }
632
633
0
            tmphi8 = _mm_sub_epi16(x8top,x8dst);
634
0
            tmplo8 = _mm_mullo_epi16(tmphi8,wt8);    // wT*top-val
635
0
            tmphi8 = _mm_mulhi_epi16(tmphi8,wt8);    // wT*top-val
636
0
            __m128i toplo8 = _mm_unpacklo_epi16(tmplo8,tmphi8);
637
0
            __m128i tophi8 = _mm_unpackhi_epi16(tmplo8,tmphi8);
638
639
0
            __m128i dstlo8 = _mm_add_epi32(toplo8,w32_8);
640
0
            __m128i dsthi8 = _mm_add_epi32(tophi8,w32_8);
641
642
0
            dstlo8 =  _mm_srai_epi32(dstlo8,6);
643
0
            dsthi8 =  _mm_srai_epi32(dsthi8,6);
644
645
0
            dstlo8 =  _mm_packs_epi32(dstlo8,dsthi8);
646
0
            dstlo8 =  _mm_adds_epi16(dstlo8,x8dst);
647
648
0
            _mm_storeu_si128(( __m128i * )(pDst+y*dstStride+x), (dstlo8) );
649
650
0
          }
651
0
          else // x<=8
652
0
          {
653
            
654
0
            if (x==0)
655
0
              wl8=wl8start;
656
0
            else if (x==8)
657
0
              wl8=wl8start2;
658
659
0
            __m128i x8top;
660
0
            __m128i x8dst;
661
662
0
            if ( W == 4 )
663
0
            {
664
0
              x8top =  _mm_loadu_si64((__m128i *) (ptrSrc+x+1)); // load top
665
0
              x8dst =  _mm_loadu_si64((const __m128i *) (pDst+y*dstStride+x)); // load dst
666
0
            }
667
0
            else if ( W == 2 )
668
0
            {
669
0
              x8top =  _mm_loadu_si32((__m128i *) (ptrSrc+x+1)); // load top
670
0
              x8dst =  _mm_loadu_si32((const __m128i *) (pDst+y*dstStride+x)); // load dst
671
0
            }
672
0
            else
673
0
            {
674
0
              x8top =  _mm_loadu_si128((__m128i *) (ptrSrc+x+1)); // load top
675
0
              x8dst =  _mm_loadu_si128((const __m128i *) (pDst+y*dstStride+x)); // load dst
676
0
            }
677
0
            tmphi8 = _mm_sub_epi16(x8left,x8dst);
678
0
            tmplo8 = _mm_mullo_epi16(tmphi8,wl8);  //wL * left-val
679
0
            tmphi8 = _mm_mulhi_epi16(tmphi8,wl8);  //wL * left-val
680
0
            __m128i leftlo8 = _mm_unpacklo_epi16(tmplo8,tmphi8);
681
0
            __m128i lefthi8 = _mm_unpackhi_epi16(tmplo8,tmphi8);
682
683
0
            tmphi8 = _mm_sub_epi16(x8top,x8dst);
684
0
            tmplo8 = _mm_mullo_epi16(tmphi8,wt8);    // wT*top-val
685
0
            tmphi8 = _mm_mulhi_epi16(tmphi8,wt8);    // wT*top-val
686
0
            __m128i toplo8 = _mm_unpacklo_epi16(tmplo8,tmphi8);
687
0
            __m128i tophi8 = _mm_unpackhi_epi16(tmplo8,tmphi8);
688
689
0
            __m128i dstlo8 = _mm_add_epi32(leftlo8,toplo8);
690
0
            __m128i dsthi8 = _mm_add_epi32(lefthi8,tophi8);
691
0
            dstlo8 = _mm_add_epi32(dstlo8,w32_8);
692
0
            dsthi8 = _mm_add_epi32(dsthi8,w32_8);
693
694
0
            dstlo8 =  _mm_srai_epi32(dstlo8,6);
695
0
            dsthi8 =  _mm_srai_epi32(dsthi8,6);
696
697
0
            dstlo8 =  _mm_packs_epi32(dstlo8,dsthi8);
698
0
            dstlo8 =  _mm_adds_epi16(dstlo8,x8dst);
699
700
0
            if (W>=8)
701
0
              _mm_storeu_si128(( __m128i * )(pDst+y*dstStride+x), (dstlo8) );
702
0
            else if (W==4)
703
0
              _vv_storel_epi64(( __m128i * )(pDst+y*dstStride+x), (dstlo8) );
704
0
            else if (W==2)
705
0
              _mm_storeu_si32(( __m128i * )(pDst+y*dstStride+x),(dstlo8) );
706
0
          }
707
0
        }
708
0
      }
709
0
      else //wT =0
710
0
      {
711
0
        for (int x = 0; x < std::min(iWidth,16); x+=8)
712
0
        {
713
0
          if (x==0)
714
0
            wl8=wl8start;
715
0
          else
716
0
            wl8=wl8start2;
717
718
0
          __m128i x8dst ;
719
720
0
          if ( W == 4 )
721
0
          {
722
0
            x8dst =  _mm_loadu_si64((const __m128i *) (pDst+y*dstStride+x)); // load dst
723
0
          }
724
0
          else if ( W == 2 )
725
0
          {
726
0
            x8dst =  _mm_loadu_si32((const __m128i *) (pDst+y*dstStride+x)); // load dst
727
0
          }
728
0
          else
729
0
          {
730
0
            x8dst =  _mm_loadu_si128((const __m128i *) (pDst+y*dstStride+x)); // load dst
731
0
          }
732
0
          tmphi8 = _mm_sub_epi16(x8left,x8dst);
733
0
          tmplo8 = _mm_mullo_epi16(tmphi8,wl8);  //wL * left-val
734
0
          tmphi8 = _mm_mulhi_epi16(tmphi8,wl8);  //wL * left-val
735
0
          __m128i leftlo8 = _mm_unpacklo_epi16(tmplo8,tmphi8);
736
0
          __m128i lefthi8 = _mm_unpackhi_epi16(tmplo8,tmphi8);
737
738
0
          __m128i dstlo8 = _mm_add_epi32(leftlo8,w32_8);
739
0
          __m128i dsthi8 = _mm_add_epi32(lefthi8,w32_8);
740
741
0
          dstlo8 =  _mm_srai_epi32(dstlo8,6);
742
0
          dsthi8 =  _mm_srai_epi32(dsthi8,6);
743
744
0
          dstlo8 =  _mm_packs_epi32(dstlo8,dsthi8);
745
0
          dstlo8 =  _mm_adds_epi16(dstlo8,x8dst);
746
747
0
          if (W>=8)
748
0
            _mm_storeu_si128(( __m128i * )(pDst+y*dstStride+x), (dstlo8) );
749
0
          else if (W==4)
750
0
            _vv_storel_epi64(( __m128i * )(pDst+y*dstStride+x), (dstlo8) );
751
0
          else if (W==2)
752
0
            _mm_storeu_si32(( __m128i * )(pDst+y*dstStride+x),(dstlo8) );
753
0
        }
754
0
      }
755
0
    }
756
0
  }
757
758
759
0
}
Unexecuted instantiation: void vvenc::IntraPredSampleFilter_SIMD<(vvenc::x86_simd::X86_VEXT)1, 16>(vvenc::AreaBuf<short>&, vvenc::AreaBuf<short const> const&)
Unexecuted instantiation: void vvenc::IntraPredSampleFilter_SIMD<(vvenc::x86_simd::X86_VEXT)1, 8>(vvenc::AreaBuf<short>&, vvenc::AreaBuf<short const> const&)
Unexecuted instantiation: void vvenc::IntraPredSampleFilter_SIMD<(vvenc::x86_simd::X86_VEXT)1, 4>(vvenc::AreaBuf<short>&, vvenc::AreaBuf<short const> const&)
Unexecuted instantiation: void vvenc::IntraPredSampleFilter_SIMD<(vvenc::x86_simd::X86_VEXT)1, 2>(vvenc::AreaBuf<short>&, vvenc::AreaBuf<short const> const&)
Unexecuted instantiation: void vvenc::IntraPredSampleFilter_SIMD<(vvenc::x86_simd::X86_VEXT)4, 16>(vvenc::AreaBuf<short>&, vvenc::AreaBuf<short const> const&)
Unexecuted instantiation: void vvenc::IntraPredSampleFilter_SIMD<(vvenc::x86_simd::X86_VEXT)4, 8>(vvenc::AreaBuf<short>&, vvenc::AreaBuf<short const> const&)
Unexecuted instantiation: void vvenc::IntraPredSampleFilter_SIMD<(vvenc::x86_simd::X86_VEXT)4, 4>(vvenc::AreaBuf<short>&, vvenc::AreaBuf<short const> const&)
Unexecuted instantiation: void vvenc::IntraPredSampleFilter_SIMD<(vvenc::x86_simd::X86_VEXT)4, 2>(vvenc::AreaBuf<short>&, vvenc::AreaBuf<short const> const&)
760
761
template< X86_VEXT vext >
762
void  IntraPredSampleFilter_SIMD(PelBuf& dstBuf, const CPelBuf& srcBuf)
763
0
{
764
0
  const int iWidth  = dstBuf.width;
765
766
0
  if (iWidth>8)
767
0
    IntraPredSampleFilter_SIMD<vext,16>(dstBuf, srcBuf);
768
0
  else  if (iWidth==8)
769
0
    IntraPredSampleFilter_SIMD<vext,8>(dstBuf, srcBuf);
770
0
  else  if (iWidth==4)
771
0
    IntraPredSampleFilter_SIMD<vext,4>(dstBuf, srcBuf);
772
0
  else
773
0
    IntraPredSampleFilter_SIMD<vext,2>(dstBuf, srcBuf);
774
775
#if USE_AVX2
776
  _mm256_zeroupper();
777
#endif
778
0
}
Unexecuted instantiation: void vvenc::IntraPredSampleFilter_SIMD<(vvenc::x86_simd::X86_VEXT)1>(vvenc::AreaBuf<short>&, vvenc::AreaBuf<short const> const&)
Unexecuted instantiation: void vvenc::IntraPredSampleFilter_SIMD<(vvenc::x86_simd::X86_VEXT)4>(vvenc::AreaBuf<short>&, vvenc::AreaBuf<short const> const&)
779
780
781
/** Function for deriving planar intra prediction. This function derives the prediction samples for planar mode (intra coding).
782
 */
783
template< X86_VEXT vext>
void xPredIntraPlanar_SIMD( PelBuf& pDst, const CPelBuf& pSrc)
{
  // SSE implementation of planar intra prediction, 8 output samples per
  // iteration. Each sample combines a horizontal interpolation
  // (leftColumn ... topRight) and a vertical interpolation
  // (topRow ... bottomLeft):
  //   pred[x] = ((horPred << log2H) + (vertPred << log2W) + offset) >> finalShift
  //
  // \param pDst  destination prediction block (width, height, stride, buf)
  // \param pSrc  reference-sample buffer; row 0 is read as the top reference
  //              row, row 1 as the left reference column (see at() calls
  //              below) -- NOTE(review): layout assumed from the indexing,
  //              confirm against the reference-sample builder.

  const uint32_t width  = pDst.width;
  const uint32_t height = pDst.height;
  const uint32_t log2W  = floorLog2(width);
  const uint32_t log2H  = floorLog2(height);
  const uint32_t finalShift = 1 + log2W + log2H;
  const uint32_t offset = 1 << (log2W + log2H);
  const ptrdiff_t stride     = pDst.stride;
  Pel*       pred       = pDst.buf;

  const Pel* ptrSrc =pSrc.buf;

  int leftColumn,rightColumn;
  Pel tmp;
  // top-right reference sample (one past the top row of the block)
  int topRight = pSrc.at( width + 1, 0 );

  // bottom-left reference sample, broadcast to all 8 lanes
  tmp=pSrc.at( height+1, 1 );
  const __m128i bottomLeft16 = _mm_set_epi16(tmp,tmp,tmp,tmp,tmp,tmp,tmp,tmp);
  const __m128i zero = _mm_xor_si128(bottomLeft16,bottomLeft16);
  const __m128i eight = _mm_set_epi16(8,8,8,8,8,8,8,8);
  const __m128i offset32 = _mm_set_epi32(offset,offset,offset,offset);
  // shift counts kept in vector registers for _mm_sll/_mm_srl (count from xmm)
  const __m128i vLog2W = _mm_cvtsi32_si128(log2W);
  const __m128i vLog2H = _mm_cvtsi32_si128(log2H);
  const __m128i vFinalShift = _mm_cvtsi32_si128(finalShift);


  for( int y = 0; y < height; y++)
  {
    // per-row scalars: left reference sample and its difference to top-right
    leftColumn=pSrc.at( y + 1, 1 );
    rightColumn = topRight - leftColumn;
    leftColumn  = leftColumn << log2W;
    __m128i leftColumn32 = _mm_set_epi32(leftColumn,leftColumn,leftColumn,leftColumn);
    __m128i rightcolumn16 = _mm_set_epi16(rightColumn,rightColumn,rightColumn,rightColumn,rightColumn,rightColumn,rightColumn,rightColumn);
    __m128i y16 = _mm_set_epi16(y+1,y+1,y+1,y+1,y+1,y+1,y+1,y+1);
    // x+1 factors for the first 8 columns; advanced by 8 at the loop bottom
    __m128i x16 = _mm_set_epi16(8,7,6,5,4,3,2,1);

    for( int x = 0; x < width; x+=8 )
    {
      //topRow[x] = pSrc.at( x + 1, 0 );
      __m128i topRow16 = _mm_loadu_si128 ((__m128i const *) (ptrSrc+(x+1)));
      //bottomRow[x] = bottomLeft - topRow[x];
      __m128i bottomRow16L = _mm_sub_epi16(bottomLeft16,topRow16);
      // (y+1)*bottomRow[x] -- 16x16 multiply widened to 32 bit via lo/hi halves
      __m128i  tmpH = _mm_mulhi_epi16(bottomRow16L,y16);
      __m128i tmpL = _mm_mullo_epi16(bottomRow16L,y16);
      bottomRow16L = _mm_unpacklo_epi16(tmpL,tmpH);
      __m128i bottomRow16H = _mm_unpackhi_epi16(tmpL,tmpH);

      // widen topRow to 32 bit and shift: topRow[x] << log2H
      __m128i topRow32L = _mm_unpacklo_epi16(topRow16,zero);
      __m128i topRow32H = _mm_unpackhi_epi16(topRow16,zero);
      topRow32L = _mm_sll_epi32(topRow32L,vLog2H);
      topRow32H = _mm_sll_epi32(topRow32H,vLog2H);
      // vertPred    = (topRow[x] << log2H) + (y+1)*bottomRow[x];
      topRow32L = _mm_add_epi32(topRow32L,bottomRow16L);
      topRow32H = _mm_add_epi32(topRow32H,bottomRow16H);
      // horPred = leftColumn + (x+1)*rightColumn;
      tmpL = _mm_mullo_epi16(rightcolumn16,x16);
      tmpH = _mm_mulhi_epi16(rightcolumn16,x16);
      __m128i horpred32L = _mm_unpacklo_epi16(tmpL,tmpH);
      __m128i horpred32H = _mm_unpackhi_epi16(tmpL,tmpH);
      horpred32L = _mm_add_epi32(leftColumn32,horpred32L);
      horpred32H = _mm_add_epi32(leftColumn32,horpred32H);
      // pred[x]      = ( ( horPred << log2H ) + ( vertPred << log2W ) + offset ) >> finalShift;
      horpred32L = _mm_sll_epi32(horpred32L,vLog2H);
      horpred32H = _mm_sll_epi32(horpred32H,vLog2H);
      topRow32L = _mm_sll_epi32(topRow32L,vLog2W);
      topRow32H = _mm_sll_epi32(topRow32H,vLog2W);
      horpred32L = _mm_add_epi32(horpred32L,topRow32L);
      horpred32H = _mm_add_epi32(horpred32H,topRow32H);
      horpred32L = _mm_add_epi32(horpred32L,offset32);
      horpred32H = _mm_add_epi32(horpred32H,offset32);
      horpred32L = _mm_srl_epi32(horpred32L,vFinalShift);
      horpred32H = _mm_srl_epi32(horpred32H,vFinalShift);

      // pack back to 16 bit and store; narrow blocks write only the valid prefix
      tmpL = _mm_packs_epi32(horpred32L,horpred32H);
      if (width>=8)
        _mm_storeu_si128(( __m128i * )(pred+y*stride+x), (tmpL) );
      else if (width==4)
        _vv_storel_epi64(( __m128i * )(pred+y*stride+x), (tmpL) );
      else if (width==2)
        _mm_storeu_si32(( __m128i * )(pred+y*stride+x),(tmpL) );
      else
        // width==1: scalar store of the single valid lane
        pred[y*stride+x]=(Pel)_mm_extract_epi16 (tmpL,0);

      // advance (x+1) factors to the next group of 8 columns
      x16 = _mm_add_epi16(x16,eight);
    }
  }
}
Unexecuted instantiation: void vvenc::xPredIntraPlanar_SIMD<(vvenc::x86_simd::X86_VEXT)1>(vvenc::AreaBuf<short>&, vvenc::AreaBuf<short const> const&)
Unexecuted instantiation: void vvenc::xPredIntraPlanar_SIMD<(vvenc::x86_simd::X86_VEXT)4>(vvenc::AreaBuf<short>&, vvenc::AreaBuf<short const> const&)
875
876
template< X86_VEXT vext>
// Downsamples reconstructed luma to chroma (4:2:0) resolution:
//   dst[x] = ( 2*src[2x] + src[2x-1] + src[2x+1]
//            + 2*srcBelow[2x] + srcBelow[2x-1] + srcBelow[2x+1] + 4 ) >> 3
// i.e. a {1,2,1; 1,2,1}/8 filter, decimating by 2 horizontally and by 2
// vertically (pRecSrc0 advances two rows per output row). Reads one sample
// left of pRecSrc0, so the caller must provide that margin.
// NOTE(review): guarded with '#ifdef USE_AVX2' while other functions in this
// file use '#if USE_AVX2' -- confirm both evaluate consistently in the build.
void GetLumaRecPixel420SIMD (const int width,const int height, const Pel* pRecSrc0,const ptrdiff_t iRecStride,Pel* pDst0,const ptrdiff_t iDstStride)
{
#ifdef USE_AVX2
  if( ( width & 15 ) == 0 )    // width>=16
  {
    __m256i vzero = _mm256_set1_epi8(0);
    __m256i vfour = _mm256_set1_epi32(4);
    for( int y = 0; y < height; y++ )
    {
      for( int x = 0; x < width; x += 16 )
      {
        int x2=x<<1;
        __m256i vsrc_l = _mm256_loadu_si256((__m256i*)&pRecSrc0[x2-1]);      // -1 0 1 2 3 4 5 6
        __m256i vsrc = _mm256_loadu_si256((__m256i*)&pRecSrc0[x2]);          // 0 1 2 3 4 5 6 7

        __m256i vsrc01 = _mm256_blend_epi16(vzero,vsrc_l,0x55);      // -1 1 3 5  32 bit (left neighbours)
        __m256i vsrc0 = _mm256_blend_epi16(vzero,vsrc,0x55);      // 0 2 4 6  32 bit (even samples)
        __m256i vsrc10 = _mm256_blend_epi16(vzero,vsrc,0xAA);      // 1 3 5 7 32 bit (right neighbours)
        vsrc10 = _mm256_srli_epi32(vsrc10,16);      // 1 3 5 7 32 bit
        vsrc0 =  _mm256_slli_epi32 (vsrc0,1);      // 0 2 4 6 *2 (centre weight)

        vsrc0 =  _mm256_add_epi32(vsrc0,vsrc10);
        __m256i vdst0 = _mm256_add_epi32(vsrc0,vsrc01);   // dst 0 1 2 3 32 bit, bottom row still missing

        vsrc_l = _mm256_loadu_si256((__m256i*)&pRecSrc0[x2 +15]);      // 7 8 9 10 11 12 13 14
        vsrc = _mm256_loadu_si256((__m256i*)&pRecSrc0[x2 + 16 ]);          // 8 9 10 11 12 13 14 15

        x2+= (int)iRecStride;

        vsrc01 = _mm256_blend_epi16(vzero,vsrc_l,0x55);
        vsrc0 = _mm256_blend_epi16(vzero,vsrc,0x55);
        vsrc10 = _mm256_blend_epi16(vzero,vsrc,0xAA);
        vsrc10 = _mm256_srli_epi32(vsrc10,16);      // 1 3 5 7 32 bit
        vsrc0 =  _mm256_slli_epi32 (vsrc0,1);      // 0 2 4 6 *2

        vsrc0 =  _mm256_add_epi32(vsrc0,vsrc10);
        __m256i vdst1 = _mm256_add_epi32(vsrc0,vsrc01);   // dst 4 5 6 7 32 bit, bottom row still missing

        // now add in the next (bottom) source row
        vsrc_l = _mm256_loadu_si256((__m256i*)&pRecSrc0[x2-1]);      // -1 0 1 2 3 4 5 6
        vsrc = _mm256_loadu_si256((__m256i*)&pRecSrc0[x2]);          // 0 1 2 3 4 5 6 7

        vsrc01 = _mm256_blend_epi16(vzero,vsrc_l,0x55);      // -1 1 3 5  32 bit
        vsrc0 = _mm256_blend_epi16(vzero,vsrc,0x55);      // 0 2 4 6  32 bit
        vsrc10 = _mm256_blend_epi16(vzero,vsrc,0xAA);      // 1 3 5 7 32 bit
        vsrc10 = _mm256_srli_epi32(vsrc10,16);      // 1 3 5 7 32 bit
        vsrc0 =  _mm256_slli_epi32 (vsrc0,1);      // 0 2 4 6 *2

        vsrc0 =  _mm256_add_epi32(vsrc0,vsrc10);
        __m256i vdst01 = _mm256_add_epi32(vsrc0,vsrc01);   // dst 0 1 2 3 32 bit, bottom row

        vsrc_l = _mm256_loadu_si256((__m256i*)&pRecSrc0[x2 + 15]);      // 7 8 9 10 11 12 13 14
        vsrc = _mm256_loadu_si256((__m256i*)&pRecSrc0[x2 + 16 ]);          // 8 9 10 11 12 13 14 15

        vsrc01 = _mm256_blend_epi16(vzero,vsrc_l,0x55);
        vsrc0 = _mm256_blend_epi16(vzero,vsrc,0x55);
        vsrc10 = _mm256_blend_epi16(vzero,vsrc,0xAA);
        vsrc10 = _mm256_srli_epi32(vsrc10,16);      // 1 3 5 7 32 bit
        vsrc0 =  _mm256_slli_epi32 (vsrc0,1);      // 0 2 4 6 *2

        vsrc0 =  _mm256_add_epi32(vsrc0,vsrc10);
        __m256i vdst11 = _mm256_add_epi32(vsrc0,vsrc01);   // dst 4 5 6 7 32 bit, bottom row

        // combine top+bottom rows, round (+4) and normalize (>>3)
        vdst0 = _mm256_add_epi32(vdst0,vdst01);
        vdst1 = _mm256_add_epi32(vdst1,vdst11);
        vdst0 =  _mm256_add_epi32(vdst0,vfour);
        vdst1 =  _mm256_add_epi32(vdst1,vfour);
        vdst0 = _mm256_srli_epi32(vdst0,3);
        vdst1 = _mm256_srli_epi32(vdst1,3);
        vdst0 = _mm256_packus_epi32 (vdst0,vdst1);   // 16 bit
        // fix the lane interleave introduced by packing two 256-bit vectors
        vdst0 = _mm256_permute4x64_epi64(vdst0,0xd8);

        _mm256_storeu_si256((__m256i*)&pDst0[x], vdst0);
        //        _mm_storeu_si128((__m128i*)&pDstTmp[x], vdst0);
      }
      pDst0 += iDstStride;
      pRecSrc0 += (iRecStride<<1);  // consume two luma rows per chroma row
    }
  }
  else
#endif
    if( ( width & 7 ) == 0 )    // width>=8
    {
      // SSE path: same filter, 8 output samples per iteration
      __m128i vzero = _mm_set1_epi8(0);
      __m128i vfour = _mm_set1_epi32(4);


      for( int y = 0; y < height; y++ )
      {

        for( int x = 0; x < width; x += 8 )
        {
          int x2=x<<1;
          __m128i vsrc_l = _mm_loadu_si128((__m128i*)&pRecSrc0[x2-1]);      // -1 0 1 2 3 4 5 6
          __m128i vsrc = _mm_loadu_si128((__m128i*)&pRecSrc0[x2]);          // 0 1 2 3 4 5 6 7

          __m128i vsrc01 = _mm_blend_epi16(vzero,vsrc_l,0x55);      // -1 1 3 5  32 bit
          __m128i vsrc0 = _mm_blend_epi16(vzero,vsrc,0x55);      // 0 2 4 6  32 bit
          __m128i vsrc10 = _mm_blend_epi16(vzero,vsrc,0xAA);      // 1 3 5 7 32 bit
          vsrc10 = _mm_srli_epi32(vsrc10,16);      // 1 3 5 7 32 bit
          vsrc0 =  _mm_slli_epi32 (vsrc0,1);      // 0 2 4 6 *2

          vsrc0 =  _mm_add_epi32(vsrc0,vsrc10);
          __m128i vdst0 = _mm_add_epi32(vsrc0,vsrc01);   // dst 0 1 2 3 32 bit, bottom row still missing

          vsrc_l = _mm_loadu_si128((__m128i*)&pRecSrc0[x2 +7]);      // 7 8 9 10 11 12 13 14
          vsrc = _mm_loadu_si128((__m128i*)&pRecSrc0[x2 + 8 ]);          // 8 9 10 11 12 13 14 15

          x2+=(int)iRecStride;

          vsrc01 = _mm_blend_epi16(vzero,vsrc_l,0x55);
          vsrc0 = _mm_blend_epi16(vzero,vsrc,0x55);
          vsrc10 = _mm_blend_epi16(vzero,vsrc,0xAA);
          vsrc10 = _mm_srli_epi32(vsrc10,16);      // 1 3 5 7 32 bit
          vsrc0 =  _mm_slli_epi32 (vsrc0,1);      // 0 2 4 6 *2

          vsrc0 =  _mm_add_epi32(vsrc0,vsrc10);
          __m128i vdst1 = _mm_add_epi32(vsrc0,vsrc01);   // dst 4 5 6 7 32 bit, bottom row still missing

          // now add in the next (bottom) source row
          vsrc_l = _mm_loadu_si128((__m128i*)&pRecSrc0[x2-1]);      // -1 0 1 2 3 4 5 6
          vsrc = _mm_loadu_si128((__m128i*)&pRecSrc0[x2]);          // 0 1 2 3 4 5 6 7

          vsrc01 = _mm_blend_epi16(vzero,vsrc_l,0x55);      // -1 1 3 5  32 bit
          vsrc0 = _mm_blend_epi16(vzero,vsrc,0x55);      // 0 2 4 6  32 bit
          vsrc10 = _mm_blend_epi16(vzero,vsrc,0xAA);      // 1 3 5 7 32 bit
          vsrc10 = _mm_srli_epi32(vsrc10,16);      // 1 3 5 7 32 bit
          vsrc0 =  _mm_slli_epi32 (vsrc0,1);      // 0 2 4 6 *2

          vsrc0 =  _mm_add_epi32(vsrc0,vsrc10);
          __m128i vdst01 = _mm_add_epi32(vsrc0,vsrc01);   // dst 0 1 2 3 32 bit, bottom row

          vsrc_l = _mm_loadu_si128((__m128i*)&pRecSrc0[x2 + 7]);      // 7 8 9 10 11 12 13 14
          vsrc = _mm_loadu_si128((__m128i*)&pRecSrc0[x2 + 8 ]);          // 8 9 10 11 12 13 14 15

          vsrc01 = _mm_blend_epi16(vzero,vsrc_l,0x55);
          vsrc0 = _mm_blend_epi16(vzero,vsrc,0x55);
          vsrc10 = _mm_blend_epi16(vzero,vsrc,0xAA);
          vsrc10 = _mm_srli_epi32(vsrc10,16);      // 1 3 5 7 32 bit
          vsrc0 =  _mm_slli_epi32 (vsrc0,1);      // 0 2 4 6 *2

          vsrc0 =  _mm_add_epi32(vsrc0,vsrc10);
          __m128i vdst11 = _mm_add_epi32(vsrc0,vsrc01);   // dst 4 5 6 7 32 bit, bottom row

          // combine top+bottom rows, round (+4) and normalize (>>3)
          vdst0 = _mm_add_epi32(vdst0,vdst01);
          vdst1 = _mm_add_epi32(vdst1,vdst11);
          vdst0 =  _mm_add_epi32(vdst0,vfour);
          vdst1 =  _mm_add_epi32(vdst1,vfour);
          vdst0 = _mm_srli_epi32(vdst0,3);
          vdst1 = _mm_srli_epi32(vdst1,3);
          vdst0 = _mm_packus_epi32 (vdst0,vdst1);   // 16 bit

          _mm_storeu_si128((__m128i*)&pDst0[x], vdst0);
          //        _mm_storeu_si128((__m128i*)&pDstTmp[x], vdst0);
        }
        pDst0 += iDstStride;
        pRecSrc0 += (iRecStride<<1);
      }
    }
    else     // width<=4
    {
      // narrow path: a single 8-sample load covers the whole row
      __m128i vzero = _mm_set1_epi8(0);
      __m128i vfour = _mm_set1_epi32(4);

      for( int y = 0; y < height; y++ )
      {
        __m128i vsrc_l = _mm_loadu_si128((__m128i*)&pRecSrc0[-1]);      // -1 0 1 2 3 4 5 6
        __m128i vsrc = _mm_loadu_si128((__m128i*)&pRecSrc0[0]);          // 0 1 2 3 4 5 6 7

        __m128i vsrc01 = _mm_blend_epi16(vzero,vsrc_l,0x55);      // -1 1 3 5  32 bit
        __m128i vsrc0 = _mm_blend_epi16(vzero,vsrc,0x55);      // 0 2 4 6  32 bit
        __m128i vsrc10 = _mm_blend_epi16(vzero,vsrc,0xAA);      // 1 3 5 7 32 bit
        vsrc10 = _mm_srli_epi32(vsrc10,16);      // 1 3 5 7 32 bit
        vsrc0 =  _mm_slli_epi32 (vsrc0,1);      // 0 2 4 6 *2

        vsrc0 =  _mm_add_epi32(vsrc0,vsrc10);
        __m128i vdst0 = _mm_add_epi32(vsrc0,vsrc01);   // dst 0 1 2 3 32 bit, bottom row still missing

        // now add in the next (bottom) source row
        vsrc_l = _mm_loadu_si128((__m128i*)&pRecSrc0[iRecStride-1]);      // -1 0 1 2 3 4 5 6
        vsrc = _mm_loadu_si128((__m128i*)&pRecSrc0[iRecStride]);          // 0 1 2 3 4 5 6 7

        vsrc01 = _mm_blend_epi16(vzero,vsrc_l,0x55);      // -1 1 3 5  32 bit
        vsrc0 = _mm_blend_epi16(vzero,vsrc,0x55);      // 0 2 4 6  32 bit
        vsrc10 = _mm_blend_epi16(vzero,vsrc,0xAA);      // 1 3 5 7 32 bit
        vsrc10 = _mm_srli_epi32(vsrc10,16);      // 1 3 5 7 32 bit
        vsrc0 =  _mm_slli_epi32 (vsrc0,1);      // 0 2 4 6 *2

        vsrc0 =  _mm_add_epi32(vsrc0,vsrc10);
        __m128i vdst01 = _mm_add_epi32(vsrc0,vsrc01);   // dst 0 1 2 3 32 bit, bottom row


        vdst0 = _mm_add_epi32(vdst0,vdst01);
        vdst0 =  _mm_add_epi32(vdst0,vfour);
        vdst0 = _mm_srli_epi32(vdst0,3);
        vdst0 = _mm_packus_epi32 (vdst0,vdst0);   // 16 bit

        // store only the valid prefix of the packed row
        if (width==4)
          _vv_storel_epi64(( __m128i * )&pDst0[0], (vdst0) );
        else if (width==2)
          _mm_storeu_si32(( __m128i * )&pDst0[0], (vdst0) );
        else
        {
          // width==1: scalar store of the lowest lane
          int tmp = _mm_cvtsi128_si32(vdst0);
          pDst0[0] = (Pel) tmp;
        }

        pDst0 += iDstStride;
        pRecSrc0 += (iRecStride<<1);
      }
    }
}
1089
1090
1091
template<X86_VEXT vext, int W >
// Applies the PDPC left-side correction to an angular-predicted block:
//   pDsty[x] += (wL * (left - pDsty[x]) + 32) >> 6,  wL = 32 >> (2*x >> scale)
// (see the scalar fallback in the non-AVX2 branch). The weight tables below
// bake the wL sequence for each scale into a vector constant; 'scale' is then
// reused as the count of columns with a non-zero weight. W selects the SIMD
// width specialization (16 -> AVX2, 8/4 -> SSE).
void IntraAnglePDPC_SIMD(Pel* pDsty,const int dstStride,Pel* refSide,const int width,const int height,int scale,int invAngle)
{  
  if (W>=16)
  {
#ifdef USE_AVX2
    // gather buffer for the projected left-reference samples of one row
    ALIGN_DATA( MEMORY_ALIGN_DEF_SIZE,short ref[16]);
    VALGRIND_MEMCLEAR( ref, sizeof( ref ) );

    //  Pel dummy[16];
    //  Pel* pdum=&dummy[0];
    //  int scaledum=scale;

    // per-column weights wL; zero lanes leave samples unchanged
    __m256i wl16; 
    if (scale==0)
    {
      wl16 = _mm256_set_epi16(0,0,0,0,0,0,0,0,0,0,0,0,0,2,8,32);
      scale=3;
    }
    else if (scale==1)
    {
      wl16 = _mm256_set_epi16(0,0,0,0,0,0,0,0,0,0,1,2,4,8,16,32);
      scale=6;
    }
    else
    {
      wl16 = _mm256_set_epi16(0,0,0,0,1,1,2,2,4,4,8,8,16,16,32,32);
      scale=12;
    }


    __m256i v32 = _mm256_set1_epi32(32);
    for (int y = 0; y<height; y++, pDsty += dstStride)
    {
      // project along the inverse angle into the side reference array
      int       invAngleSum = 256;
      for (int x = 0; x < scale; x++)
      {
        invAngleSum += invAngle;
        ref[x]=refSide[y + (invAngleSum >> 9) + 1];
      }
      __m256i xleft= _mm256_load_si256((__m256i*)&ref[0]);
      __m256i xdst= _mm256_loadu_si256((__m256i*)pDsty);
      // (left - dst) * wL, widened to 32 bit for the rounding shift
      __m256i xdstlo=_mm256_sub_epi16 (xleft,xdst);
      __m256i tmplo = _mm256_mullo_epi16(xdstlo,wl16);
      __m256i tmphi = _mm256_mulhi_epi16(xdstlo,wl16);
      xdstlo = _mm256_unpacklo_epi16(tmplo,tmphi);  //low
      tmphi = _mm256_unpackhi_epi16(tmplo,tmphi);  // high

      tmplo = _mm256_add_epi32(xdstlo,v32);
      tmphi = _mm256_add_epi32(tmphi,v32);
      tmplo = _mm256_srai_epi32(tmplo,6);
      tmphi = _mm256_srai_epi32(tmphi,6);

      tmplo =  _mm256_packs_epi32(tmplo,tmphi);
      // undo the 128-bit lane interleave of packs
      tmplo =  _mm256_permute4x64_epi64 ( tmplo, ( 0 << 0 ) + ( 1 << 2 ) + ( 2 << 4 ) + ( 3 << 6 ) );
      xdst = _mm256_add_epi16(tmplo,xdst);
      _mm256_storeu_si256( ( __m256i * )(pDsty), xdst );

    }
#else
    // scalar reference implementation of the same PDPC correction
    for (int y = 0; y<height; y++, pDsty += dstStride)
    {
      int       invAngleSum = 256;
      for (int x = 0; x < std::min(3 << scale, width); x++)
      {
        invAngleSum += invAngle;
        int wL   = 32 >> (2 * x >> scale);
        Pel left = refSide[y + (invAngleSum >> 9) + 1];
        pDsty[x] = pDsty[x] + ((wL * (left - pDsty[x]) + 32) >> 6);
      }
    }
#endif
  }
  else
  {
    // SSE path for W == 8 or 4
    ALIGN_DATA( MEMORY_ALIGN_DEF_SIZE,short ref[8]);
    VALGRIND_MEMCLEAR( ref, sizeof( ref ) );

    __m128i wl16; 
    if (scale==0)
    {
      wl16 = _mm_set_epi16(0,0,0,0,0,2,8,32);
      scale=3;
    }
    else if (scale==1)
    {
      wl16 = _mm_set_epi16(0,0,1,2,4,8,16,32);
      scale=6;
    }
    else
    {
      // scale>=2: only the first 8 of the 12 weighted columns fit in W<=8
      wl16 = _mm_set_epi16(4,4,8,8,16,16,32,32);
      scale=8;
    }

    int xlim=std::min(scale, width);

    __m128i v32 = _mm_set1_epi32(32);
    for (int y = 0; y<height; y++, pDsty += dstStride)
    {
      // project along the inverse angle into the side reference array
      int       invAngleSum = 256;
      for (int x = 0; x < xlim; x++)
      {
        invAngleSum += invAngle;
        ref[x]=refSide[y + (invAngleSum >> 9) + 1];
      }

      __m128i xleft;
      __m128i xdst;
      if (W==8)
      {
        xleft= _mm_load_si128((__m128i*)&ref[0]);
        xdst= _mm_loadu_si128((__m128i*)pDsty);
      }
      else
      {
        // W==4: load only 4 destination samples
        xleft= _mm_load_si128((__m128i*)&ref[0]);
        xdst= _mm_loadu_si64((__m128i*)pDsty);
      }
      // (left - dst) * wL, widened to 32 bit for the rounding shift
      __m128i xdstlo=_mm_sub_epi16 (xleft,xdst);
      __m128i tmplo = _mm_mullo_epi16(xdstlo,wl16);
      __m128i tmphi = _mm_mulhi_epi16(xdstlo,wl16);
      xdstlo = _mm_unpacklo_epi16(tmplo,tmphi);  //low
      tmphi = _mm_unpackhi_epi16(tmplo,tmphi);  // high

      tmplo = _mm_add_epi32(xdstlo,v32);
      tmphi = _mm_add_epi32(tmphi,v32);
      tmplo = _mm_srai_epi32(tmplo,6);
      tmphi = _mm_srai_epi32(tmphi,6);

      tmplo =  _mm_packs_epi32(tmplo,tmphi);
      xdst = _mm_add_epi16(tmplo,xdst);
      if (W==8)
        _mm_storeu_si128( ( __m128i * )(pDsty), xdst );
      else if (W==4)
        _vv_storel_epi64( ( __m128i * )(pDsty), xdst );
      else
      {
        // W==2 is handled by the scalar loop in the dispatcher
        THROW("wrong blocksize");
      }
    }
  }
}
Unexecuted instantiation: void vvenc::IntraAnglePDPC_SIMD<(vvenc::x86_simd::X86_VEXT)1, 16>(short*, int, short*, int, int, int, int)
Unexecuted instantiation: void vvenc::IntraAnglePDPC_SIMD<(vvenc::x86_simd::X86_VEXT)1, 8>(short*, int, short*, int, int, int, int)
Unexecuted instantiation: void vvenc::IntraAnglePDPC_SIMD<(vvenc::x86_simd::X86_VEXT)1, 4>(short*, int, short*, int, int, int, int)
Unexecuted instantiation: void vvenc::IntraAnglePDPC_SIMD<(vvenc::x86_simd::X86_VEXT)4, 16>(short*, int, short*, int, int, int, int)
Unexecuted instantiation: void vvenc::IntraAnglePDPC_SIMD<(vvenc::x86_simd::X86_VEXT)4, 8>(short*, int, short*, int, int, int, int)
Unexecuted instantiation: void vvenc::IntraAnglePDPC_SIMD<(vvenc::x86_simd::X86_VEXT)4, 4>(short*, int, short*, int, int, int, int)
1235
1236
template<X86_VEXT vext >
1237
void IntraAnglePDPC_SIMD(Pel* pDsty,const int dstStride,Pel* refSide,const int width,const int height,int scale,int invAngle)
1238
0
{  
1239
0
  if (width>=16)
1240
0
    IntraAnglePDPC_SIMD<vext,16>(pDsty,dstStride,refSide,width,height,scale,invAngle);
1241
0
  else if (width==8)
1242
0
    IntraAnglePDPC_SIMD<vext,8>(pDsty,dstStride,refSide,width,height,scale,invAngle);
1243
0
  else if (width==4)
1244
0
    IntraAnglePDPC_SIMD<vext,4>(pDsty,dstStride,refSide,width,height,scale,invAngle);
1245
0
  else
1246
0
    for (int y = 0; y<height; y++, pDsty += dstStride)
1247
0
    {
1248
0
      int       invAngleSum = 256;
1249
0
      for (int x = 0; x < 2; x++)
1250
0
      {
1251
0
        invAngleSum += invAngle;
1252
0
        int wL   = 32 >> (2 * x >> scale);
1253
0
        Pel left = refSide[y + (invAngleSum >> 9) + 1];
1254
0
        pDsty[x] = pDsty[x] + ((wL * (left - pDsty[x]) + 32) >> 6);
1255
0
      }
1256
0
    }
1257
#if USE_AVX2
1258
  _mm256_zeroupper();
1259
#endif
1260
0
}
Unexecuted instantiation: void vvenc::IntraAnglePDPC_SIMD<(vvenc::x86_simd::X86_VEXT)1>(short*, int, short*, int, int, int, int)
Unexecuted instantiation: void vvenc::IntraAnglePDPC_SIMD<(vvenc::x86_simd::X86_VEXT)4>(short*, int, short*, int, int, int, int)
1261
1262
template<X86_VEXT vext>
// Horizontal/vertical intra prediction with PDPC: each output row is a copy
// of refMain[1..width]; the leading columns additionally get the clipped
// correction  pDsty[x] = Clip(val + (wL*(left - topLeft) + 32) >> 6)  with
// wL = 32 >> (2*x >> scale) (see the scalar fallback in the non-AVX2 branch).
// The weight vectors below encode wL per column; zero-weight lanes reduce to
// a plain copy.
void IntraHorVerPDPC_SIMD(Pel* pDsty,const int dstStride,Pel* refSide,const int width,const int height,int scale,const Pel* refMain, const ClpRng& clpRng)
{
   const Pel topLeft = refMain[0];

   if (width>=16)
   {
#ifdef USE_AVX2
     __m256i v32 = _mm256_set1_epi32(32);
     // bit-depth clipping bounds
     __m256i vbdmin   = _mm256_set1_epi16( clpRng.min() );
     __m256i vbdmax   = _mm256_set1_epi16( clpRng.max() );

     // per-column PDPC weights wL for the first 16 columns
     __m256i wl16;
     if (scale==0)
     {
       wl16 = _mm256_set_epi16(0,0,0,0,0,0,0,0,0,0,0,0,0,2,8,32);
     }
     else if (scale==1)
     {
       wl16 = _mm256_set_epi16(0,0,0,0,0,0,0,0,0,0,1,2,4,8,16,32);
     }
     else
     {
       wl16 = _mm256_set_epi16(0,0,0,0,1,1,2,2,4,4,8,8,16,16,32,32);
     }
     __m256i xtopLeft = _mm256_set_epi16(topLeft,topLeft,topLeft,topLeft,topLeft,topLeft,topLeft,topLeft,topLeft,topLeft,topLeft,topLeft,topLeft,topLeft,topLeft,topLeft);

     for (int y = 0; y<height; y++, pDsty += dstStride)
     {
       // first 16 columns: weighted (left - topLeft) correction + clip
       const Pel left    = refSide[1 + y];
       __m256i xleft= _mm256_set_epi16(left,left,left,left,left,left,left,left,left,left,left,left,left,left,left,left);

       __m256i xdst= _mm256_loadu_si256((__m256i*)&refMain[1]);
       xleft = _mm256_sub_epi16(xleft,xtopLeft);

       // (left - topLeft) * wL widened to 32 bit for the rounding shift
       __m256i tmplo = _mm256_mullo_epi16(xleft,wl16);
       __m256i tmphi = _mm256_mulhi_epi16(xleft,wl16);
       xleft = _mm256_unpacklo_epi16(tmplo,tmphi);  //low
       tmphi = _mm256_unpackhi_epi16(tmplo,tmphi);  // high

       tmplo = _mm256_add_epi32(xleft,v32);
       tmphi = _mm256_add_epi32(tmphi,v32);
       tmplo = _mm256_srai_epi32(tmplo,6);
       tmphi = _mm256_srai_epi32(tmphi,6);

       tmplo =  _mm256_packs_epi32(tmplo,tmphi);
       // undo the 128-bit lane interleave of packs
       tmplo =  _mm256_permute4x64_epi64 ( tmplo, ( 0 << 0 ) + ( 1 << 2 ) + ( 2 << 4 ) + ( 3 << 6 ) );

       xdst = _mm256_adds_epi16(tmplo,xdst);
       xdst = _mm256_min_epi16( vbdmax, _mm256_max_epi16( vbdmin, xdst ) );
       _mm256_storeu_si256( ( __m256i * )(pDsty), xdst );

       // remaining columns are an unweighted copy of refMain
       for (int x = 16; x < width; x+=16)
       {
         __m256i xdst= _mm256_loadu_si256((__m256i*)&refMain[x+1]);
         _mm256_storeu_si256( ( __m256i * )(&pDsty[x]), xdst );
       }
     }
#else
     // scalar reference implementation: copy the row, then correct and clip
     // the first min(3 << scale, width) columns
     for( int y = 0; y < height; y++ )
     {
       memcpy(pDsty,&refMain[1],width*sizeof(Pel));
       const Pel left    = refSide[1 + y];
       for (int x = 0; x < std::min(3 << scale, width); x++)
       {
         const int wL  = 32 >> (2 * x >> scale);
         const Pel val = pDsty[x];
         pDsty[x]      = ClipPel(val + ((wL * (left - topLeft) + 32) >> 6), clpRng);
       }
       pDsty += dstStride;
     }
#endif
   }
   else      //width <= 8
   {
     // SSE path; one 8-lane vector covers the whole row
     __m128i vbdmin   = _mm_set1_epi16( clpRng.min() );
     __m128i vbdmax   = _mm_set1_epi16( clpRng.max() );
     __m128i wl16;

     if (scale==0)
     {
       wl16 = _mm_set_epi16(0,0,0,0,0,2,8,32);
     }
     else if (scale==1)
     {
       wl16 = _mm_set_epi16(0,0,1,2,4,8,16,32);
     }
     else
     {
       wl16 = _mm_set_epi16(4,4,8,8,16,16,32,32);
     }

     __m128i v32 = _mm_set1_epi32(32);
     __m128i xtopLeft = _mm_set_epi16(topLeft,topLeft,topLeft,topLeft,topLeft,topLeft,topLeft,topLeft);

     for (int y = 0; y<height; y++, pDsty += dstStride)
     {
       // weighted (left - topLeft) correction + clip, as in the AVX2 path
        const Pel left    = refSide[1 + y];
        __m128i xleft= _mm_set_epi16(left,left,left,left,left,left,left,left);

        __m128i xdst= _mm_loadu_si128((__m128i*)&refMain[1]);
        xleft = _mm_sub_epi16(xleft,xtopLeft);

        __m128i tmplo = _mm_mullo_epi16(xleft,wl16);
        __m128i tmphi = _mm_mulhi_epi16(xleft,wl16);
        xleft = _mm_unpacklo_epi16(tmplo,tmphi);  //low
        tmphi = _mm_unpackhi_epi16(tmplo,tmphi);  // high

        tmplo = _mm_add_epi32(xleft,v32);
        tmphi = _mm_add_epi32(tmphi,v32);
        tmplo = _mm_srai_epi32(tmplo,6);
        tmphi = _mm_srai_epi32(tmphi,6);

        tmplo =  _mm_packs_epi32(tmplo,tmphi);

        xdst = _mm_adds_epi16(tmplo,xdst);
        xdst = _mm_min_epi16( vbdmax, _mm_max_epi16( vbdmin, xdst ) );

        // store only the valid prefix of the row
        if (width==8)
           _mm_storeu_si128( ( __m128i * )(pDsty), xdst );
         else if (width==4)
           _vv_storel_epi64( ( __m128i * )(pDsty), xdst );
         else
         {
           _mm_storeu_si32( ( __m128i * )(pDsty), xdst );
         }
     }
   }
#if USE_AVX2
  _mm256_zeroupper();
#endif
}
Unexecuted instantiation: void vvenc::IntraHorVerPDPC_SIMD<(vvenc::x86_simd::X86_VEXT)1>(short*, int, short*, int, int, int, short const*, vvenc::ClpRng const&)
Unexecuted instantiation: void vvenc::IntraHorVerPDPC_SIMD<(vvenc::x86_simd::X86_VEXT)4>(short*, int, short*, int, int, int, short const*, vvenc::ClpRng const&)
1397
1398
1399
template<X86_VEXT vext>
1400
void IntraPrediction::_initIntraPredictionX86()
1401
0
{
1402
0
  IntraPredAngleLuma    = IntraPredAngleLumaCore_SIMD<vext>;
1403
0
  IntraPredAngleChroma  = IntraPredAngleChroma_SIMD<vext>;
1404
0
  IntraAnglePDPC        = IntraAnglePDPC_SIMD<vext>;
1405
0
  IntraHorVerPDPC       = IntraHorVerPDPC_SIMD<vext>;
1406
0
  IntraPredSampleFilter = IntraPredSampleFilter_SIMD<vext>;
1407
0
  xPredIntraPlanar      = xPredIntraPlanar_SIMD<vext>;
1408
0
}
Unexecuted instantiation: void vvenc::IntraPrediction::_initIntraPredictionX86<(vvenc::x86_simd::X86_VEXT)1>()
Unexecuted instantiation: void vvenc::IntraPrediction::_initIntraPredictionX86<(vvenc::x86_simd::X86_VEXT)4>()
1409
template void IntraPrediction::_initIntraPredictionX86<SIMDX86>();
1410
1411
} // namespace vvenc
1412
1413
//! \}
1414
1415
#endif // TARGET_SIMD_X86
1416
#endif
1417
//! \}