Coverage Report

Created: 2026-04-01 07:49

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/vvdec/source/Lib/CommonLib/x86/IntraPredX86.h
Line
Count
Source
1
/* -----------------------------------------------------------------------------
2
The copyright in this software is being made available under the Clear BSD
3
License, included below. No patent rights, trademark rights and/or 
4
other Intellectual Property Rights other than the copyrights concerning 
5
the Software are granted under this license.
6
7
The Clear BSD License
8
9
Copyright (c) 2018-2026, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVdeC Authors.
10
All rights reserved.
11
12
Redistribution and use in source and binary forms, with or without modification,
13
are permitted (subject to the limitations in the disclaimer below) provided that
14
the following conditions are met:
15
16
     * Redistributions of source code must retain the above copyright notice,
17
     this list of conditions and the following disclaimer.
18
19
     * Redistributions in binary form must reproduce the above copyright
20
     notice, this list of conditions and the following disclaimer in the
21
     documentation and/or other materials provided with the distribution.
22
23
     * Neither the name of the copyright holder nor the names of its
24
     contributors may be used to endorse or promote products derived from this
25
     software without specific prior written permission.
26
27
NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
28
THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
29
CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
30
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
31
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
32
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
33
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
34
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
35
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
36
IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38
POSSIBILITY OF SUCH DAMAGE.
39
40
41
------------------------------------------------------------------------------------------- */
42
43
/** \file     IntraPredX86.h
44
    \brief    SIMD for IntraPrediction
45
*/
46
47
#include "CommonLib/CommonDef.h"
48
#include "CommonDefX86.h"
49
#include "CommonLib/IntraPrediction.h"
50
51
namespace vvdec
52
{
53
54
#if ENABLE_SIMD_OPT_INTRAPRED
55
#ifdef TARGET_SIMD_X86
56
57
//#define USE_AVX2
58
template< X86_VEXT vext, int W >
59
void IntraPredAngleChroma_SIMD(int16_t* pDst,const ptrdiff_t dstStride,int16_t* pBorder,int width,int height,int deltaPos,int intraPredAngle)
60
0
{
61
0
  int deltaInt;
62
0
  int deltaFract;
63
0
  int refMainIndex;
64
65
0
  __m128i voffset = _mm_set1_epi16(16);
66
0
  if( W == 8 )
67
0
  {
68
0
    if( vext >= AVX2 )
69
0
    {
70
#ifdef USE_AVX2
71
0
      if (( width & 15 ) == 0 )
72
0
      {
73
0
       int deltaInt;
74
0
        int deltaFract;
75
0
        int refMainIndex;
76
77
        __m256i voffset = _mm256_set1_epi16(16);
78
0
        for (int k=0; k<height; k++) {
79
80
0
          deltaInt   = deltaPos >> 5;
81
0
          deltaFract = deltaPos & (32 - 1);
82
83
0
          __m256i vfract = _mm256_set1_epi16(deltaFract);
84
0
          __m256i v32minfract = _mm256_set1_epi16(32-deltaFract);
85
          // Do linear filtering
86
0
          for (int l=0; l<width; l+=16) {
87
0
            refMainIndex   = l+ deltaInt+1;
88
0
            __m256i vpred0 = _mm256_lddqu_si256((__m256i*)&pBorder[refMainIndex]);
89
0
            __m256i vpred1 = _mm256_lddqu_si256((__m256i*)&pBorder[refMainIndex+1]);
90
0
            vpred0 = _mm256_mullo_epi16(v32minfract, vpred0);
91
0
            vpred1 = _mm256_mullo_epi16(vfract, vpred1);
92
0
            __m256i vpred = _mm256_srli_epi16(_mm256_add_epi16(_mm256_add_epi16(vpred0, vpred1), voffset), 5);
93
0
            _mm256_storeu_si256((__m256i*)&pDst[l], vpred);
94
0
          }
95
0
          pDst+=dstStride;
96
0
          deltaPos += intraPredAngle;
97
0
        }
98
0
      }
99
0
      else // width==8
100
0
      {
101
0
        for (int k=0; k<height; k++)
102
0
        {
103
0
          deltaInt   = deltaPos >> 5;
104
0
          deltaFract = deltaPos & (32 - 1);
105
106
0
          __m128i vfract = _mm_set1_epi16(deltaFract);
107
0
          __m128i v32minfract = _mm_set1_epi16(32-deltaFract);
108
          // Do linear filtering
109
0
          for (int l=0; l<width; l+=8) {
110
0
            refMainIndex        = l+ deltaInt+1;
111
0
            __m128i vpred0 = _mm_lddqu_si128((__m128i*)&pBorder[refMainIndex]);
112
0
            __m128i vpred1 = _mm_lddqu_si128((__m128i*)&pBorder[refMainIndex+1]);
113
0
            vpred0 = _mm_mullo_epi16(v32minfract, vpred0);
114
0
            vpred1 = _mm_mullo_epi16(vfract, vpred1);
115
0
            __m128i vpred = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(vpred0, vpred1), voffset), 5);
116
0
            _mm_storeu_si128((__m128i*)&pDst[l], vpred);
117
0
          }
118
0
          deltaPos += intraPredAngle;
119
120
0
          pDst+=dstStride;
121
0
        }
122
123
0
      }
124
#endif //AVX2
125
0
    }  
126
0
    else
127
0
    {
128
0
      for (int k=0; k<height; k++) {
129
0
        deltaInt   = deltaPos >> 5;
130
0
        deltaFract = deltaPos & (32 - 1);
131
132
0
        __m128i vfract = _mm_set1_epi16(deltaFract);
133
0
        __m128i v32minfract = _mm_set1_epi16(32-deltaFract);
134
        // Do linear filtering
135
0
        for (int l=0; l<width; l+=8) {
136
0
          refMainIndex        = l+ deltaInt+1;
137
0
          __m128i vpred0 = _mm_loadu_si128((__m128i*)&pBorder[refMainIndex]);
138
0
          __m128i vpred1 = _mm_loadu_si128((__m128i*)&pBorder[refMainIndex+1]);
139
0
          vpred0 = _mm_mullo_epi16(v32minfract, vpred0);
140
0
          vpred1 = _mm_mullo_epi16(vfract, vpred1);
141
0
          __m128i vpred = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(vpred0, vpred1), voffset), 5);
142
0
          _mm_storeu_si128((__m128i*)&pDst[l], vpred);
143
0
        }
144
0
        deltaPos += intraPredAngle;
145
146
0
        pDst+=dstStride;
147
0
      }
148
0
    }
149
150
0
  }
151
0
  else if( W == 4 )
152
0
  {
153
0
    for (int k=0; k<height; k++) {
154
0
      deltaInt   = deltaPos >> 5;
155
0
      deltaFract = deltaPos & (32 - 1);
156
157
0
      __m128i vfract = _mm_set1_epi16(deltaFract);
158
0
      __m128i v32minfract = _mm_set1_epi16(32-deltaFract);
159
      // Do linear filtering
160
0
      refMainIndex        = deltaInt+1;
161
0
      __m128i vpred0 = _mm_loadu_si128((__m128i*)&pBorder[refMainIndex]);
162
0
      __m128i vpred1 = _mm_loadu_si128((__m128i*)&pBorder[refMainIndex+1]);
163
0
      vpred0 = _mm_mullo_epi16(v32minfract, vpred0);
164
0
      vpred1 = _mm_mullo_epi16(vfract, vpred1);
165
0
      __m128i vpred = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(vpred0, vpred1), voffset), 5);
166
0
      _mm_storeu_si64( ( __m128i * )(pDst ), vpred);
167
0
      deltaPos += intraPredAngle;
168
0
      pDst+=dstStride;
169
0
    }
170
0
  }
171
0
  else
172
0
  {
173
0
    THROW_FATAL( "Unsupported size in IntraPredAngleCore_SIMD" );
174
0
  }
175
#if USE_AVX2
176
177
0
  _mm256_zeroupper();
178
0
#endif
179
0
}
Unexecuted instantiation: void vvdec::IntraPredAngleChroma_SIMD<(vvdec::x86_simd::X86_VEXT)1, 4>(short*, long, short*, int, int, int, int)
Unexecuted instantiation: void vvdec::IntraPredAngleChroma_SIMD<(vvdec::x86_simd::X86_VEXT)1, 8>(short*, long, short*, int, int, int, int)
Unexecuted instantiation: void vvdec::IntraPredAngleChroma_SIMD<(vvdec::x86_simd::X86_VEXT)4, 4>(short*, long, short*, int, int, int, int)
Unexecuted instantiation: void vvdec::IntraPredAngleChroma_SIMD<(vvdec::x86_simd::X86_VEXT)4, 8>(short*, long, short*, int, int, int, int)
180
181
182
template< X86_VEXT vext, int W >
183
void IntraPredAngleCore_SIMD(int16_t* pDstBuf,const ptrdiff_t dstStride,int16_t* refMain,int width,int height,int deltaPos,int intraPredAngle,const TFilterCoeff *ff,const bool useCubicFilter,const ClpRng& clpRng)
184
0
{
185
0
  int16_t* pDst;
186
187
0
  if( W == 8 )
188
0
  {
189
0
    if( vext >= AVX2 )
190
0
    {
191
#ifdef USE_AVX2
192
      __m256i shflmask1= _mm256_set_epi8(0xd, 0xc, 0xb, 0xa, 0x9, 0x8, 0x7, 0x6,   0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4,
193
          0x9, 0x8, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2,   0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0 );
194
      __m256i offset = _mm256_set1_epi32( 32 );
195
196
0
      if (( width & 15 ) == 0 )
197
0
      {
198
0
        __m256i vbdmin,vbdmax;
199
200
0
        if (useCubicFilter)
201
0
        {
202
0
          vbdmin = _mm256_set1_epi16( clpRng.min() );
203
0
          vbdmax = _mm256_set1_epi16( clpRng.max() );
204
0
        }
205
206
0
        for (int y = 0; y<height; y++ )
207
0
        {
208
0
          int deltaInt   = deltaPos >> 5;
209
0
          int deltaFract = deltaPos & (32 - 1);
210
0
          int refMainIndex   = deltaInt + 1;
211
0
          pDst=&pDstBuf[y*dstStride];
212
0
          __m128i tmp = _mm_loadu_si64( ( __m128i const * )&ff[deltaFract<<2] );   //load 4 16 bit filter coeffs
213
0
          tmp = _mm_shuffle_epi32(tmp,0x44);
214
0
          __m256i coeff = _mm256_broadcastsi128_si256(tmp);
215
0
          for( int x = 0; x < width; x+=16)
216
0
          {
217
0
            __m256i src1 = _mm256_broadcastsi128_si256( _mm_loadu_si128( ( const __m128i* ) &refMain[refMainIndex     - 1] ) );
218
0
            __m256i src2 = _mm256_broadcastsi128_si256( _mm_loadu_si128( ( const __m128i* ) &refMain[refMainIndex + 4 - 1] ) );
219
0
            src1 = _mm256_shuffle_epi8(src1,shflmask1);                 // -1 0 1 2  0 1 2 3 1 2 3 4  2 3 4 5
220
0
            src2 = _mm256_shuffle_epi8(src2,shflmask1);                 // 3 4 5 6  4 5 6 7  5 6 7 8 6 7 8 9
221
222
0
            src1 = _mm256_madd_epi16 (src1, coeff);
223
0
            src2 = _mm256_madd_epi16 (src2, coeff);
224
225
0
            __m256i  sum  = _mm256_hadd_epi32( src1, src2 );
226
0
            sum = _mm256_permute4x64_epi64(sum,0xD8);
227
228
0
            sum = _mm256_add_epi32( sum, offset );
229
0
            sum = _mm256_srai_epi32( sum, 6 );
230
231
0
            refMainIndex+=8;
232
            
233
0
            src1 = _mm256_broadcastsi128_si256( _mm_loadu_si128( ( __m128i const* ) &refMain[refMainIndex     - 1] ) );
234
0
            src2 = _mm256_broadcastsi128_si256( _mm_loadu_si128( ( __m128i const* ) &refMain[refMainIndex + 4 - 1] ) );
235
236
0
            src1 = _mm256_shuffle_epi8(src1,shflmask1);                 // -1 0 1 2  0 1 2 3 1 2 3 4  2 3 4 5
237
0
            src2 = _mm256_shuffle_epi8(src2,shflmask1);                 // 3 4 5 6  4 5 6 7  5 6 7 8 6 7 8 9
238
0
            src1 = _mm256_madd_epi16 (src1, coeff);
239
0
            src2 = _mm256_madd_epi16 (src2, coeff);
240
241
0
            __m256i  sum1  = _mm256_hadd_epi32( src1, src2 );
242
0
            sum1 = _mm256_permute4x64_epi64(sum1,0xD8);
243
244
0
            sum1 = _mm256_add_epi32( sum1, offset );
245
0
            sum1 = _mm256_srai_epi32( sum1, 6 );
246
0
            __m256i
247
0
            src0 = _mm256_packs_epi32( sum, sum1 );
248
249
0
            src0 = _mm256_permute4x64_epi64(src0,0xD8);
250
251
0
            refMainIndex+=8;
252
253
0
            if (useCubicFilter)
254
0
              src0 = _mm256_min_epi16( vbdmax, _mm256_max_epi16( vbdmin, src0 ) );
255
256
0
            _mm256_storeu_si256( ( __m256i * )(pDst + x), src0);
257
0
          }
258
0
          deltaPos += intraPredAngle;
259
0
        }
260
0
      }
261
0
      else // width =8
262
0
      {
263
        //        printf("AVX2 Block %d \n",width);
264
0
        __m128i vbdmin,vbdmax;
265
266
0
        if (useCubicFilter)
267
0
        {
268
0
          vbdmin = _mm_set1_epi16( clpRng.min() );
269
0
          vbdmax = _mm_set1_epi16( clpRng.max() );
270
0
        }
271
272
0
        for (int y = 0; y<height; y++ )
273
0
        {
274
0
          int deltaInt   = deltaPos >> 5;
275
0
          int deltaFract = deltaPos & (32 - 1);
276
0
          int refMainIndex   = deltaInt + 1;
277
0
          pDst=&pDstBuf[y*dstStride];
278
0
          __m128i tmp = _mm_loadu_si64( ( __m128i const * )&ff[deltaFract<<2] );   //load 4 16 bit filter coeffs
279
0
          tmp = _mm_shuffle_epi32(tmp,0x44);
280
0
          __m256i coeff = _mm256_broadcastsi128_si256(tmp);
281
0
          __m256i src1 = _mm256_broadcastsi128_si256( _mm_loadu_si128( ( __m128i const* ) & refMain[refMainIndex - 1] ) );
282
0
          __m256i src2 = _mm256_broadcastsi128_si256( _mm_loadu_si128( ( __m128i const* ) & refMain[refMainIndex + 4 - 1] ) );
283
0
          src1 = _mm256_shuffle_epi8(src1,shflmask1);                 // -1 0 1 2  0 1 2 3 1 2 3 4  2 3 4 5
284
0
          src2 = _mm256_shuffle_epi8(src2,shflmask1);                 // 3 4 5 6  4 5 6 7  5 6 7 8 6 7 8 9
285
286
0
          src1 = _mm256_madd_epi16 (src1, coeff);
287
0
          src2 = _mm256_madd_epi16 (src2, coeff);
288
289
0
          __m256i  sum  = _mm256_hadd_epi32( src1, src2 );
290
0
          sum = _mm256_permute4x64_epi64(sum,0xD8);
291
292
0
          sum = _mm256_add_epi32( sum, offset );
293
0
          sum = _mm256_srai_epi32( sum, 6 );
294
0
          __m128i dest128 = _mm256_cvtepi32_epi16x( sum );
295
296
0
          if (useCubicFilter)
297
0
            dest128 = _mm_min_epi16( vbdmax, _mm_max_epi16( vbdmin, dest128 ) );
298
299
0
          _mm_storeu_si128( ( __m128i * )(pDst), dest128);
300
0
          deltaPos += intraPredAngle;
301
0
        }
302
0
      }
303
#endif
304
0
    }
305
0
    else
306
0
    {
307
0
      __m128i shflmask1= _mm_set_epi8( 0x9, 0x8, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2,   0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0 );
308
0
      __m128i shflmask2= _mm_set_epi8( 0xd, 0xc, 0xb, 0xa, 0x9, 0x8, 0x7, 0x6,   0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4 );
309
0
      __m128i vbdmin,vbdmax;
310
311
0
      __m128i offset = _mm_set1_epi32( 32 );
312
313
0
      if (useCubicFilter)
314
0
      {
315
0
        vbdmin = _mm_set1_epi16( clpRng.min() );
316
0
        vbdmax = _mm_set1_epi16( clpRng.max() );
317
0
      }
318
319
0
      for (int y = 0; y<height; y++ )
320
0
      {
321
0
        int deltaInt   = deltaPos >> 5;
322
0
        int deltaFract = deltaPos & (32 - 1);
323
0
        int refMainIndex   = deltaInt + 1;
324
0
        pDst=&pDstBuf[y*dstStride];
325
0
        __m128i coeff = _mm_loadu_si64( ( __m128i const * )&ff[deltaFract<<2] );   //load 4 16 bit filter coeffs
326
0
        coeff = _mm_shuffle_epi32(coeff,0x44);
327
0
        for( int x = 0; x < width; x+=8)
328
0
        {
329
0
          __m128i src0 = _mm_loadu_si128( ( __m128i const * )&refMain[refMainIndex - 1] );   //load 8 16 bit reference Pels   -1 0 1 2 3 4 5 6
330
0
          __m128i src1 = _mm_shuffle_epi8(src0,shflmask1);                  // -1 0 1 2  0 1 2 3
331
0
          __m128i src2 = _mm_shuffle_epi8(src0,shflmask2);                  // 1 2 3 4  2 3 4 5
332
0
          src0 = _mm_madd_epi16( coeff,src1 );
333
0
          src1 = _mm_madd_epi16( coeff,src2 );
334
0
          __m128i sum  = _mm_hadd_epi32( src0, src1 );
335
0
          sum = _mm_add_epi32( sum, offset );
336
0
          sum = _mm_srai_epi32( sum, 6 );
337
338
0
          refMainIndex+=4;
339
0
          src0 = _mm_loadu_si128( ( __m128i const * )&refMain[refMainIndex - 1] );   //load 8 16 bit reference Pels   -1 0 1 2 3 4 5 6
340
0
          src1 = _mm_shuffle_epi8(src0,shflmask1);                                // -1 0 1 2  0 1 2 3
341
0
          src2 = _mm_shuffle_epi8(src0,shflmask2);
342
343
          // 1 2 3 4  2 3 4 5
344
0
          src0 = _mm_madd_epi16( coeff,src1 );
345
0
          src1 = _mm_madd_epi16( coeff,src2 );
346
347
0
          __m128i sum1  = _mm_hadd_epi32( src0, src1 );
348
0
          sum1 = _mm_add_epi32( sum1, offset );
349
0
          sum1 = _mm_srai_epi32( sum1, 6 );
350
0
          src0 = _mm_packs_epi32( sum, sum1 );
351
352
0
          refMainIndex+=4;
353
354
0
          if (useCubicFilter)
355
0
            src0 = _mm_min_epi16( vbdmax, _mm_max_epi16( vbdmin, src0 ) );
356
357
0
          _mm_storeu_si128( ( __m128i * )(pDst + x), src0);
358
359
0
        }
360
0
        deltaPos += intraPredAngle;
361
0
      }
362
0
    }
363
0
  }
364
0
  else if( W == 4 )
365
0
  {
366
367
0
    __m128i shflmask1= _mm_set_epi8( 0x9, 0x8, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2,   0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0 );
368
0
    __m128i shflmask2= _mm_set_epi8( 0xd, 0xc, 0xb, 0xa, 0x9, 0x8, 0x7, 0x6,   0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4 );
369
0
    __m128i vbdmin,vbdmax;
370
371
0
    __m128i offset = _mm_set1_epi32( 32 );
372
373
0
    if (useCubicFilter)
374
0
    {
375
0
      vbdmin = _mm_set1_epi16( clpRng.min() );
376
0
      vbdmax = _mm_set1_epi16( clpRng.max() );
377
0
    }
378
379
0
    for (int y = 0; y<height; y++ )
380
0
    {
381
0
      int deltaInt   = deltaPos >> 5;
382
0
      int deltaFract = deltaPos & (32 - 1);
383
0
      int refMainIndex   = deltaInt + 1;
384
0
      pDst=&pDstBuf[y*dstStride];
385
0
      __m128i coeff = _mm_loadu_si64( ( __m128i const * )&ff[deltaFract<<2] );   //load 4 16 bit filter coeffs
386
0
      coeff = _mm_shuffle_epi32(coeff,0x44);
387
0
      {
388
0
        __m128i src0 = _mm_loadu_si128( ( __m128i const * )&refMain[refMainIndex - 1] );   //load 8 16 bit reference Pels   -1 0 1 2 3 4 5 6
389
0
        __m128i src1 = _mm_shuffle_epi8(src0,shflmask1);                  // -1 0 1 2  0 1 2 3
390
0
        __m128i src2 = _mm_shuffle_epi8(src0,shflmask2);                  // 1 2 3 4  2 3 4 5
391
0
        src0 = _mm_madd_epi16( coeff,src1 );
392
0
        src1 = _mm_madd_epi16( coeff,src2 );
393
0
        __m128i sum  = _mm_hadd_epi32( src0, src1 );
394
0
        sum = _mm_add_epi32( sum, offset );
395
0
        sum = _mm_srai_epi32( sum, 6 );
396
397
0
        src0 = _mm_packs_epi32( sum, sum );
398
399
0
        refMainIndex+=4;
400
401
0
        if (useCubicFilter)
402
0
          src0 = _mm_min_epi16( vbdmax, _mm_max_epi16( vbdmin, src0 ) );
403
404
0
        _mm_storeu_si64( ( __m128i * )(pDst ), src0);
405
406
0
      }
407
0
      deltaPos += intraPredAngle;
408
0
    }
409
0
  }
410
0
  else
411
0
  {
412
0
    THROW_FATAL( "Unsupported size in IntraPredAngleCore_SIMD" );
413
0
  }
414
#if USE_AVX2
415
0
  _mm256_zeroupper();
416
0
#endif
417
0
}
Unexecuted instantiation: void vvdec::IntraPredAngleCore_SIMD<(vvdec::x86_simd::X86_VEXT)1, 4>(short*, long, short*, int, int, int, int, short const*, bool, vvdec::ClpRngTemplate<short> const&)
Unexecuted instantiation: void vvdec::IntraPredAngleCore_SIMD<(vvdec::x86_simd::X86_VEXT)1, 8>(short*, long, short*, int, int, int, int, short const*, bool, vvdec::ClpRngTemplate<short> const&)
Unexecuted instantiation: void vvdec::IntraPredAngleCore_SIMD<(vvdec::x86_simd::X86_VEXT)4, 4>(short*, long, short*, int, int, int, int, short const*, bool, vvdec::ClpRngTemplate<short> const&)
Unexecuted instantiation: void vvdec::IntraPredAngleCore_SIMD<(vvdec::x86_simd::X86_VEXT)4, 8>(short*, long, short*, int, int, int, int, short const*, bool, vvdec::ClpRngTemplate<short> const&)
418
419
template< X86_VEXT vext, int W >
420
void  IntraPredSampleFilter_SIMD(Pel *ptrSrc,const ptrdiff_t srcStride,PelBuf &piPred,const uint32_t uiDirMode,const ClpRng& clpRng)
421
0
{
422
0
  const int       iWidth    = piPred.width;
423
0
  const int       iHeight   = piPred.height;
424
0
  PelBuf          dstBuf    = piPred;
425
0
  Pel*            pDst      = dstBuf.buf;
426
0
  const ptrdiff_t dstStride = dstBuf.stride;
427
428
0
  const int scale = ((getLog2(iWidth) - 2 + getLog2(iHeight) - 2 + 2) >> 2);
429
0
  CHECK(scale < 0 || scale > 31, "PDPC: scale < 0 || scale > 2");
430
431
#if USE_AVX2
432
0
  if( W > 8 )
433
0
  {
434
0
    __m256i tmplo,tmphi;
435
0
    __m256i w64 = _mm256_set_epi16(64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64);
436
0
    __m256i w32 = _mm256_set_epi32(32,32,32,32,32,32,32,32);
437
0
    __m256i vbdmin   = _mm256_set1_epi32( clpRng.min() );
438
0
    __m256i vbdmax   = _mm256_set1_epi32( clpRng.max() );
439
0
    __m256i wl16;
440
0
    __m256i wl16start;
441
442
0
    if (scale==0)
443
0
    {
444
0
      wl16start = _mm256_set_epi16(0,0,0,0,0,0,0,0,0,0,0,0,0,2,8,32);
445
0
    }
446
0
    else if (scale==1)
447
0
    {
448
0
      wl16start = _mm256_set_epi16(0,0,0,0,0,0,0,0,0,0,1,2,4,8,16,32);
449
0
    }
450
0
    else if (scale==2)
451
0
    {
452
0
      wl16start = _mm256_set_epi16(0,0,0,0,1,1,2,2,4,4,8,8,16,16,32,32);
453
0
    }
454
0
    else
455
0
    {
456
0
      THROW_FATAL( "Wrong scale (" << scale << ")" );
457
0
    }
458
459
460
0
    if (uiDirMode == PLANAR_IDX || uiDirMode == DC_IDX )
461
0
    {
462
0
      for (int y = 0; y < iHeight; y++)
463
0
      {
464
0
        int wT = 32 >> std::min(31, ((y << 1) >> scale));
465
466
0
        __m256i wt16 = _mm256_set_epi16(wT,wT,wT,wT,wT,wT,wT,wT,wT,wT,wT,wT,wT,wT,wT,wT);
467
0
        __m256i x16left = _mm256_broadcastw_epi16(_mm_loadu_si128 ((__m128i const *) (ptrSrc+((y+1)*srcStride))));
468
469
0
        if (wT)
470
0
        {
471
0
          for (int x = 0; x < iWidth; x+=16)
472
0
          {
473
0
            if (x==0)
474
0
            {
475
0
              wl16=wl16start;
476
477
0
              __m256i x16top = _mm256_loadu_si256((__m256i *) (ptrSrc+x+1)); // load top
478
0
              __m256i x16dst = _mm256_loadu_si256((const __m256i *) (pDst+y*dstStride+x)); // load dst
479
480
0
              tmplo = _mm256_mullo_epi16(x16left,wl16);  //wL * left
481
0
              tmphi = _mm256_mulhi_epi16(x16left,wl16);  //wL * left
482
0
              __m256i leftlo = _mm256_unpacklo_epi16(tmplo,tmphi);
483
0
              __m256i lefthi = _mm256_unpackhi_epi16(tmplo,tmphi);
484
485
0
              tmplo = _mm256_mullo_epi16(x16top,wt16);    // wT*top
486
0
              tmphi = _mm256_mulhi_epi16(x16top,wt16);    // wT*top
487
0
              __m256i toplo = _mm256_unpacklo_epi16(tmplo,tmphi);
488
0
              __m256i tophi = _mm256_unpackhi_epi16(tmplo,tmphi);
489
490
0
              __m256i wX = _mm256_sub_epi16(w64,wl16);
491
0
              wX = _mm256_sub_epi16(wX,wt16);            // 64-wL-wT
492
0
              tmplo = _mm256_mullo_epi16(x16dst,wX);    // 64-wL-wT*dst
493
0
              tmphi = _mm256_mulhi_epi16(x16dst,wX);    // 64-wL-wT*dst
494
0
              __m256i dstlo = _mm256_unpacklo_epi16(tmplo,tmphi);
495
0
              __m256i dsthi = _mm256_unpackhi_epi16(tmplo,tmphi);
496
497
0
              dstlo = _mm256_add_epi32(dstlo,toplo);
498
0
              dsthi = _mm256_add_epi32(dsthi,tophi);
499
0
              dstlo = _mm256_add_epi32(dstlo,leftlo);
500
0
              dsthi = _mm256_add_epi32(dsthi,lefthi);
501
0
              dstlo = _mm256_add_epi32(dstlo,w32);
502
0
              dsthi = _mm256_add_epi32(dsthi,w32);
503
504
0
              dstlo =  _mm256_srai_epi32(dstlo,6);
505
0
              dsthi =  _mm256_srai_epi32(dsthi,6);
506
507
0
              dstlo =  _mm256_max_epi32(vbdmin,dstlo);
508
0
              dsthi =  _mm256_max_epi32(vbdmin,dsthi);
509
0
              dstlo =  _mm256_min_epi32(vbdmax,dstlo);
510
0
              dsthi =  _mm256_min_epi32(vbdmax,dsthi);
511
512
0
              dstlo =  _mm256_packs_epi32(dstlo,dsthi);
513
0
              dstlo =  _mm256_permute4x64_epi64 ( dstlo, ( 0 << 0 ) + ( 1 << 2 ) + ( 2 << 4 ) + ( 3 << 6 ) );
514
515
0
              _mm256_storeu_si256( ( __m256i * )(pDst+y*dstStride+x), dstlo );
516
0
            }
517
0
            else
518
0
            {
519
520
0
              __m256i x16top = _mm256_loadu_si256((__m256i *) (ptrSrc+x+1)); // load top
521
0
              __m256i x16dst = _mm256_loadu_si256((const __m256i *) (pDst+y*dstStride+x)); // load dst
522
523
524
0
              tmplo = _mm256_mullo_epi16(x16top,wt16);    // wT*top
525
0
              tmphi = _mm256_mulhi_epi16(x16top,wt16);    // wT*top
526
0
              __m256i toplo = _mm256_unpacklo_epi16(tmplo,tmphi);
527
0
              __m256i tophi = _mm256_unpackhi_epi16(tmplo,tmphi);
528
529
0
              __m256i wX = _mm256_sub_epi16(w64,wt16);
530
0
              tmplo = _mm256_mullo_epi16(x16dst,wX);    // 64-wL-wT*dst
531
0
              tmphi = _mm256_mulhi_epi16(x16dst,wX);    // 64-wL-wT*dst
532
0
              __m256i dstlo = _mm256_unpacklo_epi16(tmplo,tmphi);
533
0
              __m256i dsthi = _mm256_unpackhi_epi16(tmplo,tmphi);
534
535
0
              dstlo = _mm256_add_epi32(dstlo,toplo);
536
0
              dsthi = _mm256_add_epi32(dsthi,tophi);
537
0
              dstlo = _mm256_add_epi32(dstlo,w32);
538
0
              dsthi = _mm256_add_epi32(dsthi,w32);
539
540
0
              dstlo =  _mm256_srai_epi32(dstlo,6);
541
0
              dsthi =  _mm256_srai_epi32(dsthi,6);
542
543
0
              dstlo =  _mm256_max_epi32(vbdmin,dstlo);
544
0
              dsthi =  _mm256_max_epi32(vbdmin,dsthi);
545
0
              dstlo =  _mm256_min_epi32(vbdmax,dstlo);
546
0
              dsthi =  _mm256_min_epi32(vbdmax,dsthi);
547
548
0
              dstlo =  _mm256_packs_epi32(dstlo,dsthi);
549
0
              dstlo =  _mm256_permute4x64_epi64 ( dstlo, ( 0 << 0 ) + ( 1 << 2 ) + ( 2 << 4 ) + ( 3 << 6 ) );
550
551
0
              _mm256_storeu_si256( ( __m256i * )(pDst+y*dstStride+x), dstlo );
552
0
            }
553
554
0
          }  // for
555
0
        }
556
0
        else
557
0
        { // wT =0
558
559
0
          wl16=wl16start;
560
0
          __m256i x16dst = _mm256_loadu_si256((const __m256i *) (pDst+y*dstStride)); // load dst
561
562
0
          tmplo = _mm256_mullo_epi16(x16left,wl16);  //wL * left
563
0
          tmphi = _mm256_mulhi_epi16(x16left,wl16);  //wL * left
564
0
          __m256i leftlo = _mm256_unpacklo_epi16(tmplo,tmphi);
565
0
          __m256i lefthi = _mm256_unpackhi_epi16(tmplo,tmphi);
566
567
0
          __m256i wX = _mm256_sub_epi16(w64,wl16);
568
0
          tmplo = _mm256_mullo_epi16(x16dst,wX);    // 64-wL-wT*dst
569
0
          tmphi = _mm256_mulhi_epi16(x16dst,wX);    // 64-wL-wT*dst
570
0
          __m256i dstlo = _mm256_unpacklo_epi16(tmplo,tmphi);
571
0
          __m256i dsthi = _mm256_unpackhi_epi16(tmplo,tmphi);
572
573
0
          dstlo = _mm256_add_epi32(dstlo,leftlo);
574
0
          dsthi = _mm256_add_epi32(dsthi,lefthi);
575
0
          dstlo = _mm256_add_epi32(dstlo,w32);
576
0
          dsthi = _mm256_add_epi32(dsthi,w32);
577
578
0
          dstlo =  _mm256_srai_epi32(dstlo,6);
579
0
          dsthi =  _mm256_srai_epi32(dsthi,6);
580
581
0
          dstlo =  _mm256_max_epi32(vbdmin,dstlo);
582
0
          dsthi =  _mm256_max_epi32(vbdmin,dsthi);
583
0
          dstlo =  _mm256_min_epi32(vbdmax,dstlo);
584
0
          dsthi =  _mm256_min_epi32(vbdmax,dsthi);
585
586
0
          dstlo =  _mm256_packs_epi32(dstlo,dsthi);
587
0
          dstlo =  _mm256_permute4x64_epi64 ( dstlo, ( 0 << 0 ) + ( 1 << 2 ) + ( 2 << 4 ) + ( 3 << 6 ) );
588
589
0
          _mm256_storeu_si256( ( __m256i * )(pDst+y*dstStride), dstlo );
590
0
        }
591
0
      }
592
0
    }
593
0
  }
594
0
  else
595
0
#endif
596
0
  {
597
0
    __m128i tmplo8,tmphi8;
598
0
    __m128i w64_8 = _mm_set_epi16(64,64,64,64,64,64,64,64);
599
0
    __m128i w32_8 = _mm_set_epi32(32,32,32,32);
600
0
    __m128i vbdmin8   = _mm_set1_epi32( clpRng.min() );
601
0
    __m128i vbdmax8   = _mm_set1_epi32( clpRng.max() );
602
0
    __m128i wl8start,wl8start2;
603
0
    CHECK(scale < 0 || scale > 2, "PDPC: scale < 0 || scale > 2");
604
605
0
    if (scale==0)
606
0
    {
607
0
      wl8start = _mm_set_epi16(0,0,0,0,0,2,8,32);
608
0
      wl8start2 = _mm_set_epi16(0,0,0,0,0,0,0,0);
609
0
    }
610
0
    else if (scale==1)
611
0
    {
612
0
      wl8start = _mm_set_epi16(0,0,1,2,4,8,16,32);
613
0
      wl8start2 = _mm_set_epi16(0,0,0,0,0,0,0,0);
614
0
    }
615
0
    else if (scale==2)
616
0
    {
617
0
      wl8start = _mm_set_epi16(4,4,8,8,16,16,32,32);
618
0
      wl8start2 = _mm_set_epi16(0,0,0,0,1,1,2,2);
619
0
    }
620
0
    if (uiDirMode == PLANAR_IDX || uiDirMode == DC_IDX )
621
0
    {
622
0
      __m128i wl8 = wl8start;
623
0
      for (int y = 0; y < iHeight; y++)
624
0
      {
625
0
        int wT = 32 >> std::min(31, ((y << 1) >> scale));
626
627
0
        __m128i wt8 = _mm_set_epi16(wT,wT,wT,wT,wT,wT,wT,wT);
628
 //       __m128i x8left = _mm_broadcastw_epi16(_mm_loadu_si128 ((__m128i const *) (ptrSrc+((y+1)*srcStride))));
629
630
0
        __m128i x8left = _mm_loadu_si128 ((__m128i const *) (ptrSrc+((y+1)*srcStride)));
631
0
        x8left =_mm_shufflelo_epi16(x8left,0);
632
0
        x8left =_mm_shuffle_epi32(x8left,0);
633
634
635
0
        if (wT)
636
0
        {
637
0
          for (int x = 0; x < iWidth; x+=8)
638
0
          {
639
0
            __m128i x8top = _mm_loadu_si128( (__m128i*) ( ptrSrc + x + 1 ) );   // load top
640
0
            __m128i x8dst = _mm_setzero_si128();
641
0
            if( iWidth >= 8 )
642
0
              x8dst = _mm_loadu_si128( (const __m128i*) ( pDst + y * dstStride + x ) );   // load dst
643
0
            else if( iWidth == 4 )
644
0
              x8dst = _mm_loadu_si64( (const __m128i*) ( pDst + y * dstStride + x ) );    // load dst
645
0
            else if( iWidth == 2 )
646
0
              x8dst = _mm_loadu_si32( (const __m128i*) ( pDst + y * dstStride + x ) );    // load dst
647
0
            else
648
0
            {
649
0
              CHECKD( true, "wrong iWidth in IntraPredSampleFilter_SIMD, only implemented for >=8, ==4, ==2" );
650
0
            }
651
652
0
            if (x>8)
653
0
            {
654
0
              tmplo8 = _mm_mullo_epi16(x8top,wt8);    // wT*top
655
0
              tmphi8 = _mm_mulhi_epi16(x8top,wt8);    // wT*top
656
0
              __m128i toplo8 = _mm_unpacklo_epi16(tmplo8,tmphi8);
657
0
              __m128i tophi8 = _mm_unpackhi_epi16(tmplo8,tmphi8);
658
659
660
0
              __m128i wX = _mm_sub_epi16(w64_8,wt8);
661
0
              tmplo8 = _mm_mullo_epi16(x8dst,wX);    // 64-wL-wT*dst
662
0
              tmphi8 = _mm_mulhi_epi16(x8dst,wX);    // 64-wL-wT*dst
663
0
              __m128i dstlo8 = _mm_unpacklo_epi16(tmplo8,tmphi8);
664
0
              __m128i dsthi8 = _mm_unpackhi_epi16(tmplo8,tmphi8);
665
666
0
              dstlo8 = _mm_add_epi32(dstlo8,toplo8);
667
0
              dsthi8 = _mm_add_epi32(dsthi8,tophi8);
668
0
              dstlo8 = _mm_add_epi32(dstlo8,w32_8);
669
0
              dsthi8 = _mm_add_epi32(dsthi8,w32_8);
670
671
0
              dstlo8 =  _mm_srai_epi32(dstlo8,6);
672
0
              dsthi8 =  _mm_srai_epi32(dsthi8,6);
673
674
0
              dstlo8 =  _mm_max_epi32(vbdmin8,dstlo8);
675
0
              dsthi8 =  _mm_max_epi32(vbdmin8,dsthi8);
676
0
              dstlo8 =  _mm_min_epi32(vbdmax8,dstlo8);
677
0
              dsthi8 =  _mm_min_epi32(vbdmax8,dsthi8);
678
679
0
              x8dst = _mm_packs_epi32(dstlo8,dsthi8);
680
0
            }
681
0
            else // x<=8
682
0
            {
683
0
              if (x==0)
684
0
                wl8=wl8start;
685
0
              else if (x==8)
686
0
                wl8=wl8start2;
687
688
0
              tmplo8 = _mm_mullo_epi16(x8left,wl8);  //wL * left
689
0
              tmphi8 = _mm_mulhi_epi16(x8left,wl8);  //wL * left
690
0
              __m128i leftlo8 = _mm_unpacklo_epi16(tmplo8,tmphi8);
691
0
              __m128i lefthi8 = _mm_unpackhi_epi16(tmplo8,tmphi8);
692
693
0
              tmplo8 = _mm_mullo_epi16(x8top,wt8);    // wT*top
694
0
              tmphi8 = _mm_mulhi_epi16(x8top,wt8);    // wT*top
695
0
              __m128i toplo8 = _mm_unpacklo_epi16(tmplo8,tmphi8);
696
0
              __m128i tophi8 = _mm_unpackhi_epi16(tmplo8,tmphi8);
697
698
0
              __m128i wX = _mm_sub_epi16(w64_8,wl8);
699
0
              wX = _mm_sub_epi16(wX,wt8);            // 64-wL-wT
700
0
              tmplo8 = _mm_mullo_epi16(x8dst,wX);    // 64-wL-wT*dst
701
0
              tmphi8 = _mm_mulhi_epi16(x8dst,wX);    // 64-wL-wT*dst
702
0
              __m128i dstlo8 = _mm_unpacklo_epi16(tmplo8,tmphi8);
703
0
              __m128i dsthi8 = _mm_unpackhi_epi16(tmplo8,tmphi8);
704
705
0
              dstlo8 = _mm_add_epi32(dstlo8,toplo8);
706
0
              dsthi8 = _mm_add_epi32(dsthi8,tophi8);
707
0
              dstlo8 = _mm_add_epi32(dstlo8,leftlo8);
708
0
              dsthi8 = _mm_add_epi32(dsthi8,lefthi8);
709
0
              dstlo8 = _mm_add_epi32(dstlo8,w32_8);
710
0
              dsthi8 = _mm_add_epi32(dsthi8,w32_8);
711
712
0
              dstlo8 =  _mm_srai_epi32(dstlo8,6);
713
0
              dsthi8 =  _mm_srai_epi32(dsthi8,6);
714
715
0
              dstlo8 =  _mm_max_epi32(vbdmin8,dstlo8);
716
0
              dsthi8 =  _mm_max_epi32(vbdmin8,dsthi8);
717
0
              dstlo8 =  _mm_min_epi32(vbdmax8,dstlo8);
718
0
              dsthi8 =  _mm_min_epi32(vbdmax8,dsthi8);
719
720
0
              x8dst = _mm_packs_epi32(dstlo8,dsthi8);
721
0
            }
722
723
0
            if( iWidth >= 8 )
724
0
              _mm_storeu_si128( (__m128i*) ( pDst + y * dstStride + x ), x8dst );
725
0
            else if( iWidth == 4 )
726
0
              _mm_storeu_si64( (__m128i*) ( pDst + y * dstStride + x ), x8dst );
727
0
            else if( iWidth == 2 )
728
0
              _mm_storeu_si32( (__m128i*) ( pDst + y * dstStride + x ), x8dst );
729
0
          }
730
0
        }
731
0
        else //wT =0
732
0
        {
733
0
          for( int x = 0; x < std::min( iWidth, 16 ); x += 8 )
734
0
          {
735
0
            if( x == 0 )
736
0
              wl8 = wl8start;
737
0
            else if( x == 8 )
738
0
              wl8 = wl8start2;
739
740
0
            __m128i x8dst;
741
0
            if( iWidth >= 8 )
742
0
              x8dst = _mm_loadu_si128( (const __m128i*)( pDst + y * dstStride + x ) );   // load dst
743
0
            else if( iWidth == 4 )
744
0
              x8dst = _mm_loadu_si64( (const __m128i*)( pDst + y * dstStride + x ) );    // load dst
745
0
            else if( iWidth == 2 )
746
0
              x8dst = _mm_loadu_si32( (const __m128i*)( pDst + y * dstStride + x ) );    // load dst
747
0
            else
748
0
              CHECK( true, "wrong iWidth in IntraPredSampleFilter_SIMD, only implemented for >=8, ==4, ==2" );
749
750
751
0
            tmplo8          = _mm_mullo_epi16( x8left, wl8 );   // wL * left
752
0
            tmphi8          = _mm_mulhi_epi16( x8left, wl8 );   // wL * left
753
0
            __m128i leftlo8 = _mm_unpacklo_epi16( tmplo8, tmphi8 );
754
0
            __m128i lefthi8 = _mm_unpackhi_epi16( tmplo8, tmphi8 );
755
756
0
            __m128i wX     = _mm_sub_epi16( w64_8, wl8 );
757
0
            tmplo8         = _mm_mullo_epi16( x8dst, wX );   // 64-wL-wT*dst
758
0
            tmphi8         = _mm_mulhi_epi16( x8dst, wX );   // 64-wL-wT*dst
759
0
            __m128i dstlo8 = _mm_unpacklo_epi16( tmplo8, tmphi8 );
760
0
            __m128i dsthi8 = _mm_unpackhi_epi16( tmplo8, tmphi8 );
761
762
0
            dstlo8 = _mm_add_epi32( dstlo8, leftlo8 );
763
0
            dsthi8 = _mm_add_epi32( dsthi8, lefthi8 );
764
0
            dstlo8 = _mm_add_epi32( dstlo8, w32_8 );
765
0
            dsthi8 = _mm_add_epi32( dsthi8, w32_8 );
766
767
0
            dstlo8 = _mm_srai_epi32( dstlo8, 6 );
768
0
            dsthi8 = _mm_srai_epi32( dsthi8, 6 );
769
770
0
            dstlo8 = _mm_max_epi32( vbdmin8, dstlo8 );
771
0
            dsthi8 = _mm_max_epi32( vbdmin8, dsthi8 );
772
0
            dstlo8 = _mm_min_epi32( vbdmax8, dstlo8 );
773
0
            dsthi8 = _mm_min_epi32( vbdmax8, dsthi8 );
774
775
0
            dstlo8 = _mm_packs_epi32( dstlo8, dsthi8 );
776
777
0
            if( iWidth >= 8 )
778
0
              _mm_storeu_si128( (__m128i*)( pDst + y * dstStride + x ), dstlo8 );
779
0
            else if( iWidth == 4 )
780
0
              _mm_storeu_si64( (__m128i*)( pDst + y * dstStride + x ), ( dstlo8 ) );
781
0
            else if( iWidth == 2 )
782
0
              _mm_storeu_si32( (__m128i*)( pDst + y * dstStride + x ), dstlo8 );
783
0
          }
784
0
        }
785
0
      }
786
0
    }
787
0
  }
788
789
790
#if USE_AVX2
791
0
  _mm256_zeroupper();
792
0
#endif
793
0
}
Unexecuted instantiation: void vvdec::IntraPredSampleFilter_SIMD<(vvdec::x86_simd::X86_VEXT)1, 8>(short*, long, vvdec::AreaBuf<short>&, unsigned int, vvdec::ClpRngTemplate<short> const&)
Unexecuted instantiation: void vvdec::IntraPredSampleFilter_SIMD<(vvdec::x86_simd::X86_VEXT)1, 16>(short*, long, vvdec::AreaBuf<short>&, unsigned int, vvdec::ClpRngTemplate<short> const&)
Unexecuted instantiation: void vvdec::IntraPredSampleFilter_SIMD<(vvdec::x86_simd::X86_VEXT)4, 8>(short*, long, vvdec::AreaBuf<short>&, unsigned int, vvdec::ClpRngTemplate<short> const&)
Unexecuted instantiation: void vvdec::IntraPredSampleFilter_SIMD<(vvdec::x86_simd::X86_VEXT)4, 16>(short*, long, vvdec::AreaBuf<short>&, unsigned int, vvdec::ClpRngTemplate<short> const&)
794
795
/** Function for deriving planar intra prediction. This function derives the prediction samples for planar mode (intra coding).
 *
 *  SSE implementation, 8 output samples per inner iteration. Each sample is the
 *  normalized sum of a horizontal and a vertical linear interpolation:
 *      horPred  = (leftColumn << log2W) + (x+1) * (topRight  - leftColumn)
 *      vertPred = (topRow     << log2H) + (y+1) * (bottomLeft - topRow)
 *      pred[x]  = ( (horPred << log2H) + (vertPred << log2W) + offset ) >> finalShift
 *
 *  \param pSrc  reference samples; row 0 holds the above neighbours, column 0 the left neighbours
 *  \param pDst  destination prediction buffer (width/height/stride are taken from here)
 *  \param sps   sequence parameter set (not used by this implementation)
 */
template< X86_VEXT vext>
void xPredIntraPlanar_SIMD( const CPelBuf &pSrc, PelBuf &pDst, const SPS& sps )
{

  const uint32_t width      = pDst.width;
  const uint32_t height     = pDst.height;
  const uint32_t log2W      = getLog2( width );
  const uint32_t log2H      = getLog2( height );
  const uint32_t finalShift = 1 + log2W + log2H;      // normalizes the width*height-weighted sum
  const uint32_t offset     = 1 << (log2W + log2H);   // rounding offset (0.5 before finalShift)
  const ptrdiff_t stride    = pDst.stride;
  Pel*            pred      = pDst.buf;

  const Pel *ptrSrc =pSrc.buf;

  int leftColumn,rightColumn;
  Pel tmp;
  // top-right reference sample; drives the horizontal interpolation slope
  int topRight = pSrc.at( width + 1, 0 );

  // bottom-left reference sample, broadcast to all eight 16-bit lanes
  tmp=pSrc.at( 0, height+1 );
  const __m128i bottomLeft16 = _mm_set_epi16(tmp,tmp,tmp,tmp,tmp,tmp,tmp,tmp);
  const __m128i zero = _mm_setzero_si128();
  const __m128i eight = _mm_set_epi16(8,8,8,8,8,8,8,8);   // advance of the per-lane (x+1) values per iteration
  const __m128i offset32 = _mm_set_epi32(offset,offset,offset,offset);
  const __m128i vLog2W = _mm_cvtsi32_si128(log2W);
  const __m128i vLog2H = _mm_cvtsi32_si128(log2H);
  const __m128i vFinalShift = _mm_cvtsi32_si128(finalShift);

  for( int y = 0; y < height; y++)
  {
    // per-row scalars: horPred starts from leftColumn<<log2W and grows by rightColumn per column
    leftColumn=pSrc.at(  0, y + 1 );
    rightColumn = topRight - leftColumn;
    leftColumn  = leftColumn << log2W;
    const __m128i leftColumn32 = _mm_set_epi32(leftColumn,leftColumn,leftColumn,leftColumn);
    const __m128i rightcolumn16 = _mm_set_epi16(rightColumn,rightColumn,rightColumn,rightColumn,rightColumn,rightColumn,rightColumn,rightColumn);
    const __m128i y16 = _mm_set_epi16(y+1,y+1,y+1,y+1,y+1,y+1,y+1,y+1);
          __m128i x16 = _mm_set_epi16(8,7,6,5,4,3,2,1);   // (x+1) for lanes 0..7

    for( int x = 0; x < width; x+=8 )
    {
      //topRow[x] = pSrc.at( x + 1, 0 );
      __m128i topRow16 = _mm_loadu_si128 ((__m128i const *) (ptrSrc+(x+1)));
      //bottomRow[x] = bottomLeft - topRow[x];
      __m128i bottomRow16L = _mm_sub_epi16(bottomLeft16,topRow16);
      // (y+1)*bottomRow[x]  -- widen the 16x16 product to 32 bit via mullo/mulhi + unpack
      __m128i  tmpH = _mm_mulhi_epi16(bottomRow16L,y16);
      __m128i tmpL = _mm_mullo_epi16(bottomRow16L,y16);
      bottomRow16L = _mm_unpacklo_epi16(tmpL,tmpH);
      __m128i bottomRow16H = _mm_unpackhi_epi16(tmpL,tmpH);

      // (topRow[x] << log2H), zero-extended to 32 bit first
      __m128i topRow32L = _mm_unpacklo_epi16(topRow16,zero);
      __m128i topRow32H = _mm_unpackhi_epi16(topRow16,zero);
      topRow32L = _mm_sll_epi32(topRow32L,vLog2H);
      topRow32H = _mm_sll_epi32(topRow32H,vLog2H);
      // vertPred    = (topRow[x] << log2H) + (y+1)*bottomRow[x];
      topRow32L = _mm_add_epi32(topRow32L,bottomRow16L);
      topRow32H = _mm_add_epi32(topRow32H,bottomRow16H);
      // horPred = leftColumn + (x+1)*rightColumn;
      tmpL = _mm_mullo_epi16(rightcolumn16,x16);
      tmpH = _mm_mulhi_epi16(rightcolumn16,x16);
      __m128i horpred32L = _mm_unpacklo_epi16(tmpL,tmpH);
      __m128i horpred32H = _mm_unpackhi_epi16(tmpL,tmpH);
      horpred32L = _mm_add_epi32(leftColumn32,horpred32L);
      horpred32H = _mm_add_epi32(leftColumn32,horpred32H);
      // pred[x]      = ( ( horPred << log2H ) + ( vertPred << log2W ) + offset ) >> finalShift;
      horpred32L = _mm_sll_epi32(horpred32L,vLog2H);
      horpred32H = _mm_sll_epi32(horpred32H,vLog2H);
      topRow32L = _mm_sll_epi32(topRow32L,vLog2W);
      topRow32H = _mm_sll_epi32(topRow32H,vLog2W);
      horpred32L = _mm_add_epi32(horpred32L,topRow32L);
      horpred32H = _mm_add_epi32(horpred32H,topRow32H);
      horpred32L = _mm_add_epi32(horpred32L,offset32);
      horpred32H = _mm_add_epi32(horpred32H,offset32);
      horpred32L = _mm_srl_epi32(horpred32L,vFinalShift);
      horpred32H = _mm_srl_epi32(horpred32H,vFinalShift);

      // narrow back to 16 bit and store 8/4/2/1 samples depending on block width
      tmpL = _mm_packs_epi32(horpred32L,horpred32H);
      if (width>=8)
        _mm_storeu_si128(( __m128i * )(pred+y*stride+x), (tmpL) );
      else if (width==4)
        _mm_storeu_si64(( __m128i * )(pred+y*stride+x), (tmpL) );
      else if (width==2)
        _mm_storeu_si32(( __m128i * )(pred+y*stride+x),(tmpL) );
      else
        pred[y*stride+x]=(Pel)_mm_extract_epi16 (tmpL,0);   // width==1: single sample

      x16 = _mm_add_epi16(x16,eight);
    }
  }
}
Unexecuted instantiation: void vvdec::xPredIntraPlanar_SIMD<(vvdec::x86_simd::X86_VEXT)1>(vvdec::AreaBuf<short const> const&, vvdec::AreaBuf<short>&, vvdec::SPS const&)
Unexecuted instantiation: void vvdec::xPredIntraPlanar_SIMD<(vvdec::x86_simd::X86_VEXT)4>(vvdec::AreaBuf<short const> const&, vvdec::AreaBuf<short>&, vvdec::SPS const&)
888
889
template< X86_VEXT vext>
890
void GetLumaRecPixel420SIMD (const int width,const int height, const Pel* pRecSrc0,const ptrdiff_t iRecStride,Pel* pDst0,const ptrdiff_t iDstStride)
891
0
{
892
#ifdef USE_AVX2
893
0
  if( ( width & 15 ) == 0 )    // width>=16
894
//  if( 0 )    // width>=16
895
0
  {
896
0
    __m256i vzero = _mm256_set1_epi8(0);
897
0
    __m256i vfour = _mm256_set1_epi32(4);
898
0
    for( int y = 0; y < height; y++ )
899
0
    {
900
0
      for( int x = 0; x < width; x += 16 )
901
0
      {
902
0
        int x2=x<<1;
903
0
        __m256i vsrc_l = _mm256_loadu_si256((__m256i*)&pRecSrc0[x2-1]);      // -1 0 1 2 3 4 5 6
904
0
        __m256i vsrc = _mm256_loadu_si256((__m256i*)&pRecSrc0[x2]);          // 0 1 2 3 4 5 6 7
905
906
        __m256i vsrc01 = _mm256_blend_epi16(vzero,vsrc_l,0x55);      // -1 1 3 5  32 Bit
907
        __m256i vsrc0 = _mm256_blend_epi16(vzero,vsrc,0x55);      // 0 2 4 6  32 Bit
908
        __m256i vsrc10 = _mm256_blend_epi16(vzero,vsrc,0xAA);      // 1 3 5 7 32 Bit
909
        vsrc10 = _mm256_srli_epi32(vsrc10,16);      // 1 3 5 7 32 Bit
910
        vsrc0 =  _mm256_slli_epi32 (vsrc0,1);      // 0  2 4 6 *2
911
912
        vsrc0 =  _mm256_add_epi32(vsrc0,vsrc10);
913
        __m256i vdst0 = _mm256_add_epi32(vsrc0,vsrc01);   // dst 0 1 2 3 32 Bit, untere Zeile fehlt noch
914
915
        vsrc_l = _mm256_loadu_si256((__m256i*)&pRecSrc0[x2 +15]);      // 7 8 9 10 11 12 13 14
916
        vsrc = _mm256_loadu_si256((__m256i*)&pRecSrc0[x2 + 16 ]);          // 8 9 10 11 12 13 14 15
917
918
        x2+= (int)iRecStride;
919
920
        vsrc01 = _mm256_blend_epi16(vzero,vsrc_l,0x55);
921
        vsrc0 = _mm256_blend_epi16(vzero,vsrc,0x55);
922
        vsrc10 = _mm256_blend_epi16(vzero,vsrc,0xAA);
923
        vsrc10 = _mm256_srli_epi32(vsrc10,16);      // 1 3 5 7 32 Bit
924
        vsrc0 =  _mm256_slli_epi32 (vsrc0,1);      // 0  2 4 6 *2
925
926
        vsrc0 =  _mm256_add_epi32(vsrc0,vsrc10);
927
        __m256i vdst1 = _mm256_add_epi32(vsrc0,vsrc01);   // dst 4 5 6 7 32 Bit, untere Zeile fehlt noch
928
929
        // jetzt die nächste Zeile dazu
930
        vsrc_l = _mm256_loadu_si256((__m256i*)&pRecSrc0[x2-1]);      // -1 0 1 2 3 4 5 6
931
        vsrc = _mm256_loadu_si256((__m256i*)&pRecSrc0[x2]);          // 0 1 2 3 4 5 6 7
932
933
        vsrc01 = _mm256_blend_epi16(vzero,vsrc_l,0x55);      // -1 1 3 5  32 Bit
934
        vsrc0 = _mm256_blend_epi16(vzero,vsrc,0x55);      // 0 2 4 6  32 Bit
935
        vsrc10 = _mm256_blend_epi16(vzero,vsrc,0xAA);      // 1 3 5 7 32 Bit
936
        vsrc10 = _mm256_srli_epi32(vsrc10,16);      // 1 3 5 7 32 Bit
937
        vsrc0 =  _mm256_slli_epi32 (vsrc0,1);      // 0  2 4 6 *2
938
939
        vsrc0 =  _mm256_add_epi32(vsrc0,vsrc10);
940
        __m256i vdst01 = _mm256_add_epi32(vsrc0,vsrc01);   // dst 0 1 2 3 32 Bit, untere Zeile
941
942
        vsrc_l = _mm256_loadu_si256((__m256i*)&pRecSrc0[x2 + 15]);      // 7 8 9 10 11 12 13 14
943
        vsrc = _mm256_loadu_si256((__m256i*)&pRecSrc0[x2 + 16 ]);          // 8 9 10 11 12 13 14 15
944
945
        vsrc01 = _mm256_blend_epi16(vzero,vsrc_l,0x55);
946
        vsrc0 = _mm256_blend_epi16(vzero,vsrc,0x55);
947
        vsrc10 = _mm256_blend_epi16(vzero,vsrc,0xAA);
948
        vsrc10 = _mm256_srli_epi32(vsrc10,16);      // 1 3 5 7 32 Bit
949
        vsrc0 =  _mm256_slli_epi32 (vsrc0,1);      // 0  2 4 6 *2
950
951
        vsrc0 =  _mm256_add_epi32(vsrc0,vsrc10);
952
        __m256i vdst11 = _mm256_add_epi32(vsrc0,vsrc01);   // dst 4 5 6 7 32 Bit, untere Zeile
953
954
        vdst0 = _mm256_add_epi32(vdst0,vdst01);
955
        vdst1 = _mm256_add_epi32(vdst1,vdst11);
956
        vdst0 =  _mm256_add_epi32(vdst0,vfour);
957
        vdst1 =  _mm256_add_epi32(vdst1,vfour);
958
        vdst0 = _mm256_srli_epi32(vdst0,3);
959
        vdst1 = _mm256_srli_epi32(vdst1,3);
960
        vdst0 = _mm256_packus_epi32 (vdst0,vdst1);   // 16 bit
961
        vdst0 = _mm256_permute4x64_epi64(vdst0,0xd8);
962
963
0
        _mm256_storeu_si256((__m256i*)&pDst0[x], vdst0);
964
        //        _mm_storeu_si128((__m128i*)&pDstTmp[x], vdst0);
965
0
      }
966
0
      pDst0 += iDstStride;
967
0
      pRecSrc0 += (iRecStride<<1);
968
0
    }
969
0
  }
970
0
  else
971
0
#endif
972
0
    if( ( width & 7 ) == 0 )    // width>=8
973
0
    {
974
0
      __m128i vzero = _mm_set1_epi8(0);
975
0
      __m128i vfour = _mm_set1_epi32(4);
976
977
978
0
      for( int y = 0; y < height; y++ )
979
0
      {
980
981
0
        for( int x = 0; x < width; x += 8 )
982
0
        {
983
0
          int x2=x<<1;
984
0
          __m128i vsrc_l = _mm_loadu_si128((__m128i*)&pRecSrc0[x2-1]);      // -1 0 1 2 3 4 5 6
985
0
          __m128i vsrc = _mm_loadu_si128((__m128i*)&pRecSrc0[x2]);          // 0 1 2 3 4 5 6 7
986
987
0
          __m128i vsrc01 = _mm_blend_epi16(vzero,vsrc_l,0x55);      // -1 1 3 5  32 Bit
988
0
          __m128i vsrc0 = _mm_blend_epi16(vzero,vsrc,0x55);      // 0 2 4 6  32 Bit
989
0
          __m128i vsrc10 = _mm_blend_epi16(vzero,vsrc,0xAA);      // 1 3 5 7 32 Bit
990
0
          vsrc10 = _mm_srli_epi32(vsrc10,16);      // 1 3 5 7 32 Bit
991
0
          vsrc0 =  _mm_slli_epi32 (vsrc0,1);      // 0  2 4 6 *2
992
993
0
          vsrc0 =  _mm_add_epi32(vsrc0,vsrc10);
994
0
          __m128i vdst0 = _mm_add_epi32(vsrc0,vsrc01);   // dst 0 1 2 3 32 Bit, untere Zeile fehlt noch
995
996
0
          vsrc_l = _mm_loadu_si128((__m128i*)&pRecSrc0[x2 +7]);      // 7 8 9 10 11 12 13 14
997
0
          vsrc = _mm_loadu_si128((__m128i*)&pRecSrc0[x2 + 8 ]);          // 8 9 10 11 12 13 14 15
998
999
0
          x2+=(int)iRecStride;
1000
1001
0
          vsrc01 = _mm_blend_epi16(vzero,vsrc_l,0x55);
1002
0
          vsrc0 = _mm_blend_epi16(vzero,vsrc,0x55);
1003
0
          vsrc10 = _mm_blend_epi16(vzero,vsrc,0xAA);
1004
0
          vsrc10 = _mm_srli_epi32(vsrc10,16);      // 1 3 5 7 32 Bit
1005
0
          vsrc0 =  _mm_slli_epi32 (vsrc0,1);      // 0  2 4 6 *2
1006
1007
0
          vsrc0 =  _mm_add_epi32(vsrc0,vsrc10);
1008
0
          __m128i vdst1 = _mm_add_epi32(vsrc0,vsrc01);   // dst 4 5 6 7 32 Bit, untere Zeile fehlt noch
1009
1010
          // jetzt die nächste Zeile dazu
1011
0
          vsrc_l = _mm_loadu_si128((__m128i*)&pRecSrc0[x2-1]);      // -1 0 1 2 3 4 5 6
1012
0
          vsrc = _mm_loadu_si128((__m128i*)&pRecSrc0[x2]);          // 0 1 2 3 4 5 6 7
1013
1014
0
          vsrc01 = _mm_blend_epi16(vzero,vsrc_l,0x55);      // -1 1 3 5  32 Bit
1015
0
          vsrc0 = _mm_blend_epi16(vzero,vsrc,0x55);      // 0 2 4 6  32 Bit
1016
0
          vsrc10 = _mm_blend_epi16(vzero,vsrc,0xAA);      // 1 3 5 7 32 Bit
1017
0
          vsrc10 = _mm_srli_epi32(vsrc10,16);      // 1 3 5 7 32 Bit
1018
0
          vsrc0 =  _mm_slli_epi32 (vsrc0,1);      // 0  2 4 6 *2
1019
1020
0
          vsrc0 =  _mm_add_epi32(vsrc0,vsrc10);
1021
0
          __m128i vdst01 = _mm_add_epi32(vsrc0,vsrc01);   // dst 0 1 2 3 32 Bit, untere Zeile
1022
1023
0
          vsrc_l = _mm_loadu_si128((__m128i*)&pRecSrc0[x2 + 7]);      // 7 8 9 10 11 12 13 14
1024
0
          vsrc = _mm_loadu_si128((__m128i*)&pRecSrc0[x2 + 8 ]);          // 8 9 10 11 12 13 14 15
1025
1026
0
          vsrc01 = _mm_blend_epi16(vzero,vsrc_l,0x55);
1027
0
          vsrc0 = _mm_blend_epi16(vzero,vsrc,0x55);
1028
0
          vsrc10 = _mm_blend_epi16(vzero,vsrc,0xAA);
1029
0
          vsrc10 = _mm_srli_epi32(vsrc10,16);      // 1 3 5 7 32 Bit
1030
0
          vsrc0 =  _mm_slli_epi32 (vsrc0,1);      // 0  2 4 6 *2
1031
1032
0
          vsrc0 =  _mm_add_epi32(vsrc0,vsrc10);
1033
0
          __m128i vdst11 = _mm_add_epi32(vsrc0,vsrc01);   // dst 4 5 6 7 32 Bit, untere Zeile
1034
1035
0
          vdst0 = _mm_add_epi32(vdst0,vdst01);
1036
0
          vdst1 = _mm_add_epi32(vdst1,vdst11);
1037
0
          vdst0 =  _mm_add_epi32(vdst0,vfour);
1038
0
          vdst1 =  _mm_add_epi32(vdst1,vfour);
1039
0
          vdst0 = _mm_srli_epi32(vdst0,3);
1040
0
          vdst1 = _mm_srli_epi32(vdst1,3);
1041
0
          vdst0 = _mm_packus_epi32 (vdst0,vdst1);   // 16 bit
1042
1043
0
          _mm_storeu_si128((__m128i*)&pDst0[x], vdst0);
1044
          //        _mm_storeu_si128((__m128i*)&pDstTmp[x], vdst0);
1045
0
        }
1046
0
        pDst0 += iDstStride;
1047
0
        pRecSrc0 += (iRecStride<<1);
1048
0
      }
1049
0
    }
1050
0
    else     // width<=4
1051
0
    {
1052
0
      __m128i vzero = _mm_set1_epi8(0);
1053
0
      __m128i vfour = _mm_set1_epi32(4);
1054
1055
0
      for( int y = 0; y < height; y++ )
1056
0
      {
1057
0
        __m128i vsrc_l = _mm_loadu_si128((__m128i*)&pRecSrc0[-1]);      // -1 0 1 2 3 4 5 6
1058
0
        __m128i vsrc = _mm_loadu_si128((__m128i*)&pRecSrc0[0]);          // 0 1 2 3 4 5 6 7
1059
1060
0
        __m128i vsrc01 = _mm_blend_epi16(vzero,vsrc_l,0x55);      // -1 1 3 5  32 Bit
1061
0
        __m128i vsrc0 = _mm_blend_epi16(vzero,vsrc,0x55);      // 0 2 4 6  32 Bit
1062
0
        __m128i vsrc10 = _mm_blend_epi16(vzero,vsrc,0xAA);      // 1 3 5 7 32 Bit
1063
0
        vsrc10 = _mm_srli_epi32(vsrc10,16);      // 1 3 5 7 32 Bit
1064
0
        vsrc0 =  _mm_slli_epi32 (vsrc0,1);      // 0  2 4 6 *2
1065
1066
0
        vsrc0 =  _mm_add_epi32(vsrc0,vsrc10);
1067
0
        __m128i vdst0 = _mm_add_epi32(vsrc0,vsrc01);   // dst 0 1 2 3 32 Bit, untere Zeile fehlt noch
1068
1069
        // jetzt die nächste Zeile dazu
1070
0
        vsrc_l = _mm_loadu_si128((__m128i*)&pRecSrc0[iRecStride-1]);      // -1 0 1 2 3 4 5 6
1071
0
        vsrc = _mm_loadu_si128((__m128i*)&pRecSrc0[iRecStride]);          // 0 1 2 3 4 5 6_mm_storeu_si32 7
1072
1073
0
        vsrc01 = _mm_blend_epi16(vzero,vsrc_l,0x55);      // -1 1 3 5  32 Bit
1074
0
        vsrc0 = _mm_blend_epi16(vzero,vsrc,0x55);      // 0 2 4 6  32 Bit
1075
0
        vsrc10 = _mm_blend_epi16(vzero,vsrc,0xAA);      // 1 3 5 7 32 Bit
1076
0
        vsrc10 = _mm_srli_epi32(vsrc10,16);      // 1 3 5 7 32 Bit
1077
0
        vsrc0 =  _mm_slli_epi32 (vsrc0,1);      // 0  2 4 6 *2
1078
1079
0
        vsrc0 =  _mm_add_epi32(vsrc0,vsrc10);
1080
0
        __m128i vdst01 = _mm_add_epi32(vsrc0,vsrc01);   // dst 0 1 2 3 32 Bit, untere Zeile
1081
1082
1083
0
        vdst0 = _mm_add_epi32(vdst0,vdst01);
1084
0
        vdst0 =  _mm_add_epi32(vdst0,vfour);
1085
0
        vdst0 = _mm_srli_epi32(vdst0,3);
1086
0
        vdst0 = _mm_packus_epi32 (vdst0,vdst0);   // 16 bit
1087
1088
0
        if (width==4)
1089
0
          _mm_storeu_si64(( __m128i * )&pDst0[0], (vdst0) );
1090
0
        else if (width==2)
1091
0
          _mm_storeu_si32(( __m128i * )&pDst0[0], (vdst0) );
1092
0
        else
1093
0
        {
1094
0
          int tmp = _mm_cvtsi128_si32(vdst0);
1095
0
          pDst0[0] = (Pel) tmp;
1096
0
        }
1097
1098
0
        pDst0 += iDstStride;
1099
0
        pRecSrc0 += (iRecStride<<1);
1100
0
      }
1101
0
    }
1102
0
}
Unexecuted instantiation: void vvdec::GetLumaRecPixel420SIMD<(vvdec::x86_simd::X86_VEXT)1>(int, int, short const*, long, short*, long)
Unexecuted instantiation: void vvdec::GetLumaRecPixel420SIMD<(vvdec::x86_simd::X86_VEXT)4>(int, int, short const*, long, short*, long)
1103
1104
1105
1106
template<X86_VEXT vext>
1107
void IntraPrediction::_initIntraPredictionX86()
1108
0
{
1109
0
  IntraPredAngleCore4 = IntraPredAngleCore_SIMD<vext, 4>;
1110
0
  IntraPredAngleCore8 = IntraPredAngleCore_SIMD<vext, 8>;
1111
0
  IntraPredAngleChroma4 = IntraPredAngleChroma_SIMD<vext, 4>;
1112
0
  IntraPredAngleChroma8 = IntraPredAngleChroma_SIMD<vext, 8>;
1113
1114
0
  IntraPredSampleFilter8 = IntraPredSampleFilter_SIMD<vext, 8>;
1115
0
  IntraPredSampleFilter16 = IntraPredSampleFilter_SIMD<vext, 16>;
1116
1117
0
  xPredIntraPlanar = xPredIntraPlanar_SIMD<vext>;
1118
1119
0
  GetLumaRecPixel420 = GetLumaRecPixel420SIMD<vext>;
1120
1121
0
}
Unexecuted instantiation: void vvdec::IntraPrediction::_initIntraPredictionX86<(vvdec::x86_simd::X86_VEXT)1>()
Unexecuted instantiation: void vvdec::IntraPrediction::_initIntraPredictionX86<(vvdec::x86_simd::X86_VEXT)4>()
1122
template void IntraPrediction::_initIntraPredictionX86<SIMDX86>();
1123
1124
#endif // TARGET_SIMD_X86
1125
#endif
1126
1127
}