Coverage Report

Created: 2026-05-24 07:45

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/vvdec/source/Lib/CommonLib/x86/IntraPredX86.h
Line
Count
Source
1
/* -----------------------------------------------------------------------------
2
The copyright in this software is being made available under the Clear BSD
3
License, included below. No patent rights, trademark rights and/or 
4
other Intellectual Property Rights other than the copyrights concerning 
5
the Software are granted under this license.
6
7
The Clear BSD License
8
9
Copyright (c) 2018-2026, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVdeC Authors.
10
All rights reserved.
11
12
Redistribution and use in source and binary forms, with or without modification,
13
are permitted (subject to the limitations in the disclaimer below) provided that
14
the following conditions are met:
15
16
     * Redistributions of source code must retain the above copyright notice,
17
     this list of conditions and the following disclaimer.
18
19
     * Redistributions in binary form must reproduce the above copyright
20
     notice, this list of conditions and the following disclaimer in the
21
     documentation and/or other materials provided with the distribution.
22
23
     * Neither the name of the copyright holder nor the names of its
24
     contributors may be used to endorse or promote products derived from this
25
     software without specific prior written permission.
26
27
NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
28
THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
29
CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
30
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
31
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
32
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
33
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
34
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
35
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
36
IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38
POSSIBILITY OF SUCH DAMAGE.
39
40
41
------------------------------------------------------------------------------------------- */
42
43
/** \file     IntraPredX86.h
44
    \brief    SIMD for IntraPrediction
45
*/
46
47
#include "CommonLib/CommonDef.h"
48
#include "CommonDefX86.h"
49
#include "CommonLib/IntraPrediction.h"
50
51
namespace vvdec
52
{
53
54
#if ENABLE_SIMD_OPT_INTRAPRED
55
#ifdef TARGET_SIMD_X86
56
57
//#define USE_AVX2
58
template< X86_VEXT vext, int W >
59
void IntraPredAngleChroma_SIMD(int16_t* pDst,const ptrdiff_t dstStride,int16_t* pBorder,int width,int height,int deltaPos,int intraPredAngle)
60
798
{
61
798
  int deltaInt;
62
798
  int deltaFract;
63
798
  int refMainIndex;
64
65
798
  __m128i voffset = _mm_set1_epi16(16);
66
798
  if( W == 8 )
67
574
  {
68
574
    if( vext >= AVX2 )
69
574
    {
70
#ifdef USE_AVX2
71
574
      if (( width & 15 ) == 0 )
72
464
      {
73
464
       int deltaInt;
74
464
        int deltaFract;
75
464
        int refMainIndex;
76
77
        __m256i voffset = _mm256_set1_epi16(16);
78
7.13k
        for (int k=0; k<height; k++) {
79
80
6.66k
          deltaInt   = deltaPos >> 5;
81
6.66k
          deltaFract = deltaPos & (32 - 1);
82
83
6.66k
          __m256i vfract = _mm256_set1_epi16(deltaFract);
84
6.66k
          __m256i v32minfract = _mm256_set1_epi16(32-deltaFract);
85
          // Do linear filtering
86
16.6k
          for (int l=0; l<width; l+=16) {
87
9.93k
            refMainIndex   = l+ deltaInt+1;
88
9.93k
            __m256i vpred0 = _mm256_lddqu_si256((__m256i*)&pBorder[refMainIndex]);
89
9.93k
            __m256i vpred1 = _mm256_lddqu_si256((__m256i*)&pBorder[refMainIndex+1]);
90
9.93k
            vpred0 = _mm256_mullo_epi16(v32minfract, vpred0);
91
9.93k
            vpred1 = _mm256_mullo_epi16(vfract, vpred1);
92
9.93k
            __m256i vpred = _mm256_srli_epi16(_mm256_add_epi16(_mm256_add_epi16(vpred0, vpred1), voffset), 5);
93
9.93k
            _mm256_storeu_si256((__m256i*)&pDst[l], vpred);
94
9.93k
          }
95
6.66k
          pDst+=dstStride;
96
6.66k
          deltaPos += intraPredAngle;
97
6.66k
        }
98
464
      }
99
110
      else // width==8
100
110
      {
101
1.28k
        for (int k=0; k<height; k++)
102
1.17k
        {
103
1.17k
          deltaInt   = deltaPos >> 5;
104
1.17k
          deltaFract = deltaPos & (32 - 1);
105
106
1.17k
          __m128i vfract = _mm_set1_epi16(deltaFract);
107
1.17k
          __m128i v32minfract = _mm_set1_epi16(32-deltaFract);
108
          // Do linear filtering
109
2.35k
          for (int l=0; l<width; l+=8) {
110
1.17k
            refMainIndex        = l+ deltaInt+1;
111
1.17k
            __m128i vpred0 = _mm_lddqu_si128((__m128i*)&pBorder[refMainIndex]);
112
1.17k
            __m128i vpred1 = _mm_lddqu_si128((__m128i*)&pBorder[refMainIndex+1]);
113
1.17k
            vpred0 = _mm_mullo_epi16(v32minfract, vpred0);
114
1.17k
            vpred1 = _mm_mullo_epi16(vfract, vpred1);
115
1.17k
            __m128i vpred = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(vpred0, vpred1), voffset), 5);
116
1.17k
            _mm_storeu_si128((__m128i*)&pDst[l], vpred);
117
1.17k
          }
118
1.17k
          deltaPos += intraPredAngle;
119
120
1.17k
          pDst+=dstStride;
121
1.17k
        }
122
123
110
      }
124
#endif //AVX2
125
574
    }  
126
0
    else
127
0
    {
128
0
      for (int k=0; k<height; k++) {
129
0
        deltaInt   = deltaPos >> 5;
130
0
        deltaFract = deltaPos & (32 - 1);
131
132
0
        __m128i vfract = _mm_set1_epi16(deltaFract);
133
0
        __m128i v32minfract = _mm_set1_epi16(32-deltaFract);
134
        // Do linear filtering
135
0
        for (int l=0; l<width; l+=8) {
136
0
          refMainIndex        = l+ deltaInt+1;
137
0
          __m128i vpred0 = _mm_loadu_si128((__m128i*)&pBorder[refMainIndex]);
138
0
          __m128i vpred1 = _mm_loadu_si128((__m128i*)&pBorder[refMainIndex+1]);
139
0
          vpred0 = _mm_mullo_epi16(v32minfract, vpred0);
140
0
          vpred1 = _mm_mullo_epi16(vfract, vpred1);
141
0
          __m128i vpred = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(vpred0, vpred1), voffset), 5);
142
0
          _mm_storeu_si128((__m128i*)&pDst[l], vpred);
143
0
        }
144
0
        deltaPos += intraPredAngle;
145
146
0
        pDst+=dstStride;
147
0
      }
148
0
    }
149
150
574
  }
151
224
  else if( W == 4 )
152
224
  {
153
1.64k
    for (int k=0; k<height; k++) {
154
1.42k
      deltaInt   = deltaPos >> 5;
155
1.42k
      deltaFract = deltaPos & (32 - 1);
156
157
1.42k
      __m128i vfract = _mm_set1_epi16(deltaFract);
158
1.42k
      __m128i v32minfract = _mm_set1_epi16(32-deltaFract);
159
      // Do linear filtering
160
1.42k
      refMainIndex        = deltaInt+1;
161
1.42k
      __m128i vpred0 = _mm_loadu_si128((__m128i*)&pBorder[refMainIndex]);
162
1.42k
      __m128i vpred1 = _mm_loadu_si128((__m128i*)&pBorder[refMainIndex+1]);
163
1.42k
      vpred0 = _mm_mullo_epi16(v32minfract, vpred0);
164
1.42k
      vpred1 = _mm_mullo_epi16(vfract, vpred1);
165
1.42k
      __m128i vpred = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(vpred0, vpred1), voffset), 5);
166
1.42k
      _mm_storeu_si64( ( __m128i * )(pDst ), vpred);
167
1.42k
      deltaPos += intraPredAngle;
168
1.42k
      pDst+=dstStride;
169
1.42k
    }
170
224
  }
171
0
  else
172
0
  {
173
0
    THROW_FATAL( "Unsupported size in IntraPredAngleCore_SIMD" );
174
0
  }
175
#if USE_AVX2
176
177
798
  _mm256_zeroupper();
178
798
#endif
179
798
}
Unexecuted instantiation: void vvdec::IntraPredAngleChroma_SIMD<(vvdec::x86_simd::X86_VEXT)1, 4>(short*, long, short*, int, int, int, int)
Unexecuted instantiation: void vvdec::IntraPredAngleChroma_SIMD<(vvdec::x86_simd::X86_VEXT)1, 8>(short*, long, short*, int, int, int, int)
void vvdec::IntraPredAngleChroma_SIMD<(vvdec::x86_simd::X86_VEXT)4, 4>(short*, long, short*, int, int, int, int)
Line
Count
Source
60
224
{
61
224
  int deltaInt;
62
224
  int deltaFract;
63
224
  int refMainIndex;
64
65
224
  __m128i voffset = _mm_set1_epi16(16);
66
224
  if( W == 8 )
67
0
  {
68
0
    if( vext >= AVX2 )
69
0
    {
70
0
#ifdef USE_AVX2
71
0
      if (( width & 15 ) == 0 )
72
0
      {
73
0
       int deltaInt;
74
0
        int deltaFract;
75
0
        int refMainIndex;
76
77
0
        __m256i voffset = _mm256_set1_epi16(16);
78
0
        for (int k=0; k<height; k++) {
79
80
0
          deltaInt   = deltaPos >> 5;
81
0
          deltaFract = deltaPos & (32 - 1);
82
83
0
          __m256i vfract = _mm256_set1_epi16(deltaFract);
84
0
          __m256i v32minfract = _mm256_set1_epi16(32-deltaFract);
85
          // Do linear filtering
86
0
          for (int l=0; l<width; l+=16) {
87
0
            refMainIndex   = l+ deltaInt+1;
88
0
            __m256i vpred0 = _mm256_lddqu_si256((__m256i*)&pBorder[refMainIndex]);
89
0
            __m256i vpred1 = _mm256_lddqu_si256((__m256i*)&pBorder[refMainIndex+1]);
90
0
            vpred0 = _mm256_mullo_epi16(v32minfract, vpred0);
91
0
            vpred1 = _mm256_mullo_epi16(vfract, vpred1);
92
0
            __m256i vpred = _mm256_srli_epi16(_mm256_add_epi16(_mm256_add_epi16(vpred0, vpred1), voffset), 5);
93
0
            _mm256_storeu_si256((__m256i*)&pDst[l], vpred);
94
0
          }
95
0
          pDst+=dstStride;
96
0
          deltaPos += intraPredAngle;
97
0
        }
98
0
      }
99
0
      else // width==8
100
0
      {
101
0
        for (int k=0; k<height; k++)
102
0
        {
103
0
          deltaInt   = deltaPos >> 5;
104
0
          deltaFract = deltaPos & (32 - 1);
105
106
0
          __m128i vfract = _mm_set1_epi16(deltaFract);
107
0
          __m128i v32minfract = _mm_set1_epi16(32-deltaFract);
108
          // Do linear filtering
109
0
          for (int l=0; l<width; l+=8) {
110
0
            refMainIndex        = l+ deltaInt+1;
111
0
            __m128i vpred0 = _mm_lddqu_si128((__m128i*)&pBorder[refMainIndex]);
112
0
            __m128i vpred1 = _mm_lddqu_si128((__m128i*)&pBorder[refMainIndex+1]);
113
0
            vpred0 = _mm_mullo_epi16(v32minfract, vpred0);
114
0
            vpred1 = _mm_mullo_epi16(vfract, vpred1);
115
0
            __m128i vpred = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(vpred0, vpred1), voffset), 5);
116
0
            _mm_storeu_si128((__m128i*)&pDst[l], vpred);
117
0
          }
118
0
          deltaPos += intraPredAngle;
119
120
0
          pDst+=dstStride;
121
0
        }
122
123
0
      }
124
0
#endif //AVX2
125
0
    }  
126
0
    else
127
0
    {
128
0
      for (int k=0; k<height; k++) {
129
0
        deltaInt   = deltaPos >> 5;
130
0
        deltaFract = deltaPos & (32 - 1);
131
132
0
        __m128i vfract = _mm_set1_epi16(deltaFract);
133
0
        __m128i v32minfract = _mm_set1_epi16(32-deltaFract);
134
        // Do linear filtering
135
0
        for (int l=0; l<width; l+=8) {
136
0
          refMainIndex        = l+ deltaInt+1;
137
0
          __m128i vpred0 = _mm_loadu_si128((__m128i*)&pBorder[refMainIndex]);
138
0
          __m128i vpred1 = _mm_loadu_si128((__m128i*)&pBorder[refMainIndex+1]);
139
0
          vpred0 = _mm_mullo_epi16(v32minfract, vpred0);
140
0
          vpred1 = _mm_mullo_epi16(vfract, vpred1);
141
0
          __m128i vpred = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(vpred0, vpred1), voffset), 5);
142
0
          _mm_storeu_si128((__m128i*)&pDst[l], vpred);
143
0
        }
144
0
        deltaPos += intraPredAngle;
145
146
0
        pDst+=dstStride;
147
0
      }
148
0
    }
149
150
0
  }
151
224
  else if( W == 4 )
152
224
  {
153
1.64k
    for (int k=0; k<height; k++) {
154
1.42k
      deltaInt   = deltaPos >> 5;
155
1.42k
      deltaFract = deltaPos & (32 - 1);
156
157
1.42k
      __m128i vfract = _mm_set1_epi16(deltaFract);
158
1.42k
      __m128i v32minfract = _mm_set1_epi16(32-deltaFract);
159
      // Do linear filtering
160
1.42k
      refMainIndex        = deltaInt+1;
161
1.42k
      __m128i vpred0 = _mm_loadu_si128((__m128i*)&pBorder[refMainIndex]);
162
1.42k
      __m128i vpred1 = _mm_loadu_si128((__m128i*)&pBorder[refMainIndex+1]);
163
1.42k
      vpred0 = _mm_mullo_epi16(v32minfract, vpred0);
164
1.42k
      vpred1 = _mm_mullo_epi16(vfract, vpred1);
165
1.42k
      __m128i vpred = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(vpred0, vpred1), voffset), 5);
166
1.42k
      _mm_storeu_si64( ( __m128i * )(pDst ), vpred);
167
1.42k
      deltaPos += intraPredAngle;
168
1.42k
      pDst+=dstStride;
169
1.42k
    }
170
224
  }
171
0
  else
172
0
  {
173
0
    THROW_FATAL( "Unsupported size in IntraPredAngleCore_SIMD" );
174
0
  }
175
224
#if USE_AVX2
176
177
224
  _mm256_zeroupper();
178
224
#endif
179
224
}
void vvdec::IntraPredAngleChroma_SIMD<(vvdec::x86_simd::X86_VEXT)4, 8>(short*, long, short*, int, int, int, int)
Line
Count
Source
60
574
{
61
574
  int deltaInt;
62
574
  int deltaFract;
63
574
  int refMainIndex;
64
65
574
  __m128i voffset = _mm_set1_epi16(16);
66
574
  if( W == 8 )
67
574
  {
68
574
    if( vext >= AVX2 )
69
574
    {
70
574
#ifdef USE_AVX2
71
574
      if (( width & 15 ) == 0 )
72
464
      {
73
464
       int deltaInt;
74
464
        int deltaFract;
75
464
        int refMainIndex;
76
77
464
        __m256i voffset = _mm256_set1_epi16(16);
78
7.13k
        for (int k=0; k<height; k++) {
79
80
6.66k
          deltaInt   = deltaPos >> 5;
81
6.66k
          deltaFract = deltaPos & (32 - 1);
82
83
6.66k
          __m256i vfract = _mm256_set1_epi16(deltaFract);
84
6.66k
          __m256i v32minfract = _mm256_set1_epi16(32-deltaFract);
85
          // Do linear filtering
86
16.6k
          for (int l=0; l<width; l+=16) {
87
9.93k
            refMainIndex   = l+ deltaInt+1;
88
9.93k
            __m256i vpred0 = _mm256_lddqu_si256((__m256i*)&pBorder[refMainIndex]);
89
9.93k
            __m256i vpred1 = _mm256_lddqu_si256((__m256i*)&pBorder[refMainIndex+1]);
90
9.93k
            vpred0 = _mm256_mullo_epi16(v32minfract, vpred0);
91
9.93k
            vpred1 = _mm256_mullo_epi16(vfract, vpred1);
92
9.93k
            __m256i vpred = _mm256_srli_epi16(_mm256_add_epi16(_mm256_add_epi16(vpred0, vpred1), voffset), 5);
93
9.93k
            _mm256_storeu_si256((__m256i*)&pDst[l], vpred);
94
9.93k
          }
95
6.66k
          pDst+=dstStride;
96
6.66k
          deltaPos += intraPredAngle;
97
6.66k
        }
98
464
      }
99
110
      else // width==8
100
110
      {
101
1.28k
        for (int k=0; k<height; k++)
102
1.17k
        {
103
1.17k
          deltaInt   = deltaPos >> 5;
104
1.17k
          deltaFract = deltaPos & (32 - 1);
105
106
1.17k
          __m128i vfract = _mm_set1_epi16(deltaFract);
107
1.17k
          __m128i v32minfract = _mm_set1_epi16(32-deltaFract);
108
          // Do linear filtering
109
2.35k
          for (int l=0; l<width; l+=8) {
110
1.17k
            refMainIndex        = l+ deltaInt+1;
111
1.17k
            __m128i vpred0 = _mm_lddqu_si128((__m128i*)&pBorder[refMainIndex]);
112
1.17k
            __m128i vpred1 = _mm_lddqu_si128((__m128i*)&pBorder[refMainIndex+1]);
113
1.17k
            vpred0 = _mm_mullo_epi16(v32minfract, vpred0);
114
1.17k
            vpred1 = _mm_mullo_epi16(vfract, vpred1);
115
1.17k
            __m128i vpred = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(vpred0, vpred1), voffset), 5);
116
1.17k
            _mm_storeu_si128((__m128i*)&pDst[l], vpred);
117
1.17k
          }
118
1.17k
          deltaPos += intraPredAngle;
119
120
1.17k
          pDst+=dstStride;
121
1.17k
        }
122
123
110
      }
124
574
#endif //AVX2
125
574
    }  
126
0
    else
127
0
    {
128
0
      for (int k=0; k<height; k++) {
129
0
        deltaInt   = deltaPos >> 5;
130
0
        deltaFract = deltaPos & (32 - 1);
131
132
0
        __m128i vfract = _mm_set1_epi16(deltaFract);
133
0
        __m128i v32minfract = _mm_set1_epi16(32-deltaFract);
134
        // Do linear filtering
135
0
        for (int l=0; l<width; l+=8) {
136
0
          refMainIndex        = l+ deltaInt+1;
137
0
          __m128i vpred0 = _mm_loadu_si128((__m128i*)&pBorder[refMainIndex]);
138
0
          __m128i vpred1 = _mm_loadu_si128((__m128i*)&pBorder[refMainIndex+1]);
139
0
          vpred0 = _mm_mullo_epi16(v32minfract, vpred0);
140
0
          vpred1 = _mm_mullo_epi16(vfract, vpred1);
141
0
          __m128i vpred = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(vpred0, vpred1), voffset), 5);
142
0
          _mm_storeu_si128((__m128i*)&pDst[l], vpred);
143
0
        }
144
0
        deltaPos += intraPredAngle;
145
146
0
        pDst+=dstStride;
147
0
      }
148
0
    }
149
150
574
  }
151
0
  else if( W == 4 )
152
0
  {
153
0
    for (int k=0; k<height; k++) {
154
0
      deltaInt   = deltaPos >> 5;
155
0
      deltaFract = deltaPos & (32 - 1);
156
157
0
      __m128i vfract = _mm_set1_epi16(deltaFract);
158
0
      __m128i v32minfract = _mm_set1_epi16(32-deltaFract);
159
      // Do linear filtering
160
0
      refMainIndex        = deltaInt+1;
161
0
      __m128i vpred0 = _mm_loadu_si128((__m128i*)&pBorder[refMainIndex]);
162
0
      __m128i vpred1 = _mm_loadu_si128((__m128i*)&pBorder[refMainIndex+1]);
163
0
      vpred0 = _mm_mullo_epi16(v32minfract, vpred0);
164
0
      vpred1 = _mm_mullo_epi16(vfract, vpred1);
165
0
      __m128i vpred = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(vpred0, vpred1), voffset), 5);
166
0
      _mm_storeu_si64( ( __m128i * )(pDst ), vpred);
167
0
      deltaPos += intraPredAngle;
168
0
      pDst+=dstStride;
169
0
    }
170
0
  }
171
0
  else
172
0
  {
173
0
    THROW_FATAL( "Unsupported size in IntraPredAngleCore_SIMD" );
174
0
  }
175
574
#if USE_AVX2
176
177
574
  _mm256_zeroupper();
178
574
#endif
179
574
}
180
181
182
template< X86_VEXT vext, int W >
183
void IntraPredAngleCore_SIMD(int16_t* pDstBuf,const ptrdiff_t dstStride,int16_t* refMain,int width,int height,int deltaPos,int intraPredAngle,const TFilterCoeff *ff,const bool useCubicFilter,const ClpRng& clpRng)
184
910
{
185
910
  int16_t* pDst;
186
187
910
  if( W == 8 )
188
791
  {
189
791
    if( vext >= AVX2 )
190
791
    {
191
#ifdef USE_AVX2
192
      __m256i shflmask1= _mm256_set_epi8(0xd, 0xc, 0xb, 0xa, 0x9, 0x8, 0x7, 0x6,   0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4,
193
          0x9, 0x8, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2,   0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0 );
194
      __m256i offset = _mm256_set1_epi32( 32 );
195
196
791
      if (( width & 15 ) == 0 )
197
656
      {
198
656
        __m256i vbdmin,vbdmax;
199
200
656
        if (useCubicFilter)
201
268
        {
202
268
          vbdmin = _mm256_set1_epi16( clpRng.min() );
203
268
          vbdmax = _mm256_set1_epi16( clpRng.max() );
204
268
        }
205
206
18.7k
        for (int y = 0; y<height; y++ )
207
18.0k
        {
208
18.0k
          int deltaInt   = deltaPos >> 5;
209
18.0k
          int deltaFract = deltaPos & (32 - 1);
210
18.0k
          int refMainIndex   = deltaInt + 1;
211
18.0k
          pDst=&pDstBuf[y*dstStride];
212
18.0k
          __m128i tmp = _mm_loadu_si64( ( __m128i const * )&ff[deltaFract<<2] );   //load 4 16 bit filter coeffs
213
18.0k
          tmp = _mm_shuffle_epi32(tmp,0x44);
214
18.0k
          __m256i coeff = _mm256_broadcastsi128_si256(tmp);
215
72.2k
          for( int x = 0; x < width; x+=16)
216
54.2k
          {
217
54.2k
            __m256i src1 = _mm256_broadcastsi128_si256( _mm_loadu_si128( ( const __m128i* ) &refMain[refMainIndex     - 1] ) );
218
54.2k
            __m256i src2 = _mm256_broadcastsi128_si256( _mm_loadu_si128( ( const __m128i* ) &refMain[refMainIndex + 4 - 1] ) );
219
54.2k
            src1 = _mm256_shuffle_epi8(src1,shflmask1);                 // -1 0 1 2  0 1 2 3 1 2 3 4  2 3 4 5
220
54.2k
            src2 = _mm256_shuffle_epi8(src2,shflmask1);                 // 3 4 5 6  4 5 6 7  5 6 7 8 6 7 8 9
221
222
54.2k
            src1 = _mm256_madd_epi16 (src1, coeff);
223
54.2k
            src2 = _mm256_madd_epi16 (src2, coeff);
224
225
54.2k
            __m256i  sum  = _mm256_hadd_epi32( src1, src2 );
226
54.2k
            sum = _mm256_permute4x64_epi64(sum,0xD8);
227
228
54.2k
            sum = _mm256_add_epi32( sum, offset );
229
54.2k
            sum = _mm256_srai_epi32( sum, 6 );
230
231
54.2k
            refMainIndex+=8;
232
            
233
54.2k
            src1 = _mm256_broadcastsi128_si256( _mm_loadu_si128( ( __m128i const* ) &refMain[refMainIndex     - 1] ) );
234
54.2k
            src2 = _mm256_broadcastsi128_si256( _mm_loadu_si128( ( __m128i const* ) &refMain[refMainIndex + 4 - 1] ) );
235
236
54.2k
            src1 = _mm256_shuffle_epi8(src1,shflmask1);                 // -1 0 1 2  0 1 2 3 1 2 3 4  2 3 4 5
237
54.2k
            src2 = _mm256_shuffle_epi8(src2,shflmask1);                 // 3 4 5 6  4 5 6 7  5 6 7 8 6 7 8 9
238
54.2k
            src1 = _mm256_madd_epi16 (src1, coeff);
239
54.2k
            src2 = _mm256_madd_epi16 (src2, coeff);
240
241
54.2k
            __m256i  sum1  = _mm256_hadd_epi32( src1, src2 );
242
54.2k
            sum1 = _mm256_permute4x64_epi64(sum1,0xD8);
243
244
54.2k
            sum1 = _mm256_add_epi32( sum1, offset );
245
54.2k
            sum1 = _mm256_srai_epi32( sum1, 6 );
246
54.2k
            __m256i
247
54.2k
            src0 = _mm256_packs_epi32( sum, sum1 );
248
249
54.2k
            src0 = _mm256_permute4x64_epi64(src0,0xD8);
250
251
54.2k
            refMainIndex+=8;
252
253
54.2k
            if (useCubicFilter)
254
13.3k
              src0 = _mm256_min_epi16( vbdmax, _mm256_max_epi16( vbdmin, src0 ) );
255
256
54.2k
            _mm256_storeu_si256( ( __m256i * )(pDst + x), src0);
257
54.2k
          }
258
18.0k
          deltaPos += intraPredAngle;
259
18.0k
        }
260
656
      }
261
135
      else // width =8
262
135
      {
263
        //        printf("AVX2 Block %d \n",width);
264
135
        __m128i vbdmin,vbdmax;
265
266
135
        if (useCubicFilter)
267
122
        {
268
122
          vbdmin = _mm_set1_epi16( clpRng.min() );
269
122
          vbdmax = _mm_set1_epi16( clpRng.max() );
270
122
        }
271
272
2.31k
        for (int y = 0; y<height; y++ )
273
2.17k
        {
274
2.17k
          int deltaInt   = deltaPos >> 5;
275
2.17k
          int deltaFract = deltaPos & (32 - 1);
276
2.17k
          int refMainIndex   = deltaInt + 1;
277
2.17k
          pDst=&pDstBuf[y*dstStride];
278
2.17k
          __m128i tmp = _mm_loadu_si64( ( __m128i const * )&ff[deltaFract<<2] );   //load 4 16 bit filter coeffs
279
2.17k
          tmp = _mm_shuffle_epi32(tmp,0x44);
280
2.17k
          __m256i coeff = _mm256_broadcastsi128_si256(tmp);
281
2.17k
          __m256i src1 = _mm256_broadcastsi128_si256( _mm_loadu_si128( ( __m128i const* ) & refMain[refMainIndex - 1] ) );
282
2.17k
          __m256i src2 = _mm256_broadcastsi128_si256( _mm_loadu_si128( ( __m128i const* ) & refMain[refMainIndex + 4 - 1] ) );
283
2.17k
          src1 = _mm256_shuffle_epi8(src1,shflmask1);                 // -1 0 1 2  0 1 2 3 1 2 3 4  2 3 4 5
284
2.17k
          src2 = _mm256_shuffle_epi8(src2,shflmask1);                 // 3 4 5 6  4 5 6 7  5 6 7 8 6 7 8 9
285
286
2.17k
          src1 = _mm256_madd_epi16 (src1, coeff);
287
2.17k
          src2 = _mm256_madd_epi16 (src2, coeff);
288
289
2.17k
          __m256i  sum  = _mm256_hadd_epi32( src1, src2 );
290
2.17k
          sum = _mm256_permute4x64_epi64(sum,0xD8);
291
292
2.17k
          sum = _mm256_add_epi32( sum, offset );
293
2.17k
          sum = _mm256_srai_epi32( sum, 6 );
294
2.17k
          __m128i dest128 = _mm256_cvtepi32_epi16x( sum );
295
296
2.17k
          if (useCubicFilter)
297
1.83k
            dest128 = _mm_min_epi16( vbdmax, _mm_max_epi16( vbdmin, dest128 ) );
298
299
2.17k
          _mm_storeu_si128( ( __m128i * )(pDst), dest128);
300
2.17k
          deltaPos += intraPredAngle;
301
2.17k
        }
302
135
      }
303
#endif
304
791
    }
305
0
    else
306
0
    {
307
0
      __m128i shflmask1= _mm_set_epi8( 0x9, 0x8, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2,   0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0 );
308
0
      __m128i shflmask2= _mm_set_epi8( 0xd, 0xc, 0xb, 0xa, 0x9, 0x8, 0x7, 0x6,   0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4 );
309
0
      __m128i vbdmin,vbdmax;
310
311
0
      __m128i offset = _mm_set1_epi32( 32 );
312
313
0
      if (useCubicFilter)
314
0
      {
315
0
        vbdmin = _mm_set1_epi16( clpRng.min() );
316
0
        vbdmax = _mm_set1_epi16( clpRng.max() );
317
0
      }
318
319
0
      for (int y = 0; y<height; y++ )
320
0
      {
321
0
        int deltaInt   = deltaPos >> 5;
322
0
        int deltaFract = deltaPos & (32 - 1);
323
0
        int refMainIndex   = deltaInt + 1;
324
0
        pDst=&pDstBuf[y*dstStride];
325
0
        __m128i coeff = _mm_loadu_si64( ( __m128i const * )&ff[deltaFract<<2] );   //load 4 16 bit filter coeffs
326
0
        coeff = _mm_shuffle_epi32(coeff,0x44);
327
0
        for( int x = 0; x < width; x+=8)
328
0
        {
329
0
          __m128i src0 = _mm_loadu_si128( ( __m128i const * )&refMain[refMainIndex - 1] );   //load 8 16 bit reference Pels   -1 0 1 2 3 4 5 6
330
0
          __m128i src1 = _mm_shuffle_epi8(src0,shflmask1);                  // -1 0 1 2  0 1 2 3
331
0
          __m128i src2 = _mm_shuffle_epi8(src0,shflmask2);                  // 1 2 3 4  2 3 4 5
332
0
          src0 = _mm_madd_epi16( coeff,src1 );
333
0
          src1 = _mm_madd_epi16( coeff,src2 );
334
0
          __m128i sum  = _mm_hadd_epi32( src0, src1 );
335
0
          sum = _mm_add_epi32( sum, offset );
336
0
          sum = _mm_srai_epi32( sum, 6 );
337
338
0
          refMainIndex+=4;
339
0
          src0 = _mm_loadu_si128( ( __m128i const * )&refMain[refMainIndex - 1] );   //load 8 16 bit reference Pels   -1 0 1 2 3 4 5 6
340
0
          src1 = _mm_shuffle_epi8(src0,shflmask1);                                // -1 0 1 2  0 1 2 3
341
0
          src2 = _mm_shuffle_epi8(src0,shflmask2);
342
343
          // 1 2 3 4  2 3 4 5
344
0
          src0 = _mm_madd_epi16( coeff,src1 );
345
0
          src1 = _mm_madd_epi16( coeff,src2 );
346
347
0
          __m128i sum1  = _mm_hadd_epi32( src0, src1 );
348
0
          sum1 = _mm_add_epi32( sum1, offset );
349
0
          sum1 = _mm_srai_epi32( sum1, 6 );
350
0
          src0 = _mm_packs_epi32( sum, sum1 );
351
352
0
          refMainIndex+=4;
353
354
0
          if (useCubicFilter)
355
0
            src0 = _mm_min_epi16( vbdmax, _mm_max_epi16( vbdmin, src0 ) );
356
357
0
          _mm_storeu_si128( ( __m128i * )(pDst + x), src0);
358
359
0
        }
360
0
        deltaPos += intraPredAngle;
361
0
      }
362
0
    }
363
791
  }
364
119
  else if( W == 4 )
365
119
  {
366
367
119
    __m128i shflmask1= _mm_set_epi8( 0x9, 0x8, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2,   0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0 );
368
119
    __m128i shflmask2= _mm_set_epi8( 0xd, 0xc, 0xb, 0xa, 0x9, 0x8, 0x7, 0x6,   0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4 );
369
119
    __m128i vbdmin,vbdmax;
370
371
119
    __m128i offset = _mm_set1_epi32( 32 );
372
373
119
    if (useCubicFilter)
374
119
    {
375
119
      vbdmin = _mm_set1_epi16( clpRng.min() );
376
119
      vbdmax = _mm_set1_epi16( clpRng.max() );
377
119
    }
378
379
2.37k
    for (int y = 0; y<height; y++ )
380
2.26k
    {
381
2.26k
      int deltaInt   = deltaPos >> 5;
382
2.26k
      int deltaFract = deltaPos & (32 - 1);
383
2.26k
      int refMainIndex   = deltaInt + 1;
384
2.26k
      pDst=&pDstBuf[y*dstStride];
385
2.26k
      __m128i coeff = _mm_loadu_si64( ( __m128i const * )&ff[deltaFract<<2] );   //load 4 16 bit filter coeffs
386
2.26k
      coeff = _mm_shuffle_epi32(coeff,0x44);
387
2.26k
      {
388
2.26k
        __m128i src0 = _mm_loadu_si128( ( __m128i const * )&refMain[refMainIndex - 1] );   //load 8 16 bit reference Pels   -1 0 1 2 3 4 5 6
389
2.26k
        __m128i src1 = _mm_shuffle_epi8(src0,shflmask1);                  // -1 0 1 2  0 1 2 3
390
2.26k
        __m128i src2 = _mm_shuffle_epi8(src0,shflmask2);                  // 1 2 3 4  2 3 4 5
391
2.26k
        src0 = _mm_madd_epi16( coeff,src1 );
392
2.26k
        src1 = _mm_madd_epi16( coeff,src2 );
393
2.26k
        __m128i sum  = _mm_hadd_epi32( src0, src1 );
394
2.26k
        sum = _mm_add_epi32( sum, offset );
395
2.26k
        sum = _mm_srai_epi32( sum, 6 );
396
397
2.26k
        src0 = _mm_packs_epi32( sum, sum );
398
399
2.26k
        refMainIndex+=4;
400
401
2.26k
        if (useCubicFilter)
402
2.26k
          src0 = _mm_min_epi16( vbdmax, _mm_max_epi16( vbdmin, src0 ) );
403
404
2.26k
        _mm_storeu_si64( ( __m128i * )(pDst ), src0);
405
406
2.26k
      }
407
2.26k
      deltaPos += intraPredAngle;
408
2.26k
    }
409
119
  }
410
0
  else
411
0
  {
412
0
    THROW_FATAL( "Unsupported size in IntraPredAngleCore_SIMD" );
413
0
  }
414
#if USE_AVX2
415
910
  _mm256_zeroupper();
416
910
#endif
417
910
}
Unexecuted instantiation: void vvdec::IntraPredAngleCore_SIMD<(vvdec::x86_simd::X86_VEXT)1, 4>(short*, long, short*, int, int, int, int, short const*, bool, vvdec::ClpRngTemplate<short> const&)
Unexecuted instantiation: void vvdec::IntraPredAngleCore_SIMD<(vvdec::x86_simd::X86_VEXT)1, 8>(short*, long, short*, int, int, int, int, short const*, bool, vvdec::ClpRngTemplate<short> const&)
void vvdec::IntraPredAngleCore_SIMD<(vvdec::x86_simd::X86_VEXT)4, 4>(short*, long, short*, int, int, int, int, short const*, bool, vvdec::ClpRngTemplate<short> const&)
Line
Count
Source
184
119
{
185
119
  int16_t* pDst;
186
187
119
  if( W == 8 )
188
0
  {
189
0
    if( vext >= AVX2 )
190
0
    {
191
0
#ifdef USE_AVX2
192
0
      __m256i shflmask1= _mm256_set_epi8(0xd, 0xc, 0xb, 0xa, 0x9, 0x8, 0x7, 0x6,   0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4,
193
0
          0x9, 0x8, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2,   0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0 );
194
0
      __m256i offset = _mm256_set1_epi32( 32 );
195
196
0
      if (( width & 15 ) == 0 )
197
0
      {
198
0
        __m256i vbdmin,vbdmax;
199
200
0
        if (useCubicFilter)
201
0
        {
202
0
          vbdmin = _mm256_set1_epi16( clpRng.min() );
203
0
          vbdmax = _mm256_set1_epi16( clpRng.max() );
204
0
        }
205
206
0
        for (int y = 0; y<height; y++ )
207
0
        {
208
0
          int deltaInt   = deltaPos >> 5;
209
0
          int deltaFract = deltaPos & (32 - 1);
210
0
          int refMainIndex   = deltaInt + 1;
211
0
          pDst=&pDstBuf[y*dstStride];
212
0
          __m128i tmp = _mm_loadu_si64( ( __m128i const * )&ff[deltaFract<<2] );   //load 4 16 bit filter coeffs
213
0
          tmp = _mm_shuffle_epi32(tmp,0x44);
214
0
          __m256i coeff = _mm256_broadcastsi128_si256(tmp);
215
0
          for( int x = 0; x < width; x+=16)
216
0
          {
217
0
            __m256i src1 = _mm256_broadcastsi128_si256( _mm_loadu_si128( ( const __m128i* ) &refMain[refMainIndex     - 1] ) );
218
0
            __m256i src2 = _mm256_broadcastsi128_si256( _mm_loadu_si128( ( const __m128i* ) &refMain[refMainIndex + 4 - 1] ) );
219
0
            src1 = _mm256_shuffle_epi8(src1,shflmask1);                 // -1 0 1 2  0 1 2 3 1 2 3 4  2 3 4 5
220
0
            src2 = _mm256_shuffle_epi8(src2,shflmask1);                 // 3 4 5 6  4 5 6 7  5 6 7 8 6 7 8 9
221
222
0
            src1 = _mm256_madd_epi16 (src1, coeff);
223
0
            src2 = _mm256_madd_epi16 (src2, coeff);
224
225
0
            __m256i  sum  = _mm256_hadd_epi32( src1, src2 );
226
0
            sum = _mm256_permute4x64_epi64(sum,0xD8);
227
228
0
            sum = _mm256_add_epi32( sum, offset );
229
0
            sum = _mm256_srai_epi32( sum, 6 );
230
231
0
            refMainIndex+=8;
232
            
233
0
            src1 = _mm256_broadcastsi128_si256( _mm_loadu_si128( ( __m128i const* ) &refMain[refMainIndex     - 1] ) );
234
0
            src2 = _mm256_broadcastsi128_si256( _mm_loadu_si128( ( __m128i const* ) &refMain[refMainIndex + 4 - 1] ) );
235
236
0
            src1 = _mm256_shuffle_epi8(src1,shflmask1);                 // -1 0 1 2  0 1 2 3 1 2 3 4  2 3 4 5
237
0
            src2 = _mm256_shuffle_epi8(src2,shflmask1);                 // 3 4 5 6  4 5 6 7  5 6 7 8 6 7 8 9
238
0
            src1 = _mm256_madd_epi16 (src1, coeff);
239
0
            src2 = _mm256_madd_epi16 (src2, coeff);
240
241
0
            __m256i  sum1  = _mm256_hadd_epi32( src1, src2 );
242
0
            sum1 = _mm256_permute4x64_epi64(sum1,0xD8);
243
244
0
            sum1 = _mm256_add_epi32( sum1, offset );
245
0
            sum1 = _mm256_srai_epi32( sum1, 6 );
246
0
            __m256i
247
0
            src0 = _mm256_packs_epi32( sum, sum1 );
248
249
0
            src0 = _mm256_permute4x64_epi64(src0,0xD8);
250
251
0
            refMainIndex+=8;
252
253
0
            if (useCubicFilter)
254
0
              src0 = _mm256_min_epi16( vbdmax, _mm256_max_epi16( vbdmin, src0 ) );
255
256
0
            _mm256_storeu_si256( ( __m256i * )(pDst + x), src0);
257
0
          }
258
0
          deltaPos += intraPredAngle;
259
0
        }
260
0
      }
261
0
      else // width =8
262
0
      {
263
        //        printf("AVX2 Block %d \n",width);
264
0
        __m128i vbdmin,vbdmax;
265
266
0
        if (useCubicFilter)
267
0
        {
268
0
          vbdmin = _mm_set1_epi16( clpRng.min() );
269
0
          vbdmax = _mm_set1_epi16( clpRng.max() );
270
0
        }
271
272
0
        for (int y = 0; y<height; y++ )
273
0
        {
274
0
          int deltaInt   = deltaPos >> 5;
275
0
          int deltaFract = deltaPos & (32 - 1);
276
0
          int refMainIndex   = deltaInt + 1;
277
0
          pDst=&pDstBuf[y*dstStride];
278
0
          __m128i tmp = _mm_loadu_si64( ( __m128i const * )&ff[deltaFract<<2] );   //load 4 16 bit filter coeffs
279
0
          tmp = _mm_shuffle_epi32(tmp,0x44);
280
0
          __m256i coeff = _mm256_broadcastsi128_si256(tmp);
281
0
          __m256i src1 = _mm256_broadcastsi128_si256( _mm_loadu_si128( ( __m128i const* ) & refMain[refMainIndex - 1] ) );
282
0
          __m256i src2 = _mm256_broadcastsi128_si256( _mm_loadu_si128( ( __m128i const* ) & refMain[refMainIndex + 4 - 1] ) );
283
0
          src1 = _mm256_shuffle_epi8(src1,shflmask1);                 // -1 0 1 2  0 1 2 3 1 2 3 4  2 3 4 5
284
0
          src2 = _mm256_shuffle_epi8(src2,shflmask1);                 // 3 4 5 6  4 5 6 7  5 6 7 8 6 7 8 9
285
286
0
          src1 = _mm256_madd_epi16 (src1, coeff);
287
0
          src2 = _mm256_madd_epi16 (src2, coeff);
288
289
0
          __m256i  sum  = _mm256_hadd_epi32( src1, src2 );
290
0
          sum = _mm256_permute4x64_epi64(sum,0xD8);
291
292
0
          sum = _mm256_add_epi32( sum, offset );
293
0
          sum = _mm256_srai_epi32( sum, 6 );
294
0
          __m128i dest128 = _mm256_cvtepi32_epi16x( sum );
295
296
0
          if (useCubicFilter)
297
0
            dest128 = _mm_min_epi16( vbdmax, _mm_max_epi16( vbdmin, dest128 ) );
298
299
0
          _mm_storeu_si128( ( __m128i * )(pDst), dest128);
300
0
          deltaPos += intraPredAngle;
301
0
        }
302
0
      }
303
0
#endif
304
0
    }
305
0
    else
306
0
    {
307
0
      __m128i shflmask1= _mm_set_epi8( 0x9, 0x8, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2,   0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0 );
308
0
      __m128i shflmask2= _mm_set_epi8( 0xd, 0xc, 0xb, 0xa, 0x9, 0x8, 0x7, 0x6,   0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4 );
309
0
      __m128i vbdmin,vbdmax;
310
311
0
      __m128i offset = _mm_set1_epi32( 32 );
312
313
0
      if (useCubicFilter)
314
0
      {
315
0
        vbdmin = _mm_set1_epi16( clpRng.min() );
316
0
        vbdmax = _mm_set1_epi16( clpRng.max() );
317
0
      }
318
319
0
      for (int y = 0; y<height; y++ )
320
0
      {
321
0
        int deltaInt   = deltaPos >> 5;
322
0
        int deltaFract = deltaPos & (32 - 1);
323
0
        int refMainIndex   = deltaInt + 1;
324
0
        pDst=&pDstBuf[y*dstStride];
325
0
        __m128i coeff = _mm_loadu_si64( ( __m128i const * )&ff[deltaFract<<2] );   //load 4 16 bit filter coeffs
326
0
        coeff = _mm_shuffle_epi32(coeff,0x44);
327
0
        for( int x = 0; x < width; x+=8)
328
0
        {
329
0
          __m128i src0 = _mm_loadu_si128( ( __m128i const * )&refMain[refMainIndex - 1] );   //load 8 16 bit reference Pels   -1 0 1 2 3 4 5 6
330
0
          __m128i src1 = _mm_shuffle_epi8(src0,shflmask1);                  // -1 0 1 2  0 1 2 3
331
0
          __m128i src2 = _mm_shuffle_epi8(src0,shflmask2);                  // 1 2 3 4  2 3 4 5
332
0
          src0 = _mm_madd_epi16( coeff,src1 );
333
0
          src1 = _mm_madd_epi16( coeff,src2 );
334
0
          __m128i sum  = _mm_hadd_epi32( src0, src1 );
335
0
          sum = _mm_add_epi32( sum, offset );
336
0
          sum = _mm_srai_epi32( sum, 6 );
337
338
0
          refMainIndex+=4;
339
0
          src0 = _mm_loadu_si128( ( __m128i const * )&refMain[refMainIndex - 1] );   //load 8 16 bit reference Pels   -1 0 1 2 3 4 5 6
340
0
          src1 = _mm_shuffle_epi8(src0,shflmask1);                                // -1 0 1 2  0 1 2 3
341
0
          src2 = _mm_shuffle_epi8(src0,shflmask2);
342
343
          // 1 2 3 4  2 3 4 5
344
0
          src0 = _mm_madd_epi16( coeff,src1 );
345
0
          src1 = _mm_madd_epi16( coeff,src2 );
346
347
0
          __m128i sum1  = _mm_hadd_epi32( src0, src1 );
348
0
          sum1 = _mm_add_epi32( sum1, offset );
349
0
          sum1 = _mm_srai_epi32( sum1, 6 );
350
0
          src0 = _mm_packs_epi32( sum, sum1 );
351
352
0
          refMainIndex+=4;
353
354
0
          if (useCubicFilter)
355
0
            src0 = _mm_min_epi16( vbdmax, _mm_max_epi16( vbdmin, src0 ) );
356
357
0
          _mm_storeu_si128( ( __m128i * )(pDst + x), src0);
358
359
0
        }
360
0
        deltaPos += intraPredAngle;
361
0
      }
362
0
    }
363
0
  }
364
119
  else if( W == 4 )
365
119
  {
366
367
119
    __m128i shflmask1= _mm_set_epi8( 0x9, 0x8, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2,   0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0 );
368
119
    __m128i shflmask2= _mm_set_epi8( 0xd, 0xc, 0xb, 0xa, 0x9, 0x8, 0x7, 0x6,   0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4 );
369
119
    __m128i vbdmin,vbdmax;
370
371
119
    __m128i offset = _mm_set1_epi32( 32 );
372
373
119
    if (useCubicFilter)
374
119
    {
375
119
      vbdmin = _mm_set1_epi16( clpRng.min() );
376
119
      vbdmax = _mm_set1_epi16( clpRng.max() );
377
119
    }
378
379
2.37k
    for (int y = 0; y<height; y++ )
380
2.26k
    {
381
2.26k
      int deltaInt   = deltaPos >> 5;
382
2.26k
      int deltaFract = deltaPos & (32 - 1);
383
2.26k
      int refMainIndex   = deltaInt + 1;
384
2.26k
      pDst=&pDstBuf[y*dstStride];
385
2.26k
      __m128i coeff = _mm_loadu_si64( ( __m128i const * )&ff[deltaFract<<2] );   //load 4 16 bit filter coeffs
386
2.26k
      coeff = _mm_shuffle_epi32(coeff,0x44);
387
2.26k
      {
388
2.26k
        __m128i src0 = _mm_loadu_si128( ( __m128i const * )&refMain[refMainIndex - 1] );   //load 8 16 bit reference Pels   -1 0 1 2 3 4 5 6
389
2.26k
        __m128i src1 = _mm_shuffle_epi8(src0,shflmask1);                  // -1 0 1 2  0 1 2 3
390
2.26k
        __m128i src2 = _mm_shuffle_epi8(src0,shflmask2);                  // 1 2 3 4  2 3 4 5
391
2.26k
        src0 = _mm_madd_epi16( coeff,src1 );
392
2.26k
        src1 = _mm_madd_epi16( coeff,src2 );
393
2.26k
        __m128i sum  = _mm_hadd_epi32( src0, src1 );
394
2.26k
        sum = _mm_add_epi32( sum, offset );
395
2.26k
        sum = _mm_srai_epi32( sum, 6 );
396
397
2.26k
        src0 = _mm_packs_epi32( sum, sum );
398
399
2.26k
        refMainIndex+=4;
400
401
2.26k
        if (useCubicFilter)
402
2.26k
          src0 = _mm_min_epi16( vbdmax, _mm_max_epi16( vbdmin, src0 ) );
403
404
2.26k
        _mm_storeu_si64( ( __m128i * )(pDst ), src0);
405
406
2.26k
      }
407
2.26k
      deltaPos += intraPredAngle;
408
2.26k
    }
409
119
  }
410
0
  else
411
0
  {
412
0
    THROW_FATAL( "Unsupported size in IntraPredAngleCore_SIMD" );
413
0
  }
414
119
#if USE_AVX2
415
119
  _mm256_zeroupper();
416
119
#endif
417
119
}
void vvdec::IntraPredAngleCore_SIMD<(vvdec::x86_simd::X86_VEXT)4, 8>(short*, long, short*, int, int, int, int, short const*, bool, vvdec::ClpRngTemplate<short> const&)
Line
Count
Source
184
791
{
185
791
  int16_t* pDst;
186
187
791
  if( W == 8 )
188
791
  {
189
791
    if( vext >= AVX2 )
190
791
    {
191
791
#ifdef USE_AVX2
192
791
      __m256i shflmask1= _mm256_set_epi8(0xd, 0xc, 0xb, 0xa, 0x9, 0x8, 0x7, 0x6,   0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4,
193
791
          0x9, 0x8, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2,   0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0 );
194
791
      __m256i offset = _mm256_set1_epi32( 32 );
195
196
791
      if (( width & 15 ) == 0 )
197
656
      {
198
656
        __m256i vbdmin,vbdmax;
199
200
656
        if (useCubicFilter)
201
268
        {
202
268
          vbdmin = _mm256_set1_epi16( clpRng.min() );
203
268
          vbdmax = _mm256_set1_epi16( clpRng.max() );
204
268
        }
205
206
18.7k
        for (int y = 0; y<height; y++ )
207
18.0k
        {
208
18.0k
          int deltaInt   = deltaPos >> 5;
209
18.0k
          int deltaFract = deltaPos & (32 - 1);
210
18.0k
          int refMainIndex   = deltaInt + 1;
211
18.0k
          pDst=&pDstBuf[y*dstStride];
212
18.0k
          __m128i tmp = _mm_loadu_si64( ( __m128i const * )&ff[deltaFract<<2] );   //load 4 16 bit filter coeffs
213
18.0k
          tmp = _mm_shuffle_epi32(tmp,0x44);
214
18.0k
          __m256i coeff = _mm256_broadcastsi128_si256(tmp);
215
72.2k
          for( int x = 0; x < width; x+=16)
216
54.2k
          {
217
54.2k
            __m256i src1 = _mm256_broadcastsi128_si256( _mm_loadu_si128( ( const __m128i* ) &refMain[refMainIndex     - 1] ) );
218
54.2k
            __m256i src2 = _mm256_broadcastsi128_si256( _mm_loadu_si128( ( const __m128i* ) &refMain[refMainIndex + 4 - 1] ) );
219
54.2k
            src1 = _mm256_shuffle_epi8(src1,shflmask1);                 // -1 0 1 2  0 1 2 3 1 2 3 4  2 3 4 5
220
54.2k
            src2 = _mm256_shuffle_epi8(src2,shflmask1);                 // 3 4 5 6  4 5 6 7  5 6 7 8 6 7 8 9
221
222
54.2k
            src1 = _mm256_madd_epi16 (src1, coeff);
223
54.2k
            src2 = _mm256_madd_epi16 (src2, coeff);
224
225
54.2k
            __m256i  sum  = _mm256_hadd_epi32( src1, src2 );
226
54.2k
            sum = _mm256_permute4x64_epi64(sum,0xD8);
227
228
54.2k
            sum = _mm256_add_epi32( sum, offset );
229
54.2k
            sum = _mm256_srai_epi32( sum, 6 );
230
231
54.2k
            refMainIndex+=8;
232
            
233
54.2k
            src1 = _mm256_broadcastsi128_si256( _mm_loadu_si128( ( __m128i const* ) &refMain[refMainIndex     - 1] ) );
234
54.2k
            src2 = _mm256_broadcastsi128_si256( _mm_loadu_si128( ( __m128i const* ) &refMain[refMainIndex + 4 - 1] ) );
235
236
54.2k
            src1 = _mm256_shuffle_epi8(src1,shflmask1);                 // -1 0 1 2  0 1 2 3 1 2 3 4  2 3 4 5
237
54.2k
            src2 = _mm256_shuffle_epi8(src2,shflmask1);                 // 3 4 5 6  4 5 6 7  5 6 7 8 6 7 8 9
238
54.2k
            src1 = _mm256_madd_epi16 (src1, coeff);
239
54.2k
            src2 = _mm256_madd_epi16 (src2, coeff);
240
241
54.2k
            __m256i  sum1  = _mm256_hadd_epi32( src1, src2 );
242
54.2k
            sum1 = _mm256_permute4x64_epi64(sum1,0xD8);
243
244
54.2k
            sum1 = _mm256_add_epi32( sum1, offset );
245
54.2k
            sum1 = _mm256_srai_epi32( sum1, 6 );
246
54.2k
            __m256i
247
54.2k
            src0 = _mm256_packs_epi32( sum, sum1 );
248
249
54.2k
            src0 = _mm256_permute4x64_epi64(src0,0xD8);
250
251
54.2k
            refMainIndex+=8;
252
253
54.2k
            if (useCubicFilter)
254
13.3k
              src0 = _mm256_min_epi16( vbdmax, _mm256_max_epi16( vbdmin, src0 ) );
255
256
54.2k
            _mm256_storeu_si256( ( __m256i * )(pDst + x), src0);
257
54.2k
          }
258
18.0k
          deltaPos += intraPredAngle;
259
18.0k
        }
260
656
      }
261
135
      else // width =8
262
135
      {
263
        //        printf("AVX2 Block %d \n",width);
264
135
        __m128i vbdmin,vbdmax;
265
266
135
        if (useCubicFilter)
267
122
        {
268
122
          vbdmin = _mm_set1_epi16( clpRng.min() );
269
122
          vbdmax = _mm_set1_epi16( clpRng.max() );
270
122
        }
271
272
2.31k
        for (int y = 0; y<height; y++ )
273
2.17k
        {
274
2.17k
          int deltaInt   = deltaPos >> 5;
275
2.17k
          int deltaFract = deltaPos & (32 - 1);
276
2.17k
          int refMainIndex   = deltaInt + 1;
277
2.17k
          pDst=&pDstBuf[y*dstStride];
278
2.17k
          __m128i tmp = _mm_loadu_si64( ( __m128i const * )&ff[deltaFract<<2] );   //load 4 16 bit filter coeffs
279
2.17k
          tmp = _mm_shuffle_epi32(tmp,0x44);
280
2.17k
          __m256i coeff = _mm256_broadcastsi128_si256(tmp);
281
2.17k
          __m256i src1 = _mm256_broadcastsi128_si256( _mm_loadu_si128( ( __m128i const* ) & refMain[refMainIndex - 1] ) );
282
2.17k
          __m256i src2 = _mm256_broadcastsi128_si256( _mm_loadu_si128( ( __m128i const* ) & refMain[refMainIndex + 4 - 1] ) );
283
2.17k
          src1 = _mm256_shuffle_epi8(src1,shflmask1);                 // -1 0 1 2  0 1 2 3 1 2 3 4  2 3 4 5
284
2.17k
          src2 = _mm256_shuffle_epi8(src2,shflmask1);                 // 3 4 5 6  4 5 6 7  5 6 7 8 6 7 8 9
285
286
2.17k
          src1 = _mm256_madd_epi16 (src1, coeff);
287
2.17k
          src2 = _mm256_madd_epi16 (src2, coeff);
288
289
2.17k
          __m256i  sum  = _mm256_hadd_epi32( src1, src2 );
290
2.17k
          sum = _mm256_permute4x64_epi64(sum,0xD8);
291
292
2.17k
          sum = _mm256_add_epi32( sum, offset );
293
2.17k
          sum = _mm256_srai_epi32( sum, 6 );
294
2.17k
          __m128i dest128 = _mm256_cvtepi32_epi16x( sum );
295
296
2.17k
          if (useCubicFilter)
297
1.83k
            dest128 = _mm_min_epi16( vbdmax, _mm_max_epi16( vbdmin, dest128 ) );
298
299
2.17k
          _mm_storeu_si128( ( __m128i * )(pDst), dest128);
300
2.17k
          deltaPos += intraPredAngle;
301
2.17k
        }
302
135
      }
303
791
#endif
304
791
    }
305
0
    else
306
0
    {
307
0
      __m128i shflmask1= _mm_set_epi8( 0x9, 0x8, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2,   0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0 );
308
0
      __m128i shflmask2= _mm_set_epi8( 0xd, 0xc, 0xb, 0xa, 0x9, 0x8, 0x7, 0x6,   0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4 );
309
0
      __m128i vbdmin,vbdmax;
310
311
0
      __m128i offset = _mm_set1_epi32( 32 );
312
313
0
      if (useCubicFilter)
314
0
      {
315
0
        vbdmin = _mm_set1_epi16( clpRng.min() );
316
0
        vbdmax = _mm_set1_epi16( clpRng.max() );
317
0
      }
318
319
0
      for (int y = 0; y<height; y++ )
320
0
      {
321
0
        int deltaInt   = deltaPos >> 5;
322
0
        int deltaFract = deltaPos & (32 - 1);
323
0
        int refMainIndex   = deltaInt + 1;
324
0
        pDst=&pDstBuf[y*dstStride];
325
0
        __m128i coeff = _mm_loadu_si64( ( __m128i const * )&ff[deltaFract<<2] );   //load 4 16 bit filter coeffs
326
0
        coeff = _mm_shuffle_epi32(coeff,0x44);
327
0
        for( int x = 0; x < width; x+=8)
328
0
        {
329
0
          __m128i src0 = _mm_loadu_si128( ( __m128i const * )&refMain[refMainIndex - 1] );   //load 8 16 bit reference Pels   -1 0 1 2 3 4 5 6
330
0
          __m128i src1 = _mm_shuffle_epi8(src0,shflmask1);                  // -1 0 1 2  0 1 2 3
331
0
          __m128i src2 = _mm_shuffle_epi8(src0,shflmask2);                  // 1 2 3 4  2 3 4 5
332
0
          src0 = _mm_madd_epi16( coeff,src1 );
333
0
          src1 = _mm_madd_epi16( coeff,src2 );
334
0
          __m128i sum  = _mm_hadd_epi32( src0, src1 );
335
0
          sum = _mm_add_epi32( sum, offset );
336
0
          sum = _mm_srai_epi32( sum, 6 );
337
338
0
          refMainIndex+=4;
339
0
          src0 = _mm_loadu_si128( ( __m128i const * )&refMain[refMainIndex - 1] );   //load 8 16 bit reference Pels   -1 0 1 2 3 4 5 6
340
0
          src1 = _mm_shuffle_epi8(src0,shflmask1);                                // -1 0 1 2  0 1 2 3
341
0
          src2 = _mm_shuffle_epi8(src0,shflmask2);
342
343
          // 1 2 3 4  2 3 4 5
344
0
          src0 = _mm_madd_epi16( coeff,src1 );
345
0
          src1 = _mm_madd_epi16( coeff,src2 );
346
347
0
          __m128i sum1  = _mm_hadd_epi32( src0, src1 );
348
0
          sum1 = _mm_add_epi32( sum1, offset );
349
0
          sum1 = _mm_srai_epi32( sum1, 6 );
350
0
          src0 = _mm_packs_epi32( sum, sum1 );
351
352
0
          refMainIndex+=4;
353
354
0
          if (useCubicFilter)
355
0
            src0 = _mm_min_epi16( vbdmax, _mm_max_epi16( vbdmin, src0 ) );
356
357
0
          _mm_storeu_si128( ( __m128i * )(pDst + x), src0);
358
359
0
        }
360
0
        deltaPos += intraPredAngle;
361
0
      }
362
0
    }
363
791
  }
364
0
  else if( W == 4 )
365
0
  {
366
367
0
    __m128i shflmask1= _mm_set_epi8( 0x9, 0x8, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2,   0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0 );
368
0
    __m128i shflmask2= _mm_set_epi8( 0xd, 0xc, 0xb, 0xa, 0x9, 0x8, 0x7, 0x6,   0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4 );
369
0
    __m128i vbdmin,vbdmax;
370
371
0
    __m128i offset = _mm_set1_epi32( 32 );
372
373
0
    if (useCubicFilter)
374
0
    {
375
0
      vbdmin = _mm_set1_epi16( clpRng.min() );
376
0
      vbdmax = _mm_set1_epi16( clpRng.max() );
377
0
    }
378
379
0
    for (int y = 0; y<height; y++ )
380
0
    {
381
0
      int deltaInt   = deltaPos >> 5;
382
0
      int deltaFract = deltaPos & (32 - 1);
383
0
      int refMainIndex   = deltaInt + 1;
384
0
      pDst=&pDstBuf[y*dstStride];
385
0
      __m128i coeff = _mm_loadu_si64( ( __m128i const * )&ff[deltaFract<<2] );   //load 4 16 bit filter coeffs
386
0
      coeff = _mm_shuffle_epi32(coeff,0x44);
387
0
      {
388
0
        __m128i src0 = _mm_loadu_si128( ( __m128i const * )&refMain[refMainIndex - 1] );   //load 8 16 bit reference Pels   -1 0 1 2 3 4 5 6
389
0
        __m128i src1 = _mm_shuffle_epi8(src0,shflmask1);                  // -1 0 1 2  0 1 2 3
390
0
        __m128i src2 = _mm_shuffle_epi8(src0,shflmask2);                  // 1 2 3 4  2 3 4 5
391
0
        src0 = _mm_madd_epi16( coeff,src1 );
392
0
        src1 = _mm_madd_epi16( coeff,src2 );
393
0
        __m128i sum  = _mm_hadd_epi32( src0, src1 );
394
0
        sum = _mm_add_epi32( sum, offset );
395
0
        sum = _mm_srai_epi32( sum, 6 );
396
397
0
        src0 = _mm_packs_epi32( sum, sum );
398
399
0
        refMainIndex+=4;
400
401
0
        if (useCubicFilter)
402
0
          src0 = _mm_min_epi16( vbdmax, _mm_max_epi16( vbdmin, src0 ) );
403
404
0
        _mm_storeu_si64( ( __m128i * )(pDst ), src0);
405
406
0
      }
407
0
      deltaPos += intraPredAngle;
408
0
    }
409
0
  }
410
0
  else
411
0
  {
412
0
    THROW_FATAL( "Unsupported size in IntraPredAngleCore_SIMD" );
413
0
  }
414
791
#if USE_AVX2
415
791
  _mm256_zeroupper();
416
791
#endif
417
791
}
418
419
template< X86_VEXT vext, int W >
420
void  IntraPredSampleFilter_SIMD(Pel *ptrSrc,const ptrdiff_t srcStride,PelBuf &piPred,const uint32_t uiDirMode,const ClpRng& clpRng)
421
3.97k
{
422
3.97k
  const int       iWidth    = piPred.width;
423
3.97k
  const int       iHeight   = piPred.height;
424
3.97k
  PelBuf          dstBuf    = piPred;
425
3.97k
  Pel*            pDst      = dstBuf.buf;
426
3.97k
  const ptrdiff_t dstStride = dstBuf.stride;
427
428
3.97k
  const int scale = ((getLog2(iWidth) - 2 + getLog2(iHeight) - 2 + 2) >> 2);
429
3.97k
  CHECK(scale < 0 || scale > 31, "PDPC: scale < 0 || scale > 2");
430
431
#if USE_AVX2
432
3.97k
  if( W > 8 )
433
2.71k
  {
434
2.71k
    __m256i tmplo,tmphi;
435
2.71k
    __m256i w64 = _mm256_set_epi16(64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64);
436
2.71k
    __m256i w32 = _mm256_set_epi32(32,32,32,32,32,32,32,32);
437
2.71k
    __m256i vbdmin   = _mm256_set1_epi32( clpRng.min() );
438
2.71k
    __m256i vbdmax   = _mm256_set1_epi32( clpRng.max() );
439
2.71k
    __m256i wl16;
440
2.71k
    __m256i wl16start;
441
442
2.71k
    if (scale==0)
443
0
    {
444
0
      wl16start = _mm256_set_epi16(0,0,0,0,0,0,0,0,0,0,0,0,0,2,8,32);
445
0
    }
446
2.71k
    else if (scale==1)
447
1.70k
    {
448
1.70k
      wl16start = _mm256_set_epi16(0,0,0,0,0,0,0,0,0,0,1,2,4,8,16,32);
449
1.70k
    }
450
1.00k
    else if (scale==2)
451
1.00k
    {
452
1.00k
      wl16start = _mm256_set_epi16(0,0,0,0,1,1,2,2,4,4,8,8,16,16,32,32);
453
1.00k
    }
454
0
    else
455
0
    {
456
0
      THROW_FATAL( "Wrong scale (" << scale << ")" );
457
0
    }
458
459
460
2.71k
    if (uiDirMode == PLANAR_IDX || uiDirMode == DC_IDX )
461
2.71k
    {
462
67.1k
      for (int y = 0; y < iHeight; y++)
463
64.4k
      {
464
64.4k
        int wT = 32 >> std::min(31, ((y << 1) >> scale));
465
466
64.4k
        __m256i wt16 = _mm256_set_epi16(wT,wT,wT,wT,wT,wT,wT,wT,wT,wT,wT,wT,wT,wT,wT,wT);
467
64.4k
        __m256i x16left = _mm256_broadcastw_epi16(_mm_loadu_si128 ((__m128i const *) (ptrSrc+((y+1)*srcStride))));
468
469
64.4k
        if (wT)
470
21.8k
        {
471
65.8k
          for (int x = 0; x < iWidth; x+=16)
472
43.9k
          {
473
43.9k
            if (x==0)
474
21.8k
            {
475
21.8k
              wl16=wl16start;
476
477
21.8k
              __m256i x16top = _mm256_loadu_si256((__m256i *) (ptrSrc+x+1)); // load top
478
21.8k
              __m256i x16dst = _mm256_loadu_si256((const __m256i *) (pDst+y*dstStride+x)); // load dst
479
480
21.8k
              tmplo = _mm256_mullo_epi16(x16left,wl16);  //wL * left
481
21.8k
              tmphi = _mm256_mulhi_epi16(x16left,wl16);  //wL * left
482
21.8k
              __m256i leftlo = _mm256_unpacklo_epi16(tmplo,tmphi);
483
21.8k
              __m256i lefthi = _mm256_unpackhi_epi16(tmplo,tmphi);
484
485
21.8k
              tmplo = _mm256_mullo_epi16(x16top,wt16);    // wT*top
486
21.8k
              tmphi = _mm256_mulhi_epi16(x16top,wt16);    // wT*top
487
21.8k
              __m256i toplo = _mm256_unpacklo_epi16(tmplo,tmphi);
488
21.8k
              __m256i tophi = _mm256_unpackhi_epi16(tmplo,tmphi);
489
490
21.8k
              __m256i wX = _mm256_sub_epi16(w64,wl16);
491
21.8k
              wX = _mm256_sub_epi16(wX,wt16);            // 64-wL-wT
492
21.8k
              tmplo = _mm256_mullo_epi16(x16dst,wX);    // 64-wL-wT*dst
493
21.8k
              tmphi = _mm256_mulhi_epi16(x16dst,wX);    // 64-wL-wT*dst
494
21.8k
              __m256i dstlo = _mm256_unpacklo_epi16(tmplo,tmphi);
495
21.8k
              __m256i dsthi = _mm256_unpackhi_epi16(tmplo,tmphi);
496
497
21.8k
              dstlo = _mm256_add_epi32(dstlo,toplo);
498
21.8k
              dsthi = _mm256_add_epi32(dsthi,tophi);
499
21.8k
              dstlo = _mm256_add_epi32(dstlo,leftlo);
500
21.8k
              dsthi = _mm256_add_epi32(dsthi,lefthi);
501
21.8k
              dstlo = _mm256_add_epi32(dstlo,w32);
502
21.8k
              dsthi = _mm256_add_epi32(dsthi,w32);
503
504
21.8k
              dstlo =  _mm256_srai_epi32(dstlo,6);
505
21.8k
              dsthi =  _mm256_srai_epi32(dsthi,6);
506
507
21.8k
              dstlo =  _mm256_max_epi32(vbdmin,dstlo);
508
21.8k
              dsthi =  _mm256_max_epi32(vbdmin,dsthi);
509
21.8k
              dstlo =  _mm256_min_epi32(vbdmax,dstlo);
510
21.8k
              dsthi =  _mm256_min_epi32(vbdmax,dsthi);
511
512
21.8k
              dstlo =  _mm256_packs_epi32(dstlo,dsthi);
513
21.8k
              dstlo =  _mm256_permute4x64_epi64 ( dstlo, ( 0 << 0 ) + ( 1 << 2 ) + ( 2 << 4 ) + ( 3 << 6 ) );
514
515
21.8k
              _mm256_storeu_si256( ( __m256i * )(pDst+y*dstStride+x), dstlo );
516
21.8k
            }
517
22.1k
            else
518
22.1k
            {
519
520
22.1k
              __m256i x16top = _mm256_loadu_si256((__m256i *) (ptrSrc+x+1)); // load top
521
22.1k
              __m256i x16dst = _mm256_loadu_si256((const __m256i *) (pDst+y*dstStride+x)); // load dst
522
523
524
22.1k
              tmplo = _mm256_mullo_epi16(x16top,wt16);    // wT*top
525
22.1k
              tmphi = _mm256_mulhi_epi16(x16top,wt16);    // wT*top
526
22.1k
              __m256i toplo = _mm256_unpacklo_epi16(tmplo,tmphi);
527
22.1k
              __m256i tophi = _mm256_unpackhi_epi16(tmplo,tmphi);
528
529
22.1k
              __m256i wX = _mm256_sub_epi16(w64,wt16);
530
22.1k
              tmplo = _mm256_mullo_epi16(x16dst,wX);    // 64-wL-wT*dst
531
22.1k
              tmphi = _mm256_mulhi_epi16(x16dst,wX);    // 64-wL-wT*dst
532
22.1k
              __m256i dstlo = _mm256_unpacklo_epi16(tmplo,tmphi);
533
22.1k
              __m256i dsthi = _mm256_unpackhi_epi16(tmplo,tmphi);
534
535
22.1k
              dstlo = _mm256_add_epi32(dstlo,toplo);
536
22.1k
              dsthi = _mm256_add_epi32(dsthi,tophi);
537
22.1k
              dstlo = _mm256_add_epi32(dstlo,w32);
538
22.1k
              dsthi = _mm256_add_epi32(dsthi,w32);
539
540
22.1k
              dstlo =  _mm256_srai_epi32(dstlo,6);
541
22.1k
              dsthi =  _mm256_srai_epi32(dsthi,6);
542
543
22.1k
              dstlo =  _mm256_max_epi32(vbdmin,dstlo);
544
22.1k
              dsthi =  _mm256_max_epi32(vbdmin,dsthi);
545
22.1k
              dstlo =  _mm256_min_epi32(vbdmax,dstlo);
546
22.1k
              dsthi =  _mm256_min_epi32(vbdmax,dsthi);
547
548
22.1k
              dstlo =  _mm256_packs_epi32(dstlo,dsthi);
549
22.1k
              dstlo =  _mm256_permute4x64_epi64 ( dstlo, ( 0 << 0 ) + ( 1 << 2 ) + ( 2 << 4 ) + ( 3 << 6 ) );
550
551
22.1k
              _mm256_storeu_si256( ( __m256i * )(pDst+y*dstStride+x), dstlo );
552
22.1k
            }
553
554
43.9k
          }  // for
555
21.8k
        }
556
42.5k
        else
557
42.5k
        { // wT =0
558
559
42.5k
          wl16=wl16start;
560
42.5k
          __m256i x16dst = _mm256_loadu_si256((const __m256i *) (pDst+y*dstStride)); // load dst
561
562
42.5k
          tmplo = _mm256_mullo_epi16(x16left,wl16);  //wL * left
563
42.5k
          tmphi = _mm256_mulhi_epi16(x16left,wl16);  //wL * left
564
42.5k
          __m256i leftlo = _mm256_unpacklo_epi16(tmplo,tmphi);
565
42.5k
          __m256i lefthi = _mm256_unpackhi_epi16(tmplo,tmphi);
566
567
42.5k
          __m256i wX = _mm256_sub_epi16(w64,wl16);
568
42.5k
          tmplo = _mm256_mullo_epi16(x16dst,wX);    // 64-wL-wT*dst
569
42.5k
          tmphi = _mm256_mulhi_epi16(x16dst,wX);    // 64-wL-wT*dst
570
42.5k
          __m256i dstlo = _mm256_unpacklo_epi16(tmplo,tmphi);
571
42.5k
          __m256i dsthi = _mm256_unpackhi_epi16(tmplo,tmphi);
572
573
42.5k
          dstlo = _mm256_add_epi32(dstlo,leftlo);
574
42.5k
          dsthi = _mm256_add_epi32(dsthi,lefthi);
575
42.5k
          dstlo = _mm256_add_epi32(dstlo,w32);
576
42.5k
          dsthi = _mm256_add_epi32(dsthi,w32);
577
578
42.5k
          dstlo =  _mm256_srai_epi32(dstlo,6);
579
42.5k
          dsthi =  _mm256_srai_epi32(dsthi,6);
580
581
42.5k
          dstlo =  _mm256_max_epi32(vbdmin,dstlo);
582
42.5k
          dsthi =  _mm256_max_epi32(vbdmin,dsthi);
583
42.5k
          dstlo =  _mm256_min_epi32(vbdmax,dstlo);
584
42.5k
          dsthi =  _mm256_min_epi32(vbdmax,dsthi);
585
586
42.5k
          dstlo =  _mm256_packs_epi32(dstlo,dsthi);
587
42.5k
          dstlo =  _mm256_permute4x64_epi64 ( dstlo, ( 0 << 0 ) + ( 1 << 2 ) + ( 2 << 4 ) + ( 3 << 6 ) );
588
589
42.5k
          _mm256_storeu_si256( ( __m256i * )(pDst+y*dstStride), dstlo );
590
42.5k
        }
591
64.4k
      }
592
2.71k
    }
593
2.71k
  }
594
1.25k
  else
595
1.25k
#endif
596
1.25k
  {
597
1.25k
    __m128i tmplo8,tmphi8;
598
1.25k
    __m128i w64_8 = _mm_set_epi16(64,64,64,64,64,64,64,64);
599
1.25k
    __m128i w32_8 = _mm_set_epi32(32,32,32,32);
600
1.25k
    __m128i vbdmin8   = _mm_set1_epi32( clpRng.min() );
601
1.25k
    __m128i vbdmax8   = _mm_set1_epi32( clpRng.max() );
602
1.25k
    __m128i wl8start,wl8start2;
603
1.25k
    CHECK(scale < 0 || scale > 2, "PDPC: scale < 0 || scale > 2");
604
605
1.25k
    if (scale==0)
606
459
    {
607
459
      wl8start = _mm_set_epi16(0,0,0,0,0,2,8,32);
608
459
      wl8start2 = _mm_set_epi16(0,0,0,0,0,0,0,0);
609
459
    }
610
800
    else if (scale==1)
611
800
    {
612
800
      wl8start = _mm_set_epi16(0,0,1,2,4,8,16,32);
613
800
      wl8start2 = _mm_set_epi16(0,0,0,0,0,0,0,0);
614
800
    }
615
0
    else if (scale==2)
616
0
    {
617
0
      wl8start = _mm_set_epi16(4,4,8,8,16,16,32,32);
618
0
      wl8start2 = _mm_set_epi16(0,0,0,0,1,1,2,2);
619
0
    }
620
1.25k
    if (uiDirMode == PLANAR_IDX || uiDirMode == DC_IDX )
621
1.25k
    {
622
1.25k
      __m128i wl8 = wl8start;
623
15.8k
      for (int y = 0; y < iHeight; y++)
624
14.5k
      {
625
14.5k
        int wT = 32 >> std::min(31, ((y << 1) >> scale));
626
627
14.5k
        __m128i wt8 = _mm_set_epi16(wT,wT,wT,wT,wT,wT,wT,wT);
628
 //       __m128i x8left = _mm_broadcastw_epi16(_mm_loadu_si128 ((__m128i const *) (ptrSrc+((y+1)*srcStride))));
629
630
14.5k
        __m128i x8left = _mm_loadu_si128 ((__m128i const *) (ptrSrc+((y+1)*srcStride)));
631
14.5k
        x8left =_mm_shufflelo_epi16(x8left,0);
632
14.5k
        x8left =_mm_shuffle_epi32(x8left,0);
633
634
635
14.5k
        if (wT)
636
6.17k
        {
637
12.3k
          for (int x = 0; x < iWidth; x+=8)
638
6.17k
          {
639
6.17k
            __m128i x8top = _mm_loadu_si128( (__m128i*) ( ptrSrc + x + 1 ) );   // load top
640
6.17k
            __m128i x8dst = _mm_setzero_si128();
641
6.17k
            if( iWidth >= 8 )
642
4.21k
              x8dst = _mm_loadu_si128( (const __m128i*) ( pDst + y * dstStride + x ) );   // load dst
643
1.96k
            else if( iWidth == 4 )
644
1.96k
              x8dst = _mm_loadu_si64( (const __m128i*) ( pDst + y * dstStride + x ) );    // load dst
645
0
            else if( iWidth == 2 )
646
0
              x8dst = _mm_loadu_si32( (const __m128i*) ( pDst + y * dstStride + x ) );    // load dst
647
0
            else
648
0
            {
649
0
              CHECKD( true, "wrong iWidth in IntraPredSampleFilter_SIMD, only implemented for >=8, ==4, ==2" );
650
0
            }
651
652
6.17k
            if (x>8)
653
0
            {
654
0
              tmplo8 = _mm_mullo_epi16(x8top,wt8);    // wT*top
655
0
              tmphi8 = _mm_mulhi_epi16(x8top,wt8);    // wT*top
656
0
              __m128i toplo8 = _mm_unpacklo_epi16(tmplo8,tmphi8);
657
0
              __m128i tophi8 = _mm_unpackhi_epi16(tmplo8,tmphi8);
658
659
660
0
              __m128i wX = _mm_sub_epi16(w64_8,wt8);
661
0
              tmplo8 = _mm_mullo_epi16(x8dst,wX);    // 64-wL-wT*dst
662
0
              tmphi8 = _mm_mulhi_epi16(x8dst,wX);    // 64-wL-wT*dst
663
0
              __m128i dstlo8 = _mm_unpacklo_epi16(tmplo8,tmphi8);
664
0
              __m128i dsthi8 = _mm_unpackhi_epi16(tmplo8,tmphi8);
665
666
0
              dstlo8 = _mm_add_epi32(dstlo8,toplo8);
667
0
              dsthi8 = _mm_add_epi32(dsthi8,tophi8);
668
0
              dstlo8 = _mm_add_epi32(dstlo8,w32_8);
669
0
              dsthi8 = _mm_add_epi32(dsthi8,w32_8);
670
671
0
              dstlo8 =  _mm_srai_epi32(dstlo8,6);
672
0
              dsthi8 =  _mm_srai_epi32(dsthi8,6);
673
674
0
              dstlo8 =  _mm_max_epi32(vbdmin8,dstlo8);
675
0
              dsthi8 =  _mm_max_epi32(vbdmin8,dsthi8);
676
0
              dstlo8 =  _mm_min_epi32(vbdmax8,dstlo8);
677
0
              dsthi8 =  _mm_min_epi32(vbdmax8,dsthi8);
678
679
0
              x8dst = _mm_packs_epi32(dstlo8,dsthi8);
680
0
            }
681
6.17k
            else // x<=8
682
6.17k
            {
683
6.17k
              if (x==0)
684
6.17k
                wl8=wl8start;
685
0
              else if (x==8)
686
0
                wl8=wl8start2;
687
688
6.17k
              tmplo8 = _mm_mullo_epi16(x8left,wl8);  //wL * left
689
6.17k
              tmphi8 = _mm_mulhi_epi16(x8left,wl8);  //wL * left
690
6.17k
              __m128i leftlo8 = _mm_unpacklo_epi16(tmplo8,tmphi8);
691
6.17k
              __m128i lefthi8 = _mm_unpackhi_epi16(tmplo8,tmphi8);
692
693
6.17k
              tmplo8 = _mm_mullo_epi16(x8top,wt8);    // wT*top
694
6.17k
              tmphi8 = _mm_mulhi_epi16(x8top,wt8);    // wT*top
695
6.17k
              __m128i toplo8 = _mm_unpacklo_epi16(tmplo8,tmphi8);
696
6.17k
              __m128i tophi8 = _mm_unpackhi_epi16(tmplo8,tmphi8);
697
698
6.17k
              __m128i wX = _mm_sub_epi16(w64_8,wl8);
699
6.17k
              wX = _mm_sub_epi16(wX,wt8);            // 64-wL-wT
700
6.17k
              tmplo8 = _mm_mullo_epi16(x8dst,wX);    // 64-wL-wT*dst
701
6.17k
              tmphi8 = _mm_mulhi_epi16(x8dst,wX);    // 64-wL-wT*dst
702
6.17k
              __m128i dstlo8 = _mm_unpacklo_epi16(tmplo8,tmphi8);
703
6.17k
              __m128i dsthi8 = _mm_unpackhi_epi16(tmplo8,tmphi8);
704
705
6.17k
              dstlo8 = _mm_add_epi32(dstlo8,toplo8);
706
6.17k
              dsthi8 = _mm_add_epi32(dsthi8,tophi8);
707
6.17k
              dstlo8 = _mm_add_epi32(dstlo8,leftlo8);
708
6.17k
              dsthi8 = _mm_add_epi32(dsthi8,lefthi8);
709
6.17k
              dstlo8 = _mm_add_epi32(dstlo8,w32_8);
710
6.17k
              dsthi8 = _mm_add_epi32(dsthi8,w32_8);
711
712
6.17k
              dstlo8 =  _mm_srai_epi32(dstlo8,6);
713
6.17k
              dsthi8 =  _mm_srai_epi32(dsthi8,6);
714
715
6.17k
              dstlo8 =  _mm_max_epi32(vbdmin8,dstlo8);
716
6.17k
              dsthi8 =  _mm_max_epi32(vbdmin8,dsthi8);
717
6.17k
              dstlo8 =  _mm_min_epi32(vbdmax8,dstlo8);
718
6.17k
              dsthi8 =  _mm_min_epi32(vbdmax8,dsthi8);
719
720
6.17k
              x8dst = _mm_packs_epi32(dstlo8,dsthi8);
721
6.17k
            }
722
723
6.17k
            if( iWidth >= 8 )
724
4.21k
              _mm_storeu_si128( (__m128i*) ( pDst + y * dstStride + x ), x8dst );
725
1.96k
            else if( iWidth == 4 )
726
1.96k
              _mm_storeu_si64( (__m128i*) ( pDst + y * dstStride + x ), x8dst );
727
0
            else if( iWidth == 2 )
728
0
              _mm_storeu_si32( (__m128i*) ( pDst + y * dstStride + x ), x8dst );
729
6.17k
          }
730
6.17k
        }
731
8.39k
        else //wT =0
732
8.39k
        {
733
16.7k
          for( int x = 0; x < std::min( iWidth, 16 ); x += 8 )
734
8.39k
          {
735
8.39k
            if( x == 0 )
736
8.39k
              wl8 = wl8start;
737
0
            else if( x == 8 )
738
0
              wl8 = wl8start2;
739
740
8.39k
            __m128i x8dst;
741
8.39k
            if( iWidth >= 8 )
742
5.22k
              x8dst = _mm_loadu_si128( (const __m128i*)( pDst + y * dstStride + x ) );   // load dst
743
3.17k
            else if( iWidth == 4 )
744
3.17k
              x8dst = _mm_loadu_si64( (const __m128i*)( pDst + y * dstStride + x ) );    // load dst
745
0
            else if( iWidth == 2 )
746
0
              x8dst = _mm_loadu_si32( (const __m128i*)( pDst + y * dstStride + x ) );    // load dst
747
0
            else
748
8.39k
              CHECK( true, "wrong iWidth in IntraPredSampleFilter_SIMD, only implemented for >=8, ==4, ==2" );
749
750
751
8.39k
            tmplo8          = _mm_mullo_epi16( x8left, wl8 );   // wL * left
752
8.39k
            tmphi8          = _mm_mulhi_epi16( x8left, wl8 );   // wL * left
753
8.39k
            __m128i leftlo8 = _mm_unpacklo_epi16( tmplo8, tmphi8 );
754
8.39k
            __m128i lefthi8 = _mm_unpackhi_epi16( tmplo8, tmphi8 );
755
756
8.39k
            __m128i wX     = _mm_sub_epi16( w64_8, wl8 );
757
8.39k
            tmplo8         = _mm_mullo_epi16( x8dst, wX );   // 64-wL-wT*dst
758
8.39k
            tmphi8         = _mm_mulhi_epi16( x8dst, wX );   // 64-wL-wT*dst
759
8.39k
            __m128i dstlo8 = _mm_unpacklo_epi16( tmplo8, tmphi8 );
760
8.39k
            __m128i dsthi8 = _mm_unpackhi_epi16( tmplo8, tmphi8 );
761
762
8.39k
            dstlo8 = _mm_add_epi32( dstlo8, leftlo8 );
763
8.39k
            dsthi8 = _mm_add_epi32( dsthi8, lefthi8 );
764
8.39k
            dstlo8 = _mm_add_epi32( dstlo8, w32_8 );
765
8.39k
            dsthi8 = _mm_add_epi32( dsthi8, w32_8 );
766
767
8.39k
            dstlo8 = _mm_srai_epi32( dstlo8, 6 );
768
8.39k
            dsthi8 = _mm_srai_epi32( dsthi8, 6 );
769
770
8.39k
            dstlo8 = _mm_max_epi32( vbdmin8, dstlo8 );
771
8.39k
            dsthi8 = _mm_max_epi32( vbdmin8, dsthi8 );
772
8.39k
            dstlo8 = _mm_min_epi32( vbdmax8, dstlo8 );
773
8.39k
            dsthi8 = _mm_min_epi32( vbdmax8, dsthi8 );
774
775
8.39k
            dstlo8 = _mm_packs_epi32( dstlo8, dsthi8 );
776
777
8.39k
            if( iWidth >= 8 )
778
5.22k
              _mm_storeu_si128( (__m128i*)( pDst + y * dstStride + x ), dstlo8 );
779
3.17k
            else if( iWidth == 4 )
780
3.17k
              _mm_storeu_si64( (__m128i*)( pDst + y * dstStride + x ), ( dstlo8 ) );
781
0
            else if( iWidth == 2 )
782
0
              _mm_storeu_si32( (__m128i*)( pDst + y * dstStride + x ), dstlo8 );
783
8.39k
          }
784
8.39k
        }
785
14.5k
      }
786
1.25k
    }
787
1.25k
  }
788
789
790
#if USE_AVX2
791
3.97k
  _mm256_zeroupper();
792
3.97k
#endif
793
3.97k
}
Unexecuted instantiation: void vvdec::IntraPredSampleFilter_SIMD<(vvdec::x86_simd::X86_VEXT)1, 8>(short*, long, vvdec::AreaBuf<short>&, unsigned int, vvdec::ClpRngTemplate<short> const&)
Unexecuted instantiation: void vvdec::IntraPredSampleFilter_SIMD<(vvdec::x86_simd::X86_VEXT)1, 16>(short*, long, vvdec::AreaBuf<short>&, unsigned int, vvdec::ClpRngTemplate<short> const&)
void vvdec::IntraPredSampleFilter_SIMD<(vvdec::x86_simd::X86_VEXT)4, 8>(short*, long, vvdec::AreaBuf<short>&, unsigned int, vvdec::ClpRngTemplate<short> const&)
Line
Count
Source
421
1.25k
{
422
1.25k
  const int       iWidth    = piPred.width;
423
1.25k
  const int       iHeight   = piPred.height;
424
1.25k
  PelBuf          dstBuf    = piPred;
425
1.25k
  Pel*            pDst      = dstBuf.buf;
426
1.25k
  const ptrdiff_t dstStride = dstBuf.stride;
427
428
1.25k
  const int scale = ((getLog2(iWidth) - 2 + getLog2(iHeight) - 2 + 2) >> 2);
429
1.25k
  CHECK(scale < 0 || scale > 31, "PDPC: scale < 0 || scale > 2");
430
431
1.25k
#if USE_AVX2
432
1.25k
  if( W > 8 )
433
0
  {
434
0
    __m256i tmplo,tmphi;
435
0
    __m256i w64 = _mm256_set_epi16(64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64);
436
0
    __m256i w32 = _mm256_set_epi32(32,32,32,32,32,32,32,32);
437
0
    __m256i vbdmin   = _mm256_set1_epi32( clpRng.min() );
438
0
    __m256i vbdmax   = _mm256_set1_epi32( clpRng.max() );
439
0
    __m256i wl16;
440
0
    __m256i wl16start;
441
442
0
    if (scale==0)
443
0
    {
444
0
      wl16start = _mm256_set_epi16(0,0,0,0,0,0,0,0,0,0,0,0,0,2,8,32);
445
0
    }
446
0
    else if (scale==1)
447
0
    {
448
0
      wl16start = _mm256_set_epi16(0,0,0,0,0,0,0,0,0,0,1,2,4,8,16,32);
449
0
    }
450
0
    else if (scale==2)
451
0
    {
452
0
      wl16start = _mm256_set_epi16(0,0,0,0,1,1,2,2,4,4,8,8,16,16,32,32);
453
0
    }
454
0
    else
455
0
    {
456
0
      THROW_FATAL( "Wrong scale (" << scale << ")" );
457
0
    }
458
459
460
0
    if (uiDirMode == PLANAR_IDX || uiDirMode == DC_IDX )
461
0
    {
462
0
      for (int y = 0; y < iHeight; y++)
463
0
      {
464
0
        int wT = 32 >> std::min(31, ((y << 1) >> scale));
465
466
0
        __m256i wt16 = _mm256_set_epi16(wT,wT,wT,wT,wT,wT,wT,wT,wT,wT,wT,wT,wT,wT,wT,wT);
467
0
        __m256i x16left = _mm256_broadcastw_epi16(_mm_loadu_si128 ((__m128i const *) (ptrSrc+((y+1)*srcStride))));
468
469
0
        if (wT)
470
0
        {
471
0
          for (int x = 0; x < iWidth; x+=16)
472
0
          {
473
0
            if (x==0)
474
0
            {
475
0
              wl16=wl16start;
476
477
0
              __m256i x16top = _mm256_loadu_si256((__m256i *) (ptrSrc+x+1)); // load top
478
0
              __m256i x16dst = _mm256_loadu_si256((const __m256i *) (pDst+y*dstStride+x)); // load dst
479
480
0
              tmplo = _mm256_mullo_epi16(x16left,wl16);  //wL * left
481
0
              tmphi = _mm256_mulhi_epi16(x16left,wl16);  //wL * left
482
0
              __m256i leftlo = _mm256_unpacklo_epi16(tmplo,tmphi);
483
0
              __m256i lefthi = _mm256_unpackhi_epi16(tmplo,tmphi);
484
485
0
              tmplo = _mm256_mullo_epi16(x16top,wt16);    // wT*top
486
0
              tmphi = _mm256_mulhi_epi16(x16top,wt16);    // wT*top
487
0
              __m256i toplo = _mm256_unpacklo_epi16(tmplo,tmphi);
488
0
              __m256i tophi = _mm256_unpackhi_epi16(tmplo,tmphi);
489
490
0
              __m256i wX = _mm256_sub_epi16(w64,wl16);
491
0
              wX = _mm256_sub_epi16(wX,wt16);            // 64-wL-wT
492
0
              tmplo = _mm256_mullo_epi16(x16dst,wX);    // 64-wL-wT*dst
493
0
              tmphi = _mm256_mulhi_epi16(x16dst,wX);    // 64-wL-wT*dst
494
0
              __m256i dstlo = _mm256_unpacklo_epi16(tmplo,tmphi);
495
0
              __m256i dsthi = _mm256_unpackhi_epi16(tmplo,tmphi);
496
497
0
              dstlo = _mm256_add_epi32(dstlo,toplo);
498
0
              dsthi = _mm256_add_epi32(dsthi,tophi);
499
0
              dstlo = _mm256_add_epi32(dstlo,leftlo);
500
0
              dsthi = _mm256_add_epi32(dsthi,lefthi);
501
0
              dstlo = _mm256_add_epi32(dstlo,w32);
502
0
              dsthi = _mm256_add_epi32(dsthi,w32);
503
504
0
              dstlo =  _mm256_srai_epi32(dstlo,6);
505
0
              dsthi =  _mm256_srai_epi32(dsthi,6);
506
507
0
              dstlo =  _mm256_max_epi32(vbdmin,dstlo);
508
0
              dsthi =  _mm256_max_epi32(vbdmin,dsthi);
509
0
              dstlo =  _mm256_min_epi32(vbdmax,dstlo);
510
0
              dsthi =  _mm256_min_epi32(vbdmax,dsthi);
511
512
0
              dstlo =  _mm256_packs_epi32(dstlo,dsthi);
513
0
              dstlo =  _mm256_permute4x64_epi64 ( dstlo, ( 0 << 0 ) + ( 1 << 2 ) + ( 2 << 4 ) + ( 3 << 6 ) );
514
515
0
              _mm256_storeu_si256( ( __m256i * )(pDst+y*dstStride+x), dstlo );
516
0
            }
517
0
            else
518
0
            {
519
520
0
              __m256i x16top = _mm256_loadu_si256((__m256i *) (ptrSrc+x+1)); // load top
521
0
              __m256i x16dst = _mm256_loadu_si256((const __m256i *) (pDst+y*dstStride+x)); // load dst
522
523
524
0
              tmplo = _mm256_mullo_epi16(x16top,wt16);    // wT*top
525
0
              tmphi = _mm256_mulhi_epi16(x16top,wt16);    // wT*top
526
0
              __m256i toplo = _mm256_unpacklo_epi16(tmplo,tmphi);
527
0
              __m256i tophi = _mm256_unpackhi_epi16(tmplo,tmphi);
528
529
0
              __m256i wX = _mm256_sub_epi16(w64,wt16);
530
0
              tmplo = _mm256_mullo_epi16(x16dst,wX);    // 64-wL-wT*dst
531
0
              tmphi = _mm256_mulhi_epi16(x16dst,wX);    // 64-wL-wT*dst
532
0
              __m256i dstlo = _mm256_unpacklo_epi16(tmplo,tmphi);
533
0
              __m256i dsthi = _mm256_unpackhi_epi16(tmplo,tmphi);
534
535
0
              dstlo = _mm256_add_epi32(dstlo,toplo);
536
0
              dsthi = _mm256_add_epi32(dsthi,tophi);
537
0
              dstlo = _mm256_add_epi32(dstlo,w32);
538
0
              dsthi = _mm256_add_epi32(dsthi,w32);
539
540
0
              dstlo =  _mm256_srai_epi32(dstlo,6);
541
0
              dsthi =  _mm256_srai_epi32(dsthi,6);
542
543
0
              dstlo =  _mm256_max_epi32(vbdmin,dstlo);
544
0
              dsthi =  _mm256_max_epi32(vbdmin,dsthi);
545
0
              dstlo =  _mm256_min_epi32(vbdmax,dstlo);
546
0
              dsthi =  _mm256_min_epi32(vbdmax,dsthi);
547
548
0
              dstlo =  _mm256_packs_epi32(dstlo,dsthi);
549
0
              dstlo =  _mm256_permute4x64_epi64 ( dstlo, ( 0 << 0 ) + ( 1 << 2 ) + ( 2 << 4 ) + ( 3 << 6 ) );
550
551
0
              _mm256_storeu_si256( ( __m256i * )(pDst+y*dstStride+x), dstlo );
552
0
            }
553
554
0
          }  // for
555
0
        }
556
0
        else
557
0
        { // wT =0
558
559
0
          wl16=wl16start;
560
0
          __m256i x16dst = _mm256_loadu_si256((const __m256i *) (pDst+y*dstStride)); // load dst
561
562
0
          tmplo = _mm256_mullo_epi16(x16left,wl16);  //wL * left
563
0
          tmphi = _mm256_mulhi_epi16(x16left,wl16);  //wL * left
564
0
          __m256i leftlo = _mm256_unpacklo_epi16(tmplo,tmphi);
565
0
          __m256i lefthi = _mm256_unpackhi_epi16(tmplo,tmphi);
566
567
0
          __m256i wX = _mm256_sub_epi16(w64,wl16);
568
0
          tmplo = _mm256_mullo_epi16(x16dst,wX);    // 64-wL-wT*dst
569
0
          tmphi = _mm256_mulhi_epi16(x16dst,wX);    // 64-wL-wT*dst
570
0
          __m256i dstlo = _mm256_unpacklo_epi16(tmplo,tmphi);
571
0
          __m256i dsthi = _mm256_unpackhi_epi16(tmplo,tmphi);
572
573
0
          dstlo = _mm256_add_epi32(dstlo,leftlo);
574
0
          dsthi = _mm256_add_epi32(dsthi,lefthi);
575
0
          dstlo = _mm256_add_epi32(dstlo,w32);
576
0
          dsthi = _mm256_add_epi32(dsthi,w32);
577
578
0
          dstlo =  _mm256_srai_epi32(dstlo,6);
579
0
          dsthi =  _mm256_srai_epi32(dsthi,6);
580
581
0
          dstlo =  _mm256_max_epi32(vbdmin,dstlo);
582
0
          dsthi =  _mm256_max_epi32(vbdmin,dsthi);
583
0
          dstlo =  _mm256_min_epi32(vbdmax,dstlo);
584
0
          dsthi =  _mm256_min_epi32(vbdmax,dsthi);
585
586
0
          dstlo =  _mm256_packs_epi32(dstlo,dsthi);
587
0
          dstlo =  _mm256_permute4x64_epi64 ( dstlo, ( 0 << 0 ) + ( 1 << 2 ) + ( 2 << 4 ) + ( 3 << 6 ) );
588
589
0
          _mm256_storeu_si256( ( __m256i * )(pDst+y*dstStride), dstlo );
590
0
        }
591
0
      }
592
0
    }
593
0
  }
594
1.25k
  else
595
1.25k
#endif
596
1.25k
  {
597
1.25k
    __m128i tmplo8,tmphi8;
598
1.25k
    __m128i w64_8 = _mm_set_epi16(64,64,64,64,64,64,64,64);
599
1.25k
    __m128i w32_8 = _mm_set_epi32(32,32,32,32);
600
1.25k
    __m128i vbdmin8   = _mm_set1_epi32( clpRng.min() );
601
1.25k
    __m128i vbdmax8   = _mm_set1_epi32( clpRng.max() );
602
1.25k
    __m128i wl8start,wl8start2;
603
1.25k
    CHECK(scale < 0 || scale > 2, "PDPC: scale < 0 || scale > 2");
604
605
1.25k
    if (scale==0)
606
459
    {
607
459
      wl8start = _mm_set_epi16(0,0,0,0,0,2,8,32);
608
459
      wl8start2 = _mm_set_epi16(0,0,0,0,0,0,0,0);
609
459
    }
610
800
    else if (scale==1)
611
800
    {
612
800
      wl8start = _mm_set_epi16(0,0,1,2,4,8,16,32);
613
800
      wl8start2 = _mm_set_epi16(0,0,0,0,0,0,0,0);
614
800
    }
615
0
    else if (scale==2)
616
0
    {
617
0
      wl8start = _mm_set_epi16(4,4,8,8,16,16,32,32);
618
0
      wl8start2 = _mm_set_epi16(0,0,0,0,1,1,2,2);
619
0
    }
620
1.25k
    if (uiDirMode == PLANAR_IDX || uiDirMode == DC_IDX )
621
1.25k
    {
622
1.25k
      __m128i wl8 = wl8start;
623
15.8k
      for (int y = 0; y < iHeight; y++)
624
14.5k
      {
625
14.5k
        int wT = 32 >> std::min(31, ((y << 1) >> scale));
626
627
14.5k
        __m128i wt8 = _mm_set_epi16(wT,wT,wT,wT,wT,wT,wT,wT);
628
 //       __m128i x8left = _mm_broadcastw_epi16(_mm_loadu_si128 ((__m128i const *) (ptrSrc+((y+1)*srcStride))));
629
630
14.5k
        __m128i x8left = _mm_loadu_si128 ((__m128i const *) (ptrSrc+((y+1)*srcStride)));
631
14.5k
        x8left =_mm_shufflelo_epi16(x8left,0);
632
14.5k
        x8left =_mm_shuffle_epi32(x8left,0);
633
634
635
14.5k
        if (wT)
636
6.17k
        {
637
12.3k
          for (int x = 0; x < iWidth; x+=8)
638
6.17k
          {
639
6.17k
            __m128i x8top = _mm_loadu_si128( (__m128i*) ( ptrSrc + x + 1 ) );   // load top
640
6.17k
            __m128i x8dst = _mm_setzero_si128();
641
6.17k
            if( iWidth >= 8 )
642
4.21k
              x8dst = _mm_loadu_si128( (const __m128i*) ( pDst + y * dstStride + x ) );   // load dst
643
1.96k
            else if( iWidth == 4 )
644
1.96k
              x8dst = _mm_loadu_si64( (const __m128i*) ( pDst + y * dstStride + x ) );    // load dst
645
0
            else if( iWidth == 2 )
646
0
              x8dst = _mm_loadu_si32( (const __m128i*) ( pDst + y * dstStride + x ) );    // load dst
647
0
            else
648
0
            {
649
0
              CHECKD( true, "wrong iWidth in IntraPredSampleFilter_SIMD, only implemented for >=8, ==4, ==2" );
650
0
            }
651
652
6.17k
            if (x>8)
653
0
            {
654
0
              tmplo8 = _mm_mullo_epi16(x8top,wt8);    // wT*top
655
0
              tmphi8 = _mm_mulhi_epi16(x8top,wt8);    // wT*top
656
0
              __m128i toplo8 = _mm_unpacklo_epi16(tmplo8,tmphi8);
657
0
              __m128i tophi8 = _mm_unpackhi_epi16(tmplo8,tmphi8);
658
659
660
0
              __m128i wX = _mm_sub_epi16(w64_8,wt8);
661
0
              tmplo8 = _mm_mullo_epi16(x8dst,wX);    // 64-wL-wT*dst
662
0
              tmphi8 = _mm_mulhi_epi16(x8dst,wX);    // 64-wL-wT*dst
663
0
              __m128i dstlo8 = _mm_unpacklo_epi16(tmplo8,tmphi8);
664
0
              __m128i dsthi8 = _mm_unpackhi_epi16(tmplo8,tmphi8);
665
666
0
              dstlo8 = _mm_add_epi32(dstlo8,toplo8);
667
0
              dsthi8 = _mm_add_epi32(dsthi8,tophi8);
668
0
              dstlo8 = _mm_add_epi32(dstlo8,w32_8);
669
0
              dsthi8 = _mm_add_epi32(dsthi8,w32_8);
670
671
0
              dstlo8 =  _mm_srai_epi32(dstlo8,6);
672
0
              dsthi8 =  _mm_srai_epi32(dsthi8,6);
673
674
0
              dstlo8 =  _mm_max_epi32(vbdmin8,dstlo8);
675
0
              dsthi8 =  _mm_max_epi32(vbdmin8,dsthi8);
676
0
              dstlo8 =  _mm_min_epi32(vbdmax8,dstlo8);
677
0
              dsthi8 =  _mm_min_epi32(vbdmax8,dsthi8);
678
679
0
              x8dst = _mm_packs_epi32(dstlo8,dsthi8);
680
0
            }
681
6.17k
            else // x<=8
682
6.17k
            {
683
6.17k
              if (x==0)
684
6.17k
                wl8=wl8start;
685
0
              else if (x==8)
686
0
                wl8=wl8start2;
687
688
6.17k
              tmplo8 = _mm_mullo_epi16(x8left,wl8);  //wL * left
689
6.17k
              tmphi8 = _mm_mulhi_epi16(x8left,wl8);  //wL * left
690
6.17k
              __m128i leftlo8 = _mm_unpacklo_epi16(tmplo8,tmphi8);
691
6.17k
              __m128i lefthi8 = _mm_unpackhi_epi16(tmplo8,tmphi8);
692
693
6.17k
              tmplo8 = _mm_mullo_epi16(x8top,wt8);    // wT*top
694
6.17k
              tmphi8 = _mm_mulhi_epi16(x8top,wt8);    // wT*top
695
6.17k
              __m128i toplo8 = _mm_unpacklo_epi16(tmplo8,tmphi8);
696
6.17k
              __m128i tophi8 = _mm_unpackhi_epi16(tmplo8,tmphi8);
697
698
6.17k
              __m128i wX = _mm_sub_epi16(w64_8,wl8);
699
6.17k
              wX = _mm_sub_epi16(wX,wt8);            // 64-wL-wT
700
6.17k
              tmplo8 = _mm_mullo_epi16(x8dst,wX);    // 64-wL-wT*dst
701
6.17k
              tmphi8 = _mm_mulhi_epi16(x8dst,wX);    // 64-wL-wT*dst
702
6.17k
              __m128i dstlo8 = _mm_unpacklo_epi16(tmplo8,tmphi8);
703
6.17k
              __m128i dsthi8 = _mm_unpackhi_epi16(tmplo8,tmphi8);
704
705
6.17k
              dstlo8 = _mm_add_epi32(dstlo8,toplo8);
706
6.17k
              dsthi8 = _mm_add_epi32(dsthi8,tophi8);
707
6.17k
              dstlo8 = _mm_add_epi32(dstlo8,leftlo8);
708
6.17k
              dsthi8 = _mm_add_epi32(dsthi8,lefthi8);
709
6.17k
              dstlo8 = _mm_add_epi32(dstlo8,w32_8);
710
6.17k
              dsthi8 = _mm_add_epi32(dsthi8,w32_8);
711
712
6.17k
              dstlo8 =  _mm_srai_epi32(dstlo8,6);
713
6.17k
              dsthi8 =  _mm_srai_epi32(dsthi8,6);
714
715
6.17k
              dstlo8 =  _mm_max_epi32(vbdmin8,dstlo8);
716
6.17k
              dsthi8 =  _mm_max_epi32(vbdmin8,dsthi8);
717
6.17k
              dstlo8 =  _mm_min_epi32(vbdmax8,dstlo8);
718
6.17k
              dsthi8 =  _mm_min_epi32(vbdmax8,dsthi8);
719
720
6.17k
              x8dst = _mm_packs_epi32(dstlo8,dsthi8);
721
6.17k
            }
722
723
6.17k
            if( iWidth >= 8 )
724
4.21k
              _mm_storeu_si128( (__m128i*) ( pDst + y * dstStride + x ), x8dst );
725
1.96k
            else if( iWidth == 4 )
726
1.96k
              _mm_storeu_si64( (__m128i*) ( pDst + y * dstStride + x ), x8dst );
727
0
            else if( iWidth == 2 )
728
0
              _mm_storeu_si32( (__m128i*) ( pDst + y * dstStride + x ), x8dst );
729
6.17k
          }
730
6.17k
        }
731
8.39k
        else //wT =0
732
8.39k
        {
733
16.7k
          for( int x = 0; x < std::min( iWidth, 16 ); x += 8 )
734
8.39k
          {
735
8.39k
            if( x == 0 )
736
8.39k
              wl8 = wl8start;
737
0
            else if( x == 8 )
738
0
              wl8 = wl8start2;
739
740
8.39k
            __m128i x8dst;
741
8.39k
            if( iWidth >= 8 )
742
5.22k
              x8dst = _mm_loadu_si128( (const __m128i*)( pDst + y * dstStride + x ) );   // load dst
743
3.17k
            else if( iWidth == 4 )
744
3.17k
              x8dst = _mm_loadu_si64( (const __m128i*)( pDst + y * dstStride + x ) );    // load dst
745
0
            else if( iWidth == 2 )
746
0
              x8dst = _mm_loadu_si32( (const __m128i*)( pDst + y * dstStride + x ) );    // load dst
747
0
            else
748
8.39k
              CHECK( true, "wrong iWidth in IntraPredSampleFilter_SIMD, only implemented for >=8, ==4, ==2" );
749
750
751
8.39k
            tmplo8          = _mm_mullo_epi16( x8left, wl8 );   // wL * left
752
8.39k
            tmphi8          = _mm_mulhi_epi16( x8left, wl8 );   // wL * left
753
8.39k
            __m128i leftlo8 = _mm_unpacklo_epi16( tmplo8, tmphi8 );
754
8.39k
            __m128i lefthi8 = _mm_unpackhi_epi16( tmplo8, tmphi8 );
755
756
8.39k
            __m128i wX     = _mm_sub_epi16( w64_8, wl8 );
757
8.39k
            tmplo8         = _mm_mullo_epi16( x8dst, wX );   // 64-wL-wT*dst
758
8.39k
            tmphi8         = _mm_mulhi_epi16( x8dst, wX );   // 64-wL-wT*dst
759
8.39k
            __m128i dstlo8 = _mm_unpacklo_epi16( tmplo8, tmphi8 );
760
8.39k
            __m128i dsthi8 = _mm_unpackhi_epi16( tmplo8, tmphi8 );
761
762
8.39k
            dstlo8 = _mm_add_epi32( dstlo8, leftlo8 );
763
8.39k
            dsthi8 = _mm_add_epi32( dsthi8, lefthi8 );
764
8.39k
            dstlo8 = _mm_add_epi32( dstlo8, w32_8 );
765
8.39k
            dsthi8 = _mm_add_epi32( dsthi8, w32_8 );
766
767
8.39k
            dstlo8 = _mm_srai_epi32( dstlo8, 6 );
768
8.39k
            dsthi8 = _mm_srai_epi32( dsthi8, 6 );
769
770
8.39k
            dstlo8 = _mm_max_epi32( vbdmin8, dstlo8 );
771
8.39k
            dsthi8 = _mm_max_epi32( vbdmin8, dsthi8 );
772
8.39k
            dstlo8 = _mm_min_epi32( vbdmax8, dstlo8 );
773
8.39k
            dsthi8 = _mm_min_epi32( vbdmax8, dsthi8 );
774
775
8.39k
            dstlo8 = _mm_packs_epi32( dstlo8, dsthi8 );
776
777
8.39k
            if( iWidth >= 8 )
778
5.22k
              _mm_storeu_si128( (__m128i*)( pDst + y * dstStride + x ), dstlo8 );
779
3.17k
            else if( iWidth == 4 )
780
3.17k
              _mm_storeu_si64( (__m128i*)( pDst + y * dstStride + x ), ( dstlo8 ) );
781
0
            else if( iWidth == 2 )
782
0
              _mm_storeu_si32( (__m128i*)( pDst + y * dstStride + x ), dstlo8 );
783
8.39k
          }
784
8.39k
        }
785
14.5k
      }
786
1.25k
    }
787
1.25k
  }
788
789
790
1.25k
#if USE_AVX2
791
1.25k
  _mm256_zeroupper();
792
1.25k
#endif
793
1.25k
}
void vvdec::IntraPredSampleFilter_SIMD<(vvdec::x86_simd::X86_VEXT)4, 16>(short*, long, vvdec::AreaBuf<short>&, unsigned int, vvdec::ClpRngTemplate<short> const&)
Line
Count
Source
421
2.71k
{
422
2.71k
  const int       iWidth    = piPred.width;
423
2.71k
  const int       iHeight   = piPred.height;
424
2.71k
  PelBuf          dstBuf    = piPred;
425
2.71k
  Pel*            pDst      = dstBuf.buf;
426
2.71k
  const ptrdiff_t dstStride = dstBuf.stride;
427
428
2.71k
  const int scale = ((getLog2(iWidth) - 2 + getLog2(iHeight) - 2 + 2) >> 2);
429
2.71k
  CHECK(scale < 0 || scale > 31, "PDPC: scale < 0 || scale > 2");
430
431
2.71k
#if USE_AVX2
432
2.71k
  if( W > 8 )
433
2.71k
  {
434
2.71k
    __m256i tmplo,tmphi;
435
2.71k
    __m256i w64 = _mm256_set_epi16(64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64);
436
2.71k
    __m256i w32 = _mm256_set_epi32(32,32,32,32,32,32,32,32);
437
2.71k
    __m256i vbdmin   = _mm256_set1_epi32( clpRng.min() );
438
2.71k
    __m256i vbdmax   = _mm256_set1_epi32( clpRng.max() );
439
2.71k
    __m256i wl16;
440
2.71k
    __m256i wl16start;
441
442
2.71k
    if (scale==0)
443
0
    {
444
0
      wl16start = _mm256_set_epi16(0,0,0,0,0,0,0,0,0,0,0,0,0,2,8,32);
445
0
    }
446
2.71k
    else if (scale==1)
447
1.70k
    {
448
1.70k
      wl16start = _mm256_set_epi16(0,0,0,0,0,0,0,0,0,0,1,2,4,8,16,32);
449
1.70k
    }
450
1.00k
    else if (scale==2)
451
1.00k
    {
452
1.00k
      wl16start = _mm256_set_epi16(0,0,0,0,1,1,2,2,4,4,8,8,16,16,32,32);
453
1.00k
    }
454
0
    else
455
0
    {
456
0
      THROW_FATAL( "Wrong scale (" << scale << ")" );
457
0
    }
458
459
460
2.71k
    if (uiDirMode == PLANAR_IDX || uiDirMode == DC_IDX )
461
2.71k
    {
462
67.1k
      for (int y = 0; y < iHeight; y++)
463
64.4k
      {
464
64.4k
        int wT = 32 >> std::min(31, ((y << 1) >> scale));
465
466
64.4k
        __m256i wt16 = _mm256_set_epi16(wT,wT,wT,wT,wT,wT,wT,wT,wT,wT,wT,wT,wT,wT,wT,wT);
467
64.4k
        __m256i x16left = _mm256_broadcastw_epi16(_mm_loadu_si128 ((__m128i const *) (ptrSrc+((y+1)*srcStride))));
468
469
64.4k
        if (wT)
470
21.8k
        {
471
65.8k
          for (int x = 0; x < iWidth; x+=16)
472
43.9k
          {
473
43.9k
            if (x==0)
474
21.8k
            {
475
21.8k
              wl16=wl16start;
476
477
21.8k
              __m256i x16top = _mm256_loadu_si256((__m256i *) (ptrSrc+x+1)); // load top
478
21.8k
              __m256i x16dst = _mm256_loadu_si256((const __m256i *) (pDst+y*dstStride+x)); // load dst
479
480
21.8k
              tmplo = _mm256_mullo_epi16(x16left,wl16);  //wL * left
481
21.8k
              tmphi = _mm256_mulhi_epi16(x16left,wl16);  //wL * left
482
21.8k
              __m256i leftlo = _mm256_unpacklo_epi16(tmplo,tmphi);
483
21.8k
              __m256i lefthi = _mm256_unpackhi_epi16(tmplo,tmphi);
484
485
21.8k
              tmplo = _mm256_mullo_epi16(x16top,wt16);    // wT*top
486
21.8k
              tmphi = _mm256_mulhi_epi16(x16top,wt16);    // wT*top
487
21.8k
              __m256i toplo = _mm256_unpacklo_epi16(tmplo,tmphi);
488
21.8k
              __m256i tophi = _mm256_unpackhi_epi16(tmplo,tmphi);
489
490
21.8k
              __m256i wX = _mm256_sub_epi16(w64,wl16);
491
21.8k
              wX = _mm256_sub_epi16(wX,wt16);            // 64-wL-wT
492
21.8k
              tmplo = _mm256_mullo_epi16(x16dst,wX);    // 64-wL-wT*dst
493
21.8k
              tmphi = _mm256_mulhi_epi16(x16dst,wX);    // 64-wL-wT*dst
494
21.8k
              __m256i dstlo = _mm256_unpacklo_epi16(tmplo,tmphi);
495
21.8k
              __m256i dsthi = _mm256_unpackhi_epi16(tmplo,tmphi);
496
497
21.8k
              dstlo = _mm256_add_epi32(dstlo,toplo);
498
21.8k
              dsthi = _mm256_add_epi32(dsthi,tophi);
499
21.8k
              dstlo = _mm256_add_epi32(dstlo,leftlo);
500
21.8k
              dsthi = _mm256_add_epi32(dsthi,lefthi);
501
21.8k
              dstlo = _mm256_add_epi32(dstlo,w32);
502
21.8k
              dsthi = _mm256_add_epi32(dsthi,w32);
503
504
21.8k
              dstlo =  _mm256_srai_epi32(dstlo,6);
505
21.8k
              dsthi =  _mm256_srai_epi32(dsthi,6);
506
507
21.8k
              dstlo =  _mm256_max_epi32(vbdmin,dstlo);
508
21.8k
              dsthi =  _mm256_max_epi32(vbdmin,dsthi);
509
21.8k
              dstlo =  _mm256_min_epi32(vbdmax,dstlo);
510
21.8k
              dsthi =  _mm256_min_epi32(vbdmax,dsthi);
511
512
21.8k
              dstlo =  _mm256_packs_epi32(dstlo,dsthi);
513
21.8k
              dstlo =  _mm256_permute4x64_epi64 ( dstlo, ( 0 << 0 ) + ( 1 << 2 ) + ( 2 << 4 ) + ( 3 << 6 ) );
514
515
21.8k
              _mm256_storeu_si256( ( __m256i * )(pDst+y*dstStride+x), dstlo );
516
21.8k
            }
517
22.1k
            else
518
22.1k
            {
519
520
22.1k
              __m256i x16top = _mm256_loadu_si256((__m256i *) (ptrSrc+x+1)); // load top
521
22.1k
              __m256i x16dst = _mm256_loadu_si256((const __m256i *) (pDst+y*dstStride+x)); // load dst
522
523
524
22.1k
              tmplo = _mm256_mullo_epi16(x16top,wt16);    // wT*top
525
22.1k
              tmphi = _mm256_mulhi_epi16(x16top,wt16);    // wT*top
526
22.1k
              __m256i toplo = _mm256_unpacklo_epi16(tmplo,tmphi);
527
22.1k
              __m256i tophi = _mm256_unpackhi_epi16(tmplo,tmphi);
528
529
22.1k
              __m256i wX = _mm256_sub_epi16(w64,wt16);
530
22.1k
              tmplo = _mm256_mullo_epi16(x16dst,wX);    // 64-wL-wT*dst
531
22.1k
              tmphi = _mm256_mulhi_epi16(x16dst,wX);    // 64-wL-wT*dst
532
22.1k
              __m256i dstlo = _mm256_unpacklo_epi16(tmplo,tmphi);
533
22.1k
              __m256i dsthi = _mm256_unpackhi_epi16(tmplo,tmphi);
534
535
22.1k
              dstlo = _mm256_add_epi32(dstlo,toplo);
536
22.1k
              dsthi = _mm256_add_epi32(dsthi,tophi);
537
22.1k
              dstlo = _mm256_add_epi32(dstlo,w32);
538
22.1k
              dsthi = _mm256_add_epi32(dsthi,w32);
539
540
22.1k
              dstlo =  _mm256_srai_epi32(dstlo,6);
541
22.1k
              dsthi =  _mm256_srai_epi32(dsthi,6);
542
543
22.1k
              dstlo =  _mm256_max_epi32(vbdmin,dstlo);
544
22.1k
              dsthi =  _mm256_max_epi32(vbdmin,dsthi);
545
22.1k
              dstlo =  _mm256_min_epi32(vbdmax,dstlo);
546
22.1k
              dsthi =  _mm256_min_epi32(vbdmax,dsthi);
547
548
22.1k
              dstlo =  _mm256_packs_epi32(dstlo,dsthi);
549
22.1k
              dstlo =  _mm256_permute4x64_epi64 ( dstlo, ( 0 << 0 ) + ( 1 << 2 ) + ( 2 << 4 ) + ( 3 << 6 ) );
550
551
22.1k
              _mm256_storeu_si256( ( __m256i * )(pDst+y*dstStride+x), dstlo );
552
22.1k
            }
553
554
43.9k
          }  // for
555
21.8k
        }
556
42.5k
        else
557
42.5k
        { // wT =0
558
559
42.5k
          wl16=wl16start;
560
42.5k
          __m256i x16dst = _mm256_loadu_si256((const __m256i *) (pDst+y*dstStride)); // load dst
561
562
42.5k
          tmplo = _mm256_mullo_epi16(x16left,wl16);  //wL * left
563
42.5k
          tmphi = _mm256_mulhi_epi16(x16left,wl16);  //wL * left
564
42.5k
          __m256i leftlo = _mm256_unpacklo_epi16(tmplo,tmphi);
565
42.5k
          __m256i lefthi = _mm256_unpackhi_epi16(tmplo,tmphi);
566
567
42.5k
          __m256i wX = _mm256_sub_epi16(w64,wl16);
568
42.5k
          tmplo = _mm256_mullo_epi16(x16dst,wX);    // 64-wL-wT*dst
569
42.5k
          tmphi = _mm256_mulhi_epi16(x16dst,wX);    // 64-wL-wT*dst
570
42.5k
          __m256i dstlo = _mm256_unpacklo_epi16(tmplo,tmphi);
571
42.5k
          __m256i dsthi = _mm256_unpackhi_epi16(tmplo,tmphi);
572
573
42.5k
          dstlo = _mm256_add_epi32(dstlo,leftlo);
574
42.5k
          dsthi = _mm256_add_epi32(dsthi,lefthi);
575
42.5k
          dstlo = _mm256_add_epi32(dstlo,w32);
576
42.5k
          dsthi = _mm256_add_epi32(dsthi,w32);
577
578
42.5k
          dstlo =  _mm256_srai_epi32(dstlo,6);
579
42.5k
          dsthi =  _mm256_srai_epi32(dsthi,6);
580
581
42.5k
          dstlo =  _mm256_max_epi32(vbdmin,dstlo);
582
42.5k
          dsthi =  _mm256_max_epi32(vbdmin,dsthi);
583
42.5k
          dstlo =  _mm256_min_epi32(vbdmax,dstlo);
584
42.5k
          dsthi =  _mm256_min_epi32(vbdmax,dsthi);
585
586
42.5k
          dstlo =  _mm256_packs_epi32(dstlo,dsthi);
587
42.5k
          dstlo =  _mm256_permute4x64_epi64 ( dstlo, ( 0 << 0 ) + ( 1 << 2 ) + ( 2 << 4 ) + ( 3 << 6 ) );
588
589
42.5k
          _mm256_storeu_si256( ( __m256i * )(pDst+y*dstStride), dstlo );
590
42.5k
        }
591
64.4k
      }
592
2.71k
    }
593
2.71k
  }
594
0
  else
595
0
#endif
596
0
  {
597
0
    __m128i tmplo8,tmphi8;
598
0
    __m128i w64_8 = _mm_set_epi16(64,64,64,64,64,64,64,64);
599
0
    __m128i w32_8 = _mm_set_epi32(32,32,32,32);
600
0
    __m128i vbdmin8   = _mm_set1_epi32( clpRng.min() );
601
0
    __m128i vbdmax8   = _mm_set1_epi32( clpRng.max() );
602
0
    __m128i wl8start,wl8start2;
603
0
    CHECK(scale < 0 || scale > 2, "PDPC: scale < 0 || scale > 2");
604
605
0
    if (scale==0)
606
0
    {
607
0
      wl8start = _mm_set_epi16(0,0,0,0,0,2,8,32);
608
0
      wl8start2 = _mm_set_epi16(0,0,0,0,0,0,0,0);
609
0
    }
610
0
    else if (scale==1)
611
0
    {
612
0
      wl8start = _mm_set_epi16(0,0,1,2,4,8,16,32);
613
0
      wl8start2 = _mm_set_epi16(0,0,0,0,0,0,0,0);
614
0
    }
615
0
    else if (scale==2)
616
0
    {
617
0
      wl8start = _mm_set_epi16(4,4,8,8,16,16,32,32);
618
0
      wl8start2 = _mm_set_epi16(0,0,0,0,1,1,2,2);
619
0
    }
620
0
    if (uiDirMode == PLANAR_IDX || uiDirMode == DC_IDX )
621
0
    {
622
0
      __m128i wl8 = wl8start;
623
0
      for (int y = 0; y < iHeight; y++)
624
0
      {
625
0
        int wT = 32 >> std::min(31, ((y << 1) >> scale));
626
627
0
        __m128i wt8 = _mm_set_epi16(wT,wT,wT,wT,wT,wT,wT,wT);
628
 //       __m128i x8left = _mm_broadcastw_epi16(_mm_loadu_si128 ((__m128i const *) (ptrSrc+((y+1)*srcStride))));
629
630
0
        __m128i x8left = _mm_loadu_si128 ((__m128i const *) (ptrSrc+((y+1)*srcStride)));
631
0
        x8left =_mm_shufflelo_epi16(x8left,0);
632
0
        x8left =_mm_shuffle_epi32(x8left,0);
633
634
635
0
        if (wT)
636
0
        {
637
0
          for (int x = 0; x < iWidth; x+=8)
638
0
          {
639
0
            __m128i x8top = _mm_loadu_si128( (__m128i*) ( ptrSrc + x + 1 ) );   // load top
640
0
            __m128i x8dst = _mm_setzero_si128();
641
0
            if( iWidth >= 8 )
642
0
              x8dst = _mm_loadu_si128( (const __m128i*) ( pDst + y * dstStride + x ) );   // load dst
643
0
            else if( iWidth == 4 )
644
0
              x8dst = _mm_loadu_si64( (const __m128i*) ( pDst + y * dstStride + x ) );    // load dst
645
0
            else if( iWidth == 2 )
646
0
              x8dst = _mm_loadu_si32( (const __m128i*) ( pDst + y * dstStride + x ) );    // load dst
647
0
            else
648
0
            {
649
0
              CHECKD( true, "wrong iWidth in IntraPredSampleFilter_SIMD, only implemented for >=8, ==4, ==2" );
650
0
            }
651
652
0
            if (x>8)
653
0
            {
654
0
              tmplo8 = _mm_mullo_epi16(x8top,wt8);    // wT*top
655
0
              tmphi8 = _mm_mulhi_epi16(x8top,wt8);    // wT*top
656
0
              __m128i toplo8 = _mm_unpacklo_epi16(tmplo8,tmphi8);
657
0
              __m128i tophi8 = _mm_unpackhi_epi16(tmplo8,tmphi8);
658
659
660
0
              __m128i wX = _mm_sub_epi16(w64_8,wt8);
661
0
              tmplo8 = _mm_mullo_epi16(x8dst,wX);    // 64-wL-wT*dst
662
0
              tmphi8 = _mm_mulhi_epi16(x8dst,wX);    // 64-wL-wT*dst
663
0
              __m128i dstlo8 = _mm_unpacklo_epi16(tmplo8,tmphi8);
664
0
              __m128i dsthi8 = _mm_unpackhi_epi16(tmplo8,tmphi8);
665
666
0
              dstlo8 = _mm_add_epi32(dstlo8,toplo8);
667
0
              dsthi8 = _mm_add_epi32(dsthi8,tophi8);
668
0
              dstlo8 = _mm_add_epi32(dstlo8,w32_8);
669
0
              dsthi8 = _mm_add_epi32(dsthi8,w32_8);
670
671
0
              dstlo8 =  _mm_srai_epi32(dstlo8,6);
672
0
              dsthi8 =  _mm_srai_epi32(dsthi8,6);
673
674
0
              dstlo8 =  _mm_max_epi32(vbdmin8,dstlo8);
675
0
              dsthi8 =  _mm_max_epi32(vbdmin8,dsthi8);
676
0
              dstlo8 =  _mm_min_epi32(vbdmax8,dstlo8);
677
0
              dsthi8 =  _mm_min_epi32(vbdmax8,dsthi8);
678
679
0
              x8dst = _mm_packs_epi32(dstlo8,dsthi8);
680
0
            }
681
0
            else // x<=8
682
0
            {
683
0
              if (x==0)
684
0
                wl8=wl8start;
685
0
              else if (x==8)
686
0
                wl8=wl8start2;
687
688
0
              tmplo8 = _mm_mullo_epi16(x8left,wl8);  //wL * left
689
0
              tmphi8 = _mm_mulhi_epi16(x8left,wl8);  //wL * left
690
0
              __m128i leftlo8 = _mm_unpacklo_epi16(tmplo8,tmphi8);
691
0
              __m128i lefthi8 = _mm_unpackhi_epi16(tmplo8,tmphi8);
692
693
0
              tmplo8 = _mm_mullo_epi16(x8top,wt8);    // wT*top
694
0
              tmphi8 = _mm_mulhi_epi16(x8top,wt8);    // wT*top
695
0
              __m128i toplo8 = _mm_unpacklo_epi16(tmplo8,tmphi8);
696
0
              __m128i tophi8 = _mm_unpackhi_epi16(tmplo8,tmphi8);
697
698
0
              __m128i wX = _mm_sub_epi16(w64_8,wl8);
699
0
              wX = _mm_sub_epi16(wX,wt8);            // 64-wL-wT
700
0
              tmplo8 = _mm_mullo_epi16(x8dst,wX);    // 64-wL-wT*dst
701
0
              tmphi8 = _mm_mulhi_epi16(x8dst,wX);    // 64-wL-wT*dst
702
0
              __m128i dstlo8 = _mm_unpacklo_epi16(tmplo8,tmphi8);
703
0
              __m128i dsthi8 = _mm_unpackhi_epi16(tmplo8,tmphi8);
704
705
0
              dstlo8 = _mm_add_epi32(dstlo8,toplo8);
706
0
              dsthi8 = _mm_add_epi32(dsthi8,tophi8);
707
0
              dstlo8 = _mm_add_epi32(dstlo8,leftlo8);
708
0
              dsthi8 = _mm_add_epi32(dsthi8,lefthi8);
709
0
              dstlo8 = _mm_add_epi32(dstlo8,w32_8);
710
0
              dsthi8 = _mm_add_epi32(dsthi8,w32_8);
711
712
0
              dstlo8 =  _mm_srai_epi32(dstlo8,6);
713
0
              dsthi8 =  _mm_srai_epi32(dsthi8,6);
714
715
0
              dstlo8 =  _mm_max_epi32(vbdmin8,dstlo8);
716
0
              dsthi8 =  _mm_max_epi32(vbdmin8,dsthi8);
717
0
              dstlo8 =  _mm_min_epi32(vbdmax8,dstlo8);
718
0
              dsthi8 =  _mm_min_epi32(vbdmax8,dsthi8);
719
720
0
              x8dst = _mm_packs_epi32(dstlo8,dsthi8);
721
0
            }
722
723
0
            if( iWidth >= 8 )
724
0
              _mm_storeu_si128( (__m128i*) ( pDst + y * dstStride + x ), x8dst );
725
0
            else if( iWidth == 4 )
726
0
              _mm_storeu_si64( (__m128i*) ( pDst + y * dstStride + x ), x8dst );
727
0
            else if( iWidth == 2 )
728
0
              _mm_storeu_si32( (__m128i*) ( pDst + y * dstStride + x ), x8dst );
729
0
          }
730
0
        }
731
0
        else //wT =0
732
0
        {
733
0
          for( int x = 0; x < std::min( iWidth, 16 ); x += 8 )
734
0
          {
735
0
            if( x == 0 )
736
0
              wl8 = wl8start;
737
0
            else if( x == 8 )
738
0
              wl8 = wl8start2;
739
740
0
            __m128i x8dst;
741
0
            if( iWidth >= 8 )
742
0
              x8dst = _mm_loadu_si128( (const __m128i*)( pDst + y * dstStride + x ) );   // load dst
743
0
            else if( iWidth == 4 )
744
0
              x8dst = _mm_loadu_si64( (const __m128i*)( pDst + y * dstStride + x ) );    // load dst
745
0
            else if( iWidth == 2 )
746
0
              x8dst = _mm_loadu_si32( (const __m128i*)( pDst + y * dstStride + x ) );    // load dst
747
0
            else
748
0
              CHECK( true, "wrong iWidth in IntraPredSampleFilter_SIMD, only implemented for >=8, ==4, ==2" );
749
750
751
0
            tmplo8          = _mm_mullo_epi16( x8left, wl8 );   // wL * left
752
0
            tmphi8          = _mm_mulhi_epi16( x8left, wl8 );   // wL * left
753
0
            __m128i leftlo8 = _mm_unpacklo_epi16( tmplo8, tmphi8 );
754
0
            __m128i lefthi8 = _mm_unpackhi_epi16( tmplo8, tmphi8 );
755
756
0
            __m128i wX     = _mm_sub_epi16( w64_8, wl8 );
757
0
            tmplo8         = _mm_mullo_epi16( x8dst, wX );   // 64-wL-wT*dst
758
0
            tmphi8         = _mm_mulhi_epi16( x8dst, wX );   // 64-wL-wT*dst
759
0
            __m128i dstlo8 = _mm_unpacklo_epi16( tmplo8, tmphi8 );
760
0
            __m128i dsthi8 = _mm_unpackhi_epi16( tmplo8, tmphi8 );
761
762
0
            dstlo8 = _mm_add_epi32( dstlo8, leftlo8 );
763
0
            dsthi8 = _mm_add_epi32( dsthi8, lefthi8 );
764
0
            dstlo8 = _mm_add_epi32( dstlo8, w32_8 );
765
0
            dsthi8 = _mm_add_epi32( dsthi8, w32_8 );
766
767
0
            dstlo8 = _mm_srai_epi32( dstlo8, 6 );
768
0
            dsthi8 = _mm_srai_epi32( dsthi8, 6 );
769
770
0
            dstlo8 = _mm_max_epi32( vbdmin8, dstlo8 );
771
0
            dsthi8 = _mm_max_epi32( vbdmin8, dsthi8 );
772
0
            dstlo8 = _mm_min_epi32( vbdmax8, dstlo8 );
773
0
            dsthi8 = _mm_min_epi32( vbdmax8, dsthi8 );
774
775
0
            dstlo8 = _mm_packs_epi32( dstlo8, dsthi8 );
776
777
0
            if( iWidth >= 8 )
778
0
              _mm_storeu_si128( (__m128i*)( pDst + y * dstStride + x ), dstlo8 );
779
0
            else if( iWidth == 4 )
780
0
              _mm_storeu_si64( (__m128i*)( pDst + y * dstStride + x ), ( dstlo8 ) );
781
0
            else if( iWidth == 2 )
782
0
              _mm_storeu_si32( (__m128i*)( pDst + y * dstStride + x ), dstlo8 );
783
0
          }
784
0
        }
785
0
      }
786
0
    }
787
0
  }
788
789
790
2.71k
#if USE_AVX2
791
2.71k
  _mm256_zeroupper();
792
2.71k
#endif
793
2.71k
}
794
795
/** Function for deriving planar intra prediction. This function derives the prediction samples for planar mode (intra coding).
796
 */
797
template< X86_VEXT vext>
798
void xPredIntraPlanar_SIMD( const CPelBuf &pSrc, PelBuf &pDst, const SPS& sps )
799
2.41k
{
800
801
2.41k
  const uint32_t width      = pDst.width;
802
2.41k
  const uint32_t height     = pDst.height;
803
2.41k
  const uint32_t log2W      = getLog2( width );
804
2.41k
  const uint32_t log2H      = getLog2( height );
805
2.41k
  const uint32_t finalShift = 1 + log2W + log2H;
806
2.41k
  const uint32_t offset     = 1 << (log2W + log2H);
807
2.41k
  const ptrdiff_t stride    = pDst.stride;
808
2.41k
  Pel*            pred      = pDst.buf;
809
810
2.41k
  const Pel *ptrSrc =pSrc.buf;
811
812
2.41k
  int leftColumn,rightColumn;
813
2.41k
  Pel tmp;
814
2.41k
  int topRight = pSrc.at( width + 1, 0 );
815
816
2.41k
  tmp=pSrc.at( 0, height+1 );
817
2.41k
  const __m128i bottomLeft16 = _mm_set_epi16(tmp,tmp,tmp,tmp,tmp,tmp,tmp,tmp);
818
2.41k
  const __m128i zero = _mm_setzero_si128();
819
2.41k
  const __m128i eight = _mm_set_epi16(8,8,8,8,8,8,8,8);
820
2.41k
  const __m128i offset32 = _mm_set_epi32(offset,offset,offset,offset);
821
2.41k
  const __m128i vLog2W = _mm_cvtsi32_si128(log2W);
822
2.41k
  const __m128i vLog2H = _mm_cvtsi32_si128(log2H);
823
2.41k
  const __m128i vFinalShift = _mm_cvtsi32_si128(finalShift);
824
825
49.2k
  for( int y = 0; y < height; y++)
826
46.8k
  {
827
46.8k
    leftColumn=pSrc.at(  0, y + 1 );
828
46.8k
    rightColumn = topRight - leftColumn;
829
46.8k
    leftColumn  = leftColumn << log2W;
830
46.8k
    const __m128i leftColumn32 = _mm_set_epi32(leftColumn,leftColumn,leftColumn,leftColumn);
831
46.8k
    const __m128i rightcolumn16 = _mm_set_epi16(rightColumn,rightColumn,rightColumn,rightColumn,rightColumn,rightColumn,rightColumn,rightColumn);
832
46.8k
    const __m128i y16 = _mm_set_epi16(y+1,y+1,y+1,y+1,y+1,y+1,y+1,y+1);
833
46.8k
          __m128i x16 = _mm_set_epi16(8,7,6,5,4,3,2,1);
834
835
226k
    for( int x = 0; x < width; x+=8 )
836
179k
    {
837
      //topRow[x] = pSrc.at( x + 1, 0 );
838
179k
      __m128i topRow16 = _mm_loadu_si128 ((__m128i const *) (ptrSrc+(x+1)));
839
      //bottomRow[x] = bottomLeft - topRow[x];
840
179k
      __m128i bottomRow16L = _mm_sub_epi16(bottomLeft16,topRow16);
841
      // (y+1)*bottomRow[x]
842
179k
      __m128i  tmpH = _mm_mulhi_epi16(bottomRow16L,y16);
843
179k
      __m128i tmpL = _mm_mullo_epi16(bottomRow16L,y16);
844
179k
      bottomRow16L = _mm_unpacklo_epi16(tmpL,tmpH);
845
179k
      __m128i bottomRow16H = _mm_unpackhi_epi16(tmpL,tmpH);
846
847
      // topRow[x] << log2H
848
179k
      __m128i topRow32L = _mm_unpacklo_epi16(topRow16,zero);
849
179k
      __m128i topRow32H = _mm_unpackhi_epi16(topRow16,zero);
850
179k
      topRow32L = _mm_sll_epi32(topRow32L,vLog2H);
851
179k
      topRow32H = _mm_sll_epi32(topRow32H,vLog2H);
852
      // vertPred    = (topRow[x] << log2H) + (y+1)*bottomRow[x];
853
179k
      topRow32L = _mm_add_epi32(topRow32L,bottomRow16L);
854
179k
      topRow32H = _mm_add_epi32(topRow32H,bottomRow16H);
855
      // horPred = leftColumn + (x+1)*rightColumn;
856
179k
      tmpL = _mm_mullo_epi16(rightcolumn16,x16);
857
179k
      tmpH = _mm_mulhi_epi16(rightcolumn16,x16);
858
179k
      __m128i horpred32L = _mm_unpacklo_epi16(tmpL,tmpH);
859
179k
      __m128i horpred32H = _mm_unpackhi_epi16(tmpL,tmpH);
860
179k
      horpred32L = _mm_add_epi32(leftColumn32,horpred32L);
861
179k
      horpred32H = _mm_add_epi32(leftColumn32,horpred32H);
862
      // pred[x]      = ( ( horPred << log2H ) + ( vertPred << log2W ) + offset ) >> finalShift;
863
179k
      horpred32L = _mm_sll_epi32(horpred32L,vLog2H);
864
179k
      horpred32H = _mm_sll_epi32(horpred32H,vLog2H);
865
179k
      topRow32L = _mm_sll_epi32(topRow32L,vLog2W);
866
179k
      topRow32H = _mm_sll_epi32(topRow32H,vLog2W);
867
179k
      horpred32L = _mm_add_epi32(horpred32L,topRow32L);
868
179k
      horpred32H = _mm_add_epi32(horpred32H,topRow32H);
869
179k
      horpred32L = _mm_add_epi32(horpred32L,offset32);
870
179k
      horpred32H = _mm_add_epi32(horpred32H,offset32);
871
179k
      horpred32L = _mm_srl_epi32(horpred32L,vFinalShift);
872
179k
      horpred32H = _mm_srl_epi32(horpred32H,vFinalShift);
873
874
179k
      tmpL = _mm_packs_epi32(horpred32L,horpred32H);
875
179k
      if (width>=8)
876
176k
        _mm_storeu_si128(( __m128i * )(pred+y*stride+x), (tmpL) );
877
3.06k
      else if (width==4)
878
3.06k
        _mm_storeu_si64(( __m128i * )(pred+y*stride+x), (tmpL) );
879
0
      else if (width==2)
880
0
        _mm_storeu_si32(( __m128i * )(pred+y*stride+x),(tmpL) );
881
0
      else
882
0
        pred[y*stride+x]=(Pel)_mm_extract_epi16 (tmpL,0);
883
884
179k
      x16 = _mm_add_epi16(x16,eight);
885
179k
    }
886
46.8k
  }
887
2.41k
}
Unexecuted instantiation: void vvdec::xPredIntraPlanar_SIMD<(vvdec::x86_simd::X86_VEXT)1>(vvdec::AreaBuf<short const> const&, vvdec::AreaBuf<short>&, vvdec::SPS const&)
void vvdec::xPredIntraPlanar_SIMD<(vvdec::x86_simd::X86_VEXT)4>(vvdec::AreaBuf<short const> const&, vvdec::AreaBuf<short>&, vvdec::SPS const&)
Line
Count
Source
799
2.41k
{
800
801
2.41k
  const uint32_t width      = pDst.width;
802
2.41k
  const uint32_t height     = pDst.height;
803
2.41k
  const uint32_t log2W      = getLog2( width );
804
2.41k
  const uint32_t log2H      = getLog2( height );
805
2.41k
  const uint32_t finalShift = 1 + log2W + log2H;
806
2.41k
  const uint32_t offset     = 1 << (log2W + log2H);
807
2.41k
  const ptrdiff_t stride    = pDst.stride;
808
2.41k
  Pel*            pred      = pDst.buf;
809
810
2.41k
  const Pel *ptrSrc =pSrc.buf;
811
812
2.41k
  int leftColumn,rightColumn;
813
2.41k
  Pel tmp;
814
2.41k
  int topRight = pSrc.at( width + 1, 0 );
815
816
2.41k
  tmp=pSrc.at( 0, height+1 );
817
2.41k
  const __m128i bottomLeft16 = _mm_set_epi16(tmp,tmp,tmp,tmp,tmp,tmp,tmp,tmp);
818
2.41k
  const __m128i zero = _mm_setzero_si128();
819
2.41k
  const __m128i eight = _mm_set_epi16(8,8,8,8,8,8,8,8);
820
2.41k
  const __m128i offset32 = _mm_set_epi32(offset,offset,offset,offset);
821
2.41k
  const __m128i vLog2W = _mm_cvtsi32_si128(log2W);
822
2.41k
  const __m128i vLog2H = _mm_cvtsi32_si128(log2H);
823
2.41k
  const __m128i vFinalShift = _mm_cvtsi32_si128(finalShift);
824
825
49.2k
  for( int y = 0; y < height; y++)
826
46.8k
  {
827
46.8k
    leftColumn=pSrc.at(  0, y + 1 );
828
46.8k
    rightColumn = topRight - leftColumn;
829
46.8k
    leftColumn  = leftColumn << log2W;
830
46.8k
    const __m128i leftColumn32 = _mm_set_epi32(leftColumn,leftColumn,leftColumn,leftColumn);
831
46.8k
    const __m128i rightcolumn16 = _mm_set_epi16(rightColumn,rightColumn,rightColumn,rightColumn,rightColumn,rightColumn,rightColumn,rightColumn);
832
46.8k
    const __m128i y16 = _mm_set_epi16(y+1,y+1,y+1,y+1,y+1,y+1,y+1,y+1);
833
46.8k
          __m128i x16 = _mm_set_epi16(8,7,6,5,4,3,2,1);
834
835
226k
    for( int x = 0; x < width; x+=8 )
836
179k
    {
837
      //topRow[x] = pSrc.at( x + 1, 0 );
838
179k
      __m128i topRow16 = _mm_loadu_si128 ((__m128i const *) (ptrSrc+(x+1)));
839
      //bottomRow[x] = bottomLeft - topRow[x];
840
179k
      __m128i bottomRow16L = _mm_sub_epi16(bottomLeft16,topRow16);
841
      // (y+1)*bottomRow[x]
842
179k
      __m128i  tmpH = _mm_mulhi_epi16(bottomRow16L,y16);
843
179k
      __m128i tmpL = _mm_mullo_epi16(bottomRow16L,y16);
844
179k
      bottomRow16L = _mm_unpacklo_epi16(tmpL,tmpH);
845
179k
      __m128i bottomRow16H = _mm_unpackhi_epi16(tmpL,tmpH);
846
847
      // topRow[x] << log2H
848
179k
      __m128i topRow32L = _mm_unpacklo_epi16(topRow16,zero);
849
179k
      __m128i topRow32H = _mm_unpackhi_epi16(topRow16,zero);
850
179k
      topRow32L = _mm_sll_epi32(topRow32L,vLog2H);
851
179k
      topRow32H = _mm_sll_epi32(topRow32H,vLog2H);
852
      // vertPred    = (topRow[x] << log2H) + (y+1)*bottomRow[x];
853
179k
      topRow32L = _mm_add_epi32(topRow32L,bottomRow16L);
854
179k
      topRow32H = _mm_add_epi32(topRow32H,bottomRow16H);
855
      // horPred = leftColumn + (x+1)*rightColumn;
856
179k
      tmpL = _mm_mullo_epi16(rightcolumn16,x16);
857
179k
      tmpH = _mm_mulhi_epi16(rightcolumn16,x16);
858
179k
      __m128i horpred32L = _mm_unpacklo_epi16(tmpL,tmpH);
859
179k
      __m128i horpred32H = _mm_unpackhi_epi16(tmpL,tmpH);
860
179k
      horpred32L = _mm_add_epi32(leftColumn32,horpred32L);
861
179k
      horpred32H = _mm_add_epi32(leftColumn32,horpred32H);
862
      // pred[x]      = ( ( horPred << log2H ) + ( vertPred << log2W ) + offset ) >> finalShift;
863
179k
      horpred32L = _mm_sll_epi32(horpred32L,vLog2H);
864
179k
      horpred32H = _mm_sll_epi32(horpred32H,vLog2H);
865
179k
      topRow32L = _mm_sll_epi32(topRow32L,vLog2W);
866
179k
      topRow32H = _mm_sll_epi32(topRow32H,vLog2W);
867
179k
      horpred32L = _mm_add_epi32(horpred32L,topRow32L);
868
179k
      horpred32H = _mm_add_epi32(horpred32H,topRow32H);
869
179k
      horpred32L = _mm_add_epi32(horpred32L,offset32);
870
179k
      horpred32H = _mm_add_epi32(horpred32H,offset32);
871
179k
      horpred32L = _mm_srl_epi32(horpred32L,vFinalShift);
872
179k
      horpred32H = _mm_srl_epi32(horpred32H,vFinalShift);
873
874
179k
      tmpL = _mm_packs_epi32(horpred32L,horpred32H);
875
179k
      if (width>=8)
876
176k
        _mm_storeu_si128(( __m128i * )(pred+y*stride+x), (tmpL) );
877
3.06k
      else if (width==4)
878
3.06k
        _mm_storeu_si64(( __m128i * )(pred+y*stride+x), (tmpL) );
879
0
      else if (width==2)
880
0
        _mm_storeu_si32(( __m128i * )(pred+y*stride+x),(tmpL) );
881
0
      else
882
0
        pred[y*stride+x]=(Pel)_mm_extract_epi16 (tmpL,0);
883
884
179k
      x16 = _mm_add_epi16(x16,eight);
885
179k
    }
886
46.8k
  }
887
2.41k
}
888
889
template< X86_VEXT vext>
890
void GetLumaRecPixel420SIMD (const int width,const int height, const Pel* pRecSrc0,const ptrdiff_t iRecStride,Pel* pDst0,const ptrdiff_t iDstStride)
891
2.86k
{
892
#ifdef USE_AVX2
893
2.86k
  if( ( width & 15 ) == 0 )    // width>=16
894
//  if( 0 )    // width>=16
895
1.78k
  {
896
1.78k
    __m256i vzero = _mm256_set1_epi8(0);
897
1.78k
    __m256i vfour = _mm256_set1_epi32(4);
898
27.3k
    for( int y = 0; y < height; y++ )
899
25.5k
    {
900
64.9k
      for( int x = 0; x < width; x += 16 )
901
39.3k
      {
902
39.3k
        int x2=x<<1;
903
39.3k
        __m256i vsrc_l = _mm256_loadu_si256((__m256i*)&pRecSrc0[x2-1]);      // -1 0 1 2 3 4 5 6
904
39.3k
        __m256i vsrc = _mm256_loadu_si256((__m256i*)&pRecSrc0[x2]);          // 0 1 2 3 4 5 6 7
905
906
        __m256i vsrc01 = _mm256_blend_epi16(vzero,vsrc_l,0x55);      // -1 1 3 5  32 Bit
907
        __m256i vsrc0 = _mm256_blend_epi16(vzero,vsrc,0x55);      // 0 2 4 6  32 Bit
908
        __m256i vsrc10 = _mm256_blend_epi16(vzero,vsrc,0xAA);      // 1 3 5 7 32 Bit
909
        vsrc10 = _mm256_srli_epi32(vsrc10,16);      // 1 3 5 7 32 Bit
910
        vsrc0 =  _mm256_slli_epi32 (vsrc0,1);      // 0  2 4 6 *2
911
912
        vsrc0 =  _mm256_add_epi32(vsrc0,vsrc10);
913
        __m256i vdst0 = _mm256_add_epi32(vsrc0,vsrc01);   // dst 0 1 2 3 32 Bit, lower row still missing
914
915
        vsrc_l = _mm256_loadu_si256((__m256i*)&pRecSrc0[x2 +15]);      // 7 8 9 10 11 12 13 14
916
        vsrc = _mm256_loadu_si256((__m256i*)&pRecSrc0[x2 + 16 ]);          // 8 9 10 11 12 13 14 15
917
918
        x2+= (int)iRecStride;
919
920
        vsrc01 = _mm256_blend_epi16(vzero,vsrc_l,0x55);
921
        vsrc0 = _mm256_blend_epi16(vzero,vsrc,0x55);
922
        vsrc10 = _mm256_blend_epi16(vzero,vsrc,0xAA);
923
        vsrc10 = _mm256_srli_epi32(vsrc10,16);      // 1 3 5 7 32 Bit
924
        vsrc0 =  _mm256_slli_epi32 (vsrc0,1);      // 0  2 4 6 *2
925
926
        vsrc0 =  _mm256_add_epi32(vsrc0,vsrc10);
927
        __m256i vdst1 = _mm256_add_epi32(vsrc0,vsrc01);   // dst 4 5 6 7 32 Bit, lower row still missing
928
929
        // now add the next row
930
        vsrc_l = _mm256_loadu_si256((__m256i*)&pRecSrc0[x2-1]);      // -1 0 1 2 3 4 5 6
931
        vsrc = _mm256_loadu_si256((__m256i*)&pRecSrc0[x2]);          // 0 1 2 3 4 5 6 7
932
933
        vsrc01 = _mm256_blend_epi16(vzero,vsrc_l,0x55);      // -1 1 3 5  32 Bit
934
        vsrc0 = _mm256_blend_epi16(vzero,vsrc,0x55);      // 0 2 4 6  32 Bit
935
        vsrc10 = _mm256_blend_epi16(vzero,vsrc,0xAA);      // 1 3 5 7 32 Bit
936
        vsrc10 = _mm256_srli_epi32(vsrc10,16);      // 1 3 5 7 32 Bit
937
        vsrc0 =  _mm256_slli_epi32 (vsrc0,1);      // 0  2 4 6 *2
938
939
        vsrc0 =  _mm256_add_epi32(vsrc0,vsrc10);
940
        __m256i vdst01 = _mm256_add_epi32(vsrc0,vsrc01);   // dst 0 1 2 3 32 Bit, lower row
941
942
        vsrc_l = _mm256_loadu_si256((__m256i*)&pRecSrc0[x2 + 15]);      // 7 8 9 10 11 12 13 14
943
        vsrc = _mm256_loadu_si256((__m256i*)&pRecSrc0[x2 + 16 ]);          // 8 9 10 11 12 13 14 15
944
945
        vsrc01 = _mm256_blend_epi16(vzero,vsrc_l,0x55);
946
        vsrc0 = _mm256_blend_epi16(vzero,vsrc,0x55);
947
        vsrc10 = _mm256_blend_epi16(vzero,vsrc,0xAA);
948
        vsrc10 = _mm256_srli_epi32(vsrc10,16);      // 1 3 5 7 32 Bit
949
        vsrc0 =  _mm256_slli_epi32 (vsrc0,1);      // 0  2 4 6 *2
950
951
        vsrc0 =  _mm256_add_epi32(vsrc0,vsrc10);
952
        __m256i vdst11 = _mm256_add_epi32(vsrc0,vsrc01);   // dst 4 5 6 7 32 Bit, lower row
953
954
        vdst0 = _mm256_add_epi32(vdst0,vdst01);
955
        vdst1 = _mm256_add_epi32(vdst1,vdst11);
956
        vdst0 =  _mm256_add_epi32(vdst0,vfour);
957
        vdst1 =  _mm256_add_epi32(vdst1,vfour);
958
        vdst0 = _mm256_srli_epi32(vdst0,3);
959
        vdst1 = _mm256_srli_epi32(vdst1,3);
960
        vdst0 = _mm256_packus_epi32 (vdst0,vdst1);   // 16 bit
961
        vdst0 = _mm256_permute4x64_epi64(vdst0,0xd8);
962
963
39.3k
        _mm256_storeu_si256((__m256i*)&pDst0[x], vdst0);
964
        //        _mm_storeu_si128((__m128i*)&pDstTmp[x], vdst0);
965
39.3k
      }
966
25.5k
      pDst0 += iDstStride;
967
25.5k
      pRecSrc0 += (iRecStride<<1);
968
25.5k
    }
969
1.78k
  }
970
1.08k
  else
971
1.08k
#endif
972
1.08k
    if( ( width & 7 ) == 0 )    // width>=8
973
768
    {
974
768
      __m128i vzero = _mm_set1_epi8(0);
975
768
      __m128i vfour = _mm_set1_epi32(4);
976
977
978
7.18k
      for( int y = 0; y < height; y++ )
979
6.41k
      {
980
981
12.8k
        for( int x = 0; x < width; x += 8 )
982
6.41k
        {
983
6.41k
          int x2=x<<1;
984
6.41k
          __m128i vsrc_l = _mm_loadu_si128((__m128i*)&pRecSrc0[x2-1]);      // -1 0 1 2 3 4 5 6
985
6.41k
          __m128i vsrc = _mm_loadu_si128((__m128i*)&pRecSrc0[x2]);          // 0 1 2 3 4 5 6 7
986
987
6.41k
          __m128i vsrc01 = _mm_blend_epi16(vzero,vsrc_l,0x55);      // -1 1 3 5  32 Bit
988
6.41k
          __m128i vsrc0 = _mm_blend_epi16(vzero,vsrc,0x55);      // 0 2 4 6  32 Bit
989
6.41k
          __m128i vsrc10 = _mm_blend_epi16(vzero,vsrc,0xAA);      // 1 3 5 7 32 Bit
990
6.41k
          vsrc10 = _mm_srli_epi32(vsrc10,16);      // 1 3 5 7 32 Bit
991
6.41k
          vsrc0 =  _mm_slli_epi32 (vsrc0,1);      // 0  2 4 6 *2
992
993
6.41k
          vsrc0 =  _mm_add_epi32(vsrc0,vsrc10);
994
6.41k
          __m128i vdst0 = _mm_add_epi32(vsrc0,vsrc01);   // dst 0 1 2 3 32 Bit, lower row still missing
995
996
6.41k
          vsrc_l = _mm_loadu_si128((__m128i*)&pRecSrc0[x2 +7]);      // 7 8 9 10 11 12 13 14
997
6.41k
          vsrc = _mm_loadu_si128((__m128i*)&pRecSrc0[x2 + 8 ]);          // 8 9 10 11 12 13 14 15
998
999
6.41k
          x2+=(int)iRecStride;
1000
1001
6.41k
          vsrc01 = _mm_blend_epi16(vzero,vsrc_l,0x55);
1002
6.41k
          vsrc0 = _mm_blend_epi16(vzero,vsrc,0x55);
1003
6.41k
          vsrc10 = _mm_blend_epi16(vzero,vsrc,0xAA);
1004
6.41k
          vsrc10 = _mm_srli_epi32(vsrc10,16);      // 1 3 5 7 32 Bit
1005
6.41k
          vsrc0 =  _mm_slli_epi32 (vsrc0,1);      // 0  2 4 6 *2
1006
1007
6.41k
          vsrc0 =  _mm_add_epi32(vsrc0,vsrc10);
1008
6.41k
          __m128i vdst1 = _mm_add_epi32(vsrc0,vsrc01);   // dst 4 5 6 7 32 Bit, lower row still missing
1009
1010
          // now add the next row
1011
6.41k
          vsrc_l = _mm_loadu_si128((__m128i*)&pRecSrc0[x2-1]);      // -1 0 1 2 3 4 5 6
1012
6.41k
          vsrc = _mm_loadu_si128((__m128i*)&pRecSrc0[x2]);          // 0 1 2 3 4 5 6 7
1013
1014
6.41k
          vsrc01 = _mm_blend_epi16(vzero,vsrc_l,0x55);      // -1 1 3 5  32 Bit
1015
6.41k
          vsrc0 = _mm_blend_epi16(vzero,vsrc,0x55);      // 0 2 4 6  32 Bit
1016
6.41k
          vsrc10 = _mm_blend_epi16(vzero,vsrc,0xAA);      // 1 3 5 7 32 Bit
1017
6.41k
          vsrc10 = _mm_srli_epi32(vsrc10,16);      // 1 3 5 7 32 Bit
1018
6.41k
          vsrc0 =  _mm_slli_epi32 (vsrc0,1);      // 0  2 4 6 *2
1019
1020
6.41k
          vsrc0 =  _mm_add_epi32(vsrc0,vsrc10);
1021
6.41k
          __m128i vdst01 = _mm_add_epi32(vsrc0,vsrc01);   // dst 0 1 2 3 32 Bit, lower row
1022
1023
6.41k
          vsrc_l = _mm_loadu_si128((__m128i*)&pRecSrc0[x2 + 7]);      // 7 8 9 10 11 12 13 14
1024
6.41k
          vsrc = _mm_loadu_si128((__m128i*)&pRecSrc0[x2 + 8 ]);          // 8 9 10 11 12 13 14 15
1025
1026
6.41k
          vsrc01 = _mm_blend_epi16(vzero,vsrc_l,0x55);
1027
6.41k
          vsrc0 = _mm_blend_epi16(vzero,vsrc,0x55);
1028
6.41k
          vsrc10 = _mm_blend_epi16(vzero,vsrc,0xAA);
1029
6.41k
          vsrc10 = _mm_srli_epi32(vsrc10,16);      // 1 3 5 7 32 Bit
1030
6.41k
          vsrc0 =  _mm_slli_epi32 (vsrc0,1);      // 0  2 4 6 *2
1031
1032
6.41k
          vsrc0 =  _mm_add_epi32(vsrc0,vsrc10);
1033
6.41k
          __m128i vdst11 = _mm_add_epi32(vsrc0,vsrc01);   // dst 4 5 6 7 32 Bit, lower row
1034
1035
6.41k
          vdst0 = _mm_add_epi32(vdst0,vdst01);
1036
6.41k
          vdst1 = _mm_add_epi32(vdst1,vdst11);
1037
6.41k
          vdst0 =  _mm_add_epi32(vdst0,vfour);
1038
6.41k
          vdst1 =  _mm_add_epi32(vdst1,vfour);
1039
6.41k
          vdst0 = _mm_srli_epi32(vdst0,3);
1040
6.41k
          vdst1 = _mm_srli_epi32(vdst1,3);
1041
6.41k
          vdst0 = _mm_packus_epi32 (vdst0,vdst1);   // 16 bit
1042
1043
6.41k
          _mm_storeu_si128((__m128i*)&pDst0[x], vdst0);
1044
          //        _mm_storeu_si128((__m128i*)&pDstTmp[x], vdst0);
1045
6.41k
        }
1046
6.41k
        pDst0 += iDstStride;
1047
6.41k
        pRecSrc0 += (iRecStride<<1);
1048
6.41k
      }
1049
768
    }
1050
312
    else     // width<=4
1051
312
    {
1052
312
      __m128i vzero = _mm_set1_epi8(0);
1053
312
      __m128i vfour = _mm_set1_epi32(4);
1054
1055
2.60k
      for( int y = 0; y < height; y++ )
1056
2.28k
      {
1057
2.28k
        __m128i vsrc_l = _mm_loadu_si128((__m128i*)&pRecSrc0[-1]);      // -1 0 1 2 3 4 5 6
1058
2.28k
        __m128i vsrc = _mm_loadu_si128((__m128i*)&pRecSrc0[0]);          // 0 1 2 3 4 5 6 7
1059
1060
2.28k
        __m128i vsrc01 = _mm_blend_epi16(vzero,vsrc_l,0x55);      // -1 1 3 5  32 Bit
1061
2.28k
        __m128i vsrc0 = _mm_blend_epi16(vzero,vsrc,0x55);      // 0 2 4 6  32 Bit
1062
2.28k
        __m128i vsrc10 = _mm_blend_epi16(vzero,vsrc,0xAA);      // 1 3 5 7 32 Bit
1063
2.28k
        vsrc10 = _mm_srli_epi32(vsrc10,16);      // 1 3 5 7 32 Bit
1064
2.28k
        vsrc0 =  _mm_slli_epi32 (vsrc0,1);      // 0  2 4 6 *2
1065
1066
2.28k
        vsrc0 =  _mm_add_epi32(vsrc0,vsrc10);
1067
2.28k
        __m128i vdst0 = _mm_add_epi32(vsrc0,vsrc01);   // dst 0 1 2 3 32 Bit, lower row still missing
1068
1069
        // now add the next row
1070
2.28k
        vsrc_l = _mm_loadu_si128((__m128i*)&pRecSrc0[iRecStride-1]);      // -1 0 1 2 3 4 5 6
1071
2.28k
        vsrc = _mm_loadu_si128((__m128i*)&pRecSrc0[iRecStride]);          // 0 1 2 3 4 5 6 7
1072
1073
2.28k
        vsrc01 = _mm_blend_epi16(vzero,vsrc_l,0x55);      // -1 1 3 5  32 Bit
1074
2.28k
        vsrc0 = _mm_blend_epi16(vzero,vsrc,0x55);      // 0 2 4 6  32 Bit
1075
2.28k
        vsrc10 = _mm_blend_epi16(vzero,vsrc,0xAA);      // 1 3 5 7 32 Bit
1076
2.28k
        vsrc10 = _mm_srli_epi32(vsrc10,16);      // 1 3 5 7 32 Bit
1077
2.28k
        vsrc0 =  _mm_slli_epi32 (vsrc0,1);      // 0  2 4 6 *2
1078
1079
2.28k
        vsrc0 =  _mm_add_epi32(vsrc0,vsrc10);
1080
2.28k
        __m128i vdst01 = _mm_add_epi32(vsrc0,vsrc01);   // dst 0 1 2 3 32 Bit, lower row
1081
1082
1083
2.28k
        vdst0 = _mm_add_epi32(vdst0,vdst01);
1084
2.28k
        vdst0 =  _mm_add_epi32(vdst0,vfour);
1085
2.28k
        vdst0 = _mm_srli_epi32(vdst0,3);
1086
2.28k
        vdst0 = _mm_packus_epi32 (vdst0,vdst0);   // 16 bit
1087
1088
2.28k
        if (width==4)
1089
2.28k
          _mm_storeu_si64(( __m128i * )&pDst0[0], (vdst0) );
1090
0
        else if (width==2)
1091
0
          _mm_storeu_si32(( __m128i * )&pDst0[0], (vdst0) );
1092
0
        else
1093
0
        {
1094
0
          int tmp = _mm_cvtsi128_si32(vdst0);
1095
0
          pDst0[0] = (Pel) tmp;
1096
0
        }
1097
1098
2.28k
        pDst0 += iDstStride;
1099
2.28k
        pRecSrc0 += (iRecStride<<1);
1100
2.28k
      }
1101
312
    }
1102
2.86k
}
Unexecuted instantiation: void vvdec::GetLumaRecPixel420SIMD<(vvdec::x86_simd::X86_VEXT)1>(int, int, short const*, long, short*, long)
void vvdec::GetLumaRecPixel420SIMD<(vvdec::x86_simd::X86_VEXT)4>(int, int, short const*, long, short*, long)
Line
Count
Source
891
2.86k
{
892
2.86k
#ifdef USE_AVX2
893
2.86k
  if( ( width & 15 ) == 0 )    // width>=16
894
//  if( 0 )    // width>=16
895
1.78k
  {
896
1.78k
    __m256i vzero = _mm256_set1_epi8(0);
897
1.78k
    __m256i vfour = _mm256_set1_epi32(4);
898
27.3k
    for( int y = 0; y < height; y++ )
899
25.5k
    {
900
64.9k
      for( int x = 0; x < width; x += 16 )
901
39.3k
      {
902
39.3k
        int x2=x<<1;
903
39.3k
        __m256i vsrc_l = _mm256_loadu_si256((__m256i*)&pRecSrc0[x2-1]);      // -1 0 1 2 3 4 5 6
904
39.3k
        __m256i vsrc = _mm256_loadu_si256((__m256i*)&pRecSrc0[x2]);          // 0 1 2 3 4 5 6 7
905
906
39.3k
        __m256i vsrc01 = _mm256_blend_epi16(vzero,vsrc_l,0x55);      // -1 1 3 5  32 Bit
907
39.3k
        __m256i vsrc0 = _mm256_blend_epi16(vzero,vsrc,0x55);      // 0 2 4 6  32 Bit
908
39.3k
        __m256i vsrc10 = _mm256_blend_epi16(vzero,vsrc,0xAA);      // 1 3 5 7 32 Bit
909
39.3k
        vsrc10 = _mm256_srli_epi32(vsrc10,16);      // 1 3 5 7 32 Bit
910
39.3k
        vsrc0 =  _mm256_slli_epi32 (vsrc0,1);      // 0  2 4 6 *2
911
912
39.3k
        vsrc0 =  _mm256_add_epi32(vsrc0,vsrc10);
913
39.3k
        __m256i vdst0 = _mm256_add_epi32(vsrc0,vsrc01);   // dst 0 1 2 3 32 Bit, untere Zeile fehlt noch
914
915
39.3k
        vsrc_l = _mm256_loadu_si256((__m256i*)&pRecSrc0[x2 +15]);      // 7 8 9 10 11 12 13 14
916
39.3k
        vsrc = _mm256_loadu_si256((__m256i*)&pRecSrc0[x2 + 16 ]);          // 8 9 10 11 12 13 14 15
917
918
39.3k
        x2+= (int)iRecStride;
919
920
39.3k
        vsrc01 = _mm256_blend_epi16(vzero,vsrc_l,0x55);
921
39.3k
        vsrc0 = _mm256_blend_epi16(vzero,vsrc,0x55);
922
39.3k
        vsrc10 = _mm256_blend_epi16(vzero,vsrc,0xAA);
923
39.3k
        vsrc10 = _mm256_srli_epi32(vsrc10,16);      // 1 3 5 7 32 Bit
924
39.3k
        vsrc0 =  _mm256_slli_epi32 (vsrc0,1);      // 0  2 4 6 *2
925
926
39.3k
        vsrc0 =  _mm256_add_epi32(vsrc0,vsrc10);
927
39.3k
        __m256i vdst1 = _mm256_add_epi32(vsrc0,vsrc01);   // dst 4 5 6 7 32 Bit, untere Zeile fehlt noch
928
929
        // now add the next row
930
39.3k
        vsrc_l = _mm256_loadu_si256((__m256i*)&pRecSrc0[x2-1]);      // -1 0 1 2 3 4 5 6
931
39.3k
        vsrc = _mm256_loadu_si256((__m256i*)&pRecSrc0[x2]);          // 0 1 2 3 4 5 6 7
932
933
39.3k
        vsrc01 = _mm256_blend_epi16(vzero,vsrc_l,0x55);      // -1 1 3 5  32 Bit
934
39.3k
        vsrc0 = _mm256_blend_epi16(vzero,vsrc,0x55);      // 0 2 4 6  32 Bit
935
39.3k
        vsrc10 = _mm256_blend_epi16(vzero,vsrc,0xAA);      // 1 3 5 7 32 Bit
936
39.3k
        vsrc10 = _mm256_srli_epi32(vsrc10,16);      // 1 3 5 7 32 Bit
937
39.3k
        vsrc0 =  _mm256_slli_epi32 (vsrc0,1);      // 0  2 4 6 *2
938
939
39.3k
        vsrc0 =  _mm256_add_epi32(vsrc0,vsrc10);
940
39.3k
        __m256i vdst01 = _mm256_add_epi32(vsrc0,vsrc01);   // dst 0 1 2 3 32 Bit, untere Zeile
941
942
39.3k
        vsrc_l = _mm256_loadu_si256((__m256i*)&pRecSrc0[x2 + 15]);      // 7 8 9 10 11 12 13 14
943
39.3k
        vsrc = _mm256_loadu_si256((__m256i*)&pRecSrc0[x2 + 16 ]);          // 8 9 10 11 12 13 14 15
944
945
39.3k
        vsrc01 = _mm256_blend_epi16(vzero,vsrc_l,0x55);
946
39.3k
        vsrc0 = _mm256_blend_epi16(vzero,vsrc,0x55);
947
39.3k
        vsrc10 = _mm256_blend_epi16(vzero,vsrc,0xAA);
948
39.3k
        vsrc10 = _mm256_srli_epi32(vsrc10,16);      // 1 3 5 7 32 Bit
949
39.3k
        vsrc0 =  _mm256_slli_epi32 (vsrc0,1);      // 0  2 4 6 *2
950
951
39.3k
        vsrc0 =  _mm256_add_epi32(vsrc0,vsrc10);
952
39.3k
        __m256i vdst11 = _mm256_add_epi32(vsrc0,vsrc01);   // dst 4 5 6 7 32 Bit, untere Zeile
953
954
39.3k
        vdst0 = _mm256_add_epi32(vdst0,vdst01);
955
39.3k
        vdst1 = _mm256_add_epi32(vdst1,vdst11);
956
39.3k
        vdst0 =  _mm256_add_epi32(vdst0,vfour);
957
39.3k
        vdst1 =  _mm256_add_epi32(vdst1,vfour);
958
39.3k
        vdst0 = _mm256_srli_epi32(vdst0,3);
959
39.3k
        vdst1 = _mm256_srli_epi32(vdst1,3);
960
39.3k
        vdst0 = _mm256_packus_epi32 (vdst0,vdst1);   // 16 bit
961
39.3k
        vdst0 = _mm256_permute4x64_epi64(vdst0,0xd8);
962
963
39.3k
        _mm256_storeu_si256((__m256i*)&pDst0[x], vdst0);
964
        //        _mm_storeu_si128((__m128i*)&pDstTmp[x], vdst0);
965
39.3k
      }
966
25.5k
      pDst0 += iDstStride;
967
25.5k
      pRecSrc0 += (iRecStride<<1);
968
25.5k
    }
969
1.78k
  }
970
1.08k
  else
971
1.08k
#endif
972
1.08k
    if( ( width & 7 ) == 0 )    // width>=8
973
768
    {
974
768
      __m128i vzero = _mm_set1_epi8(0);
975
768
      __m128i vfour = _mm_set1_epi32(4);
976
977
978
7.18k
      for( int y = 0; y < height; y++ )
979
6.41k
      {
980
981
12.8k
        for( int x = 0; x < width; x += 8 )
982
6.41k
        {
983
6.41k
          int x2=x<<1;
984
6.41k
          __m128i vsrc_l = _mm_loadu_si128((__m128i*)&pRecSrc0[x2-1]);      // -1 0 1 2 3 4 5 6
985
6.41k
          __m128i vsrc = _mm_loadu_si128((__m128i*)&pRecSrc0[x2]);          // 0 1 2 3 4 5 6 7
986
987
6.41k
          __m128i vsrc01 = _mm_blend_epi16(vzero,vsrc_l,0x55);      // -1 1 3 5  32 Bit
988
6.41k
          __m128i vsrc0 = _mm_blend_epi16(vzero,vsrc,0x55);      // 0 2 4 6  32 Bit
989
6.41k
          __m128i vsrc10 = _mm_blend_epi16(vzero,vsrc,0xAA);      // 1 3 5 7 32 Bit
990
6.41k
          vsrc10 = _mm_srli_epi32(vsrc10,16);      // 1 3 5 7 32 Bit
991
6.41k
          vsrc0 =  _mm_slli_epi32 (vsrc0,1);      // 0  2 4 6 *2
992
993
6.41k
          vsrc0 =  _mm_add_epi32(vsrc0,vsrc10);
994
6.41k
          __m128i vdst0 = _mm_add_epi32(vsrc0,vsrc01);   // dst 0 1 2 3 32 Bit, untere Zeile fehlt noch
995
996
6.41k
          vsrc_l = _mm_loadu_si128((__m128i*)&pRecSrc0[x2 +7]);      // 7 8 9 10 11 12 13 14
997
6.41k
          vsrc = _mm_loadu_si128((__m128i*)&pRecSrc0[x2 + 8 ]);          // 8 9 10 11 12 13 14 15
998
999
6.41k
          x2+=(int)iRecStride;
1000
1001
6.41k
          vsrc01 = _mm_blend_epi16(vzero,vsrc_l,0x55);
1002
6.41k
          vsrc0 = _mm_blend_epi16(vzero,vsrc,0x55);
1003
6.41k
          vsrc10 = _mm_blend_epi16(vzero,vsrc,0xAA);
1004
6.41k
          vsrc10 = _mm_srli_epi32(vsrc10,16);      // 1 3 5 7 32 Bit
1005
6.41k
          vsrc0 =  _mm_slli_epi32 (vsrc0,1);      // 0  2 4 6 *2
1006
1007
6.41k
          vsrc0 =  _mm_add_epi32(vsrc0,vsrc10);
1008
6.41k
          __m128i vdst1 = _mm_add_epi32(vsrc0,vsrc01);   // dst 4 5 6 7 32 Bit, untere Zeile fehlt noch
1009
1010
          // now add the next row
1011
6.41k
          vsrc_l = _mm_loadu_si128((__m128i*)&pRecSrc0[x2-1]);      // -1 0 1 2 3 4 5 6
1012
6.41k
          vsrc = _mm_loadu_si128((__m128i*)&pRecSrc0[x2]);          // 0 1 2 3 4 5 6 7
1013
1014
6.41k
          vsrc01 = _mm_blend_epi16(vzero,vsrc_l,0x55);      // -1 1 3 5  32 Bit
1015
6.41k
          vsrc0 = _mm_blend_epi16(vzero,vsrc,0x55);      // 0 2 4 6  32 Bit
1016
6.41k
          vsrc10 = _mm_blend_epi16(vzero,vsrc,0xAA);      // 1 3 5 7 32 Bit
1017
6.41k
          vsrc10 = _mm_srli_epi32(vsrc10,16);      // 1 3 5 7 32 Bit
1018
6.41k
          vsrc0 =  _mm_slli_epi32 (vsrc0,1);      // 0  2 4 6 *2
1019
1020
6.41k
          vsrc0 =  _mm_add_epi32(vsrc0,vsrc10);
1021
6.41k
          __m128i vdst01 = _mm_add_epi32(vsrc0,vsrc01);   // dst 0 1 2 3 32 Bit, untere Zeile
1022
1023
6.41k
          vsrc_l = _mm_loadu_si128((__m128i*)&pRecSrc0[x2 + 7]);      // 7 8 9 10 11 12 13 14
1024
6.41k
          vsrc = _mm_loadu_si128((__m128i*)&pRecSrc0[x2 + 8 ]);          // 8 9 10 11 12 13 14 15
1025
1026
6.41k
          vsrc01 = _mm_blend_epi16(vzero,vsrc_l,0x55);
1027
6.41k
          vsrc0 = _mm_blend_epi16(vzero,vsrc,0x55);
1028
6.41k
          vsrc10 = _mm_blend_epi16(vzero,vsrc,0xAA);
1029
6.41k
          vsrc10 = _mm_srli_epi32(vsrc10,16);      // 1 3 5 7 32 Bit
1030
6.41k
          vsrc0 =  _mm_slli_epi32 (vsrc0,1);      // 0  2 4 6 *2
1031
1032
6.41k
          vsrc0 =  _mm_add_epi32(vsrc0,vsrc10);
1033
6.41k
          __m128i vdst11 = _mm_add_epi32(vsrc0,vsrc01);   // dst 4 5 6 7 32 Bit, untere Zeile
1034
1035
6.41k
          vdst0 = _mm_add_epi32(vdst0,vdst01);
1036
6.41k
          vdst1 = _mm_add_epi32(vdst1,vdst11);
1037
6.41k
          vdst0 =  _mm_add_epi32(vdst0,vfour);
1038
6.41k
          vdst1 =  _mm_add_epi32(vdst1,vfour);
1039
6.41k
          vdst0 = _mm_srli_epi32(vdst0,3);
1040
6.41k
          vdst1 = _mm_srli_epi32(vdst1,3);
1041
6.41k
          vdst0 = _mm_packus_epi32 (vdst0,vdst1);   // 16 bit
1042
1043
6.41k
          _mm_storeu_si128((__m128i*)&pDst0[x], vdst0);
1044
          //        _mm_storeu_si128((__m128i*)&pDstTmp[x], vdst0);
1045
6.41k
        }
1046
6.41k
        pDst0 += iDstStride;
1047
6.41k
        pRecSrc0 += (iRecStride<<1);
1048
6.41k
      }
1049
768
    }
1050
312
    else     // width<=4
1051
312
    {
1052
312
      __m128i vzero = _mm_set1_epi8(0);
1053
312
      __m128i vfour = _mm_set1_epi32(4);
1054
1055
2.60k
      for( int y = 0; y < height; y++ )
1056
2.28k
      {
1057
2.28k
        __m128i vsrc_l = _mm_loadu_si128((__m128i*)&pRecSrc0[-1]);      // -1 0 1 2 3 4 5 6
1058
2.28k
        __m128i vsrc = _mm_loadu_si128((__m128i*)&pRecSrc0[0]);          // 0 1 2 3 4 5 6 7
1059
1060
2.28k
        __m128i vsrc01 = _mm_blend_epi16(vzero,vsrc_l,0x55);      // -1 1 3 5  32 Bit
1061
2.28k
        __m128i vsrc0 = _mm_blend_epi16(vzero,vsrc,0x55);      // 0 2 4 6  32 Bit
1062
2.28k
        __m128i vsrc10 = _mm_blend_epi16(vzero,vsrc,0xAA);      // 1 3 5 7 32 Bit
1063
2.28k
        vsrc10 = _mm_srli_epi32(vsrc10,16);      // 1 3 5 7 32 Bit
1064
2.28k
        vsrc0 =  _mm_slli_epi32 (vsrc0,1);      // 0  2 4 6 *2
1065
1066
2.28k
        vsrc0 =  _mm_add_epi32(vsrc0,vsrc10);
1067
2.28k
        __m128i vdst0 = _mm_add_epi32(vsrc0,vsrc01);   // dst 0 1 2 3 32 Bit, untere Zeile fehlt noch
1068
1069
        // now add the next row
1070
2.28k
        vsrc_l = _mm_loadu_si128((__m128i*)&pRecSrc0[iRecStride-1]);      // -1 0 1 2 3 4 5 6
1071
2.28k
        vsrc = _mm_loadu_si128((__m128i*)&pRecSrc0[iRecStride]);          // 0 1 2 3 4 5 6_mm_storeu_si32 7
1072
1073
2.28k
        vsrc01 = _mm_blend_epi16(vzero,vsrc_l,0x55);      // -1 1 3 5  32 Bit
1074
2.28k
        vsrc0 = _mm_blend_epi16(vzero,vsrc,0x55);      // 0 2 4 6  32 Bit
1075
2.28k
        vsrc10 = _mm_blend_epi16(vzero,vsrc,0xAA);      // 1 3 5 7 32 Bit
1076
2.28k
        vsrc10 = _mm_srli_epi32(vsrc10,16);      // 1 3 5 7 32 Bit
1077
2.28k
        vsrc0 =  _mm_slli_epi32 (vsrc0,1);      // 0  2 4 6 *2
1078
1079
2.28k
        vsrc0 =  _mm_add_epi32(vsrc0,vsrc10);
1080
2.28k
        __m128i vdst01 = _mm_add_epi32(vsrc0,vsrc01);   // dst 0 1 2 3 32 Bit, untere Zeile
1081
1082
1083
2.28k
        vdst0 = _mm_add_epi32(vdst0,vdst01);
1084
2.28k
        vdst0 =  _mm_add_epi32(vdst0,vfour);
1085
2.28k
        vdst0 = _mm_srli_epi32(vdst0,3);
1086
2.28k
        vdst0 = _mm_packus_epi32 (vdst0,vdst0);   // 16 bit
1087
1088
2.28k
        if (width==4)
1089
2.28k
          _mm_storeu_si64(( __m128i * )&pDst0[0], (vdst0) );
1090
0
        else if (width==2)
1091
0
          _mm_storeu_si32(( __m128i * )&pDst0[0], (vdst0) );
1092
0
        else
1093
0
        {
1094
0
          int tmp = _mm_cvtsi128_si32(vdst0);
1095
0
          pDst0[0] = (Pel) tmp;
1096
0
        }
1097
1098
2.28k
        pDst0 += iDstStride;
1099
2.28k
        pRecSrc0 += (iRecStride<<1);
1100
2.28k
      }
1101
312
    }
1102
2.86k
}
1103
1104
1105
1106
// Installs the x86 SIMD kernels for the SIMD level given by the template
// parameter `vext`. Each statement in the body assigns one SIMD template
// instantiation to the correspondingly named dispatch target:
//   - IntraPredAngleCore4/8     <- IntraPredAngleCore_SIMD<vext, 4 / 8>
//   - IntraPredAngleChroma4/8   <- IntraPredAngleChroma_SIMD<vext, 4 / 8>
//   - IntraPredSampleFilter8/16 <- IntraPredSampleFilter_SIMD<vext, 8 / 16>
//   - xPredIntraPlanar          <- xPredIntraPlanar_SIMD<vext>
//   - GetLumaRecPixel420        <- GetLumaRecPixel420SIMD<vext>
// The function takes no arguments and returns nothing; its only effect is to
// overwrite these eight targets.
// NOTE(review): the targets are presumably function-pointer members of
// IntraPrediction and the numeric template argument presumably a block
// width; neither declaration is visible here, so confirm against the class.
template<X86_VEXT vext>
1107
void IntraPrediction::_initIntraPredictionX86()
1108
9.18k
{
1109
9.18k
  IntraPredAngleCore4 = IntraPredAngleCore_SIMD<vext, 4>;
1110
9.18k
  IntraPredAngleCore8 = IntraPredAngleCore_SIMD<vext, 8>;
1111
9.18k
  IntraPredAngleChroma4 = IntraPredAngleChroma_SIMD<vext, 4>;
1112
9.18k
  IntraPredAngleChroma8 = IntraPredAngleChroma_SIMD<vext, 8>;
1113
1114
9.18k
  IntraPredSampleFilter8 = IntraPredSampleFilter_SIMD<vext, 8>;
1115
9.18k
  IntraPredSampleFilter16 = IntraPredSampleFilter_SIMD<vext, 16>;
1116
1117
9.18k
  xPredIntraPlanar = xPredIntraPlanar_SIMD<vext>;
1118
1119
9.18k
  GetLumaRecPixel420 = GetLumaRecPixel420SIMD<vext>;
1120
1121
9.18k
}
Unexecuted instantiation: void vvdec::IntraPrediction::_initIntraPredictionX86<(vvdec::x86_simd::X86_VEXT)1>()
void vvdec::IntraPrediction::_initIntraPredictionX86<(vvdec::x86_simd::X86_VEXT)4>()
Line
Count
Source
1108
9.18k
{
1109
9.18k
  IntraPredAngleCore4 = IntraPredAngleCore_SIMD<vext, 4>;
1110
9.18k
  IntraPredAngleCore8 = IntraPredAngleCore_SIMD<vext, 8>;
1111
9.18k
  IntraPredAngleChroma4 = IntraPredAngleChroma_SIMD<vext, 4>;
1112
9.18k
  IntraPredAngleChroma8 = IntraPredAngleChroma_SIMD<vext, 8>;
1113
1114
9.18k
  IntraPredSampleFilter8 = IntraPredSampleFilter_SIMD<vext, 8>;
1115
9.18k
  IntraPredSampleFilter16 = IntraPredSampleFilter_SIMD<vext, 16>;
1116
1117
9.18k
  xPredIntraPlanar = xPredIntraPlanar_SIMD<vext>;
1118
1119
9.18k
  GetLumaRecPixel420 = GetLumaRecPixel420SIMD<vext>;
1120
1121
9.18k
}
1122
// Explicit instantiation of the initializer for SIMDX86, so the template body
// is emitted for that SIMD level.
// NOTE(review): SIMDX86 is defined outside this excerpt; presumably it names
// the SIMD level this file is currently being compiled for — confirm.
template void IntraPrediction::_initIntraPredictionX86<SIMDX86>();
1123
1124
#endif // TARGET_SIMD_X86
1125
#endif
1126
1127
}