Coverage Report

Created: 2026-06-16 07:20

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/vvdec/source/Lib/CommonLib/x86/IntraPredX86.h
Line
Count
Source
1
/* -----------------------------------------------------------------------------
2
The copyright in this software is being made available under the Clear BSD
3
License, included below. No patent rights, trademark rights and/or 
4
other Intellectual Property Rights other than the copyrights concerning 
5
the Software are granted under this license.
6
7
The Clear BSD License
8
9
Copyright (c) 2018-2026, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVdeC Authors.
10
All rights reserved.
11
12
Redistribution and use in source and binary forms, with or without modification,
13
are permitted (subject to the limitations in the disclaimer below) provided that
14
the following conditions are met:
15
16
     * Redistributions of source code must retain the above copyright notice,
17
     this list of conditions and the following disclaimer.
18
19
     * Redistributions in binary form must reproduce the above copyright
20
     notice, this list of conditions and the following disclaimer in the
21
     documentation and/or other materials provided with the distribution.
22
23
     * Neither the name of the copyright holder nor the names of its
24
     contributors may be used to endorse or promote products derived from this
25
     software without specific prior written permission.
26
27
NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
28
THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
29
CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
30
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
31
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
32
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
33
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
34
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
35
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
36
IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38
POSSIBILITY OF SUCH DAMAGE.
39
40
41
------------------------------------------------------------------------------------------- */
42
43
/** \file     IntraPredX86.h
44
    \brief    SIMD for IntraPrediction
45
*/
46
47
#include "CommonLib/CommonDef.h"
48
#include "CommonDefX86.h"
49
#include "CommonLib/IntraPrediction.h"
50
51
namespace vvdec
52
{
53
54
#if ENABLE_SIMD_OPT_INTRAPRED
55
#ifdef TARGET_SIMD_X86
56
57
//#define USE_AVX2
58
template< X86_VEXT vext, int W >
59
void IntraPredAngleChroma_SIMD(int16_t* pDst,const ptrdiff_t dstStride,int16_t* pBorder,int width,int height,int deltaPos,int intraPredAngle)
60
2.72k
{
61
2.72k
  int deltaInt;
62
2.72k
  int deltaFract;
63
2.72k
  int refMainIndex;
64
65
2.72k
  __m128i voffset = _mm_set1_epi16(16);
66
2.72k
  if( W == 8 )
67
2.29k
  {
68
2.29k
    if( vext >= AVX2 )
69
2.29k
    {
70
#ifdef USE_AVX2
71
2.29k
      if (( width & 15 ) == 0 )
72
1.70k
      {
73
1.70k
       int deltaInt;
74
1.70k
        int deltaFract;
75
1.70k
        int refMainIndex;
76
77
        __m256i voffset = _mm256_set1_epi16(16);
78
28.8k
        for (int k=0; k<height; k++) {
79
80
27.1k
          deltaInt   = deltaPos >> 5;
81
27.1k
          deltaFract = deltaPos & (32 - 1);
82
83
27.1k
          __m256i vfract = _mm256_set1_epi16(deltaFract);
84
27.1k
          __m256i v32minfract = _mm256_set1_epi16(32-deltaFract);
85
          // Do linear filtering
86
68.5k
          for (int l=0; l<width; l+=16) {
87
41.3k
            refMainIndex   = l+ deltaInt+1;
88
41.3k
            __m256i vpred0 = _mm256_lddqu_si256((__m256i*)&pBorder[refMainIndex]);
89
41.3k
            __m256i vpred1 = _mm256_lddqu_si256((__m256i*)&pBorder[refMainIndex+1]);
90
41.3k
            vpred0 = _mm256_mullo_epi16(v32minfract, vpred0);
91
41.3k
            vpred1 = _mm256_mullo_epi16(vfract, vpred1);
92
41.3k
            __m256i vpred = _mm256_srli_epi16(_mm256_add_epi16(_mm256_add_epi16(vpred0, vpred1), voffset), 5);
93
41.3k
            _mm256_storeu_si256((__m256i*)&pDst[l], vpred);
94
41.3k
          }
95
27.1k
          pDst+=dstStride;
96
27.1k
          deltaPos += intraPredAngle;
97
27.1k
        }
98
1.70k
      }
99
592
      else // width==8
100
592
      {
101
7.03k
        for (int k=0; k<height; k++)
102
6.44k
        {
103
6.44k
          deltaInt   = deltaPos >> 5;
104
6.44k
          deltaFract = deltaPos & (32 - 1);
105
106
6.44k
          __m128i vfract = _mm_set1_epi16(deltaFract);
107
6.44k
          __m128i v32minfract = _mm_set1_epi16(32-deltaFract);
108
          // Do linear filtering
109
12.8k
          for (int l=0; l<width; l+=8) {
110
6.44k
            refMainIndex        = l+ deltaInt+1;
111
6.44k
            __m128i vpred0 = _mm_lddqu_si128((__m128i*)&pBorder[refMainIndex]);
112
6.44k
            __m128i vpred1 = _mm_lddqu_si128((__m128i*)&pBorder[refMainIndex+1]);
113
6.44k
            vpred0 = _mm_mullo_epi16(v32minfract, vpred0);
114
6.44k
            vpred1 = _mm_mullo_epi16(vfract, vpred1);
115
6.44k
            __m128i vpred = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(vpred0, vpred1), voffset), 5);
116
6.44k
            _mm_storeu_si128((__m128i*)&pDst[l], vpred);
117
6.44k
          }
118
6.44k
          deltaPos += intraPredAngle;
119
120
6.44k
          pDst+=dstStride;
121
6.44k
        }
122
123
592
      }
124
#endif //AVX2
125
2.29k
    }  
126
0
    else
127
0
    {
128
0
      for (int k=0; k<height; k++) {
129
0
        deltaInt   = deltaPos >> 5;
130
0
        deltaFract = deltaPos & (32 - 1);
131
132
0
        __m128i vfract = _mm_set1_epi16(deltaFract);
133
0
        __m128i v32minfract = _mm_set1_epi16(32-deltaFract);
134
        // Do linear filtering
135
0
        for (int l=0; l<width; l+=8) {
136
0
          refMainIndex        = l+ deltaInt+1;
137
0
          __m128i vpred0 = _mm_loadu_si128((__m128i*)&pBorder[refMainIndex]);
138
0
          __m128i vpred1 = _mm_loadu_si128((__m128i*)&pBorder[refMainIndex+1]);
139
0
          vpred0 = _mm_mullo_epi16(v32minfract, vpred0);
140
0
          vpred1 = _mm_mullo_epi16(vfract, vpred1);
141
0
          __m128i vpred = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(vpred0, vpred1), voffset), 5);
142
0
          _mm_storeu_si128((__m128i*)&pDst[l], vpred);
143
0
        }
144
0
        deltaPos += intraPredAngle;
145
146
0
        pDst+=dstStride;
147
0
      }
148
0
    }
149
150
2.29k
  }
151
424
  else if( W == 4 )
152
424
  {
153
3.64k
    for (int k=0; k<height; k++) {
154
3.21k
      deltaInt   = deltaPos >> 5;
155
3.21k
      deltaFract = deltaPos & (32 - 1);
156
157
3.21k
      __m128i vfract = _mm_set1_epi16(deltaFract);
158
3.21k
      __m128i v32minfract = _mm_set1_epi16(32-deltaFract);
159
      // Do linear filtering
160
3.21k
      refMainIndex        = deltaInt+1;
161
3.21k
      __m128i vpred0 = _mm_loadu_si128((__m128i*)&pBorder[refMainIndex]);
162
3.21k
      __m128i vpred1 = _mm_loadu_si128((__m128i*)&pBorder[refMainIndex+1]);
163
3.21k
      vpred0 = _mm_mullo_epi16(v32minfract, vpred0);
164
3.21k
      vpred1 = _mm_mullo_epi16(vfract, vpred1);
165
3.21k
      __m128i vpred = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(vpred0, vpred1), voffset), 5);
166
3.21k
      _mm_storeu_si64( ( __m128i * )(pDst ), vpred);
167
3.21k
      deltaPos += intraPredAngle;
168
3.21k
      pDst+=dstStride;
169
3.21k
    }
170
424
  }
171
0
  else
172
0
  {
173
0
    THROW_FATAL( "Unsupported size in IntraPredAngleCore_SIMD" );
174
0
  }
175
#if USE_AVX2
176
177
2.72k
  _mm256_zeroupper();
178
2.72k
#endif
179
2.72k
}
Unexecuted instantiation: void vvdec::IntraPredAngleChroma_SIMD<(vvdec::x86_simd::X86_VEXT)1, 4>(short*, long, short*, int, int, int, int)
Unexecuted instantiation: void vvdec::IntraPredAngleChroma_SIMD<(vvdec::x86_simd::X86_VEXT)1, 8>(short*, long, short*, int, int, int, int)
void vvdec::IntraPredAngleChroma_SIMD<(vvdec::x86_simd::X86_VEXT)4, 4>(short*, long, short*, int, int, int, int)
Line
Count
Source
60
424
{
61
424
  int deltaInt;
62
424
  int deltaFract;
63
424
  int refMainIndex;
64
65
424
  __m128i voffset = _mm_set1_epi16(16);
66
424
  if( W == 8 )
67
0
  {
68
0
    if( vext >= AVX2 )
69
0
    {
70
0
#ifdef USE_AVX2
71
0
      if (( width & 15 ) == 0 )
72
0
      {
73
0
       int deltaInt;
74
0
        int deltaFract;
75
0
        int refMainIndex;
76
77
0
        __m256i voffset = _mm256_set1_epi16(16);
78
0
        for (int k=0; k<height; k++) {
79
80
0
          deltaInt   = deltaPos >> 5;
81
0
          deltaFract = deltaPos & (32 - 1);
82
83
0
          __m256i vfract = _mm256_set1_epi16(deltaFract);
84
0
          __m256i v32minfract = _mm256_set1_epi16(32-deltaFract);
85
          // Do linear filtering
86
0
          for (int l=0; l<width; l+=16) {
87
0
            refMainIndex   = l+ deltaInt+1;
88
0
            __m256i vpred0 = _mm256_lddqu_si256((__m256i*)&pBorder[refMainIndex]);
89
0
            __m256i vpred1 = _mm256_lddqu_si256((__m256i*)&pBorder[refMainIndex+1]);
90
0
            vpred0 = _mm256_mullo_epi16(v32minfract, vpred0);
91
0
            vpred1 = _mm256_mullo_epi16(vfract, vpred1);
92
0
            __m256i vpred = _mm256_srli_epi16(_mm256_add_epi16(_mm256_add_epi16(vpred0, vpred1), voffset), 5);
93
0
            _mm256_storeu_si256((__m256i*)&pDst[l], vpred);
94
0
          }
95
0
          pDst+=dstStride;
96
0
          deltaPos += intraPredAngle;
97
0
        }
98
0
      }
99
0
      else // width==8
100
0
      {
101
0
        for (int k=0; k<height; k++)
102
0
        {
103
0
          deltaInt   = deltaPos >> 5;
104
0
          deltaFract = deltaPos & (32 - 1);
105
106
0
          __m128i vfract = _mm_set1_epi16(deltaFract);
107
0
          __m128i v32minfract = _mm_set1_epi16(32-deltaFract);
108
          // Do linear filtering
109
0
          for (int l=0; l<width; l+=8) {
110
0
            refMainIndex        = l+ deltaInt+1;
111
0
            __m128i vpred0 = _mm_lddqu_si128((__m128i*)&pBorder[refMainIndex]);
112
0
            __m128i vpred1 = _mm_lddqu_si128((__m128i*)&pBorder[refMainIndex+1]);
113
0
            vpred0 = _mm_mullo_epi16(v32minfract, vpred0);
114
0
            vpred1 = _mm_mullo_epi16(vfract, vpred1);
115
0
            __m128i vpred = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(vpred0, vpred1), voffset), 5);
116
0
            _mm_storeu_si128((__m128i*)&pDst[l], vpred);
117
0
          }
118
0
          deltaPos += intraPredAngle;
119
120
0
          pDst+=dstStride;
121
0
        }
122
123
0
      }
124
0
#endif //AVX2
125
0
    }  
126
0
    else
127
0
    {
128
0
      for (int k=0; k<height; k++) {
129
0
        deltaInt   = deltaPos >> 5;
130
0
        deltaFract = deltaPos & (32 - 1);
131
132
0
        __m128i vfract = _mm_set1_epi16(deltaFract);
133
0
        __m128i v32minfract = _mm_set1_epi16(32-deltaFract);
134
        // Do linear filtering
135
0
        for (int l=0; l<width; l+=8) {
136
0
          refMainIndex        = l+ deltaInt+1;
137
0
          __m128i vpred0 = _mm_loadu_si128((__m128i*)&pBorder[refMainIndex]);
138
0
          __m128i vpred1 = _mm_loadu_si128((__m128i*)&pBorder[refMainIndex+1]);
139
0
          vpred0 = _mm_mullo_epi16(v32minfract, vpred0);
140
0
          vpred1 = _mm_mullo_epi16(vfract, vpred1);
141
0
          __m128i vpred = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(vpred0, vpred1), voffset), 5);
142
0
          _mm_storeu_si128((__m128i*)&pDst[l], vpred);
143
0
        }
144
0
        deltaPos += intraPredAngle;
145
146
0
        pDst+=dstStride;
147
0
      }
148
0
    }
149
150
0
  }
151
424
  else if( W == 4 )
152
424
  {
153
3.64k
    for (int k=0; k<height; k++) {
154
3.21k
      deltaInt   = deltaPos >> 5;
155
3.21k
      deltaFract = deltaPos & (32 - 1);
156
157
3.21k
      __m128i vfract = _mm_set1_epi16(deltaFract);
158
3.21k
      __m128i v32minfract = _mm_set1_epi16(32-deltaFract);
159
      // Do linear filtering
160
3.21k
      refMainIndex        = deltaInt+1;
161
3.21k
      __m128i vpred0 = _mm_loadu_si128((__m128i*)&pBorder[refMainIndex]);
162
3.21k
      __m128i vpred1 = _mm_loadu_si128((__m128i*)&pBorder[refMainIndex+1]);
163
3.21k
      vpred0 = _mm_mullo_epi16(v32minfract, vpred0);
164
3.21k
      vpred1 = _mm_mullo_epi16(vfract, vpred1);
165
3.21k
      __m128i vpred = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(vpred0, vpred1), voffset), 5);
166
3.21k
      _mm_storeu_si64( ( __m128i * )(pDst ), vpred);
167
3.21k
      deltaPos += intraPredAngle;
168
3.21k
      pDst+=dstStride;
169
3.21k
    }
170
424
  }
171
0
  else
172
0
  {
173
0
    THROW_FATAL( "Unsupported size in IntraPredAngleCore_SIMD" );
174
0
  }
175
424
#if USE_AVX2
176
177
424
  _mm256_zeroupper();
178
424
#endif
179
424
}
void vvdec::IntraPredAngleChroma_SIMD<(vvdec::x86_simd::X86_VEXT)4, 8>(short*, long, short*, int, int, int, int)
Line
Count
Source
60
2.29k
{
61
2.29k
  int deltaInt;
62
2.29k
  int deltaFract;
63
2.29k
  int refMainIndex;
64
65
2.29k
  __m128i voffset = _mm_set1_epi16(16);
66
2.29k
  if( W == 8 )
67
2.29k
  {
68
2.29k
    if( vext >= AVX2 )
69
2.29k
    {
70
2.29k
#ifdef USE_AVX2
71
2.29k
      if (( width & 15 ) == 0 )
72
1.70k
      {
73
1.70k
       int deltaInt;
74
1.70k
        int deltaFract;
75
1.70k
        int refMainIndex;
76
77
1.70k
        __m256i voffset = _mm256_set1_epi16(16);
78
28.8k
        for (int k=0; k<height; k++) {
79
80
27.1k
          deltaInt   = deltaPos >> 5;
81
27.1k
          deltaFract = deltaPos & (32 - 1);
82
83
27.1k
          __m256i vfract = _mm256_set1_epi16(deltaFract);
84
27.1k
          __m256i v32minfract = _mm256_set1_epi16(32-deltaFract);
85
          // Do linear filtering
86
68.5k
          for (int l=0; l<width; l+=16) {
87
41.3k
            refMainIndex   = l+ deltaInt+1;
88
41.3k
            __m256i vpred0 = _mm256_lddqu_si256((__m256i*)&pBorder[refMainIndex]);
89
41.3k
            __m256i vpred1 = _mm256_lddqu_si256((__m256i*)&pBorder[refMainIndex+1]);
90
41.3k
            vpred0 = _mm256_mullo_epi16(v32minfract, vpred0);
91
41.3k
            vpred1 = _mm256_mullo_epi16(vfract, vpred1);
92
41.3k
            __m256i vpred = _mm256_srli_epi16(_mm256_add_epi16(_mm256_add_epi16(vpred0, vpred1), voffset), 5);
93
41.3k
            _mm256_storeu_si256((__m256i*)&pDst[l], vpred);
94
41.3k
          }
95
27.1k
          pDst+=dstStride;
96
27.1k
          deltaPos += intraPredAngle;
97
27.1k
        }
98
1.70k
      }
99
592
      else // width==8
100
592
      {
101
7.03k
        for (int k=0; k<height; k++)
102
6.44k
        {
103
6.44k
          deltaInt   = deltaPos >> 5;
104
6.44k
          deltaFract = deltaPos & (32 - 1);
105
106
6.44k
          __m128i vfract = _mm_set1_epi16(deltaFract);
107
6.44k
          __m128i v32minfract = _mm_set1_epi16(32-deltaFract);
108
          // Do linear filtering
109
12.8k
          for (int l=0; l<width; l+=8) {
110
6.44k
            refMainIndex        = l+ deltaInt+1;
111
6.44k
            __m128i vpred0 = _mm_lddqu_si128((__m128i*)&pBorder[refMainIndex]);
112
6.44k
            __m128i vpred1 = _mm_lddqu_si128((__m128i*)&pBorder[refMainIndex+1]);
113
6.44k
            vpred0 = _mm_mullo_epi16(v32minfract, vpred0);
114
6.44k
            vpred1 = _mm_mullo_epi16(vfract, vpred1);
115
6.44k
            __m128i vpred = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(vpred0, vpred1), voffset), 5);
116
6.44k
            _mm_storeu_si128((__m128i*)&pDst[l], vpred);
117
6.44k
          }
118
6.44k
          deltaPos += intraPredAngle;
119
120
6.44k
          pDst+=dstStride;
121
6.44k
        }
122
123
592
      }
124
2.29k
#endif //AVX2
125
2.29k
    }  
126
0
    else
127
0
    {
128
0
      for (int k=0; k<height; k++) {
129
0
        deltaInt   = deltaPos >> 5;
130
0
        deltaFract = deltaPos & (32 - 1);
131
132
0
        __m128i vfract = _mm_set1_epi16(deltaFract);
133
0
        __m128i v32minfract = _mm_set1_epi16(32-deltaFract);
134
        // Do linear filtering
135
0
        for (int l=0; l<width; l+=8) {
136
0
          refMainIndex        = l+ deltaInt+1;
137
0
          __m128i vpred0 = _mm_loadu_si128((__m128i*)&pBorder[refMainIndex]);
138
0
          __m128i vpred1 = _mm_loadu_si128((__m128i*)&pBorder[refMainIndex+1]);
139
0
          vpred0 = _mm_mullo_epi16(v32minfract, vpred0);
140
0
          vpred1 = _mm_mullo_epi16(vfract, vpred1);
141
0
          __m128i vpred = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(vpred0, vpred1), voffset), 5);
142
0
          _mm_storeu_si128((__m128i*)&pDst[l], vpred);
143
0
        }
144
0
        deltaPos += intraPredAngle;
145
146
0
        pDst+=dstStride;
147
0
      }
148
0
    }
149
150
2.29k
  }
151
0
  else if( W == 4 )
152
0
  {
153
0
    for (int k=0; k<height; k++) {
154
0
      deltaInt   = deltaPos >> 5;
155
0
      deltaFract = deltaPos & (32 - 1);
156
157
0
      __m128i vfract = _mm_set1_epi16(deltaFract);
158
0
      __m128i v32minfract = _mm_set1_epi16(32-deltaFract);
159
      // Do linear filtering
160
0
      refMainIndex        = deltaInt+1;
161
0
      __m128i vpred0 = _mm_loadu_si128((__m128i*)&pBorder[refMainIndex]);
162
0
      __m128i vpred1 = _mm_loadu_si128((__m128i*)&pBorder[refMainIndex+1]);
163
0
      vpred0 = _mm_mullo_epi16(v32minfract, vpred0);
164
0
      vpred1 = _mm_mullo_epi16(vfract, vpred1);
165
0
      __m128i vpred = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(vpred0, vpred1), voffset), 5);
166
0
      _mm_storeu_si64( ( __m128i * )(pDst ), vpred);
167
0
      deltaPos += intraPredAngle;
168
0
      pDst+=dstStride;
169
0
    }
170
0
  }
171
0
  else
172
0
  {
173
0
    THROW_FATAL( "Unsupported size in IntraPredAngleCore_SIMD" );
174
0
  }
175
2.29k
#if USE_AVX2
176
177
2.29k
  _mm256_zeroupper();
178
2.29k
#endif
179
2.29k
}
180
181
182
template< X86_VEXT vext, int W >
183
void IntraPredAngleCore_SIMD(int16_t* pDstBuf,const ptrdiff_t dstStride,int16_t* refMain,int width,int height,int deltaPos,int intraPredAngle,const TFilterCoeff *ff,const bool useCubicFilter,const ClpRng& clpRng)
184
6.58k
{
185
6.58k
  int16_t* pDst;
186
187
6.58k
  if( W == 8 )
188
5.71k
  {
189
5.71k
    if( vext >= AVX2 )
190
5.71k
    {
191
#ifdef USE_AVX2
192
      __m256i shflmask1= _mm256_set_epi8(0xd, 0xc, 0xb, 0xa, 0x9, 0x8, 0x7, 0x6,   0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4,
193
          0x9, 0x8, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2,   0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0 );
194
      __m256i offset = _mm256_set1_epi32( 32 );
195
196
5.71k
      if (( width & 15 ) == 0 )
197
4.07k
      {
198
4.07k
        __m256i vbdmin,vbdmax;
199
200
4.07k
        if (useCubicFilter)
201
1.86k
        {
202
1.86k
          vbdmin = _mm256_set1_epi16( clpRng.min() );
203
1.86k
          vbdmax = _mm256_set1_epi16( clpRng.max() );
204
1.86k
        }
205
206
87.9k
        for (int y = 0; y<height; y++ )
207
83.8k
        {
208
83.8k
          int deltaInt   = deltaPos >> 5;
209
83.8k
          int deltaFract = deltaPos & (32 - 1);
210
83.8k
          int refMainIndex   = deltaInt + 1;
211
83.8k
          pDst=&pDstBuf[y*dstStride];
212
83.8k
          __m128i tmp = _mm_loadu_si64( ( __m128i const * )&ff[deltaFract<<2] );   //load 4 16 bit filter coeffs
213
83.8k
          tmp = _mm_shuffle_epi32(tmp,0x44);
214
83.8k
          __m256i coeff = _mm256_broadcastsi128_si256(tmp);
215
280k
          for( int x = 0; x < width; x+=16)
216
196k
          {
217
196k
            __m256i src1 = _mm256_broadcastsi128_si256( _mm_loadu_si128( ( const __m128i* ) &refMain[refMainIndex     - 1] ) );
218
196k
            __m256i src2 = _mm256_broadcastsi128_si256( _mm_loadu_si128( ( const __m128i* ) &refMain[refMainIndex + 4 - 1] ) );
219
196k
            src1 = _mm256_shuffle_epi8(src1,shflmask1);                 // -1 0 1 2  0 1 2 3 1 2 3 4  2 3 4 5
220
196k
            src2 = _mm256_shuffle_epi8(src2,shflmask1);                 // 3 4 5 6  4 5 6 7  5 6 7 8 6 7 8 9
221
222
196k
            src1 = _mm256_madd_epi16 (src1, coeff);
223
196k
            src2 = _mm256_madd_epi16 (src2, coeff);
224
225
196k
            __m256i  sum  = _mm256_hadd_epi32( src1, src2 );
226
196k
            sum = _mm256_permute4x64_epi64(sum,0xD8);
227
228
196k
            sum = _mm256_add_epi32( sum, offset );
229
196k
            sum = _mm256_srai_epi32( sum, 6 );
230
231
196k
            refMainIndex+=8;
232
            
233
196k
            src1 = _mm256_broadcastsi128_si256( _mm_loadu_si128( ( __m128i const* ) &refMain[refMainIndex     - 1] ) );
234
196k
            src2 = _mm256_broadcastsi128_si256( _mm_loadu_si128( ( __m128i const* ) &refMain[refMainIndex + 4 - 1] ) );
235
236
196k
            src1 = _mm256_shuffle_epi8(src1,shflmask1);                 // -1 0 1 2  0 1 2 3 1 2 3 4  2 3 4 5
237
196k
            src2 = _mm256_shuffle_epi8(src2,shflmask1);                 // 3 4 5 6  4 5 6 7  5 6 7 8 6 7 8 9
238
196k
            src1 = _mm256_madd_epi16 (src1, coeff);
239
196k
            src2 = _mm256_madd_epi16 (src2, coeff);
240
241
196k
            __m256i  sum1  = _mm256_hadd_epi32( src1, src2 );
242
196k
            sum1 = _mm256_permute4x64_epi64(sum1,0xD8);
243
244
196k
            sum1 = _mm256_add_epi32( sum1, offset );
245
196k
            sum1 = _mm256_srai_epi32( sum1, 6 );
246
196k
            __m256i
247
196k
            src0 = _mm256_packs_epi32( sum, sum1 );
248
249
196k
            src0 = _mm256_permute4x64_epi64(src0,0xD8);
250
251
196k
            refMainIndex+=8;
252
253
196k
            if (useCubicFilter)
254
44.4k
              src0 = _mm256_min_epi16( vbdmax, _mm256_max_epi16( vbdmin, src0 ) );
255
256
196k
            _mm256_storeu_si256( ( __m256i * )(pDst + x), src0);
257
196k
          }
258
83.8k
          deltaPos += intraPredAngle;
259
83.8k
        }
260
4.07k
      }
261
1.64k
      else // width =8
262
1.64k
      {
263
        //        printf("AVX2 Block %d \n",width);
264
1.64k
        __m128i vbdmin,vbdmax;
265
266
1.64k
        if (useCubicFilter)
267
1.45k
        {
268
1.45k
          vbdmin = _mm_set1_epi16( clpRng.min() );
269
1.45k
          vbdmax = _mm_set1_epi16( clpRng.max() );
270
1.45k
        }
271
272
26.1k
        for (int y = 0; y<height; y++ )
273
24.4k
        {
274
24.4k
          int deltaInt   = deltaPos >> 5;
275
24.4k
          int deltaFract = deltaPos & (32 - 1);
276
24.4k
          int refMainIndex   = deltaInt + 1;
277
24.4k
          pDst=&pDstBuf[y*dstStride];
278
24.4k
          __m128i tmp = _mm_loadu_si64( ( __m128i const * )&ff[deltaFract<<2] );   //load 4 16 bit filter coeffs
279
24.4k
          tmp = _mm_shuffle_epi32(tmp,0x44);
280
24.4k
          __m256i coeff = _mm256_broadcastsi128_si256(tmp);
281
24.4k
          __m256i src1 = _mm256_broadcastsi128_si256( _mm_loadu_si128( ( __m128i const* ) & refMain[refMainIndex - 1] ) );
282
24.4k
          __m256i src2 = _mm256_broadcastsi128_si256( _mm_loadu_si128( ( __m128i const* ) & refMain[refMainIndex + 4 - 1] ) );
283
24.4k
          src1 = _mm256_shuffle_epi8(src1,shflmask1);                 // -1 0 1 2  0 1 2 3 1 2 3 4  2 3 4 5
284
24.4k
          src2 = _mm256_shuffle_epi8(src2,shflmask1);                 // 3 4 5 6  4 5 6 7  5 6 7 8 6 7 8 9
285
286
24.4k
          src1 = _mm256_madd_epi16 (src1, coeff);
287
24.4k
          src2 = _mm256_madd_epi16 (src2, coeff);
288
289
24.4k
          __m256i  sum  = _mm256_hadd_epi32( src1, src2 );
290
24.4k
          sum = _mm256_permute4x64_epi64(sum,0xD8);
291
292
24.4k
          sum = _mm256_add_epi32( sum, offset );
293
24.4k
          sum = _mm256_srai_epi32( sum, 6 );
294
24.4k
          __m128i dest128 = _mm256_cvtepi32_epi16x( sum );
295
296
24.4k
          if (useCubicFilter)
297
18.8k
            dest128 = _mm_min_epi16( vbdmax, _mm_max_epi16( vbdmin, dest128 ) );
298
299
24.4k
          _mm_storeu_si128( ( __m128i * )(pDst), dest128);
300
24.4k
          deltaPos += intraPredAngle;
301
24.4k
        }
302
1.64k
      }
303
#endif
304
5.71k
    }
305
0
    else
306
0
    {
307
0
      __m128i shflmask1= _mm_set_epi8( 0x9, 0x8, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2,   0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0 );
308
0
      __m128i shflmask2= _mm_set_epi8( 0xd, 0xc, 0xb, 0xa, 0x9, 0x8, 0x7, 0x6,   0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4 );
309
0
      __m128i vbdmin,vbdmax;
310
311
0
      __m128i offset = _mm_set1_epi32( 32 );
312
313
0
      if (useCubicFilter)
314
0
      {
315
0
        vbdmin = _mm_set1_epi16( clpRng.min() );
316
0
        vbdmax = _mm_set1_epi16( clpRng.max() );
317
0
      }
318
319
0
      for (int y = 0; y<height; y++ )
320
0
      {
321
0
        int deltaInt   = deltaPos >> 5;
322
0
        int deltaFract = deltaPos & (32 - 1);
323
0
        int refMainIndex   = deltaInt + 1;
324
0
        pDst=&pDstBuf[y*dstStride];
325
0
        __m128i coeff = _mm_loadu_si64( ( __m128i const * )&ff[deltaFract<<2] );   //load 4 16 bit filter coeffs
326
0
        coeff = _mm_shuffle_epi32(coeff,0x44);
327
0
        for( int x = 0; x < width; x+=8)
328
0
        {
329
0
          __m128i src0 = _mm_loadu_si128( ( __m128i const * )&refMain[refMainIndex - 1] );   //load 8 16 bit reference Pels   -1 0 1 2 3 4 5 6
330
0
          __m128i src1 = _mm_shuffle_epi8(src0,shflmask1);                  // -1 0 1 2  0 1 2 3
331
0
          __m128i src2 = _mm_shuffle_epi8(src0,shflmask2);                  // 1 2 3 4  2 3 4 5
332
0
          src0 = _mm_madd_epi16( coeff,src1 );
333
0
          src1 = _mm_madd_epi16( coeff,src2 );
334
0
          __m128i sum  = _mm_hadd_epi32( src0, src1 );
335
0
          sum = _mm_add_epi32( sum, offset );
336
0
          sum = _mm_srai_epi32( sum, 6 );
337
338
0
          refMainIndex+=4;
339
0
          src0 = _mm_loadu_si128( ( __m128i const * )&refMain[refMainIndex - 1] );   //load 8 16 bit reference Pels   -1 0 1 2 3 4 5 6
340
0
          src1 = _mm_shuffle_epi8(src0,shflmask1);                                // -1 0 1 2  0 1 2 3
341
0
          src2 = _mm_shuffle_epi8(src0,shflmask2);
342
343
          // 1 2 3 4  2 3 4 5
344
0
          src0 = _mm_madd_epi16( coeff,src1 );
345
0
          src1 = _mm_madd_epi16( coeff,src2 );
346
347
0
          __m128i sum1  = _mm_hadd_epi32( src0, src1 );
348
0
          sum1 = _mm_add_epi32( sum1, offset );
349
0
          sum1 = _mm_srai_epi32( sum1, 6 );
350
0
          src0 = _mm_packs_epi32( sum, sum1 );
351
352
0
          refMainIndex+=4;
353
354
0
          if (useCubicFilter)
355
0
            src0 = _mm_min_epi16( vbdmax, _mm_max_epi16( vbdmin, src0 ) );
356
357
0
          _mm_storeu_si128( ( __m128i * )(pDst + x), src0);
358
359
0
        }
360
0
        deltaPos += intraPredAngle;
361
0
      }
362
0
    }
363
5.71k
  }
364
870
  else if( W == 4 )
365
870
  {
366
367
870
    __m128i shflmask1= _mm_set_epi8( 0x9, 0x8, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2,   0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0 );
368
870
    __m128i shflmask2= _mm_set_epi8( 0xd, 0xc, 0xb, 0xa, 0x9, 0x8, 0x7, 0x6,   0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4 );
369
870
    __m128i vbdmin,vbdmax;
370
371
870
    __m128i offset = _mm_set1_epi32( 32 );
372
373
870
    if (useCubicFilter)
374
865
    {
375
865
      vbdmin = _mm_set1_epi16( clpRng.min() );
376
865
      vbdmax = _mm_set1_epi16( clpRng.max() );
377
865
    }
378
379
13.5k
    for (int y = 0; y<height; y++ )
380
12.7k
    {
381
12.7k
      int deltaInt   = deltaPos >> 5;
382
12.7k
      int deltaFract = deltaPos & (32 - 1);
383
12.7k
      int refMainIndex   = deltaInt + 1;
384
12.7k
      pDst=&pDstBuf[y*dstStride];
385
12.7k
      __m128i coeff = _mm_loadu_si64( ( __m128i const * )&ff[deltaFract<<2] );   //load 4 16 bit filter coeffs
386
12.7k
      coeff = _mm_shuffle_epi32(coeff,0x44);
387
12.7k
      {
388
12.7k
        __m128i src0 = _mm_loadu_si128( ( __m128i const * )&refMain[refMainIndex - 1] );   //load 8 16 bit reference Pels   -1 0 1 2 3 4 5 6
389
12.7k
        __m128i src1 = _mm_shuffle_epi8(src0,shflmask1);                  // -1 0 1 2  0 1 2 3
390
12.7k
        __m128i src2 = _mm_shuffle_epi8(src0,shflmask2);                  // 1 2 3 4  2 3 4 5
391
12.7k
        src0 = _mm_madd_epi16( coeff,src1 );
392
12.7k
        src1 = _mm_madd_epi16( coeff,src2 );
393
12.7k
        __m128i sum  = _mm_hadd_epi32( src0, src1 );
394
12.7k
        sum = _mm_add_epi32( sum, offset );
395
12.7k
        sum = _mm_srai_epi32( sum, 6 );
396
397
12.7k
        src0 = _mm_packs_epi32( sum, sum );
398
399
12.7k
        refMainIndex+=4;
400
401
12.7k
        if (useCubicFilter)
402
12.5k
          src0 = _mm_min_epi16( vbdmax, _mm_max_epi16( vbdmin, src0 ) );
403
404
12.7k
        _mm_storeu_si64( ( __m128i * )(pDst ), src0);
405
406
12.7k
      }
407
12.7k
      deltaPos += intraPredAngle;
408
12.7k
    }
409
870
  }
410
0
  else
411
0
  {
412
0
    THROW_FATAL( "Unsupported size in IntraPredAngleCore_SIMD" );
413
0
  }
414
#if USE_AVX2
415
6.58k
  _mm256_zeroupper();
416
6.58k
#endif
417
6.58k
}
Unexecuted instantiation: void vvdec::IntraPredAngleCore_SIMD<(vvdec::x86_simd::X86_VEXT)1, 4>(short*, long, short*, int, int, int, int, short const*, bool, vvdec::ClpRngTemplate<short> const&)
Unexecuted instantiation: void vvdec::IntraPredAngleCore_SIMD<(vvdec::x86_simd::X86_VEXT)1, 8>(short*, long, short*, int, int, int, int, short const*, bool, vvdec::ClpRngTemplate<short> const&)
void vvdec::IntraPredAngleCore_SIMD<(vvdec::x86_simd::X86_VEXT)4, 4>(short*, long, short*, int, int, int, int, short const*, bool, vvdec::ClpRngTemplate<short> const&)
Line
Count
Source
184
870
{
185
870
  int16_t* pDst;
186
187
870
  if( W == 8 )
188
0
  {
189
0
    if( vext >= AVX2 )
190
0
    {
191
0
#ifdef USE_AVX2
192
0
      __m256i shflmask1= _mm256_set_epi8(0xd, 0xc, 0xb, 0xa, 0x9, 0x8, 0x7, 0x6,   0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4,
193
0
          0x9, 0x8, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2,   0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0 );
194
0
      __m256i offset = _mm256_set1_epi32( 32 );
195
196
0
      if (( width & 15 ) == 0 )
197
0
      {
198
0
        __m256i vbdmin,vbdmax;
199
200
0
        if (useCubicFilter)
201
0
        {
202
0
          vbdmin = _mm256_set1_epi16( clpRng.min() );
203
0
          vbdmax = _mm256_set1_epi16( clpRng.max() );
204
0
        }
205
206
0
        for (int y = 0; y<height; y++ )
207
0
        {
208
0
          int deltaInt   = deltaPos >> 5;
209
0
          int deltaFract = deltaPos & (32 - 1);
210
0
          int refMainIndex   = deltaInt + 1;
211
0
          pDst=&pDstBuf[y*dstStride];
212
0
          __m128i tmp = _mm_loadu_si64( ( __m128i const * )&ff[deltaFract<<2] );   //load 4 16 bit filter coeffs
213
0
          tmp = _mm_shuffle_epi32(tmp,0x44);
214
0
          __m256i coeff = _mm256_broadcastsi128_si256(tmp);
215
0
          for( int x = 0; x < width; x+=16)
216
0
          {
217
0
            __m256i src1 = _mm256_broadcastsi128_si256( _mm_loadu_si128( ( const __m128i* ) &refMain[refMainIndex     - 1] ) );
218
0
            __m256i src2 = _mm256_broadcastsi128_si256( _mm_loadu_si128( ( const __m128i* ) &refMain[refMainIndex + 4 - 1] ) );
219
0
            src1 = _mm256_shuffle_epi8(src1,shflmask1);                 // -1 0 1 2  0 1 2 3 1 2 3 4  2 3 4 5
220
0
            src2 = _mm256_shuffle_epi8(src2,shflmask1);                 // 3 4 5 6  4 5 6 7  5 6 7 8 6 7 8 9
221
222
0
            src1 = _mm256_madd_epi16 (src1, coeff);
223
0
            src2 = _mm256_madd_epi16 (src2, coeff);
224
225
0
            __m256i  sum  = _mm256_hadd_epi32( src1, src2 );
226
0
            sum = _mm256_permute4x64_epi64(sum,0xD8);
227
228
0
            sum = _mm256_add_epi32( sum, offset );
229
0
            sum = _mm256_srai_epi32( sum, 6 );
230
231
0
            refMainIndex+=8;
232
            
233
0
            src1 = _mm256_broadcastsi128_si256( _mm_loadu_si128( ( __m128i const* ) &refMain[refMainIndex     - 1] ) );
234
0
            src2 = _mm256_broadcastsi128_si256( _mm_loadu_si128( ( __m128i const* ) &refMain[refMainIndex + 4 - 1] ) );
235
236
0
            src1 = _mm256_shuffle_epi8(src1,shflmask1);                 // -1 0 1 2  0 1 2 3 1 2 3 4  2 3 4 5
237
0
            src2 = _mm256_shuffle_epi8(src2,shflmask1);                 // 3 4 5 6  4 5 6 7  5 6 7 8 6 7 8 9
238
0
            src1 = _mm256_madd_epi16 (src1, coeff);
239
0
            src2 = _mm256_madd_epi16 (src2, coeff);
240
241
0
            __m256i  sum1  = _mm256_hadd_epi32( src1, src2 );
242
0
            sum1 = _mm256_permute4x64_epi64(sum1,0xD8);
243
244
0
            sum1 = _mm256_add_epi32( sum1, offset );
245
0
            sum1 = _mm256_srai_epi32( sum1, 6 );
246
0
            __m256i
247
0
            src0 = _mm256_packs_epi32( sum, sum1 );
248
249
0
            src0 = _mm256_permute4x64_epi64(src0,0xD8);
250
251
0
            refMainIndex+=8;
252
253
0
            if (useCubicFilter)
254
0
              src0 = _mm256_min_epi16( vbdmax, _mm256_max_epi16( vbdmin, src0 ) );
255
256
0
            _mm256_storeu_si256( ( __m256i * )(pDst + x), src0);
257
0
          }
258
0
          deltaPos += intraPredAngle;
259
0
        }
260
0
      }
261
0
      else // width =8
262
0
      {
263
        //        printf("AVX2 Block %d \n",width);
264
0
        __m128i vbdmin,vbdmax;
265
266
0
        if (useCubicFilter)
267
0
        {
268
0
          vbdmin = _mm_set1_epi16( clpRng.min() );
269
0
          vbdmax = _mm_set1_epi16( clpRng.max() );
270
0
        }
271
272
0
        for (int y = 0; y<height; y++ )
273
0
        {
274
0
          int deltaInt   = deltaPos >> 5;
275
0
          int deltaFract = deltaPos & (32 - 1);
276
0
          int refMainIndex   = deltaInt + 1;
277
0
          pDst=&pDstBuf[y*dstStride];
278
0
          __m128i tmp = _mm_loadu_si64( ( __m128i const * )&ff[deltaFract<<2] );   //load 4 16 bit filter coeffs
279
0
          tmp = _mm_shuffle_epi32(tmp,0x44);
280
0
          __m256i coeff = _mm256_broadcastsi128_si256(tmp);
281
0
          __m256i src1 = _mm256_broadcastsi128_si256( _mm_loadu_si128( ( __m128i const* ) & refMain[refMainIndex - 1] ) );
282
0
          __m256i src2 = _mm256_broadcastsi128_si256( _mm_loadu_si128( ( __m128i const* ) & refMain[refMainIndex + 4 - 1] ) );
283
0
          src1 = _mm256_shuffle_epi8(src1,shflmask1);                 // -1 0 1 2  0 1 2 3 1 2 3 4  2 3 4 5
284
0
          src2 = _mm256_shuffle_epi8(src2,shflmask1);                 // 3 4 5 6  4 5 6 7  5 6 7 8 6 7 8 9
285
286
0
          src1 = _mm256_madd_epi16 (src1, coeff);
287
0
          src2 = _mm256_madd_epi16 (src2, coeff);
288
289
0
          __m256i  sum  = _mm256_hadd_epi32( src1, src2 );
290
0
          sum = _mm256_permute4x64_epi64(sum,0xD8);
291
292
0
          sum = _mm256_add_epi32( sum, offset );
293
0
          sum = _mm256_srai_epi32( sum, 6 );
294
0
          __m128i dest128 = _mm256_cvtepi32_epi16x( sum );
295
296
0
          if (useCubicFilter)
297
0
            dest128 = _mm_min_epi16( vbdmax, _mm_max_epi16( vbdmin, dest128 ) );
298
299
0
          _mm_storeu_si128( ( __m128i * )(pDst), dest128);
300
0
          deltaPos += intraPredAngle;
301
0
        }
302
0
      }
303
0
#endif
304
0
    }
305
0
    else
306
0
    {
307
0
      __m128i shflmask1= _mm_set_epi8( 0x9, 0x8, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2,   0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0 );
308
0
      __m128i shflmask2= _mm_set_epi8( 0xd, 0xc, 0xb, 0xa, 0x9, 0x8, 0x7, 0x6,   0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4 );
309
0
      __m128i vbdmin,vbdmax;
310
311
0
      __m128i offset = _mm_set1_epi32( 32 );
312
313
0
      if (useCubicFilter)
314
0
      {
315
0
        vbdmin = _mm_set1_epi16( clpRng.min() );
316
0
        vbdmax = _mm_set1_epi16( clpRng.max() );
317
0
      }
318
319
0
      for (int y = 0; y<height; y++ )
320
0
      {
321
0
        int deltaInt   = deltaPos >> 5;
322
0
        int deltaFract = deltaPos & (32 - 1);
323
0
        int refMainIndex   = deltaInt + 1;
324
0
        pDst=&pDstBuf[y*dstStride];
325
0
        __m128i coeff = _mm_loadu_si64( ( __m128i const * )&ff[deltaFract<<2] );   //load 4 16 bit filter coeffs
326
0
        coeff = _mm_shuffle_epi32(coeff,0x44);
327
0
        for( int x = 0; x < width; x+=8)
328
0
        {
329
0
          __m128i src0 = _mm_loadu_si128( ( __m128i const * )&refMain[refMainIndex - 1] );   //load 8 16 bit reference Pels   -1 0 1 2 3 4 5 6
330
0
          __m128i src1 = _mm_shuffle_epi8(src0,shflmask1);                  // -1 0 1 2  0 1 2 3
331
0
          __m128i src2 = _mm_shuffle_epi8(src0,shflmask2);                  // 1 2 3 4  2 3 4 5
332
0
          src0 = _mm_madd_epi16( coeff,src1 );
333
0
          src1 = _mm_madd_epi16( coeff,src2 );
334
0
          __m128i sum  = _mm_hadd_epi32( src0, src1 );
335
0
          sum = _mm_add_epi32( sum, offset );
336
0
          sum = _mm_srai_epi32( sum, 6 );
337
338
0
          refMainIndex+=4;
339
0
          src0 = _mm_loadu_si128( ( __m128i const * )&refMain[refMainIndex - 1] );   //load 8 16 bit reference Pels   -1 0 1 2 3 4 5 6
340
0
          src1 = _mm_shuffle_epi8(src0,shflmask1);                                // -1 0 1 2  0 1 2 3
341
0
          src2 = _mm_shuffle_epi8(src0,shflmask2);
342
343
          // 1 2 3 4  2 3 4 5
344
0
          src0 = _mm_madd_epi16( coeff,src1 );
345
0
          src1 = _mm_madd_epi16( coeff,src2 );
346
347
0
          __m128i sum1  = _mm_hadd_epi32( src0, src1 );
348
0
          sum1 = _mm_add_epi32( sum1, offset );
349
0
          sum1 = _mm_srai_epi32( sum1, 6 );
350
0
          src0 = _mm_packs_epi32( sum, sum1 );
351
352
0
          refMainIndex+=4;
353
354
0
          if (useCubicFilter)
355
0
            src0 = _mm_min_epi16( vbdmax, _mm_max_epi16( vbdmin, src0 ) );
356
357
0
          _mm_storeu_si128( ( __m128i * )(pDst + x), src0);
358
359
0
        }
360
0
        deltaPos += intraPredAngle;
361
0
      }
362
0
    }
363
0
  }
364
870
  else if( W == 4 )
365
870
  {
366
367
870
    __m128i shflmask1= _mm_set_epi8( 0x9, 0x8, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2,   0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0 );
368
870
    __m128i shflmask2= _mm_set_epi8( 0xd, 0xc, 0xb, 0xa, 0x9, 0x8, 0x7, 0x6,   0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4 );
369
870
    __m128i vbdmin,vbdmax;
370
371
870
    __m128i offset = _mm_set1_epi32( 32 );
372
373
870
    if (useCubicFilter)
374
865
    {
375
865
      vbdmin = _mm_set1_epi16( clpRng.min() );
376
865
      vbdmax = _mm_set1_epi16( clpRng.max() );
377
865
    }
378
379
13.5k
    for (int y = 0; y<height; y++ )
380
12.7k
    {
381
12.7k
      int deltaInt   = deltaPos >> 5;
382
12.7k
      int deltaFract = deltaPos & (32 - 1);
383
12.7k
      int refMainIndex   = deltaInt + 1;
384
12.7k
      pDst=&pDstBuf[y*dstStride];
385
12.7k
      __m128i coeff = _mm_loadu_si64( ( __m128i const * )&ff[deltaFract<<2] );   //load 4 16 bit filter coeffs
386
12.7k
      coeff = _mm_shuffle_epi32(coeff,0x44);
387
12.7k
      {
388
12.7k
        __m128i src0 = _mm_loadu_si128( ( __m128i const * )&refMain[refMainIndex - 1] );   //load 8 16 bit reference Pels   -1 0 1 2 3 4 5 6
389
12.7k
        __m128i src1 = _mm_shuffle_epi8(src0,shflmask1);                  // -1 0 1 2  0 1 2 3
390
12.7k
        __m128i src2 = _mm_shuffle_epi8(src0,shflmask2);                  // 1 2 3 4  2 3 4 5
391
12.7k
        src0 = _mm_madd_epi16( coeff,src1 );
392
12.7k
        src1 = _mm_madd_epi16( coeff,src2 );
393
12.7k
        __m128i sum  = _mm_hadd_epi32( src0, src1 );
394
12.7k
        sum = _mm_add_epi32( sum, offset );
395
12.7k
        sum = _mm_srai_epi32( sum, 6 );
396
397
12.7k
        src0 = _mm_packs_epi32( sum, sum );
398
399
12.7k
        refMainIndex+=4;
400
401
12.7k
        if (useCubicFilter)
402
12.5k
          src0 = _mm_min_epi16( vbdmax, _mm_max_epi16( vbdmin, src0 ) );
403
404
12.7k
        _mm_storeu_si64( ( __m128i * )(pDst ), src0);
405
406
12.7k
      }
407
12.7k
      deltaPos += intraPredAngle;
408
12.7k
    }
409
870
  }
410
0
  else
411
0
  {
412
0
    THROW_FATAL( "Unsupported size in IntraPredAngleCore_SIMD" );
413
0
  }
414
870
#if USE_AVX2
415
870
  _mm256_zeroupper();
416
870
#endif
417
870
}
void vvdec::IntraPredAngleCore_SIMD<(vvdec::x86_simd::X86_VEXT)4, 8>(short*, long, short*, int, int, int, int, short const*, bool, vvdec::ClpRngTemplate<short> const&)
Line
Count
Source
184
5.71k
{
185
5.71k
  int16_t* pDst;
186
187
5.71k
  if( W == 8 )
188
5.71k
  {
189
5.71k
    if( vext >= AVX2 )
190
5.71k
    {
191
5.71k
#ifdef USE_AVX2
192
5.71k
      __m256i shflmask1= _mm256_set_epi8(0xd, 0xc, 0xb, 0xa, 0x9, 0x8, 0x7, 0x6,   0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4,
193
5.71k
          0x9, 0x8, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2,   0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0 );
194
5.71k
      __m256i offset = _mm256_set1_epi32( 32 );
195
196
5.71k
      if (( width & 15 ) == 0 )
197
4.07k
      {
198
4.07k
        __m256i vbdmin,vbdmax;
199
200
4.07k
        if (useCubicFilter)
201
1.86k
        {
202
1.86k
          vbdmin = _mm256_set1_epi16( clpRng.min() );
203
1.86k
          vbdmax = _mm256_set1_epi16( clpRng.max() );
204
1.86k
        }
205
206
87.9k
        for (int y = 0; y<height; y++ )
207
83.8k
        {
208
83.8k
          int deltaInt   = deltaPos >> 5;
209
83.8k
          int deltaFract = deltaPos & (32 - 1);
210
83.8k
          int refMainIndex   = deltaInt + 1;
211
83.8k
          pDst=&pDstBuf[y*dstStride];
212
83.8k
          __m128i tmp = _mm_loadu_si64( ( __m128i const * )&ff[deltaFract<<2] );   //load 4 16 bit filter coeffs
213
83.8k
          tmp = _mm_shuffle_epi32(tmp,0x44);
214
83.8k
          __m256i coeff = _mm256_broadcastsi128_si256(tmp);
215
280k
          for( int x = 0; x < width; x+=16)
216
196k
          {
217
196k
            __m256i src1 = _mm256_broadcastsi128_si256( _mm_loadu_si128( ( const __m128i* ) &refMain[refMainIndex     - 1] ) );
218
196k
            __m256i src2 = _mm256_broadcastsi128_si256( _mm_loadu_si128( ( const __m128i* ) &refMain[refMainIndex + 4 - 1] ) );
219
196k
            src1 = _mm256_shuffle_epi8(src1,shflmask1);                 // -1 0 1 2  0 1 2 3 1 2 3 4  2 3 4 5
220
196k
            src2 = _mm256_shuffle_epi8(src2,shflmask1);                 // 3 4 5 6  4 5 6 7  5 6 7 8 6 7 8 9
221
222
196k
            src1 = _mm256_madd_epi16 (src1, coeff);
223
196k
            src2 = _mm256_madd_epi16 (src2, coeff);
224
225
196k
            __m256i  sum  = _mm256_hadd_epi32( src1, src2 );
226
196k
            sum = _mm256_permute4x64_epi64(sum,0xD8);
227
228
196k
            sum = _mm256_add_epi32( sum, offset );
229
196k
            sum = _mm256_srai_epi32( sum, 6 );
230
231
196k
            refMainIndex+=8;
232
            
233
196k
            src1 = _mm256_broadcastsi128_si256( _mm_loadu_si128( ( __m128i const* ) &refMain[refMainIndex     - 1] ) );
234
196k
            src2 = _mm256_broadcastsi128_si256( _mm_loadu_si128( ( __m128i const* ) &refMain[refMainIndex + 4 - 1] ) );
235
236
196k
            src1 = _mm256_shuffle_epi8(src1,shflmask1);                 // -1 0 1 2  0 1 2 3 1 2 3 4  2 3 4 5
237
196k
            src2 = _mm256_shuffle_epi8(src2,shflmask1);                 // 3 4 5 6  4 5 6 7  5 6 7 8 6 7 8 9
238
196k
            src1 = _mm256_madd_epi16 (src1, coeff);
239
196k
            src2 = _mm256_madd_epi16 (src2, coeff);
240
241
196k
            __m256i  sum1  = _mm256_hadd_epi32( src1, src2 );
242
196k
            sum1 = _mm256_permute4x64_epi64(sum1,0xD8);
243
244
196k
            sum1 = _mm256_add_epi32( sum1, offset );
245
196k
            sum1 = _mm256_srai_epi32( sum1, 6 );
246
196k
            __m256i
247
196k
            src0 = _mm256_packs_epi32( sum, sum1 );
248
249
196k
            src0 = _mm256_permute4x64_epi64(src0,0xD8);
250
251
196k
            refMainIndex+=8;
252
253
196k
            if (useCubicFilter)
254
44.4k
              src0 = _mm256_min_epi16( vbdmax, _mm256_max_epi16( vbdmin, src0 ) );
255
256
196k
            _mm256_storeu_si256( ( __m256i * )(pDst + x), src0);
257
196k
          }
258
83.8k
          deltaPos += intraPredAngle;
259
83.8k
        }
260
4.07k
      }
261
1.64k
      else // width =8
262
1.64k
      {
263
        //        printf("AVX2 Block %d \n",width);
264
1.64k
        __m128i vbdmin,vbdmax;
265
266
1.64k
        if (useCubicFilter)
267
1.45k
        {
268
1.45k
          vbdmin = _mm_set1_epi16( clpRng.min() );
269
1.45k
          vbdmax = _mm_set1_epi16( clpRng.max() );
270
1.45k
        }
271
272
26.1k
        for (int y = 0; y<height; y++ )
273
24.4k
        {
274
24.4k
          int deltaInt   = deltaPos >> 5;
275
24.4k
          int deltaFract = deltaPos & (32 - 1);
276
24.4k
          int refMainIndex   = deltaInt + 1;
277
24.4k
          pDst=&pDstBuf[y*dstStride];
278
24.4k
          __m128i tmp = _mm_loadu_si64( ( __m128i const * )&ff[deltaFract<<2] );   //load 4 16 bit filter coeffs
279
24.4k
          tmp = _mm_shuffle_epi32(tmp,0x44);
280
24.4k
          __m256i coeff = _mm256_broadcastsi128_si256(tmp);
281
24.4k
          __m256i src1 = _mm256_broadcastsi128_si256( _mm_loadu_si128( ( __m128i const* ) & refMain[refMainIndex - 1] ) );
282
24.4k
          __m256i src2 = _mm256_broadcastsi128_si256( _mm_loadu_si128( ( __m128i const* ) & refMain[refMainIndex + 4 - 1] ) );
283
24.4k
          src1 = _mm256_shuffle_epi8(src1,shflmask1);                 // -1 0 1 2  0 1 2 3 1 2 3 4  2 3 4 5
284
24.4k
          src2 = _mm256_shuffle_epi8(src2,shflmask1);                 // 3 4 5 6  4 5 6 7  5 6 7 8 6 7 8 9
285
286
24.4k
          src1 = _mm256_madd_epi16 (src1, coeff);
287
24.4k
          src2 = _mm256_madd_epi16 (src2, coeff);
288
289
24.4k
          __m256i  sum  = _mm256_hadd_epi32( src1, src2 );
290
24.4k
          sum = _mm256_permute4x64_epi64(sum,0xD8);
291
292
24.4k
          sum = _mm256_add_epi32( sum, offset );
293
24.4k
          sum = _mm256_srai_epi32( sum, 6 );
294
24.4k
          __m128i dest128 = _mm256_cvtepi32_epi16x( sum );
295
296
24.4k
          if (useCubicFilter)
297
18.8k
            dest128 = _mm_min_epi16( vbdmax, _mm_max_epi16( vbdmin, dest128 ) );
298
299
24.4k
          _mm_storeu_si128( ( __m128i * )(pDst), dest128);
300
24.4k
          deltaPos += intraPredAngle;
301
24.4k
        }
302
1.64k
      }
303
5.71k
#endif
304
5.71k
    }
305
0
    else
306
0
    {
307
0
      __m128i shflmask1= _mm_set_epi8( 0x9, 0x8, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2,   0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0 );
308
0
      __m128i shflmask2= _mm_set_epi8( 0xd, 0xc, 0xb, 0xa, 0x9, 0x8, 0x7, 0x6,   0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4 );
309
0
      __m128i vbdmin,vbdmax;
310
311
0
      __m128i offset = _mm_set1_epi32( 32 );
312
313
0
      if (useCubicFilter)
314
0
      {
315
0
        vbdmin = _mm_set1_epi16( clpRng.min() );
316
0
        vbdmax = _mm_set1_epi16( clpRng.max() );
317
0
      }
318
319
0
      for (int y = 0; y<height; y++ )
320
0
      {
321
0
        int deltaInt   = deltaPos >> 5;
322
0
        int deltaFract = deltaPos & (32 - 1);
323
0
        int refMainIndex   = deltaInt + 1;
324
0
        pDst=&pDstBuf[y*dstStride];
325
0
        __m128i coeff = _mm_loadu_si64( ( __m128i const * )&ff[deltaFract<<2] );   //load 4 16 bit filter coeffs
326
0
        coeff = _mm_shuffle_epi32(coeff,0x44);
327
0
        for( int x = 0; x < width; x+=8)
328
0
        {
329
0
          __m128i src0 = _mm_loadu_si128( ( __m128i const * )&refMain[refMainIndex - 1] );   //load 8 16 bit reference Pels   -1 0 1 2 3 4 5 6
330
0
          __m128i src1 = _mm_shuffle_epi8(src0,shflmask1);                  // -1 0 1 2  0 1 2 3
331
0
          __m128i src2 = _mm_shuffle_epi8(src0,shflmask2);                  // 1 2 3 4  2 3 4 5
332
0
          src0 = _mm_madd_epi16( coeff,src1 );
333
0
          src1 = _mm_madd_epi16( coeff,src2 );
334
0
          __m128i sum  = _mm_hadd_epi32( src0, src1 );
335
0
          sum = _mm_add_epi32( sum, offset );
336
0
          sum = _mm_srai_epi32( sum, 6 );
337
338
0
          refMainIndex+=4;
339
0
          src0 = _mm_loadu_si128( ( __m128i const * )&refMain[refMainIndex - 1] );   //load 8 16 bit reference Pels   -1 0 1 2 3 4 5 6
340
0
          src1 = _mm_shuffle_epi8(src0,shflmask1);                                // -1 0 1 2  0 1 2 3
341
0
          src2 = _mm_shuffle_epi8(src0,shflmask2);
342
343
          // 1 2 3 4  2 3 4 5
344
0
          src0 = _mm_madd_epi16( coeff,src1 );
345
0
          src1 = _mm_madd_epi16( coeff,src2 );
346
347
0
          __m128i sum1  = _mm_hadd_epi32( src0, src1 );
348
0
          sum1 = _mm_add_epi32( sum1, offset );
349
0
          sum1 = _mm_srai_epi32( sum1, 6 );
350
0
          src0 = _mm_packs_epi32( sum, sum1 );
351
352
0
          refMainIndex+=4;
353
354
0
          if (useCubicFilter)
355
0
            src0 = _mm_min_epi16( vbdmax, _mm_max_epi16( vbdmin, src0 ) );
356
357
0
          _mm_storeu_si128( ( __m128i * )(pDst + x), src0);
358
359
0
        }
360
0
        deltaPos += intraPredAngle;
361
0
      }
362
0
    }
363
5.71k
  }
364
0
  else if( W == 4 )
365
0
  {
366
367
0
    __m128i shflmask1= _mm_set_epi8( 0x9, 0x8, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2,   0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0 );
368
0
    __m128i shflmask2= _mm_set_epi8( 0xd, 0xc, 0xb, 0xa, 0x9, 0x8, 0x7, 0x6,   0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4 );
369
0
    __m128i vbdmin,vbdmax;
370
371
0
    __m128i offset = _mm_set1_epi32( 32 );
372
373
0
    if (useCubicFilter)
374
0
    {
375
0
      vbdmin = _mm_set1_epi16( clpRng.min() );
376
0
      vbdmax = _mm_set1_epi16( clpRng.max() );
377
0
    }
378
379
0
    for (int y = 0; y<height; y++ )
380
0
    {
381
0
      int deltaInt   = deltaPos >> 5;
382
0
      int deltaFract = deltaPos & (32 - 1);
383
0
      int refMainIndex   = deltaInt + 1;
384
0
      pDst=&pDstBuf[y*dstStride];
385
0
      __m128i coeff = _mm_loadu_si64( ( __m128i const * )&ff[deltaFract<<2] );   //load 4 16 bit filter coeffs
386
0
      coeff = _mm_shuffle_epi32(coeff,0x44);
387
0
      {
388
0
        __m128i src0 = _mm_loadu_si128( ( __m128i const * )&refMain[refMainIndex - 1] );   //load 8 16 bit reference Pels   -1 0 1 2 3 4 5 6
389
0
        __m128i src1 = _mm_shuffle_epi8(src0,shflmask1);                  // -1 0 1 2  0 1 2 3
390
0
        __m128i src2 = _mm_shuffle_epi8(src0,shflmask2);                  // 1 2 3 4  2 3 4 5
391
0
        src0 = _mm_madd_epi16( coeff,src1 );
392
0
        src1 = _mm_madd_epi16( coeff,src2 );
393
0
        __m128i sum  = _mm_hadd_epi32( src0, src1 );
394
0
        sum = _mm_add_epi32( sum, offset );
395
0
        sum = _mm_srai_epi32( sum, 6 );
396
397
0
        src0 = _mm_packs_epi32( sum, sum );
398
399
0
        refMainIndex+=4;
400
401
0
        if (useCubicFilter)
402
0
          src0 = _mm_min_epi16( vbdmax, _mm_max_epi16( vbdmin, src0 ) );
403
404
0
        _mm_storeu_si64( ( __m128i * )(pDst ), src0);
405
406
0
      }
407
0
      deltaPos += intraPredAngle;
408
0
    }
409
0
  }
410
0
  else
411
0
  {
412
0
    THROW_FATAL( "Unsupported size in IntraPredAngleCore_SIMD" );
413
0
  }
414
5.71k
#if USE_AVX2
415
5.71k
  _mm256_zeroupper();
416
5.71k
#endif
417
5.71k
}
418
419
template< X86_VEXT vext, int W >
420
void  IntraPredSampleFilter_SIMD(Pel *ptrSrc,const ptrdiff_t srcStride,PelBuf &piPred,const uint32_t uiDirMode,const ClpRng& clpRng)
421
13.5k
{
422
13.5k
  const int       iWidth    = piPred.width;
423
13.5k
  const int       iHeight   = piPred.height;
424
13.5k
  PelBuf          dstBuf    = piPred;
425
13.5k
  Pel*            pDst      = dstBuf.buf;
426
13.5k
  const ptrdiff_t dstStride = dstBuf.stride;
427
428
13.5k
  const int scale = ((getLog2(iWidth) - 2 + getLog2(iHeight) - 2 + 2) >> 2);
429
13.5k
  CHECK(scale < 0 || scale > 31, "PDPC: scale < 0 || scale > 2");
430
431
#if USE_AVX2
432
13.5k
  if( W > 8 )
433
8.55k
  {
434
8.55k
    __m256i tmplo,tmphi;
435
8.55k
    __m256i w64 = _mm256_set_epi16(64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64);
436
8.55k
    __m256i w32 = _mm256_set_epi32(32,32,32,32,32,32,32,32);
437
8.55k
    __m256i vbdmin   = _mm256_set1_epi32( clpRng.min() );
438
8.55k
    __m256i vbdmax   = _mm256_set1_epi32( clpRng.max() );
439
8.55k
    __m256i wl16;
440
8.55k
    __m256i wl16start;
441
442
8.55k
    if (scale==0)
443
0
    {
444
0
      wl16start = _mm256_set_epi16(0,0,0,0,0,0,0,0,0,0,0,0,0,2,8,32);
445
0
    }
446
8.55k
    else if (scale==1)
447
5.65k
    {
448
5.65k
      wl16start = _mm256_set_epi16(0,0,0,0,0,0,0,0,0,0,1,2,4,8,16,32);
449
5.65k
    }
450
2.89k
    else if (scale==2)
451
2.89k
    {
452
2.89k
      wl16start = _mm256_set_epi16(0,0,0,0,1,1,2,2,4,4,8,8,16,16,32,32);
453
2.89k
    }
454
0
    else
455
0
    {
456
0
      THROW_FATAL( "Wrong scale (" << scale << ")" );
457
0
    }
458
459
460
8.55k
    if (uiDirMode == PLANAR_IDX || uiDirMode == DC_IDX )
461
8.55k
    {
462
210k
      for (int y = 0; y < iHeight; y++)
463
201k
      {
464
201k
        int wT = 32 >> std::min(31, ((y << 1) >> scale));
465
466
201k
        __m256i wt16 = _mm256_set_epi16(wT,wT,wT,wT,wT,wT,wT,wT,wT,wT,wT,wT,wT,wT,wT,wT);
467
201k
        __m256i x16left = _mm256_broadcastw_epi16(_mm_loadu_si128 ((__m128i const *) (ptrSrc+((y+1)*srcStride))));
468
469
201k
        if (wT)
470
67.1k
        {
471
195k
          for (int x = 0; x < iWidth; x+=16)
472
128k
          {
473
128k
            if (x==0)
474
67.1k
            {
475
67.1k
              wl16=wl16start;
476
477
67.1k
              __m256i x16top = _mm256_loadu_si256((__m256i *) (ptrSrc+x+1)); // load top
478
67.1k
              __m256i x16dst = _mm256_loadu_si256((const __m256i *) (pDst+y*dstStride+x)); // load dst
479
480
67.1k
              tmplo = _mm256_mullo_epi16(x16left,wl16);  //wL * left
481
67.1k
              tmphi = _mm256_mulhi_epi16(x16left,wl16);  //wL * left
482
67.1k
              __m256i leftlo = _mm256_unpacklo_epi16(tmplo,tmphi);
483
67.1k
              __m256i lefthi = _mm256_unpackhi_epi16(tmplo,tmphi);
484
485
67.1k
              tmplo = _mm256_mullo_epi16(x16top,wt16);    // wT*top
486
67.1k
              tmphi = _mm256_mulhi_epi16(x16top,wt16);    // wT*top
487
67.1k
              __m256i toplo = _mm256_unpacklo_epi16(tmplo,tmphi);
488
67.1k
              __m256i tophi = _mm256_unpackhi_epi16(tmplo,tmphi);
489
490
67.1k
              __m256i wX = _mm256_sub_epi16(w64,wl16);
491
67.1k
              wX = _mm256_sub_epi16(wX,wt16);            // 64-wL-wT
492
67.1k
              tmplo = _mm256_mullo_epi16(x16dst,wX);    // 64-wL-wT*dst
493
67.1k
              tmphi = _mm256_mulhi_epi16(x16dst,wX);    // 64-wL-wT*dst
494
67.1k
              __m256i dstlo = _mm256_unpacklo_epi16(tmplo,tmphi);
495
67.1k
              __m256i dsthi = _mm256_unpackhi_epi16(tmplo,tmphi);
496
497
67.1k
              dstlo = _mm256_add_epi32(dstlo,toplo);
498
67.1k
              dsthi = _mm256_add_epi32(dsthi,tophi);
499
67.1k
              dstlo = _mm256_add_epi32(dstlo,leftlo);
500
67.1k
              dsthi = _mm256_add_epi32(dsthi,lefthi);
501
67.1k
              dstlo = _mm256_add_epi32(dstlo,w32);
502
67.1k
              dsthi = _mm256_add_epi32(dsthi,w32);
503
504
67.1k
              dstlo =  _mm256_srai_epi32(dstlo,6);
505
67.1k
              dsthi =  _mm256_srai_epi32(dsthi,6);
506
507
67.1k
              dstlo =  _mm256_max_epi32(vbdmin,dstlo);
508
67.1k
              dsthi =  _mm256_max_epi32(vbdmin,dsthi);
509
67.1k
              dstlo =  _mm256_min_epi32(vbdmax,dstlo);
510
67.1k
              dsthi =  _mm256_min_epi32(vbdmax,dsthi);
511
512
67.1k
              dstlo =  _mm256_packs_epi32(dstlo,dsthi);
513
67.1k
              dstlo =  _mm256_permute4x64_epi64 ( dstlo, ( 0 << 0 ) + ( 1 << 2 ) + ( 2 << 4 ) + ( 3 << 6 ) );
514
515
67.1k
              _mm256_storeu_si256( ( __m256i * )(pDst+y*dstStride+x), dstlo );
516
67.1k
            }
517
61.3k
            else
518
61.3k
            {
519
520
61.3k
              __m256i x16top = _mm256_loadu_si256((__m256i *) (ptrSrc+x+1)); // load top
521
61.3k
              __m256i x16dst = _mm256_loadu_si256((const __m256i *) (pDst+y*dstStride+x)); // load dst
522
523
524
61.3k
              tmplo = _mm256_mullo_epi16(x16top,wt16);    // wT*top
525
61.3k
              tmphi = _mm256_mulhi_epi16(x16top,wt16);    // wT*top
526
61.3k
              __m256i toplo = _mm256_unpacklo_epi16(tmplo,tmphi);
527
61.3k
              __m256i tophi = _mm256_unpackhi_epi16(tmplo,tmphi);
528
529
61.3k
              __m256i wX = _mm256_sub_epi16(w64,wt16);
530
61.3k
              tmplo = _mm256_mullo_epi16(x16dst,wX);    // 64-wL-wT*dst
531
61.3k
              tmphi = _mm256_mulhi_epi16(x16dst,wX);    // 64-wL-wT*dst
532
61.3k
              __m256i dstlo = _mm256_unpacklo_epi16(tmplo,tmphi);
533
61.3k
              __m256i dsthi = _mm256_unpackhi_epi16(tmplo,tmphi);
534
535
61.3k
              dstlo = _mm256_add_epi32(dstlo,toplo);
536
61.3k
              dsthi = _mm256_add_epi32(dsthi,tophi);
537
61.3k
              dstlo = _mm256_add_epi32(dstlo,w32);
538
61.3k
              dsthi = _mm256_add_epi32(dsthi,w32);
539
540
61.3k
              dstlo =  _mm256_srai_epi32(dstlo,6);
541
61.3k
              dsthi =  _mm256_srai_epi32(dsthi,6);
542
543
61.3k
              dstlo =  _mm256_max_epi32(vbdmin,dstlo);
544
61.3k
              dsthi =  _mm256_max_epi32(vbdmin,dsthi);
545
61.3k
              dstlo =  _mm256_min_epi32(vbdmax,dstlo);
546
61.3k
              dsthi =  _mm256_min_epi32(vbdmax,dsthi);
547
548
61.3k
              dstlo =  _mm256_packs_epi32(dstlo,dsthi);
549
61.3k
              dstlo =  _mm256_permute4x64_epi64 ( dstlo, ( 0 << 0 ) + ( 1 << 2 ) + ( 2 << 4 ) + ( 3 << 6 ) );
550
551
61.3k
              _mm256_storeu_si256( ( __m256i * )(pDst+y*dstStride+x), dstlo );
552
61.3k
            }
553
554
128k
          }  // for
555
67.1k
        }
556
134k
        else
557
134k
        { // wT =0
558
559
134k
          wl16=wl16start;
560
134k
          __m256i x16dst = _mm256_loadu_si256((const __m256i *) (pDst+y*dstStride)); // load dst
561
562
134k
          tmplo = _mm256_mullo_epi16(x16left,wl16);  //wL * left
563
134k
          tmphi = _mm256_mulhi_epi16(x16left,wl16);  //wL * left
564
134k
          __m256i leftlo = _mm256_unpacklo_epi16(tmplo,tmphi);
565
134k
          __m256i lefthi = _mm256_unpackhi_epi16(tmplo,tmphi);
566
567
134k
          __m256i wX = _mm256_sub_epi16(w64,wl16);
568
134k
          tmplo = _mm256_mullo_epi16(x16dst,wX);    // 64-wL-wT*dst
569
134k
          tmphi = _mm256_mulhi_epi16(x16dst,wX);    // 64-wL-wT*dst
570
134k
          __m256i dstlo = _mm256_unpacklo_epi16(tmplo,tmphi);
571
134k
          __m256i dsthi = _mm256_unpackhi_epi16(tmplo,tmphi);
572
573
134k
          dstlo = _mm256_add_epi32(dstlo,leftlo);
574
134k
          dsthi = _mm256_add_epi32(dsthi,lefthi);
575
134k
          dstlo = _mm256_add_epi32(dstlo,w32);
576
134k
          dsthi = _mm256_add_epi32(dsthi,w32);
577
578
134k
          dstlo =  _mm256_srai_epi32(dstlo,6);
579
134k
          dsthi =  _mm256_srai_epi32(dsthi,6);
580
581
134k
          dstlo =  _mm256_max_epi32(vbdmin,dstlo);
582
134k
          dsthi =  _mm256_max_epi32(vbdmin,dsthi);
583
134k
          dstlo =  _mm256_min_epi32(vbdmax,dstlo);
584
134k
          dsthi =  _mm256_min_epi32(vbdmax,dsthi);
585
586
134k
          dstlo =  _mm256_packs_epi32(dstlo,dsthi);
587
134k
          dstlo =  _mm256_permute4x64_epi64 ( dstlo, ( 0 << 0 ) + ( 1 << 2 ) + ( 2 << 4 ) + ( 3 << 6 ) );
588
589
134k
          _mm256_storeu_si256( ( __m256i * )(pDst+y*dstStride), dstlo );
590
134k
        }
591
201k
      }
592
8.55k
    }
593
8.55k
  }
594
4.94k
  else
595
4.94k
#endif
596
4.94k
  {
597
4.94k
    __m128i tmplo8,tmphi8;
598
4.94k
    __m128i w64_8 = _mm_set_epi16(64,64,64,64,64,64,64,64);
599
4.94k
    __m128i w32_8 = _mm_set_epi32(32,32,32,32);
600
4.94k
    __m128i vbdmin8   = _mm_set1_epi32( clpRng.min() );
601
4.94k
    __m128i vbdmax8   = _mm_set1_epi32( clpRng.max() );
602
4.94k
    __m128i wl8start,wl8start2;
603
4.94k
    CHECK(scale < 0 || scale > 2, "PDPC: scale < 0 || scale > 2");
604
605
4.94k
    if (scale==0)
606
1.54k
    {
607
1.54k
      wl8start = _mm_set_epi16(0,0,0,0,0,2,8,32);
608
1.54k
      wl8start2 = _mm_set_epi16(0,0,0,0,0,0,0,0);
609
1.54k
    }
610
3.40k
    else if (scale==1)
611
3.40k
    {
612
3.40k
      wl8start = _mm_set_epi16(0,0,1,2,4,8,16,32);
613
3.40k
      wl8start2 = _mm_set_epi16(0,0,0,0,0,0,0,0);
614
3.40k
    }
615
0
    else if (scale==2)
616
0
    {
617
0
      wl8start = _mm_set_epi16(4,4,8,8,16,16,32,32);
618
0
      wl8start2 = _mm_set_epi16(0,0,0,0,1,1,2,2);
619
0
    }
620
4.94k
    if (uiDirMode == PLANAR_IDX || uiDirMode == DC_IDX )
621
4.94k
    {
622
4.94k
      __m128i wl8 = wl8start;
623
73.8k
      for (int y = 0; y < iHeight; y++)
624
68.8k
      {
625
68.8k
        int wT = 32 >> std::min(31, ((y << 1) >> scale));
626
627
68.8k
        __m128i wt8 = _mm_set_epi16(wT,wT,wT,wT,wT,wT,wT,wT);
628
 //       __m128i x8left = _mm_broadcastw_epi16(_mm_loadu_si128 ((__m128i const *) (ptrSrc+((y+1)*srcStride))));
629
630
68.8k
        __m128i x8left = _mm_loadu_si128 ((__m128i const *) (ptrSrc+((y+1)*srcStride)));
631
68.8k
        x8left =_mm_shufflelo_epi16(x8left,0);
632
68.8k
        x8left =_mm_shuffle_epi32(x8left,0);
633
634
635
68.8k
        if (wT)
636
25.0k
        {
637
50.1k
          for (int x = 0; x < iWidth; x+=8)
638
25.0k
          {
639
25.0k
            __m128i x8top = _mm_loadu_si128( (__m128i*) ( ptrSrc + x + 1 ) );   // load top
640
25.0k
            __m128i x8dst = _mm_setzero_si128();
641
25.0k
            if( iWidth >= 8 )
642
17.0k
              x8dst = _mm_loadu_si128( (const __m128i*) ( pDst + y * dstStride + x ) );   // load dst
643
8.04k
            else if( iWidth == 4 )
644
8.04k
              x8dst = _mm_loadu_si64( (const __m128i*) ( pDst + y * dstStride + x ) );    // load dst
645
0
            else if( iWidth == 2 )
646
0
              x8dst = _mm_loadu_si32( (const __m128i*) ( pDst + y * dstStride + x ) );    // load dst
647
0
            else
648
0
            {
649
0
              CHECKD( true, "wrong iWidth in IntraPredSampleFilter_SIMD, only implemented for >=8, ==4, ==2" );
650
0
            }
651
652
25.0k
            if (x>8)
653
0
            {
654
0
              tmplo8 = _mm_mullo_epi16(x8top,wt8);    // wT*top
655
0
              tmphi8 = _mm_mulhi_epi16(x8top,wt8);    // wT*top
656
0
              __m128i toplo8 = _mm_unpacklo_epi16(tmplo8,tmphi8);
657
0
              __m128i tophi8 = _mm_unpackhi_epi16(tmplo8,tmphi8);
658
659
660
0
              __m128i wX = _mm_sub_epi16(w64_8,wt8);
661
0
              tmplo8 = _mm_mullo_epi16(x8dst,wX);    // 64-wL-wT*dst
662
0
              tmphi8 = _mm_mulhi_epi16(x8dst,wX);    // 64-wL-wT*dst
663
0
              __m128i dstlo8 = _mm_unpacklo_epi16(tmplo8,tmphi8);
664
0
              __m128i dsthi8 = _mm_unpackhi_epi16(tmplo8,tmphi8);
665
666
0
              dstlo8 = _mm_add_epi32(dstlo8,toplo8);
667
0
              dsthi8 = _mm_add_epi32(dsthi8,tophi8);
668
0
              dstlo8 = _mm_add_epi32(dstlo8,w32_8);
669
0
              dsthi8 = _mm_add_epi32(dsthi8,w32_8);
670
671
0
              dstlo8 =  _mm_srai_epi32(dstlo8,6);
672
0
              dsthi8 =  _mm_srai_epi32(dsthi8,6);
673
674
0
              dstlo8 =  _mm_max_epi32(vbdmin8,dstlo8);
675
0
              dsthi8 =  _mm_max_epi32(vbdmin8,dsthi8);
676
0
              dstlo8 =  _mm_min_epi32(vbdmax8,dstlo8);
677
0
              dsthi8 =  _mm_min_epi32(vbdmax8,dsthi8);
678
679
0
              x8dst = _mm_packs_epi32(dstlo8,dsthi8);
680
0
            }
681
25.0k
            else // x<=8
682
25.0k
            {
683
25.0k
              if (x==0)
684
25.0k
                wl8=wl8start;
685
0
              else if (x==8)
686
0
                wl8=wl8start2;
687
688
25.0k
              tmplo8 = _mm_mullo_epi16(x8left,wl8);  //wL * left
689
25.0k
              tmphi8 = _mm_mulhi_epi16(x8left,wl8);  //wL * left
690
25.0k
              __m128i leftlo8 = _mm_unpacklo_epi16(tmplo8,tmphi8);
691
25.0k
              __m128i lefthi8 = _mm_unpackhi_epi16(tmplo8,tmphi8);
692
693
25.0k
              tmplo8 = _mm_mullo_epi16(x8top,wt8);    // wT*top
694
25.0k
              tmphi8 = _mm_mulhi_epi16(x8top,wt8);    // wT*top
695
25.0k
              __m128i toplo8 = _mm_unpacklo_epi16(tmplo8,tmphi8);
696
25.0k
              __m128i tophi8 = _mm_unpackhi_epi16(tmplo8,tmphi8);
697
698
25.0k
              __m128i wX = _mm_sub_epi16(w64_8,wl8);
699
25.0k
              wX = _mm_sub_epi16(wX,wt8);            // 64-wL-wT
700
25.0k
              tmplo8 = _mm_mullo_epi16(x8dst,wX);    // 64-wL-wT*dst
701
25.0k
              tmphi8 = _mm_mulhi_epi16(x8dst,wX);    // 64-wL-wT*dst
702
25.0k
              __m128i dstlo8 = _mm_unpacklo_epi16(tmplo8,tmphi8);
703
25.0k
              __m128i dsthi8 = _mm_unpackhi_epi16(tmplo8,tmphi8);
704
705
25.0k
              dstlo8 = _mm_add_epi32(dstlo8,toplo8);
706
25.0k
              dsthi8 = _mm_add_epi32(dsthi8,tophi8);
707
25.0k
              dstlo8 = _mm_add_epi32(dstlo8,leftlo8);
708
25.0k
              dsthi8 = _mm_add_epi32(dsthi8,lefthi8);
709
25.0k
              dstlo8 = _mm_add_epi32(dstlo8,w32_8);
710
25.0k
              dsthi8 = _mm_add_epi32(dsthi8,w32_8);
711
712
25.0k
              dstlo8 =  _mm_srai_epi32(dstlo8,6);
713
25.0k
              dsthi8 =  _mm_srai_epi32(dsthi8,6);
714
715
25.0k
              dstlo8 =  _mm_max_epi32(vbdmin8,dstlo8);
716
25.0k
              dsthi8 =  _mm_max_epi32(vbdmin8,dsthi8);
717
25.0k
              dstlo8 =  _mm_min_epi32(vbdmax8,dstlo8);
718
25.0k
              dsthi8 =  _mm_min_epi32(vbdmax8,dsthi8);
719
720
25.0k
              x8dst = _mm_packs_epi32(dstlo8,dsthi8);
721
25.0k
            }
722
723
25.0k
            if( iWidth >= 8 )
724
17.0k
              _mm_storeu_si128( (__m128i*) ( pDst + y * dstStride + x ), x8dst );
725
8.04k
            else if( iWidth == 4 )
726
8.04k
              _mm_storeu_si64( (__m128i*) ( pDst + y * dstStride + x ), x8dst );
727
0
            else if( iWidth == 2 )
728
0
              _mm_storeu_si32( (__m128i*) ( pDst + y * dstStride + x ), x8dst );
729
25.0k
          }
730
25.0k
        }
731
43.8k
        else //wT =0
732
43.8k
        {
733
87.6k
          for( int x = 0; x < std::min( iWidth, 16 ); x += 8 )
734
43.8k
          {
735
43.8k
            if( x == 0 )
736
43.8k
              wl8 = wl8start;
737
0
            else if( x == 8 )
738
0
              wl8 = wl8start2;
739
740
43.8k
            __m128i x8dst;
741
43.8k
            if( iWidth >= 8 )
742
26.8k
              x8dst = _mm_loadu_si128( (const __m128i*)( pDst + y * dstStride + x ) );   // load dst
743
16.9k
            else if( iWidth == 4 )
744
16.9k
              x8dst = _mm_loadu_si64( (const __m128i*)( pDst + y * dstStride + x ) );    // load dst
745
0
            else if( iWidth == 2 )
746
0
              x8dst = _mm_loadu_si32( (const __m128i*)( pDst + y * dstStride + x ) );    // load dst
747
0
            else
748
43.8k
              CHECK( true, "wrong iWidth in IntraPredSampleFilter_SIMD, only implemented for >=8, ==4, ==2" );
749
750
751
43.8k
            tmplo8          = _mm_mullo_epi16( x8left, wl8 );   // wL * left
752
43.8k
            tmphi8          = _mm_mulhi_epi16( x8left, wl8 );   // wL * left
753
43.8k
            __m128i leftlo8 = _mm_unpacklo_epi16( tmplo8, tmphi8 );
754
43.8k
            __m128i lefthi8 = _mm_unpackhi_epi16( tmplo8, tmphi8 );
755
756
43.8k
            __m128i wX     = _mm_sub_epi16( w64_8, wl8 );
757
43.8k
            tmplo8         = _mm_mullo_epi16( x8dst, wX );   // 64-wL-wT*dst
758
43.8k
            tmphi8         = _mm_mulhi_epi16( x8dst, wX );   // 64-wL-wT*dst
759
43.8k
            __m128i dstlo8 = _mm_unpacklo_epi16( tmplo8, tmphi8 );
760
43.8k
            __m128i dsthi8 = _mm_unpackhi_epi16( tmplo8, tmphi8 );
761
762
43.8k
            dstlo8 = _mm_add_epi32( dstlo8, leftlo8 );
763
43.8k
            dsthi8 = _mm_add_epi32( dsthi8, lefthi8 );
764
43.8k
            dstlo8 = _mm_add_epi32( dstlo8, w32_8 );
765
43.8k
            dsthi8 = _mm_add_epi32( dsthi8, w32_8 );
766
767
43.8k
            dstlo8 = _mm_srai_epi32( dstlo8, 6 );
768
43.8k
            dsthi8 = _mm_srai_epi32( dsthi8, 6 );
769
770
43.8k
            dstlo8 = _mm_max_epi32( vbdmin8, dstlo8 );
771
43.8k
            dsthi8 = _mm_max_epi32( vbdmin8, dsthi8 );
772
43.8k
            dstlo8 = _mm_min_epi32( vbdmax8, dstlo8 );
773
43.8k
            dsthi8 = _mm_min_epi32( vbdmax8, dsthi8 );
774
775
43.8k
            dstlo8 = _mm_packs_epi32( dstlo8, dsthi8 );
776
777
43.8k
            if( iWidth >= 8 )
778
26.8k
              _mm_storeu_si128( (__m128i*)( pDst + y * dstStride + x ), dstlo8 );
779
16.9k
            else if( iWidth == 4 )
780
16.9k
              _mm_storeu_si64( (__m128i*)( pDst + y * dstStride + x ), ( dstlo8 ) );
781
0
            else if( iWidth == 2 )
782
0
              _mm_storeu_si32( (__m128i*)( pDst + y * dstStride + x ), dstlo8 );
783
43.8k
          }
784
43.8k
        }
785
68.8k
      }
786
4.94k
    }
787
4.94k
  }
788
789
790
#if USE_AVX2
791
13.5k
  _mm256_zeroupper();
792
13.5k
#endif
793
13.5k
}
Unexecuted instantiation: void vvdec::IntraPredSampleFilter_SIMD<(vvdec::x86_simd::X86_VEXT)1, 8>(short*, long, vvdec::AreaBuf<short>&, unsigned int, vvdec::ClpRngTemplate<short> const&)
Unexecuted instantiation: void vvdec::IntraPredSampleFilter_SIMD<(vvdec::x86_simd::X86_VEXT)1, 16>(short*, long, vvdec::AreaBuf<short>&, unsigned int, vvdec::ClpRngTemplate<short> const&)
void vvdec::IntraPredSampleFilter_SIMD<(vvdec::x86_simd::X86_VEXT)4, 8>(short*, long, vvdec::AreaBuf<short>&, unsigned int, vvdec::ClpRngTemplate<short> const&)
Line
Count
Source
421
4.94k
{
422
4.94k
  const int       iWidth    = piPred.width;
423
4.94k
  const int       iHeight   = piPred.height;
424
4.94k
  PelBuf          dstBuf    = piPred;
425
4.94k
  Pel*            pDst      = dstBuf.buf;
426
4.94k
  const ptrdiff_t dstStride = dstBuf.stride;
427
428
4.94k
  const int scale = ((getLog2(iWidth) - 2 + getLog2(iHeight) - 2 + 2) >> 2);
429
4.94k
  CHECK(scale < 0 || scale > 31, "PDPC: scale < 0 || scale > 2");
430
431
4.94k
#if USE_AVX2
432
4.94k
  if( W > 8 )
433
0
  {
434
0
    __m256i tmplo,tmphi;
435
0
    __m256i w64 = _mm256_set_epi16(64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64);
436
0
    __m256i w32 = _mm256_set_epi32(32,32,32,32,32,32,32,32);
437
0
    __m256i vbdmin   = _mm256_set1_epi32( clpRng.min() );
438
0
    __m256i vbdmax   = _mm256_set1_epi32( clpRng.max() );
439
0
    __m256i wl16;
440
0
    __m256i wl16start;
441
442
0
    if (scale==0)
443
0
    {
444
0
      wl16start = _mm256_set_epi16(0,0,0,0,0,0,0,0,0,0,0,0,0,2,8,32);
445
0
    }
446
0
    else if (scale==1)
447
0
    {
448
0
      wl16start = _mm256_set_epi16(0,0,0,0,0,0,0,0,0,0,1,2,4,8,16,32);
449
0
    }
450
0
    else if (scale==2)
451
0
    {
452
0
      wl16start = _mm256_set_epi16(0,0,0,0,1,1,2,2,4,4,8,8,16,16,32,32);
453
0
    }
454
0
    else
455
0
    {
456
0
      THROW_FATAL( "Wrong scale (" << scale << ")" );
457
0
    }
458
459
460
0
    if (uiDirMode == PLANAR_IDX || uiDirMode == DC_IDX )
461
0
    {
462
0
      for (int y = 0; y < iHeight; y++)
463
0
      {
464
0
        int wT = 32 >> std::min(31, ((y << 1) >> scale));
465
466
0
        __m256i wt16 = _mm256_set_epi16(wT,wT,wT,wT,wT,wT,wT,wT,wT,wT,wT,wT,wT,wT,wT,wT);
467
0
        __m256i x16left = _mm256_broadcastw_epi16(_mm_loadu_si128 ((__m128i const *) (ptrSrc+((y+1)*srcStride))));
468
469
0
        if (wT)
470
0
        {
471
0
          for (int x = 0; x < iWidth; x+=16)
472
0
          {
473
0
            if (x==0)
474
0
            {
475
0
              wl16=wl16start;
476
477
0
              __m256i x16top = _mm256_loadu_si256((__m256i *) (ptrSrc+x+1)); // load top
478
0
              __m256i x16dst = _mm256_loadu_si256((const __m256i *) (pDst+y*dstStride+x)); // load dst
479
480
0
              tmplo = _mm256_mullo_epi16(x16left,wl16);  //wL * left
481
0
              tmphi = _mm256_mulhi_epi16(x16left,wl16);  //wL * left
482
0
              __m256i leftlo = _mm256_unpacklo_epi16(tmplo,tmphi);
483
0
              __m256i lefthi = _mm256_unpackhi_epi16(tmplo,tmphi);
484
485
0
              tmplo = _mm256_mullo_epi16(x16top,wt16);    // wT*top
486
0
              tmphi = _mm256_mulhi_epi16(x16top,wt16);    // wT*top
487
0
              __m256i toplo = _mm256_unpacklo_epi16(tmplo,tmphi);
488
0
              __m256i tophi = _mm256_unpackhi_epi16(tmplo,tmphi);
489
490
0
              __m256i wX = _mm256_sub_epi16(w64,wl16);
491
0
              wX = _mm256_sub_epi16(wX,wt16);            // 64-wL-wT
492
0
              tmplo = _mm256_mullo_epi16(x16dst,wX);    // 64-wL-wT*dst
493
0
              tmphi = _mm256_mulhi_epi16(x16dst,wX);    // 64-wL-wT*dst
494
0
              __m256i dstlo = _mm256_unpacklo_epi16(tmplo,tmphi);
495
0
              __m256i dsthi = _mm256_unpackhi_epi16(tmplo,tmphi);
496
497
0
              dstlo = _mm256_add_epi32(dstlo,toplo);
498
0
              dsthi = _mm256_add_epi32(dsthi,tophi);
499
0
              dstlo = _mm256_add_epi32(dstlo,leftlo);
500
0
              dsthi = _mm256_add_epi32(dsthi,lefthi);
501
0
              dstlo = _mm256_add_epi32(dstlo,w32);
502
0
              dsthi = _mm256_add_epi32(dsthi,w32);
503
504
0
              dstlo =  _mm256_srai_epi32(dstlo,6);
505
0
              dsthi =  _mm256_srai_epi32(dsthi,6);
506
507
0
              dstlo =  _mm256_max_epi32(vbdmin,dstlo);
508
0
              dsthi =  _mm256_max_epi32(vbdmin,dsthi);
509
0
              dstlo =  _mm256_min_epi32(vbdmax,dstlo);
510
0
              dsthi =  _mm256_min_epi32(vbdmax,dsthi);
511
512
0
              dstlo =  _mm256_packs_epi32(dstlo,dsthi);
513
0
              dstlo =  _mm256_permute4x64_epi64 ( dstlo, ( 0 << 0 ) + ( 1 << 2 ) + ( 2 << 4 ) + ( 3 << 6 ) );
514
515
0
              _mm256_storeu_si256( ( __m256i * )(pDst+y*dstStride+x), dstlo );
516
0
            }
517
0
            else
518
0
            {
519
520
0
              __m256i x16top = _mm256_loadu_si256((__m256i *) (ptrSrc+x+1)); // load top
521
0
              __m256i x16dst = _mm256_loadu_si256((const __m256i *) (pDst+y*dstStride+x)); // load dst
522
523
524
0
              tmplo = _mm256_mullo_epi16(x16top,wt16);    // wT*top
525
0
              tmphi = _mm256_mulhi_epi16(x16top,wt16);    // wT*top
526
0
              __m256i toplo = _mm256_unpacklo_epi16(tmplo,tmphi);
527
0
              __m256i tophi = _mm256_unpackhi_epi16(tmplo,tmphi);
528
529
0
              __m256i wX = _mm256_sub_epi16(w64,wt16);
530
0
              tmplo = _mm256_mullo_epi16(x16dst,wX);    // 64-wL-wT*dst
531
0
              tmphi = _mm256_mulhi_epi16(x16dst,wX);    // 64-wL-wT*dst
532
0
              __m256i dstlo = _mm256_unpacklo_epi16(tmplo,tmphi);
533
0
              __m256i dsthi = _mm256_unpackhi_epi16(tmplo,tmphi);
534
535
0
              dstlo = _mm256_add_epi32(dstlo,toplo);
536
0
              dsthi = _mm256_add_epi32(dsthi,tophi);
537
0
              dstlo = _mm256_add_epi32(dstlo,w32);
538
0
              dsthi = _mm256_add_epi32(dsthi,w32);
539
540
0
              dstlo =  _mm256_srai_epi32(dstlo,6);
541
0
              dsthi =  _mm256_srai_epi32(dsthi,6);
542
543
0
              dstlo =  _mm256_max_epi32(vbdmin,dstlo);
544
0
              dsthi =  _mm256_max_epi32(vbdmin,dsthi);
545
0
              dstlo =  _mm256_min_epi32(vbdmax,dstlo);
546
0
              dsthi =  _mm256_min_epi32(vbdmax,dsthi);
547
548
0
              dstlo =  _mm256_packs_epi32(dstlo,dsthi);
549
0
              dstlo =  _mm256_permute4x64_epi64 ( dstlo, ( 0 << 0 ) + ( 1 << 2 ) + ( 2 << 4 ) + ( 3 << 6 ) );
550
551
0
              _mm256_storeu_si256( ( __m256i * )(pDst+y*dstStride+x), dstlo );
552
0
            }
553
554
0
          }  // for
555
0
        }
556
0
        else
557
0
        { // wT =0
558
559
0
          wl16=wl16start;
560
0
          __m256i x16dst = _mm256_loadu_si256((const __m256i *) (pDst+y*dstStride)); // load dst
561
562
0
          tmplo = _mm256_mullo_epi16(x16left,wl16);  //wL * left
563
0
          tmphi = _mm256_mulhi_epi16(x16left,wl16);  //wL * left
564
0
          __m256i leftlo = _mm256_unpacklo_epi16(tmplo,tmphi);
565
0
          __m256i lefthi = _mm256_unpackhi_epi16(tmplo,tmphi);
566
567
0
          __m256i wX = _mm256_sub_epi16(w64,wl16);
568
0
          tmplo = _mm256_mullo_epi16(x16dst,wX);    // 64-wL-wT*dst
569
0
          tmphi = _mm256_mulhi_epi16(x16dst,wX);    // 64-wL-wT*dst
570
0
          __m256i dstlo = _mm256_unpacklo_epi16(tmplo,tmphi);
571
0
          __m256i dsthi = _mm256_unpackhi_epi16(tmplo,tmphi);
572
573
0
          dstlo = _mm256_add_epi32(dstlo,leftlo);
574
0
          dsthi = _mm256_add_epi32(dsthi,lefthi);
575
0
          dstlo = _mm256_add_epi32(dstlo,w32);
576
0
          dsthi = _mm256_add_epi32(dsthi,w32);
577
578
0
          dstlo =  _mm256_srai_epi32(dstlo,6);
579
0
          dsthi =  _mm256_srai_epi32(dsthi,6);
580
581
0
          dstlo =  _mm256_max_epi32(vbdmin,dstlo);
582
0
          dsthi =  _mm256_max_epi32(vbdmin,dsthi);
583
0
          dstlo =  _mm256_min_epi32(vbdmax,dstlo);
584
0
          dsthi =  _mm256_min_epi32(vbdmax,dsthi);
585
586
0
          dstlo =  _mm256_packs_epi32(dstlo,dsthi);
587
0
          dstlo =  _mm256_permute4x64_epi64 ( dstlo, ( 0 << 0 ) + ( 1 << 2 ) + ( 2 << 4 ) + ( 3 << 6 ) );
588
589
0
          _mm256_storeu_si256( ( __m256i * )(pDst+y*dstStride), dstlo );
590
0
        }
591
0
      }
592
0
    }
593
0
  }
594
4.94k
  else
595
4.94k
#endif
596
4.94k
  {
597
4.94k
    __m128i tmplo8,tmphi8;
598
4.94k
    __m128i w64_8 = _mm_set_epi16(64,64,64,64,64,64,64,64);
599
4.94k
    __m128i w32_8 = _mm_set_epi32(32,32,32,32);
600
4.94k
    __m128i vbdmin8   = _mm_set1_epi32( clpRng.min() );
601
4.94k
    __m128i vbdmax8   = _mm_set1_epi32( clpRng.max() );
602
4.94k
    __m128i wl8start,wl8start2;
603
4.94k
    CHECK(scale < 0 || scale > 2, "PDPC: scale < 0 || scale > 2");
604
605
4.94k
    if (scale==0)
606
1.54k
    {
607
1.54k
      wl8start = _mm_set_epi16(0,0,0,0,0,2,8,32);
608
1.54k
      wl8start2 = _mm_set_epi16(0,0,0,0,0,0,0,0);
609
1.54k
    }
610
3.40k
    else if (scale==1)
611
3.40k
    {
612
3.40k
      wl8start = _mm_set_epi16(0,0,1,2,4,8,16,32);
613
3.40k
      wl8start2 = _mm_set_epi16(0,0,0,0,0,0,0,0);
614
3.40k
    }
615
0
    else if (scale==2)
616
0
    {
617
0
      wl8start = _mm_set_epi16(4,4,8,8,16,16,32,32);
618
0
      wl8start2 = _mm_set_epi16(0,0,0,0,1,1,2,2);
619
0
    }
620
4.94k
    if (uiDirMode == PLANAR_IDX || uiDirMode == DC_IDX )
621
4.94k
    {
622
4.94k
      __m128i wl8 = wl8start;
623
73.8k
      for (int y = 0; y < iHeight; y++)
624
68.8k
      {
625
68.8k
        int wT = 32 >> std::min(31, ((y << 1) >> scale));
626
627
68.8k
        __m128i wt8 = _mm_set_epi16(wT,wT,wT,wT,wT,wT,wT,wT);
628
 //       __m128i x8left = _mm_broadcastw_epi16(_mm_loadu_si128 ((__m128i const *) (ptrSrc+((y+1)*srcStride))));
629
630
68.8k
        __m128i x8left = _mm_loadu_si128 ((__m128i const *) (ptrSrc+((y+1)*srcStride)));
631
68.8k
        x8left =_mm_shufflelo_epi16(x8left,0);
632
68.8k
        x8left =_mm_shuffle_epi32(x8left,0);
633
634
635
68.8k
        if (wT)
636
25.0k
        {
637
50.1k
          for (int x = 0; x < iWidth; x+=8)
638
25.0k
          {
639
25.0k
            __m128i x8top = _mm_loadu_si128( (__m128i*) ( ptrSrc + x + 1 ) );   // load top
640
25.0k
            __m128i x8dst = _mm_setzero_si128();
641
25.0k
            if( iWidth >= 8 )
642
17.0k
              x8dst = _mm_loadu_si128( (const __m128i*) ( pDst + y * dstStride + x ) );   // load dst
643
8.04k
            else if( iWidth == 4 )
644
8.04k
              x8dst = _mm_loadu_si64( (const __m128i*) ( pDst + y * dstStride + x ) );    // load dst
645
0
            else if( iWidth == 2 )
646
0
              x8dst = _mm_loadu_si32( (const __m128i*) ( pDst + y * dstStride + x ) );    // load dst
647
0
            else
648
0
            {
649
0
              CHECKD( true, "wrong iWidth in IntraPredSampleFilter_SIMD, only implemented for >=8, ==4, ==2" );
650
0
            }
651
652
25.0k
            if (x>8)
653
0
            {
654
0
              tmplo8 = _mm_mullo_epi16(x8top,wt8);    // wT*top
655
0
              tmphi8 = _mm_mulhi_epi16(x8top,wt8);    // wT*top
656
0
              __m128i toplo8 = _mm_unpacklo_epi16(tmplo8,tmphi8);
657
0
              __m128i tophi8 = _mm_unpackhi_epi16(tmplo8,tmphi8);
658
659
660
0
              __m128i wX = _mm_sub_epi16(w64_8,wt8);
661
0
              tmplo8 = _mm_mullo_epi16(x8dst,wX);    // 64-wL-wT*dst
662
0
              tmphi8 = _mm_mulhi_epi16(x8dst,wX);    // 64-wL-wT*dst
663
0
              __m128i dstlo8 = _mm_unpacklo_epi16(tmplo8,tmphi8);
664
0
              __m128i dsthi8 = _mm_unpackhi_epi16(tmplo8,tmphi8);
665
666
0
              dstlo8 = _mm_add_epi32(dstlo8,toplo8);
667
0
              dsthi8 = _mm_add_epi32(dsthi8,tophi8);
668
0
              dstlo8 = _mm_add_epi32(dstlo8,w32_8);
669
0
              dsthi8 = _mm_add_epi32(dsthi8,w32_8);
670
671
0
              dstlo8 =  _mm_srai_epi32(dstlo8,6);
672
0
              dsthi8 =  _mm_srai_epi32(dsthi8,6);
673
674
0
              dstlo8 =  _mm_max_epi32(vbdmin8,dstlo8);
675
0
              dsthi8 =  _mm_max_epi32(vbdmin8,dsthi8);
676
0
              dstlo8 =  _mm_min_epi32(vbdmax8,dstlo8);
677
0
              dsthi8 =  _mm_min_epi32(vbdmax8,dsthi8);
678
679
0
              x8dst = _mm_packs_epi32(dstlo8,dsthi8);
680
0
            }
681
25.0k
            else // x<=8
682
25.0k
            {
683
25.0k
              if (x==0)
684
25.0k
                wl8=wl8start;
685
0
              else if (x==8)
686
0
                wl8=wl8start2;
687
688
25.0k
              tmplo8 = _mm_mullo_epi16(x8left,wl8);  //wL * left
689
25.0k
              tmphi8 = _mm_mulhi_epi16(x8left,wl8);  //wL * left
690
25.0k
              __m128i leftlo8 = _mm_unpacklo_epi16(tmplo8,tmphi8);
691
25.0k
              __m128i lefthi8 = _mm_unpackhi_epi16(tmplo8,tmphi8);
692
693
25.0k
              tmplo8 = _mm_mullo_epi16(x8top,wt8);    // wT*top
694
25.0k
              tmphi8 = _mm_mulhi_epi16(x8top,wt8);    // wT*top
695
25.0k
              __m128i toplo8 = _mm_unpacklo_epi16(tmplo8,tmphi8);
696
25.0k
              __m128i tophi8 = _mm_unpackhi_epi16(tmplo8,tmphi8);
697
698
25.0k
              __m128i wX = _mm_sub_epi16(w64_8,wl8);
699
25.0k
              wX = _mm_sub_epi16(wX,wt8);            // 64-wL-wT
700
25.0k
              tmplo8 = _mm_mullo_epi16(x8dst,wX);    // 64-wL-wT*dst
701
25.0k
              tmphi8 = _mm_mulhi_epi16(x8dst,wX);    // 64-wL-wT*dst
702
25.0k
              __m128i dstlo8 = _mm_unpacklo_epi16(tmplo8,tmphi8);
703
25.0k
              __m128i dsthi8 = _mm_unpackhi_epi16(tmplo8,tmphi8);
704
705
25.0k
              dstlo8 = _mm_add_epi32(dstlo8,toplo8);
706
25.0k
              dsthi8 = _mm_add_epi32(dsthi8,tophi8);
707
25.0k
              dstlo8 = _mm_add_epi32(dstlo8,leftlo8);
708
25.0k
              dsthi8 = _mm_add_epi32(dsthi8,lefthi8);
709
25.0k
              dstlo8 = _mm_add_epi32(dstlo8,w32_8);
710
25.0k
              dsthi8 = _mm_add_epi32(dsthi8,w32_8);
711
712
25.0k
              dstlo8 =  _mm_srai_epi32(dstlo8,6);
713
25.0k
              dsthi8 =  _mm_srai_epi32(dsthi8,6);
714
715
25.0k
              dstlo8 =  _mm_max_epi32(vbdmin8,dstlo8);
716
25.0k
              dsthi8 =  _mm_max_epi32(vbdmin8,dsthi8);
717
25.0k
              dstlo8 =  _mm_min_epi32(vbdmax8,dstlo8);
718
25.0k
              dsthi8 =  _mm_min_epi32(vbdmax8,dsthi8);
719
720
25.0k
              x8dst = _mm_packs_epi32(dstlo8,dsthi8);
721
25.0k
            }
722
723
25.0k
            if( iWidth >= 8 )
724
17.0k
              _mm_storeu_si128( (__m128i*) ( pDst + y * dstStride + x ), x8dst );
725
8.04k
            else if( iWidth == 4 )
726
8.04k
              _mm_storeu_si64( (__m128i*) ( pDst + y * dstStride + x ), x8dst );
727
0
            else if( iWidth == 2 )
728
0
              _mm_storeu_si32( (__m128i*) ( pDst + y * dstStride + x ), x8dst );
729
25.0k
          }
730
25.0k
        }
731
43.8k
        else //wT =0
732
43.8k
        {
733
87.6k
          for( int x = 0; x < std::min( iWidth, 16 ); x += 8 )
734
43.8k
          {
735
43.8k
            if( x == 0 )
736
43.8k
              wl8 = wl8start;
737
0
            else if( x == 8 )
738
0
              wl8 = wl8start2;
739
740
43.8k
            __m128i x8dst;
741
43.8k
            if( iWidth >= 8 )
742
26.8k
              x8dst = _mm_loadu_si128( (const __m128i*)( pDst + y * dstStride + x ) );   // load dst
743
16.9k
            else if( iWidth == 4 )
744
16.9k
              x8dst = _mm_loadu_si64( (const __m128i*)( pDst + y * dstStride + x ) );    // load dst
745
0
            else if( iWidth == 2 )
746
0
              x8dst = _mm_loadu_si32( (const __m128i*)( pDst + y * dstStride + x ) );    // load dst
747
0
            else
748
43.8k
              CHECK( true, "wrong iWidth in IntraPredSampleFilter_SIMD, only implemented for >=8, ==4, ==2" );
749
750
751
43.8k
            tmplo8          = _mm_mullo_epi16( x8left, wl8 );   // wL * left
752
43.8k
            tmphi8          = _mm_mulhi_epi16( x8left, wl8 );   // wL * left
753
43.8k
            __m128i leftlo8 = _mm_unpacklo_epi16( tmplo8, tmphi8 );
754
43.8k
            __m128i lefthi8 = _mm_unpackhi_epi16( tmplo8, tmphi8 );
755
756
43.8k
            __m128i wX     = _mm_sub_epi16( w64_8, wl8 );
757
43.8k
            tmplo8         = _mm_mullo_epi16( x8dst, wX );   // 64-wL-wT*dst
758
43.8k
            tmphi8         = _mm_mulhi_epi16( x8dst, wX );   // 64-wL-wT*dst
759
43.8k
            __m128i dstlo8 = _mm_unpacklo_epi16( tmplo8, tmphi8 );
760
43.8k
            __m128i dsthi8 = _mm_unpackhi_epi16( tmplo8, tmphi8 );
761
762
43.8k
            dstlo8 = _mm_add_epi32( dstlo8, leftlo8 );
763
43.8k
            dsthi8 = _mm_add_epi32( dsthi8, lefthi8 );
764
43.8k
            dstlo8 = _mm_add_epi32( dstlo8, w32_8 );
765
43.8k
            dsthi8 = _mm_add_epi32( dsthi8, w32_8 );
766
767
43.8k
            dstlo8 = _mm_srai_epi32( dstlo8, 6 );
768
43.8k
            dsthi8 = _mm_srai_epi32( dsthi8, 6 );
769
770
43.8k
            dstlo8 = _mm_max_epi32( vbdmin8, dstlo8 );
771
43.8k
            dsthi8 = _mm_max_epi32( vbdmin8, dsthi8 );
772
43.8k
            dstlo8 = _mm_min_epi32( vbdmax8, dstlo8 );
773
43.8k
            dsthi8 = _mm_min_epi32( vbdmax8, dsthi8 );
774
775
43.8k
            dstlo8 = _mm_packs_epi32( dstlo8, dsthi8 );
776
777
43.8k
            if( iWidth >= 8 )
778
26.8k
              _mm_storeu_si128( (__m128i*)( pDst + y * dstStride + x ), dstlo8 );
779
16.9k
            else if( iWidth == 4 )
780
16.9k
              _mm_storeu_si64( (__m128i*)( pDst + y * dstStride + x ), ( dstlo8 ) );
781
0
            else if( iWidth == 2 )
782
0
              _mm_storeu_si32( (__m128i*)( pDst + y * dstStride + x ), dstlo8 );
783
43.8k
          }
784
43.8k
        }
785
68.8k
      }
786
4.94k
    }
787
4.94k
  }
788
789
790
4.94k
#if USE_AVX2
791
4.94k
  _mm256_zeroupper();
792
4.94k
#endif
793
4.94k
}
void vvdec::IntraPredSampleFilter_SIMD<(vvdec::x86_simd::X86_VEXT)4, 16>(short*, long, vvdec::AreaBuf<short>&, unsigned int, vvdec::ClpRngTemplate<short> const&)
Line
Count
Source
421
8.55k
{
422
8.55k
  const int       iWidth    = piPred.width;
423
8.55k
  const int       iHeight   = piPred.height;
424
8.55k
  PelBuf          dstBuf    = piPred;
425
8.55k
  Pel*            pDst      = dstBuf.buf;
426
8.55k
  const ptrdiff_t dstStride = dstBuf.stride;
427
428
8.55k
  const int scale = ((getLog2(iWidth) - 2 + getLog2(iHeight) - 2 + 2) >> 2);
429
8.55k
  CHECK(scale < 0 || scale > 31, "PDPC: scale < 0 || scale > 2");
430
431
8.55k
#if USE_AVX2
432
8.55k
  if( W > 8 )
433
8.55k
  {
434
8.55k
    __m256i tmplo,tmphi;
435
8.55k
    __m256i w64 = _mm256_set_epi16(64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64);
436
8.55k
    __m256i w32 = _mm256_set_epi32(32,32,32,32,32,32,32,32);
437
8.55k
    __m256i vbdmin   = _mm256_set1_epi32( clpRng.min() );
438
8.55k
    __m256i vbdmax   = _mm256_set1_epi32( clpRng.max() );
439
8.55k
    __m256i wl16;
440
8.55k
    __m256i wl16start;
441
442
8.55k
    if (scale==0)
443
0
    {
444
0
      wl16start = _mm256_set_epi16(0,0,0,0,0,0,0,0,0,0,0,0,0,2,8,32);
445
0
    }
446
8.55k
    else if (scale==1)
447
5.65k
    {
448
5.65k
      wl16start = _mm256_set_epi16(0,0,0,0,0,0,0,0,0,0,1,2,4,8,16,32);
449
5.65k
    }
450
2.89k
    else if (scale==2)
451
2.89k
    {
452
2.89k
      wl16start = _mm256_set_epi16(0,0,0,0,1,1,2,2,4,4,8,8,16,16,32,32);
453
2.89k
    }
454
0
    else
455
0
    {
456
0
      THROW_FATAL( "Wrong scale (" << scale << ")" );
457
0
    }
458
459
460
8.55k
    if (uiDirMode == PLANAR_IDX || uiDirMode == DC_IDX )
461
8.55k
    {
462
210k
      for (int y = 0; y < iHeight; y++)
463
201k
      {
464
201k
        int wT = 32 >> std::min(31, ((y << 1) >> scale));
465
466
201k
        __m256i wt16 = _mm256_set_epi16(wT,wT,wT,wT,wT,wT,wT,wT,wT,wT,wT,wT,wT,wT,wT,wT);
467
201k
        __m256i x16left = _mm256_broadcastw_epi16(_mm_loadu_si128 ((__m128i const *) (ptrSrc+((y+1)*srcStride))));
468
469
201k
        if (wT)
470
67.1k
        {
471
195k
          for (int x = 0; x < iWidth; x+=16)
472
128k
          {
473
128k
            if (x==0)
474
67.1k
            {
475
67.1k
              wl16=wl16start;
476
477
67.1k
              __m256i x16top = _mm256_loadu_si256((__m256i *) (ptrSrc+x+1)); // load top
478
67.1k
              __m256i x16dst = _mm256_loadu_si256((const __m256i *) (pDst+y*dstStride+x)); // load dst
479
480
67.1k
              tmplo = _mm256_mullo_epi16(x16left,wl16);  //wL * left
481
67.1k
              tmphi = _mm256_mulhi_epi16(x16left,wl16);  //wL * left
482
67.1k
              __m256i leftlo = _mm256_unpacklo_epi16(tmplo,tmphi);
483
67.1k
              __m256i lefthi = _mm256_unpackhi_epi16(tmplo,tmphi);
484
485
67.1k
              tmplo = _mm256_mullo_epi16(x16top,wt16);    // wT*top
486
67.1k
              tmphi = _mm256_mulhi_epi16(x16top,wt16);    // wT*top
487
67.1k
              __m256i toplo = _mm256_unpacklo_epi16(tmplo,tmphi);
488
67.1k
              __m256i tophi = _mm256_unpackhi_epi16(tmplo,tmphi);
489
490
67.1k
              __m256i wX = _mm256_sub_epi16(w64,wl16);
491
67.1k
              wX = _mm256_sub_epi16(wX,wt16);            // 64-wL-wT
492
67.1k
              tmplo = _mm256_mullo_epi16(x16dst,wX);    // 64-wL-wT*dst
493
67.1k
              tmphi = _mm256_mulhi_epi16(x16dst,wX);    // 64-wL-wT*dst
494
67.1k
              __m256i dstlo = _mm256_unpacklo_epi16(tmplo,tmphi);
495
67.1k
              __m256i dsthi = _mm256_unpackhi_epi16(tmplo,tmphi);
496
497
67.1k
              dstlo = _mm256_add_epi32(dstlo,toplo);
498
67.1k
              dsthi = _mm256_add_epi32(dsthi,tophi);
499
67.1k
              dstlo = _mm256_add_epi32(dstlo,leftlo);
500
67.1k
              dsthi = _mm256_add_epi32(dsthi,lefthi);
501
67.1k
              dstlo = _mm256_add_epi32(dstlo,w32);
502
67.1k
              dsthi = _mm256_add_epi32(dsthi,w32);
503
504
67.1k
              dstlo =  _mm256_srai_epi32(dstlo,6);
505
67.1k
              dsthi =  _mm256_srai_epi32(dsthi,6);
506
507
67.1k
              dstlo =  _mm256_max_epi32(vbdmin,dstlo);
508
67.1k
              dsthi =  _mm256_max_epi32(vbdmin,dsthi);
509
67.1k
              dstlo =  _mm256_min_epi32(vbdmax,dstlo);
510
67.1k
              dsthi =  _mm256_min_epi32(vbdmax,dsthi);
511
512
67.1k
              dstlo =  _mm256_packs_epi32(dstlo,dsthi);
513
67.1k
              dstlo =  _mm256_permute4x64_epi64 ( dstlo, ( 0 << 0 ) + ( 1 << 2 ) + ( 2 << 4 ) + ( 3 << 6 ) );
514
515
67.1k
              _mm256_storeu_si256( ( __m256i * )(pDst+y*dstStride+x), dstlo );
516
67.1k
            }
517
61.3k
            else
518
61.3k
            {
519
520
61.3k
              __m256i x16top = _mm256_loadu_si256((__m256i *) (ptrSrc+x+1)); // load top
521
61.3k
              __m256i x16dst = _mm256_loadu_si256((const __m256i *) (pDst+y*dstStride+x)); // load dst
522
523
524
61.3k
              tmplo = _mm256_mullo_epi16(x16top,wt16);    // wT*top
525
61.3k
              tmphi = _mm256_mulhi_epi16(x16top,wt16);    // wT*top
526
61.3k
              __m256i toplo = _mm256_unpacklo_epi16(tmplo,tmphi);
527
61.3k
              __m256i tophi = _mm256_unpackhi_epi16(tmplo,tmphi);
528
529
61.3k
              __m256i wX = _mm256_sub_epi16(w64,wt16);
530
61.3k
              tmplo = _mm256_mullo_epi16(x16dst,wX);    // 64-wL-wT*dst
531
61.3k
              tmphi = _mm256_mulhi_epi16(x16dst,wX);    // 64-wL-wT*dst
532
61.3k
              __m256i dstlo = _mm256_unpacklo_epi16(tmplo,tmphi);
533
61.3k
              __m256i dsthi = _mm256_unpackhi_epi16(tmplo,tmphi);
534
535
61.3k
              dstlo = _mm256_add_epi32(dstlo,toplo);
536
61.3k
              dsthi = _mm256_add_epi32(dsthi,tophi);
537
61.3k
              dstlo = _mm256_add_epi32(dstlo,w32);
538
61.3k
              dsthi = _mm256_add_epi32(dsthi,w32);
539
540
61.3k
              dstlo =  _mm256_srai_epi32(dstlo,6);
541
61.3k
              dsthi =  _mm256_srai_epi32(dsthi,6);
542
543
61.3k
              dstlo =  _mm256_max_epi32(vbdmin,dstlo);
544
61.3k
              dsthi =  _mm256_max_epi32(vbdmin,dsthi);
545
61.3k
              dstlo =  _mm256_min_epi32(vbdmax,dstlo);
546
61.3k
              dsthi =  _mm256_min_epi32(vbdmax,dsthi);
547
548
61.3k
              dstlo =  _mm256_packs_epi32(dstlo,dsthi);
549
61.3k
              dstlo =  _mm256_permute4x64_epi64 ( dstlo, ( 0 << 0 ) + ( 1 << 2 ) + ( 2 << 4 ) + ( 3 << 6 ) );
550
551
61.3k
              _mm256_storeu_si256( ( __m256i * )(pDst+y*dstStride+x), dstlo );
552
61.3k
            }
553
554
128k
          }  // for
555
67.1k
        }
556
134k
        else
557
134k
        { // wT =0
558
559
134k
          wl16=wl16start;
560
134k
          __m256i x16dst = _mm256_loadu_si256((const __m256i *) (pDst+y*dstStride)); // load dst
561
562
134k
          tmplo = _mm256_mullo_epi16(x16left,wl16);  //wL * left
563
134k
          tmphi = _mm256_mulhi_epi16(x16left,wl16);  //wL * left
564
134k
          __m256i leftlo = _mm256_unpacklo_epi16(tmplo,tmphi);
565
134k
          __m256i lefthi = _mm256_unpackhi_epi16(tmplo,tmphi);
566
567
134k
          __m256i wX = _mm256_sub_epi16(w64,wl16);
568
134k
          tmplo = _mm256_mullo_epi16(x16dst,wX);    // 64-wL-wT*dst
569
134k
          tmphi = _mm256_mulhi_epi16(x16dst,wX);    // 64-wL-wT*dst
570
134k
          __m256i dstlo = _mm256_unpacklo_epi16(tmplo,tmphi);
571
134k
          __m256i dsthi = _mm256_unpackhi_epi16(tmplo,tmphi);
572
573
134k
          dstlo = _mm256_add_epi32(dstlo,leftlo);
574
134k
          dsthi = _mm256_add_epi32(dsthi,lefthi);
575
134k
          dstlo = _mm256_add_epi32(dstlo,w32);
576
134k
          dsthi = _mm256_add_epi32(dsthi,w32);
577
578
134k
          dstlo =  _mm256_srai_epi32(dstlo,6);
579
134k
          dsthi =  _mm256_srai_epi32(dsthi,6);
580
581
134k
          dstlo =  _mm256_max_epi32(vbdmin,dstlo);
582
134k
          dsthi =  _mm256_max_epi32(vbdmin,dsthi);
583
134k
          dstlo =  _mm256_min_epi32(vbdmax,dstlo);
584
134k
          dsthi =  _mm256_min_epi32(vbdmax,dsthi);
585
586
134k
          dstlo =  _mm256_packs_epi32(dstlo,dsthi);
587
134k
          dstlo =  _mm256_permute4x64_epi64 ( dstlo, ( 0 << 0 ) + ( 1 << 2 ) + ( 2 << 4 ) + ( 3 << 6 ) );
588
589
134k
          _mm256_storeu_si256( ( __m256i * )(pDst+y*dstStride), dstlo );
590
134k
        }
591
201k
      }
592
8.55k
    }
593
8.55k
  }
594
0
  else
595
0
#endif
596
0
  {
597
0
    __m128i tmplo8,tmphi8;
598
0
    __m128i w64_8 = _mm_set_epi16(64,64,64,64,64,64,64,64);
599
0
    __m128i w32_8 = _mm_set_epi32(32,32,32,32);
600
0
    __m128i vbdmin8   = _mm_set1_epi32( clpRng.min() );
601
0
    __m128i vbdmax8   = _mm_set1_epi32( clpRng.max() );
602
0
    __m128i wl8start,wl8start2;
603
0
    CHECK(scale < 0 || scale > 2, "PDPC: scale < 0 || scale > 2");
604
605
0
    if (scale==0)
606
0
    {
607
0
      wl8start = _mm_set_epi16(0,0,0,0,0,2,8,32);
608
0
      wl8start2 = _mm_set_epi16(0,0,0,0,0,0,0,0);
609
0
    }
610
0
    else if (scale==1)
611
0
    {
612
0
      wl8start = _mm_set_epi16(0,0,1,2,4,8,16,32);
613
0
      wl8start2 = _mm_set_epi16(0,0,0,0,0,0,0,0);
614
0
    }
615
0
    else if (scale==2)
616
0
    {
617
0
      wl8start = _mm_set_epi16(4,4,8,8,16,16,32,32);
618
0
      wl8start2 = _mm_set_epi16(0,0,0,0,1,1,2,2);
619
0
    }
620
0
    if (uiDirMode == PLANAR_IDX || uiDirMode == DC_IDX )
621
0
    {
622
0
      __m128i wl8 = wl8start;
623
0
      for (int y = 0; y < iHeight; y++)
624
0
      {
625
0
        int wT = 32 >> std::min(31, ((y << 1) >> scale));
626
627
0
        __m128i wt8 = _mm_set_epi16(wT,wT,wT,wT,wT,wT,wT,wT);
628
 //       __m128i x8left = _mm_broadcastw_epi16(_mm_loadu_si128 ((__m128i const *) (ptrSrc+((y+1)*srcStride))));
629
630
0
        __m128i x8left = _mm_loadu_si128 ((__m128i const *) (ptrSrc+((y+1)*srcStride)));
631
0
        x8left =_mm_shufflelo_epi16(x8left,0);
632
0
        x8left =_mm_shuffle_epi32(x8left,0);
633
634
635
0
        if (wT)
636
0
        {
637
0
          for (int x = 0; x < iWidth; x+=8)
638
0
          {
639
0
            __m128i x8top = _mm_loadu_si128( (__m128i*) ( ptrSrc + x + 1 ) );   // load top
640
0
            __m128i x8dst = _mm_setzero_si128();
641
0
            if( iWidth >= 8 )
642
0
              x8dst = _mm_loadu_si128( (const __m128i*) ( pDst + y * dstStride + x ) );   // load dst
643
0
            else if( iWidth == 4 )
644
0
              x8dst = _mm_loadu_si64( (const __m128i*) ( pDst + y * dstStride + x ) );    // load dst
645
0
            else if( iWidth == 2 )
646
0
              x8dst = _mm_loadu_si32( (const __m128i*) ( pDst + y * dstStride + x ) );    // load dst
647
0
            else
648
0
            {
649
0
              CHECKD( true, "wrong iWidth in IntraPredSampleFilter_SIMD, only implemented for >=8, ==4, ==2" );
650
0
            }
651
652
0
            if (x>8)
653
0
            {
654
0
              tmplo8 = _mm_mullo_epi16(x8top,wt8);    // wT*top
655
0
              tmphi8 = _mm_mulhi_epi16(x8top,wt8);    // wT*top
656
0
              __m128i toplo8 = _mm_unpacklo_epi16(tmplo8,tmphi8);
657
0
              __m128i tophi8 = _mm_unpackhi_epi16(tmplo8,tmphi8);
658
659
660
0
              __m128i wX = _mm_sub_epi16(w64_8,wt8);
661
0
              tmplo8 = _mm_mullo_epi16(x8dst,wX);    // 64-wL-wT*dst
662
0
              tmphi8 = _mm_mulhi_epi16(x8dst,wX);    // 64-wL-wT*dst
663
0
              __m128i dstlo8 = _mm_unpacklo_epi16(tmplo8,tmphi8);
664
0
              __m128i dsthi8 = _mm_unpackhi_epi16(tmplo8,tmphi8);
665
666
0
              dstlo8 = _mm_add_epi32(dstlo8,toplo8);
667
0
              dsthi8 = _mm_add_epi32(dsthi8,tophi8);
668
0
              dstlo8 = _mm_add_epi32(dstlo8,w32_8);
669
0
              dsthi8 = _mm_add_epi32(dsthi8,w32_8);
670
671
0
              dstlo8 =  _mm_srai_epi32(dstlo8,6);
672
0
              dsthi8 =  _mm_srai_epi32(dsthi8,6);
673
674
0
              dstlo8 =  _mm_max_epi32(vbdmin8,dstlo8);
675
0
              dsthi8 =  _mm_max_epi32(vbdmin8,dsthi8);
676
0
              dstlo8 =  _mm_min_epi32(vbdmax8,dstlo8);
677
0
              dsthi8 =  _mm_min_epi32(vbdmax8,dsthi8);
678
679
0
              x8dst = _mm_packs_epi32(dstlo8,dsthi8);
680
0
            }
681
0
            else // x<=8
682
0
            {
683
0
              if (x==0)
684
0
                wl8=wl8start;
685
0
              else if (x==8)
686
0
                wl8=wl8start2;
687
688
0
              tmplo8 = _mm_mullo_epi16(x8left,wl8);  //wL * left
689
0
              tmphi8 = _mm_mulhi_epi16(x8left,wl8);  //wL * left
690
0
              __m128i leftlo8 = _mm_unpacklo_epi16(tmplo8,tmphi8);
691
0
              __m128i lefthi8 = _mm_unpackhi_epi16(tmplo8,tmphi8);
692
693
0
              tmplo8 = _mm_mullo_epi16(x8top,wt8);    // wT*top
694
0
              tmphi8 = _mm_mulhi_epi16(x8top,wt8);    // wT*top
695
0
              __m128i toplo8 = _mm_unpacklo_epi16(tmplo8,tmphi8);
696
0
              __m128i tophi8 = _mm_unpackhi_epi16(tmplo8,tmphi8);
697
698
0
              __m128i wX = _mm_sub_epi16(w64_8,wl8);
699
0
              wX = _mm_sub_epi16(wX,wt8);            // 64-wL-wT
700
0
              tmplo8 = _mm_mullo_epi16(x8dst,wX);    // 64-wL-wT*dst
701
0
              tmphi8 = _mm_mulhi_epi16(x8dst,wX);    // 64-wL-wT*dst
702
0
              __m128i dstlo8 = _mm_unpacklo_epi16(tmplo8,tmphi8);
703
0
              __m128i dsthi8 = _mm_unpackhi_epi16(tmplo8,tmphi8);
704
705
0
              dstlo8 = _mm_add_epi32(dstlo8,toplo8);
706
0
              dsthi8 = _mm_add_epi32(dsthi8,tophi8);
707
0
              dstlo8 = _mm_add_epi32(dstlo8,leftlo8);
708
0
              dsthi8 = _mm_add_epi32(dsthi8,lefthi8);
709
0
              dstlo8 = _mm_add_epi32(dstlo8,w32_8);
710
0
              dsthi8 = _mm_add_epi32(dsthi8,w32_8);
711
712
0
              dstlo8 =  _mm_srai_epi32(dstlo8,6);
713
0
              dsthi8 =  _mm_srai_epi32(dsthi8,6);
714
715
0
              dstlo8 =  _mm_max_epi32(vbdmin8,dstlo8);
716
0
              dsthi8 =  _mm_max_epi32(vbdmin8,dsthi8);
717
0
              dstlo8 =  _mm_min_epi32(vbdmax8,dstlo8);
718
0
              dsthi8 =  _mm_min_epi32(vbdmax8,dsthi8);
719
720
0
              x8dst = _mm_packs_epi32(dstlo8,dsthi8);
721
0
            }
722
723
0
            if( iWidth >= 8 )
724
0
              _mm_storeu_si128( (__m128i*) ( pDst + y * dstStride + x ), x8dst );
725
0
            else if( iWidth == 4 )
726
0
              _mm_storeu_si64( (__m128i*) ( pDst + y * dstStride + x ), x8dst );
727
0
            else if( iWidth == 2 )
728
0
              _mm_storeu_si32( (__m128i*) ( pDst + y * dstStride + x ), x8dst );
729
0
          }
730
0
        }
731
0
        else //wT =0
732
0
        {
733
0
          for( int x = 0; x < std::min( iWidth, 16 ); x += 8 )
734
0
          {
735
0
            if( x == 0 )
736
0
              wl8 = wl8start;
737
0
            else if( x == 8 )
738
0
              wl8 = wl8start2;
739
740
0
            __m128i x8dst;
741
0
            if( iWidth >= 8 )
742
0
              x8dst = _mm_loadu_si128( (const __m128i*)( pDst + y * dstStride + x ) );   // load dst
743
0
            else if( iWidth == 4 )
744
0
              x8dst = _mm_loadu_si64( (const __m128i*)( pDst + y * dstStride + x ) );    // load dst
745
0
            else if( iWidth == 2 )
746
0
              x8dst = _mm_loadu_si32( (const __m128i*)( pDst + y * dstStride + x ) );    // load dst
747
0
            else
748
0
              CHECK( true, "wrong iWidth in IntraPredSampleFilter_SIMD, only implemented for >=8, ==4, ==2" );
749
750
751
0
            tmplo8          = _mm_mullo_epi16( x8left, wl8 );   // wL * left
752
0
            tmphi8          = _mm_mulhi_epi16( x8left, wl8 );   // wL * left
753
0
            __m128i leftlo8 = _mm_unpacklo_epi16( tmplo8, tmphi8 );
754
0
            __m128i lefthi8 = _mm_unpackhi_epi16( tmplo8, tmphi8 );
755
756
0
            __m128i wX     = _mm_sub_epi16( w64_8, wl8 );
757
0
            tmplo8         = _mm_mullo_epi16( x8dst, wX );   // 64-wL-wT*dst
758
0
            tmphi8         = _mm_mulhi_epi16( x8dst, wX );   // 64-wL-wT*dst
759
0
            __m128i dstlo8 = _mm_unpacklo_epi16( tmplo8, tmphi8 );
760
0
            __m128i dsthi8 = _mm_unpackhi_epi16( tmplo8, tmphi8 );
761
762
0
            dstlo8 = _mm_add_epi32( dstlo8, leftlo8 );
763
0
            dsthi8 = _mm_add_epi32( dsthi8, lefthi8 );
764
0
            dstlo8 = _mm_add_epi32( dstlo8, w32_8 );
765
0
            dsthi8 = _mm_add_epi32( dsthi8, w32_8 );
766
767
0
            dstlo8 = _mm_srai_epi32( dstlo8, 6 );
768
0
            dsthi8 = _mm_srai_epi32( dsthi8, 6 );
769
770
0
            dstlo8 = _mm_max_epi32( vbdmin8, dstlo8 );
771
0
            dsthi8 = _mm_max_epi32( vbdmin8, dsthi8 );
772
0
            dstlo8 = _mm_min_epi32( vbdmax8, dstlo8 );
773
0
            dsthi8 = _mm_min_epi32( vbdmax8, dsthi8 );
774
775
0
            dstlo8 = _mm_packs_epi32( dstlo8, dsthi8 );
776
777
0
            if( iWidth >= 8 )
778
0
              _mm_storeu_si128( (__m128i*)( pDst + y * dstStride + x ), dstlo8 );
779
0
            else if( iWidth == 4 )
780
0
              _mm_storeu_si64( (__m128i*)( pDst + y * dstStride + x ), ( dstlo8 ) );
781
0
            else if( iWidth == 2 )
782
0
              _mm_storeu_si32( (__m128i*)( pDst + y * dstStride + x ), dstlo8 );
783
0
          }
784
0
        }
785
0
      }
786
0
    }
787
0
  }
788
789
790
8.55k
#if USE_AVX2
791
8.55k
  _mm256_zeroupper();
792
8.55k
#endif
793
8.55k
}
794
795
/** Function for deriving planar intra prediction. This function derives the prediction samples for planar mode (intra coding).
 *  Reads reference samples from pSrc and writes a width x height block of predicted samples to pDst,
 *  handling eight columns per inner-loop iteration with 128-bit integer intrinsics.
 *  Neither the sps argument nor the vext template parameter is referenced in the body.
796
 */
797
template< X86_VEXT vext>
798
void xPredIntraPlanar_SIMD( const CPelBuf &pSrc, PelBuf &pDst, const SPS& sps )
799
8.92k
{
800
801
8.92k
  const uint32_t width      = pDst.width;
802
8.92k
  const uint32_t height     = pDst.height;
803
8.92k
  const uint32_t log2W      = getLog2( width );
804
8.92k
  const uint32_t log2H      = getLog2( height );
805
8.92k
  const uint32_t finalShift = 1 + log2W + log2H;   // total right shift applied to the weighted sum
806
8.92k
  const uint32_t offset     = 1 << (log2W + log2H);   // rounding term: half of ( 1 << finalShift )
807
8.92k
  const ptrdiff_t stride    = pDst.stride;
808
8.92k
  Pel*            pred      = pDst.buf;
809
810
8.92k
  const Pel *ptrSrc =pSrc.buf;
811
812
8.92k
  Pel tmp;
813
8.92k
  int topRight = pSrc.at( width + 1, 0 );   // reference sample at column width+1 of row 0
814
815
8.92k
  tmp=pSrc.at( 0, height+1 );   // reference sample at column 0 of row height+1 (used as bottomLeft below)
816
8.92k
  const __m128i bottomLeft16 = _mm_set_epi16(tmp,tmp,tmp,tmp,tmp,tmp,tmp,tmp);
817
8.92k
  const __m128i zero = _mm_setzero_si128();
818
8.92k
  const __m128i eight = _mm_set_epi16(8,8,8,8,8,8,8,8);   // step added to the (x+1) weights after each group of 8 columns
819
8.92k
  const __m128i offset32 = _mm_set_epi32(offset,offset,offset,offset);
820
8.92k
  const __m128i vLog2W = _mm_cvtsi32_si128(log2W);   // shift counts are passed to _mm_sll_epi32/_mm_srl_epi32 in a register
821
8.92k
  const __m128i vLog2H = _mm_cvtsi32_si128(log2H);
822
8.92k
  const __m128i vFinalShift = _mm_cvtsi32_si128(finalShift);
823
824
174k
  for( int y = 0; y < height; y++)
825
165k
  {
826
165k
    int leftColumn  = pSrc.at( 0, y + 1 );   // reference sample of this row, taken from column 0
827
165k
    int rightColumn = topRight - leftColumn;
828
165k
    leftColumn = leftColumn * (1 << log2W);   // pre-scale by the block width
829
165k
    const __m128i leftColumn32 = _mm_set_epi32(leftColumn,leftColumn,leftColumn,leftColumn);
830
165k
    const __m128i rightcolumn16 = _mm_set_epi16(rightColumn,rightColumn,rightColumn,rightColumn,rightColumn,rightColumn,rightColumn,rightColumn);
831
165k
    const __m128i y16 = _mm_set_epi16(y+1,y+1,y+1,y+1,y+1,y+1,y+1,y+1);
832
165k
          __m128i x16 = _mm_set_epi16(8,7,6,5,4,3,2,1);   // (x+1) for x = 0..7; element 0 holds 1
833
834
706k
    for( int x = 0; x < width; x+=8 )
835
541k
    {
836
      //topRow[x] = pSrc.at( x + 1, 0 );   (16-byte load starting at ptrSrc[x+1]; issued even when width < 8)
837
541k
      __m128i topRow16 = _mm_loadu_si128 ((__m128i const *) (ptrSrc+(x+1)));
838
      //bottomRow[x] = bottomLeft - topRow[x];
839
541k
      __m128i bottomRow16L = _mm_sub_epi16(bottomLeft16,topRow16);
840
      // (y+1)*bottomRow[x]   (high and low halves of the signed 16x16 products are interleaved into 32-bit values)
841
541k
      __m128i  tmpH = _mm_mulhi_epi16(bottomRow16L,y16);
842
541k
      __m128i tmpL = _mm_mullo_epi16(bottomRow16L,y16);
843
541k
      bottomRow16L = _mm_unpacklo_epi16(tmpL,tmpH);
844
541k
      __m128i bottomRow16H = _mm_unpackhi_epi16(tmpL,tmpH);
845
846
      // topRow[x] << log2H   (top row zero-extended to 32 bit first)
847
541k
      __m128i topRow32L = _mm_unpacklo_epi16(topRow16,zero);
848
541k
      __m128i topRow32H = _mm_unpackhi_epi16(topRow16,zero);
849
541k
      topRow32L = _mm_sll_epi32(topRow32L,vLog2H);
850
541k
      topRow32H = _mm_sll_epi32(topRow32H,vLog2H);
851
      // vertPred    = (topRow[x] << log2H) + (y+1)*bottomRow[x];
852
541k
      topRow32L = _mm_add_epi32(topRow32L,bottomRow16L);
853
541k
      topRow32H = _mm_add_epi32(topRow32H,bottomRow16H);
854
      // horPred = leftColumn + (x+1)*rightColumn;   (leftColumn was already scaled by 1 << log2W above)
855
541k
      tmpL = _mm_mullo_epi16(rightcolumn16,x16);
856
541k
      tmpH = _mm_mulhi_epi16(rightcolumn16,x16);
857
541k
      __m128i horpred32L = _mm_unpacklo_epi16(tmpL,tmpH);
858
541k
      __m128i horpred32H = _mm_unpackhi_epi16(tmpL,tmpH);
859
541k
      horpred32L = _mm_add_epi32(leftColumn32,horpred32L);
860
541k
      horpred32H = _mm_add_epi32(leftColumn32,horpred32H);
861
      // pred[x]      = ( ( horPred << log2H ) + ( vertPred << log2W ) + offset ) >> finalShift;
862
541k
      horpred32L = _mm_sll_epi32(horpred32L,vLog2H);
863
541k
      horpred32H = _mm_sll_epi32(horpred32H,vLog2H);
864
541k
      topRow32L = _mm_sll_epi32(topRow32L,vLog2W);
865
541k
      topRow32H = _mm_sll_epi32(topRow32H,vLog2W);
866
541k
      horpred32L = _mm_add_epi32(horpred32L,topRow32L);
867
541k
      horpred32H = _mm_add_epi32(horpred32H,topRow32H);
868
541k
      horpred32L = _mm_add_epi32(horpred32L,offset32);
869
541k
      horpred32H = _mm_add_epi32(horpred32H,offset32);
870
541k
      horpred32L = _mm_srl_epi32(horpred32L,vFinalShift);
871
541k
      horpred32H = _mm_srl_epi32(horpred32H,vFinalShift);
872
873
541k
      tmpL = _mm_packs_epi32(horpred32L,horpred32H);   // back to 8 x 16 bit (signed saturation)
874
541k
      if (width>=8)   // store all 8 samples
875
523k
        _mm_storeu_si128(( __m128i * )(pred+y*stride+x), (tmpL) );
876
17.9k
      else if (width==4)   // store the low 4 samples (64 bit)
877
17.9k
        _mm_storeu_si64(( __m128i * )(pred+y*stride+x), (tmpL) );
878
0
      else if (width==2)   // store the low 2 samples (32 bit)
879
0
        _mm_storeu_si32(( __m128i * )(pred+y*stride+x),(tmpL) );
880
0
      else   // any other width: store only the first sample
881
0
        pred[y*stride+x]=(Pel)_mm_extract_epi16 (tmpL,0);
882
883
541k
      x16 = _mm_add_epi16(x16,eight);   // advance the (x+1) weights to the next 8 columns
884
541k
    }
885
165k
  }
886
8.92k
}
Unexecuted instantiation: void vvdec::xPredIntraPlanar_SIMD<(vvdec::x86_simd::X86_VEXT)1>(vvdec::AreaBuf<short const> const&, vvdec::AreaBuf<short>&, vvdec::SPS const&)
void vvdec::xPredIntraPlanar_SIMD<(vvdec::x86_simd::X86_VEXT)4>(vvdec::AreaBuf<short const> const&, vvdec::AreaBuf<short>&, vvdec::SPS const&)
Line
Count
Source
799
8.92k
{
800
801
8.92k
  const uint32_t width      = pDst.width;
802
8.92k
  const uint32_t height     = pDst.height;
803
8.92k
  const uint32_t log2W      = getLog2( width );
804
8.92k
  const uint32_t log2H      = getLog2( height );
805
8.92k
  const uint32_t finalShift = 1 + log2W + log2H;
806
8.92k
  const uint32_t offset     = 1 << (log2W + log2H);
807
8.92k
  const ptrdiff_t stride    = pDst.stride;
808
8.92k
  Pel*            pred      = pDst.buf;
809
810
8.92k
  const Pel *ptrSrc =pSrc.buf;
811
812
8.92k
  Pel tmp;
813
8.92k
  int topRight = pSrc.at( width + 1, 0 );
814
815
8.92k
  tmp=pSrc.at( 0, height+1 );
816
8.92k
  const __m128i bottomLeft16 = _mm_set_epi16(tmp,tmp,tmp,tmp,tmp,tmp,tmp,tmp);
817
8.92k
  const __m128i zero = _mm_setzero_si128();
818
8.92k
  const __m128i eight = _mm_set_epi16(8,8,8,8,8,8,8,8);
819
8.92k
  const __m128i offset32 = _mm_set_epi32(offset,offset,offset,offset);
820
8.92k
  const __m128i vLog2W = _mm_cvtsi32_si128(log2W);
821
8.92k
  const __m128i vLog2H = _mm_cvtsi32_si128(log2H);
822
8.92k
  const __m128i vFinalShift = _mm_cvtsi32_si128(finalShift);
823
824
174k
  for( int y = 0; y < height; y++)
825
165k
  {
826
165k
    int leftColumn  = pSrc.at( 0, y + 1 );
827
165k
    int rightColumn = topRight - leftColumn;
828
165k
    leftColumn = leftColumn * (1 << log2W);
829
165k
    const __m128i leftColumn32 = _mm_set_epi32(leftColumn,leftColumn,leftColumn,leftColumn);
830
165k
    const __m128i rightcolumn16 = _mm_set_epi16(rightColumn,rightColumn,rightColumn,rightColumn,rightColumn,rightColumn,rightColumn,rightColumn);
831
165k
    const __m128i y16 = _mm_set_epi16(y+1,y+1,y+1,y+1,y+1,y+1,y+1,y+1);
832
165k
          __m128i x16 = _mm_set_epi16(8,7,6,5,4,3,2,1);
833
834
706k
    for( int x = 0; x < width; x+=8 )
835
541k
    {
836
      //topRow[x] = pSrc.at( x + 1, 0 );
837
541k
      __m128i topRow16 = _mm_loadu_si128 ((__m128i const *) (ptrSrc+(x+1)));
838
      //bottomRow[x] = bottomLeft - topRow[x];
839
541k
      __m128i bottomRow16L = _mm_sub_epi16(bottomLeft16,topRow16);
840
      // (y+1)*bottomRow[x]
841
541k
      __m128i  tmpH = _mm_mulhi_epi16(bottomRow16L,y16);
842
541k
      __m128i tmpL = _mm_mullo_epi16(bottomRow16L,y16);
843
541k
      bottomRow16L = _mm_unpacklo_epi16(tmpL,tmpH);
844
541k
      __m128i bottomRow16H = _mm_unpackhi_epi16(tmpL,tmpH);
845
846
      // (topRow[x] topRow16H<< log2H)
847
541k
      __m128i topRow32L = _mm_unpacklo_epi16(topRow16,zero);
848
541k
      __m128i topRow32H = _mm_unpackhi_epi16(topRow16,zero);
849
541k
      topRow32L = _mm_sll_epi32(topRow32L,vLog2H);
850
541k
      topRow32H = _mm_sll_epi32(topRow32H,vLog2H);
851
      // vertPred    = (topRow[x] << log2H) + (y+1)*bottomRow[x];
852
541k
      topRow32L = _mm_add_epi32(topRow32L,bottomRow16L);
853
541k
      topRow32H = _mm_add_epi32(topRow32H,bottomRow16H);
854
      // horPred = leftColumn + (x+1)*rightColumn;
855
541k
      tmpL = _mm_mullo_epi16(rightcolumn16,x16);
856
541k
      tmpH = _mm_mulhi_epi16(rightcolumn16,x16);
857
541k
      __m128i horpred32L = _mm_unpacklo_epi16(tmpL,tmpH);
858
541k
      __m128i horpred32H = _mm_unpackhi_epi16(tmpL,tmpH);
859
541k
      horpred32L = _mm_add_epi32(leftColumn32,horpred32L);
860
541k
      horpred32H = _mm_add_epi32(leftColumn32,horpred32H);
861
      // pred[x]      = ( ( horPred << log2H ) + ( vertPred << log2W ) + offset ) >> finalShift;
862
541k
      horpred32L = _mm_sll_epi32(horpred32L,vLog2H);
863
541k
      horpred32H = _mm_sll_epi32(horpred32H,vLog2H);
864
541k
      topRow32L = _mm_sll_epi32(topRow32L,vLog2W);
865
541k
      topRow32H = _mm_sll_epi32(topRow32H,vLog2W);
866
541k
      horpred32L = _mm_add_epi32(horpred32L,topRow32L);
867
541k
      horpred32H = _mm_add_epi32(horpred32H,topRow32H);
868
541k
      horpred32L = _mm_add_epi32(horpred32L,offset32);
869
541k
      horpred32H = _mm_add_epi32(horpred32H,offset32);
870
541k
      horpred32L = _mm_srl_epi32(horpred32L,vFinalShift);
871
541k
      horpred32H = _mm_srl_epi32(horpred32H,vFinalShift);
872
873
541k
      tmpL = _mm_packs_epi32(horpred32L,horpred32H);
874
541k
      if (width>=8)
875
523k
        _mm_storeu_si128(( __m128i * )(pred+y*stride+x), (tmpL) );
876
17.9k
      else if (width==4)
877
17.9k
        _mm_storeu_si64(( __m128i * )(pred+y*stride+x), (tmpL) );
878
0
      else if (width==2)
879
0
        _mm_storeu_si32(( __m128i * )(pred+y*stride+x),(tmpL) );
880
0
      else
881
0
        pred[y*stride+x]=(Pel)_mm_extract_epi16 (tmpL,0);
882
883
541k
      x16 = _mm_add_epi16(x16,eight);
884
541k
    }
885
165k
  }
886
8.92k
}
887
888
// Writes a width x height block to pDst0 in which every output sample is computed from two
// consecutive source rows of pRecSrc0 as
//   ( s0[2x-1] + 2*s0[2x] + s0[2x+1] + s1[2x-1] + 2*s1[2x] + s1[2x+1] + 4 ) >> 3
// where s0 is the current source row and s1 the row iRecStride samples further on.
// Per output row the destination advances by iDstStride and the source by 2*iRecStride.
// Three code paths: 256-bit (only when USE_AVX2 is defined) for widths that are a multiple of 16,
// 128-bit for widths that are a multiple of 8, and a narrow path for all remaining widths.
// Note that every path reads one sample to the left of the row start (index -1).
template< X86_VEXT vext>
889
void GetLumaRecPixel420SIMD (const int width,const int height, const Pel* pRecSrc0,const ptrdiff_t iRecStride,Pel* pDst0,const ptrdiff_t iDstStride)
890
10.7k
{
891
#ifdef USE_AVX2
892
10.7k
  if( ( width & 15 ) == 0 )    // width is a multiple of 16
893
//  if( 0 )    // width>=16
894
5.35k
  {
895
5.35k
    __m256i vzero = _mm256_set1_epi8(0);
896
5.35k
    __m256i vfour = _mm256_set1_epi32(4);   // rounding term for the final >> 3
897
94.2k
    for( int y = 0; y < height; y++ )
898
88.9k
    {
899
236k
      for( int x = 0; x < width; x += 16 )
900
147k
      {
901
147k
        int x2=x<<1;   // source column index: two source samples per output sample
902
147k
        __m256i vsrc_l = _mm256_loadu_si256((__m256i*)&pRecSrc0[x2-1]);      // source samples x2-1 .. x2+14
903
147k
        __m256i vsrc = _mm256_loadu_si256((__m256i*)&pRecSrc0[x2]);          // source samples x2 .. x2+15
904
905
        __m256i vsrc01 = _mm256_blend_epi16(vzero,vsrc_l,0x55);      // -1 1 3 5 ...  zero-extended to 32 bit
906
        __m256i vsrc0 = _mm256_blend_epi16(vzero,vsrc,0x55);      // 0 2 4 6 ...  zero-extended to 32 bit
907
        __m256i vsrc10 = _mm256_blend_epi16(vzero,vsrc,0xAA);      // 1 3 5 7 ...  still in the upper 16 bit of each 32-bit element
908
        vsrc10 = _mm256_srli_epi32(vsrc10,16);      // 1 3 5 7 ...  32 bit
909
        vsrc0 =  _mm256_slli_epi32 (vsrc0,1);      // 0 2 4 6 ...  *2
910
911
        vsrc0 =  _mm256_add_epi32(vsrc0,vsrc10);
912
        __m256i vdst0 = _mm256_add_epi32(vsrc0,vsrc01);   // dst 0 .. 7 as 32 bit, lower row still missing
913
914
        vsrc_l = _mm256_loadu_si256((__m256i*)&pRecSrc0[x2 +15]);      // source samples x2+15 .. x2+30
915
        vsrc = _mm256_loadu_si256((__m256i*)&pRecSrc0[x2 + 16 ]);          // source samples x2+16 .. x2+31
916
917
        x2+= (int)iRecStride;   // from here on x2 indexes the next source row
918
919
        vsrc01 = _mm256_blend_epi16(vzero,vsrc_l,0x55);
920
        vsrc0 = _mm256_blend_epi16(vzero,vsrc,0x55);
921
        vsrc10 = _mm256_blend_epi16(vzero,vsrc,0xAA);
922
        vsrc10 = _mm256_srli_epi32(vsrc10,16);      // odd samples, 32 bit
923
        vsrc0 =  _mm256_slli_epi32 (vsrc0,1);      // even samples *2
924
925
        vsrc0 =  _mm256_add_epi32(vsrc0,vsrc10);
926
        __m256i vdst1 = _mm256_add_epi32(vsrc0,vsrc01);   // dst 8 .. 15 as 32 bit, lower row still missing
927
928
        // now add the next row
929
        vsrc_l = _mm256_loadu_si256((__m256i*)&pRecSrc0[x2-1]);      // next row: samples x2-1 .. x2+14
930
        vsrc = _mm256_loadu_si256((__m256i*)&pRecSrc0[x2]);          // next row: samples x2 .. x2+15
931
932
        vsrc01 = _mm256_blend_epi16(vzero,vsrc_l,0x55);      // -1 1 3 5 ...  32 bit
933
        vsrc0 = _mm256_blend_epi16(vzero,vsrc,0x55);      // 0 2 4 6 ...  32 bit
934
        vsrc10 = _mm256_blend_epi16(vzero,vsrc,0xAA);      // 1 3 5 7 ...  upper 16 bit of each 32-bit element
935
        vsrc10 = _mm256_srli_epi32(vsrc10,16);      // 1 3 5 7 ...  32 bit
936
        vsrc0 =  _mm256_slli_epi32 (vsrc0,1);      // 0 2 4 6 ...  *2
937
938
        vsrc0 =  _mm256_add_epi32(vsrc0,vsrc10);
939
        __m256i vdst01 = _mm256_add_epi32(vsrc0,vsrc01);   // dst 0 .. 7 as 32 bit, lower row
940
941
        vsrc_l = _mm256_loadu_si256((__m256i*)&pRecSrc0[x2 + 15]);      // next row: samples x2+15 .. x2+30
942
        vsrc = _mm256_loadu_si256((__m256i*)&pRecSrc0[x2 + 16 ]);          // next row: samples x2+16 .. x2+31
943
944
        vsrc01 = _mm256_blend_epi16(vzero,vsrc_l,0x55);
945
        vsrc0 = _mm256_blend_epi16(vzero,vsrc,0x55);
946
        vsrc10 = _mm256_blend_epi16(vzero,vsrc,0xAA);
947
        vsrc10 = _mm256_srli_epi32(vsrc10,16);      // odd samples, 32 bit
948
        vsrc0 =  _mm256_slli_epi32 (vsrc0,1);      // even samples *2
949
950
        vsrc0 =  _mm256_add_epi32(vsrc0,vsrc10);
951
        __m256i vdst11 = _mm256_add_epi32(vsrc0,vsrc01);   // dst 8 .. 15 as 32 bit, lower row
952
953
        vdst0 = _mm256_add_epi32(vdst0,vdst01);
954
        vdst1 = _mm256_add_epi32(vdst1,vdst11);
955
        vdst0 =  _mm256_add_epi32(vdst0,vfour);
956
        vdst1 =  _mm256_add_epi32(vdst1,vfour);
957
        vdst0 = _mm256_srli_epi32(vdst0,3);
958
        vdst1 = _mm256_srli_epi32(vdst1,3);
959
        vdst0 = _mm256_packus_epi32 (vdst0,vdst1);   // 16 bit; packing works per 128-bit half
960
        vdst0 = _mm256_permute4x64_epi64(vdst0,0xd8);   // swap the middle 64-bit quarters to restore sample order
961
962
147k
        _mm256_storeu_si256((__m256i*)&pDst0[x], vdst0);
963
        //        _mm_storeu_si128((__m128i*)&pDstTmp[x], vdst0);
964
147k
      }
965
88.9k
      pDst0 += iDstStride;
966
88.9k
      pRecSrc0 += (iRecStride<<1);   // two source rows consumed per output row
967
88.9k
    }
968
5.35k
  }
969
5.36k
  else
970
5.36k
#endif
971
5.36k
    if( ( width & 7 ) == 0 )    // width is a multiple of 8
972
3.43k
    {
973
3.43k
      __m128i vzero = _mm_set1_epi8(0);
974
3.43k
      __m128i vfour = _mm_set1_epi32(4);   // rounding term for the final >> 3
975
976
977
31.4k
      for( int y = 0; y < height; y++ )
978
28.0k
      {
979
980
56.0k
        for( int x = 0; x < width; x += 8 )
981
28.0k
        {
982
28.0k
          int x2=x<<1;   // source column index: two source samples per output sample
983
28.0k
          __m128i vsrc_l = _mm_loadu_si128((__m128i*)&pRecSrc0[x2-1]);      // -1 0 1 2 3 4 5 6
984
28.0k
          __m128i vsrc = _mm_loadu_si128((__m128i*)&pRecSrc0[x2]);          // 0 1 2 3 4 5 6 7
985
986
28.0k
          __m128i vsrc01 = _mm_blend_epi16(vzero,vsrc_l,0x55);      // -1 1 3 5  32 Bit
987
28.0k
          __m128i vsrc0 = _mm_blend_epi16(vzero,vsrc,0x55);      // 0 2 4 6  32 Bit
988
28.0k
          __m128i vsrc10 = _mm_blend_epi16(vzero,vsrc,0xAA);      // 1 3 5 7 in the upper 16 bit of each 32-bit element
989
28.0k
          vsrc10 = _mm_srli_epi32(vsrc10,16);      // 1 3 5 7 32 Bit
990
28.0k
          vsrc0 =  _mm_slli_epi32 (vsrc0,1);      // 0  2 4 6 *2
991
992
28.0k
          vsrc0 =  _mm_add_epi32(vsrc0,vsrc10);
993
28.0k
          __m128i vdst0 = _mm_add_epi32(vsrc0,vsrc01);   // dst 0 1 2 3 32 Bit, lower row still missing
994
995
28.0k
          vsrc_l = _mm_loadu_si128((__m128i*)&pRecSrc0[x2 +7]);      // 7 8 9 10 11 12 13 14
996
28.0k
          vsrc = _mm_loadu_si128((__m128i*)&pRecSrc0[x2 + 8 ]);          // 8 9 10 11 12 13 14 15
997
998
28.0k
          x2+=(int)iRecStride;   // from here on x2 indexes the next source row
999
1000
28.0k
          vsrc01 = _mm_blend_epi16(vzero,vsrc_l,0x55);
1001
28.0k
          vsrc0 = _mm_blend_epi16(vzero,vsrc,0x55);
1002
28.0k
          vsrc10 = _mm_blend_epi16(vzero,vsrc,0xAA);
1003
28.0k
          vsrc10 = _mm_srli_epi32(vsrc10,16);      // 9 11 13 15 32 Bit
1004
28.0k
          vsrc0 =  _mm_slli_epi32 (vsrc0,1);      // 8 10 12 14 *2
1005
1006
28.0k
          vsrc0 =  _mm_add_epi32(vsrc0,vsrc10);
1007
28.0k
          __m128i vdst1 = _mm_add_epi32(vsrc0,vsrc01);   // dst 4 5 6 7 32 Bit, lower row still missing
1008
1009
          // now add the next row
1010
28.0k
          vsrc_l = _mm_loadu_si128((__m128i*)&pRecSrc0[x2-1]);      // -1 0 1 2 3 4 5 6
1011
28.0k
          vsrc = _mm_loadu_si128((__m128i*)&pRecSrc0[x2]);          // 0 1 2 3 4 5 6 7
1012
1013
28.0k
          vsrc01 = _mm_blend_epi16(vzero,vsrc_l,0x55);      // -1 1 3 5  32 Bit
1014
28.0k
          vsrc0 = _mm_blend_epi16(vzero,vsrc,0x55);      // 0 2 4 6  32 Bit
1015
28.0k
          vsrc10 = _mm_blend_epi16(vzero,vsrc,0xAA);      // 1 3 5 7 in the upper 16 bit of each 32-bit element
1016
28.0k
          vsrc10 = _mm_srli_epi32(vsrc10,16);      // 1 3 5 7 32 Bit
1017
28.0k
          vsrc0 =  _mm_slli_epi32 (vsrc0,1);      // 0  2 4 6 *2
1018
1019
28.0k
          vsrc0 =  _mm_add_epi32(vsrc0,vsrc10);
1020
28.0k
          __m128i vdst01 = _mm_add_epi32(vsrc0,vsrc01);   // dst 0 1 2 3 32 Bit, lower row
1021
1022
28.0k
          vsrc_l = _mm_loadu_si128((__m128i*)&pRecSrc0[x2 + 7]);      // 7 8 9 10 11 12 13 14
1023
28.0k
          vsrc = _mm_loadu_si128((__m128i*)&pRecSrc0[x2 + 8 ]);          // 8 9 10 11 12 13 14 15
1024
1025
28.0k
          vsrc01 = _mm_blend_epi16(vzero,vsrc_l,0x55);
1026
28.0k
          vsrc0 = _mm_blend_epi16(vzero,vsrc,0x55);
1027
28.0k
          vsrc10 = _mm_blend_epi16(vzero,vsrc,0xAA);
1028
28.0k
          vsrc10 = _mm_srli_epi32(vsrc10,16);      // 9 11 13 15 32 Bit
1029
28.0k
          vsrc0 =  _mm_slli_epi32 (vsrc0,1);      // 8 10 12 14 *2
1030
1031
28.0k
          vsrc0 =  _mm_add_epi32(vsrc0,vsrc10);
1032
28.0k
          __m128i vdst11 = _mm_add_epi32(vsrc0,vsrc01);   // dst 4 5 6 7 32 Bit, lower row
1033
1034
28.0k
          vdst0 = _mm_add_epi32(vdst0,vdst01);
1035
28.0k
          vdst1 = _mm_add_epi32(vdst1,vdst11);
1036
28.0k
          vdst0 =  _mm_add_epi32(vdst0,vfour);
1037
28.0k
          vdst1 =  _mm_add_epi32(vdst1,vfour);
1038
28.0k
          vdst0 = _mm_srli_epi32(vdst0,3);
1039
28.0k
          vdst1 = _mm_srli_epi32(vdst1,3);
1040
28.0k
          vdst0 = _mm_packus_epi32 (vdst0,vdst1);   // 16 bit
1041
1042
28.0k
          _mm_storeu_si128((__m128i*)&pDst0[x], vdst0);
1043
          //        _mm_storeu_si128((__m128i*)&pDstTmp[x], vdst0);
1044
28.0k
        }
1045
28.0k
        pDst0 += iDstStride;
1046
28.0k
        pRecSrc0 += (iRecStride<<1);   // two source rows consumed per output row
1047
28.0k
      }
1048
3.43k
    }
1049
1.92k
    else     // width is not a multiple of 8; at most 4 output samples per row are stored
1050
1.92k
    {
1051
1.92k
      __m128i vzero = _mm_set1_epi8(0);
1052
1.92k
      __m128i vfour = _mm_set1_epi32(4);   // rounding term for the final >> 3
1053
1054
17.8k
      for( int y = 0; y < height; y++ )
1055
15.9k
      {
1056
15.9k
        __m128i vsrc_l = _mm_loadu_si128((__m128i*)&pRecSrc0[-1]);      // -1 0 1 2 3 4 5 6
1057
15.9k
        __m128i vsrc = _mm_loadu_si128((__m128i*)&pRecSrc0[0]);          // 0 1 2 3 4 5 6 7
1058
1059
15.9k
        __m128i vsrc01 = _mm_blend_epi16(vzero,vsrc_l,0x55);      // -1 1 3 5  32 Bit
1060
15.9k
        __m128i vsrc0 = _mm_blend_epi16(vzero,vsrc,0x55);      // 0 2 4 6  32 Bit
1061
15.9k
        __m128i vsrc10 = _mm_blend_epi16(vzero,vsrc,0xAA);      // 1 3 5 7 in the upper 16 bit of each 32-bit element
1062
15.9k
        vsrc10 = _mm_srli_epi32(vsrc10,16);      // 1 3 5 7 32 Bit
1063
15.9k
        vsrc0 =  _mm_slli_epi32 (vsrc0,1);      // 0  2 4 6 *2
1064
1065
15.9k
        vsrc0 =  _mm_add_epi32(vsrc0,vsrc10);
1066
15.9k
        __m128i vdst0 = _mm_add_epi32(vsrc0,vsrc01);   // dst 0 1 2 3 32 Bit, lower row still missing
1067
1068
        // now add the next row
1069
15.9k
        vsrc_l = _mm_loadu_si128((__m128i*)&pRecSrc0[iRecStride-1]);      // -1 0 1 2 3 4 5 6 (next row)
1070
15.9k
        vsrc = _mm_loadu_si128((__m128i*)&pRecSrc0[iRecStride]);          // 0 1 2 3 4 5 6 7 (next row)
1071
1072
15.9k
        vsrc01 = _mm_blend_epi16(vzero,vsrc_l,0x55);      // -1 1 3 5  32 Bit
1073
15.9k
        vsrc0 = _mm_blend_epi16(vzero,vsrc,0x55);      // 0 2 4 6  32 Bit
1074
15.9k
        vsrc10 = _mm_blend_epi16(vzero,vsrc,0xAA);      // 1 3 5 7 in the upper 16 bit of each 32-bit element
1075
15.9k
        vsrc10 = _mm_srli_epi32(vsrc10,16);      // 1 3 5 7 32 Bit
1076
15.9k
        vsrc0 =  _mm_slli_epi32 (vsrc0,1);      // 0  2 4 6 *2
1077
1078
15.9k
        vsrc0 =  _mm_add_epi32(vsrc0,vsrc10);
1079
15.9k
        __m128i vdst01 = _mm_add_epi32(vsrc0,vsrc01);   // dst 0 1 2 3 32 Bit, lower row
1080
1081
1082
15.9k
        vdst0 = _mm_add_epi32(vdst0,vdst01);
1083
15.9k
        vdst0 =  _mm_add_epi32(vdst0,vfour);
1084
15.9k
        vdst0 = _mm_srli_epi32(vdst0,3);
1085
15.9k
        vdst0 = _mm_packus_epi32 (vdst0,vdst0);   // 16 bit
1086
1087
15.9k
        if (width==4)   // store the low 4 samples (64 bit)
1088
15.9k
          _mm_storeu_si64(( __m128i * )&pDst0[0], (vdst0) );
1089
0
        else if (width==2)   // store the low 2 samples (32 bit)
1090
0
          _mm_storeu_si32(( __m128i * )&pDst0[0], (vdst0) );
1091
0
        else   // any other width: store only the first sample
1092
0
        {
1093
0
          int tmp = _mm_cvtsi128_si32(vdst0);
1094
0
          pDst0[0] = (Pel) tmp;
1095
0
        }
1096
1097
15.9k
        pDst0 += iDstStride;
1098
15.9k
        pRecSrc0 += (iRecStride<<1);   // two source rows consumed per output row
1099
15.9k
      }
1100
1.92k
    }
1101
10.7k
}
Unexecuted instantiation: void vvdec::GetLumaRecPixel420SIMD<(vvdec::x86_simd::X86_VEXT)1>(int, int, short const*, long, short*, long)
// GetLumaRecPixel420SIMD: down-samples a sample plane by 2 horizontally and vertically.
//
// For every output sample x of an output row, the two source rows starting at pRecSrc0 and
// pRecSrc0 + iRecStride are each filtered around column 2*x with the taps [1 2 1]; both row
// sums are added, rounded with +4 and shifted right by 3:
//   dst[x] = ( a[2x-1] + 2*a[2x] + a[2x+1] + b[2x-1] + 2*b[2x] + b[2x+1] + 4 ) >> 3
// Source samples are widened to 32 bit by blending with zero (so they are treated as unsigned
// 16-bit values) and results are packed back to 16 bit with unsigned saturation.
//
// Code paths, selected by width:
//   - width multiple of 16 (only when USE_AVX2 is defined): 16 output samples per iteration
//   - width multiple of 8: 8 output samples per iteration
//   - otherwise: one 128-bit step per row; stores 4 samples for width==4, 2 for width==2
//     and a single sample for any other width
// After each output row pDst0 advances by iDstStride and pRecSrc0 by 2*iRecStride.
// Every path reads the sample at column -1, so a readable sample left of pRecSrc0 is required.
// NOTE(review): the name suggests this prepares down-sampled luma for 4:2:0 chroma
// prediction - confirm against the callers.
void vvdec::GetLumaRecPixel420SIMD<(vvdec::x86_simd::X86_VEXT)4>(int, int, short const*, long, short*, long)
Line
Count
Source
890
10.7k
{
891
10.7k
#ifdef USE_AVX2
892
10.7k
  if( ( width & 15 ) == 0 )    // width is a multiple of 16
893
//  if( 0 )    // width>=16
894
5.35k
  {
895
5.35k
    __m256i vzero = _mm256_set1_epi8(0);
896
5.35k
    __m256i vfour = _mm256_set1_epi32(4);   // rounding offset applied before the >>3
897
94.2k
    for( int y = 0; y < height; y++ )
898
88.9k
    {
899
236k
      for( int x = 0; x < width; x += 16 )
900
147k
      {
901
147k
        int x2=x<<1;   // source column belonging to output column x
902
147k
        __m256i vsrc_l = _mm256_loadu_si256((__m256i*)&pRecSrc0[x2-1]);      // samples -1 0 1 ... 14
903
147k
        __m256i vsrc = _mm256_loadu_si256((__m256i*)&pRecSrc0[x2]);          // samples 0 1 2 ... 15
904
905
147k
        __m256i vsrc01 = _mm256_blend_epi16(vzero,vsrc_l,0x55);      // -1 1 3 5 ... 13 as 32 bit (left neighbours)
906
147k
        __m256i vsrc0 = _mm256_blend_epi16(vzero,vsrc,0x55);      // 0 2 4 6 ... 14 as 32 bit (centre samples)
907
147k
        __m256i vsrc10 = _mm256_blend_epi16(vzero,vsrc,0xAA);      // 1 3 5 7 ... 15, still in the upper 16 bit
908
147k
        vsrc10 = _mm256_srli_epi32(vsrc10,16);      // 1 3 5 7 ... 15 as 32 bit (right neighbours)
909
147k
        vsrc0 =  _mm256_slli_epi32 (vsrc0,1);      // centre samples *2
910
911
147k
        vsrc0 =  _mm256_add_epi32(vsrc0,vsrc10);
912
147k
        __m256i vdst0 = _mm256_add_epi32(vsrc0,vsrc01);   // dst 0..7 as 32 bit, lower row still missing
913
914
147k
        vsrc_l = _mm256_loadu_si256((__m256i*)&pRecSrc0[x2 +15]);      // samples 15 16 ... 30
915
147k
        vsrc = _mm256_loadu_si256((__m256i*)&pRecSrc0[x2 + 16 ]);          // samples 16 17 ... 31
916
917
147k
        x2+= (int)iRecStride;   // from here on x2 addresses the lower source row
918
919
147k
        vsrc01 = _mm256_blend_epi16(vzero,vsrc_l,0x55);
920
147k
        vsrc0 = _mm256_blend_epi16(vzero,vsrc,0x55);
921
147k
        vsrc10 = _mm256_blend_epi16(vzero,vsrc,0xAA);
922
147k
        vsrc10 = _mm256_srli_epi32(vsrc10,16);      // right neighbours as 32 bit
923
147k
        vsrc0 =  _mm256_slli_epi32 (vsrc0,1);      // centre samples *2
924
925
147k
        vsrc0 =  _mm256_add_epi32(vsrc0,vsrc10);
926
147k
        __m256i vdst1 = _mm256_add_epi32(vsrc0,vsrc01);   // dst 8..15 as 32 bit, lower row still missing
927
928
        // now add the next (lower) source row
929
147k
        vsrc_l = _mm256_loadu_si256((__m256i*)&pRecSrc0[x2-1]);      // samples -1 0 1 ... 14
930
147k
        vsrc = _mm256_loadu_si256((__m256i*)&pRecSrc0[x2]);          // samples 0 1 2 ... 15
931
932
147k
        vsrc01 = _mm256_blend_epi16(vzero,vsrc_l,0x55);      // left neighbours as 32 bit
933
147k
        vsrc0 = _mm256_blend_epi16(vzero,vsrc,0x55);      // centre samples as 32 bit
934
147k
        vsrc10 = _mm256_blend_epi16(vzero,vsrc,0xAA);      // right neighbours, still in the upper 16 bit
935
147k
        vsrc10 = _mm256_srli_epi32(vsrc10,16);      // right neighbours as 32 bit
936
147k
        vsrc0 =  _mm256_slli_epi32 (vsrc0,1);      // centre samples *2
937
938
147k
        vsrc0 =  _mm256_add_epi32(vsrc0,vsrc10);
939
147k
        __m256i vdst01 = _mm256_add_epi32(vsrc0,vsrc01);   // dst 0..7 as 32 bit, lower row
940
941
147k
        vsrc_l = _mm256_loadu_si256((__m256i*)&pRecSrc0[x2 + 15]);      // samples 15 16 ... 30
942
147k
        vsrc = _mm256_loadu_si256((__m256i*)&pRecSrc0[x2 + 16 ]);          // samples 16 17 ... 31
943
944
147k
        vsrc01 = _mm256_blend_epi16(vzero,vsrc_l,0x55);
945
147k
        vsrc0 = _mm256_blend_epi16(vzero,vsrc,0x55);
946
147k
        vsrc10 = _mm256_blend_epi16(vzero,vsrc,0xAA);
947
147k
        vsrc10 = _mm256_srli_epi32(vsrc10,16);      // right neighbours as 32 bit
948
147k
        vsrc0 =  _mm256_slli_epi32 (vsrc0,1);      // centre samples *2
949
950
147k
        vsrc0 =  _mm256_add_epi32(vsrc0,vsrc10);
951
147k
        __m256i vdst11 = _mm256_add_epi32(vsrc0,vsrc01);   // dst 8..15 as 32 bit, lower row
952
953
147k
        vdst0 = _mm256_add_epi32(vdst0,vdst01);
954
147k
        vdst1 = _mm256_add_epi32(vdst1,vdst11);
955
147k
        vdst0 =  _mm256_add_epi32(vdst0,vfour);
956
147k
        vdst1 =  _mm256_add_epi32(vdst1,vfour);
957
147k
        vdst0 = _mm256_srli_epi32(vdst0,3);
958
147k
        vdst1 = _mm256_srli_epi32(vdst1,3);
959
147k
        vdst0 = _mm256_packus_epi32 (vdst0,vdst1);   // 16 bit; packing works per 128-bit lane, so the 64-bit groups are interleaved
960
147k
        vdst0 = _mm256_permute4x64_epi64(vdst0,0xd8);   // reorder the 64-bit groups 0,2,1,3 to restore sample order
961
962
147k
        _mm256_storeu_si256((__m256i*)&pDst0[x], vdst0);
963
        //        _mm_storeu_si128((__m128i*)&pDstTmp[x], vdst0);
964
147k
      }
965
88.9k
      pDst0 += iDstStride;
966
88.9k
      pRecSrc0 += (iRecStride<<1);   // two source rows are consumed per output row
967
88.9k
    }
968
5.35k
  }
969
5.36k
  else
970
5.36k
#endif
971
5.36k
    if( ( width & 7 ) == 0 )    // width is a multiple of 8
972
3.43k
    {
973
3.43k
      __m128i vzero = _mm_set1_epi8(0);
974
3.43k
      __m128i vfour = _mm_set1_epi32(4);   // rounding offset applied before the >>3
975
976
977
31.4k
      for( int y = 0; y < height; y++ )
978
28.0k
      {
979
980
56.0k
        for( int x = 0; x < width; x += 8 )
981
28.0k
        {
982
28.0k
          int x2=x<<1;   // source column belonging to output column x
983
28.0k
          __m128i vsrc_l = _mm_loadu_si128((__m128i*)&pRecSrc0[x2-1]);      // -1 0 1 2 3 4 5 6
984
28.0k
          __m128i vsrc = _mm_loadu_si128((__m128i*)&pRecSrc0[x2]);          // 0 1 2 3 4 5 6 7
985
986
28.0k
          __m128i vsrc01 = _mm_blend_epi16(vzero,vsrc_l,0x55);      // -1 1 3 5  32 bit
987
28.0k
          __m128i vsrc0 = _mm_blend_epi16(vzero,vsrc,0x55);      // 0 2 4 6  32 bit
988
28.0k
          __m128i vsrc10 = _mm_blend_epi16(vzero,vsrc,0xAA);      // 1 3 5 7, still in the upper 16 bit
989
28.0k
          vsrc10 = _mm_srli_epi32(vsrc10,16);      // 1 3 5 7 32 bit
990
28.0k
          vsrc0 =  _mm_slli_epi32 (vsrc0,1);      // 0  2 4 6 *2
991
992
28.0k
          vsrc0 =  _mm_add_epi32(vsrc0,vsrc10);
993
28.0k
          __m128i vdst0 = _mm_add_epi32(vsrc0,vsrc01);   // dst 0 1 2 3 32 bit, lower row still missing
994
995
28.0k
          vsrc_l = _mm_loadu_si128((__m128i*)&pRecSrc0[x2 +7]);      // 7 8 9 10 11 12 13 14
996
28.0k
          vsrc = _mm_loadu_si128((__m128i*)&pRecSrc0[x2 + 8 ]);          // 8 9 10 11 12 13 14 15
997
998
28.0k
          x2+=(int)iRecStride;   // from here on x2 addresses the lower source row
999
1000
28.0k
          vsrc01 = _mm_blend_epi16(vzero,vsrc_l,0x55);
1001
28.0k
          vsrc0 = _mm_blend_epi16(vzero,vsrc,0x55);
1002
28.0k
          vsrc10 = _mm_blend_epi16(vzero,vsrc,0xAA);
1003
28.0k
          vsrc10 = _mm_srli_epi32(vsrc10,16);      // 9 11 13 15 32 bit
1004
28.0k
          vsrc0 =  _mm_slli_epi32 (vsrc0,1);      // 8 10 12 14 *2
1005
1006
28.0k
          vsrc0 =  _mm_add_epi32(vsrc0,vsrc10);
1007
28.0k
          __m128i vdst1 = _mm_add_epi32(vsrc0,vsrc01);   // dst 4 5 6 7 32 bit, lower row still missing
1008
1009
          // now add the next (lower) source row
1010
28.0k
          vsrc_l = _mm_loadu_si128((__m128i*)&pRecSrc0[x2-1]);      // -1 0 1 2 3 4 5 6
1011
28.0k
          vsrc = _mm_loadu_si128((__m128i*)&pRecSrc0[x2]);          // 0 1 2 3 4 5 6 7
1012
1013
28.0k
          vsrc01 = _mm_blend_epi16(vzero,vsrc_l,0x55);      // -1 1 3 5  32 bit
1014
28.0k
          vsrc0 = _mm_blend_epi16(vzero,vsrc,0x55);      // 0 2 4 6  32 bit
1015
28.0k
          vsrc10 = _mm_blend_epi16(vzero,vsrc,0xAA);      // 1 3 5 7, still in the upper 16 bit
1016
28.0k
          vsrc10 = _mm_srli_epi32(vsrc10,16);      // 1 3 5 7 32 bit
1017
28.0k
          vsrc0 =  _mm_slli_epi32 (vsrc0,1);      // 0  2 4 6 *2
1018
1019
28.0k
          vsrc0 =  _mm_add_epi32(vsrc0,vsrc10);
1020
28.0k
          __m128i vdst01 = _mm_add_epi32(vsrc0,vsrc01);   // dst 0 1 2 3 32 bit, lower row
1021
1022
28.0k
          vsrc_l = _mm_loadu_si128((__m128i*)&pRecSrc0[x2 + 7]);      // 7 8 9 10 11 12 13 14
1023
28.0k
          vsrc = _mm_loadu_si128((__m128i*)&pRecSrc0[x2 + 8 ]);          // 8 9 10 11 12 13 14 15
1024
1025
28.0k
          vsrc01 = _mm_blend_epi16(vzero,vsrc_l,0x55);
1026
28.0k
          vsrc0 = _mm_blend_epi16(vzero,vsrc,0x55);
1027
28.0k
          vsrc10 = _mm_blend_epi16(vzero,vsrc,0xAA);
1028
28.0k
          vsrc10 = _mm_srli_epi32(vsrc10,16);      // 9 11 13 15 32 bit
1029
28.0k
          vsrc0 =  _mm_slli_epi32 (vsrc0,1);      // 8 10 12 14 *2
1030
1031
28.0k
          vsrc0 =  _mm_add_epi32(vsrc0,vsrc10);
1032
28.0k
          __m128i vdst11 = _mm_add_epi32(vsrc0,vsrc01);   // dst 4 5 6 7 32 bit, lower row
1033
1034
28.0k
          vdst0 = _mm_add_epi32(vdst0,vdst01);
1035
28.0k
          vdst1 = _mm_add_epi32(vdst1,vdst11);
1036
28.0k
          vdst0 =  _mm_add_epi32(vdst0,vfour);
1037
28.0k
          vdst1 =  _mm_add_epi32(vdst1,vfour);
1038
28.0k
          vdst0 = _mm_srli_epi32(vdst0,3);
1039
28.0k
          vdst1 = _mm_srli_epi32(vdst1,3);
1040
28.0k
          vdst0 = _mm_packus_epi32 (vdst0,vdst1);   // 16 bit
1041
1042
28.0k
          _mm_storeu_si128((__m128i*)&pDst0[x], vdst0);
1043
          //        _mm_storeu_si128((__m128i*)&pDstTmp[x], vdst0);
1044
28.0k
        }
1045
28.0k
        pDst0 += iDstStride;
1046
28.0k
        pRecSrc0 += (iRecStride<<1);   // two source rows are consumed per output row
1047
28.0k
      }
1048
3.43k
    }
1049
1.92k
    else     // width<=4 (any width that is not a multiple of 8 ends up here)
1050
1.92k
    {
1051
1.92k
      __m128i vzero = _mm_set1_epi8(0);
1052
1.92k
      __m128i vfour = _mm_set1_epi32(4);   // rounding offset applied before the >>3
1053
1054
17.8k
      for( int y = 0; y < height; y++ )
1055
15.9k
      {
        // The 128-bit loads always read eight source samples per row, independent of width.
1056
15.9k
        __m128i vsrc_l = _mm_loadu_si128((__m128i*)&pRecSrc0[-1]);      // -1 0 1 2 3 4 5 6
1057
15.9k
        __m128i vsrc = _mm_loadu_si128((__m128i*)&pRecSrc0[0]);          // 0 1 2 3 4 5 6 7
1058
1059
15.9k
        __m128i vsrc01 = _mm_blend_epi16(vzero,vsrc_l,0x55);      // -1 1 3 5  32 bit
1060
15.9k
        __m128i vsrc0 = _mm_blend_epi16(vzero,vsrc,0x55);      // 0 2 4 6  32 bit
1061
15.9k
        __m128i vsrc10 = _mm_blend_epi16(vzero,vsrc,0xAA);      // 1 3 5 7, still in the upper 16 bit
1062
15.9k
        vsrc10 = _mm_srli_epi32(vsrc10,16);      // 1 3 5 7 32 bit
1063
15.9k
        vsrc0 =  _mm_slli_epi32 (vsrc0,1);      // 0  2 4 6 *2
1064
1065
15.9k
        vsrc0 =  _mm_add_epi32(vsrc0,vsrc10);
1066
15.9k
        __m128i vdst0 = _mm_add_epi32(vsrc0,vsrc01);   // dst 0 1 2 3 32 bit, lower row still missing
1067
1068
        // now add the next (lower) source row
1069
15.9k
        vsrc_l = _mm_loadu_si128((__m128i*)&pRecSrc0[iRecStride-1]);      // -1 0 1 2 3 4 5 6
1070
15.9k
        vsrc = _mm_loadu_si128((__m128i*)&pRecSrc0[iRecStride]);          // 0 1 2 3 4 5 6 7
1071
1072
15.9k
        vsrc01 = _mm_blend_epi16(vzero,vsrc_l,0x55);      // -1 1 3 5  32 bit
1073
15.9k
        vsrc0 = _mm_blend_epi16(vzero,vsrc,0x55);      // 0 2 4 6  32 bit
1074
15.9k
        vsrc10 = _mm_blend_epi16(vzero,vsrc,0xAA);      // 1 3 5 7, still in the upper 16 bit
1075
15.9k
        vsrc10 = _mm_srli_epi32(vsrc10,16);      // 1 3 5 7 32 bit
1076
15.9k
        vsrc0 =  _mm_slli_epi32 (vsrc0,1);      // 0  2 4 6 *2
1077
1078
15.9k
        vsrc0 =  _mm_add_epi32(vsrc0,vsrc10);
1079
15.9k
        __m128i vdst01 = _mm_add_epi32(vsrc0,vsrc01);   // dst 0 1 2 3 32 bit, lower row
1080
1081
1082
15.9k
        vdst0 = _mm_add_epi32(vdst0,vdst01);
1083
15.9k
        vdst0 =  _mm_add_epi32(vdst0,vfour);
1084
15.9k
        vdst0 = _mm_srli_epi32(vdst0,3);
1085
15.9k
        vdst0 = _mm_packus_epi32 (vdst0,vdst0);   // 16 bit
1086
1087
15.9k
        if (width==4)
1088
15.9k
          _mm_storeu_si64(( __m128i * )&pDst0[0], (vdst0) );   // writes 4 samples
1089
0
        else if (width==2)
1090
0
          _mm_storeu_si32(( __m128i * )&pDst0[0], (vdst0) );   // writes 2 samples
1091
0
        else
1092
0
        {
          // any other width: only the first sample is written
1093
0
          int tmp = _mm_cvtsi128_si32(vdst0);
1094
0
          pDst0[0] = (Pel) tmp;
1095
0
        }
1096
1097
15.9k
        pDst0 += iDstStride;
1098
15.9k
        pRecSrc0 += (iRecStride<<1);   // two source rows are consumed per output row
1099
15.9k
      }
1100
1.92k
    }
1101
10.7k
}
1102
1103
1104
1105
// Selects the x86 SIMD implementations for intra prediction.
// Each assignment binds one dispatch slot (presumably a function-pointer member of
// IntraPrediction - confirm in the class declaration) to the routine specialised for the
// SIMD level `vext`; where a second template argument is given (4, 8, 16) it is passed
// through to the routine unchanged.
template<X86_VEXT vext>
1106
void IntraPrediction::_initIntraPredictionX86()
1107
22.8k
{
1108
22.8k
  IntraPredAngleCore4 = IntraPredAngleCore_SIMD<vext, 4>;
1109
22.8k
  IntraPredAngleCore8 = IntraPredAngleCore_SIMD<vext, 8>;
1110
22.8k
  IntraPredAngleChroma4 = IntraPredAngleChroma_SIMD<vext, 4>;
1111
22.8k
  IntraPredAngleChroma8 = IntraPredAngleChroma_SIMD<vext, 8>;
1112
1113
22.8k
  IntraPredSampleFilter8 = IntraPredSampleFilter_SIMD<vext, 8>;
1114
22.8k
  IntraPredSampleFilter16 = IntraPredSampleFilter_SIMD<vext, 16>;
1115
1116
22.8k
  xPredIntraPlanar = xPredIntraPlanar_SIMD<vext>;
1117
1118
22.8k
  GetLumaRecPixel420 = GetLumaRecPixel420SIMD<vext>;
1119
1120
22.8k
}
Unexecuted instantiation: void vvdec::IntraPrediction::_initIntraPredictionX86<(vvdec::x86_simd::X86_VEXT)1>()
// Per-instantiation listing of _initIntraPredictionX86 for X86_VEXT value 4: every dispatch
// slot below is bound to the SIMD routine specialised for that `vext`.
void vvdec::IntraPrediction::_initIntraPredictionX86<(vvdec::x86_simd::X86_VEXT)4>()
Line
Count
Source
1107
22.8k
{
1108
22.8k
  IntraPredAngleCore4 = IntraPredAngleCore_SIMD<vext, 4>;
1109
22.8k
  IntraPredAngleCore8 = IntraPredAngleCore_SIMD<vext, 8>;
1110
22.8k
  IntraPredAngleChroma4 = IntraPredAngleChroma_SIMD<vext, 4>;
1111
22.8k
  IntraPredAngleChroma8 = IntraPredAngleChroma_SIMD<vext, 8>;
1112
1113
22.8k
  IntraPredSampleFilter8 = IntraPredSampleFilter_SIMD<vext, 8>;
1114
22.8k
  IntraPredSampleFilter16 = IntraPredSampleFilter_SIMD<vext, 16>;
1115
1116
22.8k
  xPredIntraPlanar = xPredIntraPlanar_SIMD<vext>;
1117
1118
22.8k
  GetLumaRecPixel420 = GetLumaRecPixel420SIMD<vext>;
1119
1120
22.8k
}
1121
// Explicit instantiation for the SIMD level named by SIMDX86, so the definition above is
// emitted in this translation unit (SIMDX86 is presumably set by the build per SIMD
// variant - confirm).
template void IntraPrediction::_initIntraPredictionX86<SIMDX86>();
1122
1123
#endif // TARGET_SIMD_X86
1124
#endif
1125
1126
}