/src/vvdec/source/Lib/CommonLib/x86/IntraPredX86.h
Line | Count | Source |
1 | | /* ----------------------------------------------------------------------------- |
2 | | The copyright in this software is being made available under the Clear BSD |
3 | | License, included below. No patent rights, trademark rights and/or |
4 | | other Intellectual Property Rights other than the copyrights concerning |
5 | | the Software are granted under this license. |
6 | | |
7 | | The Clear BSD License |
8 | | |
9 | | Copyright (c) 2018-2026, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVdeC Authors. |
10 | | All rights reserved. |
11 | | |
12 | | Redistribution and use in source and binary forms, with or without modification, |
13 | | are permitted (subject to the limitations in the disclaimer below) provided that |
14 | | the following conditions are met: |
15 | | |
16 | | * Redistributions of source code must retain the above copyright notice, |
17 | | this list of conditions and the following disclaimer. |
18 | | |
19 | | * Redistributions in binary form must reproduce the above copyright |
20 | | notice, this list of conditions and the following disclaimer in the |
21 | | documentation and/or other materials provided with the distribution. |
22 | | |
23 | | * Neither the name of the copyright holder nor the names of its |
24 | | contributors may be used to endorse or promote products derived from this |
25 | | software without specific prior written permission. |
26 | | |
27 | | NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY |
28 | | THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND |
29 | | CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
30 | | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A |
31 | | PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR |
32 | | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
33 | | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
34 | | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR |
35 | | BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER |
36 | | IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
37 | | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
38 | | POSSIBILITY OF SUCH DAMAGE. |
39 | | |
40 | | |
41 | | ------------------------------------------------------------------------------------------- */ |
42 | | |
43 | | /** \file IntraPredX86.h |
44 | | \brief SIMD for IntraPrediction |
45 | | */ |
46 | | |
47 | | #include "CommonLib/CommonDef.h" |
48 | | #include "CommonDefX86.h" |
49 | | #include "CommonLib/IntraPrediction.h" |
50 | | |
51 | | namespace vvdec |
52 | | { |
53 | | |
54 | | #if ENABLE_SIMD_OPT_INTRAPRED |
55 | | #ifdef TARGET_SIMD_X86 |
56 | | |
57 | | //#define USE_AVX2 |
// Row-by-row two-tap linear interpolation of the 16-bit reference row
// `pBorder` into the destination block `pDst` (SIMD implementation).
//
// For each of the `height` rows:
//   deltaInt   = deltaPos >> 5
//   deltaFract = deltaPos & 31
//   pDst[x]    = ( (32 - deltaFract) * pBorder[x + deltaInt + 1]
//                +       deltaFract  * pBorder[x + deltaInt + 2] + 16 ) >> 5
// evaluated in 16-bit lanes (low 16 bits of each product, logical right shift).
// After every row pDst advances by dstStride elements and deltaPos by
// intraPredAngle.
//
// vext - SIMD level; W == 8 with vext >= AVX2 takes the USE_AVX2-guarded code.
// W    - 8: `width` is walked in steps of 8 (16 on the 256-bit path);
//        4: one 4-sample store per row, `width` is not read;
//        any other value ends in THROW_FATAL.
// NOTE(review): vectors are always loaded and stored whole, so `width` is
// presumably a multiple of the step and pBorder readable beyond the last
// sample that is actually used - confirm against the caller.
58 | | template< X86_VEXT vext, int W > |
59 | | void IntraPredAngleChroma_SIMD(int16_t* pDst,const ptrdiff_t dstStride,int16_t* pBorder,int width,int height,int deltaPos,int intraPredAngle) |
60 | 798 | {
61 | 798 | int deltaInt;
62 | 798 | int deltaFract;
63 | 798 | int refMainIndex;
64 | | |
// Rounding term added before the >> 5 in the 128-bit paths.
65 | 798 | __m128i voffset = _mm_set1_epi16(16);
66 | 798 | if( W == 8 )
67 | 574 | {
// The whole body of this branch sits inside #ifdef USE_AVX2: when the macro is
// not defined the branch is empty and nothing is written to pDst.
68 | 574 | if( vext >= AVX2 )
69 | 574 | {
70 | | #ifdef USE_AVX2 |
// Width is a multiple of 16: 256-bit path, 16 samples per inner iteration.
71 | 574 | if (( width & 15 ) == 0 )
72 | 464 | {
// NOTE(review): these locals (and the 256-bit voffset below) shadow the
// function-scope variables of the same name.
73 | 464 | int deltaInt;
74 | 464 | int deltaFract;
75 | 464 | int refMainIndex;
76 | | |
77 | | __m256i voffset = _mm256_set1_epi16(16); |
78 | 7.13k | for (int k=0; k<height; k++) {
79 | | |
80 | 6.66k | deltaInt = deltaPos >> 5;
81 | 6.66k | deltaFract = deltaPos & (32 - 1);
82 | | |
83 | 6.66k | __m256i vfract = _mm256_set1_epi16(deltaFract);
84 | 6.66k | __m256i v32minfract = _mm256_set1_epi16(32-deltaFract);
85 | | // Do linear filtering |
86 | 16.6k | for (int l=0; l<width; l+=16) {
87 | 9.93k | refMainIndex = l+ deltaInt+1;
// vpred0 / vpred1 hold the reference samples at refMainIndex and refMainIndex+1.
88 | 9.93k | __m256i vpred0 = _mm256_lddqu_si256((__m256i*)&pBorder[refMainIndex]);
89 | 9.93k | __m256i vpred1 = _mm256_lddqu_si256((__m256i*)&pBorder[refMainIndex+1]);
90 | 9.93k | vpred0 = _mm256_mullo_epi16(v32minfract, vpred0);
91 | 9.93k | vpred1 = _mm256_mullo_epi16(vfract, vpred1);
92 | 9.93k | __m256i vpred = _mm256_srli_epi16(_mm256_add_epi16(_mm256_add_epi16(vpred0, vpred1), voffset), 5);
93 | 9.93k | _mm256_storeu_si256((__m256i*)&pDst[l], vpred);
94 | 9.93k | }
95 | 6.66k | pDst+=dstStride;
96 | 6.66k | deltaPos += intraPredAngle;
97 | 6.66k | }
98 | 464 | }
// Remaining widths: same arithmetic with 128-bit vectors, 8 samples per step.
99 | 110 | else // width==8
100 | 110 | {
101 | 1.28k | for (int k=0; k<height; k++)
102 | 1.17k | {
103 | 1.17k | deltaInt = deltaPos >> 5;
104 | 1.17k | deltaFract = deltaPos & (32 - 1);
105 | | |
106 | 1.17k | __m128i vfract = _mm_set1_epi16(deltaFract);
107 | 1.17k | __m128i v32minfract = _mm_set1_epi16(32-deltaFract);
108 | | // Do linear filtering |
109 | 2.35k | for (int l=0; l<width; l+=8) {
110 | 1.17k | refMainIndex = l+ deltaInt+1;
111 | 1.17k | __m128i vpred0 = _mm_lddqu_si128((__m128i*)&pBorder[refMainIndex]);
112 | 1.17k | __m128i vpred1 = _mm_lddqu_si128((__m128i*)&pBorder[refMainIndex+1]);
113 | 1.17k | vpred0 = _mm_mullo_epi16(v32minfract, vpred0);
114 | 1.17k | vpred1 = _mm_mullo_epi16(vfract, vpred1);
115 | 1.17k | __m128i vpred = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(vpred0, vpred1), voffset), 5);
116 | 1.17k | _mm_storeu_si128((__m128i*)&pDst[l], vpred);
117 | 1.17k | }
118 | 1.17k | deltaPos += intraPredAngle;
119 | | |
120 | 1.17k | pDst+=dstStride;
121 | 1.17k | }
122 | | |
123 | 110 | }
124 | | #endif //AVX2 |
125 | 574 | }
// vext below AVX2: 128-bit path, identical arithmetic, plain unaligned loads.
126 | 0 | else
127 | 0 | {
128 | 0 | for (int k=0; k<height; k++) {
129 | 0 | deltaInt = deltaPos >> 5;
130 | 0 | deltaFract = deltaPos & (32 - 1);
131 |

132 | 0 | __m128i vfract = _mm_set1_epi16(deltaFract);
133 | 0 | __m128i v32minfract = _mm_set1_epi16(32-deltaFract);
134 | | // Do linear filtering |
135 | 0 | for (int l=0; l<width; l+=8) {
136 | 0 | refMainIndex = l+ deltaInt+1;
137 | 0 | __m128i vpred0 = _mm_loadu_si128((__m128i*)&pBorder[refMainIndex]);
138 | 0 | __m128i vpred1 = _mm_loadu_si128((__m128i*)&pBorder[refMainIndex+1]);
139 | 0 | vpred0 = _mm_mullo_epi16(v32minfract, vpred0);
140 | 0 | vpred1 = _mm_mullo_epi16(vfract, vpred1);
141 | 0 | __m128i vpred = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(vpred0, vpred1), voffset), 5);
142 | 0 | _mm_storeu_si128((__m128i*)&pDst[l], vpred);
143 | 0 | }
144 | 0 | deltaPos += intraPredAngle;
145 |

146 | 0 | pDst+=dstStride;
147 | 0 | }
148 | 0 | }
149 | | |
150 | 574 | }
// W == 4: one vector per row. Both loads fetch 8 samples (16 bytes) but only
// the low 64 bits (4 results) are stored.
151 | 224 | else if( W == 4 )
152 | 224 | {
153 | 1.64k | for (int k=0; k<height; k++) {
154 | 1.42k | deltaInt = deltaPos >> 5;
155 | 1.42k | deltaFract = deltaPos & (32 - 1);
156 | | |
157 | 1.42k | __m128i vfract = _mm_set1_epi16(deltaFract);
158 | 1.42k | __m128i v32minfract = _mm_set1_epi16(32-deltaFract);
159 | | // Do linear filtering |
160 | 1.42k | refMainIndex = deltaInt+1;
161 | 1.42k | __m128i vpred0 = _mm_loadu_si128((__m128i*)&pBorder[refMainIndex]);
162 | 1.42k | __m128i vpred1 = _mm_loadu_si128((__m128i*)&pBorder[refMainIndex+1]);
163 | 1.42k | vpred0 = _mm_mullo_epi16(v32minfract, vpred0);
164 | 1.42k | vpred1 = _mm_mullo_epi16(vfract, vpred1);
165 | 1.42k | __m128i vpred = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(vpred0, vpred1), voffset), 5);
166 | 1.42k | _mm_storeu_si64( ( __m128i * )(pDst ), vpred);
167 | 1.42k | deltaPos += intraPredAngle;
168 | 1.42k | pDst+=dstStride;
169 | 1.42k | }
170 | 224 | }
171 | 0 | else
172 | 0 | {
// NOTE(review): the message names IntraPredAngleCore_SIMD although this
// function is IntraPredAngleChroma_SIMD.
173 | 0 | THROW_FATAL( "Unsupported size in IntraPredAngleCore_SIMD" );
174 | 0 | }
// Tested with #if here (not #ifdef as above), so USE_AVX2 must expand to a
// non-zero value for the zeroupper call to be compiled in.
175 | | #if USE_AVX2 |
176 | | |
177 | 798 | _mm256_zeroupper();
178 | 798 | #endif
179 | 798 | } Unexecuted instantiation: void vvdec::IntraPredAngleChroma_SIMD<(vvdec::x86_simd::X86_VEXT)1, 4>(short*, long, short*, int, int, int, int) Unexecuted instantiation: void vvdec::IntraPredAngleChroma_SIMD<(vvdec::x86_simd::X86_VEXT)1, 8>(short*, long, short*, int, int, int, int) void vvdec::IntraPredAngleChroma_SIMD<(vvdec::x86_simd::X86_VEXT)4, 4>(short*, long, short*, int, int, int, int) Line | Count | Source | 60 | 224 | { | 61 | 224 | int deltaInt; | 62 | 224 | int deltaFract; | 63 | 224 | int refMainIndex; | 64 | | | 65 | 224 | __m128i voffset = _mm_set1_epi16(16); | 66 | 224 | if( W == 8 ) | 67 | 0 | { | 68 | 0 | if( vext >= AVX2 ) | 69 | 0 | { | 70 | 0 | #ifdef USE_AVX2 | 71 | 0 | if (( width & 15 ) == 0 ) | 72 | 0 | { | 73 | 0 | int deltaInt; | 74 | 0 | int deltaFract; | 75 | 0 | int refMainIndex; | 76 | |
| 77 | 0 | __m256i voffset = _mm256_set1_epi16(16); | 78 | 0 | for (int k=0; k<height; k++) { | 79 | |
| 80 | 0 | deltaInt = deltaPos >> 5; | 81 | 0 | deltaFract = deltaPos & (32 - 1); | 82 | |
| 83 | 0 | __m256i vfract = _mm256_set1_epi16(deltaFract); | 84 | 0 | __m256i v32minfract = _mm256_set1_epi16(32-deltaFract); | 85 | | // Do linear filtering | 86 | 0 | for (int l=0; l<width; l+=16) { | 87 | 0 | refMainIndex = l+ deltaInt+1; | 88 | 0 | __m256i vpred0 = _mm256_lddqu_si256((__m256i*)&pBorder[refMainIndex]); | 89 | 0 | __m256i vpred1 = _mm256_lddqu_si256((__m256i*)&pBorder[refMainIndex+1]); | 90 | 0 | vpred0 = _mm256_mullo_epi16(v32minfract, vpred0); | 91 | 0 | vpred1 = _mm256_mullo_epi16(vfract, vpred1); | 92 | 0 | __m256i vpred = _mm256_srli_epi16(_mm256_add_epi16(_mm256_add_epi16(vpred0, vpred1), voffset), 5); | 93 | 0 | _mm256_storeu_si256((__m256i*)&pDst[l], vpred); | 94 | 0 | } | 95 | 0 | pDst+=dstStride; | 96 | 0 | deltaPos += intraPredAngle; | 97 | 0 | } | 98 | 0 | } | 99 | 0 | else // width==8 | 100 | 0 | { | 101 | 0 | for (int k=0; k<height; k++) | 102 | 0 | { | 103 | 0 | deltaInt = deltaPos >> 5; | 104 | 0 | deltaFract = deltaPos & (32 - 1); | 105 | |
| 106 | 0 | __m128i vfract = _mm_set1_epi16(deltaFract); | 107 | 0 | __m128i v32minfract = _mm_set1_epi16(32-deltaFract); | 108 | | // Do linear filtering | 109 | 0 | for (int l=0; l<width; l+=8) { | 110 | 0 | refMainIndex = l+ deltaInt+1; | 111 | 0 | __m128i vpred0 = _mm_lddqu_si128((__m128i*)&pBorder[refMainIndex]); | 112 | 0 | __m128i vpred1 = _mm_lddqu_si128((__m128i*)&pBorder[refMainIndex+1]); | 113 | 0 | vpred0 = _mm_mullo_epi16(v32minfract, vpred0); | 114 | 0 | vpred1 = _mm_mullo_epi16(vfract, vpred1); | 115 | 0 | __m128i vpred = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(vpred0, vpred1), voffset), 5); | 116 | 0 | _mm_storeu_si128((__m128i*)&pDst[l], vpred); | 117 | 0 | } | 118 | 0 | deltaPos += intraPredAngle; | 119 | |
| 120 | 0 | pDst+=dstStride; | 121 | 0 | } | 122 | |
| 123 | 0 | } | 124 | 0 | #endif //AVX2 | 125 | 0 | } | 126 | 0 | else | 127 | 0 | { | 128 | 0 | for (int k=0; k<height; k++) { | 129 | 0 | deltaInt = deltaPos >> 5; | 130 | 0 | deltaFract = deltaPos & (32 - 1); | 131 | |
| 132 | 0 | __m128i vfract = _mm_set1_epi16(deltaFract); | 133 | 0 | __m128i v32minfract = _mm_set1_epi16(32-deltaFract); | 134 | | // Do linear filtering | 135 | 0 | for (int l=0; l<width; l+=8) { | 136 | 0 | refMainIndex = l+ deltaInt+1; | 137 | 0 | __m128i vpred0 = _mm_loadu_si128((__m128i*)&pBorder[refMainIndex]); | 138 | 0 | __m128i vpred1 = _mm_loadu_si128((__m128i*)&pBorder[refMainIndex+1]); | 139 | 0 | vpred0 = _mm_mullo_epi16(v32minfract, vpred0); | 140 | 0 | vpred1 = _mm_mullo_epi16(vfract, vpred1); | 141 | 0 | __m128i vpred = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(vpred0, vpred1), voffset), 5); | 142 | 0 | _mm_storeu_si128((__m128i*)&pDst[l], vpred); | 143 | 0 | } | 144 | 0 | deltaPos += intraPredAngle; | 145 | |
| 146 | 0 | pDst+=dstStride; | 147 | 0 | } | 148 | 0 | } | 149 | |
| 150 | 0 | } | 151 | 224 | else if( W == 4 ) | 152 | 224 | { | 153 | 1.64k | for (int k=0; k<height; k++) { | 154 | 1.42k | deltaInt = deltaPos >> 5; | 155 | 1.42k | deltaFract = deltaPos & (32 - 1); | 156 | | | 157 | 1.42k | __m128i vfract = _mm_set1_epi16(deltaFract); | 158 | 1.42k | __m128i v32minfract = _mm_set1_epi16(32-deltaFract); | 159 | | // Do linear filtering | 160 | 1.42k | refMainIndex = deltaInt+1; | 161 | 1.42k | __m128i vpred0 = _mm_loadu_si128((__m128i*)&pBorder[refMainIndex]); | 162 | 1.42k | __m128i vpred1 = _mm_loadu_si128((__m128i*)&pBorder[refMainIndex+1]); | 163 | 1.42k | vpred0 = _mm_mullo_epi16(v32minfract, vpred0); | 164 | 1.42k | vpred1 = _mm_mullo_epi16(vfract, vpred1); | 165 | 1.42k | __m128i vpred = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(vpred0, vpred1), voffset), 5); | 166 | 1.42k | _mm_storeu_si64( ( __m128i * )(pDst ), vpred); | 167 | 1.42k | deltaPos += intraPredAngle; | 168 | 1.42k | pDst+=dstStride; | 169 | 1.42k | } | 170 | 224 | } | 171 | 0 | else | 172 | 0 | { | 173 | 0 | THROW_FATAL( "Unsupported size in IntraPredAngleCore_SIMD" ); | 174 | 0 | } | 175 | 224 | #if USE_AVX2 | 176 | | | 177 | 224 | _mm256_zeroupper(); | 178 | 224 | #endif | 179 | 224 | } |
void vvdec::IntraPredAngleChroma_SIMD<(vvdec::x86_simd::X86_VEXT)4, 8>(short*, long, short*, int, int, int, int) Line | Count | Source | 60 | 574 | { | 61 | 574 | int deltaInt; | 62 | 574 | int deltaFract; | 63 | 574 | int refMainIndex; | 64 | | | 65 | 574 | __m128i voffset = _mm_set1_epi16(16); | 66 | 574 | if( W == 8 ) | 67 | 574 | { | 68 | 574 | if( vext >= AVX2 ) | 69 | 574 | { | 70 | 574 | #ifdef USE_AVX2 | 71 | 574 | if (( width & 15 ) == 0 ) | 72 | 464 | { | 73 | 464 | int deltaInt; | 74 | 464 | int deltaFract; | 75 | 464 | int refMainIndex; | 76 | | | 77 | 464 | __m256i voffset = _mm256_set1_epi16(16); | 78 | 7.13k | for (int k=0; k<height; k++) { | 79 | | | 80 | 6.66k | deltaInt = deltaPos >> 5; | 81 | 6.66k | deltaFract = deltaPos & (32 - 1); | 82 | | | 83 | 6.66k | __m256i vfract = _mm256_set1_epi16(deltaFract); | 84 | 6.66k | __m256i v32minfract = _mm256_set1_epi16(32-deltaFract); | 85 | | // Do linear filtering | 86 | 16.6k | for (int l=0; l<width; l+=16) { | 87 | 9.93k | refMainIndex = l+ deltaInt+1; | 88 | 9.93k | __m256i vpred0 = _mm256_lddqu_si256((__m256i*)&pBorder[refMainIndex]); | 89 | 9.93k | __m256i vpred1 = _mm256_lddqu_si256((__m256i*)&pBorder[refMainIndex+1]); | 90 | 9.93k | vpred0 = _mm256_mullo_epi16(v32minfract, vpred0); | 91 | 9.93k | vpred1 = _mm256_mullo_epi16(vfract, vpred1); | 92 | 9.93k | __m256i vpred = _mm256_srli_epi16(_mm256_add_epi16(_mm256_add_epi16(vpred0, vpred1), voffset), 5); | 93 | 9.93k | _mm256_storeu_si256((__m256i*)&pDst[l], vpred); | 94 | 9.93k | } | 95 | 6.66k | pDst+=dstStride; | 96 | 6.66k | deltaPos += intraPredAngle; | 97 | 6.66k | } | 98 | 464 | } | 99 | 110 | else // width==8 | 100 | 110 | { | 101 | 1.28k | for (int k=0; k<height; k++) | 102 | 1.17k | { | 103 | 1.17k | deltaInt = deltaPos >> 5; | 104 | 1.17k | deltaFract = deltaPos & (32 - 1); | 105 | | | 106 | 1.17k | __m128i vfract = _mm_set1_epi16(deltaFract); | 107 | 1.17k | __m128i v32minfract = _mm_set1_epi16(32-deltaFract); | 108 | | // Do linear 
filtering | 109 | 2.35k | for (int l=0; l<width; l+=8) { | 110 | 1.17k | refMainIndex = l+ deltaInt+1; | 111 | 1.17k | __m128i vpred0 = _mm_lddqu_si128((__m128i*)&pBorder[refMainIndex]); | 112 | 1.17k | __m128i vpred1 = _mm_lddqu_si128((__m128i*)&pBorder[refMainIndex+1]); | 113 | 1.17k | vpred0 = _mm_mullo_epi16(v32minfract, vpred0); | 114 | 1.17k | vpred1 = _mm_mullo_epi16(vfract, vpred1); | 115 | 1.17k | __m128i vpred = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(vpred0, vpred1), voffset), 5); | 116 | 1.17k | _mm_storeu_si128((__m128i*)&pDst[l], vpred); | 117 | 1.17k | } | 118 | 1.17k | deltaPos += intraPredAngle; | 119 | | | 120 | 1.17k | pDst+=dstStride; | 121 | 1.17k | } | 122 | | | 123 | 110 | } | 124 | 574 | #endif //AVX2 | 125 | 574 | } | 126 | 0 | else | 127 | 0 | { | 128 | 0 | for (int k=0; k<height; k++) { | 129 | 0 | deltaInt = deltaPos >> 5; | 130 | 0 | deltaFract = deltaPos & (32 - 1); | 131 | |
| 132 | 0 | __m128i vfract = _mm_set1_epi16(deltaFract); | 133 | 0 | __m128i v32minfract = _mm_set1_epi16(32-deltaFract); | 134 | | // Do linear filtering | 135 | 0 | for (int l=0; l<width; l+=8) { | 136 | 0 | refMainIndex = l+ deltaInt+1; | 137 | 0 | __m128i vpred0 = _mm_loadu_si128((__m128i*)&pBorder[refMainIndex]); | 138 | 0 | __m128i vpred1 = _mm_loadu_si128((__m128i*)&pBorder[refMainIndex+1]); | 139 | 0 | vpred0 = _mm_mullo_epi16(v32minfract, vpred0); | 140 | 0 | vpred1 = _mm_mullo_epi16(vfract, vpred1); | 141 | 0 | __m128i vpred = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(vpred0, vpred1), voffset), 5); | 142 | 0 | _mm_storeu_si128((__m128i*)&pDst[l], vpred); | 143 | 0 | } | 144 | 0 | deltaPos += intraPredAngle; | 145 | |
| 146 | 0 | pDst+=dstStride; | 147 | 0 | } | 148 | 0 | } | 149 | | | 150 | 574 | } | 151 | 0 | else if( W == 4 ) | 152 | 0 | { | 153 | 0 | for (int k=0; k<height; k++) { | 154 | 0 | deltaInt = deltaPos >> 5; | 155 | 0 | deltaFract = deltaPos & (32 - 1); | 156 | |
| 157 | 0 | __m128i vfract = _mm_set1_epi16(deltaFract); | 158 | 0 | __m128i v32minfract = _mm_set1_epi16(32-deltaFract); | 159 | | // Do linear filtering | 160 | 0 | refMainIndex = deltaInt+1; | 161 | 0 | __m128i vpred0 = _mm_loadu_si128((__m128i*)&pBorder[refMainIndex]); | 162 | 0 | __m128i vpred1 = _mm_loadu_si128((__m128i*)&pBorder[refMainIndex+1]); | 163 | 0 | vpred0 = _mm_mullo_epi16(v32minfract, vpred0); | 164 | 0 | vpred1 = _mm_mullo_epi16(vfract, vpred1); | 165 | 0 | __m128i vpred = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(vpred0, vpred1), voffset), 5); | 166 | 0 | _mm_storeu_si64( ( __m128i * )(pDst ), vpred); | 167 | 0 | deltaPos += intraPredAngle; | 168 | 0 | pDst+=dstStride; | 169 | 0 | } | 170 | 0 | } | 171 | 0 | else | 172 | 0 | { | 173 | 0 | THROW_FATAL( "Unsupported size in IntraPredAngleCore_SIMD" ); | 174 | 0 | } | 175 | 574 | #if USE_AVX2 | 176 | | | 177 | 574 | _mm256_zeroupper(); | 178 | 574 | #endif | 179 | 574 | } |
|
180 | | |
181 | | |
// Row-by-row four-tap filtering of the 16-bit reference row `refMain` into
// the destination block `pDstBuf` (SIMD implementation).
//
// For each row y in [0, height):
//   deltaInt   = deltaPos >> 5
//   deltaFract = deltaPos & 31
//   c[0..3]    = ff[4 * deltaFract + 0..3]          (four 16-bit coefficients)
//   pDstBuf[y * dstStride + x] =
//       ( c[0] * refMain[deltaInt + x]     + c[1] * refMain[deltaInt + x + 1]
//       + c[2] * refMain[deltaInt + x + 2] + c[3] * refMain[deltaInt + x + 3]
//       + 32 ) >> 6                                 (32-bit sums, arithmetic shift)
// The result is narrowed to 16 bits and, when useCubicFilter is true, clamped
// to [clpRng.min(), clpRng.max()]. deltaPos grows by intraPredAngle per row.
//
// vext - SIMD level; W == 8 with vext >= AVX2 takes the USE_AVX2-guarded code.
// W    - 8: `width` is walked in steps of 8 (16 on the 256-bit path);
//        4: one 4-sample store per row, `width` is not read;
//        any other value ends in THROW_FATAL.
// NOTE(review): every load fetches 8 samples starting at the window origin, so
// refMain is presumably padded beyond the last sample used - confirm.
182 | | template< X86_VEXT vext, int W > |
183 | | void IntraPredAngleCore_SIMD(int16_t* pDstBuf,const ptrdiff_t dstStride,int16_t* refMain,int width,int height,int deltaPos,int intraPredAngle,const TFilterCoeff *ff,const bool useCubicFilter,const ClpRng& clpRng) |
184 | 910 | {
185 | 910 | int16_t* pDst;
186 | | |
187 | 910 | if( W == 8 )
188 | 791 | {
// The whole body of this branch sits inside #ifdef USE_AVX2: when the macro is
// not defined the branch is empty and nothing is written.
189 | 791 | if( vext >= AVX2 )
190 | 791 | {
191 | | #ifdef USE_AVX2 |
// Byte shuffle that turns 8 consecutive samples s0..s7 (broadcast to both
// lanes) into the four overlapping 4-sample windows starting at s0, s1 (low
// lane) and s2, s3 (high lane).
192 | | __m256i shflmask1= _mm256_set_epi8(0xd, 0xc, 0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4, |
193 | | 0x9, 0x8, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0 ); |
// Rounding term added before the >> 6.
194 | | __m256i offset = _mm256_set1_epi32( 32 ); |
195 | | |
// Width is a multiple of 16: 16 outputs per inner iteration.
196 | 791 | if (( width & 15 ) == 0 )
197 | 656 | {
// Clamp bounds are set only when useCubicFilter is true and are read only
// under that same condition.
198 | 656 | __m256i vbdmin,vbdmax;
199 | | |
200 | 656 | if (useCubicFilter)
201 | 268 | {
202 | 268 | vbdmin = _mm256_set1_epi16( clpRng.min() );
203 | 268 | vbdmax = _mm256_set1_epi16( clpRng.max() );
204 | 268 | }
205 | | |
206 | 18.7k | for (int y = 0; y<height; y++ )
207 | 18.0k | {
208 | 18.0k | int deltaInt = deltaPos >> 5;
209 | 18.0k | int deltaFract = deltaPos & (32 - 1);
210 | 18.0k | int refMainIndex = deltaInt + 1;
211 | 18.0k | pDst=&pDstBuf[y*dstStride];
// 64-bit load of the four coefficients; shuffle 0x44 duplicates them into the
// upper half, the broadcast copies them to both 128-bit lanes.
212 | 18.0k | __m128i tmp = _mm_loadu_si64( ( __m128i const * )&ff[deltaFract<<2] ); //load 4 16 bit filter coeffs
213 | 18.0k | tmp = _mm_shuffle_epi32(tmp,0x44);
214 | 18.0k | __m256i coeff = _mm256_broadcastsi128_si256(tmp);
215 | 72.2k | for( int x = 0; x < width; x+=16)
216 | 54.2k | {
// First group of 8 outputs: src1 covers windows 0..3, src2 windows 4..7.
217 | 54.2k | __m256i src1 = _mm256_broadcastsi128_si256( _mm_loadu_si128( ( const __m128i* ) &refMain[refMainIndex - 1] ) );
218 | 54.2k | __m256i src2 = _mm256_broadcastsi128_si256( _mm_loadu_si128( ( const __m128i* ) &refMain[refMainIndex + 4 - 1] ) );
219 | 54.2k | src1 = _mm256_shuffle_epi8(src1,shflmask1); // -1 0 1 2 0 1 2 3 1 2 3 4 2 3 4 5
220 | 54.2k | src2 = _mm256_shuffle_epi8(src2,shflmask1); // 3 4 5 6 4 5 6 7 5 6 7 8 6 7 8 9
221 | | |
222 | 54.2k | src1 = _mm256_madd_epi16 (src1, coeff);
223 | 54.2k | src2 = _mm256_madd_epi16 (src2, coeff);
224 | | |
// hadd works within each 128-bit lane, leaving the outputs ordered
// 0 1 4 5 | 2 3 6 7; permute 0xD8 swaps the middle 64-bit quarters to restore
// 0..7.
225 | 54.2k | __m256i sum = _mm256_hadd_epi32( src1, src2 );
226 | 54.2k | sum = _mm256_permute4x64_epi64(sum,0xD8);
227 | | |
228 | 54.2k | sum = _mm256_add_epi32( sum, offset );
229 | 54.2k | sum = _mm256_srai_epi32( sum, 6 );
230 | | |
231 | 54.2k | refMainIndex+=8;
232 | | |
// Second group of 8 outputs, same steps.
233 | 54.2k | src1 = _mm256_broadcastsi128_si256( _mm_loadu_si128( ( __m128i const* ) &refMain[refMainIndex - 1] ) );
234 | 54.2k | src2 = _mm256_broadcastsi128_si256( _mm_loadu_si128( ( __m128i const* ) &refMain[refMainIndex + 4 - 1] ) );
235 | | |
236 | 54.2k | src1 = _mm256_shuffle_epi8(src1,shflmask1); // -1 0 1 2 0 1 2 3 1 2 3 4 2 3 4 5
237 | 54.2k | src2 = _mm256_shuffle_epi8(src2,shflmask1); // 3 4 5 6 4 5 6 7 5 6 7 8 6 7 8 9
238 | 54.2k | src1 = _mm256_madd_epi16 (src1, coeff);
239 | 54.2k | src2 = _mm256_madd_epi16 (src2, coeff);
240 | | |
241 | 54.2k | __m256i sum1 = _mm256_hadd_epi32( src1, src2 );
242 | 54.2k | sum1 = _mm256_permute4x64_epi64(sum1,0xD8);
243 | | |
244 | 54.2k | sum1 = _mm256_add_epi32( sum1, offset );
245 | 54.2k | sum1 = _mm256_srai_epi32( sum1, 6 );
// Signed-saturating pack to 16 bits; it also works per lane, hence the second
// 0xD8 permute to put the 16 outputs back in order.
246 | 54.2k | __m256i
247 | 54.2k | src0 = _mm256_packs_epi32( sum, sum1 );
248 | | |
249 | 54.2k | src0 = _mm256_permute4x64_epi64(src0,0xD8);
250 | | |
251 | 54.2k | refMainIndex+=8;
252 | | |
253 | 54.2k | if (useCubicFilter)
254 | 13.3k | src0 = _mm256_min_epi16( vbdmax, _mm256_max_epi16( vbdmin, src0 ) );
255 | | |
256 | 54.2k | _mm256_storeu_si256( ( __m256i * )(pDst + x), src0);
257 | 54.2k | }
258 | 18.0k | deltaPos += intraPredAngle;
259 | 18.0k | }
260 | 656 | }
// Remaining widths: a single group of 8 outputs per row, no inner x loop.
261 | 135 | else // width =8
262 | 135 | {
263 | | // printf("AVX2 Block %d \n",width); |
264 | 135 | __m128i vbdmin,vbdmax;
265 | | |
266 | 135 | if (useCubicFilter)
267 | 122 | {
268 | 122 | vbdmin = _mm_set1_epi16( clpRng.min() );
269 | 122 | vbdmax = _mm_set1_epi16( clpRng.max() );
270 | 122 | }
271 | | |
272 | 2.31k | for (int y = 0; y<height; y++ )
273 | 2.17k | {
274 | 2.17k | int deltaInt = deltaPos >> 5;
275 | 2.17k | int deltaFract = deltaPos & (32 - 1);
276 | 2.17k | int refMainIndex = deltaInt + 1;
277 | 2.17k | pDst=&pDstBuf[y*dstStride];
278 | 2.17k | __m128i tmp = _mm_loadu_si64( ( __m128i const * )&ff[deltaFract<<2] ); //load 4 16 bit filter coeffs
279 | 2.17k | tmp = _mm_shuffle_epi32(tmp,0x44);
280 | 2.17k | __m256i coeff = _mm256_broadcastsi128_si256(tmp);
281 | 2.17k | __m256i src1 = _mm256_broadcastsi128_si256( _mm_loadu_si128( ( __m128i const* ) & refMain[refMainIndex - 1] ) );
282 | 2.17k | __m256i src2 = _mm256_broadcastsi128_si256( _mm_loadu_si128( ( __m128i const* ) & refMain[refMainIndex + 4 - 1] ) );
283 | 2.17k | src1 = _mm256_shuffle_epi8(src1,shflmask1); // -1 0 1 2 0 1 2 3 1 2 3 4 2 3 4 5
284 | 2.17k | src2 = _mm256_shuffle_epi8(src2,shflmask1); // 3 4 5 6 4 5 6 7 5 6 7 8 6 7 8 9
285 | | |
286 | 2.17k | src1 = _mm256_madd_epi16 (src1, coeff);
287 | 2.17k | src2 = _mm256_madd_epi16 (src2, coeff);
288 | | |
289 | 2.17k | __m256i sum = _mm256_hadd_epi32( src1, src2 );
290 | 2.17k | sum = _mm256_permute4x64_epi64(sum,0xD8);
291 | | |
292 | 2.17k | sum = _mm256_add_epi32( sum, offset );
293 | 2.17k | sum = _mm256_srai_epi32( sum, 6 );
// NOTE(review): _mm256_cvtepi32_epi16x is not defined in this view; presumably
// it narrows the eight 32-bit sums to eight 16-bit values - confirm.
294 | 2.17k | __m128i dest128 = _mm256_cvtepi32_epi16x( sum );
295 | | |
296 | 2.17k | if (useCubicFilter)
297 | 1.83k | dest128 = _mm_min_epi16( vbdmax, _mm_max_epi16( vbdmin, dest128 ) );
298 | | |
299 | 2.17k | _mm_storeu_si128( ( __m128i * )(pDst), dest128);
300 | 2.17k | deltaPos += intraPredAngle;
301 | 2.17k | }
302 | 135 | }
303 | | #endif |
304 | 791 | }
// vext below AVX2: 128-bit path, 8 outputs per x step as two groups of 4.
305 | 0 | else
306 | 0 | {
// shflmask1 builds the windows starting at samples 0 and 1 of the loaded
// vector, shflmask2 those starting at samples 2 and 3.
307 | 0 | __m128i shflmask1= _mm_set_epi8( 0x9, 0x8, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0 );
308 | 0 | __m128i shflmask2= _mm_set_epi8( 0xd, 0xc, 0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4 );
309 | 0 | __m128i vbdmin,vbdmax;
310 |

311 | 0 | __m128i offset = _mm_set1_epi32( 32 );
312 |

313 | 0 | if (useCubicFilter)
314 | 0 | {
315 | 0 | vbdmin = _mm_set1_epi16( clpRng.min() );
316 | 0 | vbdmax = _mm_set1_epi16( clpRng.max() );
317 | 0 | }
318 |

319 | 0 | for (int y = 0; y<height; y++ )
320 | 0 | {
321 | 0 | int deltaInt = deltaPos >> 5;
322 | 0 | int deltaFract = deltaPos & (32 - 1);
323 | 0 | int refMainIndex = deltaInt + 1;
324 | 0 | pDst=&pDstBuf[y*dstStride];
325 | 0 | __m128i coeff = _mm_loadu_si64( ( __m128i const * )&ff[deltaFract<<2] ); //load 4 16 bit filter coeffs
326 | 0 | coeff = _mm_shuffle_epi32(coeff,0x44);
327 | 0 | for( int x = 0; x < width; x+=8)
328 | 0 | {
// First group of 4 outputs.
329 | 0 | __m128i src0 = _mm_loadu_si128( ( __m128i const * )&refMain[refMainIndex - 1] ); //load 8 16 bit reference Pels -1 0 1 2 3 4 5 6
330 | 0 | __m128i src1 = _mm_shuffle_epi8(src0,shflmask1); // -1 0 1 2 0 1 2 3
331 | 0 | __m128i src2 = _mm_shuffle_epi8(src0,shflmask2); // 1 2 3 4 2 3 4 5
332 | 0 | src0 = _mm_madd_epi16( coeff,src1 );
333 | 0 | src1 = _mm_madd_epi16( coeff,src2 );
334 | 0 | __m128i sum = _mm_hadd_epi32( src0, src1 );
335 | 0 | sum = _mm_add_epi32( sum, offset );
336 | 0 | sum = _mm_srai_epi32( sum, 6 );
337 |

// Second group of 4 outputs, 4 samples further along the reference row.
338 | 0 | refMainIndex+=4;
339 | 0 | src0 = _mm_loadu_si128( ( __m128i const * )&refMain[refMainIndex - 1] ); //load 8 16 bit reference Pels -1 0 1 2 3 4 5 6
340 | 0 | src1 = _mm_shuffle_epi8(src0,shflmask1); // -1 0 1 2 0 1 2 3
341 | 0 | src2 = _mm_shuffle_epi8(src0,shflmask2);
342 | | |
343 | | // 1 2 3 4 2 3 4 5 |
344 | 0 | src0 = _mm_madd_epi16( coeff,src1 );
345 | 0 | src1 = _mm_madd_epi16( coeff,src2 );
346 |

347 | 0 | __m128i sum1 = _mm_hadd_epi32( src0, src1 );
348 | 0 | sum1 = _mm_add_epi32( sum1, offset );
349 | 0 | sum1 = _mm_srai_epi32( sum1, 6 );
// Signed-saturating pack of both groups into eight 16-bit outputs.
350 | 0 | src0 = _mm_packs_epi32( sum, sum1 );
351 |

352 | 0 | refMainIndex+=4;
353 |

354 | 0 | if (useCubicFilter)
355 | 0 | src0 = _mm_min_epi16( vbdmax, _mm_max_epi16( vbdmin, src0 ) );
356 |

357 | 0 | _mm_storeu_si128( ( __m128i * )(pDst + x), src0);
358 |

359 | 0 | }
360 | 0 | deltaPos += intraPredAngle;
361 | 0 | }
362 | 0 | }
363 | 791 | }
// W == 4: one group of 4 outputs per row; only the low 64 bits are stored.
364 | 119 | else if( W == 4 )
365 | 119 | {
366 | | |
367 | 119 | __m128i shflmask1= _mm_set_epi8( 0x9, 0x8, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0 );
368 | 119 | __m128i shflmask2= _mm_set_epi8( 0xd, 0xc, 0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4 );
369 | 119 | __m128i vbdmin,vbdmax;
370 | | |
371 | 119 | __m128i offset = _mm_set1_epi32( 32 );
372 | | |
373 | 119 | if (useCubicFilter)
374 | 119 | {
375 | 119 | vbdmin = _mm_set1_epi16( clpRng.min() );
376 | 119 | vbdmax = _mm_set1_epi16( clpRng.max() );
377 | 119 | }
378 | | |
379 | 2.37k | for (int y = 0; y<height; y++ )
380 | 2.26k | {
381 | 2.26k | int deltaInt = deltaPos >> 5;
382 | 2.26k | int deltaFract = deltaPos & (32 - 1);
383 | 2.26k | int refMainIndex = deltaInt + 1;
384 | 2.26k | pDst=&pDstBuf[y*dstStride];
385 | 2.26k | __m128i coeff = _mm_loadu_si64( ( __m128i const * )&ff[deltaFract<<2] ); //load 4 16 bit filter coeffs
386 | 2.26k | coeff = _mm_shuffle_epi32(coeff,0x44);
387 | 2.26k | {
388 | 2.26k | __m128i src0 = _mm_loadu_si128( ( __m128i const * )&refMain[refMainIndex - 1] ); //load 8 16 bit reference Pels -1 0 1 2 3 4 5 6
389 | 2.26k | __m128i src1 = _mm_shuffle_epi8(src0,shflmask1); // -1 0 1 2 0 1 2 3
390 | 2.26k | __m128i src2 = _mm_shuffle_epi8(src0,shflmask2); // 1 2 3 4 2 3 4 5
391 | 2.26k | src0 = _mm_madd_epi16( coeff,src1 );
392 | 2.26k | src1 = _mm_madd_epi16( coeff,src2 );
393 | 2.26k | __m128i sum = _mm_hadd_epi32( src0, src1 );
394 | 2.26k | sum = _mm_add_epi32( sum, offset );
395 | 2.26k | sum = _mm_srai_epi32( sum, 6 );
396 | | |
// Pack the four sums with themselves; the duplicate upper half is never stored.
397 | 2.26k | src0 = _mm_packs_epi32( sum, sum );
398 | | |
// NOTE(review): this increment has no effect - refMainIndex is re-declared on
// every row and is not read again in this iteration.
399 | 2.26k | refMainIndex+=4;
400 | | |
401 | 2.26k | if (useCubicFilter)
402 | 2.26k | src0 = _mm_min_epi16( vbdmax, _mm_max_epi16( vbdmin, src0 ) );
403 | | |
404 | 2.26k | _mm_storeu_si64( ( __m128i * )(pDst ), src0);
405 | | |
406 | 2.26k | }
407 | 2.26k | deltaPos += intraPredAngle;
408 | 2.26k | }
409 | 119 | }
410 | 0 | else
411 | 0 | {
412 | 0 | THROW_FATAL( "Unsupported size in IntraPredAngleCore_SIMD" );
413 | 0 | }
// Tested with #if here (not #ifdef as above), so USE_AVX2 must expand to a
// non-zero value for the zeroupper call to be compiled in.
414 | | #if USE_AVX2 |
415 | 910 | _mm256_zeroupper();
416 | 910 | #endif
417 | 910 | } Unexecuted instantiation: void vvdec::IntraPredAngleCore_SIMD<(vvdec::x86_simd::X86_VEXT)1, 4>(short*, long, short*, int, int, int, int, short const*, bool, vvdec::ClpRngTemplate<short> const&) Unexecuted instantiation: void vvdec::IntraPredAngleCore_SIMD<(vvdec::x86_simd::X86_VEXT)1, 8>(short*, long, short*, int, int, int, int, short const*, bool, vvdec::ClpRngTemplate<short> const&) void vvdec::IntraPredAngleCore_SIMD<(vvdec::x86_simd::X86_VEXT)4, 4>(short*, long, short*, int, int, int, int, short const*, bool, vvdec::ClpRngTemplate<short> const&) Line | Count | Source | 184 | 119 | { | 185 | 119 | int16_t* pDst; | 186 | | | 187 | 119 | if( W == 8 ) | 188 | 0 | { | 189 | 0 | if( vext >= AVX2 ) | 190 | 0 | { | 191 | 0 | #ifdef USE_AVX2 | 192 | 0 | __m256i shflmask1= _mm256_set_epi8(0xd, 0xc, 0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4, | 193 | 0 | 0x9, 0x8, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0 ); | 194 | 0 | __m256i offset = _mm256_set1_epi32( 32 ); | 195 | |
| 196 | 0 | if (( width & 15 ) == 0 ) | 197 | 0 | { | 198 | 0 | __m256i vbdmin,vbdmax; | 199 | |
| 200 | 0 | if (useCubicFilter) | 201 | 0 | { | 202 | 0 | vbdmin = _mm256_set1_epi16( clpRng.min() ); | 203 | 0 | vbdmax = _mm256_set1_epi16( clpRng.max() ); | 204 | 0 | } | 205 | |
| 206 | 0 | for (int y = 0; y<height; y++ ) | 207 | 0 | { | 208 | 0 | int deltaInt = deltaPos >> 5; | 209 | 0 | int deltaFract = deltaPos & (32 - 1); | 210 | 0 | int refMainIndex = deltaInt + 1; | 211 | 0 | pDst=&pDstBuf[y*dstStride]; | 212 | 0 | __m128i tmp = _mm_loadu_si64( ( __m128i const * )&ff[deltaFract<<2] ); //load 4 16 bit filter coeffs | 213 | 0 | tmp = _mm_shuffle_epi32(tmp,0x44); | 214 | 0 | __m256i coeff = _mm256_broadcastsi128_si256(tmp); | 215 | 0 | for( int x = 0; x < width; x+=16) | 216 | 0 | { | 217 | 0 | __m256i src1 = _mm256_broadcastsi128_si256( _mm_loadu_si128( ( const __m128i* ) &refMain[refMainIndex - 1] ) ); | 218 | 0 | __m256i src2 = _mm256_broadcastsi128_si256( _mm_loadu_si128( ( const __m128i* ) &refMain[refMainIndex + 4 - 1] ) ); | 219 | 0 | src1 = _mm256_shuffle_epi8(src1,shflmask1); // -1 0 1 2 0 1 2 3 1 2 3 4 2 3 4 5 | 220 | 0 | src2 = _mm256_shuffle_epi8(src2,shflmask1); // 3 4 5 6 4 5 6 7 5 6 7 8 6 7 8 9 | 221 | |
| 222 | 0 | src1 = _mm256_madd_epi16 (src1, coeff); | 223 | 0 | src2 = _mm256_madd_epi16 (src2, coeff); | 224 | |
| 225 | 0 | __m256i sum = _mm256_hadd_epi32( src1, src2 ); | 226 | 0 | sum = _mm256_permute4x64_epi64(sum,0xD8); | 227 | |
| 228 | 0 | sum = _mm256_add_epi32( sum, offset ); | 229 | 0 | sum = _mm256_srai_epi32( sum, 6 ); | 230 | |
| 231 | 0 | refMainIndex+=8; | 232 | | | 233 | 0 | src1 = _mm256_broadcastsi128_si256( _mm_loadu_si128( ( __m128i const* ) &refMain[refMainIndex - 1] ) ); | 234 | 0 | src2 = _mm256_broadcastsi128_si256( _mm_loadu_si128( ( __m128i const* ) &refMain[refMainIndex + 4 - 1] ) ); | 235 | |
| 236 | 0 | src1 = _mm256_shuffle_epi8(src1,shflmask1); // -1 0 1 2 0 1 2 3 1 2 3 4 2 3 4 5 | 237 | 0 | src2 = _mm256_shuffle_epi8(src2,shflmask1); // 3 4 5 6 4 5 6 7 5 6 7 8 6 7 8 9 | 238 | 0 | src1 = _mm256_madd_epi16 (src1, coeff); | 239 | 0 | src2 = _mm256_madd_epi16 (src2, coeff); | 240 | |
| 241 | 0 | __m256i sum1 = _mm256_hadd_epi32( src1, src2 ); | 242 | 0 | sum1 = _mm256_permute4x64_epi64(sum1,0xD8); | 243 | |
| 244 | 0 | sum1 = _mm256_add_epi32( sum1, offset ); | 245 | 0 | sum1 = _mm256_srai_epi32( sum1, 6 ); | 246 | 0 | __m256i | 247 | 0 | src0 = _mm256_packs_epi32( sum, sum1 ); | 248 | |
| 249 | 0 | src0 = _mm256_permute4x64_epi64(src0,0xD8); | 250 | |
| 251 | 0 | refMainIndex+=8; | 252 | |
| 253 | 0 | if (useCubicFilter) | 254 | 0 | src0 = _mm256_min_epi16( vbdmax, _mm256_max_epi16( vbdmin, src0 ) ); | 255 | |
| 256 | 0 | _mm256_storeu_si256( ( __m256i * )(pDst + x), src0); | 257 | 0 | } | 258 | 0 | deltaPos += intraPredAngle; | 259 | 0 | } | 260 | 0 | } | 261 | 0 | else // width =8 | 262 | 0 | { | 263 | | // printf("AVX2 Block %d \n",width); | 264 | 0 | __m128i vbdmin,vbdmax; | 265 | |
| 266 | 0 | if (useCubicFilter) | 267 | 0 | { | 268 | 0 | vbdmin = _mm_set1_epi16( clpRng.min() ); | 269 | 0 | vbdmax = _mm_set1_epi16( clpRng.max() ); | 270 | 0 | } | 271 | |
| 272 | 0 | for (int y = 0; y<height; y++ ) | 273 | 0 | { | 274 | 0 | int deltaInt = deltaPos >> 5; | 275 | 0 | int deltaFract = deltaPos & (32 - 1); | 276 | 0 | int refMainIndex = deltaInt + 1; | 277 | 0 | pDst=&pDstBuf[y*dstStride]; | 278 | 0 | __m128i tmp = _mm_loadu_si64( ( __m128i const * )&ff[deltaFract<<2] ); //load 4 16 bit filter coeffs | 279 | 0 | tmp = _mm_shuffle_epi32(tmp,0x44); | 280 | 0 | __m256i coeff = _mm256_broadcastsi128_si256(tmp); | 281 | 0 | __m256i src1 = _mm256_broadcastsi128_si256( _mm_loadu_si128( ( __m128i const* ) & refMain[refMainIndex - 1] ) ); | 282 | 0 | __m256i src2 = _mm256_broadcastsi128_si256( _mm_loadu_si128( ( __m128i const* ) & refMain[refMainIndex + 4 - 1] ) ); | 283 | 0 | src1 = _mm256_shuffle_epi8(src1,shflmask1); // -1 0 1 2 0 1 2 3 1 2 3 4 2 3 4 5 | 284 | 0 | src2 = _mm256_shuffle_epi8(src2,shflmask1); // 3 4 5 6 4 5 6 7 5 6 7 8 6 7 8 9 | 285 | |
| 286 | 0 | src1 = _mm256_madd_epi16 (src1, coeff); | 287 | 0 | src2 = _mm256_madd_epi16 (src2, coeff); | 288 | |
| 289 | 0 | __m256i sum = _mm256_hadd_epi32( src1, src2 ); | 290 | 0 | sum = _mm256_permute4x64_epi64(sum,0xD8); | 291 | |
| 292 | 0 | sum = _mm256_add_epi32( sum, offset ); | 293 | 0 | sum = _mm256_srai_epi32( sum, 6 ); | 294 | 0 | __m128i dest128 = _mm256_cvtepi32_epi16x( sum ); | 295 | |
| 296 | 0 | if (useCubicFilter) | 297 | 0 | dest128 = _mm_min_epi16( vbdmax, _mm_max_epi16( vbdmin, dest128 ) ); | 298 | |
| 299 | 0 | _mm_storeu_si128( ( __m128i * )(pDst), dest128); | 300 | 0 | deltaPos += intraPredAngle; | 301 | 0 | } | 302 | 0 | } | 303 | 0 | #endif | 304 | 0 | } | 305 | 0 | else | 306 | 0 | { | 307 | 0 | __m128i shflmask1= _mm_set_epi8( 0x9, 0x8, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0 ); | 308 | 0 | __m128i shflmask2= _mm_set_epi8( 0xd, 0xc, 0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4 ); | 309 | 0 | __m128i vbdmin,vbdmax; | 310 | |
| 311 | 0 | __m128i offset = _mm_set1_epi32( 32 ); | 312 | |
| 313 | 0 | if (useCubicFilter) | 314 | 0 | { | 315 | 0 | vbdmin = _mm_set1_epi16( clpRng.min() ); | 316 | 0 | vbdmax = _mm_set1_epi16( clpRng.max() ); | 317 | 0 | } | 318 | |
| 319 | 0 | for (int y = 0; y<height; y++ ) | 320 | 0 | { | 321 | 0 | int deltaInt = deltaPos >> 5; | 322 | 0 | int deltaFract = deltaPos & (32 - 1); | 323 | 0 | int refMainIndex = deltaInt + 1; | 324 | 0 | pDst=&pDstBuf[y*dstStride]; | 325 | 0 | __m128i coeff = _mm_loadu_si64( ( __m128i const * )&ff[deltaFract<<2] ); //load 4 16 bit filter coeffs | 326 | 0 | coeff = _mm_shuffle_epi32(coeff,0x44); | 327 | 0 | for( int x = 0; x < width; x+=8) | 328 | 0 | { | 329 | 0 | __m128i src0 = _mm_loadu_si128( ( __m128i const * )&refMain[refMainIndex - 1] ); //load 8 16 bit reference Pels -1 0 1 2 3 4 5 6 | 330 | 0 | __m128i src1 = _mm_shuffle_epi8(src0,shflmask1); // -1 0 1 2 0 1 2 3 | 331 | 0 | __m128i src2 = _mm_shuffle_epi8(src0,shflmask2); // 1 2 3 4 2 3 4 5 | 332 | 0 | src0 = _mm_madd_epi16( coeff,src1 ); | 333 | 0 | src1 = _mm_madd_epi16( coeff,src2 ); | 334 | 0 | __m128i sum = _mm_hadd_epi32( src0, src1 ); | 335 | 0 | sum = _mm_add_epi32( sum, offset ); | 336 | 0 | sum = _mm_srai_epi32( sum, 6 ); | 337 | |
| 338 | 0 | refMainIndex+=4; | 339 | 0 | src0 = _mm_loadu_si128( ( __m128i const * )&refMain[refMainIndex - 1] ); //load 8 16 bit reference Pels -1 0 1 2 3 4 5 6 | 340 | 0 | src1 = _mm_shuffle_epi8(src0,shflmask1); // -1 0 1 2 0 1 2 3 | 341 | 0 | src2 = _mm_shuffle_epi8(src0,shflmask2); | 342 | | | 343 | | // 1 2 3 4 2 3 4 5 | 344 | 0 | src0 = _mm_madd_epi16( coeff,src1 ); | 345 | 0 | src1 = _mm_madd_epi16( coeff,src2 ); | 346 | |
| 347 | 0 | __m128i sum1 = _mm_hadd_epi32( src0, src1 ); | 348 | 0 | sum1 = _mm_add_epi32( sum1, offset ); | 349 | 0 | sum1 = _mm_srai_epi32( sum1, 6 ); | 350 | 0 | src0 = _mm_packs_epi32( sum, sum1 ); | 351 | |
| 352 | 0 | refMainIndex+=4; | 353 | |
| 354 | 0 | if (useCubicFilter) | 355 | 0 | src0 = _mm_min_epi16( vbdmax, _mm_max_epi16( vbdmin, src0 ) ); | 356 | |
| 357 | 0 | _mm_storeu_si128( ( __m128i * )(pDst + x), src0); | 358 | |
| 359 | 0 | } | 360 | 0 | deltaPos += intraPredAngle; | 361 | 0 | } | 362 | 0 | } | 363 | 0 | } | 364 | 119 | else if( W == 4 ) | 365 | 119 | { | 366 | | | 367 | 119 | __m128i shflmask1= _mm_set_epi8( 0x9, 0x8, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0 ); | 368 | 119 | __m128i shflmask2= _mm_set_epi8( 0xd, 0xc, 0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4 ); | 369 | 119 | __m128i vbdmin,vbdmax; | 370 | | | 371 | 119 | __m128i offset = _mm_set1_epi32( 32 ); | 372 | | | 373 | 119 | if (useCubicFilter) | 374 | 119 | { | 375 | 119 | vbdmin = _mm_set1_epi16( clpRng.min() ); | 376 | 119 | vbdmax = _mm_set1_epi16( clpRng.max() ); | 377 | 119 | } | 378 | | | 379 | 2.37k | for (int y = 0; y<height; y++ ) | 380 | 2.26k | { | 381 | 2.26k | int deltaInt = deltaPos >> 5; | 382 | 2.26k | int deltaFract = deltaPos & (32 - 1); | 383 | 2.26k | int refMainIndex = deltaInt + 1; | 384 | 2.26k | pDst=&pDstBuf[y*dstStride]; | 385 | 2.26k | __m128i coeff = _mm_loadu_si64( ( __m128i const * )&ff[deltaFract<<2] ); //load 4 16 bit filter coeffs | 386 | 2.26k | coeff = _mm_shuffle_epi32(coeff,0x44); | 387 | 2.26k | { | 388 | 2.26k | __m128i src0 = _mm_loadu_si128( ( __m128i const * )&refMain[refMainIndex - 1] ); //load 8 16 bit reference Pels -1 0 1 2 3 4 5 6 | 389 | 2.26k | __m128i src1 = _mm_shuffle_epi8(src0,shflmask1); // -1 0 1 2 0 1 2 3 | 390 | 2.26k | __m128i src2 = _mm_shuffle_epi8(src0,shflmask2); // 1 2 3 4 2 3 4 5 | 391 | 2.26k | src0 = _mm_madd_epi16( coeff,src1 ); | 392 | 2.26k | src1 = _mm_madd_epi16( coeff,src2 ); | 393 | 2.26k | __m128i sum = _mm_hadd_epi32( src0, src1 ); | 394 | 2.26k | sum = _mm_add_epi32( sum, offset ); | 395 | 2.26k | sum = _mm_srai_epi32( sum, 6 ); | 396 | | | 397 | 2.26k | src0 = _mm_packs_epi32( sum, sum ); | 398 | | | 399 | 2.26k | refMainIndex+=4; | 400 | | | 401 | 2.26k | if (useCubicFilter) | 402 | 2.26k | src0 = _mm_min_epi16( vbdmax, _mm_max_epi16( vbdmin, src0 ) ); | 403 | | | 404 | 2.26k | 
_mm_storeu_si64( ( __m128i * )(pDst ), src0); | 405 | | | 406 | 2.26k | } | 407 | 2.26k | deltaPos += intraPredAngle; | 408 | 2.26k | } | 409 | 119 | } | 410 | 0 | else | 411 | 0 | { | 412 | 0 | THROW_FATAL( "Unsupported size in IntraPredAngleCore_SIMD" ); | 413 | 0 | } | 414 | 119 | #if USE_AVX2 | 415 | 119 | _mm256_zeroupper(); | 416 | 119 | #endif | 417 | 119 | } |
// Coverage listing for one instantiation of IntraPredAngleCore_SIMD (X86_VEXT value 4, W = 8, as
// given by the demangled name below). Row layout: source line | execution count | source text.
//
// What the body does, as visible in this listing:
//  * Per output row y, deltaPos is split into deltaInt = deltaPos >> 5 and deltaFract = the low
//    5 bits of deltaPos; refMainIndex = deltaInt + 1, and deltaPos advances by intraPredAngle
//    after every row.
//  * Four 16-bit coefficients are loaded from ff[deltaFract << 2] and replicated across the
//    whole vector register.
//  * Reference samples are loaded starting at refMain[refMainIndex - 1]. The byte shuffles build
//    overlapping 4-sample windows; madd followed by hadd reduces each window to one 32-bit sum,
//    then (sum + 32) >> 6 is applied and the results are narrowed to 16 bit.
//  * The result is clamped to [clpRng.min(), clpRng.max()] only when useCubicFilter is true.
//    vbdmin/vbdmax stay uninitialized otherwise and are only read under that same condition.
//  * Paths: W == 8 with vext >= AVX2 (16 samples per inner step when width is a multiple of 16,
//    otherwise one 8-sample store per row), W == 8 fallback using 128-bit registers (8 samples
//    per inner step), W == 4 (one 64-bit store per row). Any other W reaches THROW_FATAL.
//  * In this instantiation only the AVX2 branch has non-zero counts.
// NOTE(review): _mm256_cvtepi32_epi16x is defined outside this listing - presumably it narrows the
// eight 32-bit sums to eight 16-bit values; confirm against its definition.
// NOTE(review): the AVX2 body is guarded by `#ifdef USE_AVX2` (row 191) while the trailing
// _mm256_zeroupper() is guarded by `#if USE_AVX2` (row 414) - confirm both spellings are intended.
// NOTE(review): every 128-bit load reads 8 samples from refMain even where fewer are consumed, so
// the reference buffer presumably has padding past the last needed sample - verify at the caller.
void vvdec::IntraPredAngleCore_SIMD<(vvdec::x86_simd::X86_VEXT)4, 8>(short*, long, short*, int, int, int, int, short const*, bool, vvdec::ClpRngTemplate<short> const&) Line | Count | Source | 184 | 791 | { | 185 | 791 | int16_t* pDst; | 186 | | | 187 | 791 | if( W == 8 ) | 188 | 791 | { | 189 | 791 | if( vext >= AVX2 ) | 190 | 791 | { | 191 | 791 | #ifdef USE_AVX2 | 192 | 791 | __m256i shflmask1= _mm256_set_epi8(0xd, 0xc, 0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4, | 193 | 791 | 0x9, 0x8, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0 ); | 194 | 791 | __m256i offset = _mm256_set1_epi32( 32 ); | 195 | | | 196 | 791 | if (( width & 15 ) == 0 ) | 197 | 656 | { | 198 | 656 | __m256i vbdmin,vbdmax; | 199 | | | 200 | 656 | if (useCubicFilter) | 201 | 268 | { | 202 | 268 | vbdmin = _mm256_set1_epi16( clpRng.min() ); | 203 | 268 | vbdmax = _mm256_set1_epi16( clpRng.max() ); | 204 | 268 | } | 205 | | | 206 | 18.7k | for (int y = 0; y<height; y++ ) | 207 | 18.0k | { | 208 | 18.0k | int deltaInt = deltaPos >> 5; | 209 | 18.0k | int deltaFract = deltaPos & (32 - 1); | 210 | 18.0k | int refMainIndex = deltaInt + 1; | 211 | 18.0k | pDst=&pDstBuf[y*dstStride]; | 212 | 18.0k | __m128i tmp = _mm_loadu_si64( ( __m128i const * )&ff[deltaFract<<2] ); //load 4 16 bit filter coeffs | 213 | 18.0k | tmp = _mm_shuffle_epi32(tmp,0x44); | 214 | 18.0k | __m256i coeff = _mm256_broadcastsi128_si256(tmp); | 215 | 72.2k | for( int x = 0; x < width; x+=16) | 216 | 54.2k | { | 217 | 54.2k | __m256i src1 = _mm256_broadcastsi128_si256( _mm_loadu_si128( ( const __m128i* ) &refMain[refMainIndex - 1] ) ); | 218 | 54.2k | __m256i src2 = _mm256_broadcastsi128_si256( _mm_loadu_si128( ( const __m128i* ) &refMain[refMainIndex + 4 - 1] ) ); | 219 | 54.2k | src1 = _mm256_shuffle_epi8(src1,shflmask1); // -1 0 1 2 0 1 2 3 1 2 3 4 2 3 4 5 | 220 | 54.2k | src2 = _mm256_shuffle_epi8(src2,shflmask1); // 3 4 5 6 4 5 6 7 5 6 7 8 6 7 8 9 | 221 | | | 222 | 54.2k | src1 = 
_mm256_madd_epi16 (src1, coeff); | 223 | 54.2k | src2 = _mm256_madd_epi16 (src2, coeff); | 224 | | | 225 | 54.2k | __m256i sum = _mm256_hadd_epi32( src1, src2 ); | 226 | 54.2k | sum = _mm256_permute4x64_epi64(sum,0xD8); | 227 | | | 228 | 54.2k | sum = _mm256_add_epi32( sum, offset ); | 229 | 54.2k | sum = _mm256_srai_epi32( sum, 6 ); | 230 | | | 231 | 54.2k | refMainIndex+=8; | 232 | | | 233 | 54.2k | src1 = _mm256_broadcastsi128_si256( _mm_loadu_si128( ( __m128i const* ) &refMain[refMainIndex - 1] ) ); | 234 | 54.2k | src2 = _mm256_broadcastsi128_si256( _mm_loadu_si128( ( __m128i const* ) &refMain[refMainIndex + 4 - 1] ) ); | 235 | | | 236 | 54.2k | src1 = _mm256_shuffle_epi8(src1,shflmask1); // -1 0 1 2 0 1 2 3 1 2 3 4 2 3 4 5 | 237 | 54.2k | src2 = _mm256_shuffle_epi8(src2,shflmask1); // 3 4 5 6 4 5 6 7 5 6 7 8 6 7 8 9 | 238 | 54.2k | src1 = _mm256_madd_epi16 (src1, coeff); | 239 | 54.2k | src2 = _mm256_madd_epi16 (src2, coeff); | 240 | | | 241 | 54.2k | __m256i sum1 = _mm256_hadd_epi32( src1, src2 ); | 242 | 54.2k | sum1 = _mm256_permute4x64_epi64(sum1,0xD8); | 243 | | | 244 | 54.2k | sum1 = _mm256_add_epi32( sum1, offset ); | 245 | 54.2k | sum1 = _mm256_srai_epi32( sum1, 6 ); | 246 | 54.2k | __m256i | 247 | 54.2k | src0 = _mm256_packs_epi32( sum, sum1 ); | 248 | | | 249 | 54.2k | src0 = _mm256_permute4x64_epi64(src0,0xD8); | 250 | | | 251 | 54.2k | refMainIndex+=8; | 252 | | | 253 | 54.2k | if (useCubicFilter) | 254 | 13.3k | src0 = _mm256_min_epi16( vbdmax, _mm256_max_epi16( vbdmin, src0 ) ); | 255 | | | 256 | 54.2k | _mm256_storeu_si256( ( __m256i * )(pDst + x), src0); | 257 | 54.2k | } | 258 | 18.0k | deltaPos += intraPredAngle; | 259 | 18.0k | } | 260 | 656 | } | 261 | 135 | else // width =8 | 262 | 135 | { | 263 | | // printf("AVX2 Block %d \n",width); | 264 | 135 | __m128i vbdmin,vbdmax; | 265 | | | 266 | 135 | if (useCubicFilter) | 267 | 122 | { | 268 | 122 | vbdmin = _mm_set1_epi16( clpRng.min() ); | 269 | 122 | vbdmax = _mm_set1_epi16( clpRng.max() ); 
| 270 | 122 | } | 271 | | | 272 | 2.31k | for (int y = 0; y<height; y++ ) | 273 | 2.17k | { | 274 | 2.17k | int deltaInt = deltaPos >> 5; | 275 | 2.17k | int deltaFract = deltaPos & (32 - 1); | 276 | 2.17k | int refMainIndex = deltaInt + 1; | 277 | 2.17k | pDst=&pDstBuf[y*dstStride]; | 278 | 2.17k | __m128i tmp = _mm_loadu_si64( ( __m128i const * )&ff[deltaFract<<2] ); //load 4 16 bit filter coeffs | 279 | 2.17k | tmp = _mm_shuffle_epi32(tmp,0x44); | 280 | 2.17k | __m256i coeff = _mm256_broadcastsi128_si256(tmp); | 281 | 2.17k | __m256i src1 = _mm256_broadcastsi128_si256( _mm_loadu_si128( ( __m128i const* ) & refMain[refMainIndex - 1] ) ); | 282 | 2.17k | __m256i src2 = _mm256_broadcastsi128_si256( _mm_loadu_si128( ( __m128i const* ) & refMain[refMainIndex + 4 - 1] ) ); | 283 | 2.17k | src1 = _mm256_shuffle_epi8(src1,shflmask1); // -1 0 1 2 0 1 2 3 1 2 3 4 2 3 4 5 | 284 | 2.17k | src2 = _mm256_shuffle_epi8(src2,shflmask1); // 3 4 5 6 4 5 6 7 5 6 7 8 6 7 8 9 | 285 | | | 286 | 2.17k | src1 = _mm256_madd_epi16 (src1, coeff); | 287 | 2.17k | src2 = _mm256_madd_epi16 (src2, coeff); | 288 | | | 289 | 2.17k | __m256i sum = _mm256_hadd_epi32( src1, src2 ); | 290 | 2.17k | sum = _mm256_permute4x64_epi64(sum,0xD8); | 291 | | | 292 | 2.17k | sum = _mm256_add_epi32( sum, offset ); | 293 | 2.17k | sum = _mm256_srai_epi32( sum, 6 ); | 294 | 2.17k | __m128i dest128 = _mm256_cvtepi32_epi16x( sum ); | 295 | | | 296 | 2.17k | if (useCubicFilter) | 297 | 1.83k | dest128 = _mm_min_epi16( vbdmax, _mm_max_epi16( vbdmin, dest128 ) ); | 298 | | | 299 | 2.17k | _mm_storeu_si128( ( __m128i * )(pDst), dest128); | 300 | 2.17k | deltaPos += intraPredAngle; | 301 | 2.17k | } | 302 | 135 | } | 303 | 791 | #endif | 304 | 791 | } | 305 | 0 | else | 306 | 0 | { | 307 | 0 | __m128i shflmask1= _mm_set_epi8( 0x9, 0x8, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0 ); | 308 | 0 | __m128i shflmask2= _mm_set_epi8( 0xd, 0xc, 0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 
0x5, 0x4 ); | 309 | 0 | __m128i vbdmin,vbdmax; | 310 | |
// 128-bit fallback branch (the else of vext >= AVX2): 32 is the rounding term added before the >> 6 below.
| 311 | 0 | __m128i offset = _mm_set1_epi32( 32 ); | 312 | |
// Clamp bounds are only set up here, and only read further down, when useCubicFilter is true.
| 313 | 0 | if (useCubicFilter) | 314 | 0 | { | 315 | 0 | vbdmin = _mm_set1_epi16( clpRng.min() ); | 316 | 0 | vbdmax = _mm_set1_epi16( clpRng.max() ); | 317 | 0 | } | 318 | |
// One pass per output row, 8 output samples per inner step (two groups of 4, refMainIndex += 4 each).
// All counts are 0 because this instantiation always took the AVX2 branch above.
| 319 | 0 | for (int y = 0; y<height; y++ ) | 320 | 0 | { | 321 | 0 | int deltaInt = deltaPos >> 5; | 322 | 0 | int deltaFract = deltaPos & (32 - 1); | 323 | 0 | int refMainIndex = deltaInt + 1; | 324 | 0 | pDst=&pDstBuf[y*dstStride]; | 325 | 0 | __m128i coeff = _mm_loadu_si64( ( __m128i const * )&ff[deltaFract<<2] ); //load 4 16 bit filter coeffs | 326 | 0 | coeff = _mm_shuffle_epi32(coeff,0x44); | 327 | 0 | for( int x = 0; x < width; x+=8) | 328 | 0 | { | 329 | 0 | __m128i src0 = _mm_loadu_si128( ( __m128i const * )&refMain[refMainIndex - 1] ); //load 8 16 bit reference Pels -1 0 1 2 3 4 5 6 | 330 | 0 | __m128i src1 = _mm_shuffle_epi8(src0,shflmask1); // -1 0 1 2 0 1 2 3 | 331 | 0 | __m128i src2 = _mm_shuffle_epi8(src0,shflmask2); // 1 2 3 4 2 3 4 5 | 332 | 0 | src0 = _mm_madd_epi16( coeff,src1 ); | 333 | 0 | src1 = _mm_madd_epi16( coeff,src2 ); | 334 | 0 | __m128i sum = _mm_hadd_epi32( src0, src1 ); | 335 | 0 | sum = _mm_add_epi32( sum, offset ); | 336 | 0 | sum = _mm_srai_epi32( sum, 6 ); | 337 | |
| 338 | 0 | refMainIndex+=4; | 339 | 0 | src0 = _mm_loadu_si128( ( __m128i const * )&refMain[refMainIndex - 1] ); //load 8 16 bit reference Pels -1 0 1 2 3 4 5 6 | 340 | 0 | src1 = _mm_shuffle_epi8(src0,shflmask1); // -1 0 1 2 0 1 2 3 | 341 | 0 | src2 = _mm_shuffle_epi8(src0,shflmask2); | 342 | | | 343 | | // 1 2 3 4 2 3 4 5 | 344 | 0 | src0 = _mm_madd_epi16( coeff,src1 ); | 345 | 0 | src1 = _mm_madd_epi16( coeff,src2 ); | 346 | |
| 347 | 0 | __m128i sum1 = _mm_hadd_epi32( src0, src1 ); | 348 | 0 | sum1 = _mm_add_epi32( sum1, offset ); | 349 | 0 | sum1 = _mm_srai_epi32( sum1, 6 ); | 350 | 0 | src0 = _mm_packs_epi32( sum, sum1 ); | 351 | |
| 352 | 0 | refMainIndex+=4; | 353 | |
| 354 | 0 | if (useCubicFilter) | 355 | 0 | src0 = _mm_min_epi16( vbdmax, _mm_max_epi16( vbdmin, src0 ) ); | 356 | |
| 357 | 0 | _mm_storeu_si128( ( __m128i * )(pDst + x), src0); | 358 | |
| 359 | 0 | } | 360 | 0 | deltaPos += intraPredAngle; | 361 | 0 | } | 362 | 0 | } | 363 | 791 | } | 364 | 0 | else if( W == 4 ) | 365 | 0 | { | 366 | |
// W == 4 branch: same filter arithmetic, four output samples per row. Count 0 for this W == 8 instantiation.
| 367 | 0 | __m128i shflmask1= _mm_set_epi8( 0x9, 0x8, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0 ); | 368 | 0 | __m128i shflmask2= _mm_set_epi8( 0xd, 0xc, 0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4 ); | 369 | 0 | __m128i vbdmin,vbdmax; | 370 | |
| 371 | 0 | __m128i offset = _mm_set1_epi32( 32 ); | 372 | |
| 373 | 0 | if (useCubicFilter) | 374 | 0 | { | 375 | 0 | vbdmin = _mm_set1_epi16( clpRng.min() ); | 376 | 0 | vbdmax = _mm_set1_epi16( clpRng.max() ); | 377 | 0 | } | 378 | |
| 379 | 0 | for (int y = 0; y<height; y++ ) | 380 | 0 | { | 381 | 0 | int deltaInt = deltaPos >> 5; | 382 | 0 | int deltaFract = deltaPos & (32 - 1); | 383 | 0 | int refMainIndex = deltaInt + 1; | 384 | 0 | pDst=&pDstBuf[y*dstStride]; | 385 | 0 | __m128i coeff = _mm_loadu_si64( ( __m128i const * )&ff[deltaFract<<2] ); //load 4 16 bit filter coeffs | 386 | 0 | coeff = _mm_shuffle_epi32(coeff,0x44); | 387 | 0 | { | 388 | 0 | __m128i src0 = _mm_loadu_si128( ( __m128i const * )&refMain[refMainIndex - 1] ); //load 8 16 bit reference Pels -1 0 1 2 3 4 5 6 | 389 | 0 | __m128i src1 = _mm_shuffle_epi8(src0,shflmask1); // -1 0 1 2 0 1 2 3 | 390 | 0 | __m128i src2 = _mm_shuffle_epi8(src0,shflmask2); // 1 2 3 4 2 3 4 5 | 391 | 0 | src0 = _mm_madd_epi16( coeff,src1 ); | 392 | 0 | src1 = _mm_madd_epi16( coeff,src2 ); | 393 | 0 | __m128i sum = _mm_hadd_epi32( src0, src1 ); | 394 | 0 | sum = _mm_add_epi32( sum, offset ); | 395 | 0 | sum = _mm_srai_epi32( sum, 6 ); | 396 | |
// sum is packed with itself; only the low 64 bits (four 16-bit samples) are written by the store below.
| 397 | 0 | src0 = _mm_packs_epi32( sum, sum ); | 398 | |
| 399 | 0 | refMainIndex+=4; | 400 | |
| 401 | 0 | if (useCubicFilter) | 402 | 0 | src0 = _mm_min_epi16( vbdmax, _mm_max_epi16( vbdmin, src0 ) ); | 403 | |
| 404 | 0 | _mm_storeu_si64( ( __m128i * )(pDst ), src0); | 405 | |
| 406 | 0 | } | 407 | 0 | deltaPos += intraPredAngle; | 408 | 0 | } | 409 | 0 | } | 410 | 0 | else | 411 | 0 | { | 412 | 0 | THROW_FATAL( "Unsupported size in IntraPredAngleCore_SIMD" ); | 413 | 0 | } | 414 | 791 | #if USE_AVX2 | 415 | 791 | _mm256_zeroupper(); | 416 | 791 | #endif | 417 | 791 | } |
|
418 | | |
419 | | template< X86_VEXT vext, int W > |
420 | | void IntraPredSampleFilter_SIMD(Pel *ptrSrc,const ptrdiff_t srcStride,PelBuf &piPred,const uint32_t uiDirMode,const ClpRng& clpRng) |
421 | 3.97k | { |
422 | 3.97k | const int iWidth = piPred.width; |
423 | 3.97k | const int iHeight = piPred.height; |
424 | 3.97k | PelBuf dstBuf = piPred; |
425 | 3.97k | Pel* pDst = dstBuf.buf; |
426 | 3.97k | const ptrdiff_t dstStride = dstBuf.stride; |
427 | | |
428 | 3.97k | const int scale = ((getLog2(iWidth) - 2 + getLog2(iHeight) - 2 + 2) >> 2); |
429 | 3.97k | CHECK(scale < 0 || scale > 31, "PDPC: scale < 0 || scale > 2"); |
430 | | |
431 | | #if USE_AVX2 |
432 | 3.97k | if( W > 8 ) |
433 | 2.71k | { |
434 | 2.71k | __m256i tmplo,tmphi; |
435 | 2.71k | __m256i w64 = _mm256_set_epi16(64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64); |
436 | 2.71k | __m256i w32 = _mm256_set_epi32(32,32,32,32,32,32,32,32); |
437 | 2.71k | __m256i vbdmin = _mm256_set1_epi32( clpRng.min() ); |
438 | 2.71k | __m256i vbdmax = _mm256_set1_epi32( clpRng.max() ); |
439 | 2.71k | __m256i wl16; |
440 | 2.71k | __m256i wl16start; |
441 | | |
442 | 2.71k | if (scale==0) |
443 | 0 | { |
444 | 0 | wl16start = _mm256_set_epi16(0,0,0,0,0,0,0,0,0,0,0,0,0,2,8,32); |
445 | 0 | } |
446 | 2.71k | else if (scale==1) |
447 | 1.70k | { |
448 | 1.70k | wl16start = _mm256_set_epi16(0,0,0,0,0,0,0,0,0,0,1,2,4,8,16,32); |
449 | 1.70k | } |
450 | 1.00k | else if (scale==2) |
451 | 1.00k | { |
452 | 1.00k | wl16start = _mm256_set_epi16(0,0,0,0,1,1,2,2,4,4,8,8,16,16,32,32); |
453 | 1.00k | } |
454 | 0 | else |
455 | 0 | { |
456 | 0 | THROW_FATAL( "Wrong scale (" << scale << ")" ); |
457 | 0 | } |
458 | | |
459 | | |
460 | 2.71k | if (uiDirMode == PLANAR_IDX || uiDirMode == DC_IDX ) |
461 | 2.71k | { |
462 | 67.1k | for (int y = 0; y < iHeight; y++) |
463 | 64.4k | { |
464 | 64.4k | int wT = 32 >> std::min(31, ((y << 1) >> scale)); |
465 | | |
466 | 64.4k | __m256i wt16 = _mm256_set_epi16(wT,wT,wT,wT,wT,wT,wT,wT,wT,wT,wT,wT,wT,wT,wT,wT); |
467 | 64.4k | __m256i x16left = _mm256_broadcastw_epi16(_mm_loadu_si128 ((__m128i const *) (ptrSrc+((y+1)*srcStride)))); |
468 | | |
469 | 64.4k | if (wT) |
470 | 21.8k | { |
471 | 65.8k | for (int x = 0; x < iWidth; x+=16) |
472 | 43.9k | { |
473 | 43.9k | if (x==0) |
474 | 21.8k | { |
475 | 21.8k | wl16=wl16start; |
476 | | |
477 | 21.8k | __m256i x16top = _mm256_loadu_si256((__m256i *) (ptrSrc+x+1)); // load top |
478 | 21.8k | __m256i x16dst = _mm256_loadu_si256((const __m256i *) (pDst+y*dstStride+x)); // load dst |
479 | | |
480 | 21.8k | tmplo = _mm256_mullo_epi16(x16left,wl16); //wL * left |
481 | 21.8k | tmphi = _mm256_mulhi_epi16(x16left,wl16); //wL * left |
482 | 21.8k | __m256i leftlo = _mm256_unpacklo_epi16(tmplo,tmphi); |
483 | 21.8k | __m256i lefthi = _mm256_unpackhi_epi16(tmplo,tmphi); |
484 | | |
485 | 21.8k | tmplo = _mm256_mullo_epi16(x16top,wt16); // wT*top |
486 | 21.8k | tmphi = _mm256_mulhi_epi16(x16top,wt16); // wT*top |
487 | 21.8k | __m256i toplo = _mm256_unpacklo_epi16(tmplo,tmphi); |
488 | 21.8k | __m256i tophi = _mm256_unpackhi_epi16(tmplo,tmphi); |
489 | | |
490 | 21.8k | __m256i wX = _mm256_sub_epi16(w64,wl16); |
491 | 21.8k | wX = _mm256_sub_epi16(wX,wt16); // 64-wL-wT |
492 | 21.8k | tmplo = _mm256_mullo_epi16(x16dst,wX); // 64-wL-wT*dst |
493 | 21.8k | tmphi = _mm256_mulhi_epi16(x16dst,wX); // 64-wL-wT*dst |
494 | 21.8k | __m256i dstlo = _mm256_unpacklo_epi16(tmplo,tmphi); |
495 | 21.8k | __m256i dsthi = _mm256_unpackhi_epi16(tmplo,tmphi); |
496 | | |
497 | 21.8k | dstlo = _mm256_add_epi32(dstlo,toplo); |
498 | 21.8k | dsthi = _mm256_add_epi32(dsthi,tophi); |
499 | 21.8k | dstlo = _mm256_add_epi32(dstlo,leftlo); |
500 | 21.8k | dsthi = _mm256_add_epi32(dsthi,lefthi); |
501 | 21.8k | dstlo = _mm256_add_epi32(dstlo,w32); |
502 | 21.8k | dsthi = _mm256_add_epi32(dsthi,w32); |
503 | | |
504 | 21.8k | dstlo = _mm256_srai_epi32(dstlo,6); |
505 | 21.8k | dsthi = _mm256_srai_epi32(dsthi,6); |
506 | | |
507 | 21.8k | dstlo = _mm256_max_epi32(vbdmin,dstlo); |
508 | 21.8k | dsthi = _mm256_max_epi32(vbdmin,dsthi); |
509 | 21.8k | dstlo = _mm256_min_epi32(vbdmax,dstlo); |
510 | 21.8k | dsthi = _mm256_min_epi32(vbdmax,dsthi); |
511 | | |
512 | 21.8k | dstlo = _mm256_packs_epi32(dstlo,dsthi); |
513 | 21.8k | dstlo = _mm256_permute4x64_epi64 ( dstlo, ( 0 << 0 ) + ( 1 << 2 ) + ( 2 << 4 ) + ( 3 << 6 ) ); |
514 | | |
515 | 21.8k | _mm256_storeu_si256( ( __m256i * )(pDst+y*dstStride+x), dstlo ); |
516 | 21.8k | } |
517 | 22.1k | else |
518 | 22.1k | { |
519 | | |
520 | 22.1k | __m256i x16top = _mm256_loadu_si256((__m256i *) (ptrSrc+x+1)); // load top |
521 | 22.1k | __m256i x16dst = _mm256_loadu_si256((const __m256i *) (pDst+y*dstStride+x)); // load dst |
522 | | |
523 | | |
524 | 22.1k | tmplo = _mm256_mullo_epi16(x16top,wt16); // wT*top |
525 | 22.1k | tmphi = _mm256_mulhi_epi16(x16top,wt16); // wT*top |
526 | 22.1k | __m256i toplo = _mm256_unpacklo_epi16(tmplo,tmphi); |
527 | 22.1k | __m256i tophi = _mm256_unpackhi_epi16(tmplo,tmphi); |
528 | | |
529 | 22.1k | __m256i wX = _mm256_sub_epi16(w64,wt16); |
530 | 22.1k | tmplo = _mm256_mullo_epi16(x16dst,wX); // 64-wL-wT*dst |
531 | 22.1k | tmphi = _mm256_mulhi_epi16(x16dst,wX); // 64-wL-wT*dst |
532 | 22.1k | __m256i dstlo = _mm256_unpacklo_epi16(tmplo,tmphi); |
533 | 22.1k | __m256i dsthi = _mm256_unpackhi_epi16(tmplo,tmphi); |
534 | | |
535 | 22.1k | dstlo = _mm256_add_epi32(dstlo,toplo); |
536 | 22.1k | dsthi = _mm256_add_epi32(dsthi,tophi); |
537 | 22.1k | dstlo = _mm256_add_epi32(dstlo,w32); |
538 | 22.1k | dsthi = _mm256_add_epi32(dsthi,w32); |
539 | | |
540 | 22.1k | dstlo = _mm256_srai_epi32(dstlo,6); |
541 | 22.1k | dsthi = _mm256_srai_epi32(dsthi,6); |
542 | | |
543 | 22.1k | dstlo = _mm256_max_epi32(vbdmin,dstlo); |
544 | 22.1k | dsthi = _mm256_max_epi32(vbdmin,dsthi); |
545 | 22.1k | dstlo = _mm256_min_epi32(vbdmax,dstlo); |
546 | 22.1k | dsthi = _mm256_min_epi32(vbdmax,dsthi); |
547 | | |
548 | 22.1k | dstlo = _mm256_packs_epi32(dstlo,dsthi); |
549 | 22.1k | dstlo = _mm256_permute4x64_epi64 ( dstlo, ( 0 << 0 ) + ( 1 << 2 ) + ( 2 << 4 ) + ( 3 << 6 ) ); |
550 | | |
551 | 22.1k | _mm256_storeu_si256( ( __m256i * )(pDst+y*dstStride+x), dstlo ); |
552 | 22.1k | } |
553 | | |
554 | 43.9k | } // for |
555 | 21.8k | } |
556 | 42.5k | else |
557 | 42.5k | { // wT =0 |
558 | | |
559 | 42.5k | wl16=wl16start; |
560 | 42.5k | __m256i x16dst = _mm256_loadu_si256((const __m256i *) (pDst+y*dstStride)); // load dst |
561 | | |
562 | 42.5k | tmplo = _mm256_mullo_epi16(x16left,wl16); //wL * left |
563 | 42.5k | tmphi = _mm256_mulhi_epi16(x16left,wl16); //wL * left |
564 | 42.5k | __m256i leftlo = _mm256_unpacklo_epi16(tmplo,tmphi); |
565 | 42.5k | __m256i lefthi = _mm256_unpackhi_epi16(tmplo,tmphi); |
566 | | |
567 | 42.5k | __m256i wX = _mm256_sub_epi16(w64,wl16); |
568 | 42.5k | tmplo = _mm256_mullo_epi16(x16dst,wX); // 64-wL-wT*dst |
569 | 42.5k | tmphi = _mm256_mulhi_epi16(x16dst,wX); // 64-wL-wT*dst |
570 | 42.5k | __m256i dstlo = _mm256_unpacklo_epi16(tmplo,tmphi); |
571 | 42.5k | __m256i dsthi = _mm256_unpackhi_epi16(tmplo,tmphi); |
572 | | |
573 | 42.5k | dstlo = _mm256_add_epi32(dstlo,leftlo); |
574 | 42.5k | dsthi = _mm256_add_epi32(dsthi,lefthi); |
575 | 42.5k | dstlo = _mm256_add_epi32(dstlo,w32); |
576 | 42.5k | dsthi = _mm256_add_epi32(dsthi,w32); |
577 | | |
578 | 42.5k | dstlo = _mm256_srai_epi32(dstlo,6); |
579 | 42.5k | dsthi = _mm256_srai_epi32(dsthi,6); |
580 | | |
581 | 42.5k | dstlo = _mm256_max_epi32(vbdmin,dstlo); |
582 | 42.5k | dsthi = _mm256_max_epi32(vbdmin,dsthi); |
583 | 42.5k | dstlo = _mm256_min_epi32(vbdmax,dstlo); |
584 | 42.5k | dsthi = _mm256_min_epi32(vbdmax,dsthi); |
585 | | |
586 | 42.5k | dstlo = _mm256_packs_epi32(dstlo,dsthi); |
587 | 42.5k | dstlo = _mm256_permute4x64_epi64 ( dstlo, ( 0 << 0 ) + ( 1 << 2 ) + ( 2 << 4 ) + ( 3 << 6 ) ); |
588 | | |
589 | 42.5k | _mm256_storeu_si256( ( __m256i * )(pDst+y*dstStride), dstlo ); |
590 | 42.5k | } |
591 | 64.4k | } |
592 | 2.71k | } |
593 | 2.71k | } |
594 | 1.25k | else |
595 | 1.25k | #endif |
596 | 1.25k | { |
597 | 1.25k | __m128i tmplo8,tmphi8; |
598 | 1.25k | __m128i w64_8 = _mm_set_epi16(64,64,64,64,64,64,64,64); |
599 | 1.25k | __m128i w32_8 = _mm_set_epi32(32,32,32,32); |
600 | 1.25k | __m128i vbdmin8 = _mm_set1_epi32( clpRng.min() ); |
601 | 1.25k | __m128i vbdmax8 = _mm_set1_epi32( clpRng.max() ); |
602 | 1.25k | __m128i wl8start,wl8start2; |
603 | 1.25k | CHECK(scale < 0 || scale > 2, "PDPC: scale < 0 || scale > 2"); |
604 | | |
605 | 1.25k | if (scale==0) |
606 | 459 | { |
607 | 459 | wl8start = _mm_set_epi16(0,0,0,0,0,2,8,32); |
608 | 459 | wl8start2 = _mm_set_epi16(0,0,0,0,0,0,0,0); |
609 | 459 | } |
610 | 800 | else if (scale==1) |
611 | 800 | { |
612 | 800 | wl8start = _mm_set_epi16(0,0,1,2,4,8,16,32); |
613 | 800 | wl8start2 = _mm_set_epi16(0,0,0,0,0,0,0,0); |
614 | 800 | } |
615 | 0 | else if (scale==2) |
616 | 0 | { |
617 | 0 | wl8start = _mm_set_epi16(4,4,8,8,16,16,32,32); |
618 | 0 | wl8start2 = _mm_set_epi16(0,0,0,0,1,1,2,2); |
619 | 0 | } |
620 | 1.25k | if (uiDirMode == PLANAR_IDX || uiDirMode == DC_IDX ) |
621 | 1.25k | { |
622 | 1.25k | __m128i wl8 = wl8start; |
623 | 15.8k | for (int y = 0; y < iHeight; y++) |
624 | 14.5k | { |
625 | 14.5k | int wT = 32 >> std::min(31, ((y << 1) >> scale)); |
626 | | |
627 | 14.5k | __m128i wt8 = _mm_set_epi16(wT,wT,wT,wT,wT,wT,wT,wT); |
628 | | // __m128i x8left = _mm_broadcastw_epi16(_mm_loadu_si128 ((__m128i const *) (ptrSrc+((y+1)*srcStride)))); |
629 | | |
630 | 14.5k | __m128i x8left = _mm_loadu_si128 ((__m128i const *) (ptrSrc+((y+1)*srcStride))); |
631 | 14.5k | x8left =_mm_shufflelo_epi16(x8left,0); |
632 | 14.5k | x8left =_mm_shuffle_epi32(x8left,0); |
633 | | |
634 | | |
635 | 14.5k | if (wT) |
636 | 6.17k | { |
637 | 12.3k | for (int x = 0; x < iWidth; x+=8) |
638 | 6.17k | { |
639 | 6.17k | __m128i x8top = _mm_loadu_si128( (__m128i*) ( ptrSrc + x + 1 ) ); // load top |
640 | 6.17k | __m128i x8dst = _mm_setzero_si128(); |
641 | 6.17k | if( iWidth >= 8 ) |
642 | 4.21k | x8dst = _mm_loadu_si128( (const __m128i*) ( pDst + y * dstStride + x ) ); // load dst |
643 | 1.96k | else if( iWidth == 4 ) |
644 | 1.96k | x8dst = _mm_loadu_si64( (const __m128i*) ( pDst + y * dstStride + x ) ); // load dst |
645 | 0 | else if( iWidth == 2 ) |
646 | 0 | x8dst = _mm_loadu_si32( (const __m128i*) ( pDst + y * dstStride + x ) ); // load dst |
647 | 0 | else |
648 | 0 | { |
649 | 0 | CHECKD( true, "wrong iWidth in IntraPredSampleFilter_SIMD, only implemented for >=8, ==4, ==2" ); |
650 | 0 | } |
651 | | |
652 | 6.17k | if (x>8) |
653 | 0 | { |
654 | 0 | tmplo8 = _mm_mullo_epi16(x8top,wt8); // wT*top |
655 | 0 | tmphi8 = _mm_mulhi_epi16(x8top,wt8); // wT*top |
656 | 0 | __m128i toplo8 = _mm_unpacklo_epi16(tmplo8,tmphi8); |
657 | 0 | __m128i tophi8 = _mm_unpackhi_epi16(tmplo8,tmphi8); |
658 | | |
659 | |
|
660 | 0 | __m128i wX = _mm_sub_epi16(w64_8,wt8); |
661 | 0 | tmplo8 = _mm_mullo_epi16(x8dst,wX); // 64-wL-wT*dst |
662 | 0 | tmphi8 = _mm_mulhi_epi16(x8dst,wX); // 64-wL-wT*dst |
663 | 0 | __m128i dstlo8 = _mm_unpacklo_epi16(tmplo8,tmphi8); |
664 | 0 | __m128i dsthi8 = _mm_unpackhi_epi16(tmplo8,tmphi8); |
665 | |
|
666 | 0 | dstlo8 = _mm_add_epi32(dstlo8,toplo8); |
667 | 0 | dsthi8 = _mm_add_epi32(dsthi8,tophi8); |
668 | 0 | dstlo8 = _mm_add_epi32(dstlo8,w32_8); |
669 | 0 | dsthi8 = _mm_add_epi32(dsthi8,w32_8); |
670 | |
|
671 | 0 | dstlo8 = _mm_srai_epi32(dstlo8,6); |
672 | 0 | dsthi8 = _mm_srai_epi32(dsthi8,6); |
673 | |
|
674 | 0 | dstlo8 = _mm_max_epi32(vbdmin8,dstlo8); |
675 | 0 | dsthi8 = _mm_max_epi32(vbdmin8,dsthi8); |
676 | 0 | dstlo8 = _mm_min_epi32(vbdmax8,dstlo8); |
677 | 0 | dsthi8 = _mm_min_epi32(vbdmax8,dsthi8); |
678 | |
|
679 | 0 | x8dst = _mm_packs_epi32(dstlo8,dsthi8); |
680 | 0 | } |
681 | 6.17k | else // x<=8 |
682 | 6.17k | { |
683 | 6.17k | if (x==0) |
684 | 6.17k | wl8=wl8start; |
685 | 0 | else if (x==8) |
686 | 0 | wl8=wl8start2; |
687 | | |
688 | 6.17k | tmplo8 = _mm_mullo_epi16(x8left,wl8); //wL * left |
689 | 6.17k | tmphi8 = _mm_mulhi_epi16(x8left,wl8); //wL * left |
690 | 6.17k | __m128i leftlo8 = _mm_unpacklo_epi16(tmplo8,tmphi8); |
691 | 6.17k | __m128i lefthi8 = _mm_unpackhi_epi16(tmplo8,tmphi8); |
692 | | |
693 | 6.17k | tmplo8 = _mm_mullo_epi16(x8top,wt8); // wT*top |
694 | 6.17k | tmphi8 = _mm_mulhi_epi16(x8top,wt8); // wT*top |
695 | 6.17k | __m128i toplo8 = _mm_unpacklo_epi16(tmplo8,tmphi8); |
696 | 6.17k | __m128i tophi8 = _mm_unpackhi_epi16(tmplo8,tmphi8); |
697 | | |
698 | 6.17k | __m128i wX = _mm_sub_epi16(w64_8,wl8); |
699 | 6.17k | wX = _mm_sub_epi16(wX,wt8); // 64-wL-wT |
700 | 6.17k | tmplo8 = _mm_mullo_epi16(x8dst,wX); // 64-wL-wT*dst |
701 | 6.17k | tmphi8 = _mm_mulhi_epi16(x8dst,wX); // 64-wL-wT*dst |
702 | 6.17k | __m128i dstlo8 = _mm_unpacklo_epi16(tmplo8,tmphi8); |
703 | 6.17k | __m128i dsthi8 = _mm_unpackhi_epi16(tmplo8,tmphi8); |
704 | | |
705 | 6.17k | dstlo8 = _mm_add_epi32(dstlo8,toplo8); |
706 | 6.17k | dsthi8 = _mm_add_epi32(dsthi8,tophi8); |
707 | 6.17k | dstlo8 = _mm_add_epi32(dstlo8,leftlo8); |
708 | 6.17k | dsthi8 = _mm_add_epi32(dsthi8,lefthi8); |
709 | 6.17k | dstlo8 = _mm_add_epi32(dstlo8,w32_8); |
710 | 6.17k | dsthi8 = _mm_add_epi32(dsthi8,w32_8); |
711 | | |
712 | 6.17k | dstlo8 = _mm_srai_epi32(dstlo8,6); |
713 | 6.17k | dsthi8 = _mm_srai_epi32(dsthi8,6); |
714 | | |
715 | 6.17k | dstlo8 = _mm_max_epi32(vbdmin8,dstlo8); |
716 | 6.17k | dsthi8 = _mm_max_epi32(vbdmin8,dsthi8); |
717 | 6.17k | dstlo8 = _mm_min_epi32(vbdmax8,dstlo8); |
718 | 6.17k | dsthi8 = _mm_min_epi32(vbdmax8,dsthi8); |
719 | | |
720 | 6.17k | x8dst = _mm_packs_epi32(dstlo8,dsthi8); |
721 | 6.17k | } |
722 | | |
723 | 6.17k | if( iWidth >= 8 ) |
724 | 4.21k | _mm_storeu_si128( (__m128i*) ( pDst + y * dstStride + x ), x8dst ); |
725 | 1.96k | else if( iWidth == 4 ) |
726 | 1.96k | _mm_storeu_si64( (__m128i*) ( pDst + y * dstStride + x ), x8dst ); |
727 | 0 | else if( iWidth == 2 ) |
728 | 0 | _mm_storeu_si32( (__m128i*) ( pDst + y * dstStride + x ), x8dst ); |
729 | 6.17k | } |
730 | 6.17k | } |
731 | 8.39k | else //wT =0 |
732 | 8.39k | { |
733 | 16.7k | for( int x = 0; x < std::min( iWidth, 16 ); x += 8 ) |
734 | 8.39k | { |
735 | 8.39k | if( x == 0 ) |
736 | 8.39k | wl8 = wl8start; |
737 | 0 | else if( x == 8 ) |
738 | 0 | wl8 = wl8start2; |
739 | | |
740 | 8.39k | __m128i x8dst; |
741 | 8.39k | if( iWidth >= 8 ) |
742 | 5.22k | x8dst = _mm_loadu_si128( (const __m128i*)( pDst + y * dstStride + x ) ); // load dst |
743 | 3.17k | else if( iWidth == 4 ) |
744 | 3.17k | x8dst = _mm_loadu_si64( (const __m128i*)( pDst + y * dstStride + x ) ); // load dst |
745 | 0 | else if( iWidth == 2 ) |
746 | 0 | x8dst = _mm_loadu_si32( (const __m128i*)( pDst + y * dstStride + x ) ); // load dst |
747 | 0 | else |
748 | 8.39k | CHECK( true, "wrong iWidth in IntraPredSampleFilter_SIMD, only implemented for >=8, ==4, ==2" ); |
749 | | |
750 | | |
751 | 8.39k | tmplo8 = _mm_mullo_epi16( x8left, wl8 ); // wL * left |
752 | 8.39k | tmphi8 = _mm_mulhi_epi16( x8left, wl8 ); // wL * left |
753 | 8.39k | __m128i leftlo8 = _mm_unpacklo_epi16( tmplo8, tmphi8 ); |
754 | 8.39k | __m128i lefthi8 = _mm_unpackhi_epi16( tmplo8, tmphi8 ); |
755 | | |
756 | 8.39k | __m128i wX = _mm_sub_epi16( w64_8, wl8 ); |
757 | 8.39k | tmplo8 = _mm_mullo_epi16( x8dst, wX ); // 64-wL-wT*dst |
758 | 8.39k | tmphi8 = _mm_mulhi_epi16( x8dst, wX ); // 64-wL-wT*dst |
759 | 8.39k | __m128i dstlo8 = _mm_unpacklo_epi16( tmplo8, tmphi8 ); |
760 | 8.39k | __m128i dsthi8 = _mm_unpackhi_epi16( tmplo8, tmphi8 ); |
761 | | |
762 | 8.39k | dstlo8 = _mm_add_epi32( dstlo8, leftlo8 ); |
763 | 8.39k | dsthi8 = _mm_add_epi32( dsthi8, lefthi8 ); |
764 | 8.39k | dstlo8 = _mm_add_epi32( dstlo8, w32_8 ); |
765 | 8.39k | dsthi8 = _mm_add_epi32( dsthi8, w32_8 ); |
766 | | |
767 | 8.39k | dstlo8 = _mm_srai_epi32( dstlo8, 6 ); |
768 | 8.39k | dsthi8 = _mm_srai_epi32( dsthi8, 6 ); |
769 | | |
770 | 8.39k | dstlo8 = _mm_max_epi32( vbdmin8, dstlo8 ); |
771 | 8.39k | dsthi8 = _mm_max_epi32( vbdmin8, dsthi8 ); |
772 | 8.39k | dstlo8 = _mm_min_epi32( vbdmax8, dstlo8 ); |
773 | 8.39k | dsthi8 = _mm_min_epi32( vbdmax8, dsthi8 ); |
774 | | |
775 | 8.39k | dstlo8 = _mm_packs_epi32( dstlo8, dsthi8 ); |
776 | | |
777 | 8.39k | if( iWidth >= 8 ) |
778 | 5.22k | _mm_storeu_si128( (__m128i*)( pDst + y * dstStride + x ), dstlo8 ); |
779 | 3.17k | else if( iWidth == 4 ) |
780 | 3.17k | _mm_storeu_si64( (__m128i*)( pDst + y * dstStride + x ), ( dstlo8 ) ); |
781 | 0 | else if( iWidth == 2 ) |
782 | 0 | _mm_storeu_si32( (__m128i*)( pDst + y * dstStride + x ), dstlo8 ); |
783 | 8.39k | } |
784 | 8.39k | } |
785 | 14.5k | } |
786 | 1.25k | } |
787 | 1.25k | } |
788 | | |
789 | | |
790 | | #if USE_AVX2 |
791 | 3.97k | _mm256_zeroupper(); |
792 | 3.97k | #endif |
793 | 3.97k | } Unexecuted instantiation: void vvdec::IntraPredSampleFilter_SIMD<(vvdec::x86_simd::X86_VEXT)1, 8>(short*, long, vvdec::AreaBuf<short>&, unsigned int, vvdec::ClpRngTemplate<short> const&) Unexecuted instantiation: void vvdec::IntraPredSampleFilter_SIMD<(vvdec::x86_simd::X86_VEXT)1, 16>(short*, long, vvdec::AreaBuf<short>&, unsigned int, vvdec::ClpRngTemplate<short> const&) void vvdec::IntraPredSampleFilter_SIMD<(vvdec::x86_simd::X86_VEXT)4, 8>(short*, long, vvdec::AreaBuf<short>&, unsigned int, vvdec::ClpRngTemplate<short> const&) Line | Count | Source | 421 | 1.25k | { | 422 | 1.25k | const int iWidth = piPred.width; | 423 | 1.25k | const int iHeight = piPred.height; | 424 | 1.25k | PelBuf dstBuf = piPred; | 425 | 1.25k | Pel* pDst = dstBuf.buf; | 426 | 1.25k | const ptrdiff_t dstStride = dstBuf.stride; | 427 | | | 428 | 1.25k | const int scale = ((getLog2(iWidth) - 2 + getLog2(iHeight) - 2 + 2) >> 2); | 429 | 1.25k | CHECK(scale < 0 || scale > 31, "PDPC: scale < 0 || scale > 2"); | 430 | | | 431 | 1.25k | #if USE_AVX2 | 432 | 1.25k | if( W > 8 ) | 433 | 0 | { | 434 | 0 | __m256i tmplo,tmphi; | 435 | 0 | __m256i w64 = _mm256_set_epi16(64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64); | 436 | 0 | __m256i w32 = _mm256_set_epi32(32,32,32,32,32,32,32,32); | 437 | 0 | __m256i vbdmin = _mm256_set1_epi32( clpRng.min() ); | 438 | 0 | __m256i vbdmax = _mm256_set1_epi32( clpRng.max() ); | 439 | 0 | __m256i wl16; | 440 | 0 | __m256i wl16start; | 441 | |
| 442 | 0 | if (scale==0) | 443 | 0 | { | 444 | 0 | wl16start = _mm256_set_epi16(0,0,0,0,0,0,0,0,0,0,0,0,0,2,8,32); | 445 | 0 | } | 446 | 0 | else if (scale==1) | 447 | 0 | { | 448 | 0 | wl16start = _mm256_set_epi16(0,0,0,0,0,0,0,0,0,0,1,2,4,8,16,32); | 449 | 0 | } | 450 | 0 | else if (scale==2) | 451 | 0 | { | 452 | 0 | wl16start = _mm256_set_epi16(0,0,0,0,1,1,2,2,4,4,8,8,16,16,32,32); | 453 | 0 | } | 454 | 0 | else | 455 | 0 | { | 456 | 0 | THROW_FATAL( "Wrong scale (" << scale << ")" ); | 457 | 0 | } | 458 | | | 459 | | | 460 | 0 | if (uiDirMode == PLANAR_IDX || uiDirMode == DC_IDX ) | 461 | 0 | { | 462 | 0 | for (int y = 0; y < iHeight; y++) | 463 | 0 | { | 464 | 0 | int wT = 32 >> std::min(31, ((y << 1) >> scale)); | 465 | |
| 466 | 0 | __m256i wt16 = _mm256_set_epi16(wT,wT,wT,wT,wT,wT,wT,wT,wT,wT,wT,wT,wT,wT,wT,wT); | 467 | 0 | __m256i x16left = _mm256_broadcastw_epi16(_mm_loadu_si128 ((__m128i const *) (ptrSrc+((y+1)*srcStride)))); | 468 | |
| 469 | 0 | if (wT) | 470 | 0 | { | 471 | 0 | for (int x = 0; x < iWidth; x+=16) | 472 | 0 | { | 473 | 0 | if (x==0) | 474 | 0 | { | 475 | 0 | wl16=wl16start; | 476 | |
| 477 | 0 | __m256i x16top = _mm256_loadu_si256((__m256i *) (ptrSrc+x+1)); // load top | 478 | 0 | __m256i x16dst = _mm256_loadu_si256((const __m256i *) (pDst+y*dstStride+x)); // load dst | 479 | |
| 480 | 0 | tmplo = _mm256_mullo_epi16(x16left,wl16); //wL * left | 481 | 0 | tmphi = _mm256_mulhi_epi16(x16left,wl16); //wL * left | 482 | 0 | __m256i leftlo = _mm256_unpacklo_epi16(tmplo,tmphi); | 483 | 0 | __m256i lefthi = _mm256_unpackhi_epi16(tmplo,tmphi); | 484 | |
| 485 | 0 | tmplo = _mm256_mullo_epi16(x16top,wt16); // wT*top | 486 | 0 | tmphi = _mm256_mulhi_epi16(x16top,wt16); // wT*top | 487 | 0 | __m256i toplo = _mm256_unpacklo_epi16(tmplo,tmphi); | 488 | 0 | __m256i tophi = _mm256_unpackhi_epi16(tmplo,tmphi); | 489 | |
| 490 | 0 | __m256i wX = _mm256_sub_epi16(w64,wl16); | 491 | 0 | wX = _mm256_sub_epi16(wX,wt16); // 64-wL-wT | 492 | 0 | tmplo = _mm256_mullo_epi16(x16dst,wX); // 64-wL-wT*dst | 493 | 0 | tmphi = _mm256_mulhi_epi16(x16dst,wX); // 64-wL-wT*dst | 494 | 0 | __m256i dstlo = _mm256_unpacklo_epi16(tmplo,tmphi); | 495 | 0 | __m256i dsthi = _mm256_unpackhi_epi16(tmplo,tmphi); | 496 | |
| 497 | 0 | dstlo = _mm256_add_epi32(dstlo,toplo); | 498 | 0 | dsthi = _mm256_add_epi32(dsthi,tophi); | 499 | 0 | dstlo = _mm256_add_epi32(dstlo,leftlo); | 500 | 0 | dsthi = _mm256_add_epi32(dsthi,lefthi); | 501 | 0 | dstlo = _mm256_add_epi32(dstlo,w32); | 502 | 0 | dsthi = _mm256_add_epi32(dsthi,w32); | 503 | |
| 504 | 0 | dstlo = _mm256_srai_epi32(dstlo,6); | 505 | 0 | dsthi = _mm256_srai_epi32(dsthi,6); | 506 | |
| 507 | 0 | dstlo = _mm256_max_epi32(vbdmin,dstlo); | 508 | 0 | dsthi = _mm256_max_epi32(vbdmin,dsthi); | 509 | 0 | dstlo = _mm256_min_epi32(vbdmax,dstlo); | 510 | 0 | dsthi = _mm256_min_epi32(vbdmax,dsthi); | 511 | |
| 512 | 0 | dstlo = _mm256_packs_epi32(dstlo,dsthi); | 513 | 0 | dstlo = _mm256_permute4x64_epi64 ( dstlo, ( 0 << 0 ) + ( 1 << 2 ) + ( 2 << 4 ) + ( 3 << 6 ) ); | 514 | |
| 515 | 0 | _mm256_storeu_si256( ( __m256i * )(pDst+y*dstStride+x), dstlo ); | 516 | 0 | } | 517 | 0 | else | 518 | 0 | { | 519 | |
| 520 | 0 | __m256i x16top = _mm256_loadu_si256((__m256i *) (ptrSrc+x+1)); // load top | 521 | 0 | __m256i x16dst = _mm256_loadu_si256((const __m256i *) (pDst+y*dstStride+x)); // load dst | 522 | | | 523 | |
| 524 | 0 | tmplo = _mm256_mullo_epi16(x16top,wt16); // wT*top | 525 | 0 | tmphi = _mm256_mulhi_epi16(x16top,wt16); // wT*top | 526 | 0 | __m256i toplo = _mm256_unpacklo_epi16(tmplo,tmphi); | 527 | 0 | __m256i tophi = _mm256_unpackhi_epi16(tmplo,tmphi); | 528 | |
| 529 | 0 | __m256i wX = _mm256_sub_epi16(w64,wt16); | 530 | 0 | tmplo = _mm256_mullo_epi16(x16dst,wX); // 64-wL-wT*dst | 531 | 0 | tmphi = _mm256_mulhi_epi16(x16dst,wX); // 64-wL-wT*dst | 532 | 0 | __m256i dstlo = _mm256_unpacklo_epi16(tmplo,tmphi); | 533 | 0 | __m256i dsthi = _mm256_unpackhi_epi16(tmplo,tmphi); | 534 | |
| 535 | 0 | dstlo = _mm256_add_epi32(dstlo,toplo); | 536 | 0 | dsthi = _mm256_add_epi32(dsthi,tophi); | 537 | 0 | dstlo = _mm256_add_epi32(dstlo,w32); | 538 | 0 | dsthi = _mm256_add_epi32(dsthi,w32); | 539 | |
| 540 | 0 | dstlo = _mm256_srai_epi32(dstlo,6); | 541 | 0 | dsthi = _mm256_srai_epi32(dsthi,6); | 542 | |
| 543 | 0 | dstlo = _mm256_max_epi32(vbdmin,dstlo); | 544 | 0 | dsthi = _mm256_max_epi32(vbdmin,dsthi); | 545 | 0 | dstlo = _mm256_min_epi32(vbdmax,dstlo); | 546 | 0 | dsthi = _mm256_min_epi32(vbdmax,dsthi); | 547 | |
| 548 | 0 | dstlo = _mm256_packs_epi32(dstlo,dsthi); | 549 | 0 | dstlo = _mm256_permute4x64_epi64 ( dstlo, ( 0 << 0 ) + ( 1 << 2 ) + ( 2 << 4 ) + ( 3 << 6 ) ); | 550 | |
| 551 | 0 | _mm256_storeu_si256( ( __m256i * )(pDst+y*dstStride+x), dstlo ); | 552 | 0 | } | 553 | |
| 554 | 0 | } // for | 555 | 0 | } | 556 | 0 | else | 557 | 0 | { // wT =0 | 558 | |
| 559 | 0 | wl16=wl16start; | 560 | 0 | __m256i x16dst = _mm256_loadu_si256((const __m256i *) (pDst+y*dstStride)); // load dst | 561 | |
| 562 | 0 | tmplo = _mm256_mullo_epi16(x16left,wl16); //wL * left | 563 | 0 | tmphi = _mm256_mulhi_epi16(x16left,wl16); //wL * left | 564 | 0 | __m256i leftlo = _mm256_unpacklo_epi16(tmplo,tmphi); | 565 | 0 | __m256i lefthi = _mm256_unpackhi_epi16(tmplo,tmphi); | 566 | |
| 567 | 0 | __m256i wX = _mm256_sub_epi16(w64,wl16); | 568 | 0 | tmplo = _mm256_mullo_epi16(x16dst,wX); // 64-wL-wT*dst | 569 | 0 | tmphi = _mm256_mulhi_epi16(x16dst,wX); // 64-wL-wT*dst | 570 | 0 | __m256i dstlo = _mm256_unpacklo_epi16(tmplo,tmphi); | 571 | 0 | __m256i dsthi = _mm256_unpackhi_epi16(tmplo,tmphi); | 572 | |
| 573 | 0 | dstlo = _mm256_add_epi32(dstlo,leftlo); | 574 | 0 | dsthi = _mm256_add_epi32(dsthi,lefthi); | 575 | 0 | dstlo = _mm256_add_epi32(dstlo,w32); | 576 | 0 | dsthi = _mm256_add_epi32(dsthi,w32); | 577 | |
| 578 | 0 | dstlo = _mm256_srai_epi32(dstlo,6); | 579 | 0 | dsthi = _mm256_srai_epi32(dsthi,6); | 580 | |
| 581 | 0 | dstlo = _mm256_max_epi32(vbdmin,dstlo); | 582 | 0 | dsthi = _mm256_max_epi32(vbdmin,dsthi); | 583 | 0 | dstlo = _mm256_min_epi32(vbdmax,dstlo); | 584 | 0 | dsthi = _mm256_min_epi32(vbdmax,dsthi); | 585 | |
| 586 | 0 | dstlo = _mm256_packs_epi32(dstlo,dsthi); | 587 | 0 | dstlo = _mm256_permute4x64_epi64 ( dstlo, ( 0 << 0 ) + ( 1 << 2 ) + ( 2 << 4 ) + ( 3 << 6 ) ); | 588 | |
| 589 | 0 | _mm256_storeu_si256( ( __m256i * )(pDst+y*dstStride), dstlo ); | 590 | 0 | } | 591 | 0 | } | 592 | 0 | } | 593 | 0 | } | 594 | 1.25k | else | 595 | 1.25k | #endif | 596 | 1.25k | { | 597 | 1.25k | __m128i tmplo8,tmphi8; | 598 | 1.25k | __m128i w64_8 = _mm_set_epi16(64,64,64,64,64,64,64,64); | 599 | 1.25k | __m128i w32_8 = _mm_set_epi32(32,32,32,32); | 600 | 1.25k | __m128i vbdmin8 = _mm_set1_epi32( clpRng.min() ); | 601 | 1.25k | __m128i vbdmax8 = _mm_set1_epi32( clpRng.max() ); | 602 | 1.25k | __m128i wl8start,wl8start2; | 603 | 1.25k | CHECK(scale < 0 || scale > 2, "PDPC: scale < 0 || scale > 2"); | 604 | | | 605 | 1.25k | if (scale==0) | 606 | 459 | { | 607 | 459 | wl8start = _mm_set_epi16(0,0,0,0,0,2,8,32); | 608 | 459 | wl8start2 = _mm_set_epi16(0,0,0,0,0,0,0,0); | 609 | 459 | } | 610 | 800 | else if (scale==1) | 611 | 800 | { | 612 | 800 | wl8start = _mm_set_epi16(0,0,1,2,4,8,16,32); | 613 | 800 | wl8start2 = _mm_set_epi16(0,0,0,0,0,0,0,0); | 614 | 800 | } | 615 | 0 | else if (scale==2) | 616 | 0 | { | 617 | 0 | wl8start = _mm_set_epi16(4,4,8,8,16,16,32,32); | 618 | 0 | wl8start2 = _mm_set_epi16(0,0,0,0,1,1,2,2); | 619 | 0 | } | 620 | 1.25k | if (uiDirMode == PLANAR_IDX || uiDirMode == DC_IDX ) | 621 | 1.25k | { | 622 | 1.25k | __m128i wl8 = wl8start; | 623 | 15.8k | for (int y = 0; y < iHeight; y++) | 624 | 14.5k | { | 625 | 14.5k | int wT = 32 >> std::min(31, ((y << 1) >> scale)); | 626 | | | 627 | 14.5k | __m128i wt8 = _mm_set_epi16(wT,wT,wT,wT,wT,wT,wT,wT); | 628 | | // __m128i x8left = _mm_broadcastw_epi16(_mm_loadu_si128 ((__m128i const *) (ptrSrc+((y+1)*srcStride)))); | 629 | | | 630 | 14.5k | __m128i x8left = _mm_loadu_si128 ((__m128i const *) (ptrSrc+((y+1)*srcStride))); | 631 | 14.5k | x8left =_mm_shufflelo_epi16(x8left,0); | 632 | 14.5k | x8left =_mm_shuffle_epi32(x8left,0); | 633 | | | 634 | | | 635 | 14.5k | if (wT) | 636 | 6.17k | { | 637 | 12.3k | for (int x = 0; x < iWidth; x+=8) | 638 | 6.17k | { | 639 | 6.17k | __m128i x8top = 
_mm_loadu_si128( (__m128i*) ( ptrSrc + x + 1 ) ); // load top | 640 | 6.17k | __m128i x8dst = _mm_setzero_si128(); | 641 | 6.17k | if( iWidth >= 8 ) | 642 | 4.21k | x8dst = _mm_loadu_si128( (const __m128i*) ( pDst + y * dstStride + x ) ); // load dst | 643 | 1.96k | else if( iWidth == 4 ) | 644 | 1.96k | x8dst = _mm_loadu_si64( (const __m128i*) ( pDst + y * dstStride + x ) ); // load dst | 645 | 0 | else if( iWidth == 2 ) | 646 | 0 | x8dst = _mm_loadu_si32( (const __m128i*) ( pDst + y * dstStride + x ) ); // load dst | 647 | 0 | else | 648 | 0 | { | 649 | 0 | CHECKD( true, "wrong iWidth in IntraPredSampleFilter_SIMD, only implemented for >=8, ==4, ==2" ); | 650 | 0 | } | 651 | | | 652 | 6.17k | if (x>8) | 653 | 0 | { | 654 | 0 | tmplo8 = _mm_mullo_epi16(x8top,wt8); // wT*top | 655 | 0 | tmphi8 = _mm_mulhi_epi16(x8top,wt8); // wT*top | 656 | 0 | __m128i toplo8 = _mm_unpacklo_epi16(tmplo8,tmphi8); | 657 | 0 | __m128i tophi8 = _mm_unpackhi_epi16(tmplo8,tmphi8); | 658 | | | 659 | |
| 660 | 0 | __m128i wX = _mm_sub_epi16(w64_8,wt8); | 661 | 0 | tmplo8 = _mm_mullo_epi16(x8dst,wX); // 64-wL-wT*dst | 662 | 0 | tmphi8 = _mm_mulhi_epi16(x8dst,wX); // 64-wL-wT*dst | 663 | 0 | __m128i dstlo8 = _mm_unpacklo_epi16(tmplo8,tmphi8); | 664 | 0 | __m128i dsthi8 = _mm_unpackhi_epi16(tmplo8,tmphi8); | 665 | |
| 666 | 0 | dstlo8 = _mm_add_epi32(dstlo8,toplo8); | 667 | 0 | dsthi8 = _mm_add_epi32(dsthi8,tophi8); | 668 | 0 | dstlo8 = _mm_add_epi32(dstlo8,w32_8); | 669 | 0 | dsthi8 = _mm_add_epi32(dsthi8,w32_8); | 670 | |
| 671 | 0 | dstlo8 = _mm_srai_epi32(dstlo8,6); | 672 | 0 | dsthi8 = _mm_srai_epi32(dsthi8,6); | 673 | |
| 674 | 0 | dstlo8 = _mm_max_epi32(vbdmin8,dstlo8); | 675 | 0 | dsthi8 = _mm_max_epi32(vbdmin8,dsthi8); | 676 | 0 | dstlo8 = _mm_min_epi32(vbdmax8,dstlo8); | 677 | 0 | dsthi8 = _mm_min_epi32(vbdmax8,dsthi8); | 678 | |
| 679 | 0 | x8dst = _mm_packs_epi32(dstlo8,dsthi8); | 680 | 0 | } | 681 | 6.17k | else // x<=8 | 682 | 6.17k | { | 683 | 6.17k | if (x==0) | 684 | 6.17k | wl8=wl8start; | 685 | 0 | else if (x==8) | 686 | 0 | wl8=wl8start2; | 687 | | | 688 | 6.17k | tmplo8 = _mm_mullo_epi16(x8left,wl8); //wL * left | 689 | 6.17k | tmphi8 = _mm_mulhi_epi16(x8left,wl8); //wL * left | 690 | 6.17k | __m128i leftlo8 = _mm_unpacklo_epi16(tmplo8,tmphi8); | 691 | 6.17k | __m128i lefthi8 = _mm_unpackhi_epi16(tmplo8,tmphi8); | 692 | | | 693 | 6.17k | tmplo8 = _mm_mullo_epi16(x8top,wt8); // wT*top | 694 | 6.17k | tmphi8 = _mm_mulhi_epi16(x8top,wt8); // wT*top | 695 | 6.17k | __m128i toplo8 = _mm_unpacklo_epi16(tmplo8,tmphi8); | 696 | 6.17k | __m128i tophi8 = _mm_unpackhi_epi16(tmplo8,tmphi8); | 697 | | | 698 | 6.17k | __m128i wX = _mm_sub_epi16(w64_8,wl8); | 699 | 6.17k | wX = _mm_sub_epi16(wX,wt8); // 64-wL-wT | 700 | 6.17k | tmplo8 = _mm_mullo_epi16(x8dst,wX); // 64-wL-wT*dst | 701 | 6.17k | tmphi8 = _mm_mulhi_epi16(x8dst,wX); // 64-wL-wT*dst | 702 | 6.17k | __m128i dstlo8 = _mm_unpacklo_epi16(tmplo8,tmphi8); | 703 | 6.17k | __m128i dsthi8 = _mm_unpackhi_epi16(tmplo8,tmphi8); | 704 | | | 705 | 6.17k | dstlo8 = _mm_add_epi32(dstlo8,toplo8); | 706 | 6.17k | dsthi8 = _mm_add_epi32(dsthi8,tophi8); | 707 | 6.17k | dstlo8 = _mm_add_epi32(dstlo8,leftlo8); | 708 | 6.17k | dsthi8 = _mm_add_epi32(dsthi8,lefthi8); | 709 | 6.17k | dstlo8 = _mm_add_epi32(dstlo8,w32_8); | 710 | 6.17k | dsthi8 = _mm_add_epi32(dsthi8,w32_8); | 711 | | | 712 | 6.17k | dstlo8 = _mm_srai_epi32(dstlo8,6); | 713 | 6.17k | dsthi8 = _mm_srai_epi32(dsthi8,6); | 714 | | | 715 | 6.17k | dstlo8 = _mm_max_epi32(vbdmin8,dstlo8); | 716 | 6.17k | dsthi8 = _mm_max_epi32(vbdmin8,dsthi8); | 717 | 6.17k | dstlo8 = _mm_min_epi32(vbdmax8,dstlo8); | 718 | 6.17k | dsthi8 = _mm_min_epi32(vbdmax8,dsthi8); | 719 | | | 720 | 6.17k | x8dst = _mm_packs_epi32(dstlo8,dsthi8); | 721 | 6.17k | } | 722 | | | 723 | 6.17k | if( iWidth >= 8 ) | 724 | 4.21k | 
_mm_storeu_si128( (__m128i*) ( pDst + y * dstStride + x ), x8dst ); | 725 | 1.96k | else if( iWidth == 4 ) | 726 | 1.96k | _mm_storeu_si64( (__m128i*) ( pDst + y * dstStride + x ), x8dst ); | 727 | 0 | else if( iWidth == 2 ) | 728 | 0 | _mm_storeu_si32( (__m128i*) ( pDst + y * dstStride + x ), x8dst ); | 729 | 6.17k | } | 730 | 6.17k | } | 731 | 8.39k | else //wT =0 | 732 | 8.39k | { | 733 | 16.7k | for( int x = 0; x < std::min( iWidth, 16 ); x += 8 ) | 734 | 8.39k | { | 735 | 8.39k | if( x == 0 ) | 736 | 8.39k | wl8 = wl8start; | 737 | 0 | else if( x == 8 ) | 738 | 0 | wl8 = wl8start2; | 739 | | | 740 | 8.39k | __m128i x8dst; | 741 | 8.39k | if( iWidth >= 8 ) | 742 | 5.22k | x8dst = _mm_loadu_si128( (const __m128i*)( pDst + y * dstStride + x ) ); // load dst | 743 | 3.17k | else if( iWidth == 4 ) | 744 | 3.17k | x8dst = _mm_loadu_si64( (const __m128i*)( pDst + y * dstStride + x ) ); // load dst | 745 | 0 | else if( iWidth == 2 ) | 746 | 0 | x8dst = _mm_loadu_si32( (const __m128i*)( pDst + y * dstStride + x ) ); // load dst | 747 | 0 | else | 748 | 8.39k | CHECK( true, "wrong iWidth in IntraPredSampleFilter_SIMD, only implemented for >=8, ==4, ==2" ); | 749 | | | 750 | | | 751 | 8.39k | tmplo8 = _mm_mullo_epi16( x8left, wl8 ); // wL * left | 752 | 8.39k | tmphi8 = _mm_mulhi_epi16( x8left, wl8 ); // wL * left | 753 | 8.39k | __m128i leftlo8 = _mm_unpacklo_epi16( tmplo8, tmphi8 ); | 754 | 8.39k | __m128i lefthi8 = _mm_unpackhi_epi16( tmplo8, tmphi8 ); | 755 | | | 756 | 8.39k | __m128i wX = _mm_sub_epi16( w64_8, wl8 ); | 757 | 8.39k | tmplo8 = _mm_mullo_epi16( x8dst, wX ); // 64-wL-wT*dst | 758 | 8.39k | tmphi8 = _mm_mulhi_epi16( x8dst, wX ); // 64-wL-wT*dst | 759 | 8.39k | __m128i dstlo8 = _mm_unpacklo_epi16( tmplo8, tmphi8 ); | 760 | 8.39k | __m128i dsthi8 = _mm_unpackhi_epi16( tmplo8, tmphi8 ); | 761 | | | 762 | 8.39k | dstlo8 = _mm_add_epi32( dstlo8, leftlo8 ); | 763 | 8.39k | dsthi8 = _mm_add_epi32( dsthi8, lefthi8 ); | 764 | 8.39k | dstlo8 = _mm_add_epi32( 
dstlo8, w32_8 ); | 765 | 8.39k | dsthi8 = _mm_add_epi32( dsthi8, w32_8 ); | 766 | | | 767 | 8.39k | dstlo8 = _mm_srai_epi32( dstlo8, 6 ); | 768 | 8.39k | dsthi8 = _mm_srai_epi32( dsthi8, 6 ); | 769 | | | 770 | 8.39k | dstlo8 = _mm_max_epi32( vbdmin8, dstlo8 ); | 771 | 8.39k | dsthi8 = _mm_max_epi32( vbdmin8, dsthi8 ); | 772 | 8.39k | dstlo8 = _mm_min_epi32( vbdmax8, dstlo8 ); | 773 | 8.39k | dsthi8 = _mm_min_epi32( vbdmax8, dsthi8 ); | 774 | | | 775 | 8.39k | dstlo8 = _mm_packs_epi32( dstlo8, dsthi8 ); | 776 | | | 777 | 8.39k | if( iWidth >= 8 ) | 778 | 5.22k | _mm_storeu_si128( (__m128i*)( pDst + y * dstStride + x ), dstlo8 ); | 779 | 3.17k | else if( iWidth == 4 ) | 780 | 3.17k | _mm_storeu_si64( (__m128i*)( pDst + y * dstStride + x ), ( dstlo8 ) ); | 781 | 0 | else if( iWidth == 2 ) | 782 | 0 | _mm_storeu_si32( (__m128i*)( pDst + y * dstStride + x ), dstlo8 ); | 783 | 8.39k | } | 784 | 8.39k | } | 785 | 14.5k | } | 786 | 1.25k | } | 787 | 1.25k | } | 788 | | | 789 | | | 790 | 1.25k | #if USE_AVX2 | 791 | 1.25k | _mm256_zeroupper(); | 792 | 1.25k | #endif | 793 | 1.25k | } |
void vvdec::IntraPredSampleFilter_SIMD<(vvdec::x86_simd::X86_VEXT)4, 16>(short*, long, vvdec::AreaBuf<short>&, unsigned int, vvdec::ClpRngTemplate<short> const&) Line | Count | Source | 421 | 2.71k | { | 422 | 2.71k | const int iWidth = piPred.width; | 423 | 2.71k | const int iHeight = piPred.height; | 424 | 2.71k | PelBuf dstBuf = piPred; | 425 | 2.71k | Pel* pDst = dstBuf.buf; | 426 | 2.71k | const ptrdiff_t dstStride = dstBuf.stride; | 427 | | | 428 | 2.71k | const int scale = ((getLog2(iWidth) - 2 + getLog2(iHeight) - 2 + 2) >> 2); | 429 | 2.71k | CHECK(scale < 0 || scale > 31, "PDPC: scale < 0 || scale > 2"); | 430 | | | 431 | 2.71k | #if USE_AVX2 | 432 | 2.71k | if( W > 8 ) | 433 | 2.71k | { | 434 | 2.71k | __m256i tmplo,tmphi; | 435 | 2.71k | __m256i w64 = _mm256_set_epi16(64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64); | 436 | 2.71k | __m256i w32 = _mm256_set_epi32(32,32,32,32,32,32,32,32); | 437 | 2.71k | __m256i vbdmin = _mm256_set1_epi32( clpRng.min() ); | 438 | 2.71k | __m256i vbdmax = _mm256_set1_epi32( clpRng.max() ); | 439 | 2.71k | __m256i wl16; | 440 | 2.71k | __m256i wl16start; | 441 | | | 442 | 2.71k | if (scale==0) | 443 | 0 | { | 444 | 0 | wl16start = _mm256_set_epi16(0,0,0,0,0,0,0,0,0,0,0,0,0,2,8,32); | 445 | 0 | } | 446 | 2.71k | else if (scale==1) | 447 | 1.70k | { | 448 | 1.70k | wl16start = _mm256_set_epi16(0,0,0,0,0,0,0,0,0,0,1,2,4,8,16,32); | 449 | 1.70k | } | 450 | 1.00k | else if (scale==2) | 451 | 1.00k | { | 452 | 1.00k | wl16start = _mm256_set_epi16(0,0,0,0,1,1,2,2,4,4,8,8,16,16,32,32); | 453 | 1.00k | } | 454 | 0 | else | 455 | 0 | { | 456 | 0 | THROW_FATAL( "Wrong scale (" << scale << ")" ); | 457 | 0 | } | 458 | | | 459 | | | 460 | 2.71k | if (uiDirMode == PLANAR_IDX || uiDirMode == DC_IDX ) | 461 | 2.71k | { | 462 | 67.1k | for (int y = 0; y < iHeight; y++) | 463 | 64.4k | { | 464 | 64.4k | int wT = 32 >> std::min(31, ((y << 1) >> scale)); | 465 | | | 466 | 64.4k | __m256i wt16 = 
_mm256_set_epi16(wT,wT,wT,wT,wT,wT,wT,wT,wT,wT,wT,wT,wT,wT,wT,wT); | 467 | 64.4k | __m256i x16left = _mm256_broadcastw_epi16(_mm_loadu_si128 ((__m128i const *) (ptrSrc+((y+1)*srcStride)))); | 468 | | | 469 | 64.4k | if (wT) | 470 | 21.8k | { | 471 | 65.8k | for (int x = 0; x < iWidth; x+=16) | 472 | 43.9k | { | 473 | 43.9k | if (x==0) | 474 | 21.8k | { | 475 | 21.8k | wl16=wl16start; | 476 | | | 477 | 21.8k | __m256i x16top = _mm256_loadu_si256((__m256i *) (ptrSrc+x+1)); // load top | 478 | 21.8k | __m256i x16dst = _mm256_loadu_si256((const __m256i *) (pDst+y*dstStride+x)); // load dst | 479 | | | 480 | 21.8k | tmplo = _mm256_mullo_epi16(x16left,wl16); //wL * left | 481 | 21.8k | tmphi = _mm256_mulhi_epi16(x16left,wl16); //wL * left | 482 | 21.8k | __m256i leftlo = _mm256_unpacklo_epi16(tmplo,tmphi); | 483 | 21.8k | __m256i lefthi = _mm256_unpackhi_epi16(tmplo,tmphi); | 484 | | | 485 | 21.8k | tmplo = _mm256_mullo_epi16(x16top,wt16); // wT*top | 486 | 21.8k | tmphi = _mm256_mulhi_epi16(x16top,wt16); // wT*top | 487 | 21.8k | __m256i toplo = _mm256_unpacklo_epi16(tmplo,tmphi); | 488 | 21.8k | __m256i tophi = _mm256_unpackhi_epi16(tmplo,tmphi); | 489 | | | 490 | 21.8k | __m256i wX = _mm256_sub_epi16(w64,wl16); | 491 | 21.8k | wX = _mm256_sub_epi16(wX,wt16); // 64-wL-wT | 492 | 21.8k | tmplo = _mm256_mullo_epi16(x16dst,wX); // 64-wL-wT*dst | 493 | 21.8k | tmphi = _mm256_mulhi_epi16(x16dst,wX); // 64-wL-wT*dst | 494 | 21.8k | __m256i dstlo = _mm256_unpacklo_epi16(tmplo,tmphi); | 495 | 21.8k | __m256i dsthi = _mm256_unpackhi_epi16(tmplo,tmphi); | 496 | | | 497 | 21.8k | dstlo = _mm256_add_epi32(dstlo,toplo); | 498 | 21.8k | dsthi = _mm256_add_epi32(dsthi,tophi); | 499 | 21.8k | dstlo = _mm256_add_epi32(dstlo,leftlo); | 500 | 21.8k | dsthi = _mm256_add_epi32(dsthi,lefthi); | 501 | 21.8k | dstlo = _mm256_add_epi32(dstlo,w32); | 502 | 21.8k | dsthi = _mm256_add_epi32(dsthi,w32); | 503 | | | 504 | 21.8k | dstlo = _mm256_srai_epi32(dstlo,6); | 505 | 21.8k | dsthi = 
_mm256_srai_epi32(dsthi,6); | 506 | | | 507 | 21.8k | dstlo = _mm256_max_epi32(vbdmin,dstlo); | 508 | 21.8k | dsthi = _mm256_max_epi32(vbdmin,dsthi); | 509 | 21.8k | dstlo = _mm256_min_epi32(vbdmax,dstlo); | 510 | 21.8k | dsthi = _mm256_min_epi32(vbdmax,dsthi); | 511 | | | 512 | 21.8k | dstlo = _mm256_packs_epi32(dstlo,dsthi); | 513 | 21.8k | dstlo = _mm256_permute4x64_epi64 ( dstlo, ( 0 << 0 ) + ( 1 << 2 ) + ( 2 << 4 ) + ( 3 << 6 ) ); | 514 | | | 515 | 21.8k | _mm256_storeu_si256( ( __m256i * )(pDst+y*dstStride+x), dstlo ); | 516 | 21.8k | } | 517 | 22.1k | else | 518 | 22.1k | { | 519 | | | 520 | 22.1k | __m256i x16top = _mm256_loadu_si256((__m256i *) (ptrSrc+x+1)); // load top | 521 | 22.1k | __m256i x16dst = _mm256_loadu_si256((const __m256i *) (pDst+y*dstStride+x)); // load dst | 522 | | | 523 | | | 524 | 22.1k | tmplo = _mm256_mullo_epi16(x16top,wt16); // wT*top | 525 | 22.1k | tmphi = _mm256_mulhi_epi16(x16top,wt16); // wT*top | 526 | 22.1k | __m256i toplo = _mm256_unpacklo_epi16(tmplo,tmphi); | 527 | 22.1k | __m256i tophi = _mm256_unpackhi_epi16(tmplo,tmphi); | 528 | | | 529 | 22.1k | __m256i wX = _mm256_sub_epi16(w64,wt16); | 530 | 22.1k | tmplo = _mm256_mullo_epi16(x16dst,wX); // 64-wL-wT*dst | 531 | 22.1k | tmphi = _mm256_mulhi_epi16(x16dst,wX); // 64-wL-wT*dst | 532 | 22.1k | __m256i dstlo = _mm256_unpacklo_epi16(tmplo,tmphi); | 533 | 22.1k | __m256i dsthi = _mm256_unpackhi_epi16(tmplo,tmphi); | 534 | | | 535 | 22.1k | dstlo = _mm256_add_epi32(dstlo,toplo); | 536 | 22.1k | dsthi = _mm256_add_epi32(dsthi,tophi); | 537 | 22.1k | dstlo = _mm256_add_epi32(dstlo,w32); | 538 | 22.1k | dsthi = _mm256_add_epi32(dsthi,w32); | 539 | | | 540 | 22.1k | dstlo = _mm256_srai_epi32(dstlo,6); | 541 | 22.1k | dsthi = _mm256_srai_epi32(dsthi,6); | 542 | | | 543 | 22.1k | dstlo = _mm256_max_epi32(vbdmin,dstlo); | 544 | 22.1k | dsthi = _mm256_max_epi32(vbdmin,dsthi); | 545 | 22.1k | dstlo = _mm256_min_epi32(vbdmax,dstlo); | 546 | 22.1k | dsthi = 
_mm256_min_epi32(vbdmax,dsthi); | 547 | | | 548 | 22.1k | dstlo = _mm256_packs_epi32(dstlo,dsthi); | 549 | 22.1k | dstlo = _mm256_permute4x64_epi64 ( dstlo, ( 0 << 0 ) + ( 1 << 2 ) + ( 2 << 4 ) + ( 3 << 6 ) ); | 550 | | | 551 | 22.1k | _mm256_storeu_si256( ( __m256i * )(pDst+y*dstStride+x), dstlo ); | 552 | 22.1k | } | 553 | | | 554 | 43.9k | } // for | 555 | 21.8k | } | 556 | 42.5k | else | 557 | 42.5k | { // wT =0 | 558 | | | 559 | 42.5k | wl16=wl16start; | 560 | 42.5k | __m256i x16dst = _mm256_loadu_si256((const __m256i *) (pDst+y*dstStride)); // load dst | 561 | | | 562 | 42.5k | tmplo = _mm256_mullo_epi16(x16left,wl16); //wL * left | 563 | 42.5k | tmphi = _mm256_mulhi_epi16(x16left,wl16); //wL * left | 564 | 42.5k | __m256i leftlo = _mm256_unpacklo_epi16(tmplo,tmphi); | 565 | 42.5k | __m256i lefthi = _mm256_unpackhi_epi16(tmplo,tmphi); | 566 | | | 567 | 42.5k | __m256i wX = _mm256_sub_epi16(w64,wl16); | 568 | 42.5k | tmplo = _mm256_mullo_epi16(x16dst,wX); // 64-wL-wT*dst | 569 | 42.5k | tmphi = _mm256_mulhi_epi16(x16dst,wX); // 64-wL-wT*dst | 570 | 42.5k | __m256i dstlo = _mm256_unpacklo_epi16(tmplo,tmphi); | 571 | 42.5k | __m256i dsthi = _mm256_unpackhi_epi16(tmplo,tmphi); | 572 | | | 573 | 42.5k | dstlo = _mm256_add_epi32(dstlo,leftlo); | 574 | 42.5k | dsthi = _mm256_add_epi32(dsthi,lefthi); | 575 | 42.5k | dstlo = _mm256_add_epi32(dstlo,w32); | 576 | 42.5k | dsthi = _mm256_add_epi32(dsthi,w32); | 577 | | | 578 | 42.5k | dstlo = _mm256_srai_epi32(dstlo,6); | 579 | 42.5k | dsthi = _mm256_srai_epi32(dsthi,6); | 580 | | | 581 | 42.5k | dstlo = _mm256_max_epi32(vbdmin,dstlo); | 582 | 42.5k | dsthi = _mm256_max_epi32(vbdmin,dsthi); | 583 | 42.5k | dstlo = _mm256_min_epi32(vbdmax,dstlo); | 584 | 42.5k | dsthi = _mm256_min_epi32(vbdmax,dsthi); | 585 | | | 586 | 42.5k | dstlo = _mm256_packs_epi32(dstlo,dsthi); | 587 | 42.5k | dstlo = _mm256_permute4x64_epi64 ( dstlo, ( 0 << 0 ) + ( 1 << 2 ) + ( 2 << 4 ) + ( 3 << 6 ) ); | 588 | | | 589 | 42.5k | _mm256_storeu_si256( 
( __m256i * )(pDst+y*dstStride), dstlo ); | 590 | 42.5k | } | 591 | 64.4k | } | 592 | 2.71k | } | 593 | 2.71k | } | 594 | 0 | else | 595 | 0 | #endif | 596 | 0 | { | 597 | 0 | __m128i tmplo8,tmphi8; | 598 | 0 | __m128i w64_8 = _mm_set_epi16(64,64,64,64,64,64,64,64); | 599 | 0 | __m128i w32_8 = _mm_set_epi32(32,32,32,32); | 600 | 0 | __m128i vbdmin8 = _mm_set1_epi32( clpRng.min() ); | 601 | 0 | __m128i vbdmax8 = _mm_set1_epi32( clpRng.max() ); | 602 | 0 | __m128i wl8start,wl8start2; | 603 | 0 | CHECK(scale < 0 || scale > 2, "PDPC: scale < 0 || scale > 2"); | 604 | |
| 605 | 0 | if (scale==0) | 606 | 0 | { | 607 | 0 | wl8start = _mm_set_epi16(0,0,0,0,0,2,8,32); | 608 | 0 | wl8start2 = _mm_set_epi16(0,0,0,0,0,0,0,0); | 609 | 0 | } | 610 | 0 | else if (scale==1) | 611 | 0 | { | 612 | 0 | wl8start = _mm_set_epi16(0,0,1,2,4,8,16,32); | 613 | 0 | wl8start2 = _mm_set_epi16(0,0,0,0,0,0,0,0); | 614 | 0 | } | 615 | 0 | else if (scale==2) | 616 | 0 | { | 617 | 0 | wl8start = _mm_set_epi16(4,4,8,8,16,16,32,32); | 618 | 0 | wl8start2 = _mm_set_epi16(0,0,0,0,1,1,2,2); | 619 | 0 | } | 620 | 0 | if (uiDirMode == PLANAR_IDX || uiDirMode == DC_IDX ) | 621 | 0 | { | 622 | 0 | __m128i wl8 = wl8start; | 623 | 0 | for (int y = 0; y < iHeight; y++) | 624 | 0 | { | 625 | 0 | int wT = 32 >> std::min(31, ((y << 1) >> scale)); | 626 | |
| 627 | 0 | __m128i wt8 = _mm_set_epi16(wT,wT,wT,wT,wT,wT,wT,wT); | 628 | | // __m128i x8left = _mm_broadcastw_epi16(_mm_loadu_si128 ((__m128i const *) (ptrSrc+((y+1)*srcStride)))); | 629 | |
| 630 | 0 | __m128i x8left = _mm_loadu_si128 ((__m128i const *) (ptrSrc+((y+1)*srcStride))); | 631 | 0 | x8left =_mm_shufflelo_epi16(x8left,0); | 632 | 0 | x8left =_mm_shuffle_epi32(x8left,0); | 633 | | | 634 | |
| 635 | 0 | if (wT) | 636 | 0 | { | 637 | 0 | for (int x = 0; x < iWidth; x+=8) | 638 | 0 | { | 639 | 0 | __m128i x8top = _mm_loadu_si128( (__m128i*) ( ptrSrc + x + 1 ) ); // load top | 640 | 0 | __m128i x8dst = _mm_setzero_si128(); | 641 | 0 | if( iWidth >= 8 ) | 642 | 0 | x8dst = _mm_loadu_si128( (const __m128i*) ( pDst + y * dstStride + x ) ); // load dst | 643 | 0 | else if( iWidth == 4 ) | 644 | 0 | x8dst = _mm_loadu_si64( (const __m128i*) ( pDst + y * dstStride + x ) ); // load dst | 645 | 0 | else if( iWidth == 2 ) | 646 | 0 | x8dst = _mm_loadu_si32( (const __m128i*) ( pDst + y * dstStride + x ) ); // load dst | 647 | 0 | else | 648 | 0 | { | 649 | 0 | CHECKD( true, "wrong iWidth in IntraPredSampleFilter_SIMD, only implemented for >=8, ==4, ==2" ); | 650 | 0 | } | 651 | | | 652 | 0 | if (x>8) | 653 | 0 | { | 654 | 0 | tmplo8 = _mm_mullo_epi16(x8top,wt8); // wT*top | 655 | 0 | tmphi8 = _mm_mulhi_epi16(x8top,wt8); // wT*top | 656 | 0 | __m128i toplo8 = _mm_unpacklo_epi16(tmplo8,tmphi8); | 657 | 0 | __m128i tophi8 = _mm_unpackhi_epi16(tmplo8,tmphi8); | 658 | | | 659 | |
| 660 | 0 | __m128i wX = _mm_sub_epi16(w64_8,wt8); | 661 | 0 | tmplo8 = _mm_mullo_epi16(x8dst,wX); // 64-wL-wT*dst | 662 | 0 | tmphi8 = _mm_mulhi_epi16(x8dst,wX); // 64-wL-wT*dst | 663 | 0 | __m128i dstlo8 = _mm_unpacklo_epi16(tmplo8,tmphi8); | 664 | 0 | __m128i dsthi8 = _mm_unpackhi_epi16(tmplo8,tmphi8); | 665 | |
| 666 | 0 | dstlo8 = _mm_add_epi32(dstlo8,toplo8); | 667 | 0 | dsthi8 = _mm_add_epi32(dsthi8,tophi8); | 668 | 0 | dstlo8 = _mm_add_epi32(dstlo8,w32_8); | 669 | 0 | dsthi8 = _mm_add_epi32(dsthi8,w32_8); | 670 | |
| 671 | 0 | dstlo8 = _mm_srai_epi32(dstlo8,6); | 672 | 0 | dsthi8 = _mm_srai_epi32(dsthi8,6); | 673 | |
| 674 | 0 | dstlo8 = _mm_max_epi32(vbdmin8,dstlo8); | 675 | 0 | dsthi8 = _mm_max_epi32(vbdmin8,dsthi8); | 676 | 0 | dstlo8 = _mm_min_epi32(vbdmax8,dstlo8); | 677 | 0 | dsthi8 = _mm_min_epi32(vbdmax8,dsthi8); | 678 | |
| 679 | 0 | x8dst = _mm_packs_epi32(dstlo8,dsthi8); | 680 | 0 | } | 681 | 0 | else // x<=8 | 682 | 0 | { | 683 | 0 | if (x==0) | 684 | 0 | wl8=wl8start; | 685 | 0 | else if (x==8) | 686 | 0 | wl8=wl8start2; | 687 | |
| 688 | 0 | tmplo8 = _mm_mullo_epi16(x8left,wl8); //wL * left | 689 | 0 | tmphi8 = _mm_mulhi_epi16(x8left,wl8); //wL * left | 690 | 0 | __m128i leftlo8 = _mm_unpacklo_epi16(tmplo8,tmphi8); | 691 | 0 | __m128i lefthi8 = _mm_unpackhi_epi16(tmplo8,tmphi8); | 692 | |
| 693 | 0 | tmplo8 = _mm_mullo_epi16(x8top,wt8); // wT*top | 694 | 0 | tmphi8 = _mm_mulhi_epi16(x8top,wt8); // wT*top | 695 | 0 | __m128i toplo8 = _mm_unpacklo_epi16(tmplo8,tmphi8); | 696 | 0 | __m128i tophi8 = _mm_unpackhi_epi16(tmplo8,tmphi8); | 697 | |
| 698 | 0 | __m128i wX = _mm_sub_epi16(w64_8,wl8); | 699 | 0 | wX = _mm_sub_epi16(wX,wt8); // 64-wL-wT | 700 | 0 | tmplo8 = _mm_mullo_epi16(x8dst,wX); // 64-wL-wT*dst | 701 | 0 | tmphi8 = _mm_mulhi_epi16(x8dst,wX); // 64-wL-wT*dst | 702 | 0 | __m128i dstlo8 = _mm_unpacklo_epi16(tmplo8,tmphi8); | 703 | 0 | __m128i dsthi8 = _mm_unpackhi_epi16(tmplo8,tmphi8); | 704 | |
| 705 | 0 | dstlo8 = _mm_add_epi32(dstlo8,toplo8); | 706 | 0 | dsthi8 = _mm_add_epi32(dsthi8,tophi8); | 707 | 0 | dstlo8 = _mm_add_epi32(dstlo8,leftlo8); | 708 | 0 | dsthi8 = _mm_add_epi32(dsthi8,lefthi8); | 709 | 0 | dstlo8 = _mm_add_epi32(dstlo8,w32_8); | 710 | 0 | dsthi8 = _mm_add_epi32(dsthi8,w32_8); | 711 | |
| 712 | 0 | dstlo8 = _mm_srai_epi32(dstlo8,6); | 713 | 0 | dsthi8 = _mm_srai_epi32(dsthi8,6); | 714 | |
| 715 | 0 | dstlo8 = _mm_max_epi32(vbdmin8,dstlo8); | 716 | 0 | dsthi8 = _mm_max_epi32(vbdmin8,dsthi8); | 717 | 0 | dstlo8 = _mm_min_epi32(vbdmax8,dstlo8); | 718 | 0 | dsthi8 = _mm_min_epi32(vbdmax8,dsthi8); | 719 | |
| 720 | 0 | x8dst = _mm_packs_epi32(dstlo8,dsthi8); | 721 | 0 | } | 722 | |
| 723 | 0 | if( iWidth >= 8 ) | 724 | 0 | _mm_storeu_si128( (__m128i*) ( pDst + y * dstStride + x ), x8dst ); | 725 | 0 | else if( iWidth == 4 ) | 726 | 0 | _mm_storeu_si64( (__m128i*) ( pDst + y * dstStride + x ), x8dst ); | 727 | 0 | else if( iWidth == 2 ) | 728 | 0 | _mm_storeu_si32( (__m128i*) ( pDst + y * dstStride + x ), x8dst ); | 729 | 0 | } | 730 | 0 | } | 731 | 0 | else //wT =0 | 732 | 0 | { | 733 | 0 | for( int x = 0; x < std::min( iWidth, 16 ); x += 8 ) | 734 | 0 | { | 735 | 0 | if( x == 0 ) | 736 | 0 | wl8 = wl8start; | 737 | 0 | else if( x == 8 ) | 738 | 0 | wl8 = wl8start2; | 739 | |
| 740 | 0 | __m128i x8dst; | 741 | 0 | if( iWidth >= 8 ) | 742 | 0 | x8dst = _mm_loadu_si128( (const __m128i*)( pDst + y * dstStride + x ) ); // load dst | 743 | 0 | else if( iWidth == 4 ) | 744 | 0 | x8dst = _mm_loadu_si64( (const __m128i*)( pDst + y * dstStride + x ) ); // load dst | 745 | 0 | else if( iWidth == 2 ) | 746 | 0 | x8dst = _mm_loadu_si32( (const __m128i*)( pDst + y * dstStride + x ) ); // load dst | 747 | 0 | else | 748 | 0 | CHECK( true, "wrong iWidth in IntraPredSampleFilter_SIMD, only implemented for >=8, ==4, ==2" ); | 749 | | | 750 | |
| 751 | 0 | tmplo8 = _mm_mullo_epi16( x8left, wl8 ); // wL * left | 752 | 0 | tmphi8 = _mm_mulhi_epi16( x8left, wl8 ); // wL * left | 753 | 0 | __m128i leftlo8 = _mm_unpacklo_epi16( tmplo8, tmphi8 ); | 754 | 0 | __m128i lefthi8 = _mm_unpackhi_epi16( tmplo8, tmphi8 ); | 755 | |
| 756 | 0 | __m128i wX = _mm_sub_epi16( w64_8, wl8 ); | 757 | 0 | tmplo8 = _mm_mullo_epi16( x8dst, wX ); // 64-wL-wT*dst | 758 | 0 | tmphi8 = _mm_mulhi_epi16( x8dst, wX ); // 64-wL-wT*dst | 759 | 0 | __m128i dstlo8 = _mm_unpacklo_epi16( tmplo8, tmphi8 ); | 760 | 0 | __m128i dsthi8 = _mm_unpackhi_epi16( tmplo8, tmphi8 ); | 761 | |
| 762 | 0 | dstlo8 = _mm_add_epi32( dstlo8, leftlo8 ); | 763 | 0 | dsthi8 = _mm_add_epi32( dsthi8, lefthi8 ); | 764 | 0 | dstlo8 = _mm_add_epi32( dstlo8, w32_8 ); | 765 | 0 | dsthi8 = _mm_add_epi32( dsthi8, w32_8 ); | 766 | |
| 767 | 0 | dstlo8 = _mm_srai_epi32( dstlo8, 6 ); | 768 | 0 | dsthi8 = _mm_srai_epi32( dsthi8, 6 ); | 769 | |
| 770 | 0 | dstlo8 = _mm_max_epi32( vbdmin8, dstlo8 ); | 771 | 0 | dsthi8 = _mm_max_epi32( vbdmin8, dsthi8 ); | 772 | 0 | dstlo8 = _mm_min_epi32( vbdmax8, dstlo8 ); | 773 | 0 | dsthi8 = _mm_min_epi32( vbdmax8, dsthi8 ); | 774 | |
| 775 | 0 | dstlo8 = _mm_packs_epi32( dstlo8, dsthi8 ); | 776 | |
| 777 | 0 | if( iWidth >= 8 ) | 778 | 0 | _mm_storeu_si128( (__m128i*)( pDst + y * dstStride + x ), dstlo8 ); | 779 | 0 | else if( iWidth == 4 ) | 780 | 0 | _mm_storeu_si64( (__m128i*)( pDst + y * dstStride + x ), ( dstlo8 ) ); | 781 | 0 | else if( iWidth == 2 ) | 782 | 0 | _mm_storeu_si32( (__m128i*)( pDst + y * dstStride + x ), dstlo8 ); | 783 | 0 | } | 784 | 0 | } | 785 | 0 | } | 786 | 0 | } | 787 | 0 | } | 788 | | | 789 | | | 790 | 2.71k | #if USE_AVX2 | 791 | 2.71k | _mm256_zeroupper(); | 792 | 2.71k | #endif | 793 | 2.71k | } |
|
794 | | |
795 | | /** Function for deriving planar intra prediction. This function derives the prediction samples for planar mode (intra coding). |
796 | | */ |
797 | | template< X86_VEXT vext> |
798 | | void xPredIntraPlanar_SIMD( const CPelBuf &pSrc, PelBuf &pDst, const SPS& sps ) |
799 | 2.41k | { |
800 | | |
801 | 2.41k | const uint32_t width = pDst.width; |
802 | 2.41k | const uint32_t height = pDst.height; |
803 | 2.41k | const uint32_t log2W = getLog2( width ); |
804 | 2.41k | const uint32_t log2H = getLog2( height ); |
805 | 2.41k | const uint32_t finalShift = 1 + log2W + log2H; |
806 | 2.41k | const uint32_t offset = 1 << (log2W + log2H); |
807 | 2.41k | const ptrdiff_t stride = pDst.stride; |
808 | 2.41k | Pel* pred = pDst.buf; |
809 | | |
810 | 2.41k | const Pel *ptrSrc =pSrc.buf; |
811 | | |
812 | 2.41k | int leftColumn,rightColumn; |
813 | 2.41k | Pel tmp; |
814 | 2.41k | int topRight = pSrc.at( width + 1, 0 ); |
815 | | |
816 | 2.41k | tmp=pSrc.at( 0, height+1 ); |
817 | 2.41k | const __m128i bottomLeft16 = _mm_set_epi16(tmp,tmp,tmp,tmp,tmp,tmp,tmp,tmp); |
818 | 2.41k | const __m128i zero = _mm_setzero_si128(); |
819 | 2.41k | const __m128i eight = _mm_set_epi16(8,8,8,8,8,8,8,8); |
820 | 2.41k | const __m128i offset32 = _mm_set_epi32(offset,offset,offset,offset); |
821 | 2.41k | const __m128i vLog2W = _mm_cvtsi32_si128(log2W); |
822 | 2.41k | const __m128i vLog2H = _mm_cvtsi32_si128(log2H); |
823 | 2.41k | const __m128i vFinalShift = _mm_cvtsi32_si128(finalShift); |
824 | | |
825 | 49.2k | for( int y = 0; y < height; y++) |
826 | 46.8k | { |
827 | 46.8k | leftColumn=pSrc.at( 0, y + 1 ); |
828 | 46.8k | rightColumn = topRight - leftColumn; |
829 | 46.8k | leftColumn = leftColumn << log2W; |
830 | 46.8k | const __m128i leftColumn32 = _mm_set_epi32(leftColumn,leftColumn,leftColumn,leftColumn); |
831 | 46.8k | const __m128i rightcolumn16 = _mm_set_epi16(rightColumn,rightColumn,rightColumn,rightColumn,rightColumn,rightColumn,rightColumn,rightColumn); |
832 | 46.8k | const __m128i y16 = _mm_set_epi16(y+1,y+1,y+1,y+1,y+1,y+1,y+1,y+1); |
833 | 46.8k | __m128i x16 = _mm_set_epi16(8,7,6,5,4,3,2,1); |
834 | | |
835 | 226k | for( int x = 0; x < width; x+=8 ) |
836 | 179k | { |
837 | | //topRow[x] = pSrc.at( x + 1, 0 ); |
838 | 179k | __m128i topRow16 = _mm_loadu_si128 ((__m128i const *) (ptrSrc+(x+1))); |
839 | | //bottomRow[x] = bottomLeft - topRow[x]; |
840 | 179k | __m128i bottomRow16L = _mm_sub_epi16(bottomLeft16,topRow16); |
841 | | // (y+1)*bottomRow[x] |
842 | 179k | __m128i tmpH = _mm_mulhi_epi16(bottomRow16L,y16); |
843 | 179k | __m128i tmpL = _mm_mullo_epi16(bottomRow16L,y16); |
844 | 179k | bottomRow16L = _mm_unpacklo_epi16(tmpL,tmpH); |
845 | 179k | __m128i bottomRow16H = _mm_unpackhi_epi16(tmpL,tmpH); |
846 | | |
847 | | // (topRow[x] topRow16H<< log2H) |
848 | 179k | __m128i topRow32L = _mm_unpacklo_epi16(topRow16,zero); |
849 | 179k | __m128i topRow32H = _mm_unpackhi_epi16(topRow16,zero); |
850 | 179k | topRow32L = _mm_sll_epi32(topRow32L,vLog2H); |
851 | 179k | topRow32H = _mm_sll_epi32(topRow32H,vLog2H); |
852 | | // vertPred = (topRow[x] << log2H) + (y+1)*bottomRow[x]; |
853 | 179k | topRow32L = _mm_add_epi32(topRow32L,bottomRow16L); |
854 | 179k | topRow32H = _mm_add_epi32(topRow32H,bottomRow16H); |
855 | | // horPred = leftColumn + (x+1)*rightColumn; |
856 | 179k | tmpL = _mm_mullo_epi16(rightcolumn16,x16); |
857 | 179k | tmpH = _mm_mulhi_epi16(rightcolumn16,x16); |
858 | 179k | __m128i horpred32L = _mm_unpacklo_epi16(tmpL,tmpH); |
859 | 179k | __m128i horpred32H = _mm_unpackhi_epi16(tmpL,tmpH); |
860 | 179k | horpred32L = _mm_add_epi32(leftColumn32,horpred32L); |
861 | 179k | horpred32H = _mm_add_epi32(leftColumn32,horpred32H); |
862 | | // pred[x] = ( ( horPred << log2H ) + ( vertPred << log2W ) + offset ) >> finalShift; |
863 | 179k | horpred32L = _mm_sll_epi32(horpred32L,vLog2H); |
864 | 179k | horpred32H = _mm_sll_epi32(horpred32H,vLog2H); |
865 | 179k | topRow32L = _mm_sll_epi32(topRow32L,vLog2W); |
866 | 179k | topRow32H = _mm_sll_epi32(topRow32H,vLog2W); |
867 | 179k | horpred32L = _mm_add_epi32(horpred32L,topRow32L); |
868 | 179k | horpred32H = _mm_add_epi32(horpred32H,topRow32H); |
869 | 179k | horpred32L = _mm_add_epi32(horpred32L,offset32); |
870 | 179k | horpred32H = _mm_add_epi32(horpred32H,offset32); |
871 | 179k | horpred32L = _mm_srl_epi32(horpred32L,vFinalShift); |
872 | 179k | horpred32H = _mm_srl_epi32(horpred32H,vFinalShift); |
873 | | |
874 | 179k | tmpL = _mm_packs_epi32(horpred32L,horpred32H); |
875 | 179k | if (width>=8) |
876 | 176k | _mm_storeu_si128(( __m128i * )(pred+y*stride+x), (tmpL) ); |
877 | 3.06k | else if (width==4) |
878 | 3.06k | _mm_storeu_si64(( __m128i * )(pred+y*stride+x), (tmpL) ); |
879 | 0 | else if (width==2) |
880 | 0 | _mm_storeu_si32(( __m128i * )(pred+y*stride+x),(tmpL) ); |
881 | 0 | else |
882 | 0 | pred[y*stride+x]=(Pel)_mm_extract_epi16 (tmpL,0); |
883 | | |
884 | 179k | x16 = _mm_add_epi16(x16,eight); |
885 | 179k | } |
886 | 46.8k | } |
887 | 2.41k | } Unexecuted instantiation: void vvdec::xPredIntraPlanar_SIMD<(vvdec::x86_simd::X86_VEXT)1>(vvdec::AreaBuf<short const> const&, vvdec::AreaBuf<short>&, vvdec::SPS const&) void vvdec::xPredIntraPlanar_SIMD<(vvdec::x86_simd::X86_VEXT)4>(vvdec::AreaBuf<short const> const&, vvdec::AreaBuf<short>&, vvdec::SPS const&) Line | Count | Source | 799 | 2.41k | { | 800 | | | 801 | 2.41k | const uint32_t width = pDst.width; | 802 | 2.41k | const uint32_t height = pDst.height; | 803 | 2.41k | const uint32_t log2W = getLog2( width ); | 804 | 2.41k | const uint32_t log2H = getLog2( height ); | 805 | 2.41k | const uint32_t finalShift = 1 + log2W + log2H; | 806 | 2.41k | const uint32_t offset = 1 << (log2W + log2H); | 807 | 2.41k | const ptrdiff_t stride = pDst.stride; | 808 | 2.41k | Pel* pred = pDst.buf; | 809 | | | 810 | 2.41k | const Pel *ptrSrc =pSrc.buf; | 811 | | | 812 | 2.41k | int leftColumn,rightColumn; | 813 | 2.41k | Pel tmp; | 814 | 2.41k | int topRight = pSrc.at( width + 1, 0 ); | 815 | | | 816 | 2.41k | tmp=pSrc.at( 0, height+1 ); | 817 | 2.41k | const __m128i bottomLeft16 = _mm_set_epi16(tmp,tmp,tmp,tmp,tmp,tmp,tmp,tmp); | 818 | 2.41k | const __m128i zero = _mm_setzero_si128(); | 819 | 2.41k | const __m128i eight = _mm_set_epi16(8,8,8,8,8,8,8,8); | 820 | 2.41k | const __m128i offset32 = _mm_set_epi32(offset,offset,offset,offset); | 821 | 2.41k | const __m128i vLog2W = _mm_cvtsi32_si128(log2W); | 822 | 2.41k | const __m128i vLog2H = _mm_cvtsi32_si128(log2H); | 823 | 2.41k | const __m128i vFinalShift = _mm_cvtsi32_si128(finalShift); | 824 | | | 825 | 49.2k | for( int y = 0; y < height; y++) | 826 | 46.8k | { | 827 | 46.8k | leftColumn=pSrc.at( 0, y + 1 ); | 828 | 46.8k | rightColumn = topRight - leftColumn; | 829 | 46.8k | leftColumn = leftColumn << log2W; | 830 | 46.8k | const __m128i leftColumn32 = _mm_set_epi32(leftColumn,leftColumn,leftColumn,leftColumn); | 831 | 46.8k | const __m128i rightcolumn16 = 
_mm_set_epi16(rightColumn,rightColumn,rightColumn,rightColumn,rightColumn,rightColumn,rightColumn,rightColumn); | 832 | 46.8k | const __m128i y16 = _mm_set_epi16(y+1,y+1,y+1,y+1,y+1,y+1,y+1,y+1); | 833 | 46.8k | __m128i x16 = _mm_set_epi16(8,7,6,5,4,3,2,1); | 834 | | | 835 | 226k | for( int x = 0; x < width; x+=8 ) | 836 | 179k | { | 837 | | //topRow[x] = pSrc.at( x + 1, 0 ); | 838 | 179k | __m128i topRow16 = _mm_loadu_si128 ((__m128i const *) (ptrSrc+(x+1))); | 839 | | //bottomRow[x] = bottomLeft - topRow[x]; | 840 | 179k | __m128i bottomRow16L = _mm_sub_epi16(bottomLeft16,topRow16); | 841 | | // (y+1)*bottomRow[x] | 842 | 179k | __m128i tmpH = _mm_mulhi_epi16(bottomRow16L,y16); | 843 | 179k | __m128i tmpL = _mm_mullo_epi16(bottomRow16L,y16); | 844 | 179k | bottomRow16L = _mm_unpacklo_epi16(tmpL,tmpH); | 845 | 179k | __m128i bottomRow16H = _mm_unpackhi_epi16(tmpL,tmpH); | 846 | | | 847 | | // (topRow[x] topRow16H<< log2H) | 848 | 179k | __m128i topRow32L = _mm_unpacklo_epi16(topRow16,zero); | 849 | 179k | __m128i topRow32H = _mm_unpackhi_epi16(topRow16,zero); | 850 | 179k | topRow32L = _mm_sll_epi32(topRow32L,vLog2H); | 851 | 179k | topRow32H = _mm_sll_epi32(topRow32H,vLog2H); | 852 | | // vertPred = (topRow[x] << log2H) + (y+1)*bottomRow[x]; | 853 | 179k | topRow32L = _mm_add_epi32(topRow32L,bottomRow16L); | 854 | 179k | topRow32H = _mm_add_epi32(topRow32H,bottomRow16H); | 855 | | // horPred = leftColumn + (x+1)*rightColumn; | 856 | 179k | tmpL = _mm_mullo_epi16(rightcolumn16,x16); | 857 | 179k | tmpH = _mm_mulhi_epi16(rightcolumn16,x16); | 858 | 179k | __m128i horpred32L = _mm_unpacklo_epi16(tmpL,tmpH); | 859 | 179k | __m128i horpred32H = _mm_unpackhi_epi16(tmpL,tmpH); | 860 | 179k | horpred32L = _mm_add_epi32(leftColumn32,horpred32L); | 861 | 179k | horpred32H = _mm_add_epi32(leftColumn32,horpred32H); | 862 | | // pred[x] = ( ( horPred << log2H ) + ( vertPred << log2W ) + offset ) >> finalShift; | 863 | 179k | horpred32L = _mm_sll_epi32(horpred32L,vLog2H); | 
864 | 179k | horpred32H = _mm_sll_epi32(horpred32H,vLog2H); | 865 | 179k | topRow32L = _mm_sll_epi32(topRow32L,vLog2W); | 866 | 179k | topRow32H = _mm_sll_epi32(topRow32H,vLog2W); | 867 | 179k | horpred32L = _mm_add_epi32(horpred32L,topRow32L); | 868 | 179k | horpred32H = _mm_add_epi32(horpred32H,topRow32H); | 869 | 179k | horpred32L = _mm_add_epi32(horpred32L,offset32); | 870 | 179k | horpred32H = _mm_add_epi32(horpred32H,offset32); | 871 | 179k | horpred32L = _mm_srl_epi32(horpred32L,vFinalShift); | 872 | 179k | horpred32H = _mm_srl_epi32(horpred32H,vFinalShift); | 873 | | | 874 | 179k | tmpL = _mm_packs_epi32(horpred32L,horpred32H); | 875 | 179k | if (width>=8) | 876 | 176k | _mm_storeu_si128(( __m128i * )(pred+y*stride+x), (tmpL) ); | 877 | 3.06k | else if (width==4) | 878 | 3.06k | _mm_storeu_si64(( __m128i * )(pred+y*stride+x), (tmpL) ); | 879 | 0 | else if (width==2) | 880 | 0 | _mm_storeu_si32(( __m128i * )(pred+y*stride+x),(tmpL) ); | 881 | 0 | else | 882 | 0 | pred[y*stride+x]=(Pel)_mm_extract_epi16 (tmpL,0); | 883 | | | 884 | 179k | x16 = _mm_add_epi16(x16,eight); | 885 | 179k | } | 886 | 46.8k | } | 887 | 2.41k | } |
|
888 | | |
889 | | template< X86_VEXT vext> |
890 | | void GetLumaRecPixel420SIMD (const int width,const int height, const Pel* pRecSrc0,const ptrdiff_t iRecStride,Pel* pDst0,const ptrdiff_t iDstStride) |
891 | 2.86k | { |
892 | | #ifdef USE_AVX2 |
893 | 2.86k | if( ( width & 15 ) == 0 ) // width>=16 |
894 | | // if( 0 ) // width>=16 |
895 | 1.78k | { |
896 | 1.78k | __m256i vzero = _mm256_set1_epi8(0); |
897 | 1.78k | __m256i vfour = _mm256_set1_epi32(4); |
898 | 27.3k | for( int y = 0; y < height; y++ ) |
899 | 25.5k | { |
900 | 64.9k | for( int x = 0; x < width; x += 16 ) |
901 | 39.3k | { |
902 | 39.3k | int x2=x<<1; |
903 | 39.3k | __m256i vsrc_l = _mm256_loadu_si256((__m256i*)&pRecSrc0[x2-1]); // -1 0 1 2 3 4 5 6 |
904 | 39.3k | __m256i vsrc = _mm256_loadu_si256((__m256i*)&pRecSrc0[x2]); // 0 1 2 3 4 5 6 7 |
905 | | |
906 | | __m256i vsrc01 = _mm256_blend_epi16(vzero,vsrc_l,0x55); // -1 1 3 5 32 Bit |
907 | | __m256i vsrc0 = _mm256_blend_epi16(vzero,vsrc,0x55); // 0 2 4 6 32 Bit |
908 | | __m256i vsrc10 = _mm256_blend_epi16(vzero,vsrc,0xAA); // 1 3 5 7 32 Bit |
909 | | vsrc10 = _mm256_srli_epi32(vsrc10,16); // 1 3 5 7 32 Bit |
910 | | vsrc0 = _mm256_slli_epi32 (vsrc0,1); // 0 2 4 6 *2 |
911 | | |
912 | | vsrc0 = _mm256_add_epi32(vsrc0,vsrc10); |
913 | | __m256i vdst0 = _mm256_add_epi32(vsrc0,vsrc01); // dst 0 1 2 3 32 Bit, untere Zeile fehlt noch |
914 | | |
915 | | vsrc_l = _mm256_loadu_si256((__m256i*)&pRecSrc0[x2 +15]); // 7 8 9 10 11 12 13 14 |
916 | | vsrc = _mm256_loadu_si256((__m256i*)&pRecSrc0[x2 + 16 ]); // 8 9 10 11 12 13 14 15 |
917 | | |
918 | | x2+= (int)iRecStride; |
919 | | |
920 | | vsrc01 = _mm256_blend_epi16(vzero,vsrc_l,0x55); |
921 | | vsrc0 = _mm256_blend_epi16(vzero,vsrc,0x55); |
922 | | vsrc10 = _mm256_blend_epi16(vzero,vsrc,0xAA); |
923 | | vsrc10 = _mm256_srli_epi32(vsrc10,16); // 1 3 5 7 32 Bit |
924 | | vsrc0 = _mm256_slli_epi32 (vsrc0,1); // 0 2 4 6 *2 |
925 | | |
926 | | vsrc0 = _mm256_add_epi32(vsrc0,vsrc10); |
927 | | __m256i vdst1 = _mm256_add_epi32(vsrc0,vsrc01); // dst 4 5 6 7 32 Bit, untere Zeile fehlt noch |
928 | | |
929 | | // jetzt die nächste Zeile dazu |
930 | | vsrc_l = _mm256_loadu_si256((__m256i*)&pRecSrc0[x2-1]); // -1 0 1 2 3 4 5 6 |
931 | | vsrc = _mm256_loadu_si256((__m256i*)&pRecSrc0[x2]); // 0 1 2 3 4 5 6 7 |
932 | | |
933 | | vsrc01 = _mm256_blend_epi16(vzero,vsrc_l,0x55); // -1 1 3 5 32 Bit |
934 | | vsrc0 = _mm256_blend_epi16(vzero,vsrc,0x55); // 0 2 4 6 32 Bit |
935 | | vsrc10 = _mm256_blend_epi16(vzero,vsrc,0xAA); // 1 3 5 7 32 Bit |
936 | | vsrc10 = _mm256_srli_epi32(vsrc10,16); // 1 3 5 7 32 Bit |
937 | | vsrc0 = _mm256_slli_epi32 (vsrc0,1); // 0 2 4 6 *2 |
938 | | |
939 | | vsrc0 = _mm256_add_epi32(vsrc0,vsrc10); |
940 | | __m256i vdst01 = _mm256_add_epi32(vsrc0,vsrc01); // dst 0 1 2 3 32 Bit, untere Zeile |
941 | | |
942 | | vsrc_l = _mm256_loadu_si256((__m256i*)&pRecSrc0[x2 + 15]); // 7 8 9 10 11 12 13 14 |
943 | | vsrc = _mm256_loadu_si256((__m256i*)&pRecSrc0[x2 + 16 ]); // 8 9 10 11 12 13 14 15 |
944 | | |
945 | | vsrc01 = _mm256_blend_epi16(vzero,vsrc_l,0x55); |
946 | | vsrc0 = _mm256_blend_epi16(vzero,vsrc,0x55); |
947 | | vsrc10 = _mm256_blend_epi16(vzero,vsrc,0xAA); |
948 | | vsrc10 = _mm256_srli_epi32(vsrc10,16); // 1 3 5 7 32 Bit |
949 | | vsrc0 = _mm256_slli_epi32 (vsrc0,1); // 0 2 4 6 *2 |
950 | | |
951 | | vsrc0 = _mm256_add_epi32(vsrc0,vsrc10); |
952 | | __m256i vdst11 = _mm256_add_epi32(vsrc0,vsrc01); // dst 4 5 6 7 32 Bit, untere Zeile |
953 | | |
954 | | vdst0 = _mm256_add_epi32(vdst0,vdst01); |
955 | | vdst1 = _mm256_add_epi32(vdst1,vdst11); |
956 | | vdst0 = _mm256_add_epi32(vdst0,vfour); |
957 | | vdst1 = _mm256_add_epi32(vdst1,vfour); |
958 | | vdst0 = _mm256_srli_epi32(vdst0,3); |
959 | | vdst1 = _mm256_srli_epi32(vdst1,3); |
960 | | vdst0 = _mm256_packus_epi32 (vdst0,vdst1); // 16 bit |
961 | | vdst0 = _mm256_permute4x64_epi64(vdst0,0xd8); |
962 | | |
963 | 39.3k | _mm256_storeu_si256((__m256i*)&pDst0[x], vdst0); |
964 | | // _mm_storeu_si128((__m128i*)&pDstTmp[x], vdst0); |
965 | 39.3k | } |
966 | 25.5k | pDst0 += iDstStride; |
967 | 25.5k | pRecSrc0 += (iRecStride<<1); |
968 | 25.5k | } |
969 | 1.78k | } |
970 | 1.08k | else |
971 | 1.08k | #endif |
972 | 1.08k | if( ( width & 7 ) == 0 ) // width>=8 |
973 | 768 | { |
974 | 768 | __m128i vzero = _mm_set1_epi8(0); |
975 | 768 | __m128i vfour = _mm_set1_epi32(4); |
976 | | |
977 | | |
978 | 7.18k | for( int y = 0; y < height; y++ ) |
979 | 6.41k | { |
980 | | |
981 | 12.8k | for( int x = 0; x < width; x += 8 ) |
982 | 6.41k | { |
983 | 6.41k | int x2=x<<1; |
984 | 6.41k | __m128i vsrc_l = _mm_loadu_si128((__m128i*)&pRecSrc0[x2-1]); // -1 0 1 2 3 4 5 6 |
985 | 6.41k | __m128i vsrc = _mm_loadu_si128((__m128i*)&pRecSrc0[x2]); // 0 1 2 3 4 5 6 7 |
986 | | |
987 | 6.41k | __m128i vsrc01 = _mm_blend_epi16(vzero,vsrc_l,0x55); // -1 1 3 5 32 Bit |
988 | 6.41k | __m128i vsrc0 = _mm_blend_epi16(vzero,vsrc,0x55); // 0 2 4 6 32 Bit |
989 | 6.41k | __m128i vsrc10 = _mm_blend_epi16(vzero,vsrc,0xAA); // 1 3 5 7 32 Bit |
990 | 6.41k | vsrc10 = _mm_srli_epi32(vsrc10,16); // 1 3 5 7 32 Bit |
991 | 6.41k | vsrc0 = _mm_slli_epi32 (vsrc0,1); // 0 2 4 6 *2 |
992 | | |
993 | 6.41k | vsrc0 = _mm_add_epi32(vsrc0,vsrc10); |
994 | 6.41k | __m128i vdst0 = _mm_add_epi32(vsrc0,vsrc01); // dst 0 1 2 3 32 Bit, untere Zeile fehlt noch |
995 | | |
996 | 6.41k | vsrc_l = _mm_loadu_si128((__m128i*)&pRecSrc0[x2 +7]); // 7 8 9 10 11 12 13 14 |
997 | 6.41k | vsrc = _mm_loadu_si128((__m128i*)&pRecSrc0[x2 + 8 ]); // 8 9 10 11 12 13 14 15 |
998 | | |
999 | 6.41k | x2+=(int)iRecStride; |
1000 | | |
1001 | 6.41k | vsrc01 = _mm_blend_epi16(vzero,vsrc_l,0x55); |
1002 | 6.41k | vsrc0 = _mm_blend_epi16(vzero,vsrc,0x55); |
1003 | 6.41k | vsrc10 = _mm_blend_epi16(vzero,vsrc,0xAA); |
1004 | 6.41k | vsrc10 = _mm_srli_epi32(vsrc10,16); // 1 3 5 7 32 Bit |
1005 | 6.41k | vsrc0 = _mm_slli_epi32 (vsrc0,1); // 0 2 4 6 *2 |
1006 | | |
1007 | 6.41k | vsrc0 = _mm_add_epi32(vsrc0,vsrc10); |
1008 | 6.41k | __m128i vdst1 = _mm_add_epi32(vsrc0,vsrc01); // dst 4 5 6 7 32 Bit, untere Zeile fehlt noch |
1009 | | |
1010 | | // jetzt die nächste Zeile dazu |
1011 | 6.41k | vsrc_l = _mm_loadu_si128((__m128i*)&pRecSrc0[x2-1]); // -1 0 1 2 3 4 5 6 |
1012 | 6.41k | vsrc = _mm_loadu_si128((__m128i*)&pRecSrc0[x2]); // 0 1 2 3 4 5 6 7 |
1013 | | |
1014 | 6.41k | vsrc01 = _mm_blend_epi16(vzero,vsrc_l,0x55); // -1 1 3 5 32 Bit |
1015 | 6.41k | vsrc0 = _mm_blend_epi16(vzero,vsrc,0x55); // 0 2 4 6 32 Bit |
1016 | 6.41k | vsrc10 = _mm_blend_epi16(vzero,vsrc,0xAA); // 1 3 5 7 32 Bit |
1017 | 6.41k | vsrc10 = _mm_srli_epi32(vsrc10,16); // 1 3 5 7 32 Bit |
1018 | 6.41k | vsrc0 = _mm_slli_epi32 (vsrc0,1); // 0 2 4 6 *2 |
1019 | | |
1020 | 6.41k | vsrc0 = _mm_add_epi32(vsrc0,vsrc10); |
1021 | 6.41k | __m128i vdst01 = _mm_add_epi32(vsrc0,vsrc01); // dst 0 1 2 3 32 Bit, untere Zeile |
1022 | | |
1023 | 6.41k | vsrc_l = _mm_loadu_si128((__m128i*)&pRecSrc0[x2 + 7]); // 7 8 9 10 11 12 13 14 |
1024 | 6.41k | vsrc = _mm_loadu_si128((__m128i*)&pRecSrc0[x2 + 8 ]); // 8 9 10 11 12 13 14 15 |
1025 | | |
1026 | 6.41k | vsrc01 = _mm_blend_epi16(vzero,vsrc_l,0x55); |
1027 | 6.41k | vsrc0 = _mm_blend_epi16(vzero,vsrc,0x55); |
1028 | 6.41k | vsrc10 = _mm_blend_epi16(vzero,vsrc,0xAA); |
1029 | 6.41k | vsrc10 = _mm_srli_epi32(vsrc10,16); // 1 3 5 7 32 Bit |
1030 | 6.41k | vsrc0 = _mm_slli_epi32 (vsrc0,1); // 0 2 4 6 *2 |
1031 | | |
1032 | 6.41k | vsrc0 = _mm_add_epi32(vsrc0,vsrc10); |
1033 | 6.41k | __m128i vdst11 = _mm_add_epi32(vsrc0,vsrc01); // dst 4 5 6 7 32 Bit, untere Zeile |
1034 | | |
1035 | 6.41k | vdst0 = _mm_add_epi32(vdst0,vdst01); |
1036 | 6.41k | vdst1 = _mm_add_epi32(vdst1,vdst11); |
1037 | 6.41k | vdst0 = _mm_add_epi32(vdst0,vfour); |
1038 | 6.41k | vdst1 = _mm_add_epi32(vdst1,vfour); |
1039 | 6.41k | vdst0 = _mm_srli_epi32(vdst0,3); |
1040 | 6.41k | vdst1 = _mm_srli_epi32(vdst1,3); |
1041 | 6.41k | vdst0 = _mm_packus_epi32 (vdst0,vdst1); // 16 bit |
1042 | | |
1043 | 6.41k | _mm_storeu_si128((__m128i*)&pDst0[x], vdst0); |
1044 | | // _mm_storeu_si128((__m128i*)&pDstTmp[x], vdst0); |
1045 | 6.41k | } |
1046 | 6.41k | pDst0 += iDstStride; |
1047 | 6.41k | pRecSrc0 += (iRecStride<<1); |
1048 | 6.41k | } |
1049 | 768 | } |
1050 | 312 | else // width<=4 |
1051 | 312 | { |
1052 | 312 | __m128i vzero = _mm_set1_epi8(0); |
1053 | 312 | __m128i vfour = _mm_set1_epi32(4); |
1054 | | |
1055 | 2.60k | for( int y = 0; y < height; y++ ) |
1056 | 2.28k | { |
1057 | 2.28k | __m128i vsrc_l = _mm_loadu_si128((__m128i*)&pRecSrc0[-1]); // -1 0 1 2 3 4 5 6 |
1058 | 2.28k | __m128i vsrc = _mm_loadu_si128((__m128i*)&pRecSrc0[0]); // 0 1 2 3 4 5 6 7 |
1059 | | |
1060 | 2.28k | __m128i vsrc01 = _mm_blend_epi16(vzero,vsrc_l,0x55); // -1 1 3 5 32 Bit |
1061 | 2.28k | __m128i vsrc0 = _mm_blend_epi16(vzero,vsrc,0x55); // 0 2 4 6 32 Bit |
1062 | 2.28k | __m128i vsrc10 = _mm_blend_epi16(vzero,vsrc,0xAA); // 1 3 5 7 32 Bit |
1063 | 2.28k | vsrc10 = _mm_srli_epi32(vsrc10,16); // 1 3 5 7 32 Bit |
1064 | 2.28k | vsrc0 = _mm_slli_epi32 (vsrc0,1); // 0 2 4 6 *2 |
1065 | | |
1066 | 2.28k | vsrc0 = _mm_add_epi32(vsrc0,vsrc10); |
1067 | 2.28k | __m128i vdst0 = _mm_add_epi32(vsrc0,vsrc01); // dst 0 1 2 3 32 Bit, untere Zeile fehlt noch |
1068 | | |
1069 | | // jetzt die nächste Zeile dazu |
1070 | 2.28k | vsrc_l = _mm_loadu_si128((__m128i*)&pRecSrc0[iRecStride-1]); // -1 0 1 2 3 4 5 6 |
1071 | 2.28k | vsrc = _mm_loadu_si128((__m128i*)&pRecSrc0[iRecStride]); // 0 1 2 3 4 5 6_mm_storeu_si32 7 |
1072 | | |
1073 | 2.28k | vsrc01 = _mm_blend_epi16(vzero,vsrc_l,0x55); // -1 1 3 5 32 Bit |
1074 | 2.28k | vsrc0 = _mm_blend_epi16(vzero,vsrc,0x55); // 0 2 4 6 32 Bit |
1075 | 2.28k | vsrc10 = _mm_blend_epi16(vzero,vsrc,0xAA); // 1 3 5 7 32 Bit |
1076 | 2.28k | vsrc10 = _mm_srli_epi32(vsrc10,16); // 1 3 5 7 32 Bit |
1077 | 2.28k | vsrc0 = _mm_slli_epi32 (vsrc0,1); // 0 2 4 6 *2 |
1078 | | |
1079 | 2.28k | vsrc0 = _mm_add_epi32(vsrc0,vsrc10); |
1080 | 2.28k | __m128i vdst01 = _mm_add_epi32(vsrc0,vsrc01); // dst 0 1 2 3 32 Bit, untere Zeile |
1081 | | |
1082 | | |
1083 | 2.28k | vdst0 = _mm_add_epi32(vdst0,vdst01); |
1084 | 2.28k | vdst0 = _mm_add_epi32(vdst0,vfour); |
1085 | 2.28k | vdst0 = _mm_srli_epi32(vdst0,3); |
1086 | 2.28k | vdst0 = _mm_packus_epi32 (vdst0,vdst0); // 16 bit |
1087 | | |
1088 | 2.28k | if (width==4) |
1089 | 2.28k | _mm_storeu_si64(( __m128i * )&pDst0[0], (vdst0) ); |
1090 | 0 | else if (width==2) |
1091 | 0 | _mm_storeu_si32(( __m128i * )&pDst0[0], (vdst0) ); |
1092 | 0 | else |
1093 | 0 | { |
1094 | 0 | int tmp = _mm_cvtsi128_si32(vdst0); |
1095 | 0 | pDst0[0] = (Pel) tmp; |
1096 | 0 | } |
1097 | | |
1098 | 2.28k | pDst0 += iDstStride; |
1099 | 2.28k | pRecSrc0 += (iRecStride<<1); |
1100 | 2.28k | } |
1101 | 312 | } |
1102 | 2.86k | } Unexecuted instantiation: void vvdec::GetLumaRecPixel420SIMD<(vvdec::x86_simd::X86_VEXT)1>(int, int, short const*, long, short*, long) void vvdec::GetLumaRecPixel420SIMD<(vvdec::x86_simd::X86_VEXT)4>(int, int, short const*, long, short*, long) Line | Count | Source | 891 | 2.86k | { | 892 | 2.86k | #ifdef USE_AVX2 | 893 | 2.86k | if( ( width & 15 ) == 0 ) // width>=16 | 894 | | // if( 0 ) // width>=16 | 895 | 1.78k | { | 896 | 1.78k | __m256i vzero = _mm256_set1_epi8(0); | 897 | 1.78k | __m256i vfour = _mm256_set1_epi32(4); | 898 | 27.3k | for( int y = 0; y < height; y++ ) | 899 | 25.5k | { | 900 | 64.9k | for( int x = 0; x < width; x += 16 ) | 901 | 39.3k | { | 902 | 39.3k | int x2=x<<1; | 903 | 39.3k | __m256i vsrc_l = _mm256_loadu_si256((__m256i*)&pRecSrc0[x2-1]); // -1 0 1 2 3 4 5 6 | 904 | 39.3k | __m256i vsrc = _mm256_loadu_si256((__m256i*)&pRecSrc0[x2]); // 0 1 2 3 4 5 6 7 | 905 | | | 906 | 39.3k | __m256i vsrc01 = _mm256_blend_epi16(vzero,vsrc_l,0x55); // -1 1 3 5 32 Bit | 907 | 39.3k | __m256i vsrc0 = _mm256_blend_epi16(vzero,vsrc,0x55); // 0 2 4 6 32 Bit | 908 | 39.3k | __m256i vsrc10 = _mm256_blend_epi16(vzero,vsrc,0xAA); // 1 3 5 7 32 Bit | 909 | 39.3k | vsrc10 = _mm256_srli_epi32(vsrc10,16); // 1 3 5 7 32 Bit | 910 | 39.3k | vsrc0 = _mm256_slli_epi32 (vsrc0,1); // 0 2 4 6 *2 | 911 | | | 912 | 39.3k | vsrc0 = _mm256_add_epi32(vsrc0,vsrc10); | 913 | 39.3k | __m256i vdst0 = _mm256_add_epi32(vsrc0,vsrc01); // dst 0 1 2 3 32 Bit, untere Zeile fehlt noch | 914 | | | 915 | 39.3k | vsrc_l = _mm256_loadu_si256((__m256i*)&pRecSrc0[x2 +15]); // 7 8 9 10 11 12 13 14 | 916 | 39.3k | vsrc = _mm256_loadu_si256((__m256i*)&pRecSrc0[x2 + 16 ]); // 8 9 10 11 12 13 14 15 | 917 | | | 918 | 39.3k | x2+= (int)iRecStride; | 919 | | | 920 | 39.3k | vsrc01 = _mm256_blend_epi16(vzero,vsrc_l,0x55); | 921 | 39.3k | vsrc0 = _mm256_blend_epi16(vzero,vsrc,0x55); | 922 | 39.3k | vsrc10 = _mm256_blend_epi16(vzero,vsrc,0xAA); | 923 | 39.3k | vsrc10 = 
_mm256_srli_epi32(vsrc10,16); // 1 3 5 7 32 Bit | 924 | 39.3k | vsrc0 = _mm256_slli_epi32 (vsrc0,1); // 0 2 4 6 *2 | 925 | | | 926 | 39.3k | vsrc0 = _mm256_add_epi32(vsrc0,vsrc10); | 927 | 39.3k | __m256i vdst1 = _mm256_add_epi32(vsrc0,vsrc01); // dst 4 5 6 7 32 Bit, untere Zeile fehlt noch | 928 | | | 929 | | // jetzt die nächste Zeile dazu | 930 | 39.3k | vsrc_l = _mm256_loadu_si256((__m256i*)&pRecSrc0[x2-1]); // -1 0 1 2 3 4 5 6 | 931 | 39.3k | vsrc = _mm256_loadu_si256((__m256i*)&pRecSrc0[x2]); // 0 1 2 3 4 5 6 7 | 932 | | | 933 | 39.3k | vsrc01 = _mm256_blend_epi16(vzero,vsrc_l,0x55); // -1 1 3 5 32 Bit | 934 | 39.3k | vsrc0 = _mm256_blend_epi16(vzero,vsrc,0x55); // 0 2 4 6 32 Bit | 935 | 39.3k | vsrc10 = _mm256_blend_epi16(vzero,vsrc,0xAA); // 1 3 5 7 32 Bit | 936 | 39.3k | vsrc10 = _mm256_srli_epi32(vsrc10,16); // 1 3 5 7 32 Bit | 937 | 39.3k | vsrc0 = _mm256_slli_epi32 (vsrc0,1); // 0 2 4 6 *2 | 938 | | | 939 | 39.3k | vsrc0 = _mm256_add_epi32(vsrc0,vsrc10); | 940 | 39.3k | __m256i vdst01 = _mm256_add_epi32(vsrc0,vsrc01); // dst 0 1 2 3 32 Bit, untere Zeile | 941 | | | 942 | 39.3k | vsrc_l = _mm256_loadu_si256((__m256i*)&pRecSrc0[x2 + 15]); // 7 8 9 10 11 12 13 14 | 943 | 39.3k | vsrc = _mm256_loadu_si256((__m256i*)&pRecSrc0[x2 + 16 ]); // 8 9 10 11 12 13 14 15 | 944 | | | 945 | 39.3k | vsrc01 = _mm256_blend_epi16(vzero,vsrc_l,0x55); | 946 | 39.3k | vsrc0 = _mm256_blend_epi16(vzero,vsrc,0x55); | 947 | 39.3k | vsrc10 = _mm256_blend_epi16(vzero,vsrc,0xAA); | 948 | 39.3k | vsrc10 = _mm256_srli_epi32(vsrc10,16); // 1 3 5 7 32 Bit | 949 | 39.3k | vsrc0 = _mm256_slli_epi32 (vsrc0,1); // 0 2 4 6 *2 | 950 | | | 951 | 39.3k | vsrc0 = _mm256_add_epi32(vsrc0,vsrc10); | 952 | 39.3k | __m256i vdst11 = _mm256_add_epi32(vsrc0,vsrc01); // dst 4 5 6 7 32 Bit, untere Zeile | 953 | | | 954 | 39.3k | vdst0 = _mm256_add_epi32(vdst0,vdst01); | 955 | 39.3k | vdst1 = _mm256_add_epi32(vdst1,vdst11); | 956 | 39.3k | vdst0 = _mm256_add_epi32(vdst0,vfour); | 957 | 39.3k | vdst1 = 
_mm256_add_epi32(vdst1,vfour); | 958 | 39.3k | vdst0 = _mm256_srli_epi32(vdst0,3); | 959 | 39.3k | vdst1 = _mm256_srli_epi32(vdst1,3); | 960 | 39.3k | vdst0 = _mm256_packus_epi32 (vdst0,vdst1); // 16 bit | 961 | 39.3k | vdst0 = _mm256_permute4x64_epi64(vdst0,0xd8); | 962 | | | 963 | 39.3k | _mm256_storeu_si256((__m256i*)&pDst0[x], vdst0); | 964 | | // _mm_storeu_si128((__m128i*)&pDstTmp[x], vdst0); | 965 | 39.3k | } | 966 | 25.5k | pDst0 += iDstStride; | 967 | 25.5k | pRecSrc0 += (iRecStride<<1); | 968 | 25.5k | } | 969 | 1.78k | } | 970 | 1.08k | else | 971 | 1.08k | #endif | 972 | 1.08k | if( ( width & 7 ) == 0 ) // width>=8 | 973 | 768 | { | 974 | 768 | __m128i vzero = _mm_set1_epi8(0); | 975 | 768 | __m128i vfour = _mm_set1_epi32(4); | 976 | | | 977 | | | 978 | 7.18k | for( int y = 0; y < height; y++ ) | 979 | 6.41k | { | 980 | | | 981 | 12.8k | for( int x = 0; x < width; x += 8 ) | 982 | 6.41k | { | 983 | 6.41k | int x2=x<<1; | 984 | 6.41k | __m128i vsrc_l = _mm_loadu_si128((__m128i*)&pRecSrc0[x2-1]); // -1 0 1 2 3 4 5 6 | 985 | 6.41k | __m128i vsrc = _mm_loadu_si128((__m128i*)&pRecSrc0[x2]); // 0 1 2 3 4 5 6 7 | 986 | | | 987 | 6.41k | __m128i vsrc01 = _mm_blend_epi16(vzero,vsrc_l,0x55); // -1 1 3 5 32 Bit | 988 | 6.41k | __m128i vsrc0 = _mm_blend_epi16(vzero,vsrc,0x55); // 0 2 4 6 32 Bit | 989 | 6.41k | __m128i vsrc10 = _mm_blend_epi16(vzero,vsrc,0xAA); // 1 3 5 7 32 Bit | 990 | 6.41k | vsrc10 = _mm_srli_epi32(vsrc10,16); // 1 3 5 7 32 Bit | 991 | 6.41k | vsrc0 = _mm_slli_epi32 (vsrc0,1); // 0 2 4 6 *2 | 992 | | | 993 | 6.41k | vsrc0 = _mm_add_epi32(vsrc0,vsrc10); | 994 | 6.41k | __m128i vdst0 = _mm_add_epi32(vsrc0,vsrc01); // dst 0 1 2 3 32 Bit, untere Zeile fehlt noch | 995 | | | 996 | 6.41k | vsrc_l = _mm_loadu_si128((__m128i*)&pRecSrc0[x2 +7]); // 7 8 9 10 11 12 13 14 | 997 | 6.41k | vsrc = _mm_loadu_si128((__m128i*)&pRecSrc0[x2 + 8 ]); // 8 9 10 11 12 13 14 15 | 998 | | | 999 | 6.41k | x2+=(int)iRecStride; | 1000 | | | 1001 | 6.41k | vsrc01 = 
_mm_blend_epi16(vzero,vsrc_l,0x55); | 1002 | 6.41k | vsrc0 = _mm_blend_epi16(vzero,vsrc,0x55); | 1003 | 6.41k | vsrc10 = _mm_blend_epi16(vzero,vsrc,0xAA); | 1004 | 6.41k | vsrc10 = _mm_srli_epi32(vsrc10,16); // 1 3 5 7 32 Bit | 1005 | 6.41k | vsrc0 = _mm_slli_epi32 (vsrc0,1); // 0 2 4 6 *2 | 1006 | | | 1007 | 6.41k | vsrc0 = _mm_add_epi32(vsrc0,vsrc10); | 1008 | 6.41k | __m128i vdst1 = _mm_add_epi32(vsrc0,vsrc01); // dst 4 5 6 7 32 Bit, untere Zeile fehlt noch | 1009 | | | 1010 | | // jetzt die nächste Zeile dazu | 1011 | 6.41k | vsrc_l = _mm_loadu_si128((__m128i*)&pRecSrc0[x2-1]); // -1 0 1 2 3 4 5 6 | 1012 | 6.41k | vsrc = _mm_loadu_si128((__m128i*)&pRecSrc0[x2]); // 0 1 2 3 4 5 6 7 | 1013 | | | 1014 | 6.41k | vsrc01 = _mm_blend_epi16(vzero,vsrc_l,0x55); // -1 1 3 5 32 Bit | 1015 | 6.41k | vsrc0 = _mm_blend_epi16(vzero,vsrc,0x55); // 0 2 4 6 32 Bit | 1016 | 6.41k | vsrc10 = _mm_blend_epi16(vzero,vsrc,0xAA); // 1 3 5 7 32 Bit | 1017 | 6.41k | vsrc10 = _mm_srli_epi32(vsrc10,16); // 1 3 5 7 32 Bit | 1018 | 6.41k | vsrc0 = _mm_slli_epi32 (vsrc0,1); // 0 2 4 6 *2 | 1019 | | | 1020 | 6.41k | vsrc0 = _mm_add_epi32(vsrc0,vsrc10); | 1021 | 6.41k | __m128i vdst01 = _mm_add_epi32(vsrc0,vsrc01); // dst 0 1 2 3 32 Bit, untere Zeile | 1022 | | | 1023 | 6.41k | vsrc_l = _mm_loadu_si128((__m128i*)&pRecSrc0[x2 + 7]); // 7 8 9 10 11 12 13 14 | 1024 | 6.41k | vsrc = _mm_loadu_si128((__m128i*)&pRecSrc0[x2 + 8 ]); // 8 9 10 11 12 13 14 15 | 1025 | | | 1026 | 6.41k | vsrc01 = _mm_blend_epi16(vzero,vsrc_l,0x55); | 1027 | 6.41k | vsrc0 = _mm_blend_epi16(vzero,vsrc,0x55); | 1028 | 6.41k | vsrc10 = _mm_blend_epi16(vzero,vsrc,0xAA); | 1029 | 6.41k | vsrc10 = _mm_srli_epi32(vsrc10,16); // 1 3 5 7 32 Bit | 1030 | 6.41k | vsrc0 = _mm_slli_epi32 (vsrc0,1); // 0 2 4 6 *2 | 1031 | | | 1032 | 6.41k | vsrc0 = _mm_add_epi32(vsrc0,vsrc10); | 1033 | 6.41k | __m128i vdst11 = _mm_add_epi32(vsrc0,vsrc01); // dst 4 5 6 7 32 Bit, untere Zeile | 1034 | | | 1035 | 6.41k | vdst0 = 
_mm_add_epi32(vdst0,vdst01); | 1036 | 6.41k | vdst1 = _mm_add_epi32(vdst1,vdst11); | 1037 | 6.41k | vdst0 = _mm_add_epi32(vdst0,vfour); | 1038 | 6.41k | vdst1 = _mm_add_epi32(vdst1,vfour); | 1039 | 6.41k | vdst0 = _mm_srli_epi32(vdst0,3); | 1040 | 6.41k | vdst1 = _mm_srli_epi32(vdst1,3); | 1041 | 6.41k | vdst0 = _mm_packus_epi32 (vdst0,vdst1); // 16 bit | 1042 | | | 1043 | 6.41k | _mm_storeu_si128((__m128i*)&pDst0[x], vdst0); | 1044 | | // _mm_storeu_si128((__m128i*)&pDstTmp[x], vdst0); | 1045 | 6.41k | } | 1046 | 6.41k | pDst0 += iDstStride; | 1047 | 6.41k | pRecSrc0 += (iRecStride<<1); | 1048 | 6.41k | } | 1049 | 768 | } | 1050 | 312 | else // width<=4 | 1051 | 312 | { | 1052 | 312 | __m128i vzero = _mm_set1_epi8(0); | 1053 | 312 | __m128i vfour = _mm_set1_epi32(4); | 1054 | | | 1055 | 2.60k | for( int y = 0; y < height; y++ ) | 1056 | 2.28k | { | 1057 | 2.28k | __m128i vsrc_l = _mm_loadu_si128((__m128i*)&pRecSrc0[-1]); // -1 0 1 2 3 4 5 6 | 1058 | 2.28k | __m128i vsrc = _mm_loadu_si128((__m128i*)&pRecSrc0[0]); // 0 1 2 3 4 5 6 7 | 1059 | | | 1060 | 2.28k | __m128i vsrc01 = _mm_blend_epi16(vzero,vsrc_l,0x55); // -1 1 3 5 32 Bit | 1061 | 2.28k | __m128i vsrc0 = _mm_blend_epi16(vzero,vsrc,0x55); // 0 2 4 6 32 Bit | 1062 | 2.28k | __m128i vsrc10 = _mm_blend_epi16(vzero,vsrc,0xAA); // 1 3 5 7 32 Bit | 1063 | 2.28k | vsrc10 = _mm_srli_epi32(vsrc10,16); // 1 3 5 7 32 Bit | 1064 | 2.28k | vsrc0 = _mm_slli_epi32 (vsrc0,1); // 0 2 4 6 *2 | 1065 | | | 1066 | 2.28k | vsrc0 = _mm_add_epi32(vsrc0,vsrc10); | 1067 | 2.28k | __m128i vdst0 = _mm_add_epi32(vsrc0,vsrc01); // dst 0 1 2 3 32 Bit, untere Zeile fehlt noch | 1068 | | | 1069 | | // jetzt die nächste Zeile dazu | 1070 | 2.28k | vsrc_l = _mm_loadu_si128((__m128i*)&pRecSrc0[iRecStride-1]); // -1 0 1 2 3 4 5 6 | 1071 | 2.28k | vsrc = _mm_loadu_si128((__m128i*)&pRecSrc0[iRecStride]); // 0 1 2 3 4 5 6_mm_storeu_si32 7 | 1072 | | | 1073 | 2.28k | vsrc01 = _mm_blend_epi16(vzero,vsrc_l,0x55); // -1 1 3 5 32 Bit | 1074 | 2.28k | 
vsrc0 = _mm_blend_epi16(vzero,vsrc,0x55); // 0 2 4 6 32 Bit | 1075 | 2.28k | vsrc10 = _mm_blend_epi16(vzero,vsrc,0xAA); // 1 3 5 7 32 Bit | 1076 | 2.28k | vsrc10 = _mm_srli_epi32(vsrc10,16); // 1 3 5 7 32 Bit | 1077 | 2.28k | vsrc0 = _mm_slli_epi32 (vsrc0,1); // 0 2 4 6 *2 | 1078 | | | 1079 | 2.28k | vsrc0 = _mm_add_epi32(vsrc0,vsrc10); | 1080 | 2.28k | __m128i vdst01 = _mm_add_epi32(vsrc0,vsrc01); // dst 0 1 2 3 32 Bit, untere Zeile | 1081 | | | 1082 | | | 1083 | 2.28k | vdst0 = _mm_add_epi32(vdst0,vdst01); | 1084 | 2.28k | vdst0 = _mm_add_epi32(vdst0,vfour); | 1085 | 2.28k | vdst0 = _mm_srli_epi32(vdst0,3); | 1086 | 2.28k | vdst0 = _mm_packus_epi32 (vdst0,vdst0); // 16 bit | 1087 | | | 1088 | 2.28k | if (width==4) | 1089 | 2.28k | _mm_storeu_si64(( __m128i * )&pDst0[0], (vdst0) ); | 1090 | 0 | else if (width==2) | 1091 | 0 | _mm_storeu_si32(( __m128i * )&pDst0[0], (vdst0) ); | 1092 | 0 | else | 1093 | 0 | { | 1094 | 0 | int tmp = _mm_cvtsi128_si32(vdst0); | 1095 | 0 | pDst0[0] = (Pel) tmp; | 1096 | 0 | } | 1097 | | | 1098 | 2.28k | pDst0 += iDstStride; | 1099 | 2.28k | pRecSrc0 += (iRecStride<<1); | 1100 | 2.28k | } | 1101 | 312 | } | 1102 | 2.86k | } |
|
1103 | | |
1104 | | |
1105 | | |
1106 | | template<X86_VEXT vext> |
1107 | | void IntraPrediction::_initIntraPredictionX86() |
1108 | 9.18k | { |
1109 | 9.18k | IntraPredAngleCore4 = IntraPredAngleCore_SIMD<vext, 4>; |
1110 | 9.18k | IntraPredAngleCore8 = IntraPredAngleCore_SIMD<vext, 8>; |
1111 | 9.18k | IntraPredAngleChroma4 = IntraPredAngleChroma_SIMD<vext, 4>; |
1112 | 9.18k | IntraPredAngleChroma8 = IntraPredAngleChroma_SIMD<vext, 8>; |
1113 | | |
1114 | 9.18k | IntraPredSampleFilter8 = IntraPredSampleFilter_SIMD<vext, 8>; |
1115 | 9.18k | IntraPredSampleFilter16 = IntraPredSampleFilter_SIMD<vext, 16>; |
1116 | | |
1117 | 9.18k | xPredIntraPlanar = xPredIntraPlanar_SIMD<vext>; |
1118 | | |
1119 | 9.18k | GetLumaRecPixel420 = GetLumaRecPixel420SIMD<vext>; |
1120 | | |
1121 | 9.18k | } Unexecuted instantiation: void vvdec::IntraPrediction::_initIntraPredictionX86<(vvdec::x86_simd::X86_VEXT)1>() void vvdec::IntraPrediction::_initIntraPredictionX86<(vvdec::x86_simd::X86_VEXT)4>() Line | Count | Source | 1108 | 9.18k | { | 1109 | 9.18k | IntraPredAngleCore4 = IntraPredAngleCore_SIMD<vext, 4>; | 1110 | 9.18k | IntraPredAngleCore8 = IntraPredAngleCore_SIMD<vext, 8>; | 1111 | 9.18k | IntraPredAngleChroma4 = IntraPredAngleChroma_SIMD<vext, 4>; | 1112 | 9.18k | IntraPredAngleChroma8 = IntraPredAngleChroma_SIMD<vext, 8>; | 1113 | | | 1114 | 9.18k | IntraPredSampleFilter8 = IntraPredSampleFilter_SIMD<vext, 8>; | 1115 | 9.18k | IntraPredSampleFilter16 = IntraPredSampleFilter_SIMD<vext, 16>; | 1116 | | | 1117 | 9.18k | xPredIntraPlanar = xPredIntraPlanar_SIMD<vext>; | 1118 | | | 1119 | 9.18k | GetLumaRecPixel420 = GetLumaRecPixel420SIMD<vext>; | 1120 | | | 1121 | 9.18k | } |
|
1122 | | template void IntraPrediction::_initIntraPredictionX86<SIMDX86>(); |
1123 | | |
1124 | | #endif // TARGET_SIMD_X86 |
1125 | | #endif |
1126 | | |
1127 | | } |