/src/vvdec/source/Lib/FilmGrain/FilmGrainImpl_X86_SIMD.h
/* -----------------------------------------------------------------------------
The copyright in this software is being made available under the Clear BSD
License, included below. No patent rights, trademark rights and/or
other Intellectual Property Rights other than the copyrights concerning
the Software are granted under this license.

The Clear BSD License

Copyright (c) 2018-2026, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVdeC Authors.
All rights reserved.

Redistribution and use in source and binary forms, with or without modification,
are permitted (subject to the limitations in the disclaimer below) provided that
the following conditions are met:

     * Redistributions of source code must retain the above copyright notice,
     this list of conditions and the following disclaimer.

     * Redistributions in binary form must reproduce the above copyright
     notice, this list of conditions and the following disclaimer in the
     documentation and/or other materials provided with the distribution.

     * Neither the name of the copyright holder nor the names of its
     contributors may be used to endorse or promote products derived from this
     software without specific prior written permission.

NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
------------------------------------------------------------------------------------------- */

#include "FilmGrainImplX86.h"

#include <algorithm>

#include <CommonDef.h>

#if defined(TARGET_SIMD_X86) && ENABLE_SIMD_OPT
#include <CommonDefX86.h>

namespace vvdec
{
using namespace x86_simd;

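// Computes one block of film-grain samples and per-sample scaling factors for
// component c. Luma (c == 0) produces 16 values into grain[0][16..31] /
// scale[0][16..31]; chroma produces 16/subx values into grain[c][8..15] /
// scale[c][8..15]. The grain pattern is read at offset (oy, ox) and negated
// when s == -1; when oc1 is non-zero, the pattern of the block above (offset
// (oy_up, ox_up), sign s_up) is blended in with the overlap coefficients
// oc1/oc2. The scale is looked up in sLUT from the input intensity at column
// x (right-shifted by bs for bit depths above 8).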
template<>
void FilmGrainImplX86<CURR_X86_VEXT>::make_grain_pattern( const void* I,
                                                          int c,
                                                          int x,
                                                          int subx,
                                                          uint8_t oc1,
                                                          uint8_t oc2,
                                                          uint8_t ox,
                                                          uint8_t ox_up,
                                                          uint8_t oy,
                                                          uint8_t oy_up,
                                                          int s,
                                                          int s_up,
                                                          int16_t grain[3][32],
                                                          uint8_t scale[3][32] ) const
{
  const uint8_t*  I8  = (const uint8_t*) I;
  const uint16_t* I16 = (const uint16_t*) I;
  if( allZero[c] == 1 )
  {
    if( c == 0 )
    {
      __m128i vP = _mm_lddqu_si128( (__m128i*) &pattern[0][0][oy][ox] );
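      // s == -1 selects a negated instance of the pattern; computing 0 - p
      // here folds the sign in once, so later stages only add signed grain.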
      if( s == -1 )
      {
        vP = _mm_sub_epi8( _mm_set1_epi8( 0 ), vP );
      }
#ifdef USE_AVX2
      __m256i vmask = _mm256_set1_epi32( 0xff );
      __m128i tmp0;
      __m128i tmp1;
      __m256i vintensity;
      if( bs )
      {
        vintensity = _mm256_lddqu_si256( (__m256i*) &I16[x] );   // load 16 16 bit values
        vintensity = _mm256_sra_epi16( vintensity, _mm_set_epi32( 0, 0, 0, bs ) );
        tmp0       = _mm256_extracti128_si256( vintensity, 0 );
        tmp1       = _mm256_extracti128_si256( vintensity, 1 );
      }
      else
      {
        __m128i vintensity128 = _mm_lddqu_si128( (__m128i*) &I8[x] );   // load 16 8 bit values
        tmp0 = _mm_cvtepi8_epi16( vintensity128 );
        tmp1 = _mm_cvtepi8_epi16( _mm_bsrli_si128( vintensity128, 8 ) );
        tmp0 = _mm_and_si128( tmp0, _mm_set1_epi16( 0xff ) );   // only 8 bit
        tmp1 = _mm_and_si128( tmp1, _mm_set1_epi16( 0xff ) );   // only 8 bit
        vintensity = _mm256_castsi128_si256( vintensity128 );
      }
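      // Widen the 16 intensities to two vectors of 32-bit lanes; they are
      // used below as byte-offset indices for the sLUT gathers.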
      __m256i vindex0 = _mm256_cvtepi16_epi32( tmp0 );
      __m256i vindex1 = _mm256_cvtepi16_epi32( tmp1 );

      __m256i avP = _mm256_cvtepi8_epi16( vP );
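      // Vertical overlap with the block above: blend the two patterns as
      //   P = ( P * oc1 + P_up * oc2 + ( 1 << 4 ) ) >> 5
      // i.e. the scalar round( P * oc1 + P_up * oc2, 5 ).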
      if( oc1 )
      {
        __m256i avoc1 = _mm256_set1_epi16( oc1 );
        __m256i avoc2 = _mm256_set1_epi16( oc2 );
        // p*oc1
        avP = _mm256_mullo_epi16( avP, avoc1 );   // max 16 Bit
        // pattern * s_up
        __m128i vP2 = _mm_lddqu_si128( (__m128i*) &pattern[0][0][oy_up][ox_up] );
        if( s_up == -1 )
        {
          vP2 = _mm_sub_epi8( _mm_set1_epi8( 0 ), vP2 );
        }
        __m256i avP2 = _mm256_cvtepi8_epi16( vP2 );
        // * oc2
        avP2 = _mm256_mullo_epi16( avP2, avoc2 );
        // add
        avP = _mm256_add_epi16( avP, avP2 );
        // round to 16 bit
        __m256i avadd   = _mm256_set1_epi16( 1 << ( 5 - 1 ) );
        __m128i avshift = _mm_set_epi16( 0, 0, 0, 0, 0, 0, 0, 5 );
        avP = _mm256_add_epi16( avP, avadd );
        avP = _mm256_sra_epi16( avP, avshift );
      }
      _mm256_storeu_si256( (__m256i*) &grain[c][16], avP );

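      // Per-sample scale lookup: each 32-bit gather lane fetches 4 bytes from
      // sLUT, so mask to the low byte, then pack 32 -> 16 -> 8 bit (the
      // permutes undo AVX2's per-128-bit-lane packing) to store 16 contiguous
      // scale bytes.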
      __m256i vscale0 = _mm256_i32gather_epi32( (int*) &sLUT[0][0], vindex0, 1 );   // load 8 32 bit values
      __m256i vscale1 = _mm256_i32gather_epi32( (int*) &sLUT[0][0], vindex1, 1 );   // load 8 32 bit values

      vscale0 = _mm256_and_si256( vscale0, vmask );
      vscale1 = _mm256_and_si256( vscale1, vmask );

      vintensity = _mm256_packus_epi32( vscale0, vscale1 );
      vscale0    = _mm256_permute4x64_epi64( vintensity, 0x8 );
      vscale1    = _mm256_permute4x64_epi64( vintensity, 0xd );
      vscale0    = _mm256_packus_epi16( vscale0, vscale1 );
      _mm_storeu_si128( (__m128i*) &scale[0][16], _mm256_castsi256_si128( vscale0 ) );
#else
      __m128i vPlo = _mm_cvtepi8_epi16( vP );
      __m128i vPhi = _mm_cvtepi8_epi16( _mm_bsrli_si128( vP, 8 ) );
      if( oc1 )
      {
        __m128i voc1 = _mm_set1_epi16( oc1 );
        __m128i voc2 = _mm_set1_epi16( oc2 );
        // p*oc1
        vPlo = _mm_mullo_epi16( vPlo, voc1 );   // max 16 Bit
        vPhi = _mm_mullo_epi16( vPhi, voc1 );
        // pattern * s_up
        __m128i vP2 = _mm_lddqu_si128( (__m128i*) &pattern[0][0][oy_up][ox_up] );
        if( s_up == -1 )
        {
          vP2 = _mm_sub_epi8( _mm_set1_epi8( 0 ), vP2 );
        }
        __m128i vP2lo = _mm_cvtepi8_epi16( vP2 );
        __m128i vP2hi = _mm_cvtepi8_epi16( _mm_bsrli_si128( vP2, 8 ) );
        // * oc2
        vP2lo = _mm_mullo_epi16( vP2lo, voc2 );
        vP2hi = _mm_mullo_epi16( vP2hi, voc2 );
        // add
        vPlo = _mm_add_epi16( vPlo, vP2lo );
        vPhi = _mm_add_epi16( vPhi, vP2hi );
        // round to 16 bit
        __m128i vadd   = _mm_set1_epi16( 1 << ( 5 - 1 ) );
        __m128i vshift = _mm_set_epi16( 0, 0, 0, 0, 0, 0, 0, 5 );
        vPlo = _mm_add_epi16( vPlo, vadd );
        vPhi = _mm_add_epi16( vPhi, vadd );
        vPlo = _mm_sra_epi16( vPlo, vshift );
        vPhi = _mm_sra_epi16( vPhi, vshift );
      }
      _mm_storeu_si128( (__m128i*) &grain[c][16], vPlo );
      _mm_storeu_si128( (__m128i*) &grain[c][16 + 8], vPhi );
      // Scale sign already integrated above because of overlap
      //scale[0][16+i] = sLUT[0][intensity];
      uint8_t        intensity;
      uint8_t*       pscale = &scale[0][16];
      const uint8_t* pLUT   = sLUT[0];
      if( bs )
      {
        const uint16_t* pI16 = I16 + x;
        for( int i = 0; i < 16; i++ )
        {
          intensity = *pI16++ >> bs;
          *pscale++ = pLUT[intensity];
        }
      }
      else
      {
        const uint8_t* pI8 = I8 + x;
        for( int i = 0; i < 16; i++ )
        {
          intensity = *pI8++;
          *pscale++ = pLUT[intensity];
        }
      }
#endif
    }   // Y
    else
    {   // U/V
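      // Chroma with fixed pattern index 0: this path produces 8 samples into
      // grain[c][8..15] / scale[c][8..15] (the x >> 1 addressing assumes 2:1
      // horizontal subsampling, i.e. subx == 2).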
      __m128i vP;
#ifdef USE_AVX2
      __m256i vindex;
      __m128i vintensity;
      if( bs )
      {
        vintensity = _mm_lddqu_si128( (__m128i*) &I16[x >> 1] );   // load 8 16 bit values
        vintensity = _mm_sra_epi16( vintensity, _mm_set_epi32( 0, 0, 0, bs ) );
      }
      else
      {
        vintensity = _mm_loadu_si64( &I8[x >> 1] );   // load 8 8 bit values
        vintensity = _mm_cvtepi8_epi16( vintensity );
        vintensity = _mm_and_si128( vintensity, _mm_set1_epi16( 0xff ) );   // only 8 bit
      }
      vindex = _mm256_cvtepi16_epi32( vintensity );
#endif
      vP = _mm_loadu_si64( (__m128i*) &pattern[1][0][oy][ox] );

      if( s == -1 )
      {
        vP = _mm_sub_epi8( _mm_set1_epi8( 0 ), vP );
      }
      __m128i vPlo = _mm_cvtepi8_epi16( vP );
      if( oc1 )
      {
        __m128i voc1 = _mm_set1_epi16( oc1 );
        __m128i voc2 = _mm_set1_epi16( oc2 );
        // p*oc1
        vPlo = _mm_mullo_epi16( vPlo, voc1 );   // max 16 Bit
        // pattern * s_up
        __m128i vP2 = _mm_loadu_si64( (__m128i*) &pattern[c ? 1 : 0][0][oy_up][ox_up] );
        if( s_up == -1 )
        {
          vP2 = _mm_sub_epi8( _mm_set1_epi8( 0 ), vP2 );
        }
        __m128i vP2lo = _mm_cvtepi8_epi16( vP2 );
        // * oc2
        vP2lo = _mm_mullo_epi16( vP2lo, voc2 );
        // add
        vPlo = _mm_add_epi16( vPlo, vP2lo );
        // round to 16 bit
        __m128i vadd   = _mm_set1_epi16( 1 << ( 5 - 1 ) );
        __m128i vshift = _mm_set_epi16( 0, 0, 0, 0, 0, 0, 0, 5 );
        vPlo = _mm_add_epi16( vPlo, vadd );
        vPlo = _mm_sra_epi16( vPlo, vshift );
      }
      _mm_storeu_si128( (__m128i*) &grain[c][8], vPlo );
#ifdef USE_AVX2
      __m256i vmask  = _mm256_set1_epi32( 0xff );
      __m256i vscale = _mm256_i32gather_epi32( (int*) &sLUT[c][0], vindex, 1 );   // load 8 32 bit values
      vscale = _mm256_and_si256( vscale, vmask );

      vmask  = _mm256_packus_epi32( vscale, vscale );
      vscale = _mm256_permute4x64_epi64( vmask, 0x8 );
      vscale = _mm256_packus_epi16( vscale, vscale );
      _mm_storeu_si64( (__m128i*) &scale[c][8], _mm256_castsi256_si128( vscale ) );
#else
      uint8_t*       pscale = &scale[c][8];
      const uint8_t* pLUT   = sLUT[c];
      if( bs )
      {
        const uint16_t* pI16 = &I16[x >> 1];
        for( int i = 0; i < 8; i++ )
        {
          uint8_t intensity = *pI16++ >> bs;
          *pscale++ = pLUT[intensity];
        }
      }
      else
      {
        const uint8_t* pI8 = &I8[x >> 1];
        for( int i = 0; i < 8; i++ )
        {
          uint8_t intensity = *pI8++;
          *pscale++ = pLUT[intensity];
        }
      }
#endif
    }
  }
#ifdef USE_AVX2
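  // AVX2-only chroma path for allZero[c] == 0: the pattern index varies per
  // sample, so the pattern bytes are fetched with gathers driven by
  // pLUT[c][intensity] instead of one contiguous load.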
  else if( c > 0 && allZero[c] == 0 )
  {
    __m128i vP;
    __m128i vintensity;
    __m256i vindex;
    __m256i vmask = _mm256_set1_epi32( 0xff );
    if( bs )
    {
      vintensity = _mm_lddqu_si128( (__m128i*) &I16[x >> 1] );   // load 8 16 bit values
      vintensity = _mm_sra_epi16( vintensity, _mm_set_epi32( 0, 0, 0, bs ) );
    }
    else
    {
      vintensity = _mm_loadu_si64( &I8[x >> 1] );   // load 8 8 bit values
      vintensity = _mm_cvtepi8_epi16( vintensity );
    }
    vindex = _mm256_cvtepi16_epi32( vintensity );
    vindex = _mm256_and_si256( vindex, vmask );   // only 8 bit

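    // Build per-lane byte offsets into the pattern: pLUT[c] keeps the pattern
    // index in its upper 4 bits, so ( pLUT << 8 ) == ( pLUT >> 4 ) << 12,
    // i.e. a 4096-byte stride per pattern; this assumes the fractional low 4
    // bits are zero (PATTERN_INTERPOLATION disabled). vadd then adds each
    // lane's column offset 0..7.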
    __m256i vadd = _mm256_set_epi32( 7, 6, 5, 4, 3, 2, 1, 0 );
    __m256i vpi  = _mm256_i32gather_epi32( (int*) &pLUT[c][0], vindex, 1 );   // load 8 32 bit values
    vpi = _mm256_and_si256( vpi, vmask );   // only 8 bit
    vpi = _mm256_slli_epi32( vpi, 8 );      // 12-4
    vpi = _mm256_add_epi32( vpi, vadd );
    __m256i avP = _mm256_i32gather_epi32( (int*) &pattern[1][0][oy][ox], vpi, 1 );   // load 8 32 bit values
    avP = _mm256_and_si256( avP, vmask );   // only 8 bit
    // convert to packed 8 bit
    __m256i vtmp = _mm256_packus_epi32( avP, avP );
    avP = _mm256_permute4x64_epi64( vtmp, 0x8 );
    avP = _mm256_packus_epi16( avP, avP );
    vP  = _mm256_castsi256_si128( avP );
    if( s == -1 )
    {
      vP = _mm_sub_epi8( _mm_set1_epi8( 0 ), vP );
    }
    __m128i vPlo = _mm_cvtepi8_epi16( vP );
    if( oc1 )
    {
      __m128i voc1 = _mm_set1_epi16( oc1 );
      __m128i voc2 = _mm_set1_epi16( oc2 );
      // p*oc1
      vPlo = _mm_mullo_epi16( vPlo, voc1 );   // max 16 Bit
      // pattern * s_up
      __m256i avP2 = _mm256_i32gather_epi32( (int*) &pattern[1][0][oy_up][ox_up], vpi, 1 );   // load 8 32 bit values
      avP2 = _mm256_and_si256( avP2, vmask );   // only 8 bit
      // convert to packed 8 bit
      vtmp = _mm256_packus_epi32( avP2, avP2 );
      avP2 = _mm256_permute4x64_epi64( vtmp, 0x8 );
      avP2 = _mm256_packus_epi16( avP2, avP2 );
      __m128i vP2 = _mm256_castsi256_si128( avP2 );
      if( s_up == -1 )
      {
        vP2 = _mm_sub_epi8( _mm_set1_epi8( 0 ), vP2 );
      }
      __m128i vP2lo = _mm_cvtepi8_epi16( vP2 );
      vP2lo = _mm_mullo_epi16( vP2lo, voc2 );
      vPlo  = _mm_add_epi16( vPlo, vP2lo );
      // round to 16 bit
      __m128i vadd   = _mm_set1_epi16( 1 << ( 5 - 1 ) );
      __m128i vshift = _mm_set_epi16( 0, 0, 0, 0, 0, 0, 0, 5 );
      vPlo = _mm_add_epi16( vPlo, vadd );
      vPlo = _mm_sra_epi16( vPlo, vshift );
    }
    _mm_storeu_si128( (__m128i*) &grain[c][8], vPlo );
    __m256i vscale = _mm256_i32gather_epi32( (int*) &sLUT[c][0], vindex, 1 );   // load 8 32 bit values
    vscale = _mm256_and_si256( vscale, vmask );
    vmask  = _mm256_packus_epi32( vscale, vscale );
    vscale = _mm256_permute4x64_epi64( vmask, 0x8 );
    vscale = _mm256_packus_epi16( vscale, vscale );
    _mm_storeu_si64( (__m128i*) &scale[c][8], _mm256_castsi256_si128( vscale ) );
  }
#endif
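  // Scalar fallback for the remaining cases (per-sample pattern indices not
  // covered by the SIMD paths above): the reference per-sample algorithm,
  // including the optional PATTERN_INTERPOLATION blend between adjacent
  // patterns.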
  else
  {
    for( int i = 0; i < 16 / subx; i++ )
    {
      uint8_t intensity = bs ? I16[x / subx + i] >> bs : I8[x / subx + i];
      uint8_t pi = pLUT[c][intensity] >> 4;   // pattern index (integer part)
      int     P  = pattern[c ? 1 : 0][pi][oy][ox + i] * s;   // Pattern sample (from current pattern index)
      // We could consider just XORing the sign bit
#if PATTERN_INTERPOLATION
      uint8_t pf = pLUT[c][intensity] & 15;   // pattern index fractional part (interpolate with next) -- could restrict to fewer bits (e.g. 2)
      int     Pn = pattern[c ? 1 : 0][pi + 1][oy][ox + i] * s;   // Next-pattern sample (from pattern index+1)
      // But there are equivalent hw tricks, e.g. storing values as sign + amplitude instead of two's complement
#endif

      if( oc1 )   // overlap
      {
        P = round( P * oc1 + pattern[c ? 1 : 0][pi][oy_up][ox_up + i] * oc2 * s_up, 5 );
#if PATTERN_INTERPOLATION
        Pn = round( Pn * oc1 + pattern[c ? 1 : 0][pi + 1][oy_up][ox_up + i] * oc2 * s_up, 5 );
#endif
      }
#if PATTERN_INTERPOLATION
      // Pattern interpolation: P is current, Pn is next, pf is interpolation coefficient
      grain[c][16 / subx + i] = round( P * ( 16 - pf ) + Pn * pf, 4 );
#else
      grain[c][16 / subx + i] = P;
#endif
      // Scale sign already integrated above because of overlap
      scale[c][16 / subx + i] = sLUT[c][intensity];
    }
  }
}
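
// Scales the previous block's grain ( g = round( scale * grain, scale_shift ) ),
// adds it to the image samples and clamps to [ I_min, I_max ] (shifted by bs
// for bit depths above 8). A 3-tap deblock is applied across the boundary to
// the previous block first, then the current block is moved into the
// previous-block slot; the do/while runs one extra "flush" pass at the right
// picture edge to emit the last pending block.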
template<>
void FilmGrainImplX86<CURR_X86_VEXT>::scale_and_output( void* I, int c, int x, int subx, int width, int16_t grain[3][32], uint8_t scale[3][32] ) const
{
  uint8_t*  I8  = (uint8_t*) I;
  uint16_t* I16 = (uint16_t*) I;

  const uint8_t I_min = c ? C_min : Y_min;
  const uint8_t I_max = c ? C_max : Y_max;

  int flush = 0;
  do
  {
    if( x > 0 )
    {
      if( !flush )
      {
        // Horizontal deblock (across previous block)
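        // 3-tap [ 1 3 1 ] / 4 smoothing of the two grain samples adjacent to
        // the block edge:
        //   l0' = ( l1 + 3*l0 + r0 + 2 ) >> 2
        //   r0' = ( l0 + 3*r0 + r1 + 2 ) >> 2
        // computed below with per-lane multiplies and horizontal adds.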
        __m128i vgrain;
        __m128i vfac = _mm_set_epi16( 0, 0, 0, 1, 1, 3, 1, 1 );
        if( c == 0 )
        {
          vgrain = _mm_loadu_si64( (__m128i*) &grain[0][16 - 2] );   // r1 r0 l0 l1
        }
        else
        {
          vgrain = _mm_loadu_si64( (__m128i*) &grain[c][8 - 2] );   // r1 r0 l0 l1
        }
        __m128i vgrainh = _mm_mullo_epi16( vgrain, vfac );   // r1 3*r0 l0 l1
        vgrainh = _mm_srli_si128( vgrainh, 2 );              // r1 3*r0 l0
        vfac    = _mm_srli_si128( vfac, 2 );
        __m128i vgrainl = _mm_mullo_epi16( vgrain, vfac );   // r1 r0 3*l0 l1
        vgrainl = _mm_slli_si128( vgrainl, 10 );
        vgrainl = _mm_srli_si128( vgrainl, 10 );             // r0 3*l0 l1
        vgrainl = _mm_hadd_epi16( vgrainl, vgrainl );        // r0 3*l0+l1
        vgrainl = _mm_hadd_epi16( vgrainl, vgrainl );        // r0+3*l0+l1
        vgrainh = _mm_hadd_epi16( vgrainh, vgrainh );
        vgrainh = _mm_hadd_epi16( vgrainh, vgrainh );
        vgrainh = _mm_srli_si128( vgrainh, 2 );
        vgrain  = _mm_or_si128( vgrainl, vgrainh );
        vgrain  = _mm_add_epi16( vgrain, _mm_set_epi16( 0, 0, 0, 0, 0, 0, 2, 2 ) );
        vgrain  = _mm_srai_epi16( vgrain, 2 );
        if( c == 0 )
        {
          _mm_storeu_si32( (__m128i*) &grain[0][16 - 1], vgrain );
        }
        else
        {
          _mm_storeu_si32( (__m128i*) &grain[c][8 - 1], vgrain );
        }
      }
      if( bs )
      {
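        // High bit-depth path (bs = shift above 8 bit): samples are 16-bit
        // and the scale*grain products are widened to 32 bit before rounding,
        // adding and clamping.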
#ifdef USE_AVX2
        __m128i vshift = _mm_set_epi16( 0, 0, 0, 0, 0, 0, 0, scale_shift );
        if( c == 0 )
        {
          __m256i vadd    = _mm256_set1_epi32( 1 << ( scale_shift - 1 ) );
          __m256i vgrain  = _mm256_lddqu_si256( (__m256i*) &grain[0][0] );   // load 16 * 16 bit
          __m256i vscale  = _mm256_cvtepi8_epi16( _mm_lddqu_si128( (__m128i*) &scale[0][0] ) );
          __m256i tmplo   = _mm256_mullo_epi16( vscale, vgrain );
          __m256i tmphi   = _mm256_mulhi_epi16( vscale, vgrain );
          __m256i tmpgvlo = _mm256_unpacklo_epi16( tmplo, tmphi );   // 32 bit
          __m256i tmpgvhi = _mm256_unpackhi_epi16( tmplo, tmphi );
          // deinterleave
          __m256i gvlo = _mm256_permute2x128_si256( tmpgvlo, tmpgvhi, 0x20 );
          __m256i gvhi = _mm256_permute2x128_si256( tmpgvlo, tmpgvhi, 0x31 );
          // round
          gvlo = _mm256_add_epi32( gvlo, vadd );
          gvhi = _mm256_add_epi32( gvhi, vadd );
          gvlo = _mm256_sra_epi32( gvlo, vshift );
          gvhi = _mm256_sra_epi32( gvhi, vshift );
          __m256i vI16lo = _mm256_cvtepi16_epi32( _mm_lddqu_si128( (__m128i*) &I16[( x - 16 )] ) );
          __m256i vI16hi = _mm256_cvtepi16_epi32( _mm_lddqu_si128( (__m128i*) &I16[( x - 16 ) + 8] ) );
          vI16lo = _mm256_add_epi32( gvlo, vI16lo );
          vI16hi = _mm256_add_epi32( gvhi, vI16hi );
          vI16lo = _mm256_max_epi32( _mm256_set1_epi32( I_min << bs ), vI16lo );
          vI16hi = _mm256_max_epi32( _mm256_set1_epi32( I_min << bs ), vI16hi );
          vI16lo = _mm256_min_epi32( _mm256_set1_epi32( I_max << bs ), vI16lo );
          vI16hi = _mm256_min_epi32( _mm256_set1_epi32( I_max << bs ), vI16hi );
          vI16lo = _mm256_packs_epi32( vI16lo, vI16hi );
          vI16lo = _mm256_permute4x64_epi64( vI16lo, 0xd8 );
          _mm256_storeu_si256( (__m256i*) &I16[( x - 16 )], vI16lo );
        }
        else
        {
          __m128i vadd   = _mm_set1_epi32( 1 << ( scale_shift - 1 ) );
          __m128i vscale = _mm_lddqu_si128( (__m128i*) &scale[c] );
          __m128i vgrain = _mm_lddqu_si128( (__m128i*) &grain[c] );
          vscale = _mm_cvtepi8_epi16( vscale );   // 16 bit
          __m128i tmplo = _mm_mullo_epi16( vscale, vgrain );
          __m128i tmphi = _mm_mulhi_epi16( vscale, vgrain );
          __m128i gvlo  = _mm_unpacklo_epi16( tmplo, tmphi );   // 32 bit
          __m128i gvhi  = _mm_unpackhi_epi16( tmplo, tmphi );
          gvlo = _mm_add_epi32( gvlo, vadd );
          gvhi = _mm_add_epi32( gvhi, vadd );
          gvlo = _mm_sra_epi32( gvlo, vshift );
          gvhi = _mm_sra_epi32( gvhi, vshift );
          __m128i vI16lo = _mm_lddqu_si128( (__m128i*) &I16[( x - 16 ) / subx] );
          __m128i vI16hi = _mm_lddqu_si128( (__m128i*) &I16[( x - 16 ) / subx + 4] );
          vI16lo = _mm_cvtepi16_epi32( vI16lo );   // 32 bit
          vI16hi = _mm_cvtepi16_epi32( vI16hi );
          vI16lo = _mm_add_epi32( gvlo, vI16lo );
          vI16hi = _mm_add_epi32( gvhi, vI16hi );
          vI16lo = _mm_max_epi32( _mm_set1_epi32( I_min << bs ), vI16lo );
          vI16hi = _mm_max_epi32( _mm_set1_epi32( I_min << bs ), vI16hi );
          vI16lo = _mm_min_epi32( _mm_set1_epi32( I_max << bs ), vI16lo );
          vI16hi = _mm_min_epi32( _mm_set1_epi32( I_max << bs ), vI16hi );
          vI16lo = _mm_packs_epi32( vI16lo, vI16hi );
          _mm_storeu_si128( (__m128i*) &I16[( x - 16 ) / subx], vI16lo );
        }
#else   // !USE_AVX2
        __m128i vadd   = _mm_set1_epi32( 1 << ( scale_shift - 1 ) );
        __m128i vshift = _mm_set_epi16( 0, 0, 0, 0, 0, 0, 0, scale_shift );
        __m128i vscale = _mm_lddqu_si128( (__m128i*) &scale[c] );
        __m128i vgrain = _mm_lddqu_si128( (__m128i*) &grain[c] );
        vscale = _mm_cvtepi8_epi16( vscale );   // 16 bit
        __m128i tmplo = _mm_mullo_epi16( vscale, vgrain );
        __m128i tmphi = _mm_mulhi_epi16( vscale, vgrain );
        __m128i gvlo  = _mm_unpacklo_epi16( tmplo, tmphi );   // 32 bit
        __m128i gvhi  = _mm_unpackhi_epi16( tmplo, tmphi );
        gvlo = _mm_add_epi32( gvlo, vadd );
        gvhi = _mm_add_epi32( gvhi, vadd );
        gvlo = _mm_sra_epi32( gvlo, vshift );
        gvhi = _mm_sra_epi32( gvhi, vshift );
        __m128i vI16lo = _mm_lddqu_si128( (__m128i*) &I16[( x - 16 ) / subx] );
        __m128i vI16hi = _mm_lddqu_si128( (__m128i*) &I16[( x - 16 ) / subx + 4] );

        vI16lo = _mm_cvtepi16_epi32( vI16lo );   // 32 bit
        vI16hi = _mm_cvtepi16_epi32( vI16hi );
        vI16lo = _mm_add_epi32( gvlo, vI16lo );
        vI16hi = _mm_add_epi32( gvhi, vI16hi );
        vI16lo = _mm_max_epi32( _mm_set1_epi32( I_min << bs ), vI16lo );
        vI16hi = _mm_max_epi32( _mm_set1_epi32( I_min << bs ), vI16hi );
        vI16lo = _mm_min_epi32( _mm_set1_epi32( I_max << bs ), vI16lo );
        vI16hi = _mm_min_epi32( _mm_set1_epi32( I_max << bs ), vI16hi );
        vI16lo = _mm_packs_epi32( vI16lo, vI16hi );
        _mm_storeu_si128( (__m128i*) &I16[( x - 16 ) / subx], vI16lo );
        if( c == 0 )
        {
          __m128i vscale = _mm_lddqu_si128( (__m128i*) &scale[c][8] );
          __m128i vgrain = _mm_lddqu_si128( (__m128i*) &grain[c][8] );
          vscale = _mm_cvtepi8_epi16( vscale );   // 16 bit
          __m128i tmplo = _mm_mullo_epi16( vscale, vgrain );
          __m128i tmphi = _mm_mulhi_epi16( vscale, vgrain );
          __m128i gvlo  = _mm_unpacklo_epi16( tmplo, tmphi );   // 32 bit
          __m128i gvhi  = _mm_unpackhi_epi16( tmplo, tmphi );
          // round
          gvlo = _mm_add_epi32( gvlo, vadd );
          gvhi = _mm_add_epi32( gvhi, vadd );
          gvlo = _mm_sra_epi32( gvlo, vshift );
          gvhi = _mm_sra_epi32( gvhi, vshift );
          __m128i vI16lo = _mm_lddqu_si128( (__m128i*) &I16[( x - 16 ) / subx + 8] );
          __m128i vI16hi = _mm_lddqu_si128( (__m128i*) &I16[( x - 16 ) / subx + 12] );
          vI16lo = _mm_cvtepi16_epi32( vI16lo );   // 32 bit
          vI16hi = _mm_cvtepi16_epi32( vI16hi );
          vI16lo = _mm_add_epi32( gvlo, vI16lo );
          vI16hi = _mm_add_epi32( gvhi, vI16hi );
          vI16lo = _mm_max_epi32( _mm_set1_epi32( I_min << bs ), vI16lo );
          vI16hi = _mm_max_epi32( _mm_set1_epi32( I_min << bs ), vI16hi );
          vI16lo = _mm_min_epi32( _mm_set1_epi32( I_max << bs ), vI16lo );
          vI16hi = _mm_min_epi32( _mm_set1_epi32( I_max << bs ), vI16hi );
          vI16lo = _mm_packs_epi32( vI16lo, vI16hi );
          _mm_storeu_si128( (__m128i*) &I16[( x - 16 ) / subx + 8], vI16lo );
        }
#endif   // !USE_AVX2
      }   // bs
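      // 8-bit output path (bs == 0): scalar per-sample scale, round and clamp
      // (only the I8 branch of the inner bs check can be taken here).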
      else
      {
        for( int i = 0; i < 16 / subx; i++ )
        {
          // Output previous block (or flush current)
          int32_t g = round( scale[c][i] * (int16_t) grain[c][i], scale_shift );
          if( bs )
          {
            I16[( x - 16 ) / subx + i] = std::max<int32_t>( I_min << bs, std::min<int32_t>( I_max << bs, I16[( x - 16 ) / subx + i] + g ) );
          }
          else
          {
            I8[( x - 16 ) / subx + i] = std::max<int32_t>( I_min, std::min<int32_t>( I_max, I8[( x - 16 ) / subx + i] + g ) );
          }
        }
      }
    }
    // Shift pipeline
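    // Copy the just-generated block (the upper half of grain/scale) into the
    // previous-block slot, to be output on the next call or on flush.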
    if( !flush )
    {
      if( c == 0 )
      {
#ifdef USE_AVX2
        __m256i vgrain = _mm256_lddqu_si256( (__m256i*) &grain[0][16] );
        _mm256_storeu_si256( (__m256i*) &grain[0][0], vgrain );
#else
        __m128i vgrain0 = _mm_lddqu_si128( (__m128i*) &grain[0][16] );
        __m128i vgrain1 = _mm_lddqu_si128( (__m128i*) &grain[0][24] );
        _mm_storeu_si128( (__m128i*) &grain[0][0], vgrain0 );
        _mm_storeu_si128( (__m128i*) &grain[0][8], vgrain1 );
#endif
        __m128i vscale = _mm_lddqu_si128( (__m128i*) &scale[0][16] );
        _mm_storeu_si128( (__m128i*) &scale[0][0], vscale );
      }
      else
      {
        __m128i vgrain = _mm_lddqu_si128( (__m128i*) &grain[c][8] );
        __m128i vscale = _mm_loadu_si64( (__m128i*) &scale[c][8] );
        _mm_storeu_si128( (__m128i*) &grain[c][0], vgrain );
        _mm_storel_epi64( (__m128i*) &scale[c][0], vscale );
      }
    }
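    // At the right picture edge, step past the last block and run the loop
    // once more so the final pending block is flushed.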
    if( x + 16 >= width )
    {
      flush++;
      x += 16;
    }
  } while( flush == 1 );
}

}   // namespace vvdec

#endif   // defined(TARGET_SIMD_X86) && ENABLE_SIMD_OPT