Coverage Report

Created: 2026-04-01 07:49

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/vvdec/source/Lib/FilmGrain/FilmGrainImpl_X86_SIMD.h
Line
Count
Source
1
/* -----------------------------------------------------------------------------
2
The copyright in this software is being made available under the Clear BSD
3
License, included below. No patent rights, trademark rights and/or
4
other Intellectual Property Rights other than the copyrights concerning
5
the Software are granted under this license.
6
7
The Clear BSD License
8
9
Copyright (c) 2018-2026, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVdeC Authors.
10
All rights reserved.
11
12
Redistribution and use in source and binary forms, with or without modification,
13
are permitted (subject to the limitations in the disclaimer below) provided that
14
the following conditions are met:
15
16
  * Redistributions of source code must retain the above copyright notice,
17
  this list of conditions and the following disclaimer.
18
19
  * Redistributions in binary form must reproduce the above copyright
20
  notice, this list of conditions and the following disclaimer in the
21
  documentation and/or other materials provided with the distribution.
22
23
  * Neither the name of the copyright holder nor the names of its
24
  contributors may be used to endorse or promote products derived from this
25
  software without specific prior written permission.
26
27
NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
28
THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
29
CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
30
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
31
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
32
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
33
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
34
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
35
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
36
IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38
POSSIBILITY OF SUCH DAMAGE.
39
------------------------------------------------------------------------------------------- */
40
41
#include "FilmGrainImplX86.h"
42
43
#include <algorithm>
44
45
#include <CommonDef.h>
46
47
#if defined(TARGET_SIMD_X86) && ENABLE_SIMD_OPT 
48
#include <CommonDefX86.h>
49
50
namespace vvdec
51
{
52
using namespace x86_simd;
53
54
template<>
55
void FilmGrainImplX86<CURR_X86_VEXT>::make_grain_pattern( const void* I,
56
                                                          int         c,
57
                                                          int         x,
58
                                                          int         subx,
59
                                                          uint8_t     oc1,
60
                                                          uint8_t     oc2,
61
                                                          uint8_t     ox,
62
                                                          uint8_t     ox_up,
63
                                                          uint8_t     oy,
64
                                                          uint8_t     oy_up,
65
                                                          int         s,
66
                                                          int         s_up,
67
                                                          int16_t     grain[3][32],
68
                                                          uint8_t     scale[3][32] ) const
69
0
{
70
0
  const uint8_t*  I8  = (const uint8_t*) I;
71
0
  const uint16_t* I16 = (const uint16_t*) I;
72
0
  if( allZero[c] == 1 )
73
0
  {
74
0
    if( c == 0 )
75
0
    {
76
0
      __m128i vP = _mm_lddqu_si128( (__m128i*) &pattern[0][0][oy][ox] );
77
0
      if( s == -1 )
78
0
      {
79
0
        vP = _mm_sub_epi8( _mm_set1_epi8( 0 ), vP );
80
0
      }
81
#ifdef USE_AVX2
82
      __m256i vmask = _mm256_set1_epi32(0xff);
83
      __m128i tmp0;
84
      __m128i tmp1;
85
      __m256i vintensity;
86
0
      if (bs)
87
0
      {
88
0
        vintensity = _mm256_lddqu_si256((__m256i*)&I16[x]);  //load 16 16 bit values
89
0
        vintensity = _mm256_sra_epi16 (vintensity, _mm_set_epi32 (0,0,0,bs));
90
0
        tmp0=_mm256_extracti128_si256 (vintensity,0);
91
0
        tmp1=_mm256_extracti128_si256 (vintensity,1);
92
0
      }
93
0
      else
94
0
      {
95
0
        __m128i vintensity128 = _mm_lddqu_si128((__m128i*)&I8[x]);  //load 16 8 bit value
96
0
        tmp0=_mm_cvtepi8_epi16 (vintensity128);
97
0
        tmp1=_mm_cvtepi8_epi16 (_mm_bsrli_si128(vintensity128,8));
98
0
        tmp0 = _mm_and_si128 (tmp0,_mm_set1_epi16(0xff));  // only 8 bit
99
0
        tmp1 = _mm_and_si128 (tmp1,_mm_set1_epi16(0xff));  // only 8 bit
100
0
        vintensity = _mm256_castsi128_si256 (vintensity128);
101
0
      }
102
      __m256i vindex0=_mm256_cvtepi16_epi32 (tmp0);
103
      __m256i vindex1=_mm256_cvtepi16_epi32 (tmp1);
104
105
      __m256i avP = _mm256_cvtepi8_epi16( vP );
106
0
      if( oc1 )
107
0
      {
108
0
        __m256i avoc1 = _mm256_set1_epi16( oc1 );
109
0
        __m256i avoc2 = _mm256_set1_epi16( oc2 );
110
        // p*oc1
111
0
        avP = _mm256_mullo_epi16( avP, avoc1 );   // max 16 Bit
112
        // pattern * s_up
113
0
        __m128i vP2 = _mm_lddqu_si128( (__m128i*) &pattern[0][0][oy_up][ox_up] );
114
0
        if( s_up == -1 )
115
0
        {
116
0
          vP2 = _mm_sub_epi8( _mm_set1_epi8( 0 ), vP2 );
117
0
        }
118
0
        __m256i avP2 = _mm256_cvtepi8_epi16( vP2 );
119
        // * oc2
120
0
        avP2 = _mm256_mullo_epi16( avP2, avoc2 );
121
        // add
122
0
        avP = _mm256_add_epi16( avP, avP2 );
123
        // round to 16 bit
124
0
        __m256i avadd   = _mm256_set1_epi16( 1 << ( 5 - 1 ) );
125
0
        __m128i avshift = _mm_set_epi16( 0, 0, 0, 0, 0, 0, 0, 5 );
126
0
        avP             = _mm256_add_epi16( avP, avadd );
127
0
        avP             = _mm256_sra_epi16( avP, avshift );
128
0
      }
129
      _mm256_storeu_si256( (__m256i*) &grain[c][16], avP );
130
131
      __m256i vscale0 = _mm256_i32gather_epi32 ((int *)&sLUT[0][0], vindex0, 1);  // load 8 32 bit values
132
      __m256i vscale1 = _mm256_i32gather_epi32 ((int *)&sLUT[0][0], vindex1, 1);  // load 8 32 bit values
133
134
      vscale0 = _mm256_and_si256 (vscale0,vmask);
135
      vscale1 = _mm256_and_si256 (vscale1,vmask);
136
137
      vintensity = _mm256_packus_epi32 (vscale0, vscale1);
138
      vscale0 = _mm256_permute4x64_epi64 (vintensity, 0x8);
139
      vscale1 = _mm256_permute4x64_epi64 (vintensity, 0xd);
140
      vscale0 = _mm256_packus_epi16 (vscale0, vscale1);
141
      _mm_storeu_si128(( __m128i * )&scale[0][16],_mm256_castsi256_si128(vscale0));
142
# else
143
      __m128i vPlo = _mm_cvtepi8_epi16( vP );
144
      __m128i vPhi = _mm_cvtepi8_epi16( _mm_bsrli_si128( vP, 8 ) );
145
0
      if( oc1 )
146
0
      {
147
0
        __m128i voc1 = _mm_set1_epi16( oc1 );
148
0
        __m128i voc2 = _mm_set1_epi16( oc2 );
149
        // p*oc1
150
0
        vPlo = _mm_mullo_epi16( vPlo, voc1 );   // max 16 Bit
151
0
        vPhi = _mm_mullo_epi16( vPhi, voc1 );
152
        // pattern * s_up
153
0
        __m128i vP2 = _mm_lddqu_si128( (__m128i*) &pattern[0][0][oy_up][ox_up] );
154
0
        if( s_up == -1 )
155
0
        {
156
0
          vP2 = _mm_sub_epi8( _mm_set1_epi8( 0 ), vP2 );
157
0
        }
158
0
        __m128i vP2lo = _mm_cvtepi8_epi16( vP2 );
159
0
        __m128i vP2hi = _mm_cvtepi8_epi16( _mm_bsrli_si128( vP2, 8 ) );
160
        // * oc2
161
0
        vP2lo = _mm_mullo_epi16( vP2lo, voc2 );
162
0
        vP2hi = _mm_mullo_epi16( vP2hi, voc2 );
163
        // add
164
0
        vPlo = _mm_add_epi16( vPlo, vP2lo );
165
0
        vPhi = _mm_add_epi16( vPhi, vP2hi );
166
        // round to 16 bit
167
0
        __m128i vadd   = _mm_set1_epi16( 1 << ( 5 - 1 ) );
168
0
        __m128i vshift = _mm_set_epi16( 0, 0, 0, 0, 0, 0, 0, 5 );
169
0
        vPlo           = _mm_add_epi16( vPlo, vadd );
170
0
        vPhi           = _mm_add_epi16( vPhi, vadd );
171
0
        vPlo           = _mm_sra_epi16( vPlo, vshift );
172
0
        vPhi           = _mm_sra_epi16( vPhi, vshift );
173
0
      }
174
      _mm_storeu_si128( (__m128i*) &grain[c][16], vPlo );
175
      _mm_storeu_si128( (__m128i*) &grain[c][16 + 8], vPhi );
176
      // Scale sign already integrated above because of overlap
177
      //scale[0][16+i] = sLUT[0][intensity];
178
      uint8_t intensity;
179
      uint8_t *pscale=&scale[0][16];
180
      const uint8_t *pLUT=sLUT[0];
181
0
      if (bs)
182
0
      {
183
0
        const uint16_t *pI16 = I16+x;
184
0
        for (int i=0; i<16; i++)
185
0
        {
186
0
          intensity =  *pI16++ >> bs ;
187
0
          *pscale++ = pLUT[intensity];
188
0
        }
189
0
      }
190
0
      else
191
0
      {
192
0
        const uint8_t *pI8 = I8+x;
193
0
        for (int i=0; i<16; i++)
194
0
        {
195
0
          intensity =  *pI8++ ;
196
0
          *pscale++ = pLUT[intensity];
197
0
        }
198
0
      }
199
#endif
200
0
    }   // Y
201
0
    else
202
0
    {   // U/V
203
0
      __m128i vP;
204
#ifdef USE_AVX2
205
      __m256i vindex;
206
      __m128i vintensity;
207
0
      if (bs)
208
0
      {
209
0
        vintensity = _mm_lddqu_si128((__m128i*)&I16[x>>1]);  //load 8 16 bit values
210
0
        vintensity = _mm_sra_epi16 (vintensity, _mm_set_epi32 (0,0,0,bs));
211
0
      }
212
0
      else
213
0
      {
214
0
        vintensity = _mm_loadu_si64(&I8[x>>1]);  //load 8 8 bit values
215
0
        vintensity=_mm_cvtepi8_epi16 (vintensity);
216
0
        vintensity = _mm_and_si128 (vintensity,_mm_set1_epi16(0xff));  // only 8 bit
217
0
      }
218
      vindex=_mm256_cvtepi16_epi32 (vintensity);
219
#endif
220
0
      vP = _mm_loadu_si64( (__m128i*) &pattern[1][0][oy][ox] );
221
222
0
      if( s == -1 )
223
0
      {
224
0
        vP = _mm_sub_epi8( _mm_set1_epi8( 0 ), vP );
225
0
      }
226
0
      __m128i vPlo = _mm_cvtepi8_epi16( vP );
227
0
      if( oc1 )
228
0
      {
229
0
        __m128i voc1 = _mm_set1_epi16( oc1 );
230
0
        __m128i voc2 = _mm_set1_epi16( oc2 );
231
        // p*oc1
232
0
        vPlo = _mm_mullo_epi16( vPlo, voc1 );   // max 16 Bit
233
        // pattern * s_up
234
0
        __m128i vP2 = _mm_loadu_si64( (__m128i*) &pattern[c ? 1 : 0][0][oy_up][ox_up] );
235
0
        if( s_up == -1 )
236
0
        {
237
0
          vP2 = _mm_sub_epi8( _mm_set1_epi8( 0 ), vP2 );
238
0
        }
239
0
        __m128i vP2lo = _mm_cvtepi8_epi16( vP2 );
240
        // * oc2
241
0
        vP2lo = _mm_mullo_epi16( vP2lo, voc2 );
242
        // add
243
0
        vPlo = _mm_add_epi16( vPlo, vP2lo );
244
        // round to 16 bit
245
0
        __m128i vadd   = _mm_set1_epi16( 1 << ( 5 - 1 ) );
246
0
        __m128i vshift = _mm_set_epi16( 0, 0, 0, 0, 0, 0, 0, 5 );
247
0
        vPlo           = _mm_add_epi16( vPlo, vadd );
248
0
        vPlo           = _mm_sra_epi16( vPlo, vshift );
249
0
      }
250
0
      _mm_storeu_si128( (__m128i*) &grain[c][8], vPlo );
251
#ifdef USE_AVX2
252
      __m256i vmask = _mm256_set1_epi32(0xff);
253
      __m256i vscale = _mm256_i32gather_epi32 ((int *)&sLUT[c][0], vindex, 1);  // load 8 32 bit values
254
      vscale = _mm256_and_si256 (vscale,vmask);
255
256
      vmask = _mm256_packus_epi32 (vscale, vscale);
257
      vscale = _mm256_permute4x64_epi64 (vmask, 0x8);
258
      vscale = _mm256_packus_epi16 (vscale, vscale);
259
      _mm_storeu_si64(( __m128i * )&scale[c][8],_mm256_castsi256_si128(vscale));
260
#else
261
      uint8_t*       pscale = &scale[c][8];
262
      const uint8_t* pLUT   = sLUT[c];
263
0
      if (bs)
264
0
      {
265
0
        const uint16_t* pI16 = &I16[x >> 1];
266
0
        for( int i = 0; i < 8; i++ )
267
0
        {
268
0
          uint8_t intensity = *pI16++ >> bs;
269
0
          *pscale++         = pLUT[intensity];
270
0
        }
271
0
      }
272
0
      else
273
0
      {
274
0
        const uint8_t* pI8 = &I8[x >> 1];
275
0
        for( int i = 0; i < 8; i++ )
276
0
        {
277
0
          uint8_t intensity = *pI8++;
278
0
          *pscale++         = pLUT[intensity];
279
0
        }
280
0
      }
281
#endif
282
0
    }
283
0
  }
284
#ifdef USE_AVX2
285
0
  else if( c>0 && allZero[c] == 0 )
286
0
  {
287
0
    __m128i vP;
288
0
    __m128i vintensity;
289
0
    __m256i vindex;
290
0
    __m256i vmask = _mm256_set1_epi32(0xff);
291
0
    if (bs)
292
0
    {
293
0
      vintensity = _mm_lddqu_si128((__m128i*)&I16[x>>1]);  //load 8 16 bit values
294
0
      vintensity = _mm_sra_epi16 (vintensity, _mm_set_epi32 (0,0,0,bs));
295
0
    }
296
0
    else
297
0
    {
298
0
      vintensity = _mm_loadu_si64(&I8[x>>1]);  //load 8 8 bit values
299
0
      vintensity=_mm_cvtepi8_epi16 (vintensity);
300
0
    }
301
0
    vindex=_mm256_cvtepi16_epi32 (vintensity);
302
0
    vindex = _mm256_and_si256 (vindex,vmask);  // only 8 bit
303
304
0
    __m256i vadd = _mm256_set_epi32(7,6,5,4,3,2,1,0);
305
0
    __m256i vpi = _mm256_i32gather_epi32 ((int *)&pLUT[c][0], vindex, 1);  // load 8 32 bit values
306
0
    vpi = _mm256_and_si256 (vpi,vmask);  // only 8 bit
307
0
    vpi = _mm256_slli_epi32 (vpi, 8);  // 12-4
308
0
    vpi = _mm256_add_epi32 (vpi, vadd);
309
0
    __m256i avP = _mm256_i32gather_epi32 ((int *)&pattern[1][0][oy][ox], vpi, 1);  // load 8 32 bit values
310
0
    avP = _mm256_and_si256 (avP,vmask);  // only 8 bit
311
    // convert to packed 8 bit
312
0
    __m256i vtmp = _mm256_packus_epi32 (avP, avP);
313
0
    avP = _mm256_permute4x64_epi64 (vtmp, 0x8);
314
0
    avP = _mm256_packus_epi16 (avP, avP);
315
0
    vP = _mm256_castsi256_si128(avP);
316
0
    if( s == -1 )
317
0
    {
318
0
      vP = _mm_sub_epi8( _mm_set1_epi8( 0 ), vP );
319
0
    }
320
0
    __m128i vPlo = _mm_cvtepi8_epi16( vP );
321
0
    if( oc1 )
322
0
    {
323
0
      __m128i voc1 = _mm_set1_epi16( oc1 );
324
0
      __m128i voc2 = _mm_set1_epi16( oc2 );
325
      // p*oc1
326
0
      vPlo = _mm_mullo_epi16( vPlo, voc1 );   // max 16 Bit
327
      // pattern * s_up
328
0
      __m256i avP2 = _mm256_i32gather_epi32 ((int *)&pattern[1][0][oy_up][ox_up], vpi, 1);  // load 8 32 bit values
329
0
      avP2 = _mm256_and_si256 (avP2,vmask);  // only 8 bit
330
      // convert to packed 8 bit
331
0
      vtmp = _mm256_packus_epi32 (avP2, avP2);
332
0
      avP2 = _mm256_permute4x64_epi64 (vtmp, 0x8);
333
0
      avP2 = _mm256_packus_epi16 (avP2, avP2);
334
0
      __m128i vP2= _mm256_castsi256_si128(avP2);
335
0
      if( s_up == -1 )
336
0
      {
337
0
        vP2 = _mm_sub_epi8( _mm_set1_epi8( 0 ), vP2 );
338
0
      }
339
0
      __m128i vP2lo = _mm_cvtepi8_epi16( vP2 );
340
0
      vP2lo = _mm_mullo_epi16( vP2lo, voc2 );
341
0
      vPlo = _mm_add_epi16( vPlo, vP2lo );
342
      // round to 16 bit
343
0
      __m128i vadd   = _mm_set1_epi16( 1 << ( 5 - 1 ) );
344
0
      __m128i vshift = _mm_set_epi16( 0, 0, 0, 0, 0, 0, 0, 5 );
345
0
      vPlo           = _mm_add_epi16( vPlo, vadd );
346
0
      vPlo           = _mm_sra_epi16( vPlo, vshift );
347
0
    }
348
0
    _mm_storeu_si128( (__m128i*) &grain[c][8], vPlo );
349
0
    __m256i vscale = _mm256_i32gather_epi32 ((int *)&sLUT[c][0], vindex, 1);  // load 8 32 bit values
350
0
    vscale = _mm256_and_si256 (vscale,vmask);
351
0
    vmask = _mm256_packus_epi32 (vscale, vscale);
352
0
    vscale = _mm256_permute4x64_epi64 (vmask, 0x8);
353
0
    vscale = _mm256_packus_epi16 (vscale, vscale);
354
0
    _mm_storeu_si64(( __m128i * )&scale[c][8],_mm256_castsi256_si128(vscale));
355
0
  }
356
0
#endif
357
0
  else
358
0
  {
359
0
    for( int i = 0; i < 16 / subx; i++ )
360
0
    {
361
0
      uint8_t intensity = bs ? I16[x / subx + i] >> bs : I8[x / subx + i];
362
0
      uint8_t pi        = pLUT[c][intensity] >> 4;                  // pattern index (integer part)
363
0
      int     P         = pattern[c ? 1 : 0][pi][oy][ox + i] * s;   // Pattern sample (from current pattern index)
364
                                                                    // We could consider just XORing the sign bit
365
#if PATTERN_INTERPOLATION
366
      uint8_t pf = pLUT[c][intensity] & 15;           // pattern index fractional part (interpolate with next) -- could restrict to less bits (e.g. 2)
367
      int     Pn =
368
        pattern[c ? 1 : 0][pi + 1][oy][ox + i] * s;   // Next-pattern sample (from pattern index+1)
369
                                                      // But there are equivalent hw tricks, e.g. storing values as sign + amplitude instead of two's complement
370
#endif
371
372
0
      if( oc1 )   // overlap
373
0
      {
374
0
        P = round( P * oc1 + pattern[c ? 1 : 0][pi][oy_up][ox_up + i] * oc2 * s_up, 5 );
375
#if PATTERN_INTERPOLATION
376
        Pn = round( Pn * oc1 + pattern[c ? 1 : 0][pi + 1][oy_up][ox_up + i] * oc2 * s_up, 5 );
377
#endif
378
0
      }
379
#if PATTERN_INTERPOLATION
380
      // Pattern interpolation: P is current, Pn is next, pf is interpolation coefficient
381
      grain[c][16 / subx + i] = round( P * ( 16 - pf ) + Pn * pf, 4 );
382
#else
383
0
      grain[c][16 / subx + i] = P;
384
0
#endif
385
      // Scale sign already integrated above because of overlap
386
0
      scale[c][16 / subx + i] = sLUT[c][intensity];
387
0
    }
388
0
  }
389
0
}
Unexecuted instantiation: vvdec::FilmGrainImplX86<(vvdec::x86_simd::X86_VEXT)1>::make_grain_pattern(void const*, int, int, int, unsigned char, unsigned char, unsigned char, unsigned char, unsigned char, unsigned char, int, int, short (*) [32], unsigned char (*) [32]) const
Unexecuted instantiation: vvdec::FilmGrainImplX86<(vvdec::x86_simd::X86_VEXT)4>::make_grain_pattern(void const*, int, int, int, unsigned char, unsigned char, unsigned char, unsigned char, unsigned char, unsigned char, int, int, short (*) [32], unsigned char (*) [32]) const
390
391
template<>
392
void FilmGrainImplX86<CURR_X86_VEXT>::scale_and_output( void* I, int c, int x, int subx, int width, int16_t grain[3][32], uint8_t scale[3][32] ) const
393
0
{
394
0
  uint8_t*  I8  = (uint8_t*) I;
395
0
  uint16_t* I16 = (uint16_t*) I;
396
397
0
  const uint8_t I_min = c ? C_min : Y_min;
398
0
  const uint8_t I_max = c ? C_max : Y_max;
399
400
0
  int flush = 0;
401
0
  do
402
0
  {
403
0
    if( x > 0 )
404
0
    {
405
0
      if( !flush )
406
0
      {
407
        // Horizontal deblock (across previous block)
408
0
        __m128i vgrain;
409
0
        __m128i vfac = _mm_set_epi16( 0, 0, 0, 1, 1, 3, 1, 1 );
410
0
        if( c == 0 )
411
0
        {
412
0
          vgrain = _mm_loadu_si64( (__m128i*) &grain[0][16 - 2] );   // r1 r0 l0 l1
413
0
        }
414
0
        else
415
0
        {
416
0
          vgrain = _mm_loadu_si64( (__m128i*) &grain[c][8 - 2] );   // r1 r0 l0 l1
417
0
        }
418
0
        __m128i vgrainh = _mm_mullo_epi16( vgrain, vfac );           // r1 3*r0  l0 l1
419
0
        vgrainh         = _mm_srli_si128( vgrainh, 2 );              //     r1 3+r0 l0
420
0
        vfac            = _mm_srli_si128( vfac, 2 );
421
0
        __m128i vgrainl = _mm_mullo_epi16( vgrain, vfac );           // r1 r0 3*lo l1
422
0
        vgrainl         = _mm_slli_si128( vgrainl, 10 );
423
0
        vgrainl         = _mm_srli_si128( vgrainl, 10 );             //    r0 3*lo l1
424
0
        vgrainl         = _mm_hadd_epi16( vgrainl, vgrainl );        // r0 3*lo+l1
425
0
        vgrainl         = _mm_hadd_epi16( vgrainl, vgrainl );        // r0+3*lo+l1
426
0
        vgrainh         = _mm_hadd_epi16( vgrainh, vgrainh );
427
0
        vgrainh         = _mm_hadd_epi16( vgrainh, vgrainh );
428
0
        vgrainh         = _mm_srli_si128( vgrainh, 2 );
429
0
        vgrain          = _mm_or_si128( vgrainl, vgrainh );
430
0
        vgrain          = _mm_add_epi16( vgrain, _mm_set_epi16( 0, 0, 0, 0, 0, 0, 2, 2 ) );
431
0
        vgrain          = _mm_srai_epi16( vgrain, 2 );
432
0
        if( c == 0 )
433
0
        {
434
0
          _mm_storeu_si32( (__m128i*) &grain[0][16 - 1], vgrain );
435
0
        }
436
0
        else
437
0
        {
438
0
          _mm_storeu_si32( (__m128i*) &grain[c][8 - 1], vgrain );
439
0
        }
440
0
      }
441
0
      if( bs )
442
0
      {
443
#  ifdef USE_AVX2
444
        __m128i vshift = _mm_set_epi16( 0, 0, 0, 0, 0, 0, 0, scale_shift );
445
0
        if( c == 0 )
446
0
        {
447
0
          __m256i vadd    = _mm256_set1_epi32( 1 << ( scale_shift - 1 ) );
448
0
          __m256i vgrain  = _mm256_lddqu_si256( (__m256i*) &grain[0][0] );   // load 16 * 16 bit
449
0
          __m256i vscale  = _mm256_cvtepi8_epi16( _mm_lddqu_si128( (__m128i*) &scale[0][0] ) );
450
0
          __m256i tmplo   = _mm256_mullo_epi16( vscale, vgrain );
451
0
          __m256i tmphi   = _mm256_mulhi_epi16( vscale, vgrain );
452
0
          __m256i tmpgvlo = _mm256_unpacklo_epi16( tmplo, tmphi );   // 32 bit
453
0
          __m256i tmpgvhi = _mm256_unpackhi_epi16( tmplo, tmphi );
454
          // deinterleave
455
0
          __m256i gvlo = _mm256_permute2x128_si256( tmpgvlo, tmpgvhi, 0x20 );
456
0
          __m256i gvhi = _mm256_permute2x128_si256( tmpgvlo, tmpgvhi, 0x31 );
457
          // round
458
0
          gvlo           = _mm256_add_epi32( gvlo, vadd );
459
0
          gvhi           = _mm256_add_epi32( gvhi, vadd );
460
0
          gvlo           = _mm256_sra_epi32( gvlo, vshift );
461
0
          gvhi           = _mm256_sra_epi32( gvhi, vshift );
462
0
          __m256i vI16lo = _mm256_cvtepi16_epi32( _mm_lddqu_si128( (__m128i*) &I16[( x - 16 )] ) );
463
0
          __m256i vI16hi = _mm256_cvtepi16_epi32( _mm_lddqu_si128( (__m128i*) &I16[( x - 16 ) + 8] ) );
464
0
          vI16lo         = _mm256_add_epi32( gvlo, vI16lo );
465
0
          vI16hi         = _mm256_add_epi32( gvhi, vI16hi );
466
0
          vI16lo         = _mm256_max_epi32( _mm256_set1_epi32( I_min ), vI16lo );
467
0
          vI16hi         = _mm256_max_epi32( _mm256_set1_epi32( I_min ), vI16hi );
468
0
          vI16lo         = _mm256_min_epi32( _mm256_set1_epi32( I_max << bs ), vI16lo );
469
0
          vI16hi         = _mm256_min_epi32( _mm256_set1_epi32( I_max << bs ), vI16hi );
470
0
          vI16lo         = _mm256_packs_epi32( vI16lo, vI16hi );
471
0
          vI16lo         = _mm256_permute4x64_epi64( vI16lo, 0xd8 );
472
0
          _mm256_storeu_si256( (__m256i*) &I16[( x - 16 )], vI16lo );
473
0
        }
474
0
        else
475
0
        {
476
0
          __m128i vadd   = _mm_set1_epi32( 1 << ( scale_shift - 1 ) );
477
0
          __m128i vscale = _mm_lddqu_si128( (__m128i*) &scale[c] );
478
0
          __m128i vgrain = _mm_lddqu_si128( (__m128i*) &grain[c] );
479
0
          vscale         = _mm_cvtepi8_epi16( vscale );   // 16 bit
480
0
          __m128i tmplo  = _mm_mullo_epi16( vscale, vgrain );
481
0
          __m128i tmphi  = _mm_mulhi_epi16( vscale, vgrain );
482
0
          __m128i gvlo   = _mm_unpacklo_epi16( tmplo, tmphi );   // 32 bit
483
0
          __m128i gvhi   = _mm_unpackhi_epi16( tmplo, tmphi );
484
0
          gvlo           = _mm_add_epi32( gvlo, vadd );
485
0
          gvhi           = _mm_add_epi32( gvhi, vadd );
486
0
          gvlo           = _mm_sra_epi32( gvlo, vshift );
487
0
          gvhi           = _mm_sra_epi32( gvhi, vshift );
488
0
          __m128i vI16lo = _mm_lddqu_si128( (__m128i*) &I16[( x - 16 ) / subx] );
489
0
          __m128i vI16hi = _mm_lddqu_si128( (__m128i*) &I16[( x - 16 ) / subx + 4] );
490
0
          vI16lo         = _mm_cvtepi16_epi32( vI16lo );   // 32 bit
491
0
          vI16hi         = _mm_cvtepi16_epi32( vI16hi );
492
0
          vI16lo         = _mm_add_epi32( gvlo, vI16lo );
493
0
          vI16hi         = _mm_add_epi32( gvhi, vI16hi );
494
0
          vI16lo         = _mm_max_epi32( _mm_set1_epi32( I_min ), vI16lo );
495
0
          vI16hi         = _mm_max_epi32( _mm_set1_epi32( I_min ), vI16hi );
496
0
          vI16lo         = _mm_min_epi32( _mm_set1_epi32( I_max << bs ), vI16lo );
497
0
          vI16hi         = _mm_min_epi32( _mm_set1_epi32( I_max << bs ), vI16hi );
498
0
          vI16lo         = _mm_packs_epi32( vI16lo, vI16hi );
499
0
          _mm_storeu_si128( (__m128i*) &I16[( x - 16 ) / subx], vI16lo );
500
0
        }
501
#  else    // !USE_AVX2
502
        __m128i vadd   = _mm_set1_epi32( 1 << ( scale_shift - 1 ) );
503
        __m128i vshift = _mm_set_epi16( 0, 0, 0, 0, 0, 0, 0, scale_shift );
504
        __m128i vscale = _mm_lddqu_si128( (__m128i*) &scale[c] );
505
        __m128i vgrain = _mm_lddqu_si128( (__m128i*) &grain[c] );
506
        vscale         = _mm_cvtepi8_epi16( vscale );   // 16 bit
507
        __m128i tmplo  = _mm_mullo_epi16( vscale, vgrain );
508
        __m128i tmphi  = _mm_mulhi_epi16( vscale, vgrain );
509
        __m128i gvlo   = _mm_unpacklo_epi16( tmplo, tmphi );   // 32 bit
510
        __m128i gvhi   = _mm_unpackhi_epi16( tmplo, tmphi );
511
        gvlo           = _mm_add_epi32( gvlo, vadd );
512
        gvhi           = _mm_add_epi32( gvhi, vadd );
513
        gvlo           = _mm_sra_epi32( gvlo, vshift );
514
        gvhi           = _mm_sra_epi32( gvhi, vshift );
515
        __m128i vI16lo = _mm_lddqu_si128( (__m128i*) &I16[( x - 16 ) / subx] );
516
        __m128i vI16hi = _mm_lddqu_si128( (__m128i*) &I16[( x - 16 ) / subx + 4] );
517
518
        vI16lo = _mm_cvtepi16_epi32( vI16lo );   // 32 bit
519
        vI16hi = _mm_cvtepi16_epi32( vI16hi );
520
        vI16lo = _mm_add_epi32( gvlo, vI16lo );
521
        vI16hi = _mm_add_epi32( gvhi, vI16hi );
522
        vI16lo = _mm_max_epi32( _mm_set1_epi32( I_min ), vI16lo );
523
        vI16hi = _mm_max_epi32( _mm_set1_epi32( I_min ), vI16hi );
524
        vI16lo = _mm_min_epi32( _mm_set1_epi32( I_max << bs ), vI16lo );
525
        vI16hi = _mm_min_epi32( _mm_set1_epi32( I_max << bs ), vI16hi );
526
        vI16lo = _mm_packs_epi32( vI16lo, vI16hi );
527
        _mm_storeu_si128( (__m128i*) &I16[( x - 16 ) / subx], vI16lo );
528
0
        if( c == 0 )
529
0
        {
530
0
          __m128i vscale = _mm_lddqu_si128( (__m128i*) &scale[c][8] );
531
0
          __m128i vgrain = _mm_lddqu_si128( (__m128i*) &grain[c][8] );
532
0
          vscale         = _mm_cvtepi8_epi16( vscale );   // 16 bit
533
0
          __m128i tmplo  = _mm_mullo_epi16( vscale, vgrain );
534
0
          __m128i tmphi  = _mm_mulhi_epi16( vscale, vgrain );
535
0
          __m128i gvlo   = _mm_unpacklo_epi16( tmplo, tmphi );   // 32 bit
536
0
          __m128i gvhi   = _mm_unpackhi_epi16( tmplo, tmphi );
537
          // round
538
0
          gvlo           = _mm_add_epi32( gvlo, vadd );
539
0
          gvhi           = _mm_add_epi32( gvhi, vadd );
540
0
          gvlo           = _mm_sra_epi32( gvlo, vshift );
541
0
          gvhi           = _mm_sra_epi32( gvhi, vshift );
542
0
          __m128i vI16lo = _mm_lddqu_si128( (__m128i*) &I16[( x - 16 ) / subx + 8] );
543
0
          __m128i vI16hi = _mm_lddqu_si128( (__m128i*) &I16[( x - 16 ) / subx + 12] );
544
0
          vI16lo         = _mm_cvtepi16_epi32( vI16lo );   // 32 bit
545
0
          vI16hi         = _mm_cvtepi16_epi32( vI16hi );
546
0
          vI16lo         = _mm_add_epi32( gvlo, vI16lo );
547
0
          vI16hi         = _mm_add_epi32( gvhi, vI16hi );
548
0
          vI16lo         = _mm_max_epi32( _mm_set1_epi32( I_min ), vI16lo );
549
0
          vI16hi         = _mm_max_epi32( _mm_set1_epi32( I_min ), vI16hi );
550
0
          vI16lo         = _mm_min_epi32( _mm_set1_epi32( I_max << bs ), vI16lo );
551
0
          vI16hi         = _mm_min_epi32( _mm_set1_epi32( I_max << bs ), vI16hi );
552
0
          vI16lo         = _mm_packs_epi32( vI16lo, vI16hi );
553
0
          _mm_storeu_si128( (__m128i*) &I16[( x - 16 ) / subx + 8], vI16lo );
554
0
        }
555
#endif   // !USE_AVX2
556
0
      }    // bs
557
0
      else
558
0
      {
559
0
        for( int i = 0; i < 16 / subx; i++ )
560
0
        {
561
          // Output previous block (or flush current)
562
0
          int32_t g = round( scale[c][i] * (int16_t) grain[c][i], scale_shift );
563
0
          if( bs )
564
0
          {
565
0
            I16[( x - 16 ) / subx + i] = std::max<int32_t>( I_min << bs, std::min<int32_t>( I_max << bs, I16[( x - 16 ) / subx + i] + g ) );
566
0
          }
567
0
          else
568
0
          {
569
0
            I8[( x - 16 ) / subx + i] = std::max<int32_t>( I_min, std::min<int32_t>( I_max, I8[( x - 16 ) / subx + i] + g ) );
570
0
          }
571
0
        }
572
0
      }
573
0
    }
574
    // Shift pipeline
575
0
    if( !flush )
576
0
    {
577
0
      if( c == 0 )
578
0
      {
579
#ifdef USE_AVX2
580
        __m256i vgrain = _mm256_lddqu_si256( (__m256i*) &grain[0][16] );
581
        _mm256_storeu_si256( (__m256i*) &grain[0][0], vgrain );
582
#else
583
        __m128i vgrain0 = _mm_lddqu_si128( (__m128i*) &grain[0][16] );
584
        __m128i vgrain1 = _mm_lddqu_si128( (__m128i*) &grain[0][24] );
585
        _mm_storeu_si128( (__m128i*) &grain[0][0], vgrain0 );
586
        _mm_storeu_si128( (__m128i*) &grain[0][8], vgrain1 );
587
#endif
588
0
        __m128i vscale = _mm_lddqu_si128( (__m128i*) &scale[0][16] );
589
0
        _mm_storeu_si128( (__m128i*) &scale[0][0], vscale );
590
0
      }
591
0
      else
592
0
      {
593
0
        __m128i vgrain = _mm_lddqu_si128( (__m128i*) &grain[c][8] );
594
0
        __m128i vscale = _mm_loadu_si64( (__m128i*) &scale[c][8] );
595
0
        _mm_storeu_si128( (__m128i*) &grain[c][0], vgrain );
596
0
        _mm_storel_epi64( (__m128i*) &scale[c][0], vscale );
597
0
      }
598
0
    }
599
0
    if( x + 16 >= width )
600
0
    {
601
0
      flush++;
602
0
      x += 16;
603
0
    }
604
0
  } while( flush == 1 );
605
0
}
Unexecuted instantiation: vvdec::FilmGrainImplX86<(vvdec::x86_simd::X86_VEXT)1>::scale_and_output(void*, int, int, int, int, short (*) [32], unsigned char (*) [32]) const
Unexecuted instantiation: vvdec::FilmGrainImplX86<(vvdec::x86_simd::X86_VEXT)4>::scale_and_output(void*, int, int, int, int, short (*) [32], unsigned char (*) [32]) const
606
607
}   // namespace vvdec
608
609
#endif   // TARGET_SIMD_X86