Coverage Report

Created: 2026-04-01 07:49

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/vvdec/source/Lib/CommonLib/x86/SampleAdaptiveOffsetX86.h
Line
Count
Source
1
/* -----------------------------------------------------------------------------
2
The copyright in this software is being made available under the Clear BSD
3
License, included below. No patent rights, trademark rights and/or 
4
other Intellectual Property Rights other than the copyrights concerning 
5
the Software are granted under this license.
6
7
The Clear BSD License
8
9
Copyright (c) 2018-2026, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVdeC Authors.
10
All rights reserved.
11
12
Redistribution and use in source and binary forms, with or without modification,
13
are permitted (subject to the limitations in the disclaimer below) provided that
14
the following conditions are met:
15
16
     * Redistributions of source code must retain the above copyright notice,
17
     this list of conditions and the following disclaimer.
18
19
     * Redistributions in binary form must reproduce the above copyright
20
     notice, this list of conditions and the following disclaimer in the
21
     documentation and/or other materials provided with the distribution.
22
23
     * Neither the name of the copyright holder nor the names of its
24
     contributors may be used to endorse or promote products derived from this
25
     software without specific prior written permission.
26
27
NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
28
THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
29
CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
30
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
31
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
32
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
33
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
34
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
35
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
36
IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38
POSSIBILITY OF SUCH DAMAGE.
39
40
41
------------------------------------------------------------------------------------------- */
42
43
/** \file     SampleAdaptiveOffsetX86.h
44
    \brief    SAO filter class
45
*/
46
#include "CommonDefX86.h"
47
#include "../SampleAdaptiveOffset.h"
48
49
#if defined(TARGET_SIMD_X86)  && ENABLE_SIMD_OPT_SAO
50
51
namespace vvdec
52
{
53
54
0
#  define SAO_NUM_OFFSETS 4                             /* number of SAO offset values */
55
0
#  define SAO_EO_NUM_CATEGORIES ( SAO_NUM_OFFSETS + 1 ) /* number of different eo categories */
56
57
#  if USE_AVX2 && !defined( _mm256_set_m128i )
58
#    define VVCLIB_OWN_mm256_set_m128i
59
#    define _mm256_set_m128i( v0, v1 ) _mm256_inserti128_si256( _mm256_castsi128_si256( v1 ), ( v0 ), 1 )
60
#  endif
61
62
/**
 * \brief Tells whether the sample at (xPos, yPos) touches a virtual boundary.
 *
 * A position counts as "on" a boundary when it equals the boundary position or
 * the position directly before it (boundary - 1). Vertical boundaries are
 * tested against xPos, horizontal boundaries against yPos.
 *
 * \param xPos            horizontal sample position
 * \param yPos            vertical sample position
 * \param numVerVirBndry  number of valid entries in verVirBndryPos
 * \param numHorVirBndry  number of valid entries in horVirBndryPos
 * \param verVirBndryPos  positions of the vertical virtual boundaries
 * \param horVirBndryPos  positions of the horizontal virtual boundaries
 * \return true if any listed boundary is hit, false otherwise
 */
static bool isProcessDisabled( int xPos, int yPos, int numVerVirBndry, int numHorVirBndry, int verVirBndryPos[], int horVirBndryPos[] )
{
  // Vertical boundaries first: compare against the column.
  int idx = 0;
  while( idx < numVerVirBndry )
  {
    const int bndry = verVirBndryPos[idx];
    if( xPos == bndry || xPos == bndry - 1 )
    {
      return true;
    }
    ++idx;
  }

  // Then horizontal boundaries: compare against the row.
  idx = 0;
  while( idx < numHorVirBndry )
  {
    const int bndry = horVirBndryPos[idx];
    if( yPos == bndry || yPos == bndry - 1 )
    {
      return true;
    }
    ++idx;
  }

  return false;
}
Unexecuted instantiation: SampleAdaptiveOffset_sse41.cpp:vvdec::isProcessDisabled(int, int, int, int, int*, int*)
Unexecuted instantiation: SampleAdaptiveOffset_avx2.cpp:vvdec::isProcessDisabled(int, int, int, int, int*, int*)
81
82
/**
 * \brief Tells whether row yPos touches a horizontal virtual boundary.
 *
 * A row is considered affected when it equals a boundary position or the row
 * directly before it (boundary - 1).
 *
 * \param yPos            vertical sample position
 * \param numHorVirBndry  number of valid entries in horVirBndryPos
 * \param horVirBndryPos  positions of the horizontal virtual boundaries
 * \return true if any listed boundary is hit, false otherwise
 */
static bool isHorProcessDisabled( int yPos, int numHorVirBndry, int horVirBndryPos[] )
{
  bool onBoundary = false;
  // Stop scanning as soon as one boundary matches.
  for( int k = 0; !onBoundary && k < numHorVirBndry; ++k )
  {
    const int bndry = horVirBndryPos[k];
    onBoundary      = ( yPos == bndry ) || ( yPos == bndry - 1 );
  }
  return onBoundary;
}
Unexecuted instantiation: SampleAdaptiveOffset_sse41.cpp:vvdec::isHorProcessDisabled(int, int, int*)
Unexecuted instantiation: SampleAdaptiveOffset_avx2.cpp:vvdec::isHorProcessDisabled(int, int, int*)
93
/**
 * \brief Tells whether column xPos touches a vertical virtual boundary.
 *
 * A column is considered affected when it equals a boundary position or the
 * column directly before it (boundary - 1).
 *
 * \param xPos            horizontal sample position
 * \param numVerVirBndry  number of valid entries in verVirBndryPos
 * \param verVirBndryPos  positions of the vertical virtual boundaries
 * \return true if any listed boundary is hit, false otherwise
 */
static bool isVerProcessDisabled( int xPos, int numVerVirBndry, int verVirBndryPos[] )
{
  int k = 0;
  while( k < numVerVirBndry )
  {
    const int  bndry = verVirBndryPos[k];
    const bool miss  = ( xPos != bndry ) && ( xPos != bndry - 1 );
    if( !miss )
    {
      return true;
    }
    ++k;
  }
  return false;
}
Unexecuted instantiation: SampleAdaptiveOffset_sse41.cpp:vvdec::isVerProcessDisabled(int, int, int*)
Unexecuted instantiation: SampleAdaptiveOffset_avx2.cpp:vvdec::isVerProcessDisabled(int, int, int*)
104
105
template<X86_VEXT vext>
106
static void offsetBlock_SIMD_SAO_TYPE_BO( const int  channelBitDepth,
107
                                          int*       offset,
108
                                          int        startIdx,
109
                                          const Pel* srcBlk,
110
                                          Pel*       resBlk,
111
                                          ptrdiff_t  srcStride,
112
                                          ptrdiff_t  resStride,
113
                                          int        width,
114
                                          int        height )
115
0
{
116
0
  const Pel* srcLine = srcBlk;
117
0
  Pel*       resLine = resBlk;
118
119
0
  const int shiftBits        = channelBitDepth - NUM_SAO_BO_CLASSES_LOG2;
120
0
  int8_t    p_eo_offsets[16] = { 0 };
121
0
  for( int i = 0; i < 4; i++ )
122
0
  {
123
0
    p_eo_offsets[i] = offset[( startIdx + i ) % MAX_NUM_SAO_CLASSES];
124
0
  }
125
0
  for( int y = 0; y < height; y++ )
126
0
  {
127
0
    for( int x = 0; x < width; )
128
0
    {
129
#  ifdef USE_AVX2
130
      // AVX2
131
0
      if( width - x >= 16 && vext >= AVX2 )
132
0
      {
133
0
        __m256i vbaseoffset = _mm256_set1_epi16( startIdx - MAX_NUM_SAO_CLASSES );
134
0
        __m256i vminus      = _mm256_set1_epi8( -1 );
135
0
        __m256i vzero       = _mm256_set1_epi8( 0 );
136
137
        __m256i vfour      = _mm256_set1_epi16( 4 );
138
        __m256i vibdimax   = _mm256_set1_epi16( ( 1 << channelBitDepth ) - 1 );
139
        __m256i voffsettbl = _mm256_broadcastsi128_si256( _mm_loadu_si128( (__m128i*)p_eo_offsets ) );
140
141
        __m256i vsrc  = _mm256_loadu_si256( (__m256i*)&srcLine[x] );
142
        __m256i bands = _mm256_srai_epi16( vsrc, shiftBits );
143
        bands         = _mm256_sub_epi16 ( bands, vbaseoffset );
144
0
        bands         = _mm256_and_si256 ( bands, _mm256_set1_epi16( MAX_NUM_SAO_CLASSES - 1 ) ); // modulo 32 = modulo NUM_SAO_BO_CLASSES_LOG2
145
        __m256i mask1 = _mm256_cmpgt_epi16( bands, vminus );
146
        __m256i mask2 = _mm256_cmpgt_epi16( vfour, bands );
147
148
        __m256i veoffsets = _mm256_shuffle_epi8( voffsettbl, bands );
149
        veoffsets         = _mm256_slli_epi16( veoffsets, 8 );
150
        veoffsets         = _mm256_srai_epi16( veoffsets, 8 );
151
152
        veoffsets = _mm256_and_si256( veoffsets, mask1 );
153
        veoffsets = _mm256_and_si256( veoffsets, mask2 );
154
155
        vsrc = _mm256_add_epi16( vsrc, veoffsets );
156
        vsrc = _mm256_min_epi16( _mm256_max_epi16( vsrc, vzero ), vibdimax );
157
        _mm256_storeu_si256( (__m256i*)&resLine[x], vsrc );
158
159
0
        x += 16;
160
0
      }
161
0
      else
162
0
#  endif
163
0
      {
164
0
        __m128i vbaseoffset = _mm_set1_epi16( startIdx - MAX_NUM_SAO_CLASSES );
165
0
        __m128i vminus      = _mm_set1_epi8( -1 );
166
0
        __m128i vzero       = _mm_set1_epi8( 0 );
167
168
0
        __m128i vfour      = _mm_set1_epi16( 4 );
169
0
        __m128i vibdimax   = _mm_set1_epi16( ( 1 << channelBitDepth ) - 1 );
170
0
        __m128i voffsettbl = _mm_loadu_si128( (__m128i*)p_eo_offsets );
171
172
0
        __m128i vsrc  = _mm_loadu_si128( (__m128i*)&srcLine[x] );
173
0
        __m128i bands = _mm_srai_epi16( vsrc, shiftBits );
174
0
        bands         = _mm_sub_epi16 ( bands, vbaseoffset );
175
0
        bands         = _mm_and_si128 ( bands, _mm_set1_epi16( MAX_NUM_SAO_CLASSES - 1 ) ); // modulo 32 = modulo NUM_SAO_BO_CLASSES_LOG2
176
0
        __m128i mask1 = _mm_cmpgt_epi16( bands, vminus );
177
0
        __m128i mask2 = _mm_cmplt_epi16( bands, vfour );
178
179
0
        __m128i veoffsets = _mm_shuffle_epi8( voffsettbl, bands );
180
0
        veoffsets         = _mm_slli_epi16( veoffsets, 8 );
181
0
        veoffsets         = _mm_srai_epi16( veoffsets, 8 );
182
183
0
        veoffsets = _mm_and_si128( veoffsets, mask1 );
184
0
        veoffsets = _mm_and_si128( veoffsets, mask2 );
185
186
0
        vsrc = _mm_add_epi16( vsrc, veoffsets );
187
0
        vsrc = _mm_min_epi16( _mm_max_epi16( vsrc, vzero ), vibdimax );
188
0
        _mm_store_si128( (__m128i*)&resLine[x], vsrc );
189
190
0
        x += 8;
191
0
      }
192
0
    }
193
0
    srcLine += srcStride;
194
0
    resLine += resStride;
195
0
  }
196
0
}
Unexecuted instantiation: SampleAdaptiveOffset_sse41.cpp:void vvdec::offsetBlock_SIMD_SAO_TYPE_BO<(vvdec::x86_simd::X86_VEXT)1>(int, int*, int, short const*, short*, long, long, int, int)
Unexecuted instantiation: SampleAdaptiveOffset_avx2.cpp:void vvdec::offsetBlock_SIMD_SAO_TYPE_BO<(vvdec::x86_simd::X86_VEXT)4>(int, int*, int, short const*, short*, long, long, int, int)
197
198
template<X86_VEXT vext>
199
static void offsetBlock_SIMD_SAO_TYPE_EO_0( const int            channelBitDepth,
200
                                            const ClpRng&        clpRng,
201
                                            int*                 offset,
202
                                            const Pel*           srcBlk,
203
                                            Pel*                 resBlk,
204
                                            ptrdiff_t            srcStride,
205
                                            ptrdiff_t            resStride,
206
                                            int                  width,
207
                                            int                  height,
208
                                            bool                 isLeftAvail,
209
                                            bool                 isRightAvail,
210
                                            bool                 isAboveAvail,
211
                                            bool                 isBelowAvail,
212
                                            bool                 isAboveLeftAvail,
213
                                            bool                 isAboveRightAvail,
214
                                            bool                 isBelowLeftAvail,
215
                                            bool                 isBelowRightAvail,
216
                                            std::vector<int8_t>* m_signLineBuf1,
217
                                            std::vector<int8_t>* m_signLineBuf2,
218
                                            bool                 isCtuCrossedByVirtualBoundaries,
219
                                            int                  horVirBndryPos[],
220
                                            int                  verVirBndryPos[],
221
                                            int                  numHorVirBndry,
222
                                            int                  numVerVirBndry,
223
                                            uint16_t             bndmask[MAX_CU_SIZE] )
224
0
{
225
0
  const Pel* srcLine = srcBlk;
226
0
  Pel*       resLine = resBlk;
227
228
0
  int    x, y, startX, endX, edgeType;
229
0
  int8_t signLeft, signRight;
230
231
0
  if( isLeftAvail && isRightAvail )
232
0
  {
233
0
    int8_t p_eo_offsets[16] = { 0 };
234
0
    for( int i = 0; i < SAO_EO_NUM_CATEGORIES; i++ )
235
0
    {
236
0
      p_eo_offsets[i] = offset[i];
237
0
    }
238
#if defined( USE_AVX2 )
239
    // AVX2
240
0
    if( ( width & 15 ) == 0 && vext >= AVX2 )
241
0
    {
242
243
      __m256i vsrca, vsrcal, vsrcar, virBmask;
244
245
      __m256i vbaseoffset = _mm256_set1_epi16( 2 );
246
      __m256i vplusone    = _mm256_set1_epi16( 1 );
247
      __m256i vzero       = _mm256_set1_epi8( 0 );
248
      __m256i vibdimax    = _mm256_set1_epi16( ( 1 << channelBitDepth ) - 1 );
249
      __m256i voffsettbl  = _mm256_broadcastsi128_si256( _mm_loadu_si128( (__m128i*)p_eo_offsets ) );
250
251
0
      if( isCtuCrossedByVirtualBoundaries )
252
0
      {
253
0
        for( y = 0; y < height; y++ )
254
0
        {
255
0
          for( x = 0; x < width; x += 16 )
256
0
          {
257
0
            vsrca    = _mm256_loadu_si256( (__m256i*)&srcLine[x] );
258
0
            vsrcal   = _mm256_loadu_si256( (__m256i*)&srcLine[x - 1] );
259
0
            vsrcar   = _mm256_loadu_si256( (__m256i*)&srcLine[x + 1] );
260
0
            virBmask = _mm256_loadu_si256( (__m256i*)&bndmask[x] );
261
262
0
            vsrcal            = _mm256_subs_epi16( vsrca, vsrcal );
263
0
            vsrcar            = _mm256_subs_epi16( vsrca, vsrcar );
264
0
            __m256i vsignl    = _mm256_sign_epi16( vplusone, vsrcal );
265
0
            __m256i vsignr    = _mm256_sign_epi16( vplusone, vsrcar );
266
0
            __m256i vsign     = _mm256_adds_epi16( _mm256_adds_epi16( vsignl, vsignr ), vbaseoffset );
267
0
            __m256i veoffsets = _mm256_shuffle_epi8( voffsettbl, vsign );
268
0
            veoffsets         = _mm256_slli_epi16( veoffsets, 8 );
269
0
            veoffsets         = _mm256_srai_epi16( veoffsets, 8 );
270
271
0
            vsrcal = _mm256_add_epi16( vsrca, veoffsets );
272
0
            vsrcal = _mm256_min_epi16( _mm256_max_epi16( vsrcal, vzero ), vibdimax );
273
274
0
            vsrcar = _mm256_blendv_epi8( vsrcal, vsrca, virBmask );
275
276
0
            _mm256_storeu_si256( (__m256i*)&resLine[x], vsrcar );
277
0
          }
278
0
          srcLine += srcStride;
279
0
          resLine += resStride;
280
0
        }
281
0
      }
282
0
      else
283
0
      {
284
0
        for( y = 0; y < height; y++ )
285
0
        {
286
0
          for( x = 0; x < width; x += 16 )
287
0
          {
288
0
            vsrca             = _mm256_loadu_si256( (__m256i*)&srcLine[x] );
289
0
            vsrcal            = _mm256_loadu_si256( (__m256i*)&srcLine[x - 1] );
290
0
            vsrcar            = _mm256_loadu_si256( (__m256i*)&srcLine[x + 1] );
291
0
            vsrcal            = _mm256_subs_epi16( vsrca, vsrcal );
292
0
            vsrcar            = _mm256_subs_epi16( vsrca, vsrcar );
293
0
            __m256i vsignl    = _mm256_sign_epi16( vplusone, vsrcal );
294
0
            __m256i vsignr    = _mm256_sign_epi16( vplusone, vsrcar );
295
0
            __m256i vsign     = _mm256_adds_epi16( _mm256_adds_epi16( vsignl, vsignr ), vbaseoffset );
296
0
            __m256i veoffsets = _mm256_shuffle_epi8( voffsettbl, vsign );
297
0
            veoffsets         = _mm256_slli_epi16( veoffsets, 8 );
298
0
            veoffsets         = _mm256_srai_epi16( veoffsets, 8 );
299
300
0
            vsrca = _mm256_add_epi16( vsrca, veoffsets );
301
0
            vsrca = _mm256_min_epi16( _mm256_max_epi16( vsrca, vzero ), vibdimax );
302
303
0
            _mm256_storeu_si256( (__m256i*)&resLine[x], vsrca );
304
0
          }
305
0
          srcLine += srcStride;
306
0
          resLine += resStride;
307
0
        }
308
0
      }
309
0
    }
310
0
    else
311
0
#  endif
312
0
    {
313
0
      __m128i vsrca, vsrcal, vsrcar, virBmask;
314
0
      __m128i vbaseoffset = _mm_set1_epi16( 2 );
315
0
      __m128i vplusone    = _mm_set1_epi16( 1 );
316
0
      __m128i vzero       = _mm_set1_epi8( 0 );
317
0
      __m128i vibdimax    = _mm_set1_epi16( ( 1 << channelBitDepth ) - 1 );
318
0
      __m128i voffsettbl  = _mm_loadu_si128( (__m128i*)p_eo_offsets );
319
0
      if( isCtuCrossedByVirtualBoundaries )
320
0
      {
321
0
        for( y = 0; y < height; y++ )
322
0
        {
323
0
          for( x = 0; x < width; x += 8 )
324
0
          {
325
0
            vsrca             = _mm_loadu_si128( (__m128i*)&srcLine[x] );
326
0
            vsrcal            = _mm_loadu_si128( (__m128i*)&srcLine[x - 1] );
327
0
            vsrcar            = _mm_loadu_si128( (__m128i*)&srcLine[x + 1] );
328
0
            virBmask          = _mm_loadu_si128( (__m128i*)&bndmask[x] );
329
0
            vsrcal            = _mm_subs_epi16( vsrca, vsrcal );
330
0
            vsrcar            = _mm_subs_epi16( vsrca, vsrcar );
331
0
            __m128i vsignl    = _mm_sign_epi16( vplusone, vsrcal );
332
0
            __m128i vsignr    = _mm_sign_epi16( vplusone, vsrcar );
333
0
            __m128i vsign     = _mm_adds_epi16( _mm_adds_epi16( vsignl, vsignr ), vbaseoffset );
334
0
            __m128i veoffsets = _mm_shuffle_epi8( voffsettbl, vsign );
335
0
            veoffsets         = _mm_slli_epi16( veoffsets, 8 );
336
0
            veoffsets         = _mm_srai_epi16( veoffsets, 8 );
337
338
0
            vsrcal = _mm_add_epi16( vsrca, veoffsets );
339
0
            vsrcal = _mm_min_epi16( _mm_max_epi16( vsrcal, vzero ), vibdimax );
340
341
0
            vsrcar = _mm_blendv_epi8( vsrcal, vsrca, virBmask );
342
343
0
            _mm_store_si128( (__m128i*)&resLine[x], vsrcar );
344
0
          }
345
0
          srcLine += srcStride;
346
0
          resLine += resStride;
347
0
        }
348
0
      }
349
0
      else
350
0
      {
351
0
        for( y = 0; y < height; y++ )
352
0
        {
353
0
          for( x = 0; x < width; x += 8 )
354
0
          {
355
0
            vsrca             = _mm_loadu_si128( (__m128i*)&srcLine[x] );
356
0
            vsrcal            = _mm_loadu_si128( (__m128i*)&srcLine[x - 1] );
357
0
            vsrcar            = _mm_loadu_si128( (__m128i*)&srcLine[x + 1] );
358
0
            vsrcal            = _mm_subs_epi16( vsrca, vsrcal );
359
0
            vsrcar            = _mm_subs_epi16( vsrca, vsrcar );
360
0
            __m128i vsignl    = _mm_sign_epi16( vplusone, vsrcal );
361
0
            __m128i vsignr    = _mm_sign_epi16( vplusone, vsrcar );
362
0
            __m128i vsign     = _mm_adds_epi16( _mm_adds_epi16( vsignl, vsignr ), vbaseoffset );
363
0
            __m128i veoffsets = _mm_shuffle_epi8( voffsettbl, vsign );
364
0
            veoffsets         = _mm_slli_epi16( veoffsets, 8 );
365
0
            veoffsets         = _mm_srai_epi16( veoffsets, 8 );
366
367
0
            vsrca = _mm_add_epi16( vsrca, veoffsets );
368
0
            vsrca = _mm_min_epi16( _mm_max_epi16( vsrca, vzero ), vibdimax );
369
0
            _mm_store_si128( (__m128i*)&resLine[x], vsrca );
370
0
          }
371
0
          srcLine += srcStride;
372
0
          resLine += resStride;
373
0
        }
374
0
      }
375
0
    }
376
0
  }
377
0
  else
378
0
  {
379
0
    offset += 2;
380
0
    startX = isLeftAvail ? 0 : 1;
381
0
    endX   = isRightAvail ? width : ( width - 1 );
382
0
    for( y = 0; y < height; y++ )
383
0
    {
384
0
      signLeft = (int8_t)sgn( srcLine[startX] - srcLine[startX - 1] );
385
0
      for( x = startX; x < endX; x++ )
386
0
      {
387
0
        signRight = (int8_t)sgn( srcLine[x] - srcLine[x + 1] );
388
0
        if( isCtuCrossedByVirtualBoundaries && isVerProcessDisabled( x, numVerVirBndry, verVirBndryPos ) )
389
0
        {
390
0
          signLeft = -signRight;
391
0
          continue;
392
0
        }
393
0
        edgeType = signRight + signLeft;
394
0
        signLeft = -signRight;
395
396
0
        resLine[x] = ClipPel<int>( srcLine[x] + offset[edgeType], clpRng );
397
0
      }
398
399
0
      srcLine += srcStride;
400
0
      resLine += resStride;
401
0
    }
402
0
  }
403
0
}
Unexecuted instantiation: SampleAdaptiveOffset_sse41.cpp:void vvdec::offsetBlock_SIMD_SAO_TYPE_EO_0<(vvdec::x86_simd::X86_VEXT)1>(int, vvdec::ClpRngTemplate<short> const&, int*, short const*, short*, long, long, int, int, bool, bool, bool, bool, bool, bool, bool, bool, std::__1::vector<signed char, std::__1::allocator<signed char> >*, std::__1::vector<signed char, std::__1::allocator<signed char> >*, bool, int*, int*, int, int, unsigned short*)
Unexecuted instantiation: SampleAdaptiveOffset_avx2.cpp:void vvdec::offsetBlock_SIMD_SAO_TYPE_EO_0<(vvdec::x86_simd::X86_VEXT)4>(int, vvdec::ClpRngTemplate<short> const&, int*, short const*, short*, long, long, int, int, bool, bool, bool, bool, bool, bool, bool, bool, std::__1::vector<signed char, std::__1::allocator<signed char> >*, std::__1::vector<signed char, std::__1::allocator<signed char> >*, bool, int*, int*, int, int, unsigned short*)
404
405
template<X86_VEXT vext>
406
static void offsetBlock_SIMD_SAO_TYPE_EO_90( const int            channelBitDepth,
407
                                             const ClpRng&        clpRng,
408
                                             int*                 offset,
409
                                             const Pel*           srcBlk,
410
                                             Pel*                 resBlk,
411
                                             ptrdiff_t            srcStride,
412
                                             ptrdiff_t            resStride,
413
                                             int                  width,
414
                                             int                  height,
415
                                             bool                 isLeftAvail,
416
                                             bool                 isRightAvail,
417
                                             bool                 isAboveAvail,
418
                                             bool                 isBelowAvail,
419
                                             bool                 isAboveLeftAvail,
420
                                             bool                 isAboveRightAvail,
421
                                             bool                 isBelowLeftAvail,
422
                                             bool                 isBelowRightAvail,
423
                                             std::vector<int8_t>* m_signLineBuf1,
424
                                             std::vector<int8_t>* m_signLineBuf2,
425
                                             bool                 isCtuCrossedByVirtualBoundaries,
426
                                             int                  horVirBndryPos[],
427
                                             int                  verVirBndryPos[],
428
                                             int                  numHorVirBndry,
429
                                             int                  numVerVirBndry,
430
                                             uint16_t             bndmask[MAX_CU_SIZE] )
431
0
{
432
0
  const Pel* srcLine = srcBlk;
433
0
  Pel*       resLine = resBlk;
434
435
0
  int x, y, startY, endY;
436
437
0
  int8_t p_eo_offsets[16] = { 0 };
438
0
  for( int i = 0; i < SAO_EO_NUM_CATEGORIES; i++ )
439
0
  {
440
0
    p_eo_offsets[i] = offset[i];
441
0
  }
442
0
  const Pel* srcLineAbove = srcLine - srcStride;
443
0
  const Pel* srcLineBelow = srcLine + srcStride;
444
0
  startY                  = 0;
445
0
  if( !isAboveAvail )
446
0
  {
447
0
    startY       = 1;
448
0
    srcLineAbove = srcLine;
449
0
    srcLine += srcStride;
450
0
    resLine += resStride;
451
0
    srcLineBelow = srcLine + srcStride;
452
0
  }
453
0
  endY = height;
454
0
  if( !isBelowAvail )
455
0
  {
456
0
    endY = height - 1;
457
0
  }
458
#  if defined( USE_AVX2 )
459
  // AVX2
460
0
  if( ( width & 15 ) == 0 && vext >= AVX2 )
461
0
  {
462
0
    __m256i vsrca, vsrcat, vsrcab;
463
464
    __m256i    vbaseoffset  = _mm256_set1_epi16( 2 );
465
    __m256i    vplusone     = _mm256_set1_epi16( 1 );
466
    __m256i    vzero        = _mm256_set1_epi8( 0 );
467
    __m256i    vibdimax     = _mm256_set1_epi16( ( 1 << channelBitDepth ) - 1 );
468
    __m256i    voffsettbl   = _mm256_broadcastsi128_si256( _mm_loadu_si128( (__m128i*)p_eo_offsets ) );
469
    const Pel* srcLineBelow = srcLine + srcStride;
470
471
0
    if( isCtuCrossedByVirtualBoundaries )
472
0
    {
473
0
      for( y = startY; y < endY; y++ )
474
0
      {
475
0
        if( !isHorProcessDisabled( y, numHorVirBndry, horVirBndryPos ) )
476
0
        {
477
0
          for( x = 0; x < width; x += 16 )
478
0
          {
479
0
            vsrca             = _mm256_loadu_si256( (__m256i*)&srcLine[x] );
480
0
            vsrcat            = _mm256_loadu_si256( (__m256i*)&srcLineAbove[x] );
481
0
            vsrcab            = _mm256_loadu_si256( (__m256i*)&srcLineBelow[x] );
482
0
            vsrcat            = _mm256_subs_epi16( vsrca, vsrcat );
483
0
            vsrcab            = _mm256_subs_epi16( vsrca, vsrcab );
484
0
            __m256i vsignt    = _mm256_sign_epi16( vplusone, vsrcat );
485
0
            __m256i vsignb    = _mm256_sign_epi16( vplusone, vsrcab );
486
0
            __m256i vsign     = _mm256_adds_epi16( _mm256_adds_epi16( vsignt, vsignb ), vbaseoffset );
487
0
            __m256i veoffsets = _mm256_shuffle_epi8( voffsettbl, vsign );
488
0
            veoffsets         = _mm256_slli_epi16( veoffsets, 8 );
489
0
            veoffsets         = _mm256_srai_epi16( veoffsets, 8 );
490
491
0
            vsrca = _mm256_add_epi16( vsrca, veoffsets );
492
0
            vsrca = _mm256_min_epi16( _mm256_max_epi16( vsrca, vzero ), vibdimax );
493
494
0
            _mm256_storeu_si256( (__m256i*)&resLine[x], vsrca );
495
0
          }
496
0
        }
497
0
        srcLine += srcStride;
498
0
        srcLineBelow += srcStride;
499
0
        srcLineAbove += srcStride;
500
0
        resLine += resStride;
501
0
      }
502
0
    }
503
0
    else
504
0
    {
505
0
      for( y = startY; y < endY; y++ )
506
0
      {
507
0
        for( x = 0; x < width; x += 16 )
508
0
        {
509
0
          vsrca             = _mm256_loadu_si256( (__m256i*)&srcLine[x] );
510
0
          vsrcat            = _mm256_loadu_si256( (__m256i*)&srcLineAbove[x] );
511
0
          vsrcab            = _mm256_loadu_si256( (__m256i*)&srcLineBelow[x] );
512
0
          vsrcat            = _mm256_subs_epi16( vsrca, vsrcat );
513
0
          vsrcab            = _mm256_subs_epi16( vsrca, vsrcab );
514
0
          __m256i vsignt    = _mm256_sign_epi16( vplusone, vsrcat );
515
0
          __m256i vsignb    = _mm256_sign_epi16( vplusone, vsrcab );
516
0
          __m256i vsign     = _mm256_adds_epi16( _mm256_adds_epi16( vsignt, vsignb ), vbaseoffset );
517
0
          __m256i veoffsets = _mm256_shuffle_epi8( voffsettbl, vsign );
518
0
          veoffsets         = _mm256_slli_epi16( veoffsets, 8 );
519
0
          veoffsets         = _mm256_srai_epi16( veoffsets, 8 );
520
521
0
          vsrca = _mm256_add_epi16( vsrca, veoffsets );
522
0
          vsrca = _mm256_min_epi16( _mm256_max_epi16( vsrca, vzero ), vibdimax );
523
524
0
          _mm256_storeu_si256( (__m256i*)&resLine[x], vsrca );
525
0
        }
526
0
        srcLine += srcStride;
527
0
        srcLineBelow += srcStride;
528
0
        srcLineAbove += srcStride;
529
0
        resLine += resStride;
530
0
      }
531
0
    }
532
0
  }
533
0
  else
534
0
#  endif
535
0
  {
536
0
    __m128i vsrca, vsrcat, vsrcab;
537
0
    __m128i vbaseoffset = _mm_set1_epi16( 2 );
538
0
    __m128i vplusone    = _mm_set1_epi16( 1 );
539
0
    __m128i vzero       = _mm_set1_epi8( 0 );
540
0
    __m128i vibdimax    = _mm_set1_epi16( ( 1 << channelBitDepth ) - 1 );
541
0
    __m128i voffsettbl  = _mm_loadu_si128( (__m128i*)p_eo_offsets );
542
543
0
    if( isCtuCrossedByVirtualBoundaries )
544
0
    {
545
0
      for( y = startY; y < endY; y++ )
546
0
      {
547
0
        if( !isHorProcessDisabled( y, numHorVirBndry, horVirBndryPos ) )
548
0
        {
549
0
          for( x = 0; x < width; x += 8 )
550
0
          {
551
0
            vsrca             = _mm_loadu_si128( (__m128i*)&srcLine[x] );
552
0
            vsrcat            = _mm_loadu_si128( (__m128i*)&srcLineAbove[x] );
553
0
            vsrcab            = _mm_loadu_si128( (__m128i*)&srcLineBelow[x] );
554
0
            vsrcat            = _mm_subs_epi16( vsrca, vsrcat );
555
0
            vsrcab            = _mm_subs_epi16( vsrca, vsrcab );
556
0
            __m128i vsignt    = _mm_sign_epi16( vplusone, vsrcat );
557
0
            __m128i vsignb    = _mm_sign_epi16( vplusone, vsrcab );
558
0
            __m128i vsign     = _mm_adds_epi16( _mm_adds_epi16( vsignt, vsignb ), vbaseoffset );
559
0
            __m128i veoffsets = _mm_shuffle_epi8( voffsettbl, vsign );
560
0
            veoffsets         = _mm_slli_epi16( veoffsets, 8 );
561
0
            veoffsets         = _mm_srai_epi16( veoffsets, 8 );
562
563
0
            vsrca = _mm_add_epi16( vsrca, veoffsets );
564
0
            vsrca = _mm_min_epi16( _mm_max_epi16( vsrca, vzero ), vibdimax );
565
566
0
            _mm_store_si128( (__m128i*)&resLine[x], vsrca );
567
0
          }
568
0
        }
569
0
        srcLine += srcStride;
570
0
        srcLineBelow += srcStride;
571
0
        srcLineAbove += srcStride;
572
0
        resLine += resStride;
573
0
      }
574
0
    }
575
0
    else
576
0
    {
577
0
      for( y = startY; y < endY; y++ )
578
0
      {
579
0
        for( x = 0; x < width; x += 8 )
580
0
        {
581
0
          vsrca             = _mm_loadu_si128( (__m128i*)&srcLine[x] );
582
0
          vsrcat            = _mm_loadu_si128( (__m128i*)&srcLineAbove[x] );
583
0
          vsrcab            = _mm_loadu_si128( (__m128i*)&srcLineBelow[x] );
584
0
          vsrcat            = _mm_subs_epi16( vsrca, vsrcat );
585
0
          vsrcab            = _mm_subs_epi16( vsrca, vsrcab );
586
0
          __m128i vsignt    = _mm_sign_epi16( vplusone, vsrcat );
587
0
          __m128i vsignb    = _mm_sign_epi16( vplusone, vsrcab );
588
0
          __m128i vsign     = _mm_adds_epi16( _mm_adds_epi16( vsignt, vsignb ), vbaseoffset );
589
0
          __m128i veoffsets = _mm_shuffle_epi8( voffsettbl, vsign );
590
0
          veoffsets         = _mm_slli_epi16( veoffsets, 8 );
591
0
          veoffsets         = _mm_srai_epi16( veoffsets, 8 );
592
593
0
          vsrca = _mm_add_epi16( vsrca, veoffsets );
594
0
          vsrca = _mm_min_epi16( _mm_max_epi16( vsrca, vzero ), vibdimax );
595
0
          _mm_store_si128( (__m128i*)&resLine[x], vsrca );
596
0
        }
597
0
        srcLine += srcStride;
598
0
        srcLineBelow += srcStride;
599
0
        srcLineAbove += srcStride;
600
0
        resLine += resStride;
601
0
      }
602
0
    }
603
0
  }
604
0
}
Unexecuted instantiation: SampleAdaptiveOffset_sse41.cpp:void vvdec::offsetBlock_SIMD_SAO_TYPE_EO_90<(vvdec::x86_simd::X86_VEXT)1>(int, vvdec::ClpRngTemplate<short> const&, int*, short const*, short*, long, long, int, int, bool, bool, bool, bool, bool, bool, bool, bool, std::__1::vector<signed char, std::__1::allocator<signed char> >*, std::__1::vector<signed char, std::__1::allocator<signed char> >*, bool, int*, int*, int, int, unsigned short*)
Unexecuted instantiation: SampleAdaptiveOffset_avx2.cpp:void vvdec::offsetBlock_SIMD_SAO_TYPE_EO_90<(vvdec::x86_simd::X86_VEXT)4>(int, vvdec::ClpRngTemplate<short> const&, int*, short const*, short*, long, long, int, int, bool, bool, bool, bool, bool, bool, bool, bool, std::__1::vector<signed char, std::__1::allocator<signed char> >*, std::__1::vector<signed char, std::__1::allocator<signed char> >*, bool, int*, int*, int, int, unsigned short*)
605
606
template<X86_VEXT vext>
607
static void offsetBlock_SIMD_SAO_TYPE_EO_135( const int            channelBitDepth,
608
                                              const ClpRng&        clpRng,
609
                                              int*                 offset,
610
                                              const Pel*           srcBlk,
611
                                              Pel*                 resBlk,
612
                                              ptrdiff_t            srcStride,
613
                                              ptrdiff_t            resStride,
614
                                              int                  width,
615
                                              int                  height,
616
                                              bool                 isLeftAvail,
617
                                              bool                 isRightAvail,
618
                                              bool                 isAboveAvail,
619
                                              bool                 isBelowAvail,
620
                                              bool                 isAboveLeftAvail,
621
                                              bool                 isAboveRightAvail,
622
                                              bool                 isBelowLeftAvail,
623
                                              bool                 isBelowRightAvail,
624
                                              std::vector<int8_t>* m_signLineBuf1,
625
                                              std::vector<int8_t>* m_signLineBuf2,
626
                                              bool                 isCtuCrossedByVirtualBoundaries,
627
                                              int                  horVirBndryPos[],
628
                                              int                  verVirBndryPos[],
629
                                              int                  numHorVirBndry,
630
                                              int                  numVerVirBndry,
631
                                              uint16_t             bndmask[MAX_CU_SIZE] )
632
0
{
633
0
  const Pel* srcLine = srcBlk;
634
0
  Pel*       resLine = resBlk;
635
636
0
  int    x, y, startX, startY, endX, endY, edgeType;
637
0
  int    firstLineStartX, firstLineEndX, lastLineStartX, lastLineEndX;
638
0
  int8_t signDown;
639
640
0
  if( isLeftAvail && isRightAvail && isAboveLeftAvail && isBelowRightAvail )
641
0
  {
642
0
    int8_t p_eo_offsets[16] = { 0 };
643
0
    for( int i = 0; i < SAO_EO_NUM_CATEGORIES; i++ )
644
0
    {
645
0
      p_eo_offsets[i] = offset[i];
646
0
    }
647
0
    const Pel* srcLineAbove = srcLine - srcStride;
648
0
    const Pel* srcLineBelow = srcLine + srcStride;
649
0
    startY                  = 0;
650
0
    if( !isAboveAvail )
651
0
    {
652
0
      startY       = 1;
653
0
      srcLineAbove = srcLine;
654
0
      srcLine += srcStride;
655
0
      resLine += resStride;
656
0
      srcLineBelow = srcLine + srcStride;
657
0
    }
658
0
    endY = height;
659
0
    if( !isBelowAvail )
660
0
    {
661
0
      endY = height - 1;
662
0
    }
663
#  if defined( USE_AVX2 )
664
    // AVX2
665
0
    if( ( width & 15 ) == 0 && vext >= AVX2 )
666
0
    {
667
0
      __m256i vsrca, vsrcat, vsrcab, virBmask;
668
669
      __m256i    vbaseoffset  = _mm256_set1_epi16( 2 );
670
      __m256i    vplusone     = _mm256_set1_epi16( 1 );
671
      __m256i    vzero        = _mm256_set1_epi8( 0 );
672
      __m256i    vibdimax     = _mm256_set1_epi16( ( 1 << channelBitDepth ) - 1 );
673
      __m256i    voffsettbl   = _mm256_broadcastsi128_si256( _mm_loadu_si128( (__m128i*)p_eo_offsets ) );
674
      const Pel* srcLineBelow = srcLine + srcStride;
675
676
0
      for( y = startY; y < endY; y++ )
677
0
      {
678
0
        if( !isHorProcessDisabled( y, numHorVirBndry, horVirBndryPos ) )
679
0
        {
680
0
          for( x = 0; x < width; x += 16 )
681
0
          {
682
0
            vsrca             = _mm256_loadu_si256( (__m256i*)&srcLine[x] );
683
0
            vsrcat            = _mm256_loadu_si256( (__m256i*)&srcLineAbove[x - 1] );
684
0
            vsrcab            = _mm256_loadu_si256( (__m256i*)&srcLineBelow[x + 1] );
685
0
            virBmask          = _mm256_loadu_si256( (__m256i*)&bndmask[x] );
686
0
            vsrcat            = _mm256_subs_epi16( vsrca, vsrcat );
687
0
            vsrcab            = _mm256_subs_epi16( vsrca, vsrcab );
688
0
            __m256i vsignt    = _mm256_sign_epi16( vplusone, vsrcat );
689
0
            __m256i vsignb    = _mm256_sign_epi16( vplusone, vsrcab );
690
0
            __m256i vsign     = _mm256_adds_epi16( _mm256_adds_epi16( vsignt, vsignb ), vbaseoffset );
691
0
            __m256i veoffsets = _mm256_shuffle_epi8( voffsettbl, vsign );
692
0
            veoffsets         = _mm256_slli_epi16( veoffsets, 8 );
693
0
            veoffsets         = _mm256_srai_epi16( veoffsets, 8 );
694
695
0
            vsrcat = _mm256_add_epi16( vsrca, veoffsets );
696
0
            vsrcat = _mm256_min_epi16( _mm256_max_epi16( vsrcat, vzero ), vibdimax );
697
698
0
            vsrcab = _mm256_blendv_epi8( vsrcat, vsrca, virBmask );
699
700
0
            _mm256_storeu_si256( (__m256i*)&resLine[x], vsrcab );
701
0
          }
702
0
        }
703
0
        srcLine += srcStride;
704
0
        srcLineBelow += srcStride;
705
0
        srcLineAbove += srcStride;
706
0
        resLine += resStride;
707
0
      }
708
0
    }
709
0
    else
710
0
#  endif
711
0
    {
712
0
      __m128i vsrca, vsrcat, vsrcab, virBmask;
713
0
      __m128i vbaseoffset = _mm_set1_epi16( 2 );
714
0
      __m128i vplusone    = _mm_set1_epi16( 1 );
715
0
      __m128i vzero       = _mm_set1_epi8( 0 );
716
0
      __m128i vibdimax    = _mm_set1_epi16( ( 1 << channelBitDepth ) - 1 );
717
0
      __m128i voffsettbl  = _mm_loadu_si128( (__m128i*)p_eo_offsets );
718
719
0
      for( y = startY; y < endY; y++ )
720
0
      {
721
0
        if( !isHorProcessDisabled( y, numHorVirBndry, horVirBndryPos ) )
722
0
        {
723
0
          for( x = 0; x < width; x += 8 )
724
0
          {
725
0
            vsrca             = _mm_loadu_si128( (__m128i*)&srcLine[x] );
726
0
            vsrcat            = _mm_loadu_si128( (__m128i*)&srcLineAbove[x - 1] );
727
0
            vsrcab            = _mm_loadu_si128( (__m128i*)&srcLineBelow[x + 1] );
728
0
            virBmask          = _mm_loadu_si128( (__m128i*)&bndmask[x] );
729
0
            vsrcat            = _mm_subs_epi16( vsrca, vsrcat );
730
0
            vsrcab            = _mm_subs_epi16( vsrca, vsrcab );
731
0
            __m128i vsignt    = _mm_sign_epi16( vplusone, vsrcat );
732
0
            __m128i vsignb    = _mm_sign_epi16( vplusone, vsrcab );
733
0
            __m128i vsign     = _mm_adds_epi16( _mm_adds_epi16( vsignt, vsignb ), vbaseoffset );
734
0
            __m128i veoffsets = _mm_shuffle_epi8( voffsettbl, vsign );
735
0
            veoffsets         = _mm_slli_epi16( veoffsets, 8 );
736
0
            veoffsets         = _mm_srai_epi16( veoffsets, 8 );
737
738
0
            vsrcat = _mm_add_epi16( vsrca, veoffsets );
739
0
            vsrcat = _mm_min_epi16( _mm_max_epi16( vsrcat, vzero ), vibdimax );
740
741
0
            vsrcab = _mm_blendv_epi8( vsrcat, vsrca, virBmask );
742
743
0
            _mm_store_si128( (__m128i*)&resLine[x], vsrcab );
744
0
          }
745
0
        }
746
0
        srcLine += srcStride;
747
0
        srcLineBelow += srcStride;
748
0
        srcLineAbove += srcStride;
749
0
        resLine += resStride;
750
0
      }
751
0
    }
752
0
  }
753
0
  else
754
0
  {
755
0
    offset += 2;
756
0
    int8_t *signUpLine, *signDownLine, *signTmpLine;
757
758
0
    signUpLine   = &m_signLineBuf1->front();
759
0
    signDownLine = &m_signLineBuf2->front();
760
761
0
    startX = isLeftAvail ? 0 : 1;
762
0
    endX   = isRightAvail ? width : ( width - 1 );
763
764
    // prepare 2nd line's upper sign
765
0
    const Pel* srcLineBelow = srcLine + srcStride;
766
0
    for( x = startX; x < endX + 1; x++ )
767
0
    {
768
0
      signUpLine[x] = (int8_t)sgn( srcLineBelow[x] - srcLine[x - 1] );
769
0
    }
770
0
    if( isCtuCrossedByVirtualBoundaries )
771
0
    {
772
      // 1st line
773
0
      const Pel* srcLineAbove = srcLine - srcStride;
774
0
      firstLineStartX         = isAboveLeftAvail ? 0 : 1;
775
0
      firstLineEndX           = isAboveAvail ? endX : 1;
776
0
      if( !isHorProcessDisabled( 0, numHorVirBndry, horVirBndryPos ) )
777
0
      {
778
0
        for( x = firstLineStartX; x < firstLineEndX; x++ )
779
0
        {
780
0
          if( isVerProcessDisabled( x, numVerVirBndry, verVirBndryPos ) )
781
0
          {
782
0
            continue;
783
0
          }
784
0
          edgeType = sgn( srcLine[x] - srcLineAbove[x - 1] ) - signUpLine[x + 1];
785
786
0
          resLine[x] = ClipPel<int>( srcLine[x] + offset[edgeType], clpRng );
787
0
        }
788
0
      }
789
0
      srcLine += srcStride;
790
0
      resLine += resStride;
791
      // middle lines
792
0
      for( y = 1; y < height - 1; y++ )
793
0
      {
794
0
        if( isHorProcessDisabled( y, numHorVirBndry, horVirBndryPos ) )
795
0
        {
796
0
          srcLineBelow = srcLine + srcStride;
797
0
          for( x = startX; x < endX; x++ )
798
0
          {
799
0
            signDown            = (int8_t)sgn( srcLine[x] - srcLineBelow[x + 1] );
800
0
            signDownLine[x + 1] = -signDown;
801
0
          }
802
0
          signDownLine[startX] = (int8_t)sgn( srcLineBelow[startX] - srcLine[startX - 1] );
803
0
          signTmpLine          = signUpLine;
804
0
          signUpLine           = signDownLine;
805
0
          signDownLine         = signTmpLine;
806
0
          srcLine += srcStride;
807
0
          resLine += resStride;
808
0
        }
809
0
        else
810
0
        {
811
0
          srcLineBelow = srcLine + srcStride;
812
0
          for( x = startX; x < endX; x++ )
813
0
          {
814
0
            signDown = (int8_t)sgn( srcLine[x] - srcLineBelow[x + 1] );
815
0
            if( isVerProcessDisabled( x, numVerVirBndry, verVirBndryPos ) )
816
0
            {
817
0
              signDownLine[x + 1] = -signDown;
818
0
              continue;
819
0
            }
820
0
            edgeType   = signDown + signUpLine[x];
821
0
            resLine[x] = ClipPel<int>( srcLine[x] + offset[edgeType], clpRng );
822
823
0
            signDownLine[x + 1] = -signDown;
824
0
          }
825
0
          signDownLine[startX] = (int8_t)sgn( srcLineBelow[startX] - srcLine[startX - 1] );
826
0
          signTmpLine          = signUpLine;
827
0
          signUpLine           = signDownLine;
828
0
          signDownLine         = signTmpLine;
829
0
          srcLine += srcStride;
830
0
          resLine += resStride;
831
0
        }
832
0
      }
833
      // last line
834
0
      srcLineBelow   = srcLine + srcStride;
835
0
      lastLineStartX = isBelowAvail ? startX : ( width - 1 );
836
0
      lastLineEndX   = isBelowRightAvail ? width : ( width - 1 );
837
0
      if( !isHorProcessDisabled( height - 1, numHorVirBndry, horVirBndryPos ) )
838
0
      {
839
0
        for( x = lastLineStartX; x < lastLineEndX; x++ )
840
0
        {
841
0
          if( isVerProcessDisabled( x, numVerVirBndry, verVirBndryPos ) )
842
0
          {
843
0
            continue;
844
0
          }
845
0
          edgeType   = sgn( srcLine[x] - srcLineBelow[x + 1] ) + signUpLine[x];
846
0
          resLine[x] = ClipPel<int>( srcLine[x] + offset[edgeType], clpRng );
847
0
        }
848
0
      }
849
0
    }
850
0
    else
851
0
    {
852
      // 1st line
853
0
      const Pel* srcLineAbove = srcLine - srcStride;
854
0
      firstLineStartX         = isAboveLeftAvail ? 0 : 1;
855
0
      firstLineEndX           = isAboveAvail ? endX : 1;
856
0
      for( x = firstLineStartX; x < firstLineEndX; x++ )
857
0
      {
858
0
        edgeType   = sgn( srcLine[x] - srcLineAbove[x - 1] ) - signUpLine[x + 1];
859
0
        resLine[x] = ClipPel<int>( srcLine[x] + offset[edgeType], clpRng );
860
0
      }
861
0
      srcLine += srcStride;
862
0
      resLine += resStride;
863
      // middle lines
864
0
      for( y = 1; y < height - 1; y++ )
865
0
      {
866
0
        srcLineBelow = srcLine + srcStride;
867
0
        for( x = startX; x < endX; x++ )
868
0
        {
869
0
          signDown            = (int8_t)sgn( srcLine[x] - srcLineBelow[x + 1] );
870
0
          edgeType            = signDown + signUpLine[x];
871
0
          resLine[x]          = ClipPel<int>( srcLine[x] + offset[edgeType], clpRng );
872
0
          signDownLine[x + 1] = -signDown;
873
0
        }
874
0
        signDownLine[startX] = (int8_t)sgn( srcLineBelow[startX] - srcLine[startX - 1] );
875
0
        signTmpLine          = signUpLine;
876
0
        signUpLine           = signDownLine;
877
0
        signDownLine         = signTmpLine;
878
0
        srcLine += srcStride;
879
0
        resLine += resStride;
880
0
      }
881
      // last line
882
0
      srcLineBelow   = srcLine + srcStride;
883
0
      lastLineStartX = isBelowAvail ? startX : ( width - 1 );
884
0
      lastLineEndX   = isBelowRightAvail ? width : ( width - 1 );
885
0
      for( x = lastLineStartX; x < lastLineEndX; x++ )
886
0
      {
887
0
        edgeType   = sgn( srcLine[x] - srcLineBelow[x + 1] ) + signUpLine[x];
888
0
        resLine[x] = ClipPel<int>( srcLine[x] + offset[edgeType], clpRng );
889
0
      }
890
0
    }
891
0
  }
892
0
}
Unexecuted instantiation: SampleAdaptiveOffset_sse41.cpp:void vvdec::offsetBlock_SIMD_SAO_TYPE_EO_135<(vvdec::x86_simd::X86_VEXT)1>(int, vvdec::ClpRngTemplate<short> const&, int*, short const*, short*, long, long, int, int, bool, bool, bool, bool, bool, bool, bool, bool, std::__1::vector<signed char, std::__1::allocator<signed char> >*, std::__1::vector<signed char, std::__1::allocator<signed char> >*, bool, int*, int*, int, int, unsigned short*)
Unexecuted instantiation: SampleAdaptiveOffset_avx2.cpp:void vvdec::offsetBlock_SIMD_SAO_TYPE_EO_135<(vvdec::x86_simd::X86_VEXT)4>(int, vvdec::ClpRngTemplate<short> const&, int*, short const*, short*, long, long, int, int, bool, bool, bool, bool, bool, bool, bool, bool, std::__1::vector<signed char, std::__1::allocator<signed char> >*, std::__1::vector<signed char, std::__1::allocator<signed char> >*, bool, int*, int*, int, int, unsigned short*)
893
894
template<X86_VEXT vext>
895
static void offsetBlock_SIMD_SAO_TYPE_EO_45( const int            channelBitDepth,
896
                                             const ClpRng&        clpRng,
897
                                             int*                 offset,
898
                                             const Pel*           srcBlk,
899
                                             Pel*                 resBlk,
900
                                             ptrdiff_t            srcStride,
901
                                             ptrdiff_t            resStride,
902
                                             int                  width,
903
                                             int                  height,
904
                                             bool                 isLeftAvail,
905
                                             bool                 isRightAvail,
906
                                             bool                 isAboveAvail,
907
                                             bool                 isBelowAvail,
908
                                             bool                 isAboveLeftAvail,
909
                                             bool                 isAboveRightAvail,
910
                                             bool                 isBelowLeftAvail,
911
                                             bool                 isBelowRightAvail,
912
                                             std::vector<int8_t>* m_signLineBuf1,
913
                                             std::vector<int8_t>* m_signLineBuf2,
914
                                             bool                 isCtuCrossedByVirtualBoundaries,
915
                                             int                  horVirBndryPos[],
916
                                             int                  verVirBndryPos[],
917
                                             int                  numHorVirBndry,
918
                                             int                  numVerVirBndry,
919
                                             uint16_t             bndmask[MAX_CU_SIZE] )
920
0
{
921
0
  const Pel* srcLine = srcBlk;
922
0
  Pel*       resLine = resBlk;
923
924
0
  int    x, y, startX, startY, endX, endY, edgeType;
925
0
  int    firstLineStartX, firstLineEndX, lastLineStartX, lastLineEndX;
926
0
  int8_t signDown;
927
928
0
  if( isLeftAvail && isRightAvail && isAboveLeftAvail && isBelowRightAvail )
929
0
  {
930
0
    int8_t p_eo_offsets[16] = { 0 };
931
0
    for( int i = 0; i < SAO_EO_NUM_CATEGORIES; i++ )
932
0
    {
933
0
      p_eo_offsets[i] = offset[i];
934
0
    }
935
0
    const Pel* srcLineAbove = srcLine - srcStride;
936
0
    const Pel* srcLineBelow = srcLine + srcStride;
937
0
    startY                  = 0;
938
0
    if( !isAboveAvail )
939
0
    {
940
0
      startY       = 1;
941
0
      srcLineAbove = srcLine;
942
0
      srcLine += srcStride;
943
0
      resLine += resStride;
944
0
      srcLineBelow = srcLine + srcStride;
945
0
    }
946
0
    endY = height;
947
0
    if( !isBelowAvail )
948
0
    {
949
0
      endY = height - 1;
950
0
    }
951
#  if defined( USE_AVX2 )
952
    // AVX2
953
0
    if( ( width & 15 ) == 0 && vext >= AVX2 )
954
0
    {
955
0
      __m256i    virBmask;
956
0
      __m256i    vsrca, vsrcat, vsrcab;
957
0
      __m256i    vbaseoffset  = _mm256_set1_epi16( 2 );
958
0
      __m256i    vplusone     = _mm256_set1_epi16( 1 );
959
0
      __m256i    vzero        = _mm256_set1_epi8( 0 );
960
0
      __m256i    vibdimax     = _mm256_set1_epi16( ( 1 << channelBitDepth ) - 1 );
961
0
      __m256i    voffsettbl   = _mm256_broadcastsi128_si256( _mm_loadu_si128( (__m128i*)p_eo_offsets ) );
962
0
      const Pel* srcLineBelow = srcLine + srcStride;
963
964
0
      for( y = startY; y < endY; y++ )
965
0
      {
966
        //         printf("y %d endY %d x%d y %d \n"y,endY,isAboveAvail,)
967
0
        if( !isHorProcessDisabled( y, numHorVirBndry, horVirBndryPos ) )
968
0
        {
969
0
          for( x = 0; x < width; x += 16 )
970
0
          {
971
0
            virBmask = _mm256_loadu_si256( (__m256i*)&bndmask[x] );
972
0
            vsrca    = _mm256_loadu_si256( (__m256i*)&srcLine[x] );
973
0
            vsrcat   = _mm256_loadu_si256( (__m256i*)&srcLineAbove[x + 1] );
974
0
            vsrcab   = _mm256_loadu_si256( (__m256i*)&srcLineBelow[x - 1] );
975
976
0
            vsrcat            = _mm256_subs_epi16( vsrca, vsrcat );
977
0
            vsrcab            = _mm256_subs_epi16( vsrca, vsrcab );
978
0
            __m256i vsignt    = _mm256_sign_epi16( vplusone, vsrcat );
979
0
            __m256i vsignb    = _mm256_sign_epi16( vplusone, vsrcab );
980
0
            __m256i vsign     = _mm256_adds_epi16( _mm256_adds_epi16( vsignt, vsignb ), vbaseoffset );
981
0
            __m256i veoffsets = _mm256_shuffle_epi8( voffsettbl, vsign );
982
0
            veoffsets         = _mm256_slli_epi16( veoffsets, 8 );
983
0
            veoffsets         = _mm256_srai_epi16( veoffsets, 8 );
984
985
0
            vsrcat = _mm256_add_epi16( vsrca, veoffsets );
986
0
            vsrcat = _mm256_min_epi16( _mm256_max_epi16( vsrcat, vzero ), vibdimax );
987
988
0
            vsrcab = _mm256_blendv_epi8( vsrcat, vsrca, virBmask );
989
990
0
            _mm256_storeu_si256( (__m256i*)&resLine[x], vsrcab );
991
0
          }
992
0
        }
993
0
        srcLine += srcStride;
994
0
        srcLineBelow += srcStride;
995
0
        srcLineAbove += srcStride;
996
0
        resLine += resStride;
997
0
      }
998
0
    }
999
0
    else
1000
0
#  endif
1001
0
    {
1002
0
      __m128i vsrca, vsrcat, vsrcab, virBmask;
1003
0
      __m128i vbaseoffset = _mm_set1_epi16( 2 );
1004
0
      __m128i vplusone    = _mm_set1_epi16( 1 );
1005
0
      __m128i vzero       = _mm_set1_epi8( 0 );
1006
0
      __m128i vibdimax    = _mm_set1_epi16( ( 1 << channelBitDepth ) - 1 );
1007
0
      __m128i voffsettbl  = _mm_loadu_si128( (__m128i*)p_eo_offsets );
1008
1009
0
      for( y = startY; y < endY; y++ )
1010
0
      {
1011
0
        if( !isHorProcessDisabled( y, numHorVirBndry, horVirBndryPos ) )
1012
0
        {
1013
0
          for( x = 0; x < width; x += 8 )
1014
0
          {
1015
0
            vsrca    = _mm_loadu_si128( (__m128i*)&srcLine[x] );
1016
0
            vsrcat   = _mm_loadu_si128( (__m128i*)&srcLineAbove[x + 1] );
1017
0
            virBmask = _mm_loadu_si128( (__m128i*)&bndmask[x] );
1018
1019
0
            vsrcab            = _mm_loadu_si128( (__m128i*)&srcLineBelow[x - 1] );
1020
0
            vsrcat            = _mm_subs_epi16( vsrca, vsrcat );
1021
0
            vsrcab            = _mm_subs_epi16( vsrca, vsrcab );
1022
0
            __m128i vsignt    = _mm_sign_epi16( vplusone, vsrcat );
1023
0
            __m128i vsignb    = _mm_sign_epi16( vplusone, vsrcab );
1024
0
            __m128i vsign     = _mm_adds_epi16( _mm_adds_epi16( vsignt, vsignb ), vbaseoffset );
1025
0
            __m128i veoffsets = _mm_shuffle_epi8( voffsettbl, vsign );
1026
0
            veoffsets         = _mm_slli_epi16( veoffsets, 8 );
1027
0
            veoffsets         = _mm_srai_epi16( veoffsets, 8 );
1028
1029
0
            vsrcat = _mm_add_epi16( vsrca, veoffsets );
1030
0
            vsrcat = _mm_min_epi16( _mm_max_epi16( vsrcat, vzero ), vibdimax );
1031
1032
0
            vsrcab = _mm_blendv_epi8( vsrcat, vsrca, virBmask );
1033
1034
0
            _mm_store_si128( (__m128i*)&resLine[x], vsrcab );
1035
0
          }
1036
0
        }
1037
0
        srcLine += srcStride;
1038
0
        srcLineBelow += srcStride;
1039
0
        srcLineAbove += srcStride;
1040
0
        resLine += resStride;
1041
0
      }
1042
0
    }
1043
0
  }
1044
0
  else
1045
0
  {
1046
0
    offset += 2;
1047
0
    int8_t* signUpLine = &m_signLineBuf1->at( 1 );
1048
0
    startX             = isLeftAvail ? 0 : 1;
1049
0
    endX               = isRightAvail ? width : ( width - 1 );
1050
    // prepare 2nd line upper sign
1051
0
    const Pel* srcLineBelow = srcLine + srcStride;
1052
0
    for( x = startX - 1; x < endX; x++ )
1053
0
    {
1054
0
      signUpLine[x] = (int8_t)sgn( srcLineBelow[x] - srcLine[x + 1] );
1055
0
    }
1056
1057
0
    if( isCtuCrossedByVirtualBoundaries )
1058
0
    {
1059
      // first line
1060
0
      const Pel* srcLineAbove = srcLine - srcStride;
1061
0
      firstLineStartX         = isAboveAvail ? startX : ( width - 1 );
1062
0
      firstLineEndX           = isAboveRightAvail ? width : ( width - 1 );
1063
0
      if( !isHorProcessDisabled( 0, numHorVirBndry, horVirBndryPos ) )
1064
0
      {
1065
0
        for( x = firstLineStartX; x < firstLineEndX; x++ )
1066
0
        {
1067
0
          if( isVerProcessDisabled( x, numVerVirBndry, verVirBndryPos ) )
1068
0
          {
1069
0
            continue;
1070
0
          }
1071
0
          edgeType   = sgn( srcLine[x] - srcLineAbove[x + 1] ) - signUpLine[x - 1];
1072
0
          resLine[x] = ClipPel<int>( srcLine[x] + offset[edgeType], clpRng );
1073
0
        }
1074
0
      }
1075
0
      srcLine += srcStride;
1076
0
      resLine += resStride;
1077
      // middle lines
1078
0
      for( y = 1; y < height - 1; y++ )
1079
0
      {
1080
0
        if( isHorProcessDisabled( y, numHorVirBndry, horVirBndryPos ) )
1081
0
        {
1082
0
          srcLineBelow = srcLine + ( srcStride );
1083
0
          for( x = startX; x < endX; x++ )
1084
0
          {
1085
0
            signDown          = (int8_t)sgn( srcLine[x] - srcLineBelow[x - 1] );
1086
0
            signUpLine[x - 1] = -signDown;
1087
0
          }
1088
0
          signUpLine[endX - 1] = (int8_t)sgn( srcLineBelow[endX - 1] - srcLine[endX] );
1089
0
          srcLine += srcStride;
1090
0
          resLine += ( resStride );
1091
0
        }
1092
0
        else
1093
0
        {
1094
0
          srcLineBelow = srcLine + srcStride;
1095
0
          for( x = startX; x < endX; x++ )
1096
0
          {
1097
0
            signDown = (int8_t)sgn( srcLine[x] - srcLineBelow[x - 1] );
1098
0
            if( isVerProcessDisabled( x, numVerVirBndry, verVirBndryPos ) )
1099
0
            {
1100
0
              signUpLine[x - 1] = -signDown;
1101
0
              continue;
1102
0
            }
1103
0
            edgeType          = signDown + signUpLine[x];
1104
0
            resLine[x]        = ClipPel<int>( srcLine[x] + offset[edgeType], clpRng );
1105
0
            signUpLine[x - 1] = -signDown;
1106
0
          }
1107
0
          signUpLine[endX - 1] = (int8_t)sgn( srcLineBelow[endX - 1] - srcLine[endX] );
1108
0
          srcLine += srcStride;
1109
0
          resLine += resStride;
1110
0
        }
1111
0
      }
1112
      // last line
1113
0
      srcLineBelow   = srcLine + srcStride;
1114
0
      lastLineStartX = isBelowLeftAvail ? 0 : 1;
1115
0
      lastLineEndX   = isBelowAvail ? endX : 1;
1116
0
      if( !isHorProcessDisabled( height - 1, numHorVirBndry, horVirBndryPos ) )
1117
0
      {
1118
0
        for( x = lastLineStartX; x < lastLineEndX; x++ )
1119
0
        {
1120
0
          if( isVerProcessDisabled( x, numVerVirBndry, verVirBndryPos ) )
1121
0
          {
1122
0
            continue;
1123
0
          }
1124
0
          edgeType   = sgn( srcLine[x] - srcLineBelow[x - 1] ) + signUpLine[x];
1125
0
          resLine[x] = ClipPel<int>( srcLine[x] + offset[edgeType], clpRng );
1126
0
        }
1127
0
      }
1128
0
    }
1129
0
    else
1130
0
    {
1131
      // first line
1132
0
      const Pel* srcLineAbove = srcLine - srcStride;
1133
0
      firstLineStartX         = isAboveAvail ? startX : ( width - 1 );
1134
0
      firstLineEndX           = isAboveRightAvail ? width : ( width - 1 );
1135
0
      for( x = firstLineStartX; x < firstLineEndX; x++ )
1136
0
      {
1137
0
        edgeType   = sgn( srcLine[x] - srcLineAbove[x + 1] ) - signUpLine[x - 1];
1138
0
        resLine[x] = ClipPel<int>( srcLine[x] + offset[edgeType], clpRng );
1139
0
      }
1140
0
      srcLine += srcStride;
1141
0
      resLine += resStride;
1142
      // middle lines
1143
0
      for( y = 1; y < height - 1; y++ )
1144
0
      {
1145
0
        srcLineBelow = srcLine + srcStride;
1146
0
        for( x = startX; x < endX; x++ )
1147
0
        {
1148
0
          signDown          = (int8_t)sgn( srcLine[x] - srcLineBelow[x - 1] );
1149
0
          edgeType          = signDown + signUpLine[x];
1150
0
          resLine[x]        = ClipPel<int>( srcLine[x] + offset[edgeType], clpRng );
1151
0
          signUpLine[x - 1] = -signDown;
1152
0
        }
1153
0
        signUpLine[endX - 1] = (int8_t)sgn( srcLineBelow[endX - 1] - srcLine[endX] );
1154
0
        srcLine += srcStride;
1155
0
        resLine += resStride;
1156
0
      }
1157
      // last line
1158
0
      srcLineBelow   = srcLine + srcStride;
1159
0
      lastLineStartX = isBelowLeftAvail ? 0 : 1;
1160
0
      lastLineEndX   = isBelowAvail ? endX : 1;
1161
0
      for( x = lastLineStartX; x < lastLineEndX; x++ )
1162
0
      {
1163
0
        edgeType   = sgn( srcLine[x] - srcLineBelow[x - 1] ) + signUpLine[x];
1164
0
        resLine[x] = ClipPel<int>( srcLine[x] + offset[edgeType], clpRng );
1165
0
      }
1166
0
    }
1167
0
  }
1168
0
}
Unexecuted instantiation: SampleAdaptiveOffset_sse41.cpp:void vvdec::offsetBlock_SIMD_SAO_TYPE_EO_45<(vvdec::x86_simd::X86_VEXT)1>(int, vvdec::ClpRngTemplate<short> const&, int*, short const*, short*, long, long, int, int, bool, bool, bool, bool, bool, bool, bool, bool, std::__1::vector<signed char, std::__1::allocator<signed char> >*, std::__1::vector<signed char, std::__1::allocator<signed char> >*, bool, int*, int*, int, int, unsigned short*)
Unexecuted instantiation: SampleAdaptiveOffset_avx2.cpp:void vvdec::offsetBlock_SIMD_SAO_TYPE_EO_45<(vvdec::x86_simd::X86_VEXT)4>(int, vvdec::ClpRngTemplate<short> const&, int*, short const*, short*, long, long, int, int, bool, bool, bool, bool, bool, bool, bool, bool, std::__1::vector<signed char, std::__1::allocator<signed char> >*, std::__1::vector<signed char, std::__1::allocator<signed char> >*, bool, int*, int*, int, int, unsigned short*)
1169
1170
template<X86_VEXT vext>
1171
void offsetBlock_SIMD( const int            channelBitDepth,
1172
                       const ClpRng&        clpRng,
1173
                       int                  typeIdx,
1174
                       int*                 offset,
1175
                       int                  startIdx,
1176
                       const Pel*           srcBlk,
1177
                       Pel*                 resBlk,
1178
                       ptrdiff_t            srcStride,
1179
                       ptrdiff_t            resStride,
1180
                       int                  width,
1181
                       int                  height,
1182
                       bool                 isLeftAvail,
1183
                       bool                 isRightAvail,
1184
                       bool                 isAboveAvail,
1185
                       bool                 isBelowAvail,
1186
                       bool                 isAboveLeftAvail,
1187
                       bool                 isAboveRightAvail,
1188
                       bool                 isBelowLeftAvail,
1189
                       bool                 isBelowRightAvail,
1190
                       std::vector<int8_t>* m_signLineBuf1,
1191
                       std::vector<int8_t>* m_signLineBuf2,
1192
                       bool                 isCtuCrossedByVirtualBoundaries,
1193
                       int                  horVirBndryPos[],
1194
                       int                  verVirBndryPos[],
1195
                       int                  numHorVirBndry,
1196
                       int                  numVerVirBndry )
1197
0
{
1198
0
  if( typeIdx == SAO_TYPE_BO )
1199
0
  {
1200
0
    offsetBlock_SIMD_SAO_TYPE_BO<vext>( channelBitDepth, offset, startIdx, srcBlk, resBlk, srcStride, resStride, width, height );
1201
1202
#  if USE_AVX2
1203
    _mm256_zeroupper();
1204
#  endif
1205
0
    return;
1206
0
  }
1207
1208
0
  uint16_t bndmask[MAX_CU_SIZE];
1209
0
  memset( &bndmask, 0, MAX_CU_SIZE * sizeof( uint16_t ) );
1210
0
  if( isCtuCrossedByVirtualBoundaries && numVerVirBndry > 0 )
1211
0
  {
1212
0
    for( int i = 0; i < numVerVirBndry; i++ )
1213
0
    {
1214
0
      if( verVirBndryPos[i] >= 0 && verVirBndryPos[i] < width )
1215
0
      {
1216
0
        bndmask[verVirBndryPos[i]] = 0xffff;
1217
0
      }
1218
1219
0
      if( verVirBndryPos[i] - 1 >= 0 && verVirBndryPos[i] - 1 < width )
1220
0
      {
1221
0
        bndmask[verVirBndryPos[i] - 1] = 0xffff;
1222
0
      }
1223
0
    }
1224
0
  }
1225
1226
0
  switch( typeIdx )
1227
0
  {
1228
0
  case SAO_TYPE_EO_0:
1229
0
    offsetBlock_SIMD_SAO_TYPE_EO_0<vext>( channelBitDepth,
1230
0
                                          clpRng,
1231
0
                                          offset,
1232
0
                                          srcBlk,
1233
0
                                          resBlk,
1234
0
                                          srcStride,
1235
0
                                          resStride,
1236
0
                                          width,
1237
0
                                          height,
1238
0
                                          isLeftAvail,
1239
0
                                          isRightAvail,
1240
0
                                          isAboveAvail,
1241
0
                                          isBelowAvail,
1242
0
                                          isAboveLeftAvail,
1243
0
                                          isAboveRightAvail,
1244
0
                                          isBelowLeftAvail,
1245
0
                                          isBelowRightAvail,
1246
0
                                          m_signLineBuf1,
1247
0
                                          m_signLineBuf2,
1248
0
                                          isCtuCrossedByVirtualBoundaries,
1249
0
                                          horVirBndryPos,
1250
0
                                          verVirBndryPos,
1251
0
                                          numHorVirBndry,
1252
0
                                          numVerVirBndry,
1253
0
                                          bndmask );
1254
0
    break;
1255
1256
0
  case SAO_TYPE_EO_90:
1257
0
    offsetBlock_SIMD_SAO_TYPE_EO_90<vext>( channelBitDepth,
1258
0
                                           clpRng,
1259
0
                                           offset,
1260
0
                                           srcBlk,
1261
0
                                           resBlk,
1262
0
                                           srcStride,
1263
0
                                           resStride,
1264
0
                                           width,
1265
0
                                           height,
1266
0
                                           isLeftAvail,
1267
0
                                           isRightAvail,
1268
0
                                           isAboveAvail,
1269
0
                                           isBelowAvail,
1270
0
                                           isAboveLeftAvail,
1271
0
                                           isAboveRightAvail,
1272
0
                                           isBelowLeftAvail,
1273
0
                                           isBelowRightAvail,
1274
0
                                           m_signLineBuf1,
1275
0
                                           m_signLineBuf2,
1276
0
                                           isCtuCrossedByVirtualBoundaries,
1277
0
                                           horVirBndryPos,
1278
0
                                           verVirBndryPos,
1279
0
                                           numHorVirBndry,
1280
0
                                           numVerVirBndry,
1281
0
                                           bndmask );
1282
0
    break;
1283
1284
0
  case SAO_TYPE_EO_135:
1285
0
    offsetBlock_SIMD_SAO_TYPE_EO_135<vext>( channelBitDepth,
1286
0
                                            clpRng,
1287
0
                                            offset,
1288
0
                                            srcBlk,
1289
0
                                            resBlk,
1290
0
                                            srcStride,
1291
0
                                            resStride,
1292
0
                                            width,
1293
0
                                            height,
1294
0
                                            isLeftAvail,
1295
0
                                            isRightAvail,
1296
0
                                            isAboveAvail,
1297
0
                                            isBelowAvail,
1298
0
                                            isAboveLeftAvail,
1299
0
                                            isAboveRightAvail,
1300
0
                                            isBelowLeftAvail,
1301
0
                                            isBelowRightAvail,
1302
0
                                            m_signLineBuf1,
1303
0
                                            m_signLineBuf2,
1304
0
                                            isCtuCrossedByVirtualBoundaries,
1305
0
                                            horVirBndryPos,
1306
0
                                            verVirBndryPos,
1307
0
                                            numHorVirBndry,
1308
0
                                            numVerVirBndry,
1309
0
                                            bndmask );
1310
0
    break;
1311
1312
0
  case SAO_TYPE_EO_45:
1313
0
    offsetBlock_SIMD_SAO_TYPE_EO_45<vext>( channelBitDepth,
1314
0
                                           clpRng,
1315
0
                                           offset,
1316
0
                                           srcBlk,
1317
0
                                           resBlk,
1318
0
                                           srcStride,
1319
0
                                           resStride,
1320
0
                                           width,
1321
0
                                           height,
1322
0
                                           isLeftAvail,
1323
0
                                           isRightAvail,
1324
0
                                           isAboveAvail,
1325
0
                                           isBelowAvail,
1326
0
                                           isAboveLeftAvail,
1327
0
                                           isAboveRightAvail,
1328
0
                                           isBelowLeftAvail,
1329
0
                                           isBelowRightAvail,
1330
0
                                           m_signLineBuf1,
1331
0
                                           m_signLineBuf2,
1332
0
                                           isCtuCrossedByVirtualBoundaries,
1333
0
                                           horVirBndryPos,
1334
0
                                           verVirBndryPos,
1335
0
                                           numHorVirBndry,
1336
0
                                           numVerVirBndry,
1337
0
                                           bndmask );
1338
0
    break;
1339
1340
0
  default:
1341
0
    THROW_FATAL( "Not a supported SAO types\n" );
1342
0
  }
1343
1344
#  if USE_AVX2
1345
0
  _mm256_zeroupper();
1346
0
#  endif
1347
0
}
Unexecuted instantiation: void vvdec::offsetBlock_SIMD<(vvdec::x86_simd::X86_VEXT)1>(int, vvdec::ClpRngTemplate<short> const&, int, int*, int, short const*, short*, long, long, int, int, bool, bool, bool, bool, bool, bool, bool, bool, std::__1::vector<signed char, std::__1::allocator<signed char> >*, std::__1::vector<signed char, std::__1::allocator<signed char> >*, bool, int*, int*, int, int)
Unexecuted instantiation: void vvdec::offsetBlock_SIMD<(vvdec::x86_simd::X86_VEXT)4>(int, vvdec::ClpRngTemplate<short> const&, int, int*, int, short const*, short*, long, long, int, int, bool, bool, bool, bool, bool, bool, bool, bool, std::__1::vector<signed char, std::__1::allocator<signed char> >*, std::__1::vector<signed char, std::__1::allocator<signed char> >*, bool, int*, int*, int, int)
1348
1349
template<X86_VEXT vext>
1350
void SampleAdaptiveOffset::_initSampleAdaptiveOffsetX86()
1351
0
{
1352
0
  offsetBlock = offsetBlock_SIMD<vext>;
1353
0
}
Unexecuted instantiation: void vvdec::SampleAdaptiveOffset::_initSampleAdaptiveOffsetX86<(vvdec::x86_simd::X86_VEXT)1>()
Unexecuted instantiation: void vvdec::SampleAdaptiveOffset::_initSampleAdaptiveOffsetX86<(vvdec::x86_simd::X86_VEXT)4>()
1354
1355
// Explicit instantiation of the init function for the SIMDX86 value of this
// translation unit, so that the definition above is emitted here.
// NOTE(review): SIMDX86 is presumably set per SIMD source file by the build - confirm.
template void SampleAdaptiveOffset::_initSampleAdaptiveOffsetX86<SIMDX86>();
1356
1357
}
1358
#endif   //#ifdef TARGET_SIMD_X86