Coverage Report

Created: 2026-06-16 07:20

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/vvdec/source/Lib/CommonLib/x86/SampleAdaptiveOffsetX86.h
Line
Count
Source
1
/* -----------------------------------------------------------------------------
2
The copyright in this software is being made available under the Clear BSD
3
License, included below. No patent rights, trademark rights and/or 
4
other Intellectual Property Rights other than the copyrights concerning 
5
the Software are granted under this license.
6
7
The Clear BSD License
8
9
Copyright (c) 2018-2026, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVdeC Authors.
10
All rights reserved.
11
12
Redistribution and use in source and binary forms, with or without modification,
13
are permitted (subject to the limitations in the disclaimer below) provided that
14
the following conditions are met:
15
16
     * Redistributions of source code must retain the above copyright notice,
17
     this list of conditions and the following disclaimer.
18
19
     * Redistributions in binary form must reproduce the above copyright
20
     notice, this list of conditions and the following disclaimer in the
21
     documentation and/or other materials provided with the distribution.
22
23
     * Neither the name of the copyright holder nor the names of its
24
     contributors may be used to endorse or promote products derived from this
25
     software without specific prior written permission.
26
27
NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
28
THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
29
CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
30
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
31
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
32
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
33
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
34
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
35
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
36
IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38
POSSIBILITY OF SUCH DAMAGE.
39
40
41
------------------------------------------------------------------------------------------- */
42
43
/** \file     SampleAdaptiveOffsetX86.h
44
    \brief    SAO filter class
45
*/
46
#include "CommonDefX86.h"
47
#include "../SampleAdaptiveOffset.h"
48
49
#if defined(TARGET_SIMD_X86)  && ENABLE_SIMD_OPT_SAO
50
51
namespace vvdec
52
{
53
54
#  if USE_AVX2 && !defined( _mm256_set_m128i )
55
#    define VVCLIB_OWN_mm256_set_m128i
56
#    define _mm256_set_m128i( v0, v1 ) _mm256_inserti128_si256( _mm256_castsi128_si256( v1 ), ( v0 ), 1 )
57
#  endif
58
59
/**
 * Checks whether position (xPos, yPos) lies on, or immediately before, one of
 * the listed virtual-boundary positions in either direction.
 *
 * \param xPos            horizontal position to test
 * \param yPos            vertical position to test
 * \param numVerVirBndry  number of valid entries in verVirBndryPos
 * \param numHorVirBndry  number of valid entries in horVirBndryPos
 * \param verVirBndryPos  positions compared against xPos (read only)
 * \param horVirBndryPos  positions compared against yPos (read only)
 * \return true if xPos equals some verVirBndryPos[i] or verVirBndryPos[i] - 1,
 *         or yPos equals some horVirBndryPos[i] or horVirBndryPos[i] - 1;
 *         false otherwise (in particular when both counts are zero).
 */
static bool isProcessDisabled( int xPos, int yPos, int numVerVirBndry, int numHorVirBndry, const int verVirBndryPos[], const int horVirBndryPos[] )
{
  for( int i = 0; i < numVerVirBndry; i++ )
  {
    const int bndry = verVirBndryPos[i];
    if( xPos == bndry || xPos == bndry - 1 )
    {
      return true;
    }
  }

  for( int i = 0; i < numHorVirBndry; i++ )
  {
    const int bndry = horVirBndryPos[i];
    if( yPos == bndry || yPos == bndry - 1 )
    {
      return true;
    }
  }

  return false;
}
Unexecuted instantiation: SampleAdaptiveOffset_sse41.cpp:vvdec::isProcessDisabled(int, int, int, int, int*, int*)
Unexecuted instantiation: SampleAdaptiveOffset_avx2.cpp:vvdec::isProcessDisabled(int, int, int, int, int*, int*)
78
79
/**
 * Checks whether row yPos lies on, or immediately before, one of the listed
 * horizontal virtual-boundary positions.
 *
 * \param yPos            vertical position to test
 * \param numHorVirBndry  number of valid entries in horVirBndryPos
 * \param horVirBndryPos  positions compared against yPos (read only)
 * \return true if yPos equals some horVirBndryPos[i] or horVirBndryPos[i] - 1,
 *         false otherwise (in particular when numHorVirBndry is zero).
 */
static bool isHorProcessDisabled( int yPos, int numHorVirBndry, const int horVirBndryPos[] )
{
  for( int i = 0; i < numHorVirBndry; i++ )
  {
    const int bndry = horVirBndryPos[i];
    if( yPos == bndry || yPos == bndry - 1 )
    {
      return true;
    }
  }
  return false;
}
Unexecuted instantiation: SampleAdaptiveOffset_sse41.cpp:vvdec::isHorProcessDisabled(int, int, int*)
Unexecuted instantiation: SampleAdaptiveOffset_avx2.cpp:vvdec::isHorProcessDisabled(int, int, int*)
90
/**
 * Checks whether column xPos lies on, or immediately before, one of the listed
 * vertical virtual-boundary positions.
 *
 * \param xPos            horizontal position to test
 * \param numVerVirBndry  number of valid entries in verVirBndryPos
 * \param verVirBndryPos  positions compared against xPos (read only)
 * \return true if xPos equals some verVirBndryPos[i] or verVirBndryPos[i] - 1,
 *         false otherwise (in particular when numVerVirBndry is zero).
 */
static bool isVerProcessDisabled( int xPos, int numVerVirBndry, const int verVirBndryPos[] )
{
  for( int i = 0; i < numVerVirBndry; i++ )
  {
    const int bndry = verVirBndryPos[i];
    if( xPos == bndry || xPos == bndry - 1 )
    {
      return true;
    }
  }
  return false;
}
Unexecuted instantiation: SampleAdaptiveOffset_sse41.cpp:vvdec::isVerProcessDisabled(int, int, int*)
Unexecuted instantiation: SampleAdaptiveOffset_avx2.cpp:vvdec::isVerProcessDisabled(int, int, int*)
101
102
template<X86_VEXT vext>
103
static void offsetBlock_SIMD_SAO_TYPE_BO( const int  channelBitDepth,
104
                                          int*       offset,
105
                                          int        startIdx,
106
                                          const Pel* srcBlk,
107
                                          Pel*       resBlk,
108
                                          ptrdiff_t  srcStride,
109
                                          ptrdiff_t  resStride,
110
                                          int        width,
111
                                          int        height )
112
5
{
113
5
  const Pel* srcLine = srcBlk;
114
5
  Pel*       resLine = resBlk;
115
116
5
  const int shiftBits        = channelBitDepth - NUM_SAO_BO_CLASSES_LOG2;
117
5
  int8_t    p_eo_offsets[16] = { 0 };
118
25
  for( int i = 0; i < 4; i++ )
119
20
  {
120
20
    p_eo_offsets[i] = offset[( startIdx + i ) % MAX_NUM_SAO_CLASSES];
121
20
  }
122
580
  for( int y = 0; y < height; y++ )
123
575
  {
124
4.54k
    for( int x = 0; x < width; )
125
3.96k
    {
126
#  ifdef USE_AVX2
127
      // AVX2
128
3.96k
      if( width - x >= 16 && vext >= AVX2 )
129
3.96k
      {
130
3.96k
        __m256i vbaseoffset = _mm256_set1_epi16( startIdx - MAX_NUM_SAO_CLASSES );
131
3.96k
        __m256i vminus      = _mm256_set1_epi8( -1 );
132
3.96k
        __m256i vzero       = _mm256_set1_epi8( 0 );
133
134
        __m256i vfour      = _mm256_set1_epi16( 4 );
135
        __m256i vibdimax   = _mm256_set1_epi16( ( 1 << channelBitDepth ) - 1 );
136
        __m256i voffsettbl = _mm256_broadcastsi128_si256( _mm_loadu_si128( (__m128i*)p_eo_offsets ) );
137
138
        __m256i vsrc  = _mm256_loadu_si256( (__m256i*)&srcLine[x] );
139
        __m256i bands = _mm256_srai_epi16( vsrc, shiftBits );
140
        bands         = _mm256_sub_epi16 ( bands, vbaseoffset );
141
3.96k
        bands         = _mm256_and_si256 ( bands, _mm256_set1_epi16( MAX_NUM_SAO_CLASSES - 1 ) ); // modulo 32 = modulo NUM_SAO_BO_CLASSES_LOG2
142
        __m256i mask1 = _mm256_cmpgt_epi16( bands, vminus );
143
        __m256i mask2 = _mm256_cmpgt_epi16( vfour, bands );
144
145
        __m256i veoffsets = _mm256_shuffle_epi8( voffsettbl, bands );
146
        veoffsets         = _mm256_slli_epi16( veoffsets, 8 );
147
        veoffsets         = _mm256_srai_epi16( veoffsets, 8 );
148
149
        veoffsets = _mm256_and_si256( veoffsets, mask1 );
150
        veoffsets = _mm256_and_si256( veoffsets, mask2 );
151
152
        vsrc = _mm256_add_epi16( vsrc, veoffsets );
153
        vsrc = _mm256_min_epi16( _mm256_max_epi16( vsrc, vzero ), vibdimax );
154
        _mm256_storeu_si256( (__m256i*)&resLine[x], vsrc );
155
156
3.96k
        x += 16;
157
3.96k
      }
158
4
      else
159
4
#  endif
160
4
      {
161
4
        __m128i vbaseoffset = _mm_set1_epi16( startIdx - MAX_NUM_SAO_CLASSES );
162
4
        __m128i vminus      = _mm_set1_epi8( -1 );
163
4
        __m128i vzero       = _mm_set1_epi8( 0 );
164
165
4
        __m128i vfour      = _mm_set1_epi16( 4 );
166
4
        __m128i vibdimax   = _mm_set1_epi16( ( 1 << channelBitDepth ) - 1 );
167
4
        __m128i voffsettbl = _mm_loadu_si128( (__m128i*)p_eo_offsets );
168
169
4
        __m128i vsrc  = _mm_loadu_si128( (__m128i*)&srcLine[x] );
170
4
        __m128i bands = _mm_srai_epi16( vsrc, shiftBits );
171
4
        bands         = _mm_sub_epi16 ( bands, vbaseoffset );
172
4
        bands         = _mm_and_si128 ( bands, _mm_set1_epi16( MAX_NUM_SAO_CLASSES - 1 ) ); // modulo 32 = modulo NUM_SAO_BO_CLASSES_LOG2
173
4
        __m128i mask1 = _mm_cmpgt_epi16( bands, vminus );
174
4
        __m128i mask2 = _mm_cmplt_epi16( bands, vfour );
175
176
4
        __m128i veoffsets = _mm_shuffle_epi8( voffsettbl, bands );
177
4
        veoffsets         = _mm_slli_epi16( veoffsets, 8 );
178
4
        veoffsets         = _mm_srai_epi16( veoffsets, 8 );
179
180
4
        veoffsets = _mm_and_si128( veoffsets, mask1 );
181
4
        veoffsets = _mm_and_si128( veoffsets, mask2 );
182
183
4
        vsrc = _mm_add_epi16( vsrc, veoffsets );
184
4
        vsrc = _mm_min_epi16( _mm_max_epi16( vsrc, vzero ), vibdimax );
185
4
        _mm_store_si128( (__m128i*)&resLine[x], vsrc );
186
187
4
        x += 8;
188
4
      }
189
3.96k
    }
190
575
    srcLine += srcStride;
191
575
    resLine += resStride;
192
575
  }
193
5
}
Unexecuted instantiation: SampleAdaptiveOffset_sse41.cpp:void vvdec::offsetBlock_SIMD_SAO_TYPE_BO<(vvdec::x86_simd::X86_VEXT)1>(int, int*, int, short const*, short*, long, long, int, int)
SampleAdaptiveOffset_avx2.cpp:void vvdec::offsetBlock_SIMD_SAO_TYPE_BO<(vvdec::x86_simd::X86_VEXT)4>(int, int*, int, short const*, short*, long, long, int, int)
Line
Count
Source
112
5
{
113
5
  const Pel* srcLine = srcBlk;
114
5
  Pel*       resLine = resBlk;
115
116
5
  const int shiftBits        = channelBitDepth - NUM_SAO_BO_CLASSES_LOG2;
117
5
  int8_t    p_eo_offsets[16] = { 0 };
118
25
  for( int i = 0; i < 4; i++ )
119
20
  {
120
20
    p_eo_offsets[i] = offset[( startIdx + i ) % MAX_NUM_SAO_CLASSES];
121
20
  }
122
580
  for( int y = 0; y < height; y++ )
123
575
  {
124
4.54k
    for( int x = 0; x < width; )
125
3.96k
    {
126
3.96k
#  ifdef USE_AVX2
127
      // AVX2
128
3.96k
      if( width - x >= 16 && vext >= AVX2 )
129
3.96k
      {
130
3.96k
        __m256i vbaseoffset = _mm256_set1_epi16( startIdx - MAX_NUM_SAO_CLASSES );
131
3.96k
        __m256i vminus      = _mm256_set1_epi8( -1 );
132
3.96k
        __m256i vzero       = _mm256_set1_epi8( 0 );
133
134
3.96k
        __m256i vfour      = _mm256_set1_epi16( 4 );
135
3.96k
        __m256i vibdimax   = _mm256_set1_epi16( ( 1 << channelBitDepth ) - 1 );
136
3.96k
        __m256i voffsettbl = _mm256_broadcastsi128_si256( _mm_loadu_si128( (__m128i*)p_eo_offsets ) );
137
138
3.96k
        __m256i vsrc  = _mm256_loadu_si256( (__m256i*)&srcLine[x] );
139
3.96k
        __m256i bands = _mm256_srai_epi16( vsrc, shiftBits );
140
3.96k
        bands         = _mm256_sub_epi16 ( bands, vbaseoffset );
141
3.96k
        bands         = _mm256_and_si256 ( bands, _mm256_set1_epi16( MAX_NUM_SAO_CLASSES - 1 ) ); // modulo 32 = modulo NUM_SAO_BO_CLASSES_LOG2
142
3.96k
        __m256i mask1 = _mm256_cmpgt_epi16( bands, vminus );
143
3.96k
        __m256i mask2 = _mm256_cmpgt_epi16( vfour, bands );
144
145
3.96k
        __m256i veoffsets = _mm256_shuffle_epi8( voffsettbl, bands );
146
3.96k
        veoffsets         = _mm256_slli_epi16( veoffsets, 8 );
147
3.96k
        veoffsets         = _mm256_srai_epi16( veoffsets, 8 );
148
149
3.96k
        veoffsets = _mm256_and_si256( veoffsets, mask1 );
150
3.96k
        veoffsets = _mm256_and_si256( veoffsets, mask2 );
151
152
3.96k
        vsrc = _mm256_add_epi16( vsrc, veoffsets );
153
3.96k
        vsrc = _mm256_min_epi16( _mm256_max_epi16( vsrc, vzero ), vibdimax );
154
3.96k
        _mm256_storeu_si256( (__m256i*)&resLine[x], vsrc );
155
156
3.96k
        x += 16;
157
3.96k
      }
158
4
      else
159
4
#  endif
160
4
      {
161
4
        __m128i vbaseoffset = _mm_set1_epi16( startIdx - MAX_NUM_SAO_CLASSES );
162
4
        __m128i vminus      = _mm_set1_epi8( -1 );
163
4
        __m128i vzero       = _mm_set1_epi8( 0 );
164
165
4
        __m128i vfour      = _mm_set1_epi16( 4 );
166
4
        __m128i vibdimax   = _mm_set1_epi16( ( 1 << channelBitDepth ) - 1 );
167
4
        __m128i voffsettbl = _mm_loadu_si128( (__m128i*)p_eo_offsets );
168
169
4
        __m128i vsrc  = _mm_loadu_si128( (__m128i*)&srcLine[x] );
170
4
        __m128i bands = _mm_srai_epi16( vsrc, shiftBits );
171
4
        bands         = _mm_sub_epi16 ( bands, vbaseoffset );
172
4
        bands         = _mm_and_si128 ( bands, _mm_set1_epi16( MAX_NUM_SAO_CLASSES - 1 ) ); // modulo 32 = modulo NUM_SAO_BO_CLASSES_LOG2
173
4
        __m128i mask1 = _mm_cmpgt_epi16( bands, vminus );
174
4
        __m128i mask2 = _mm_cmplt_epi16( bands, vfour );
175
176
4
        __m128i veoffsets = _mm_shuffle_epi8( voffsettbl, bands );
177
4
        veoffsets         = _mm_slli_epi16( veoffsets, 8 );
178
4
        veoffsets         = _mm_srai_epi16( veoffsets, 8 );
179
180
4
        veoffsets = _mm_and_si128( veoffsets, mask1 );
181
4
        veoffsets = _mm_and_si128( veoffsets, mask2 );
182
183
4
        vsrc = _mm_add_epi16( vsrc, veoffsets );
184
4
        vsrc = _mm_min_epi16( _mm_max_epi16( vsrc, vzero ), vibdimax );
185
4
        _mm_store_si128( (__m128i*)&resLine[x], vsrc );
186
187
4
        x += 8;
188
4
      }
189
3.96k
    }
190
575
    srcLine += srcStride;
191
575
    resLine += resStride;
192
575
  }
193
5
}
194
195
template<X86_VEXT vext>
196
static void offsetBlock_SIMD_SAO_TYPE_EO_0( const int            channelBitDepth,
197
                                            const ClpRng&        clpRng,
198
                                            int*                 offset,
199
                                            const Pel*           srcBlk,
200
                                            Pel*                 resBlk,
201
                                            ptrdiff_t            srcStride,
202
                                            ptrdiff_t            resStride,
203
                                            int                  width,
204
                                            int                  height,
205
                                            bool                 isLeftAvail,
206
                                            bool                 isRightAvail,
207
                                            bool                 isAboveAvail,
208
                                            bool                 isBelowAvail,
209
                                            bool                 isAboveLeftAvail,
210
                                            bool                 isAboveRightAvail,
211
                                            bool                 isBelowLeftAvail,
212
                                            bool                 isBelowRightAvail,
213
                                            std::vector<int8_t>* m_signLineBuf1,
214
                                            std::vector<int8_t>* m_signLineBuf2,
215
                                            bool                 isCtuCrossedByVirtualBoundaries,
216
                                            int                  horVirBndryPos[],
217
                                            int                  verVirBndryPos[],
218
                                            int                  numHorVirBndry,
219
                                            int                  numVerVirBndry,
220
                                            uint16_t             bndmask[MAX_CU_SIZE] )
221
7
{
222
7
  const Pel* srcLine = srcBlk;
223
7
  Pel*       resLine = resBlk;
224
225
7
  int    x, y, startX, endX, edgeType;
226
7
  int8_t signLeft, signRight;
227
228
7
  if( isLeftAvail && isRightAvail )
229
4
  {
230
4
    int8_t p_eo_offsets[16] = { 0 };
231
24
    for( int i = 0; i < SAO_EO_NUM_CATEGORIES; i++ )
232
20
    {
233
20
      p_eo_offsets[i] = offset[i];
234
20
    }
235
#if defined( USE_AVX2 )
236
    // AVX2
237
4
    if( ( width & 15 ) == 0 && vext >= AVX2 )
238
4
    {
239
240
      __m256i vsrca, vsrcal, vsrcar, virBmask;
241
242
      __m256i vbaseoffset = _mm256_set1_epi16( 2 );
243
      __m256i vplusone    = _mm256_set1_epi16( 1 );
244
      __m256i vzero       = _mm256_set1_epi8( 0 );
245
      __m256i vibdimax    = _mm256_set1_epi16( ( 1 << channelBitDepth ) - 1 );
246
      __m256i voffsettbl  = _mm256_broadcastsi128_si256( _mm_loadu_si128( (__m128i*)p_eo_offsets ) );
247
248
4
      if( isCtuCrossedByVirtualBoundaries )
249
0
      {
250
0
        for( y = 0; y < height; y++ )
251
0
        {
252
0
          for( x = 0; x < width; x += 16 )
253
0
          {
254
0
            vsrca    = _mm256_loadu_si256( (__m256i*)&srcLine[x] );
255
0
            vsrcal   = _mm256_loadu_si256( (__m256i*)&srcLine[x - 1] );
256
0
            vsrcar   = _mm256_loadu_si256( (__m256i*)&srcLine[x + 1] );
257
0
            virBmask = _mm256_loadu_si256( (__m256i*)&bndmask[x] );
258
259
0
            vsrcal            = _mm256_subs_epi16( vsrca, vsrcal );
260
0
            vsrcar            = _mm256_subs_epi16( vsrca, vsrcar );
261
0
            __m256i vsignl    = _mm256_sign_epi16( vplusone, vsrcal );
262
0
            __m256i vsignr    = _mm256_sign_epi16( vplusone, vsrcar );
263
0
            __m256i vsign     = _mm256_adds_epi16( _mm256_adds_epi16( vsignl, vsignr ), vbaseoffset );
264
0
            __m256i veoffsets = _mm256_shuffle_epi8( voffsettbl, vsign );
265
0
            veoffsets         = _mm256_slli_epi16( veoffsets, 8 );
266
0
            veoffsets         = _mm256_srai_epi16( veoffsets, 8 );
267
268
0
            vsrcal = _mm256_add_epi16( vsrca, veoffsets );
269
0
            vsrcal = _mm256_min_epi16( _mm256_max_epi16( vsrcal, vzero ), vibdimax );
270
271
0
            vsrcar = _mm256_blendv_epi8( vsrcal, vsrca, virBmask );
272
273
0
            _mm256_storeu_si256( (__m256i*)&resLine[x], vsrcar );
274
0
          }
275
0
          srcLine += srcStride;
276
0
          resLine += resStride;
277
0
        }
278
0
      }
279
4
      else
280
4
      {
281
481
        for( y = 0; y < height; y++ )
282
477
        {
283
4.27k
          for( x = 0; x < width; x += 16 )
284
3.79k
          {
285
3.79k
            vsrca             = _mm256_loadu_si256( (__m256i*)&srcLine[x] );
286
3.79k
            vsrcal            = _mm256_loadu_si256( (__m256i*)&srcLine[x - 1] );
287
3.79k
            vsrcar            = _mm256_loadu_si256( (__m256i*)&srcLine[x + 1] );
288
3.79k
            vsrcal            = _mm256_subs_epi16( vsrca, vsrcal );
289
3.79k
            vsrcar            = _mm256_subs_epi16( vsrca, vsrcar );
290
3.79k
            __m256i vsignl    = _mm256_sign_epi16( vplusone, vsrcal );
291
3.79k
            __m256i vsignr    = _mm256_sign_epi16( vplusone, vsrcar );
292
3.79k
            __m256i vsign     = _mm256_adds_epi16( _mm256_adds_epi16( vsignl, vsignr ), vbaseoffset );
293
3.79k
            __m256i veoffsets = _mm256_shuffle_epi8( voffsettbl, vsign );
294
3.79k
            veoffsets         = _mm256_slli_epi16( veoffsets, 8 );
295
3.79k
            veoffsets         = _mm256_srai_epi16( veoffsets, 8 );
296
297
3.79k
            vsrca = _mm256_add_epi16( vsrca, veoffsets );
298
3.79k
            vsrca = _mm256_min_epi16( _mm256_max_epi16( vsrca, vzero ), vibdimax );
299
300
3.79k
            _mm256_storeu_si256( (__m256i*)&resLine[x], vsrca );
301
3.79k
          }
302
477
          srcLine += srcStride;
303
477
          resLine += resStride;
304
477
        }
305
4
      }
306
4
    }
307
0
    else
308
0
#  endif
309
0
    {
310
0
      __m128i vsrca, vsrcal, vsrcar, virBmask;
311
0
      __m128i vbaseoffset = _mm_set1_epi16( 2 );
312
0
      __m128i vplusone    = _mm_set1_epi16( 1 );
313
0
      __m128i vzero       = _mm_set1_epi8( 0 );
314
0
      __m128i vibdimax    = _mm_set1_epi16( ( 1 << channelBitDepth ) - 1 );
315
0
      __m128i voffsettbl  = _mm_loadu_si128( (__m128i*)p_eo_offsets );
316
0
      if( isCtuCrossedByVirtualBoundaries )
317
0
      {
318
0
        for( y = 0; y < height; y++ )
319
0
        {
320
0
          for( x = 0; x < width; x += 8 )
321
0
          {
322
0
            vsrca             = _mm_loadu_si128( (__m128i*)&srcLine[x] );
323
0
            vsrcal            = _mm_loadu_si128( (__m128i*)&srcLine[x - 1] );
324
0
            vsrcar            = _mm_loadu_si128( (__m128i*)&srcLine[x + 1] );
325
0
            virBmask          = _mm_loadu_si128( (__m128i*)&bndmask[x] );
326
0
            vsrcal            = _mm_subs_epi16( vsrca, vsrcal );
327
0
            vsrcar            = _mm_subs_epi16( vsrca, vsrcar );
328
0
            __m128i vsignl    = _mm_sign_epi16( vplusone, vsrcal );
329
0
            __m128i vsignr    = _mm_sign_epi16( vplusone, vsrcar );
330
0
            __m128i vsign     = _mm_adds_epi16( _mm_adds_epi16( vsignl, vsignr ), vbaseoffset );
331
0
            __m128i veoffsets = _mm_shuffle_epi8( voffsettbl, vsign );
332
0
            veoffsets         = _mm_slli_epi16( veoffsets, 8 );
333
0
            veoffsets         = _mm_srai_epi16( veoffsets, 8 );
334
335
0
            vsrcal = _mm_add_epi16( vsrca, veoffsets );
336
0
            vsrcal = _mm_min_epi16( _mm_max_epi16( vsrcal, vzero ), vibdimax );
337
338
0
            vsrcar = _mm_blendv_epi8( vsrcal, vsrca, virBmask );
339
340
0
            _mm_store_si128( (__m128i*)&resLine[x], vsrcar );
341
0
          }
342
0
          srcLine += srcStride;
343
0
          resLine += resStride;
344
0
        }
345
0
      }
346
0
      else
347
0
      {
348
0
        for( y = 0; y < height; y++ )
349
0
        {
350
0
          for( x = 0; x < width; x += 8 )
351
0
          {
352
0
            vsrca             = _mm_loadu_si128( (__m128i*)&srcLine[x] );
353
0
            vsrcal            = _mm_loadu_si128( (__m128i*)&srcLine[x - 1] );
354
0
            vsrcar            = _mm_loadu_si128( (__m128i*)&srcLine[x + 1] );
355
0
            vsrcal            = _mm_subs_epi16( vsrca, vsrcal );
356
0
            vsrcar            = _mm_subs_epi16( vsrca, vsrcar );
357
0
            __m128i vsignl    = _mm_sign_epi16( vplusone, vsrcal );
358
0
            __m128i vsignr    = _mm_sign_epi16( vplusone, vsrcar );
359
0
            __m128i vsign     = _mm_adds_epi16( _mm_adds_epi16( vsignl, vsignr ), vbaseoffset );
360
0
            __m128i veoffsets = _mm_shuffle_epi8( voffsettbl, vsign );
361
0
            veoffsets         = _mm_slli_epi16( veoffsets, 8 );
362
0
            veoffsets         = _mm_srai_epi16( veoffsets, 8 );
363
364
0
            vsrca = _mm_add_epi16( vsrca, veoffsets );
365
0
            vsrca = _mm_min_epi16( _mm_max_epi16( vsrca, vzero ), vibdimax );
366
0
            _mm_store_si128( (__m128i*)&resLine[x], vsrca );
367
0
          }
368
0
          srcLine += srcStride;
369
0
          resLine += resStride;
370
0
        }
371
0
      }
372
0
    }
373
4
  }
374
3
  else
375
3
  {
376
3
    offset += 2;
377
3
    startX = isLeftAvail ? 0 : 1;
378
3
    endX   = isRightAvail ? width : ( width - 1 );
379
387
    for( y = 0; y < height; y++ )
380
384
    {
381
384
      signLeft = (int8_t)sgn( srcLine[startX] - srcLine[startX - 1] );
382
45.0k
      for( x = startX; x < endX; x++ )
383
44.6k
      {
384
44.6k
        signRight = (int8_t)sgn( srcLine[x] - srcLine[x + 1] );
385
44.6k
        if( isCtuCrossedByVirtualBoundaries && isVerProcessDisabled( x, numVerVirBndry, verVirBndryPos ) )
386
0
        {
387
0
          signLeft = -signRight;
388
0
          continue;
389
0
        }
390
44.6k
        edgeType = signRight + signLeft;
391
44.6k
        signLeft = -signRight;
392
393
44.6k
        resLine[x] = ClipPel<int>( srcLine[x] + offset[edgeType], clpRng );
394
44.6k
      }
395
396
384
      srcLine += srcStride;
397
384
      resLine += resStride;
398
384
    }
399
3
  }
400
7
}
Unexecuted instantiation: SampleAdaptiveOffset_sse41.cpp:void vvdec::offsetBlock_SIMD_SAO_TYPE_EO_0<(vvdec::x86_simd::X86_VEXT)1>(int, vvdec::ClpRngTemplate<short> const&, int*, short const*, short*, long, long, int, int, bool, bool, bool, bool, bool, bool, bool, bool, std::__1::vector<signed char, std::__1::allocator<signed char> >*, std::__1::vector<signed char, std::__1::allocator<signed char> >*, bool, int*, int*, int, int, unsigned short*)
SampleAdaptiveOffset_avx2.cpp:void vvdec::offsetBlock_SIMD_SAO_TYPE_EO_0<(vvdec::x86_simd::X86_VEXT)4>(int, vvdec::ClpRngTemplate<short> const&, int*, short const*, short*, long, long, int, int, bool, bool, bool, bool, bool, bool, bool, bool, std::__1::vector<signed char, std::__1::allocator<signed char> >*, std::__1::vector<signed char, std::__1::allocator<signed char> >*, bool, int*, int*, int, int, unsigned short*)
Line
Count
Source
221
7
{
222
7
  const Pel* srcLine = srcBlk;
223
7
  Pel*       resLine = resBlk;
224
225
7
  int    x, y, startX, endX, edgeType;
226
7
  int8_t signLeft, signRight;
227
228
7
  if( isLeftAvail && isRightAvail )
229
4
  {
230
4
    int8_t p_eo_offsets[16] = { 0 };
231
24
    for( int i = 0; i < SAO_EO_NUM_CATEGORIES; i++ )
232
20
    {
233
20
      p_eo_offsets[i] = offset[i];
234
20
    }
235
4
#if defined( USE_AVX2 )
236
    // AVX2
237
4
    if( ( width & 15 ) == 0 && vext >= AVX2 )
238
4
    {
239
240
4
      __m256i vsrca, vsrcal, vsrcar, virBmask;
241
242
4
      __m256i vbaseoffset = _mm256_set1_epi16( 2 );
243
4
      __m256i vplusone    = _mm256_set1_epi16( 1 );
244
4
      __m256i vzero       = _mm256_set1_epi8( 0 );
245
4
      __m256i vibdimax    = _mm256_set1_epi16( ( 1 << channelBitDepth ) - 1 );
246
4
      __m256i voffsettbl  = _mm256_broadcastsi128_si256( _mm_loadu_si128( (__m128i*)p_eo_offsets ) );
247
248
4
      if( isCtuCrossedByVirtualBoundaries )
249
0
      {
250
0
        for( y = 0; y < height; y++ )
251
0
        {
252
0
          for( x = 0; x < width; x += 16 )
253
0
          {
254
0
            vsrca    = _mm256_loadu_si256( (__m256i*)&srcLine[x] );
255
0
            vsrcal   = _mm256_loadu_si256( (__m256i*)&srcLine[x - 1] );
256
0
            vsrcar   = _mm256_loadu_si256( (__m256i*)&srcLine[x + 1] );
257
0
            virBmask = _mm256_loadu_si256( (__m256i*)&bndmask[x] );
258
259
0
            vsrcal            = _mm256_subs_epi16( vsrca, vsrcal );
260
0
            vsrcar            = _mm256_subs_epi16( vsrca, vsrcar );
261
0
            __m256i vsignl    = _mm256_sign_epi16( vplusone, vsrcal );
262
0
            __m256i vsignr    = _mm256_sign_epi16( vplusone, vsrcar );
263
0
            __m256i vsign     = _mm256_adds_epi16( _mm256_adds_epi16( vsignl, vsignr ), vbaseoffset );
264
0
            __m256i veoffsets = _mm256_shuffle_epi8( voffsettbl, vsign );
265
0
            veoffsets         = _mm256_slli_epi16( veoffsets, 8 );
266
0
            veoffsets         = _mm256_srai_epi16( veoffsets, 8 );
267
268
0
            vsrcal = _mm256_add_epi16( vsrca, veoffsets );
269
0
            vsrcal = _mm256_min_epi16( _mm256_max_epi16( vsrcal, vzero ), vibdimax );
270
271
0
            vsrcar = _mm256_blendv_epi8( vsrcal, vsrca, virBmask );
272
273
0
            _mm256_storeu_si256( (__m256i*)&resLine[x], vsrcar );
274
0
          }
275
0
          srcLine += srcStride;
276
0
          resLine += resStride;
277
0
        }
278
0
      }
279
4
      else
280
4
      {
281
481
        for( y = 0; y < height; y++ )
282
477
        {
283
4.27k
          for( x = 0; x < width; x += 16 )
284
3.79k
          {
285
3.79k
            vsrca             = _mm256_loadu_si256( (__m256i*)&srcLine[x] );
286
3.79k
            vsrcal            = _mm256_loadu_si256( (__m256i*)&srcLine[x - 1] );
287
3.79k
            vsrcar            = _mm256_loadu_si256( (__m256i*)&srcLine[x + 1] );
288
3.79k
            vsrcal            = _mm256_subs_epi16( vsrca, vsrcal );
289
3.79k
            vsrcar            = _mm256_subs_epi16( vsrca, vsrcar );
290
3.79k
            __m256i vsignl    = _mm256_sign_epi16( vplusone, vsrcal );
291
3.79k
            __m256i vsignr    = _mm256_sign_epi16( vplusone, vsrcar );
292
3.79k
            __m256i vsign     = _mm256_adds_epi16( _mm256_adds_epi16( vsignl, vsignr ), vbaseoffset );
293
3.79k
            __m256i veoffsets = _mm256_shuffle_epi8( voffsettbl, vsign );
294
3.79k
            veoffsets         = _mm256_slli_epi16( veoffsets, 8 );
295
3.79k
            veoffsets         = _mm256_srai_epi16( veoffsets, 8 );
296
297
3.79k
            vsrca = _mm256_add_epi16( vsrca, veoffsets );
298
3.79k
            vsrca = _mm256_min_epi16( _mm256_max_epi16( vsrca, vzero ), vibdimax );
299
300
3.79k
            _mm256_storeu_si256( (__m256i*)&resLine[x], vsrca );
301
3.79k
          }
302
477
          srcLine += srcStride;
303
477
          resLine += resStride;
304
477
        }
305
4
      }
306
4
    }
307
0
    else
308
0
#  endif
309
0
    {
310
0
      __m128i vsrca, vsrcal, vsrcar, virBmask;
311
0
      __m128i vbaseoffset = _mm_set1_epi16( 2 );
312
0
      __m128i vplusone    = _mm_set1_epi16( 1 );
313
0
      __m128i vzero       = _mm_set1_epi8( 0 );
314
0
      __m128i vibdimax    = _mm_set1_epi16( ( 1 << channelBitDepth ) - 1 );
315
0
      __m128i voffsettbl  = _mm_loadu_si128( (__m128i*)p_eo_offsets );
316
0
      if( isCtuCrossedByVirtualBoundaries )
317
0
      {
318
0
        for( y = 0; y < height; y++ )
319
0
        {
320
0
          for( x = 0; x < width; x += 8 )
321
0
          {
322
0
            vsrca             = _mm_loadu_si128( (__m128i*)&srcLine[x] );
323
0
            vsrcal            = _mm_loadu_si128( (__m128i*)&srcLine[x - 1] );
324
0
            vsrcar            = _mm_loadu_si128( (__m128i*)&srcLine[x + 1] );
325
0
            virBmask          = _mm_loadu_si128( (__m128i*)&bndmask[x] );
326
0
            vsrcal            = _mm_subs_epi16( vsrca, vsrcal );
327
0
            vsrcar            = _mm_subs_epi16( vsrca, vsrcar );
328
0
            __m128i vsignl    = _mm_sign_epi16( vplusone, vsrcal );
329
0
            __m128i vsignr    = _mm_sign_epi16( vplusone, vsrcar );
330
0
            __m128i vsign     = _mm_adds_epi16( _mm_adds_epi16( vsignl, vsignr ), vbaseoffset );
331
0
            __m128i veoffsets = _mm_shuffle_epi8( voffsettbl, vsign );
332
0
            veoffsets         = _mm_slli_epi16( veoffsets, 8 );
333
0
            veoffsets         = _mm_srai_epi16( veoffsets, 8 );
334
335
0
            vsrcal = _mm_add_epi16( vsrca, veoffsets );
336
0
            vsrcal = _mm_min_epi16( _mm_max_epi16( vsrcal, vzero ), vibdimax );
337
338
0
            vsrcar = _mm_blendv_epi8( vsrcal, vsrca, virBmask );
339
340
0
            _mm_store_si128( (__m128i*)&resLine[x], vsrcar );
341
0
          }
342
0
          srcLine += srcStride;
343
0
          resLine += resStride;
344
0
        }
345
0
      }
346
0
      else
347
0
      {
348
0
        for( y = 0; y < height; y++ )
349
0
        {
350
0
          for( x = 0; x < width; x += 8 )
351
0
          {
352
0
            vsrca             = _mm_loadu_si128( (__m128i*)&srcLine[x] );
353
0
            vsrcal            = _mm_loadu_si128( (__m128i*)&srcLine[x - 1] );
354
0
            vsrcar            = _mm_loadu_si128( (__m128i*)&srcLine[x + 1] );
355
0
            vsrcal            = _mm_subs_epi16( vsrca, vsrcal );
356
0
            vsrcar            = _mm_subs_epi16( vsrca, vsrcar );
357
0
            __m128i vsignl    = _mm_sign_epi16( vplusone, vsrcal );
358
0
            __m128i vsignr    = _mm_sign_epi16( vplusone, vsrcar );
359
0
            __m128i vsign     = _mm_adds_epi16( _mm_adds_epi16( vsignl, vsignr ), vbaseoffset );
360
0
            __m128i veoffsets = _mm_shuffle_epi8( voffsettbl, vsign );
361
0
            veoffsets         = _mm_slli_epi16( veoffsets, 8 );
362
0
            veoffsets         = _mm_srai_epi16( veoffsets, 8 );
363
364
0
            vsrca = _mm_add_epi16( vsrca, veoffsets );
365
0
            vsrca = _mm_min_epi16( _mm_max_epi16( vsrca, vzero ), vibdimax );
366
0
            _mm_store_si128( (__m128i*)&resLine[x], vsrca );
367
0
          }
368
0
          srcLine += srcStride;
369
0
          resLine += resStride;
370
0
        }
371
0
      }
372
0
    }
373
4
  }
374
3
  else
375
3
  {
376
3
    offset += 2;
377
3
    startX = isLeftAvail ? 0 : 1;
378
3
    endX   = isRightAvail ? width : ( width - 1 );
379
387
    for( y = 0; y < height; y++ )
380
384
    {
381
384
      signLeft = (int8_t)sgn( srcLine[startX] - srcLine[startX - 1] );
382
45.0k
      for( x = startX; x < endX; x++ )
383
44.6k
      {
384
44.6k
        signRight = (int8_t)sgn( srcLine[x] - srcLine[x + 1] );
385
44.6k
        if( isCtuCrossedByVirtualBoundaries && isVerProcessDisabled( x, numVerVirBndry, verVirBndryPos ) )
386
0
        {
387
0
          signLeft = -signRight;
388
0
          continue;
389
0
        }
390
44.6k
        edgeType = signRight + signLeft;
391
44.6k
        signLeft = -signRight;
392
393
44.6k
        resLine[x] = ClipPel<int>( srcLine[x] + offset[edgeType], clpRng );
394
44.6k
      }
395
396
384
      srcLine += srcStride;
397
384
      resLine += resStride;
398
384
    }
399
3
  }
400
7
}
401
402
template<X86_VEXT vext>
403
static void offsetBlock_SIMD_SAO_TYPE_EO_90( const int            channelBitDepth,
404
                                             const ClpRng&        clpRng,
405
                                             int*                 offset,
406
                                             const Pel*           srcBlk,
407
                                             Pel*                 resBlk,
408
                                             ptrdiff_t            srcStride,
409
                                             ptrdiff_t            resStride,
410
                                             int                  width,
411
                                             int                  height,
412
                                             bool                 isLeftAvail,
413
                                             bool                 isRightAvail,
414
                                             bool                 isAboveAvail,
415
                                             bool                 isBelowAvail,
416
                                             bool                 isAboveLeftAvail,
417
                                             bool                 isAboveRightAvail,
418
                                             bool                 isBelowLeftAvail,
419
                                             bool                 isBelowRightAvail,
420
                                             std::vector<int8_t>* m_signLineBuf1,
421
                                             std::vector<int8_t>* m_signLineBuf2,
422
                                             bool                 isCtuCrossedByVirtualBoundaries,
423
                                             int                  horVirBndryPos[],
424
                                             int                  verVirBndryPos[],
425
                                             int                  numHorVirBndry,
426
                                             int                  numVerVirBndry,
427
                                             uint16_t             bndmask[MAX_CU_SIZE] )
428
4
{
429
4
  const Pel* srcLine = srcBlk;
430
4
  Pel*       resLine = resBlk;
431
432
4
  int x, y, startY, endY;
433
434
4
  int8_t p_eo_offsets[16] = { 0 };
435
24
  for( int i = 0; i < SAO_EO_NUM_CATEGORIES; i++ )
436
20
  {
437
20
    p_eo_offsets[i] = offset[i];
438
20
  }
439
4
  const Pel* srcLineAbove = srcLine - srcStride;
440
4
  const Pel* srcLineBelow = srcLine + srcStride;
441
4
  startY                  = 0;
442
4
  if( !isAboveAvail )
443
0
  {
444
0
    startY       = 1;
445
0
    srcLineAbove = srcLine;
446
0
    srcLine += srcStride;
447
0
    resLine += resStride;
448
0
    srcLineBelow = srcLine + srcStride;
449
0
  }
450
4
  endY = height;
451
4
  if( !isBelowAvail )
452
0
  {
453
0
    endY = height - 1;
454
0
  }
455
#  if defined( USE_AVX2 )
456
  // AVX2
457
4
  if( ( width & 15 ) == 0 && vext >= AVX2 )
458
4
  {
459
4
    __m256i vsrca, vsrcat, vsrcab;
460
461
    __m256i    vbaseoffset  = _mm256_set1_epi16( 2 );
462
    __m256i    vplusone     = _mm256_set1_epi16( 1 );
463
    __m256i    vzero        = _mm256_set1_epi8( 0 );
464
    __m256i    vibdimax     = _mm256_set1_epi16( ( 1 << channelBitDepth ) - 1 );
465
    __m256i    voffsettbl   = _mm256_broadcastsi128_si256( _mm_loadu_si128( (__m128i*)p_eo_offsets ) );
466
    const Pel* srcLineBelow = srcLine + srcStride;
467
468
4
    if( isCtuCrossedByVirtualBoundaries )
469
0
    {
470
0
      for( y = startY; y < endY; y++ )
471
0
      {
472
0
        if( !isHorProcessDisabled( y, numHorVirBndry, horVirBndryPos ) )
473
0
        {
474
0
          for( x = 0; x < width; x += 16 )
475
0
          {
476
0
            vsrca             = _mm256_loadu_si256( (__m256i*)&srcLine[x] );
477
0
            vsrcat            = _mm256_loadu_si256( (__m256i*)&srcLineAbove[x] );
478
0
            vsrcab            = _mm256_loadu_si256( (__m256i*)&srcLineBelow[x] );
479
0
            vsrcat            = _mm256_subs_epi16( vsrca, vsrcat );
480
0
            vsrcab            = _mm256_subs_epi16( vsrca, vsrcab );
481
0
            __m256i vsignt    = _mm256_sign_epi16( vplusone, vsrcat );
482
0
            __m256i vsignb    = _mm256_sign_epi16( vplusone, vsrcab );
483
0
            __m256i vsign     = _mm256_adds_epi16( _mm256_adds_epi16( vsignt, vsignb ), vbaseoffset );
484
0
            __m256i veoffsets = _mm256_shuffle_epi8( voffsettbl, vsign );
485
0
            veoffsets         = _mm256_slli_epi16( veoffsets, 8 );
486
0
            veoffsets         = _mm256_srai_epi16( veoffsets, 8 );
487
488
0
            vsrca = _mm256_add_epi16( vsrca, veoffsets );
489
0
            vsrca = _mm256_min_epi16( _mm256_max_epi16( vsrca, vzero ), vibdimax );
490
491
0
            _mm256_storeu_si256( (__m256i*)&resLine[x], vsrca );
492
0
          }
493
0
        }
494
0
        srcLine += srcStride;
495
0
        srcLineBelow += srcStride;
496
0
        srcLineAbove += srcStride;
497
0
        resLine += resStride;
498
0
      }
499
0
    }
500
4
    else
501
4
    {
502
403
      for( y = startY; y < endY; y++ )
503
399
      {
504
3.08k
        for( x = 0; x < width; x += 16 )
505
2.68k
        {
506
2.68k
          vsrca             = _mm256_loadu_si256( (__m256i*)&srcLine[x] );
507
2.68k
          vsrcat            = _mm256_loadu_si256( (__m256i*)&srcLineAbove[x] );
508
2.68k
          vsrcab            = _mm256_loadu_si256( (__m256i*)&srcLineBelow[x] );
509
2.68k
          vsrcat            = _mm256_subs_epi16( vsrca, vsrcat );
510
2.68k
          vsrcab            = _mm256_subs_epi16( vsrca, vsrcab );
511
2.68k
          __m256i vsignt    = _mm256_sign_epi16( vplusone, vsrcat );
512
2.68k
          __m256i vsignb    = _mm256_sign_epi16( vplusone, vsrcab );
513
2.68k
          __m256i vsign     = _mm256_adds_epi16( _mm256_adds_epi16( vsignt, vsignb ), vbaseoffset );
514
2.68k
          __m256i veoffsets = _mm256_shuffle_epi8( voffsettbl, vsign );
515
2.68k
          veoffsets         = _mm256_slli_epi16( veoffsets, 8 );
516
2.68k
          veoffsets         = _mm256_srai_epi16( veoffsets, 8 );
517
518
2.68k
          vsrca = _mm256_add_epi16( vsrca, veoffsets );
519
2.68k
          vsrca = _mm256_min_epi16( _mm256_max_epi16( vsrca, vzero ), vibdimax );
520
521
2.68k
          _mm256_storeu_si256( (__m256i*)&resLine[x], vsrca );
522
2.68k
        }
523
399
        srcLine += srcStride;
524
399
        srcLineBelow += srcStride;
525
399
        srcLineAbove += srcStride;
526
399
        resLine += resStride;
527
399
      }
528
4
    }
529
4
  }
530
0
  else
531
0
#  endif
532
0
  {
533
0
    __m128i vsrca, vsrcat, vsrcab;
534
0
    __m128i vbaseoffset = _mm_set1_epi16( 2 );
535
0
    __m128i vplusone    = _mm_set1_epi16( 1 );
536
0
    __m128i vzero       = _mm_set1_epi8( 0 );
537
0
    __m128i vibdimax    = _mm_set1_epi16( ( 1 << channelBitDepth ) - 1 );
538
0
    __m128i voffsettbl  = _mm_loadu_si128( (__m128i*)p_eo_offsets );
539
540
0
    if( isCtuCrossedByVirtualBoundaries )
541
0
    {
542
0
      for( y = startY; y < endY; y++ )
543
0
      {
544
0
        if( !isHorProcessDisabled( y, numHorVirBndry, horVirBndryPos ) )
545
0
        {
546
0
          for( x = 0; x < width; x += 8 )
547
0
          {
548
0
            vsrca             = _mm_loadu_si128( (__m128i*)&srcLine[x] );
549
0
            vsrcat            = _mm_loadu_si128( (__m128i*)&srcLineAbove[x] );
550
0
            vsrcab            = _mm_loadu_si128( (__m128i*)&srcLineBelow[x] );
551
0
            vsrcat            = _mm_subs_epi16( vsrca, vsrcat );
552
0
            vsrcab            = _mm_subs_epi16( vsrca, vsrcab );
553
0
            __m128i vsignt    = _mm_sign_epi16( vplusone, vsrcat );
554
0
            __m128i vsignb    = _mm_sign_epi16( vplusone, vsrcab );
555
0
            __m128i vsign     = _mm_adds_epi16( _mm_adds_epi16( vsignt, vsignb ), vbaseoffset );
556
0
            __m128i veoffsets = _mm_shuffle_epi8( voffsettbl, vsign );
557
0
            veoffsets         = _mm_slli_epi16( veoffsets, 8 );
558
0
            veoffsets         = _mm_srai_epi16( veoffsets, 8 );
559
560
0
            vsrca = _mm_add_epi16( vsrca, veoffsets );
561
0
            vsrca = _mm_min_epi16( _mm_max_epi16( vsrca, vzero ), vibdimax );
562
563
0
            _mm_store_si128( (__m128i*)&resLine[x], vsrca );
564
0
          }
565
0
        }
566
0
        srcLine += srcStride;
567
0
        srcLineBelow += srcStride;
568
0
        srcLineAbove += srcStride;
569
0
        resLine += resStride;
570
0
      }
571
0
    }
572
0
    else
573
0
    {
574
0
      for( y = startY; y < endY; y++ )
575
0
      {
576
0
        for( x = 0; x < width; x += 8 )
577
0
        {
578
0
          vsrca             = _mm_loadu_si128( (__m128i*)&srcLine[x] );
579
0
          vsrcat            = _mm_loadu_si128( (__m128i*)&srcLineAbove[x] );
580
0
          vsrcab            = _mm_loadu_si128( (__m128i*)&srcLineBelow[x] );
581
0
          vsrcat            = _mm_subs_epi16( vsrca, vsrcat );
582
0
          vsrcab            = _mm_subs_epi16( vsrca, vsrcab );
583
0
          __m128i vsignt    = _mm_sign_epi16( vplusone, vsrcat );
584
0
          __m128i vsignb    = _mm_sign_epi16( vplusone, vsrcab );
585
0
          __m128i vsign     = _mm_adds_epi16( _mm_adds_epi16( vsignt, vsignb ), vbaseoffset );
586
0
          __m128i veoffsets = _mm_shuffle_epi8( voffsettbl, vsign );
587
0
          veoffsets         = _mm_slli_epi16( veoffsets, 8 );
588
0
          veoffsets         = _mm_srai_epi16( veoffsets, 8 );
589
590
0
          vsrca = _mm_add_epi16( vsrca, veoffsets );
591
0
          vsrca = _mm_min_epi16( _mm_max_epi16( vsrca, vzero ), vibdimax );
592
0
          _mm_store_si128( (__m128i*)&resLine[x], vsrca );
593
0
        }
594
0
        srcLine += srcStride;
595
0
        srcLineBelow += srcStride;
596
0
        srcLineAbove += srcStride;
597
0
        resLine += resStride;
598
0
      }
599
0
    }
600
0
  }
601
4
}
Unexecuted instantiation: SampleAdaptiveOffset_sse41.cpp:void vvdec::offsetBlock_SIMD_SAO_TYPE_EO_90<(vvdec::x86_simd::X86_VEXT)1>(int, vvdec::ClpRngTemplate<short> const&, int*, short const*, short*, long, long, int, int, bool, bool, bool, bool, bool, bool, bool, bool, std::__1::vector<signed char, std::__1::allocator<signed char> >*, std::__1::vector<signed char, std::__1::allocator<signed char> >*, bool, int*, int*, int, int, unsigned short*)
SampleAdaptiveOffset_avx2.cpp:void vvdec::offsetBlock_SIMD_SAO_TYPE_EO_90<(vvdec::x86_simd::X86_VEXT)4>(int, vvdec::ClpRngTemplate<short> const&, int*, short const*, short*, long, long, int, int, bool, bool, bool, bool, bool, bool, bool, bool, std::__1::vector<signed char, std::__1::allocator<signed char> >*, std::__1::vector<signed char, std::__1::allocator<signed char> >*, bool, int*, int*, int, int, unsigned short*)
Line
Count
Source
428
4
{
429
4
  const Pel* srcLine = srcBlk;
430
4
  Pel*       resLine = resBlk;
431
432
4
  int x, y, startY, endY;
433
434
4
  int8_t p_eo_offsets[16] = { 0 };
435
24
  for( int i = 0; i < SAO_EO_NUM_CATEGORIES; i++ )
436
20
  {
437
20
    p_eo_offsets[i] = offset[i];
438
20
  }
439
4
  const Pel* srcLineAbove = srcLine - srcStride;
440
4
  const Pel* srcLineBelow = srcLine + srcStride;
441
4
  startY                  = 0;
442
4
  if( !isAboveAvail )
443
0
  {
444
0
    startY       = 1;
445
0
    srcLineAbove = srcLine;
446
0
    srcLine += srcStride;
447
0
    resLine += resStride;
448
0
    srcLineBelow = srcLine + srcStride;
449
0
  }
450
4
  endY = height;
451
4
  if( !isBelowAvail )
452
0
  {
453
0
    endY = height - 1;
454
0
  }
455
4
#  if defined( USE_AVX2 )
456
  // AVX2
457
4
  if( ( width & 15 ) == 0 && vext >= AVX2 )
458
4
  {
459
4
    __m256i vsrca, vsrcat, vsrcab;
460
461
4
    __m256i    vbaseoffset  = _mm256_set1_epi16( 2 );
462
4
    __m256i    vplusone     = _mm256_set1_epi16( 1 );
463
4
    __m256i    vzero        = _mm256_set1_epi8( 0 );
464
4
    __m256i    vibdimax     = _mm256_set1_epi16( ( 1 << channelBitDepth ) - 1 );
465
4
    __m256i    voffsettbl   = _mm256_broadcastsi128_si256( _mm_loadu_si128( (__m128i*)p_eo_offsets ) );
466
4
    const Pel* srcLineBelow = srcLine + srcStride;
467
468
4
    if( isCtuCrossedByVirtualBoundaries )
469
0
    {
470
0
      for( y = startY; y < endY; y++ )
471
0
      {
472
0
        if( !isHorProcessDisabled( y, numHorVirBndry, horVirBndryPos ) )
473
0
        {
474
0
          for( x = 0; x < width; x += 16 )
475
0
          {
476
0
            vsrca             = _mm256_loadu_si256( (__m256i*)&srcLine[x] );
477
0
            vsrcat            = _mm256_loadu_si256( (__m256i*)&srcLineAbove[x] );
478
0
            vsrcab            = _mm256_loadu_si256( (__m256i*)&srcLineBelow[x] );
479
0
            vsrcat            = _mm256_subs_epi16( vsrca, vsrcat );
480
0
            vsrcab            = _mm256_subs_epi16( vsrca, vsrcab );
481
0
            __m256i vsignt    = _mm256_sign_epi16( vplusone, vsrcat );
482
0
            __m256i vsignb    = _mm256_sign_epi16( vplusone, vsrcab );
483
0
            __m256i vsign     = _mm256_adds_epi16( _mm256_adds_epi16( vsignt, vsignb ), vbaseoffset );
484
0
            __m256i veoffsets = _mm256_shuffle_epi8( voffsettbl, vsign );
485
0
            veoffsets         = _mm256_slli_epi16( veoffsets, 8 );
486
0
            veoffsets         = _mm256_srai_epi16( veoffsets, 8 );
487
488
0
            vsrca = _mm256_add_epi16( vsrca, veoffsets );
489
0
            vsrca = _mm256_min_epi16( _mm256_max_epi16( vsrca, vzero ), vibdimax );
490
491
0
            _mm256_storeu_si256( (__m256i*)&resLine[x], vsrca );
492
0
          }
493
0
        }
494
0
        srcLine += srcStride;
495
0
        srcLineBelow += srcStride;
496
0
        srcLineAbove += srcStride;
497
0
        resLine += resStride;
498
0
      }
499
0
    }
500
4
    else
501
4
    {
502
403
      for( y = startY; y < endY; y++ )
503
399
      {
504
3.08k
        for( x = 0; x < width; x += 16 )
505
2.68k
        {
506
2.68k
          vsrca             = _mm256_loadu_si256( (__m256i*)&srcLine[x] );
507
2.68k
          vsrcat            = _mm256_loadu_si256( (__m256i*)&srcLineAbove[x] );
508
2.68k
          vsrcab            = _mm256_loadu_si256( (__m256i*)&srcLineBelow[x] );
509
2.68k
          vsrcat            = _mm256_subs_epi16( vsrca, vsrcat );
510
2.68k
          vsrcab            = _mm256_subs_epi16( vsrca, vsrcab );
511
2.68k
          __m256i vsignt    = _mm256_sign_epi16( vplusone, vsrcat );
512
2.68k
          __m256i vsignb    = _mm256_sign_epi16( vplusone, vsrcab );
513
2.68k
          __m256i vsign     = _mm256_adds_epi16( _mm256_adds_epi16( vsignt, vsignb ), vbaseoffset );
514
2.68k
          __m256i veoffsets = _mm256_shuffle_epi8( voffsettbl, vsign );
515
2.68k
          veoffsets         = _mm256_slli_epi16( veoffsets, 8 );
516
2.68k
          veoffsets         = _mm256_srai_epi16( veoffsets, 8 );
517
518
2.68k
          vsrca = _mm256_add_epi16( vsrca, veoffsets );
519
2.68k
          vsrca = _mm256_min_epi16( _mm256_max_epi16( vsrca, vzero ), vibdimax );
520
521
2.68k
          _mm256_storeu_si256( (__m256i*)&resLine[x], vsrca );
522
2.68k
        }
523
399
        srcLine += srcStride;
524
399
        srcLineBelow += srcStride;
525
399
        srcLineAbove += srcStride;
526
399
        resLine += resStride;
527
399
      }
528
4
    }
529
4
  }
530
0
  else
531
0
#  endif
532
0
  {
533
0
    __m128i vsrca, vsrcat, vsrcab;
534
0
    __m128i vbaseoffset = _mm_set1_epi16( 2 );
535
0
    __m128i vplusone    = _mm_set1_epi16( 1 );
536
0
    __m128i vzero       = _mm_set1_epi8( 0 );
537
0
    __m128i vibdimax    = _mm_set1_epi16( ( 1 << channelBitDepth ) - 1 );
538
0
    __m128i voffsettbl  = _mm_loadu_si128( (__m128i*)p_eo_offsets );
539
540
0
    if( isCtuCrossedByVirtualBoundaries )
541
0
    {
542
0
      for( y = startY; y < endY; y++ )
543
0
      {
544
0
        if( !isHorProcessDisabled( y, numHorVirBndry, horVirBndryPos ) )
545
0
        {
546
0
          for( x = 0; x < width; x += 8 )
547
0
          {
548
0
            vsrca             = _mm_loadu_si128( (__m128i*)&srcLine[x] );
549
0
            vsrcat            = _mm_loadu_si128( (__m128i*)&srcLineAbove[x] );
550
0
            vsrcab            = _mm_loadu_si128( (__m128i*)&srcLineBelow[x] );
551
0
            vsrcat            = _mm_subs_epi16( vsrca, vsrcat );
552
0
            vsrcab            = _mm_subs_epi16( vsrca, vsrcab );
553
0
            __m128i vsignt    = _mm_sign_epi16( vplusone, vsrcat );
554
0
            __m128i vsignb    = _mm_sign_epi16( vplusone, vsrcab );
555
0
            __m128i vsign     = _mm_adds_epi16( _mm_adds_epi16( vsignt, vsignb ), vbaseoffset );
556
0
            __m128i veoffsets = _mm_shuffle_epi8( voffsettbl, vsign );
557
0
            veoffsets         = _mm_slli_epi16( veoffsets, 8 );
558
0
            veoffsets         = _mm_srai_epi16( veoffsets, 8 );
559
560
0
            vsrca = _mm_add_epi16( vsrca, veoffsets );
561
0
            vsrca = _mm_min_epi16( _mm_max_epi16( vsrca, vzero ), vibdimax );
562
563
0
            _mm_store_si128( (__m128i*)&resLine[x], vsrca );
564
0
          }
565
0
        }
566
0
        srcLine += srcStride;
567
0
        srcLineBelow += srcStride;
568
0
        srcLineAbove += srcStride;
569
0
        resLine += resStride;
570
0
      }
571
0
    }
572
0
    else
573
0
    {
574
0
      for( y = startY; y < endY; y++ )
575
0
      {
576
0
        for( x = 0; x < width; x += 8 )
577
0
        {
578
0
          vsrca             = _mm_loadu_si128( (__m128i*)&srcLine[x] );
579
0
          vsrcat            = _mm_loadu_si128( (__m128i*)&srcLineAbove[x] );
580
0
          vsrcab            = _mm_loadu_si128( (__m128i*)&srcLineBelow[x] );
581
0
          vsrcat            = _mm_subs_epi16( vsrca, vsrcat );
582
0
          vsrcab            = _mm_subs_epi16( vsrca, vsrcab );
583
0
          __m128i vsignt    = _mm_sign_epi16( vplusone, vsrcat );
584
0
          __m128i vsignb    = _mm_sign_epi16( vplusone, vsrcab );
585
0
          __m128i vsign     = _mm_adds_epi16( _mm_adds_epi16( vsignt, vsignb ), vbaseoffset );
586
0
          __m128i veoffsets = _mm_shuffle_epi8( voffsettbl, vsign );
587
0
          veoffsets         = _mm_slli_epi16( veoffsets, 8 );
588
0
          veoffsets         = _mm_srai_epi16( veoffsets, 8 );
589
590
0
          vsrca = _mm_add_epi16( vsrca, veoffsets );
591
0
          vsrca = _mm_min_epi16( _mm_max_epi16( vsrca, vzero ), vibdimax );
592
0
          _mm_store_si128( (__m128i*)&resLine[x], vsrca );
593
0
        }
594
0
        srcLine += srcStride;
595
0
        srcLineBelow += srcStride;
596
0
        srcLineAbove += srcStride;
597
0
        resLine += resStride;
598
0
      }
599
0
    }
600
0
  }
601
4
}
602
603
template<X86_VEXT vext>
604
static void offsetBlock_SIMD_SAO_TYPE_EO_135( const int            channelBitDepth,
605
                                              const ClpRng&        clpRng,
606
                                              int*                 offset,
607
                                              const Pel*           srcBlk,
608
                                              Pel*                 resBlk,
609
                                              ptrdiff_t            srcStride,
610
                                              ptrdiff_t            resStride,
611
                                              int                  width,
612
                                              int                  height,
613
                                              bool                 isLeftAvail,
614
                                              bool                 isRightAvail,
615
                                              bool                 isAboveAvail,
616
                                              bool                 isBelowAvail,
617
                                              bool                 isAboveLeftAvail,
618
                                              bool                 isAboveRightAvail,
619
                                              bool                 isBelowLeftAvail,
620
                                              bool                 isBelowRightAvail,
621
                                              std::vector<int8_t>* m_signLineBuf1,
622
                                              std::vector<int8_t>* m_signLineBuf2,
623
                                              bool                 isCtuCrossedByVirtualBoundaries,
624
                                              int                  horVirBndryPos[],
625
                                              int                  verVirBndryPos[],
626
                                              int                  numHorVirBndry,
627
                                              int                  numVerVirBndry,
628
                                              uint16_t             bndmask[MAX_CU_SIZE] )
629
0
{
630
0
  const Pel* srcLine = srcBlk;
631
0
  Pel*       resLine = resBlk;
632
633
0
  int    x, y, startX, startY, endX, endY, edgeType;
634
0
  int    firstLineStartX, firstLineEndX, lastLineStartX, lastLineEndX;
635
0
  int8_t signDown;
636
637
0
  if( isLeftAvail && isRightAvail && isAboveLeftAvail && isBelowRightAvail )
638
0
  {
639
0
    int8_t p_eo_offsets[16] = { 0 };
640
0
    for( int i = 0; i < SAO_EO_NUM_CATEGORIES; i++ )
641
0
    {
642
0
      p_eo_offsets[i] = offset[i];
643
0
    }
644
0
    const Pel* srcLineAbove = srcLine - srcStride;
645
0
    const Pel* srcLineBelow = srcLine + srcStride;
646
0
    startY                  = 0;
647
0
    if( !isAboveAvail )
648
0
    {
649
0
      startY       = 1;
650
0
      srcLineAbove = srcLine;
651
0
      srcLine += srcStride;
652
0
      resLine += resStride;
653
0
      srcLineBelow = srcLine + srcStride;
654
0
    }
655
0
    endY = height;
656
0
    if( !isBelowAvail )
657
0
    {
658
0
      endY = height - 1;
659
0
    }
660
#  if defined( USE_AVX2 )
661
    // AVX2
662
0
    if( ( width & 15 ) == 0 && vext >= AVX2 )
663
0
    {
664
0
      __m256i vsrca, vsrcat, vsrcab, virBmask;
665
666
      __m256i    vbaseoffset  = _mm256_set1_epi16( 2 );
667
      __m256i    vplusone     = _mm256_set1_epi16( 1 );
668
      __m256i    vzero        = _mm256_set1_epi8( 0 );
669
      __m256i    vibdimax     = _mm256_set1_epi16( ( 1 << channelBitDepth ) - 1 );
670
      __m256i    voffsettbl   = _mm256_broadcastsi128_si256( _mm_loadu_si128( (__m128i*)p_eo_offsets ) );
671
      const Pel* srcLineBelow = srcLine + srcStride;
672
673
0
      for( y = startY; y < endY; y++ )
674
0
      {
675
0
        if( !isHorProcessDisabled( y, numHorVirBndry, horVirBndryPos ) )
676
0
        {
677
0
          for( x = 0; x < width; x += 16 )
678
0
          {
679
0
            vsrca             = _mm256_loadu_si256( (__m256i*)&srcLine[x] );
680
0
            vsrcat            = _mm256_loadu_si256( (__m256i*)&srcLineAbove[x - 1] );
681
0
            vsrcab            = _mm256_loadu_si256( (__m256i*)&srcLineBelow[x + 1] );
682
0
            virBmask          = _mm256_loadu_si256( (__m256i*)&bndmask[x] );
683
0
            vsrcat            = _mm256_subs_epi16( vsrca, vsrcat );
684
0
            vsrcab            = _mm256_subs_epi16( vsrca, vsrcab );
685
0
            __m256i vsignt    = _mm256_sign_epi16( vplusone, vsrcat );
686
0
            __m256i vsignb    = _mm256_sign_epi16( vplusone, vsrcab );
687
0
            __m256i vsign     = _mm256_adds_epi16( _mm256_adds_epi16( vsignt, vsignb ), vbaseoffset );
688
0
            __m256i veoffsets = _mm256_shuffle_epi8( voffsettbl, vsign );
689
0
            veoffsets         = _mm256_slli_epi16( veoffsets, 8 );
690
0
            veoffsets         = _mm256_srai_epi16( veoffsets, 8 );
691
692
0
            vsrcat = _mm256_add_epi16( vsrca, veoffsets );
693
0
            vsrcat = _mm256_min_epi16( _mm256_max_epi16( vsrcat, vzero ), vibdimax );
694
695
0
            vsrcab = _mm256_blendv_epi8( vsrcat, vsrca, virBmask );
696
697
0
            _mm256_storeu_si256( (__m256i*)&resLine[x], vsrcab );
698
0
          }
699
0
        }
700
0
        srcLine += srcStride;
701
0
        srcLineBelow += srcStride;
702
0
        srcLineAbove += srcStride;
703
0
        resLine += resStride;
704
0
      }
705
0
    }
706
0
    else
707
0
#  endif
708
0
    {
709
0
      __m128i vsrca, vsrcat, vsrcab, virBmask;
710
0
      __m128i vbaseoffset = _mm_set1_epi16( 2 );
711
0
      __m128i vplusone    = _mm_set1_epi16( 1 );
712
0
      __m128i vzero       = _mm_set1_epi8( 0 );
713
0
      __m128i vibdimax    = _mm_set1_epi16( ( 1 << channelBitDepth ) - 1 );
714
0
      __m128i voffsettbl  = _mm_loadu_si128( (__m128i*)p_eo_offsets );
715
716
0
      for( y = startY; y < endY; y++ )
717
0
      {
718
0
        if( !isHorProcessDisabled( y, numHorVirBndry, horVirBndryPos ) )
719
0
        {
720
0
          for( x = 0; x < width; x += 8 )
721
0
          {
722
0
            vsrca             = _mm_loadu_si128( (__m128i*)&srcLine[x] );
723
0
            vsrcat            = _mm_loadu_si128( (__m128i*)&srcLineAbove[x - 1] );
724
0
            vsrcab            = _mm_loadu_si128( (__m128i*)&srcLineBelow[x + 1] );
725
0
            virBmask          = _mm_loadu_si128( (__m128i*)&bndmask[x] );
726
0
            vsrcat            = _mm_subs_epi16( vsrca, vsrcat );
727
0
            vsrcab            = _mm_subs_epi16( vsrca, vsrcab );
728
0
            __m128i vsignt    = _mm_sign_epi16( vplusone, vsrcat );
729
0
            __m128i vsignb    = _mm_sign_epi16( vplusone, vsrcab );
730
0
            __m128i vsign     = _mm_adds_epi16( _mm_adds_epi16( vsignt, vsignb ), vbaseoffset );
731
0
            __m128i veoffsets = _mm_shuffle_epi8( voffsettbl, vsign );
732
0
            veoffsets         = _mm_slli_epi16( veoffsets, 8 );
733
0
            veoffsets         = _mm_srai_epi16( veoffsets, 8 );
734
735
0
            vsrcat = _mm_add_epi16( vsrca, veoffsets );
736
0
            vsrcat = _mm_min_epi16( _mm_max_epi16( vsrcat, vzero ), vibdimax );
737
738
0
            vsrcab = _mm_blendv_epi8( vsrcat, vsrca, virBmask );
739
740
0
            _mm_store_si128( (__m128i*)&resLine[x], vsrcab );
741
0
          }
742
0
        }
743
0
        srcLine += srcStride;
744
0
        srcLineBelow += srcStride;
745
0
        srcLineAbove += srcStride;
746
0
        resLine += resStride;
747
0
      }
748
0
    }
749
0
  }
750
0
  else
751
0
  {
752
0
    offset += 2;
753
0
    int8_t *signUpLine, *signDownLine, *signTmpLine;
754
755
0
    signUpLine   = &m_signLineBuf1->front();
756
0
    signDownLine = &m_signLineBuf2->front();
757
758
0
    startX = isLeftAvail ? 0 : 1;
759
0
    endX   = isRightAvail ? width : ( width - 1 );
760
761
    // prepare 2nd line's upper sign
762
0
    const Pel* srcLineBelow = srcLine + srcStride;
763
0
    for( x = startX; x < endX + 1; x++ )
764
0
    {
765
0
      signUpLine[x] = (int8_t)sgn( srcLineBelow[x] - srcLine[x - 1] );
766
0
    }
767
0
    if( isCtuCrossedByVirtualBoundaries )
768
0
    {
769
      // 1st line
770
0
      const Pel* srcLineAbove = srcLine - srcStride;
771
0
      firstLineStartX         = isAboveLeftAvail ? 0 : 1;
772
0
      firstLineEndX           = isAboveAvail ? endX : 1;
773
0
      if( !isHorProcessDisabled( 0, numHorVirBndry, horVirBndryPos ) )
774
0
      {
775
0
        for( x = firstLineStartX; x < firstLineEndX; x++ )
776
0
        {
777
0
          if( isVerProcessDisabled( x, numVerVirBndry, verVirBndryPos ) )
778
0
          {
779
0
            continue;
780
0
          }
781
0
          edgeType = sgn( srcLine[x] - srcLineAbove[x - 1] ) - signUpLine[x + 1];
782
783
0
          resLine[x] = ClipPel<int>( srcLine[x] + offset[edgeType], clpRng );
784
0
        }
785
0
      }
786
0
      srcLine += srcStride;
787
0
      resLine += resStride;
788
      // middle lines
789
0
      for( y = 1; y < height - 1; y++ )
790
0
      {
791
0
        if( isHorProcessDisabled( y, numHorVirBndry, horVirBndryPos ) )
792
0
        {
793
0
          srcLineBelow = srcLine + srcStride;
794
0
          for( x = startX; x < endX; x++ )
795
0
          {
796
0
            signDown            = (int8_t)sgn( srcLine[x] - srcLineBelow[x + 1] );
797
0
            signDownLine[x + 1] = -signDown;
798
0
          }
799
0
          signDownLine[startX] = (int8_t)sgn( srcLineBelow[startX] - srcLine[startX - 1] );
800
0
          signTmpLine          = signUpLine;
801
0
          signUpLine           = signDownLine;
802
0
          signDownLine         = signTmpLine;
803
0
          srcLine += srcStride;
804
0
          resLine += resStride;
805
0
        }
806
0
        else
807
0
        {
808
0
          srcLineBelow = srcLine + srcStride;
809
0
          for( x = startX; x < endX; x++ )
810
0
          {
811
0
            signDown = (int8_t)sgn( srcLine[x] - srcLineBelow[x + 1] );
812
0
            if( isVerProcessDisabled( x, numVerVirBndry, verVirBndryPos ) )
813
0
            {
814
0
              signDownLine[x + 1] = -signDown;
815
0
              continue;
816
0
            }
817
0
            edgeType   = signDown + signUpLine[x];
818
0
            resLine[x] = ClipPel<int>( srcLine[x] + offset[edgeType], clpRng );
819
820
0
            signDownLine[x + 1] = -signDown;
821
0
          }
822
0
          signDownLine[startX] = (int8_t)sgn( srcLineBelow[startX] - srcLine[startX - 1] );
823
0
          signTmpLine          = signUpLine;
824
0
          signUpLine           = signDownLine;
825
0
          signDownLine         = signTmpLine;
826
0
          srcLine += srcStride;
827
0
          resLine += resStride;
828
0
        }
829
0
      }
830
      // last line
831
0
      srcLineBelow   = srcLine + srcStride;
832
0
      lastLineStartX = isBelowAvail ? startX : ( width - 1 );
833
0
      lastLineEndX   = isBelowRightAvail ? width : ( width - 1 );
834
0
      if( !isHorProcessDisabled( height - 1, numHorVirBndry, horVirBndryPos ) )
835
0
      {
836
0
        for( x = lastLineStartX; x < lastLineEndX; x++ )
837
0
        {
838
0
          if( isVerProcessDisabled( x, numVerVirBndry, verVirBndryPos ) )
839
0
          {
840
0
            continue;
841
0
          }
842
0
          edgeType   = sgn( srcLine[x] - srcLineBelow[x + 1] ) + signUpLine[x];
843
0
          resLine[x] = ClipPel<int>( srcLine[x] + offset[edgeType], clpRng );
844
0
        }
845
0
      }
846
0
    }
847
0
    else
848
0
    {
849
      // 1st line
850
0
      const Pel* srcLineAbove = srcLine - srcStride;
851
0
      firstLineStartX         = isAboveLeftAvail ? 0 : 1;
852
0
      firstLineEndX           = isAboveAvail ? endX : 1;
853
0
      for( x = firstLineStartX; x < firstLineEndX; x++ )
854
0
      {
855
0
        edgeType   = sgn( srcLine[x] - srcLineAbove[x - 1] ) - signUpLine[x + 1];
856
0
        resLine[x] = ClipPel<int>( srcLine[x] + offset[edgeType], clpRng );
857
0
      }
858
0
      srcLine += srcStride;
859
0
      resLine += resStride;
860
      // middle lines
861
0
      for( y = 1; y < height - 1; y++ )
862
0
      {
863
0
        srcLineBelow = srcLine + srcStride;
864
0
        for( x = startX; x < endX; x++ )
865
0
        {
866
0
          signDown            = (int8_t)sgn( srcLine[x] - srcLineBelow[x + 1] );
867
0
          edgeType            = signDown + signUpLine[x];
868
0
          resLine[x]          = ClipPel<int>( srcLine[x] + offset[edgeType], clpRng );
869
0
          signDownLine[x + 1] = -signDown;
870
0
        }
871
0
        signDownLine[startX] = (int8_t)sgn( srcLineBelow[startX] - srcLine[startX - 1] );
872
0
        signTmpLine          = signUpLine;
873
0
        signUpLine           = signDownLine;
874
0
        signDownLine         = signTmpLine;
875
0
        srcLine += srcStride;
876
0
        resLine += resStride;
877
0
      }
878
      // last line
879
0
      srcLineBelow   = srcLine + srcStride;
880
0
      lastLineStartX = isBelowAvail ? startX : ( width - 1 );
881
0
      lastLineEndX   = isBelowRightAvail ? width : ( width - 1 );
882
0
      for( x = lastLineStartX; x < lastLineEndX; x++ )
883
0
      {
884
0
        edgeType   = sgn( srcLine[x] - srcLineBelow[x + 1] ) + signUpLine[x];
885
0
        resLine[x] = ClipPel<int>( srcLine[x] + offset[edgeType], clpRng );
886
0
      }
887
0
    }
888
0
  }
889
0
}
Unexecuted instantiation: SampleAdaptiveOffset_sse41.cpp:void vvdec::offsetBlock_SIMD_SAO_TYPE_EO_135<(vvdec::x86_simd::X86_VEXT)1>(int, vvdec::ClpRngTemplate<short> const&, int*, short const*, short*, long, long, int, int, bool, bool, bool, bool, bool, bool, bool, bool, std::__1::vector<signed char, std::__1::allocator<signed char> >*, std::__1::vector<signed char, std::__1::allocator<signed char> >*, bool, int*, int*, int, int, unsigned short*)
Unexecuted instantiation: SampleAdaptiveOffset_avx2.cpp:void vvdec::offsetBlock_SIMD_SAO_TYPE_EO_135<(vvdec::x86_simd::X86_VEXT)4>(int, vvdec::ClpRngTemplate<short> const&, int*, short const*, short*, long, long, int, int, bool, bool, bool, bool, bool, bool, bool, bool, std::__1::vector<signed char, std::__1::allocator<signed char> >*, std::__1::vector<signed char, std::__1::allocator<signed char> >*, bool, int*, int*, int, int, unsigned short*)
890
891
template<X86_VEXT vext>
892
static void offsetBlock_SIMD_SAO_TYPE_EO_45( const int            channelBitDepth,
893
                                             const ClpRng&        clpRng,
894
                                             int*                 offset,
895
                                             const Pel*           srcBlk,
896
                                             Pel*                 resBlk,
897
                                             ptrdiff_t            srcStride,
898
                                             ptrdiff_t            resStride,
899
                                             int                  width,
900
                                             int                  height,
901
                                             bool                 isLeftAvail,
902
                                             bool                 isRightAvail,
903
                                             bool                 isAboveAvail,
904
                                             bool                 isBelowAvail,
905
                                             bool                 isAboveLeftAvail,
906
                                             bool                 isAboveRightAvail,
907
                                             bool                 isBelowLeftAvail,
908
                                             bool                 isBelowRightAvail,
909
                                             std::vector<int8_t>* m_signLineBuf1,
910
                                             std::vector<int8_t>* m_signLineBuf2,
911
                                             bool                 isCtuCrossedByVirtualBoundaries,
912
                                             int                  horVirBndryPos[],
913
                                             int                  verVirBndryPos[],
914
                                             int                  numHorVirBndry,
915
                                             int                  numVerVirBndry,
916
                                             uint16_t             bndmask[MAX_CU_SIZE] )
917
2
{
918
2
  const Pel* srcLine = srcBlk;
919
2
  Pel*       resLine = resBlk;
920
921
2
  int    x, y, startX, startY, endX, endY, edgeType;
922
2
  int    firstLineStartX, firstLineEndX, lastLineStartX, lastLineEndX;
923
2
  int8_t signDown;
924
925
2
  if( isLeftAvail && isRightAvail && isAboveLeftAvail && isBelowRightAvail )
926
0
  {
927
0
    int8_t p_eo_offsets[16] = { 0 };
928
0
    for( int i = 0; i < SAO_EO_NUM_CATEGORIES; i++ )
929
0
    {
930
0
      p_eo_offsets[i] = offset[i];
931
0
    }
932
0
    const Pel* srcLineAbove = srcLine - srcStride;
933
0
    const Pel* srcLineBelow = srcLine + srcStride;
934
0
    startY                  = 0;
935
0
    if( !isAboveAvail )
936
0
    {
937
0
      startY       = 1;
938
0
      srcLineAbove = srcLine;
939
0
      srcLine += srcStride;
940
0
      resLine += resStride;
941
0
      srcLineBelow = srcLine + srcStride;
942
0
    }
943
0
    endY = height;
944
0
    if( !isBelowAvail )
945
0
    {
946
0
      endY = height - 1;
947
0
    }
948
#  if defined( USE_AVX2 )
949
    // AVX2
950
0
    if( ( width & 15 ) == 0 && vext >= AVX2 )
951
0
    {
952
0
      __m256i    virBmask;
953
0
      __m256i    vsrca, vsrcat, vsrcab;
954
0
      __m256i    vbaseoffset  = _mm256_set1_epi16( 2 );
955
0
      __m256i    vplusone     = _mm256_set1_epi16( 1 );
956
0
      __m256i    vzero        = _mm256_set1_epi8( 0 );
957
0
      __m256i    vibdimax     = _mm256_set1_epi16( ( 1 << channelBitDepth ) - 1 );
958
0
      __m256i    voffsettbl   = _mm256_broadcastsi128_si256( _mm_loadu_si128( (__m128i*)p_eo_offsets ) );
959
0
      const Pel* srcLineBelow = srcLine + srcStride;
960
961
0
      for( y = startY; y < endY; y++ )
962
0
      {
963
        //         printf("y %d endY %d x%d y %d \n"y,endY,isAboveAvail,)
964
0
        if( !isHorProcessDisabled( y, numHorVirBndry, horVirBndryPos ) )
965
0
        {
966
0
          for( x = 0; x < width; x += 16 )
967
0
          {
968
0
            virBmask = _mm256_loadu_si256( (__m256i*)&bndmask[x] );
969
0
            vsrca    = _mm256_loadu_si256( (__m256i*)&srcLine[x] );
970
0
            vsrcat   = _mm256_loadu_si256( (__m256i*)&srcLineAbove[x + 1] );
971
0
            vsrcab   = _mm256_loadu_si256( (__m256i*)&srcLineBelow[x - 1] );
972
973
0
            vsrcat            = _mm256_subs_epi16( vsrca, vsrcat );
974
0
            vsrcab            = _mm256_subs_epi16( vsrca, vsrcab );
975
0
            __m256i vsignt    = _mm256_sign_epi16( vplusone, vsrcat );
976
0
            __m256i vsignb    = _mm256_sign_epi16( vplusone, vsrcab );
977
0
            __m256i vsign     = _mm256_adds_epi16( _mm256_adds_epi16( vsignt, vsignb ), vbaseoffset );
978
0
            __m256i veoffsets = _mm256_shuffle_epi8( voffsettbl, vsign );
979
0
            veoffsets         = _mm256_slli_epi16( veoffsets, 8 );
980
0
            veoffsets         = _mm256_srai_epi16( veoffsets, 8 );
981
982
0
            vsrcat = _mm256_add_epi16( vsrca, veoffsets );
983
0
            vsrcat = _mm256_min_epi16( _mm256_max_epi16( vsrcat, vzero ), vibdimax );
984
985
0
            vsrcab = _mm256_blendv_epi8( vsrcat, vsrca, virBmask );
986
987
0
            _mm256_storeu_si256( (__m256i*)&resLine[x], vsrcab );
988
0
          }
989
0
        }
990
0
        srcLine += srcStride;
991
0
        srcLineBelow += srcStride;
992
0
        srcLineAbove += srcStride;
993
0
        resLine += resStride;
994
0
      }
995
0
    }
996
0
    else
997
0
#  endif
998
0
    {
999
0
      __m128i vsrca, vsrcat, vsrcab, virBmask;
1000
0
      __m128i vbaseoffset = _mm_set1_epi16( 2 );
1001
0
      __m128i vplusone    = _mm_set1_epi16( 1 );
1002
0
      __m128i vzero       = _mm_set1_epi8( 0 );
1003
0
      __m128i vibdimax    = _mm_set1_epi16( ( 1 << channelBitDepth ) - 1 );
1004
0
      __m128i voffsettbl  = _mm_loadu_si128( (__m128i*)p_eo_offsets );
1005
1006
0
      for( y = startY; y < endY; y++ )
1007
0
      {
1008
0
        if( !isHorProcessDisabled( y, numHorVirBndry, horVirBndryPos ) )
1009
0
        {
1010
0
          for( x = 0; x < width; x += 8 )
1011
0
          {
1012
0
            vsrca    = _mm_loadu_si128( (__m128i*)&srcLine[x] );
1013
0
            vsrcat   = _mm_loadu_si128( (__m128i*)&srcLineAbove[x + 1] );
1014
0
            virBmask = _mm_loadu_si128( (__m128i*)&bndmask[x] );
1015
1016
0
            vsrcab            = _mm_loadu_si128( (__m128i*)&srcLineBelow[x - 1] );
1017
0
            vsrcat            = _mm_subs_epi16( vsrca, vsrcat );
1018
0
            vsrcab            = _mm_subs_epi16( vsrca, vsrcab );
1019
0
            __m128i vsignt    = _mm_sign_epi16( vplusone, vsrcat );
1020
0
            __m128i vsignb    = _mm_sign_epi16( vplusone, vsrcab );
1021
0
            __m128i vsign     = _mm_adds_epi16( _mm_adds_epi16( vsignt, vsignb ), vbaseoffset );
1022
0
            __m128i veoffsets = _mm_shuffle_epi8( voffsettbl, vsign );
1023
0
            veoffsets         = _mm_slli_epi16( veoffsets, 8 );
1024
0
            veoffsets         = _mm_srai_epi16( veoffsets, 8 );
1025
1026
0
            vsrcat = _mm_add_epi16( vsrca, veoffsets );
1027
0
            vsrcat = _mm_min_epi16( _mm_max_epi16( vsrcat, vzero ), vibdimax );
1028
1029
0
            vsrcab = _mm_blendv_epi8( vsrcat, vsrca, virBmask );
1030
1031
0
            _mm_store_si128( (__m128i*)&resLine[x], vsrcab );
1032
0
          }
1033
0
        }
1034
0
        srcLine += srcStride;
1035
0
        srcLineBelow += srcStride;
1036
0
        srcLineAbove += srcStride;
1037
0
        resLine += resStride;
1038
0
      }
1039
0
    }
1040
0
  }
1041
2
  else
1042
2
  {
1043
2
    offset += 2;
1044
2
    int8_t* signUpLine = &m_signLineBuf1->at( 1 );
1045
2
    startX             = isLeftAvail ? 0 : 1;
1046
2
    endX               = isRightAvail ? width : ( width - 1 );
1047
    // prepare 2nd line upper sign
1048
2
    const Pel* srcLineBelow = srcLine + srcStride;
1049
64
    for( x = startX - 1; x < endX; x++ )
1050
62
    {
1051
62
      signUpLine[x] = (int8_t)sgn( srcLineBelow[x] - srcLine[x + 1] );
1052
62
    }
1053
1054
2
    if( isCtuCrossedByVirtualBoundaries )
1055
0
    {
1056
      // first line
1057
0
      const Pel* srcLineAbove = srcLine - srcStride;
1058
0
      firstLineStartX         = isAboveAvail ? startX : ( width - 1 );
1059
0
      firstLineEndX           = isAboveRightAvail ? width : ( width - 1 );
1060
0
      if( !isHorProcessDisabled( 0, numHorVirBndry, horVirBndryPos ) )
1061
0
      {
1062
0
        for( x = firstLineStartX; x < firstLineEndX; x++ )
1063
0
        {
1064
0
          if( isVerProcessDisabled( x, numVerVirBndry, verVirBndryPos ) )
1065
0
          {
1066
0
            continue;
1067
0
          }
1068
0
          edgeType   = sgn( srcLine[x] - srcLineAbove[x + 1] ) - signUpLine[x - 1];
1069
0
          resLine[x] = ClipPel<int>( srcLine[x] + offset[edgeType], clpRng );
1070
0
        }
1071
0
      }
1072
0
      srcLine += srcStride;
1073
0
      resLine += resStride;
1074
      // middle lines
1075
0
      for( y = 1; y < height - 1; y++ )
1076
0
      {
1077
0
        if( isHorProcessDisabled( y, numHorVirBndry, horVirBndryPos ) )
1078
0
        {
1079
0
          srcLineBelow = srcLine + ( srcStride );
1080
0
          for( x = startX; x < endX; x++ )
1081
0
          {
1082
0
            signDown          = (int8_t)sgn( srcLine[x] - srcLineBelow[x - 1] );
1083
0
            signUpLine[x - 1] = -signDown;
1084
0
          }
1085
0
          signUpLine[endX - 1] = (int8_t)sgn( srcLineBelow[endX - 1] - srcLine[endX] );
1086
0
          srcLine += srcStride;
1087
0
          resLine += ( resStride );
1088
0
        }
1089
0
        else
1090
0
        {
1091
0
          srcLineBelow = srcLine + srcStride;
1092
0
          for( x = startX; x < endX; x++ )
1093
0
          {
1094
0
            signDown = (int8_t)sgn( srcLine[x] - srcLineBelow[x - 1] );
1095
0
            if( isVerProcessDisabled( x, numVerVirBndry, verVirBndryPos ) )
1096
0
            {
1097
0
              signUpLine[x - 1] = -signDown;
1098
0
              continue;
1099
0
            }
1100
0
            edgeType          = signDown + signUpLine[x];
1101
0
            resLine[x]        = ClipPel<int>( srcLine[x] + offset[edgeType], clpRng );
1102
0
            signUpLine[x - 1] = -signDown;
1103
0
          }
1104
0
          signUpLine[endX - 1] = (int8_t)sgn( srcLineBelow[endX - 1] - srcLine[endX] );
1105
0
          srcLine += srcStride;
1106
0
          resLine += resStride;
1107
0
        }
1108
0
      }
1109
      // last line
1110
0
      srcLineBelow   = srcLine + srcStride;
1111
0
      lastLineStartX = isBelowLeftAvail ? 0 : 1;
1112
0
      lastLineEndX   = isBelowAvail ? endX : 1;
1113
0
      if( !isHorProcessDisabled( height - 1, numHorVirBndry, horVirBndryPos ) )
1114
0
      {
1115
0
        for( x = lastLineStartX; x < lastLineEndX; x++ )
1116
0
        {
1117
0
          if( isVerProcessDisabled( x, numVerVirBndry, verVirBndryPos ) )
1118
0
          {
1119
0
            continue;
1120
0
          }
1121
0
          edgeType   = sgn( srcLine[x] - srcLineBelow[x - 1] ) + signUpLine[x];
1122
0
          resLine[x] = ClipPel<int>( srcLine[x] + offset[edgeType], clpRng );
1123
0
        }
1124
0
      }
1125
0
    }
1126
2
    else
1127
2
    {
1128
      // first line
1129
2
      const Pel* srcLineAbove = srcLine - srcStride;
1130
2
      firstLineStartX         = isAboveAvail ? startX : ( width - 1 );
1131
2
      firstLineEndX           = isAboveRightAvail ? width : ( width - 1 );
1132
2
      for( x = firstLineStartX; x < firstLineEndX; x++ )
1133
0
      {
1134
0
        edgeType   = sgn( srcLine[x] - srcLineAbove[x + 1] ) - signUpLine[x - 1];
1135
0
        resLine[x] = ClipPel<int>( srcLine[x] + offset[edgeType], clpRng );
1136
0
      }
1137
2
      srcLine += srcStride;
1138
2
      resLine += resStride;
1139
      // middle lines
1140
62
      for( y = 1; y < height - 1; y++ )
1141
60
      {
1142
60
        srcLineBelow = srcLine + srcStride;
1143
1.86k
        for( x = startX; x < endX; x++ )
1144
1.80k
        {
1145
1.80k
          signDown          = (int8_t)sgn( srcLine[x] - srcLineBelow[x - 1] );
1146
1.80k
          edgeType          = signDown + signUpLine[x];
1147
1.80k
          resLine[x]        = ClipPel<int>( srcLine[x] + offset[edgeType], clpRng );
1148
1.80k
          signUpLine[x - 1] = -signDown;
1149
1.80k
        }
1150
60
        signUpLine[endX - 1] = (int8_t)sgn( srcLineBelow[endX - 1] - srcLine[endX] );
1151
60
        srcLine += srcStride;
1152
60
        resLine += resStride;
1153
60
      }
1154
      // last line
1155
2
      srcLineBelow   = srcLine + srcStride;
1156
2
      lastLineStartX = isBelowLeftAvail ? 0 : 1;
1157
2
      lastLineEndX   = isBelowAvail ? endX : 1;
1158
2
      for( x = lastLineStartX; x < lastLineEndX; x++ )
1159
0
      {
1160
0
        edgeType   = sgn( srcLine[x] - srcLineBelow[x - 1] ) + signUpLine[x];
1161
0
        resLine[x] = ClipPel<int>( srcLine[x] + offset[edgeType], clpRng );
1162
0
      }
1163
2
    }
1164
2
  }
1165
2
}
Unexecuted instantiation: SampleAdaptiveOffset_sse41.cpp:void vvdec::offsetBlock_SIMD_SAO_TYPE_EO_45<(vvdec::x86_simd::X86_VEXT)1>(int, vvdec::ClpRngTemplate<short> const&, int*, short const*, short*, long, long, int, int, bool, bool, bool, bool, bool, bool, bool, bool, std::__1::vector<signed char, std::__1::allocator<signed char> >*, std::__1::vector<signed char, std::__1::allocator<signed char> >*, bool, int*, int*, int, int, unsigned short*)
SampleAdaptiveOffset_avx2.cpp:void vvdec::offsetBlock_SIMD_SAO_TYPE_EO_45<(vvdec::x86_simd::X86_VEXT)4>(int, vvdec::ClpRngTemplate<short> const&, int*, short const*, short*, long, long, int, int, bool, bool, bool, bool, bool, bool, bool, bool, std::__1::vector<signed char, std::__1::allocator<signed char> >*, std::__1::vector<signed char, std::__1::allocator<signed char> >*, bool, int*, int*, int, int, unsigned short*)
Line
Count
Source
917
2
{
918
2
  const Pel* srcLine = srcBlk;
919
2
  Pel*       resLine = resBlk;
920
921
2
  int    x, y, startX, startY, endX, endY, edgeType;
922
2
  int    firstLineStartX, firstLineEndX, lastLineStartX, lastLineEndX;
923
2
  int8_t signDown;
924
925
2
  if( isLeftAvail && isRightAvail && isAboveLeftAvail && isBelowRightAvail )
926
0
  {
927
0
    int8_t p_eo_offsets[16] = { 0 };
928
0
    for( int i = 0; i < SAO_EO_NUM_CATEGORIES; i++ )
929
0
    {
930
0
      p_eo_offsets[i] = offset[i];
931
0
    }
932
0
    const Pel* srcLineAbove = srcLine - srcStride;
933
0
    const Pel* srcLineBelow = srcLine + srcStride;
934
0
    startY                  = 0;
935
0
    if( !isAboveAvail )
936
0
    {
937
0
      startY       = 1;
938
0
      srcLineAbove = srcLine;
939
0
      srcLine += srcStride;
940
0
      resLine += resStride;
941
0
      srcLineBelow = srcLine + srcStride;
942
0
    }
943
0
    endY = height;
944
0
    if( !isBelowAvail )
945
0
    {
946
0
      endY = height - 1;
947
0
    }
948
0
#  if defined( USE_AVX2 )
949
    // AVX2
950
0
    if( ( width & 15 ) == 0 && vext >= AVX2 )
951
0
    {
952
0
      __m256i    virBmask;
953
0
      __m256i    vsrca, vsrcat, vsrcab;
954
0
      __m256i    vbaseoffset  = _mm256_set1_epi16( 2 );
955
0
      __m256i    vplusone     = _mm256_set1_epi16( 1 );
956
0
      __m256i    vzero        = _mm256_set1_epi8( 0 );
957
0
      __m256i    vibdimax     = _mm256_set1_epi16( ( 1 << channelBitDepth ) - 1 );
958
0
      __m256i    voffsettbl   = _mm256_broadcastsi128_si256( _mm_loadu_si128( (__m128i*)p_eo_offsets ) );
959
0
      const Pel* srcLineBelow = srcLine + srcStride;
960
961
0
      for( y = startY; y < endY; y++ )
962
0
      {
963
        //         printf("y %d endY %d x%d y %d \n"y,endY,isAboveAvail,)
964
0
        if( !isHorProcessDisabled( y, numHorVirBndry, horVirBndryPos ) )
965
0
        {
966
0
          for( x = 0; x < width; x += 16 )
967
0
          {
968
0
            virBmask = _mm256_loadu_si256( (__m256i*)&bndmask[x] );
969
0
            vsrca    = _mm256_loadu_si256( (__m256i*)&srcLine[x] );
970
0
            vsrcat   = _mm256_loadu_si256( (__m256i*)&srcLineAbove[x + 1] );
971
0
            vsrcab   = _mm256_loadu_si256( (__m256i*)&srcLineBelow[x - 1] );
972
973
0
            vsrcat            = _mm256_subs_epi16( vsrca, vsrcat );
974
0
            vsrcab            = _mm256_subs_epi16( vsrca, vsrcab );
975
0
            __m256i vsignt    = _mm256_sign_epi16( vplusone, vsrcat );
976
0
            __m256i vsignb    = _mm256_sign_epi16( vplusone, vsrcab );
977
0
            __m256i vsign     = _mm256_adds_epi16( _mm256_adds_epi16( vsignt, vsignb ), vbaseoffset );
978
0
            __m256i veoffsets = _mm256_shuffle_epi8( voffsettbl, vsign );
979
0
            veoffsets         = _mm256_slli_epi16( veoffsets, 8 );
980
0
            veoffsets         = _mm256_srai_epi16( veoffsets, 8 );
981
982
0
            vsrcat = _mm256_add_epi16( vsrca, veoffsets );
983
0
            vsrcat = _mm256_min_epi16( _mm256_max_epi16( vsrcat, vzero ), vibdimax );
984
985
0
            vsrcab = _mm256_blendv_epi8( vsrcat, vsrca, virBmask );
986
987
0
            _mm256_storeu_si256( (__m256i*)&resLine[x], vsrcab );
988
0
          }
989
0
        }
990
0
        srcLine += srcStride;
991
0
        srcLineBelow += srcStride;
992
0
        srcLineAbove += srcStride;
993
0
        resLine += resStride;
994
0
      }
995
0
    }
996
0
    else
997
0
#  endif
998
0
    {
999
0
      __m128i vsrca, vsrcat, vsrcab, virBmask;
1000
0
      __m128i vbaseoffset = _mm_set1_epi16( 2 );
1001
0
      __m128i vplusone    = _mm_set1_epi16( 1 );
1002
0
      __m128i vzero       = _mm_set1_epi8( 0 );
1003
0
      __m128i vibdimax    = _mm_set1_epi16( ( 1 << channelBitDepth ) - 1 );
1004
0
      __m128i voffsettbl  = _mm_loadu_si128( (__m128i*)p_eo_offsets );
1005
1006
0
      for( y = startY; y < endY; y++ )
1007
0
      {
1008
0
        if( !isHorProcessDisabled( y, numHorVirBndry, horVirBndryPos ) )
1009
0
        {
1010
0
          for( x = 0; x < width; x += 8 )
1011
0
          {
1012
0
            vsrca    = _mm_loadu_si128( (__m128i*)&srcLine[x] );
1013
0
            vsrcat   = _mm_loadu_si128( (__m128i*)&srcLineAbove[x + 1] );
1014
0
            virBmask = _mm_loadu_si128( (__m128i*)&bndmask[x] );
1015
1016
0
            vsrcab            = _mm_loadu_si128( (__m128i*)&srcLineBelow[x - 1] );
1017
0
            vsrcat            = _mm_subs_epi16( vsrca, vsrcat );
1018
0
            vsrcab            = _mm_subs_epi16( vsrca, vsrcab );
1019
0
            __m128i vsignt    = _mm_sign_epi16( vplusone, vsrcat );
1020
0
            __m128i vsignb    = _mm_sign_epi16( vplusone, vsrcab );
1021
0
            __m128i vsign     = _mm_adds_epi16( _mm_adds_epi16( vsignt, vsignb ), vbaseoffset );
1022
0
            __m128i veoffsets = _mm_shuffle_epi8( voffsettbl, vsign );
1023
0
            veoffsets         = _mm_slli_epi16( veoffsets, 8 );
1024
0
            veoffsets         = _mm_srai_epi16( veoffsets, 8 );
1025
1026
0
            vsrcat = _mm_add_epi16( vsrca, veoffsets );
1027
0
            vsrcat = _mm_min_epi16( _mm_max_epi16( vsrcat, vzero ), vibdimax );
1028
1029
0
            vsrcab = _mm_blendv_epi8( vsrcat, vsrca, virBmask );
1030
1031
0
            _mm_store_si128( (__m128i*)&resLine[x], vsrcab );
1032
0
          }
1033
0
        }
1034
0
        srcLine += srcStride;
1035
0
        srcLineBelow += srcStride;
1036
0
        srcLineAbove += srcStride;
1037
0
        resLine += resStride;
1038
0
      }
1039
0
    }
1040
0
  }
1041
2
  else
1042
2
  {
1043
2
    offset += 2;
1044
2
    int8_t* signUpLine = &m_signLineBuf1->at( 1 );
1045
2
    startX             = isLeftAvail ? 0 : 1;
1046
2
    endX               = isRightAvail ? width : ( width - 1 );
1047
    // prepare 2nd line upper sign
1048
2
    const Pel* srcLineBelow = srcLine + srcStride;
1049
64
    for( x = startX - 1; x < endX; x++ )
1050
62
    {
1051
62
      signUpLine[x] = (int8_t)sgn( srcLineBelow[x] - srcLine[x + 1] );
1052
62
    }
1053
1054
2
    if( isCtuCrossedByVirtualBoundaries )
1055
0
    {
1056
      // first line
1057
0
      const Pel* srcLineAbove = srcLine - srcStride;
1058
0
      firstLineStartX         = isAboveAvail ? startX : ( width - 1 );
1059
0
      firstLineEndX           = isAboveRightAvail ? width : ( width - 1 );
1060
0
      if( !isHorProcessDisabled( 0, numHorVirBndry, horVirBndryPos ) )
1061
0
      {
1062
0
        for( x = firstLineStartX; x < firstLineEndX; x++ )
1063
0
        {
1064
0
          if( isVerProcessDisabled( x, numVerVirBndry, verVirBndryPos ) )
1065
0
          {
1066
0
            continue;
1067
0
          }
1068
0
          edgeType   = sgn( srcLine[x] - srcLineAbove[x + 1] ) - signUpLine[x - 1];
1069
0
          resLine[x] = ClipPel<int>( srcLine[x] + offset[edgeType], clpRng );
1070
0
        }
1071
0
      }
1072
0
      srcLine += srcStride;
1073
0
      resLine += resStride;
1074
      // middle lines
1075
0
      for( y = 1; y < height - 1; y++ )
1076
0
      {
1077
0
        if( isHorProcessDisabled( y, numHorVirBndry, horVirBndryPos ) )
1078
0
        {
1079
0
          srcLineBelow = srcLine + ( srcStride );
1080
0
          for( x = startX; x < endX; x++ )
1081
0
          {
1082
0
            signDown          = (int8_t)sgn( srcLine[x] - srcLineBelow[x - 1] );
1083
0
            signUpLine[x - 1] = -signDown;
1084
0
          }
1085
0
          signUpLine[endX - 1] = (int8_t)sgn( srcLineBelow[endX - 1] - srcLine[endX] );
1086
0
          srcLine += srcStride;
1087
0
          resLine += ( resStride );
1088
0
        }
1089
0
        else
1090
0
        {
1091
0
          srcLineBelow = srcLine + srcStride;
1092
0
          for( x = startX; x < endX; x++ )
1093
0
          {
1094
0
            signDown = (int8_t)sgn( srcLine[x] - srcLineBelow[x - 1] );
1095
0
            if( isVerProcessDisabled( x, numVerVirBndry, verVirBndryPos ) )
1096
0
            {
1097
0
              signUpLine[x - 1] = -signDown;
1098
0
              continue;
1099
0
            }
1100
0
            edgeType          = signDown + signUpLine[x];
1101
0
            resLine[x]        = ClipPel<int>( srcLine[x] + offset[edgeType], clpRng );
1102
0
            signUpLine[x - 1] = -signDown;
1103
0
          }
1104
0
          signUpLine[endX - 1] = (int8_t)sgn( srcLineBelow[endX - 1] - srcLine[endX] );
1105
0
          srcLine += srcStride;
1106
0
          resLine += resStride;
1107
0
        }
1108
0
      }
1109
      // last line
1110
0
      srcLineBelow   = srcLine + srcStride;
1111
0
      lastLineStartX = isBelowLeftAvail ? 0 : 1;
1112
0
      lastLineEndX   = isBelowAvail ? endX : 1;
1113
0
      if( !isHorProcessDisabled( height - 1, numHorVirBndry, horVirBndryPos ) )
1114
0
      {
1115
0
        for( x = lastLineStartX; x < lastLineEndX; x++ )
1116
0
        {
1117
0
          if( isVerProcessDisabled( x, numVerVirBndry, verVirBndryPos ) )
1118
0
          {
1119
0
            continue;
1120
0
          }
1121
0
          edgeType   = sgn( srcLine[x] - srcLineBelow[x - 1] ) + signUpLine[x];
1122
0
          resLine[x] = ClipPel<int>( srcLine[x] + offset[edgeType], clpRng );
1123
0
        }
1124
0
      }
1125
0
    }
1126
2
    else
1127
2
    {
1128
      // first line
1129
2
      const Pel* srcLineAbove = srcLine - srcStride;
1130
2
      firstLineStartX         = isAboveAvail ? startX : ( width - 1 );
1131
2
      firstLineEndX           = isAboveRightAvail ? width : ( width - 1 );
1132
2
      for( x = firstLineStartX; x < firstLineEndX; x++ )
1133
0
      {
1134
0
        edgeType   = sgn( srcLine[x] - srcLineAbove[x + 1] ) - signUpLine[x - 1];
1135
0
        resLine[x] = ClipPel<int>( srcLine[x] + offset[edgeType], clpRng );
1136
0
      }
1137
2
      srcLine += srcStride;
1138
2
      resLine += resStride;
1139
      // middle lines
1140
62
      for( y = 1; y < height - 1; y++ )
1141
60
      {
1142
60
        srcLineBelow = srcLine + srcStride;
1143
1.86k
        for( x = startX; x < endX; x++ )
1144
1.80k
        {
1145
1.80k
          signDown          = (int8_t)sgn( srcLine[x] - srcLineBelow[x - 1] );
1146
1.80k
          edgeType          = signDown + signUpLine[x];
1147
1.80k
          resLine[x]        = ClipPel<int>( srcLine[x] + offset[edgeType], clpRng );
1148
1.80k
          signUpLine[x - 1] = -signDown;
1149
1.80k
        }
1150
60
        signUpLine[endX - 1] = (int8_t)sgn( srcLineBelow[endX - 1] - srcLine[endX] );
1151
60
        srcLine += srcStride;
1152
60
        resLine += resStride;
1153
60
      }
1154
      // last line
1155
2
      srcLineBelow   = srcLine + srcStride;
1156
2
      lastLineStartX = isBelowLeftAvail ? 0 : 1;
1157
2
      lastLineEndX   = isBelowAvail ? endX : 1;
1158
2
      for( x = lastLineStartX; x < lastLineEndX; x++ )
1159
0
      {
1160
0
        edgeType   = sgn( srcLine[x] - srcLineBelow[x - 1] ) + signUpLine[x];
1161
0
        resLine[x] = ClipPel<int>( srcLine[x] + offset[edgeType], clpRng );
1162
0
      }
1163
2
    }
1164
2
  }
1165
2
}
1166
1167
template<X86_VEXT vext>
1168
void offsetBlock_SIMD( const int            channelBitDepth,
1169
                       const ClpRng&        clpRng,
1170
                       int                  typeIdx,
1171
                       int*                 offset,
1172
                       int                  startIdx,
1173
                       const Pel*           srcBlk,
1174
                       Pel*                 resBlk,
1175
                       ptrdiff_t            srcStride,
1176
                       ptrdiff_t            resStride,
1177
                       int                  width,
1178
                       int                  height,
1179
                       bool                 isLeftAvail,
1180
                       bool                 isRightAvail,
1181
                       bool                 isAboveAvail,
1182
                       bool                 isBelowAvail,
1183
                       bool                 isAboveLeftAvail,
1184
                       bool                 isAboveRightAvail,
1185
                       bool                 isBelowLeftAvail,
1186
                       bool                 isBelowRightAvail,
1187
                       std::vector<int8_t>* m_signLineBuf1,
1188
                       std::vector<int8_t>* m_signLineBuf2,
1189
                       bool                 isCtuCrossedByVirtualBoundaries,
1190
                       int                  horVirBndryPos[],
1191
                       int                  verVirBndryPos[],
1192
                       int                  numHorVirBndry,
1193
                       int                  numVerVirBndry )
1194
18
{
1195
18
  if( typeIdx == SAO_TYPE_BO )
1196
5
  {
1197
5
    offsetBlock_SIMD_SAO_TYPE_BO<vext>( channelBitDepth, offset, startIdx, srcBlk, resBlk, srcStride, resStride, width, height );
1198
1199
#  if USE_AVX2
1200
    _mm256_zeroupper();
1201
#  endif
1202
5
    return;
1203
5
  }
1204
1205
13
  uint16_t bndmask[MAX_CU_SIZE];
1206
13
  memset( &bndmask, 0, MAX_CU_SIZE * sizeof( uint16_t ) );
1207
13
  if( isCtuCrossedByVirtualBoundaries && numVerVirBndry > 0 )
1208
0
  {
1209
0
    for( int i = 0; i < numVerVirBndry; i++ )
1210
0
    {
1211
0
      if( verVirBndryPos[i] >= 0 && verVirBndryPos[i] < width )
1212
0
      {
1213
0
        bndmask[verVirBndryPos[i]] = 0xffff;
1214
0
      }
1215
1216
0
      if( verVirBndryPos[i] - 1 >= 0 && verVirBndryPos[i] - 1 < width )
1217
0
      {
1218
0
        bndmask[verVirBndryPos[i] - 1] = 0xffff;
1219
0
      }
1220
0
    }
1221
0
  }
1222
1223
13
  switch( typeIdx )
1224
13
  {
1225
7
  case SAO_TYPE_EO_0:
1226
7
    offsetBlock_SIMD_SAO_TYPE_EO_0<vext>( channelBitDepth,
1227
7
                                          clpRng,
1228
7
                                          offset,
1229
7
                                          srcBlk,
1230
7
                                          resBlk,
1231
7
                                          srcStride,
1232
7
                                          resStride,
1233
7
                                          width,
1234
7
                                          height,
1235
7
                                          isLeftAvail,
1236
7
                                          isRightAvail,
1237
7
                                          isAboveAvail,
1238
7
                                          isBelowAvail,
1239
7
                                          isAboveLeftAvail,
1240
7
                                          isAboveRightAvail,
1241
7
                                          isBelowLeftAvail,
1242
7
                                          isBelowRightAvail,
1243
7
                                          m_signLineBuf1,
1244
7
                                          m_signLineBuf2,
1245
7
                                          isCtuCrossedByVirtualBoundaries,
1246
7
                                          horVirBndryPos,
1247
7
                                          verVirBndryPos,
1248
7
                                          numHorVirBndry,
1249
7
                                          numVerVirBndry,
1250
7
                                          bndmask );
1251
7
    break;
1252
1253
4
  case SAO_TYPE_EO_90:
1254
4
    offsetBlock_SIMD_SAO_TYPE_EO_90<vext>( channelBitDepth,
1255
4
                                           clpRng,
1256
4
                                           offset,
1257
4
                                           srcBlk,
1258
4
                                           resBlk,
1259
4
                                           srcStride,
1260
4
                                           resStride,
1261
4
                                           width,
1262
4
                                           height,
1263
4
                                           isLeftAvail,
1264
4
                                           isRightAvail,
1265
4
                                           isAboveAvail,
1266
4
                                           isBelowAvail,
1267
4
                                           isAboveLeftAvail,
1268
4
                                           isAboveRightAvail,
1269
4
                                           isBelowLeftAvail,
1270
4
                                           isBelowRightAvail,
1271
4
                                           m_signLineBuf1,
1272
4
                                           m_signLineBuf2,
1273
4
                                           isCtuCrossedByVirtualBoundaries,
1274
4
                                           horVirBndryPos,
1275
4
                                           verVirBndryPos,
1276
4
                                           numHorVirBndry,
1277
4
                                           numVerVirBndry,
1278
4
                                           bndmask );
1279
4
    break;
1280
1281
0
  case SAO_TYPE_EO_135:
1282
0
    offsetBlock_SIMD_SAO_TYPE_EO_135<vext>( channelBitDepth,
1283
0
                                            clpRng,
1284
0
                                            offset,
1285
0
                                            srcBlk,
1286
0
                                            resBlk,
1287
0
                                            srcStride,
1288
0
                                            resStride,
1289
0
                                            width,
1290
0
                                            height,
1291
0
                                            isLeftAvail,
1292
0
                                            isRightAvail,
1293
0
                                            isAboveAvail,
1294
0
                                            isBelowAvail,
1295
0
                                            isAboveLeftAvail,
1296
0
                                            isAboveRightAvail,
1297
0
                                            isBelowLeftAvail,
1298
0
                                            isBelowRightAvail,
1299
0
                                            m_signLineBuf1,
1300
0
                                            m_signLineBuf2,
1301
0
                                            isCtuCrossedByVirtualBoundaries,
1302
0
                                            horVirBndryPos,
1303
0
                                            verVirBndryPos,
1304
0
                                            numHorVirBndry,
1305
0
                                            numVerVirBndry,
1306
0
                                            bndmask );
1307
0
    break;
1308
1309
2
  case SAO_TYPE_EO_45:
1310
2
    offsetBlock_SIMD_SAO_TYPE_EO_45<vext>( channelBitDepth,
1311
2
                                           clpRng,
1312
2
                                           offset,
1313
2
                                           srcBlk,
1314
2
                                           resBlk,
1315
2
                                           srcStride,
1316
2
                                           resStride,
1317
2
                                           width,
1318
2
                                           height,
1319
2
                                           isLeftAvail,
1320
2
                                           isRightAvail,
1321
2
                                           isAboveAvail,
1322
2
                                           isBelowAvail,
1323
2
                                           isAboveLeftAvail,
1324
2
                                           isAboveRightAvail,
1325
2
                                           isBelowLeftAvail,
1326
2
                                           isBelowRightAvail,
1327
2
                                           m_signLineBuf1,
1328
2
                                           m_signLineBuf2,
1329
2
                                           isCtuCrossedByVirtualBoundaries,
1330
2
                                           horVirBndryPos,
1331
2
                                           verVirBndryPos,
1332
2
                                           numHorVirBndry,
1333
2
                                           numVerVirBndry,
1334
2
                                           bndmask );
1335
2
    break;
1336
1337
0
  default:
1338
0
    THROW_FATAL( "Not a supported SAO types\n" );
1339
13
  }
1340
1341
#  if USE_AVX2
1342
13
  _mm256_zeroupper();
1343
13
#  endif
1344
13
}
Unexecuted instantiation: void vvdec::offsetBlock_SIMD<(vvdec::x86_simd::X86_VEXT)1>(int, vvdec::ClpRngTemplate<short> const&, int, int*, int, short const*, short*, long, long, int, int, bool, bool, bool, bool, bool, bool, bool, bool, std::__1::vector<signed char, std::__1::allocator<signed char> >*, std::__1::vector<signed char, std::__1::allocator<signed char> >*, bool, int*, int*, int, int)
void vvdec::offsetBlock_SIMD<(vvdec::x86_simd::X86_VEXT)4>(int, vvdec::ClpRngTemplate<short> const&, int, int*, int, short const*, short*, long, long, int, int, bool, bool, bool, bool, bool, bool, bool, bool, std::__1::vector<signed char, std::__1::allocator<signed char> >*, std::__1::vector<signed char, std::__1::allocator<signed char> >*, bool, int*, int*, int, int)
Line
Count
Source
1194
18
{
1195
18
  if( typeIdx == SAO_TYPE_BO )
1196
5
  {
1197
5
    offsetBlock_SIMD_SAO_TYPE_BO<vext>( channelBitDepth, offset, startIdx, srcBlk, resBlk, srcStride, resStride, width, height );
1198
1199
5
#  if USE_AVX2
1200
5
    _mm256_zeroupper();
1201
5
#  endif
1202
5
    return;
1203
5
  }
1204
1205
13
  uint16_t bndmask[MAX_CU_SIZE];
1206
13
  memset( &bndmask, 0, MAX_CU_SIZE * sizeof( uint16_t ) );
1207
13
  if( isCtuCrossedByVirtualBoundaries && numVerVirBndry > 0 )
1208
0
  {
1209
0
    for( int i = 0; i < numVerVirBndry; i++ )
1210
0
    {
1211
0
      if( verVirBndryPos[i] >= 0 && verVirBndryPos[i] < width )
1212
0
      {
1213
0
        bndmask[verVirBndryPos[i]] = 0xffff;
1214
0
      }
1215
1216
0
      if( verVirBndryPos[i] - 1 >= 0 && verVirBndryPos[i] - 1 < width )
1217
0
      {
1218
0
        bndmask[verVirBndryPos[i] - 1] = 0xffff;
1219
0
      }
1220
0
    }
1221
0
  }
1222
1223
13
  switch( typeIdx )
1224
13
  {
1225
7
  case SAO_TYPE_EO_0:
1226
7
    offsetBlock_SIMD_SAO_TYPE_EO_0<vext>( channelBitDepth,
1227
7
                                          clpRng,
1228
7
                                          offset,
1229
7
                                          srcBlk,
1230
7
                                          resBlk,
1231
7
                                          srcStride,
1232
7
                                          resStride,
1233
7
                                          width,
1234
7
                                          height,
1235
7
                                          isLeftAvail,
1236
7
                                          isRightAvail,
1237
7
                                          isAboveAvail,
1238
7
                                          isBelowAvail,
1239
7
                                          isAboveLeftAvail,
1240
7
                                          isAboveRightAvail,
1241
7
                                          isBelowLeftAvail,
1242
7
                                          isBelowRightAvail,
1243
7
                                          m_signLineBuf1,
1244
7
                                          m_signLineBuf2,
1245
7
                                          isCtuCrossedByVirtualBoundaries,
1246
7
                                          horVirBndryPos,
1247
7
                                          verVirBndryPos,
1248
7
                                          numHorVirBndry,
1249
7
                                          numVerVirBndry,
1250
7
                                          bndmask );
1251
7
    break;
1252
1253
4
  case SAO_TYPE_EO_90:
1254
4
    offsetBlock_SIMD_SAO_TYPE_EO_90<vext>( channelBitDepth,
1255
4
                                           clpRng,
1256
4
                                           offset,
1257
4
                                           srcBlk,
1258
4
                                           resBlk,
1259
4
                                           srcStride,
1260
4
                                           resStride,
1261
4
                                           width,
1262
4
                                           height,
1263
4
                                           isLeftAvail,
1264
4
                                           isRightAvail,
1265
4
                                           isAboveAvail,
1266
4
                                           isBelowAvail,
1267
4
                                           isAboveLeftAvail,
1268
4
                                           isAboveRightAvail,
1269
4
                                           isBelowLeftAvail,
1270
4
                                           isBelowRightAvail,
1271
4
                                           m_signLineBuf1,
1272
4
                                           m_signLineBuf2,
1273
4
                                           isCtuCrossedByVirtualBoundaries,
1274
4
                                           horVirBndryPos,
1275
4
                                           verVirBndryPos,
1276
4
                                           numHorVirBndry,
1277
4
                                           numVerVirBndry,
1278
4
                                           bndmask );
1279
4
    break;
1280
1281
0
  case SAO_TYPE_EO_135:
1282
0
    offsetBlock_SIMD_SAO_TYPE_EO_135<vext>( channelBitDepth,
1283
0
                                            clpRng,
1284
0
                                            offset,
1285
0
                                            srcBlk,
1286
0
                                            resBlk,
1287
0
                                            srcStride,
1288
0
                                            resStride,
1289
0
                                            width,
1290
0
                                            height,
1291
0
                                            isLeftAvail,
1292
0
                                            isRightAvail,
1293
0
                                            isAboveAvail,
1294
0
                                            isBelowAvail,
1295
0
                                            isAboveLeftAvail,
1296
0
                                            isAboveRightAvail,
1297
0
                                            isBelowLeftAvail,
1298
0
                                            isBelowRightAvail,
1299
0
                                            m_signLineBuf1,
1300
0
                                            m_signLineBuf2,
1301
0
                                            isCtuCrossedByVirtualBoundaries,
1302
0
                                            horVirBndryPos,
1303
0
                                            verVirBndryPos,
1304
0
                                            numHorVirBndry,
1305
0
                                            numVerVirBndry,
1306
0
                                            bndmask );
1307
0
    break;
1308
1309
2
  case SAO_TYPE_EO_45:
1310
2
    offsetBlock_SIMD_SAO_TYPE_EO_45<vext>( channelBitDepth,
1311
2
                                           clpRng,
1312
2
                                           offset,
1313
2
                                           srcBlk,
1314
2
                                           resBlk,
1315
2
                                           srcStride,
1316
2
                                           resStride,
1317
2
                                           width,
1318
2
                                           height,
1319
2
                                           isLeftAvail,
1320
2
                                           isRightAvail,
1321
2
                                           isAboveAvail,
1322
2
                                           isBelowAvail,
1323
2
                                           isAboveLeftAvail,
1324
2
                                           isAboveRightAvail,
1325
2
                                           isBelowLeftAvail,
1326
2
                                           isBelowRightAvail,
1327
2
                                           m_signLineBuf1,
1328
2
                                           m_signLineBuf2,
1329
2
                                           isCtuCrossedByVirtualBoundaries,
1330
2
                                           horVirBndryPos,
1331
2
                                           verVirBndryPos,
1332
2
                                           numHorVirBndry,
1333
2
                                           numVerVirBndry,
1334
2
                                           bndmask );
1335
2
    break;
1336
1337
0
  default:
1338
0
    THROW_FATAL( "Not a supported SAO types\n" );
1339
13
  }
1340
1341
13
#  if USE_AVX2
1342
13
  _mm256_zeroupper();
1343
13
#  endif
1344
13
}
1345
1346
template<X86_VEXT vext>
1347
void SampleAdaptiveOffset::_initSampleAdaptiveOffsetX86()
1348
5.61k
{
1349
5.61k
  offsetBlock = offsetBlock_SIMD<vext>;
1350
5.61k
}
Unexecuted instantiation: void vvdec::SampleAdaptiveOffset::_initSampleAdaptiveOffsetX86<(vvdec::x86_simd::X86_VEXT)1>()
void vvdec::SampleAdaptiveOffset::_initSampleAdaptiveOffsetX86<(vvdec::x86_simd::X86_VEXT)4>()
Line
Count
Source
1348
5.61k
{
1349
5.61k
  offsetBlock = offsetBlock_SIMD<vext>;
1350
5.61k
}
1351
1352
template void SampleAdaptiveOffset::_initSampleAdaptiveOffsetX86<SIMDX86>();
1353
1354
}
1355
#endif   //#ifdef TARGET_SIMD_X86