Coverage Report

Created: 2026-04-01 07:49

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/vvenc/source/Lib/CommonLib/x86/SampleAdaptiveOffsetX86.h
Line
Count
Source
1
/* -----------------------------------------------------------------------------
2
The copyright in this software is being made available under the Clear BSD
3
License, included below. No patent rights, trademark rights and/or 
4
other Intellectual Property Rights other than the copyrights concerning 
5
the Software are granted under this license.
6
7
The Clear BSD License
8
9
Copyright (c) 2019-2026, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors.
10
All rights reserved.
11
12
Redistribution and use in source and binary forms, with or without modification,
13
are permitted (subject to the limitations in the disclaimer below) provided that
14
the following conditions are met:
15
16
     * Redistributions of source code must retain the above copyright notice,
17
     this list of conditions and the following disclaimer.
18
19
     * Redistributions in binary form must reproduce the above copyright
20
     notice, this list of conditions and the following disclaimer in the
21
     documentation and/or other materials provided with the distribution.
22
23
     * Neither the name of the copyright holder nor the names of its
24
     contributors may be used to endorse or promote products derived from this
25
     software without specific prior written permission.
26
27
NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
28
THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
29
CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
30
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
31
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
32
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
33
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
34
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
35
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
36
IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38
POSSIBILITY OF SUCH DAMAGE.
39
40
41
------------------------------------------------------------------------------------------- */
42
/** \file     SampleAdaptiveOffsetX86.h
43
    \brief    SAO filter class
44
*/
45
46
#pragma once
47
48
#include "CommonDefX86.h"
49
#include "SampleAdaptiveOffset.h"
50
51
#if defined(TARGET_SIMD_X86)  && ENABLE_SIMD_OPT_SAO
52
53
//! \ingroup CommonLib
54
//! \{
55
56
namespace vvenc {
57
58
0
#define SAO_NUM_OFFSETS                     4 /* number of SAO offset values */
59
0
#define SAO_EO_NUM_CATEGORIES               (SAO_NUM_OFFSETS + 1) /* number of different eo categories */
60
61
template <X86_VEXT vext>
62
void offsetBlock_SIMD( const int     channelBitDepth,
63
                                        const ClpRng& clpRng,
64
                                        int           typeIdx,
65
                                        int*          offset,
66
                                        int           startIdx,
67
                                        const Pel*    srcBlk,
68
                                        Pel*          resBlk,
69
                                        ptrdiff_t     srcStride,
70
                                        ptrdiff_t     resStride,
71
                                        int           width,
72
                                        int           height,
73
                                        uint8_t       availMask,
74
//                                        bool          isLeftAvail,
75
//                                        bool          isRightAvail,
76
//                                        bool          isAboveAvail,
77
//                                        bool          isBelowAvail,
78
//                                        bool          isAboveLeftAvail,
79
//                                        bool          isAboveRightAvail,
80
//                                        bool          isBelowLeftAvail,
81
//                                        bool          isBelowRightAvail,
82
                                        std::vector<int8_t> &signLineBuf1,
83
                                        std::vector<int8_t> &signLineBuf2)
84
0
{
85
86
0
  int x,y, startX, startY, endX, endY, edgeType;
87
0
  int firstLineStartX, firstLineEndX, lastLineStartX, lastLineEndX;
88
0
  int8_t signLeft, signRight, signDown;
89
90
0
  const Pel* srcLine = srcBlk;
91
0
  Pel* resLine = resBlk;
92
93
0
  switch(typeIdx)
94
0
  {
95
0
  case SAO_TYPE_EO_0:
96
0
  {
97
0
    if (availMask&LeftAvail && availMask&RightAvail)
98
0
    {
99
100
0
      int8_t p_eo_offsets[16] = {0,};
101
0
      for (int i = 0; i < SAO_EO_NUM_CATEGORIES; i++)
102
0
      {
103
0
        p_eo_offsets[i] = offset[i];
104
0
      }
105
106
#ifdef USE_AVX2
107
      // AVX2
108
0
      if( ( width & 15 ) == 0 && vext >= AVX2 )
109
0
      {
110
0
        __m256i vsrca,vsrcal,vsrcar;
111
0
        __m256i vbaseoffset = _mm256_set1_epi16(2) ;
112
0
        __m256i vplusone = _mm256_set1_epi16(1);
113
0
        __m256i vzero = _mm256_set1_epi8(0);
114
0
        __m256i vibdimax = _mm256_set1_epi16((1<<channelBitDepth) -1 );
115
0
        __m256i voffsettbl =  _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)p_eo_offsets));
116
117
0
        for (y=0; y< height; y++)
118
0
        {
119
0
          for (x=0; x< width; x+=16)
120
0
          {
121
0
            vsrca = _mm256_loadu_si256((__m256i*)&srcLine[x]);
122
0
            vsrcal = _mm256_loadu_si256((__m256i*)&srcLine[x-1]);
123
0
            vsrcar = _mm256_loadu_si256((__m256i*)&srcLine[x+1]);
124
0
            vsrcal = _mm256_sub_epi16(vsrca, vsrcal);
125
0
            vsrcar = _mm256_sub_epi16(vsrca, vsrcar);
126
0
            __m256i vsignl = _mm256_sign_epi16(vplusone, vsrcal);
127
0
            __m256i vsignr = _mm256_sign_epi16(vplusone, vsrcar);
128
0
            __m256i vsign = _mm256_add_epi16(_mm256_add_epi16(vsignl, vsignr), vbaseoffset);
129
0
            __m256i veoffsets = _mm256_shuffle_epi8(voffsettbl, vsign);
130
0
            veoffsets = _mm256_slli_epi16 (veoffsets,8);
131
0
            veoffsets = _mm256_srai_epi16 (veoffsets,8);
132
133
0
            vsrca = _mm256_add_epi16(vsrca, veoffsets);
134
0
            vsrca    = _mm256_min_epi16(_mm256_max_epi16(vsrca, vzero), vibdimax);
135
0
            _mm256_storeu_si256((__m256i*)&resLine[x], vsrca);
136
0
          }
137
0
          srcLine  += srcStride;
138
0
          resLine += resStride;
139
0
        }
140
0
      }
141
0
      else
142
0
#endif
143
0
      {
144
0
        __m128i vsrca,vsrcal,vsrcar;
145
0
        __m128i vbaseoffset = _mm_set1_epi16(2) ;
146
0
        __m128i vplusone = _mm_set1_epi16(1);
147
0
        __m128i vzero = _mm_set1_epi8(0);
148
0
        __m128i vibdimax = _mm_set1_epi16((1<<channelBitDepth) -1 );
149
0
        __m128i voffsettbl = _mm_loadu_si128((__m128i*)p_eo_offsets);
150
151
152
0
        for (y=0; y< height; y++)
153
0
        {
154
155
0
          for (x=0; x< width; x+=8)
156
0
          {
157
0
            vsrca = _mm_loadu_si128((__m128i*)&srcLine[x]);
158
0
            vsrcal = _mm_loadu_si128((__m128i*)&srcLine[x-1]);
159
0
            vsrcar = _mm_loadu_si128((__m128i*)&srcLine[x+1]);
160
0
            vsrcal = _mm_sub_epi16(vsrca, vsrcal);
161
0
            vsrcar = _mm_sub_epi16(vsrca, vsrcar);
162
0
            __m128i vsignl = _mm_sign_epi16(vplusone, vsrcal);
163
0
            __m128i vsignr = _mm_sign_epi16(vplusone, vsrcar);
164
0
            __m128i vsign = _mm_add_epi16(_mm_add_epi16(vsignl, vsignr), vbaseoffset);
165
0
            __m128i veoffsets = _mm_shuffle_epi8(voffsettbl, vsign);
166
0
            veoffsets = _mm_slli_epi16 (veoffsets,8);
167
0
            veoffsets = _mm_srai_epi16 (veoffsets,8);
168
169
0
            vsrca = _mm_add_epi16(vsrca, veoffsets);
170
0
            vsrca    = _mm_min_epi16(_mm_max_epi16(vsrca, vzero), vibdimax);
171
0
            _mm_store_si128((__m128i*)&resLine[x], vsrca);
172
0
          }
173
0
          srcLine  += srcStride;
174
0
          resLine += resStride;
175
0
        }
176
0
      }
177
0
    }
178
0
    else
179
0
    {
180
0
      offset += 2;
181
0
      startX = availMask&LeftAvail ? 0 : 1;
182
0
      endX   = availMask&RightAvail ? width : (width -1);
183
0
      for (y=0; y< height; y++)
184
0
      {
185
0
        signLeft = (int8_t)sgn(srcLine[startX] - srcLine[startX-1]);
186
0
        for (x=startX; x< endX; x++)
187
0
        {
188
0
          signRight = (int8_t)sgn(srcLine[x] - srcLine[x+1]);
189
0
          edgeType =  signRight + signLeft;
190
0
          signLeft  = -signRight;
191
192
0
          resLine[x] = ClipPel<int>( srcLine[x] + offset[edgeType], clpRng);
193
0
        }
194
0
        srcLine  += srcStride;
195
0
        resLine += resStride;
196
0
      }
197
198
0
    }
199
0
  }
200
0
  break;
201
0
  case SAO_TYPE_EO_90:
202
0
  {
203
204
0
    int8_t p_eo_offsets[16] = {0,};
205
0
    for (int i = 0; i < SAO_EO_NUM_CATEGORIES; i++)
206
0
    {
207
0
      p_eo_offsets[i] = offset[i];
208
0
    }
209
0
    const Pel* srcLineAbove= srcLine- srcStride;
210
0
    const Pel* srcLineBelow= srcLine+ srcStride;
211
0
    startY=0;
212
0
    if (!(availMask&AboveAvail))
213
0
    {
214
0
      startY=1;
215
0
      srcLineAbove= srcLine;
216
0
      srcLine  += srcStride;
217
0
      resLine += resStride;
218
0
      srcLineBelow= srcLine+ srcStride;
219
0
    }
220
0
    endY=height;
221
0
    if (!(availMask&BelowAvail))
222
0
    {
223
0
      endY=height-1;
224
0
    }
225
#ifdef USE_AVX2
226
    // AVX2
227
0
    if( ( width & 15 ) == 0 && ( vext >= AVX2 ) )
228
0
    {
229
0
      __m256i vsrca,vsrcat,vsrcab;
230
231
      __m256i vbaseoffset = _mm256_set1_epi16(2) ;
232
      __m256i vplusone = _mm256_set1_epi16(1);
233
      __m256i vzero = _mm256_set1_epi8(0);
234
      __m256i vibdimax = _mm256_set1_epi16((1<<channelBitDepth) -1 );
235
      __m256i voffsettbl =  _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)p_eo_offsets));
236
      const Pel* srcLineBelow= srcLine+ srcStride;
237
238
0
      for (y=startY; y< endY; y++)
239
0
      {
240
0
        for (x=0; x< width; x+=16)
241
0
        {
242
0
          vsrca = _mm256_loadu_si256((__m256i*)&srcLine[x]);
243
0
          vsrcat = _mm256_loadu_si256((__m256i*)&srcLineAbove[x]);
244
0
          vsrcab = _mm256_loadu_si256((__m256i*)&srcLineBelow[x]);
245
0
          vsrcat = _mm256_sub_epi16(vsrca, vsrcat);
246
0
          vsrcab = _mm256_sub_epi16(vsrca, vsrcab);
247
0
          __m256i vsignt = _mm256_sign_epi16(vplusone, vsrcat);
248
0
          __m256i vsignb = _mm256_sign_epi16(vplusone, vsrcab);
249
0
          __m256i vsign = _mm256_add_epi16(_mm256_add_epi16(vsignt, vsignb), vbaseoffset);
250
0
          __m256i veoffsets = _mm256_shuffle_epi8(voffsettbl, vsign);
251
0
          veoffsets = _mm256_slli_epi16 (veoffsets,8);
252
0
          veoffsets = _mm256_srai_epi16 (veoffsets,8);
253
254
0
          vsrca = _mm256_add_epi16(vsrca, veoffsets);
255
0
          vsrca    = _mm256_min_epi16(_mm256_max_epi16(vsrca, vzero), vibdimax);
256
0
          _mm256_storeu_si256((__m256i*)&resLine[x], vsrca);
257
0
        }
258
0
        srcLine  += srcStride;
259
0
        srcLineBelow += srcStride;
260
0
        srcLineAbove += srcStride;
261
0
        resLine += resStride;
262
0
      }
263
0
    }
264
0
    else
265
0
#endif
266
0
    {
267
0
      __m128i vsrca,vsrcat,vsrcab;
268
0
      __m128i vbaseoffset = _mm_set1_epi16(2) ;
269
0
      __m128i vplusone = _mm_set1_epi16(1);
270
0
      __m128i vzero = _mm_set1_epi8(0);
271
0
      __m128i vibdimax = _mm_set1_epi16((1<<channelBitDepth) -1 );
272
0
      __m128i voffsettbl = _mm_loadu_si128((__m128i*)p_eo_offsets);
273
274
275
0
      for (y=startY; y< endY; y++)
276
0
      {
277
0
        for (x=0; x< width; x+=8)
278
0
        {
279
0
          vsrca = _mm_loadu_si128((__m128i*)&srcLine[x]);
280
0
          vsrcat = _mm_loadu_si128((__m128i*)&srcLineAbove[x]);
281
0
          vsrcab = _mm_loadu_si128((__m128i*)&srcLineBelow[x]);
282
0
          vsrcat = _mm_sub_epi16(vsrca, vsrcat);
283
0
          vsrcab = _mm_sub_epi16(vsrca, vsrcab);
284
0
          __m128i vsignt = _mm_sign_epi16(vplusone, vsrcat);
285
0
          __m128i vsignb = _mm_sign_epi16(vplusone, vsrcab);
286
0
          __m128i vsign = _mm_add_epi16(_mm_add_epi16(vsignt, vsignb), vbaseoffset);
287
0
          __m128i veoffsets = _mm_shuffle_epi8(voffsettbl, vsign);
288
0
          veoffsets = _mm_slli_epi16 (veoffsets,8);
289
0
          veoffsets = _mm_srai_epi16 (veoffsets,8);
290
291
0
          vsrca = _mm_add_epi16(vsrca, veoffsets);
292
0
          vsrca    = _mm_min_epi16(_mm_max_epi16(vsrca, vzero), vibdimax);
293
0
          _mm_store_si128((__m128i*)&resLine[x], vsrca);
294
0
        }
295
0
        srcLine  += srcStride;
296
0
        srcLineBelow += srcStride;
297
0
        srcLineAbove += srcStride;
298
0
        resLine += resStride;
299
0
      }
300
0
    }
301
0
  }
302
0
  break;
303
0
  case SAO_TYPE_EO_135:
304
0
  {
305
//    if (isLeftAvail && isRightAvail && isAboveLeftAvail && isBelowRightAvail )
306
0
    if((LeftAvail|RightAvail|AboveLeftAvail|BelowRightAvail) == (int)(availMask&(LeftAvail|RightAvail|AboveLeftAvail|BelowRightAvail)))
307
0
    {
308
309
0
      int8_t p_eo_offsets[16] = {0,};
310
0
      for (int i = 0; i < SAO_EO_NUM_CATEGORIES; i++)
311
0
      {
312
0
        p_eo_offsets[i] = offset[i];
313
0
      }
314
0
      const Pel* srcLineAbove= srcLine- srcStride;
315
0
      const Pel* srcLineBelow= srcLine+ srcStride;
316
0
      startY=0;
317
0
      if (!(availMask&AboveAvail))
318
0
      {
319
0
        startY=1;
320
0
        srcLineAbove= srcLine;
321
0
        srcLine  += srcStride;
322
0
        resLine += resStride;
323
0
        srcLineBelow= srcLine+ srcStride;
324
0
      }
325
0
      endY=height;
326
0
      if (!(availMask&BelowAvail))
327
0
      {
328
0
        endY=height-1;
329
0
      }
330
#ifdef USE_AVX2
331
      // AVX2
332
0
      if( ( width & 15 ) == 0 && vext >= AVX2 )
333
0
      {
334
0
        __m256i vsrca,vsrcat,vsrcab;
335
336
        __m256i vbaseoffset = _mm256_set1_epi16(2) ;
337
        __m256i vplusone = _mm256_set1_epi16(1);
338
        __m256i vzero = _mm256_set1_epi8(0);
339
        __m256i vibdimax = _mm256_set1_epi16((1<<channelBitDepth) -1 );
340
        __m256i voffsettbl =  _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)p_eo_offsets));
341
        const Pel* srcLineBelow= srcLine+ srcStride;
342
343
0
        for (y=startY; y< endY; y++)
344
0
        {
345
0
          for (x=0; x< width; x+=16)
346
0
          {
347
0
            vsrca = _mm256_loadu_si256((__m256i*)&srcLine[x]);
348
0
            vsrcat = _mm256_loadu_si256((__m256i*)&srcLineAbove[x-1]);
349
0
            vsrcab = _mm256_loadu_si256((__m256i*)&srcLineBelow[x+1]);
350
0
            vsrcat = _mm256_sub_epi16(vsrca, vsrcat);
351
0
            vsrcab = _mm256_sub_epi16(vsrca, vsrcab);
352
0
            __m256i vsignt = _mm256_sign_epi16(vplusone, vsrcat);
353
0
            __m256i vsignb = _mm256_sign_epi16(vplusone, vsrcab);
354
0
            __m256i vsign = _mm256_add_epi16(_mm256_add_epi16(vsignt, vsignb), vbaseoffset);
355
0
            __m256i veoffsets = _mm256_shuffle_epi8(voffsettbl, vsign);
356
0
            veoffsets = _mm256_slli_epi16 (veoffsets,8);
357
0
            veoffsets = _mm256_srai_epi16 (veoffsets,8);
358
359
0
            vsrca = _mm256_add_epi16(vsrca, veoffsets);
360
0
            vsrca    = _mm256_min_epi16(_mm256_max_epi16(vsrca, vzero), vibdimax);
361
0
            _mm256_storeu_si256((__m256i*)&resLine[x], vsrca);
362
0
          }
363
0
          srcLine  += srcStride;
364
0
          srcLineBelow += srcStride;
365
0
          srcLineAbove += srcStride;
366
0
          resLine += resStride;
367
0
        }
368
0
      }
369
0
      else
370
0
#endif
371
0
      {
372
0
        __m128i vsrca,vsrcat,vsrcab;
373
0
        __m128i vbaseoffset = _mm_set1_epi16(2) ;
374
0
        __m128i vplusone = _mm_set1_epi16(1);
375
0
        __m128i vzero = _mm_set1_epi8(0);
376
0
        __m128i vibdimax = _mm_set1_epi16((1<<channelBitDepth) -1 );
377
0
        __m128i voffsettbl = _mm_loadu_si128((__m128i*)p_eo_offsets);
378
379
380
0
        for (y=startY; y< endY; y++)
381
0
        {
382
0
          for (x=0; x< width; x+=8)
383
0
          {
384
0
            vsrca = _mm_loadu_si128((__m128i*)&srcLine[x]);
385
0
            vsrcat = _mm_loadu_si128((__m128i*)&srcLineAbove[x-1]);
386
0
            vsrcab = _mm_loadu_si128((__m128i*)&srcLineBelow[x+1]);
387
0
            vsrcat = _mm_sub_epi16(vsrca, vsrcat);
388
0
            vsrcab = _mm_sub_epi16(vsrca, vsrcab);
389
0
            __m128i vsignt = _mm_sign_epi16(vplusone, vsrcat);
390
0
            __m128i vsignb = _mm_sign_epi16(vplusone, vsrcab);
391
0
            __m128i vsign = _mm_add_epi16(_mm_add_epi16(vsignt, vsignb), vbaseoffset);
392
0
            __m128i veoffsets = _mm_shuffle_epi8(voffsettbl, vsign);
393
0
            veoffsets = _mm_slli_epi16 (veoffsets,8);
394
0
            veoffsets = _mm_srai_epi16 (veoffsets,8);
395
396
0
            vsrca = _mm_add_epi16(vsrca, veoffsets);
397
0
            vsrca    = _mm_min_epi16(_mm_max_epi16(vsrca, vzero), vibdimax);
398
0
            _mm_store_si128((__m128i*)&resLine[x], vsrca);
399
0
          }
400
0
          srcLine  += srcStride;
401
0
          srcLineBelow += srcStride;
402
0
          srcLineAbove += srcStride;
403
0
          resLine += resStride;
404
0
        }
405
0
      }
406
407
408
0
    }
409
0
    else
410
0
    {
411
0
      offset += 2;
412
0
      int8_t *signUpLine, *signDownLine, *signTmpLine;
413
414
0
      signUpLine  = &signLineBuf1[0];
415
0
      signDownLine= &signLineBuf2[0];
416
417
0
      startX = availMask&LeftAvail ? 0 : 1 ;
418
0
      endX   = availMask&RightAvail ? width : (width-1);
419
420
      //prepare 2nd line's upper sign
421
0
      const Pel* srcLineBelow= srcLine+ srcStride;
422
0
      for (x=startX; x< endX+1; x++)
423
0
      {
424
0
        signUpLine[x] = (int8_t)sgn(srcLineBelow[x] - srcLine[x- 1]);
425
0
      }
426
427
      //1st line
428
0
      const Pel* srcLineAbove= srcLine- srcStride;
429
0
      firstLineStartX = availMask&AboveLeftAvail ? 0 : 1;
430
0
      firstLineEndX   = availMask&AboveAvail? endX: 1;
431
0
      for(x= firstLineStartX; x< firstLineEndX; x++)
432
0
      {
433
0
        edgeType  =  sgn(srcLine[x] - srcLineAbove[x- 1]) - signUpLine[x+1];
434
435
0
        resLine[x] = ClipPel<int>( srcLine[x] + offset[edgeType], clpRng);
436
0
      }
437
0
      srcLine  += srcStride;
438
0
      resLine  += resStride;
439
440
441
      //middle lines
442
0
      for (y= 1; y< height-1; y++)
443
0
      {
444
0
        srcLineBelow= srcLine+ srcStride;
445
446
0
        for (x=startX; x<endX; x++)
447
0
        {
448
0
          signDown =  (int8_t)sgn(srcLine[x] - srcLineBelow[x+ 1]);
449
0
          edgeType =  signDown + signUpLine[x];
450
0
          resLine[x] = ClipPel<int>( srcLine[x] + offset[edgeType], clpRng);
451
452
0
          signDownLine[x+1] = -signDown;
453
0
        }
454
0
        signDownLine[startX] = (int8_t)sgn(srcLineBelow[startX] - srcLine[startX-1]);
455
456
0
        signTmpLine  = signUpLine;
457
0
        signUpLine   = signDownLine;
458
0
        signDownLine = signTmpLine;
459
460
0
        srcLine += srcStride;
461
0
        resLine += resStride;
462
0
      }
463
464
      //last line
465
0
      srcLineBelow= srcLine+ srcStride;
466
0
      lastLineStartX = availMask&BelowAvail ? startX : (width -1);
467
0
      lastLineEndX   = availMask&BelowRightAvail ? width : (width -1);
468
0
      for(x= lastLineStartX; x< lastLineEndX; x++)
469
0
      {
470
0
        edgeType =  sgn(srcLine[x] - srcLineBelow[x+ 1]) + signUpLine[x];
471
0
        resLine[x] = ClipPel<int>( srcLine[x] + offset[edgeType], clpRng);
472
473
0
      }
474
475
0
    }
476
0
  }
477
0
  break;
478
0
  case SAO_TYPE_EO_45:
479
0
  {
480
//    if (isLeftAvail && isRightAvail && isAboveLeftAvail && isBelowRightAvail )
481
0
    if((LeftAvail|RightAvail|AboveLeftAvail|BelowRightAvail) == ((int)availMask&(LeftAvail|RightAvail|AboveLeftAvail|BelowRightAvail)))
482
0
    {
483
484
0
      int8_t p_eo_offsets[16] = {0,};
485
0
      for (int i = 0; i < SAO_EO_NUM_CATEGORIES; i++)
486
0
      {
487
0
        p_eo_offsets[i] = offset[i];
488
0
      }
489
0
      const Pel* srcLineAbove= srcLine- srcStride;
490
0
      const Pel* srcLineBelow= srcLine+ srcStride;
491
0
      startY=0;
492
0
      if (!(availMask&AboveAvail))
493
0
      {
494
0
        startY=1;
495
0
        srcLineAbove= srcLine;
496
0
        srcLine  += srcStride;
497
0
        resLine += resStride;
498
0
        srcLineBelow= srcLine+ srcStride;
499
0
      }
500
0
      endY=height;
501
0
      if (!(availMask&BelowAvail))
502
0
      {
503
0
        endY=height-1;
504
0
      }
505
#ifdef USE_AVX2
506
      // AVX2
507
0
      if( ( width & 15 ) == 0 && vext >= AVX2 )
508
0
      {
509
0
        __m256i vsrca,vsrcat,vsrcab;
510
0
        __m256i vbaseoffset = _mm256_set1_epi16(2) ;
511
0
        __m256i vplusone = _mm256_set1_epi16(1);
512
0
        __m256i vzero = _mm256_set1_epi8(0);
513
0
        __m256i vibdimax = _mm256_set1_epi16((1<<channelBitDepth) -1 );
514
0
        __m256i voffsettbl =  _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)p_eo_offsets));
515
0
        const Pel* srcLineBelow= srcLine+ srcStride;
516
517
0
        for (y=startY; y< endY; y++)
518
0
        {
519
0
          for (x=0; x< width; x+=16)
520
0
          {
521
0
            vsrca = _mm256_loadu_si256((__m256i*)&srcLine[x]);
522
0
            vsrcat = _mm256_loadu_si256((__m256i*)&srcLineAbove[x+1]);
523
0
            vsrcab = _mm256_loadu_si256((__m256i*)&srcLineBelow[x-1]);
524
0
            vsrcat = _mm256_sub_epi16(vsrca, vsrcat);
525
0
            vsrcab = _mm256_sub_epi16(vsrca, vsrcab);
526
0
            __m256i vsignt = _mm256_sign_epi16(vplusone, vsrcat);
527
0
            __m256i vsignb = _mm256_sign_epi16(vplusone, vsrcab);
528
0
            __m256i vsign = _mm256_add_epi16(_mm256_add_epi16(vsignt, vsignb), vbaseoffset);
529
0
            __m256i veoffsets = _mm256_shuffle_epi8(voffsettbl, vsign);
530
0
            veoffsets = _mm256_slli_epi16 (veoffsets,8);
531
0
            veoffsets = _mm256_srai_epi16 (veoffsets,8);
532
533
0
            vsrca = _mm256_add_epi16(vsrca, veoffsets);
534
0
            vsrca    = _mm256_min_epi16(_mm256_max_epi16(vsrca, vzero), vibdimax);
535
0
            _mm256_storeu_si256((__m256i*)&resLine[x], vsrca);
536
0
          }
537
0
          srcLine  += srcStride;
538
0
          srcLineBelow += srcStride;
539
0
          srcLineAbove += srcStride;
540
0
          resLine += resStride;
541
0
        }
542
0
      }
543
0
      else
544
0
#endif
545
0
      {
546
0
        __m128i vsrca,vsrcat,vsrcab;
547
0
        __m128i vbaseoffset = _mm_set1_epi16(2) ;
548
0
        __m128i vplusone = _mm_set1_epi16(1);
549
0
        __m128i vzero = _mm_set1_epi8(0);
550
0
        __m128i vibdimax = _mm_set1_epi16((1<<channelBitDepth) -1 );
551
0
        __m128i voffsettbl = _mm_loadu_si128((__m128i*)p_eo_offsets);
552
553
0
        for (y=startY; y< endY; y++)
554
0
        {
555
0
          for (x=0; x< width; x+=8)
556
0
          {
557
0
            vsrca = _mm_loadu_si128((__m128i*)&srcLine[x]);
558
0
            vsrcat = _mm_loadu_si128((__m128i*)&srcLineAbove[x+1]);
559
0
            vsrcab = _mm_loadu_si128((__m128i*)&srcLineBelow[x-1]);
560
0
            vsrcat = _mm_sub_epi16(vsrca, vsrcat);
561
0
            vsrcab = _mm_sub_epi16(vsrca, vsrcab);
562
0
            __m128i vsignt = _mm_sign_epi16(vplusone, vsrcat);
563
0
            __m128i vsignb = _mm_sign_epi16(vplusone, vsrcab);
564
0
            __m128i vsign = _mm_add_epi16(_mm_add_epi16(vsignt, vsignb), vbaseoffset);
565
0
            __m128i veoffsets = _mm_shuffle_epi8(voffsettbl, vsign);
566
0
            veoffsets = _mm_slli_epi16 (veoffsets,8);
567
0
            veoffsets = _mm_srai_epi16 (veoffsets,8);
568
569
0
            vsrca = _mm_add_epi16(vsrca, veoffsets);
570
0
            vsrca    = _mm_min_epi16(_mm_max_epi16(vsrca, vzero), vibdimax);
571
0
            _mm_store_si128((__m128i*)&resLine[x], vsrca);
572
0
          }
573
0
          srcLine  += srcStride;
574
0
          srcLineBelow += srcStride;
575
0
          srcLineAbove += srcStride;
576
0
          resLine += resStride;
577
0
        }
578
0
      }
579
0
    }
580
0
    else
581
0
    {
582
0
      offset += 2;
583
0
      int8_t *signUpLine = &signLineBuf1[1];
584
585
0
      startX = availMask&LeftAvail ? 0 : 1;
586
0
      endX   = availMask&RightAvail ? width : (width -1);
587
588
      //prepare 2nd line upper sign
589
0
      const Pel* srcLineBelow= srcLine+ srcStride;
590
0
      for (x=startX-1; x< endX; x++)
591
0
      {
592
0
        signUpLine[x] = (int8_t)sgn(srcLineBelow[x] - srcLine[x+1]);
593
0
      }
594
      //first line
595
0
      const Pel* srcLineAbove= srcLine- srcStride;
596
0
      firstLineStartX = availMask&AboveAvail ? startX : (width -1 );
597
0
      firstLineEndX   = availMask&AboveRightAvail ? width : (width-1);
598
0
      for(x= firstLineStartX; x< firstLineEndX; x++)
599
0
      {
600
0
        edgeType = sgn(srcLine[x] - srcLineAbove[x+1]) -signUpLine[x-1];
601
0
        resLine[x] = ClipPel<int>(srcLine[x] + offset[edgeType], clpRng);
602
0
      }
603
0
      srcLine += srcStride;
604
0
      resLine += resStride;
605
606
      //middle lines
607
0
      for (y= 1; y< height-1; y++)
608
0
      {
609
0
        srcLineBelow= srcLine+ srcStride;
610
611
0
        for(x= startX; x< endX; x++)
612
0
        {
613
0
          signDown =  (int8_t)sgn(srcLine[x] - srcLineBelow[x-1]);
614
0
          edgeType =  signDown + signUpLine[x];
615
0
          resLine[x] = ClipPel<int>(srcLine[x] + offset[edgeType], clpRng);
616
0
          signUpLine[x-1] = -signDown;
617
0
        }
618
0
        signUpLine[endX-1] = (int8_t)sgn(srcLineBelow[endX-1] - srcLine[endX]);
619
0
        srcLine  += srcStride;
620
0
        resLine += resStride;
621
0
      }
622
623
      //last line
624
0
      srcLineBelow= srcLine+ srcStride;
625
0
      lastLineStartX = availMask&BelowLeftAvail ? 0 : 1;
626
0
      lastLineEndX   = availMask&BelowAvail ? endX : 1;
627
0
      for(x= lastLineStartX; x< lastLineEndX; x++)
628
0
      {
629
0
        edgeType = sgn(srcLine[x] - srcLineBelow[x-1]) + signUpLine[x];
630
0
        resLine[x] = ClipPel<int>(srcLine[x] + offset[edgeType], clpRng);
631
632
0
      }
633
634
0
    }
635
0
  }
636
0
  break;
637
0
  case SAO_TYPE_BO:
638
0
  {
639
0
    const int shiftBits = channelBitDepth - NUM_SAO_BO_CLASSES_LOG2;
640
0
    int8_t p_eo_offsets[16] = {0,};
641
0
    for (int i = 0; i < 4; i++)
642
0
    {
643
0
      p_eo_offsets[i] = offset[( startIdx + i ) % MAX_NUM_SAO_CLASSES];
644
0
    }
645
#ifdef USE_AVX2
646
    // AVX2
647
0
    if( ( width & 15 ) == 0 && vext >= AVX2 )
648
0
    {
649
0
      __m256i vsrc;
650
0
      __m256i vbaseoffset = _mm256_set1_epi16(startIdx - MAX_NUM_SAO_CLASSES) ;
651
0
      __m256i vminus = _mm256_set1_epi8(-1);
652
0
      __m256i vzero = _mm256_set1_epi8(0);
653
654
      __m256i vfour = _mm256_set1_epi16(4);
655
      __m256i vibdimax = _mm256_set1_epi16((1<<channelBitDepth) -1 );
656
      __m256i voffsettbl =  _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)p_eo_offsets));
657
658
0
      for (y=0; y< height; y++)
659
0
      {
660
0
        for (x=0; x< width; x+=16)
661
0
        {
662
0
          vsrc = _mm256_loadu_si256((__m256i*)&srcLine[x]);
663
0
          __m256i bands = _mm256_srai_epi16(vsrc, shiftBits);
664
0
          bands = _mm256_sub_epi16(bands, vbaseoffset);
665
0
          bands = _mm256_and_si256(bands, _mm256_set1_epi16( MAX_NUM_SAO_CLASSES - 1 )); // modulo 32 = modulo NUM_SAO_BO_CLASSES_LOG2
666
0
          __m256i mask1 = _mm256_cmpgt_epi16(bands,vminus);
667
0
          __m256i mask2 = _mm256_cmpgt_epi16(vfour,bands);
668
669
0
          __m256i veoffsets = _mm256_shuffle_epi8(voffsettbl, bands);
670
0
          veoffsets = _mm256_slli_epi16 (veoffsets,8);
671
0
          veoffsets = _mm256_srai_epi16 (veoffsets,8);
672
673
0
          veoffsets = _mm256_and_si256(veoffsets,mask1);
674
0
          veoffsets = _mm256_and_si256(veoffsets,mask2);
675
676
0
          vsrc = _mm256_add_epi16(vsrc, veoffsets);
677
0
          vsrc    = _mm256_min_epi16(_mm256_max_epi16(vsrc, vzero), vibdimax);
678
0
          _mm256_storeu_si256((__m256i*)&resLine[x], vsrc);
679
0
        }
680
0
        srcLine  += srcStride;
681
0
        resLine += resStride;
682
0
      }
683
684
0
    }
685
0
    else
686
0
#endif
687
0
    {
688
0
      __m128i vsrc;
689
0
      __m128i vbaseoffset = _mm_set1_epi16(startIdx - MAX_NUM_SAO_CLASSES) ;
690
0
      __m128i vminus = _mm_set1_epi8(-1);
691
0
      __m128i vzero = _mm_set1_epi8(0);
692
693
0
      __m128i vfour = _mm_set1_epi16(4);
694
0
      __m128i vibdimax = _mm_set1_epi16((1<<channelBitDepth) -1 );
695
0
      __m128i voffsettbl = _mm_loadu_si128((__m128i*)p_eo_offsets);
696
0
      for (y=0; y< height; y++)
697
0
      {
698
0
        for (x=0; x< width; x+=8)
699
0
        {
700
0
          vsrc = _mm_loadu_si128((__m128i*)&srcLine[x]);
701
0
          __m128i bands = _mm_srai_epi16(vsrc, shiftBits);
702
0
          bands = _mm_sub_epi16(bands, vbaseoffset);
703
0
          bands = _mm_and_si128(bands, _mm_set1_epi16( MAX_NUM_SAO_CLASSES - 1 )); // modulo 32 = modulo NUM_SAO_BO_CLASSES_LOG2
704
0
          __m128i mask1 = _mm_cmpgt_epi16(bands,vminus);
705
0
          __m128i mask2 = _mm_cmplt_epi16(bands,vfour);
706
707
0
          __m128i veoffsets = _mm_shuffle_epi8(voffsettbl, bands);
708
0
          veoffsets = _mm_slli_epi16 (veoffsets,8);
709
0
          veoffsets = _mm_srai_epi16 (veoffsets,8);
710
711
0
          veoffsets = _mm_and_si128(veoffsets,mask1);
712
0
          veoffsets = _mm_and_si128(veoffsets,mask2);
713
714
0
          vsrc = _mm_add_epi16(vsrc, veoffsets);
715
0
          vsrc    = _mm_min_epi16(_mm_max_epi16(vsrc, vzero), vibdimax);
716
0
          _mm_store_si128((__m128i*)&resLine[x], vsrc);
717
0
        }
718
0
        srcLine  += srcStride;
719
0
        resLine += resStride;
720
0
      }
721
0
    }
722
0
  }
723
0
  break;
724
0
  default:
725
0
  {
726
0
    THROW("Not a supported SAO types\n");
727
0
  }
728
0
  }
729
#if USE_AVX2
730
731
0
  _mm256_zeroupper();
732
0
#endif
733
0
}
Unexecuted instantiation: void vvenc::offsetBlock_SIMD<(vvenc::x86_simd::X86_VEXT)1>(int, vvenc::ClpRng const&, int, int*, int, short const*, short*, long, long, int, int, unsigned char, std::__1::vector<signed char, std::__1::allocator<signed char> >&, std::__1::vector<signed char, std::__1::allocator<signed char> >&)
Unexecuted instantiation: void vvenc::offsetBlock_SIMD<(vvenc::x86_simd::X86_VEXT)4>(int, vvenc::ClpRng const&, int, int*, int, short const*, short*, long, long, int, int, unsigned char, std::__1::vector<signed char, std::__1::allocator<signed char> >&, std::__1::vector<signed char, std::__1::allocator<signed char> >&)
734
735
template <X86_VEXT vext>
736
void calcSaoStatisticsBo_SIMD(int width,int endX,int endY,Pel*  srcLine,Pel*  orgLine,int srcStride,int orgStride,int channelBitDepth, int64_t *count,int64_t  *diff)
737
0
{
738
0
  if ( width % 16 == 0 )
739
0
  {
740
0
    int iNaRight=width-endX;
741
0
    int x;
742
0
    int i_bo_range_shift = channelBitDepth - NUM_SAO_BO_CLASSES_LOG2;
743
0
    __m128i vzero = _mm_setzero_si128();
744
0
    for (int y=0; y<endY; y++)
745
0
    {
746
0
      for (x=0; x<endX-16; x+=16)
747
0
      {
748
0
        __m128i vsrca, vsrcb;
749
0
        __m128i vdiffa,vdiffb;
750
0
        if (sizeof(Pel) == 1){
751
0
          __m128i vsrc = _mm_loadu_si128((__m128i*)&srcLine[x]);
752
0
          vsrca = _mm_unpacklo_epi8(vsrc, vzero);
753
0
          vsrcb = _mm_unpackhi_epi8(vsrc, vzero);
754
0
          __m128i vorg  = _mm_loadu_si128((__m128i*)&orgLine[x]);
755
0
          __m128i vorga = _mm_unpacklo_epi8(vorg, vzero);
756
0
          __m128i vorgb = _mm_unpackhi_epi8(vorg, vzero);
757
0
          vdiffa = _mm_sub_epi16(vorga, vsrca);
758
0
          vdiffb = _mm_sub_epi16(vorgb, vsrcb);
759
0
        }
760
0
        else
761
0
        {
762
0
          vsrca = _mm_loadu_si128((__m128i*)&srcLine[x]);
763
0
          vsrcb = _mm_loadu_si128((__m128i*)&srcLine[x+8]);
764
0
          __m128i vorga = _mm_loadu_si128((__m128i*)&orgLine[x]);
765
0
          __m128i vorgb = _mm_loadu_si128((__m128i*)&orgLine[x+8]);
766
0
          vdiffa = _mm_sub_epi16(vorga, vsrca);
767
0
          vdiffb = _mm_sub_epi16(vorgb, vsrcb);
768
0
        }
769
0
        __m128i vbanda = _mm_srai_epi16(vsrca, i_bo_range_shift);
770
0
        __m128i vbandb = _mm_srai_epi16(vsrcb, i_bo_range_shift);
771
0
        int iBand;
772
        // since gcc 4.6 synopsis of _mm_extract_epi16 has changed to (int)(unsigned short)_mm_extract_epi16()
773
        // therefore cast result to short to have signed values
774
0
        short iDiff;
775
0
        iBand = _mm_extract_epi16(vbanda, 0);
776
0
        iDiff = (short)_mm_extract_epi16(vdiffa, 0);
777
0
        diff[iBand]  += iDiff;
778
0
        count[iBand] += 1;
779
0
        iBand = _mm_extract_epi16(vbanda, 1);
780
0
        iDiff = (short)_mm_extract_epi16(vdiffa, 1);
781
0
        diff[iBand]  += iDiff;
782
0
        count[iBand] += 1;
783
0
        iBand = _mm_extract_epi16(vbanda, 2);
784
0
        iDiff = (short)_mm_extract_epi16(vdiffa, 2);
785
0
        diff[iBand]  += iDiff;
786
0
        count[iBand] += 1;
787
0
        iBand = _mm_extract_epi16(vbanda, 3);
788
0
        iDiff = (short)_mm_extract_epi16(vdiffa, 3);
789
0
        diff[iBand]  += iDiff;
790
0
        count[iBand] += 1;
791
0
        iBand = _mm_extract_epi16(vbanda, 4);
792
0
        iDiff = (short)_mm_extract_epi16(vdiffa, 4);
793
0
        diff[iBand]  += iDiff;
794
0
        count[iBand] += 1;
795
0
        iBand = _mm_extract_epi16(vbanda, 5);
796
0
        iDiff = (short)_mm_extract_epi16(vdiffa, 5);
797
0
        diff[iBand]  += iDiff;
798
0
        count[iBand] += 1;
799
0
        iBand = _mm_extract_epi16(vbanda, 6);
800
0
        iDiff = (short)_mm_extract_epi16(vdiffa, 6);
801
0
        diff[iBand]  += iDiff;
802
0
        count[iBand] += 1;
803
0
        iBand = _mm_extract_epi16(vbanda, 7);
804
0
        iDiff = (short)_mm_extract_epi16(vdiffa, 7);
805
0
        diff[iBand]  += iDiff;
806
0
        count[iBand] += 1;
807
0
        iBand = _mm_extract_epi16(vbandb, 0);
808
0
        iDiff = (short)_mm_extract_epi16(vdiffb, 0);
809
0
        diff[iBand]  += iDiff;
810
0
        count[iBand] += 1;
811
0
        iBand = _mm_extract_epi16(vbandb, 1);
812
0
        iDiff = (short)_mm_extract_epi16(vdiffb, 1);
813
0
        diff[iBand]  += iDiff;
814
0
        count[iBand] += 1;
815
0
        iBand = _mm_extract_epi16(vbandb, 2);
816
0
        iDiff = (short)_mm_extract_epi16(vdiffb, 2);
817
0
        diff[iBand]  += iDiff;
818
0
        count[iBand] += 1;
819
0
        iBand = _mm_extract_epi16(vbandb, 3);
820
0
        iDiff = (short)_mm_extract_epi16(vdiffb, 3);
821
0
        diff[iBand]  += iDiff;
822
0
        count[iBand] += 1;
823
0
        iBand = _mm_extract_epi16(vbandb, 4);
824
0
        iDiff = (short)_mm_extract_epi16(vdiffb, 4);
825
0
        diff[iBand]  += iDiff;
826
0
        count[iBand] += 1;
827
0
        iBand = _mm_extract_epi16(vbandb, 5);
828
0
        iDiff = (short)_mm_extract_epi16(vdiffb, 5);
829
0
        diff[iBand]  += iDiff;
830
0
        count[iBand] += 1;
831
0
        iBand = _mm_extract_epi16(vbandb, 6);
832
0
        iDiff = (short)_mm_extract_epi16(vdiffb, 6);
833
0
        diff[iBand]  += iDiff;
834
0
        count[iBand] += 1;
835
0
        iBand = _mm_extract_epi16(vbandb, 7);
836
0
        iDiff = (short)_mm_extract_epi16(vdiffb, 7);
837
0
        diff[iBand]  += iDiff;
838
0
        count[iBand] += 1;
839
0
      }
840
      //last colum
841
0
      {
842
0
        __m128i vsrca, vsrcb;
843
0
        __m128i vdiffa,vdiffb;
844
0
        if (sizeof(Pel) == 1){
845
0
          __m128i vsrc = _mm_loadu_si128((__m128i*)&srcLine[x]);
846
0
          vsrca = _mm_unpacklo_epi8(vsrc, vzero);
847
0
          vsrcb = _mm_unpackhi_epi8(vsrc, vzero);
848
0
          __m128i vorg  = _mm_loadu_si128((__m128i*)&orgLine[x]);
849
0
          __m128i vorga = _mm_unpacklo_epi8(vorg, vzero);
850
0
          __m128i vorgb = _mm_unpackhi_epi8(vorg, vzero);
851
0
          vdiffa = _mm_sub_epi16(vorga, vsrca);
852
0
          vdiffb = _mm_sub_epi16(vorgb, vsrcb);
853
0
        }
854
0
        else
855
0
        {
856
0
          vsrca = _mm_loadu_si128((__m128i*)&srcLine[x]);
857
0
          vsrcb = _mm_loadu_si128((__m128i*)&srcLine[x+8]);
858
0
          __m128i vorga = _mm_loadu_si128((__m128i*)&orgLine[x]);
859
0
          __m128i vorgb = _mm_loadu_si128((__m128i*)&orgLine[x+8]);
860
0
          vdiffa = _mm_sub_epi16(vorga, vsrca);
861
0
          vdiffb = _mm_sub_epi16(vorgb, vsrcb);
862
0
        }
863
0
        __m128i vbanda = _mm_srai_epi16(vsrca, i_bo_range_shift);
864
0
        __m128i vbandb = _mm_srai_epi16(vsrcb, i_bo_range_shift);
865
0
        int iBand;
866
        // since gcc 4.6 synopsis of _mm_extract_epi16 has changed to (int)(unsigned short)_mm_extract_epi16()
867
        // therefore cast result to short to have signed values
868
0
        short iDiff;
869
0
        iBand = _mm_extract_epi16(vbanda, 0);
870
0
        iDiff = (short)_mm_extract_epi16(vdiffa, 0);
871
0
        diff[iBand]  += iDiff;
872
0
        count[iBand] += 1;
873
0
        if (iNaRight<15)
874
0
        {
875
0
          iBand = _mm_extract_epi16(vbanda, 1);
876
0
          iDiff = (short)_mm_extract_epi16(vdiffa, 1);
877
0
          diff[iBand]  += iDiff;
878
0
          count[iBand] += 1;
879
0
        }
880
0
        if (iNaRight<14)
881
0
        {
882
0
          iBand = _mm_extract_epi16(vbanda, 2);
883
0
          iDiff = (short)_mm_extract_epi16(vdiffa, 2);
884
0
          diff[iBand]  += iDiff;
885
0
          count[iBand] += 1;
886
0
        }
887
0
        if (iNaRight<13)
888
0
        {
889
0
          iBand = _mm_extract_epi16(vbanda, 3);
890
0
          iDiff = (short)_mm_extract_epi16(vdiffa, 3);
891
0
          diff[iBand]  += iDiff;
892
0
          count[iBand] += 1;
893
0
        }
894
0
        if (iNaRight<12)
895
0
        {
896
0
          iBand = _mm_extract_epi16(vbanda, 4);
897
0
          iDiff = (short)_mm_extract_epi16(vdiffa, 4);
898
0
          diff[iBand]  += iDiff;
899
0
          count[iBand] += 1;
900
0
        }
901
0
        if (iNaRight<11)
902
0
        {
903
0
          iBand = _mm_extract_epi16(vbanda, 5);
904
0
          iDiff = (short)_mm_extract_epi16(vdiffa, 5);
905
0
          diff[iBand]  += iDiff;
906
0
          count[iBand] += 1;
907
0
        }
908
0
        if (iNaRight<10)
909
0
        {
910
0
          iBand = _mm_extract_epi16(vbanda, 6);
911
0
          iDiff = (short)_mm_extract_epi16(vdiffa, 6);
912
0
          diff[iBand]  += iDiff;
913
0
          count[iBand] += 1;
914
0
        }
915
0
        if (iNaRight<9)
916
0
        {
917
0
          iBand = _mm_extract_epi16(vbanda, 7);
918
0
          iDiff = (short)_mm_extract_epi16(vdiffa, 7);
919
0
          diff[iBand]  += iDiff;
920
0
          count[iBand] += 1;
921
0
        }
922
0
        if (iNaRight<8)
923
0
        {
924
0
          iBand = _mm_extract_epi16(vbandb, 0);
925
0
          iDiff = (short)_mm_extract_epi16(vdiffb, 0);
926
0
          diff[iBand]  += iDiff;
927
0
          count[iBand] += 1;
928
0
        }
929
0
        if (iNaRight<7)
930
0
        {
931
0
          iBand = _mm_extract_epi16(vbandb, 1);
932
0
          iDiff = (short)_mm_extract_epi16(vdiffb, 1);
933
0
          diff[iBand]  += iDiff;
934
0
          count[iBand] += 1;
935
0
        }
936
0
        if (iNaRight<6)
937
0
        {
938
0
          iBand = _mm_extract_epi16(vbandb, 2);
939
0
          iDiff = (short)_mm_extract_epi16(vdiffb, 2);
940
0
          diff[iBand]  += iDiff;
941
0
          count[iBand] += 1;
942
0
        }
943
0
        if (iNaRight<5)
944
0
        {
945
0
          iBand = _mm_extract_epi16(vbandb, 3);
946
0
          iDiff = (short)_mm_extract_epi16(vdiffb, 3);
947
0
          diff[iBand]  += iDiff;
948
0
          count[iBand] += 1;
949
0
        }
950
0
        if (iNaRight<=4)
951
0
        {
952
0
          iBand = _mm_extract_epi16(vbandb, 4);
953
0
          iDiff = (short)_mm_extract_epi16(vdiffb, 4);
954
0
          diff[iBand]  += iDiff;
955
0
          count[iBand] += 1;
956
0
        }
957
0
        if (iNaRight<3)
958
0
        {
959
0
          iBand = _mm_extract_epi16(vbandb, 5);
960
0
          iDiff = (short)_mm_extract_epi16(vdiffb, 5);
961
0
          diff[iBand]  += iDiff;
962
0
          count[iBand] += 1;
963
0
        }
964
0
        if (iNaRight<2)
965
0
        {
966
0
          iBand = _mm_extract_epi16(vbandb, 6);
967
0
          iDiff = (short)_mm_extract_epi16(vdiffb, 6);
968
0
          diff[iBand]  += iDiff;
969
0
          count[iBand] += 1;
970
0
        }
971
0
        if (iNaRight<1)
972
0
        {
973
0
          iBand = _mm_extract_epi16(vbandb, 7);
974
0
          iDiff = (short)_mm_extract_epi16(vdiffb, 7);
975
0
          diff[iBand]  += iDiff;
976
0
          count[iBand] += 1;
977
0
        }
978
0
      }
979
0
      srcLine += srcStride;
980
0
      orgLine += orgStride;
981
0
    }
982
0
  }
983
0
  else
984
0
  {
985
0
    int i,j;
986
0
    int iBoRangeShift = channelBitDepth - NUM_SAO_BO_CLASSES_LOG2;
987
0
    for ( i = 0; i < endY; i++ )
988
0
    {
989
0
      for ( j = 0; j < endX; j++, srcLine++, orgLine++ )
990
0
      {
991
0
        int iBand            = *srcLine >> iBoRangeShift;
992
0
        diff[iBand]  += (*orgLine - *srcLine);
993
0
        count[iBand] += 1;
994
0
      }
995
0
      srcLine += srcStride - endX;
996
0
      orgLine += orgStride - endX;
997
0
    }
998
0
  }
999
0
}
Unexecuted instantiation: void vvenc::calcSaoStatisticsBo_SIMD<(vvenc::x86_simd::X86_VEXT)1>(int, int, int, short*, short*, int, int, int, long*, long*)
Unexecuted instantiation: void vvenc::calcSaoStatisticsBo_SIMD<(vvenc::x86_simd::X86_VEXT)4>(int, int, int, short*, short*, int, int, int, long*, long*)
1000
1001
template <X86_VEXT vext>
1002
void calcSaoStatisticsEo0_SIMD(int width,int startX,int endX,int endY,Pel*  srcLine,Pel*  orgLine,int srcStride,int orgStride,int64_t  *count, int64_t *diff)
1003
0
{
1004
0
  int iNaRight=width-endX;
1005
1006
0
  int iNaWidth = startX + iNaRight;
1007
0
  int i,j;
1008
0
  if ( width % 16 == 0 )
1009
0
  {
1010
0
    __m128i vzero       = _mm_set1_epi8(0);
1011
0
    __m128i vplusone    = _mm_set1_epi8(1);
1012
0
    __m128i vbaseoffset = _mm_set1_epi8(2);
1013
    // store intermediate results in 32bit partial sums for each EO type
1014
0
    __m128i vdiffsum[NUM_SAO_EO_CLASSES];
1015
0
    __m128i vcountsum[NUM_SAO_EO_CLASSES];
1016
0
    __m128i vconst[NUM_SAO_EO_CLASSES];
1017
0
    for ( int i = 0; i < NUM_SAO_EO_CLASSES; i++ )
1018
0
    {
1019
0
      vdiffsum[i]  = _mm_set1_epi32(0);
1020
0
      vcountsum[i] = _mm_set1_epi32(0);
1021
0
      vconst[i]    = _mm_set1_epi16(i);
1022
0
    }
1023
    // create masks for first and last pixel row
1024
0
    const unsigned short mask[16]={0,0,0,0,0,0,0,0,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff};
1025
0
    __m128i vmaskgs = _mm_set1_epi16(0);
1026
0
    __m128i vmaskge= _mm_set1_epi16(0);
1027
0
    if ( startX )
1028
0
    {
1029
0
      vmaskgs = _mm_insert_epi16( vmaskgs, 0xffff, 0);
1030
0
    }
1031
0
    if ( iNaRight )
1032
0
    {
1033
0
      vmaskge = _mm_loadu_si128((__m128i*)&mask[iNaRight]);
1034
0
    }
1035
0
    for ( int y = 0; y < endY; y++)
1036
0
    {
1037
0
      __m128i vmaskga = vmaskgs;
1038
0
      __m128i vmaskgb = vzero;
1039
0
      for ( int  x= 0; x < width; x+=16 )
1040
0
      {
1041
0
        __m128i vsrcal,vsrcar;
1042
0
        __m128i vsrcbl,vsrcbr;
1043
0
        __m128i vdiffa,vdiffb;
1044
        // set mask for last pixel
1045
0
        if ( x >= width - 16 )
1046
0
        {
1047
0
          vmaskgb = vmaskge;
1048
0
        }
1049
        // load reconstruction and compute difference between original signal and reconstruction
1050
0
        if (sizeof(Pel) ==1)
1051
0
        {
1052
0
          __m128i vsrc = _mm_loadu_si128((__m128i*)&srcLine[x]);
1053
0
          __m128i vsrcl = _mm_loadu_si128((__m128i*)&srcLine[x-1]);
1054
0
          __m128i vsrcr = _mm_loadu_si128((__m128i*)&srcLine[x+1]);
1055
0
          __m128i vsrca = _mm_unpacklo_epi8(vsrc, vzero);
1056
0
          __m128i vsrcb = _mm_unpackhi_epi8(vsrc, vzero);
1057
0
          vsrcal = _mm_unpacklo_epi8(vsrcl, vzero);
1058
0
          vsrcbl = _mm_unpackhi_epi8(vsrcl, vzero);
1059
0
          vsrcar = _mm_unpacklo_epi8(vsrcr, vzero);
1060
0
          vsrcbr = _mm_unpackhi_epi8(vsrcr, vzero);
1061
0
          vsrcal = _mm_sub_epi16(vsrca, vsrcal);
1062
0
          vsrcar = _mm_sub_epi16(vsrca, vsrcar);
1063
0
          vsrcbl = _mm_sub_epi16(vsrcb, vsrcbl);
1064
0
          vsrcbr = _mm_sub_epi16(vsrcb, vsrcbr);
1065
0
          __m128i vorg  = _mm_loadu_si128((__m128i*)&orgLine[x]);
1066
0
          __m128i vorga = _mm_unpacklo_epi8(vorg, vzero);
1067
0
          __m128i vorgb = _mm_unpackhi_epi8(vorg, vzero);
1068
0
          vdiffa = _mm_sub_epi16(vorga, vsrca);
1069
0
          vdiffb = _mm_sub_epi16(vorgb, vsrcb);
1070
0
        }
1071
0
        else
1072
0
        {
1073
0
          __m128i vsrca = _mm_loadu_si128((__m128i*)&srcLine[x]);
1074
0
          vsrcal = _mm_loadu_si128((__m128i*)&srcLine[x-1]);
1075
0
          vsrcar = _mm_loadu_si128((__m128i*)&srcLine[x+1]);
1076
0
          vsrcal = _mm_sub_epi16(vsrca, vsrcal);
1077
0
          vsrcar = _mm_sub_epi16(vsrca, vsrcar);
1078
0
          __m128i vsrcb = _mm_loadu_si128((__m128i*)&srcLine[x+8]);
1079
0
          vsrcbl = _mm_loadu_si128((__m128i*)&srcLine[x+8-1]);
1080
0
          vsrcbr = _mm_loadu_si128((__m128i*)&srcLine[x+8+1]);
1081
0
          vsrcbl = _mm_sub_epi16(vsrcb, vsrcbl);
1082
0
          vsrcbr = _mm_sub_epi16(vsrcb, vsrcbr);
1083
0
          __m128i vorga = _mm_loadu_si128((__m128i*)&orgLine[x]);
1084
0
          __m128i vorgb = _mm_loadu_si128((__m128i*)&orgLine[x+8]);
1085
0
          vdiffa = _mm_sub_epi16(vorga, vsrca);
1086
0
          vdiffb = _mm_sub_epi16(vorgb, vsrcb);
1087
0
        }
1088
        // compute sign and type for 16 pixels
1089
0
        __m128i vsignl = _mm_packs_epi16(vsrcal, vsrcbl);
1090
0
        __m128i vsignr = _mm_packs_epi16(vsrcar, vsrcbr);
1091
0
        vsignl = _mm_sign_epi8(vplusone, vsignl);
1092
0
        vsignr = _mm_sign_epi8(vplusone, vsignr);
1093
0
        __m128i vtype  = _mm_add_epi8(_mm_add_epi8(vsignl, vsignr), vbaseoffset);
1094
0
        __m128i vtypea = _mm_unpacklo_epi8(vtype, vzero);
1095
0
        __m128i vtypeb = _mm_unpackhi_epi8(vtype, vzero);
1096
0
        vtypea = _mm_or_si128(vtypea, vmaskga);
1097
0
        vtypeb = _mm_or_si128(vtypeb, vmaskgb);
1098
        // count occurence of each type and accumulate partial sums for each type
1099
0
        for ( int i = 0; i < NUM_SAO_EO_CLASSES; i++ )
1100
0
        {
1101
0
          __m128i vmaska = _mm_cmpeq_epi16(vtypea, vconst[i]);
1102
0
          __m128i vmaskb = _mm_cmpeq_epi16(vtypeb, vconst[i]);
1103
0
          __m128i vdiffma = _mm_and_si128(vmaska, vdiffa);
1104
0
          __m128i vdiffmb = _mm_and_si128(vmaskb, vdiffb);
1105
0
          vdiffsum[i] = _mm_add_epi32(vdiffsum[i], _mm_madd_epi16(vdiffma, vconst[1]));
1106
0
          vdiffsum[i] = _mm_add_epi32(vdiffsum[i], _mm_madd_epi16(vdiffmb, vconst[1]));
1107
0
          __m128i vcountma = _mm_srli_epi16(vmaska,15);
1108
0
          __m128i vcountmb = _mm_srli_epi16(vmaskb,15);
1109
0
          vcountsum[i] = _mm_add_epi32(vcountsum[i], _mm_madd_epi16(vcountma, vconst[1]));
1110
0
          vcountsum[i] = _mm_add_epi32(vcountsum[i], _mm_madd_epi16(vcountmb, vconst[1]));
1111
0
        }
1112
        // clear mask for first pixel
1113
0
        vmaskga = vzero;
1114
0
      }
1115
      // next pixel line
1116
0
      srcLine += srcStride;
1117
0
      orgLine += orgStride;
1118
0
    }
1119
    // horizontal add of four 32 bit partial sums
1120
0
    for ( int i = 0; i < NUM_SAO_EO_CLASSES; i++ )
1121
0
    {
1122
0
      vdiffsum[i] = _mm_add_epi32(vdiffsum[i], _mm_srli_si128(vdiffsum[i], 8));
1123
0
      vdiffsum[i] = _mm_add_epi32(vdiffsum[i], _mm_srli_si128(vdiffsum[i], 4));
1124
0
      diff[i] = _mm_cvtsi128_si32(vdiffsum[i]);
1125
0
      vcountsum[i] = _mm_add_epi32(vcountsum[i], _mm_srli_si128(vcountsum[i], 8));
1126
0
      vcountsum[i] = _mm_add_epi32(vcountsum[i], _mm_srli_si128(vcountsum[i], 4));
1127
0
      count[i] = _mm_cvtsi128_si32(vcountsum[i]);
1128
0
    }
1129
0
  }
1130
0
  else
1131
0
  {
1132
0
    srcLine      = srcLine + startX;
1133
0
    orgLine      =orgLine + startX;
1134
0
    diff +=2;
1135
0
    count+=2;
1136
1137
0
    for ( i = 0; i < endY; i++ )
1138
0
    {
1139
0
      int iSignLeft = sgn( *srcLine - *(srcLine - 1) );
1140
0
      for ( j = 0; j < width - iNaWidth; j++, srcLine++, orgLine++ )
1141
0
      {
1142
0
        int iSignRight       = sgn( *srcLine - *(srcLine + 1) );
1143
        //printf("%d ",*srcLine);
1144
0
        int iType            = iSignLeft + iSignRight;
1145
0
        iSignLeft            = -1 * iSignRight;
1146
0
        diff[iType]  += (*orgLine - *srcLine);
1147
0
        count[iType] += 1;
1148
0
      }
1149
0
      srcLine += srcStride - ( width - iNaWidth );
1150
0
      orgLine += orgStride - ( width - iNaWidth );
1151
0
    }
1152
0
  }
1153
0
}
Unexecuted instantiation: void vvenc::calcSaoStatisticsEo0_SIMD<(vvenc::x86_simd::X86_VEXT)1>(int, int, int, int, short*, short*, int, int, long*, long*)
Unexecuted instantiation: void vvenc::calcSaoStatisticsEo0_SIMD<(vvenc::x86_simd::X86_VEXT)4>(int, int, int, int, short*, short*, int, int, long*, long*)
1154
template <X86_VEXT vext>
1155
void calcSaoStatisticsEo90_SIMD(int width,int endX,int startY,int endY,Pel*  srcLine,Pel*  orgLine,int srcStride,int orgStride,int64_t  *count, int64_t *diff,int8_t *signUpLine)
1156
0
{
1157
0
  if ( width % 16 == 0 )
1158
0
  {
1159
0
    int iNaRight=width-endX;
1160
0
    __m128i vzero       = _mm_set1_epi8(0);
1161
0
    __m128i vplusone    = _mm_set1_epi8(1);
1162
0
    __m128i vbaseoffset = _mm_set1_epi8(2);
1163
    // store intermediate results in 32bit partial sums for each EO type
1164
0
    __m128i vdiffsum[NUM_SAO_EO_CLASSES];
1165
0
    __m128i vcountsum[NUM_SAO_EO_CLASSES];
1166
0
    __m128i vconst[NUM_SAO_EO_CLASSES];
1167
0
    for ( int i = 0; i < NUM_SAO_EO_CLASSES; i++ )
1168
0
    {
1169
0
      vdiffsum[i]  = _mm_set1_epi32(0);
1170
0
      vcountsum[i] = _mm_set1_epi32(0);
1171
0
      vconst[i]    = _mm_set1_epi16(i);
1172
0
    }
1173
    // create masks for first and last pixel row
1174
0
    const unsigned short mask[16]={0,0,0,0,0,0,0,0,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff};
1175
0
    __m128i vmaskge= _mm_set1_epi16(0);
1176
0
    if ( iNaRight )
1177
0
    {
1178
0
      vmaskge = _mm_loadu_si128((__m128i*)&mask[iNaRight]);
1179
0
    }
1180
1181
0
    __m128i vsigns[MAX_CU_SIZE/16 +1];  //+1 to avoid MSVC error
1182
0
    for (int x=0; x< endX; x+=16)
1183
0
    {
1184
0
      __m128i vsrca,vsrcb;
1185
0
      __m128i vsrcat,vsrcbt;
1186
0
      if (sizeof(Pel) == 1)
1187
0
      {
1188
0
        __m128i vsrc = _mm_loadu_si128((__m128i*)&srcLine[x]);
1189
0
        __m128i vsrct = _mm_loadu_si128((__m128i*)&srcLine[x-srcStride]);
1190
0
        vsrca = _mm_unpacklo_epi8(vsrc, vzero);
1191
0
        vsrcb = _mm_unpackhi_epi8(vsrc, vzero);
1192
0
        vsrcat = _mm_unpacklo_epi8(vsrct, vzero);
1193
0
        vsrcbt = _mm_unpackhi_epi8(vsrct, vzero);
1194
0
      }
1195
0
      else
1196
0
      {
1197
0
        vsrca = _mm_loadu_si128((__m128i*)&srcLine[x]);
1198
0
        vsrcb = _mm_loadu_si128((__m128i*)&srcLine[x+8]);
1199
0
        vsrcat = _mm_loadu_si128((__m128i*)&srcLine[x   - srcStride]);
1200
0
        vsrcbt = _mm_loadu_si128((__m128i*)&srcLine[x+8 - srcStride]);
1201
0
      }
1202
0
      vsrcat = _mm_sub_epi16(vsrcat, vsrca);
1203
0
      vsrcbt = _mm_sub_epi16(vsrcbt, vsrcb);
1204
0
      vsigns[x/16] = _mm_sign_epi8(vplusone, _mm_packs_epi16(vsrcat, vsrcbt));
1205
0
    }
1206
    /* filter all lines */
1207
0
    for (int j = startY; j < endY ; j++)
1208
0
    {
1209
0
      __m128i vmaskgb = vzero;
1210
1211
      /* start with first pixel */
1212
      /* filter all pixels of this line */
1213
0
      for (int x = 0; x < endX; x+=16)
1214
0
      {
1215
0
        __m128i vsrca,vsrcb;
1216
0
        __m128i vsrcad, vsrcbd;
1217
0
        __m128i vdiffa,vdiffb;
1218
        // set mask for last pixel
1219
0
        if ( x >= width - 16 )
1220
0
        {
1221
0
          vmaskgb = vmaskge;
1222
0
        }
1223
        // load reconstruction and compute difference between original signal and reconstruction
1224
0
        if (sizeof(Pel) == 1)
1225
0
        {
1226
0
          __m128i vsrc = _mm_loadu_si128((__m128i*)&srcLine[x]);
1227
0
          __m128i vsrcd = _mm_loadu_si128((__m128i*)&srcLine[x+srcStride]);
1228
0
          vsrca = _mm_unpacklo_epi8(vsrc, vzero);
1229
0
          vsrcb = _mm_unpackhi_epi8(vsrc, vzero);
1230
0
          vsrcad = _mm_unpacklo_epi8(vsrcd, vzero);
1231
0
          vsrcbd = _mm_unpackhi_epi8(vsrcd, vzero);
1232
1233
0
          __m128i vorg  = _mm_loadu_si128((__m128i*)&orgLine[x]);
1234
0
          __m128i vorga = _mm_unpacklo_epi8(vorg, vzero);
1235
0
          __m128i vorgb = _mm_unpackhi_epi8(vorg, vzero);
1236
0
          vdiffa = _mm_sub_epi16(vorga, vsrca);
1237
0
          vdiffb = _mm_sub_epi16(vorgb, vsrcb);
1238
0
        }
1239
0
        else
1240
0
        {
1241
0
          vsrca = _mm_loadu_si128((__m128i*)&srcLine[x]);
1242
0
          vsrcb = _mm_loadu_si128((__m128i*)&srcLine[x+8]);
1243
0
          vsrcad = _mm_loadu_si128((__m128i*)&srcLine[x   + srcStride]);
1244
0
          vsrcbd = _mm_loadu_si128((__m128i*)&srcLine[x+8 + srcStride]);
1245
0
          __m128i vorga = _mm_loadu_si128((__m128i*)&orgLine[x]);
1246
0
          __m128i vorgb = _mm_loadu_si128((__m128i*)&orgLine[x+8]);
1247
0
          vdiffa = _mm_sub_epi16(vorga, vsrca);
1248
0
          vdiffb = _mm_sub_epi16(vorgb, vsrcb);
1249
0
        }
1250
        // compute sign and type for 16 pixels
1251
0
        vsrcad = _mm_sub_epi16(vsrca, vsrcad);
1252
0
        vsrcbd = _mm_sub_epi16(vsrcb, vsrcbd);
1253
0
        __m128i vsignd = _mm_sign_epi8(vplusone, _mm_packs_epi16(vsrcad, vsrcbd));
1254
0
        __m128i vsignt = vsigns[x/16];
1255
0
        vsigns[x/16] = vsignd;
1256
0
        __m128i vtype  = _mm_add_epi8(_mm_sub_epi8(vsignd, vsignt), vbaseoffset);
1257
0
        __m128i vtypea = _mm_unpacklo_epi8(vtype, vzero);
1258
0
        __m128i vtypeb = _mm_unpackhi_epi8(vtype, vzero);
1259
0
        vtypeb = _mm_or_si128(vtypeb, vmaskgb);
1260
1261
        // count occurence of each type and accumulate partial sums for each type
1262
0
        for ( int i = 0; i < NUM_SAO_EO_CLASSES; i++ )
1263
0
        {
1264
0
          __m128i vmaska = _mm_cmpeq_epi16(vtypea, vconst[i]);
1265
0
          __m128i vmaskb = _mm_cmpeq_epi16(vtypeb, vconst[i]);
1266
0
          __m128i vdiffma = _mm_and_si128(vmaska, vdiffa);
1267
0
          __m128i vdiffmb = _mm_and_si128(vmaskb, vdiffb);
1268
0
          vdiffsum[i] = _mm_add_epi32(vdiffsum[i], _mm_madd_epi16(vdiffma, vconst[1]));
1269
0
          vdiffsum[i] = _mm_add_epi32(vdiffsum[i], _mm_madd_epi16(vdiffmb, vconst[1]));
1270
0
          __m128i vcountma = _mm_srli_epi16(vmaska,15);
1271
0
          __m128i vcountmb = _mm_srli_epi16(vmaskb,15);
1272
0
          vcountsum[i] = _mm_add_epi32(vcountsum[i], _mm_madd_epi16(vcountma, vconst[1]));
1273
0
          vcountsum[i] = _mm_add_epi32(vcountsum[i], _mm_madd_epi16(vcountmb, vconst[1]));
1274
0
        }
1275
0
      }
1276
      // next pixel line
1277
0
      srcLine += srcStride;
1278
0
      orgLine += orgStride;
1279
0
    }
1280
    // horizontal add of four 32 bit partial sums
1281
0
    for ( int i = 0; i < NUM_SAO_EO_CLASSES; i++ )
1282
0
    {
1283
0
      vdiffsum[i] = _mm_add_epi32(vdiffsum[i], _mm_srli_si128(vdiffsum[i], 8));
1284
0
      vdiffsum[i] = _mm_add_epi32(vdiffsum[i], _mm_srli_si128(vdiffsum[i], 4));
1285
0
      diff[i] = _mm_cvtsi128_si32(vdiffsum[i]);
1286
0
      vcountsum[i] = _mm_add_epi32(vcountsum[i], _mm_srli_si128(vcountsum[i], 8));
1287
0
      vcountsum[i] = _mm_add_epi32(vcountsum[i], _mm_srli_si128(vcountsum[i], 4));
1288
0
      count[i] = _mm_cvtsi128_si32(vcountsum[i]);
1289
0
    }
1290
0
  }
1291
0
  else
1292
0
  {
1293
0
    diff +=2;
1294
0
    count+=2;
1295
0
    int x,y,edgeType;
1296
0
    Pel* srcLineAbove = srcLine - srcStride;
1297
0
    int8_t signDown;
1298
0
    for (x=0; x<endX; x++)
1299
0
    {
1300
0
      signUpLine[x] = (int8_t)sgn(srcLine[x] - srcLineAbove[x]);
1301
0
    }
1302
0
    Pel* srcLineBelow;
1303
0
    for (y=startY; y<endY; y++)
1304
0
    {
1305
0
      srcLineBelow = srcLine + srcStride;
1306
1307
0
      for (x=0; x<endX; x++)
1308
0
      {
1309
0
        signDown  = (int8_t)sgn(srcLine[x] - srcLineBelow[x]);
1310
0
        edgeType  = signDown + signUpLine[x];
1311
0
        signUpLine[x]= -signDown;
1312
1313
0
        diff [edgeType] += (orgLine[x] - srcLine[x]);
1314
0
        count[edgeType] ++;
1315
0
      }
1316
0
      srcLine += srcStride;
1317
0
      orgLine += orgStride;
1318
0
    }
1319
0
  }
1320
0
}
Unexecuted instantiation: void vvenc::calcSaoStatisticsEo90_SIMD<(vvenc::x86_simd::X86_VEXT)1>(int, int, int, int, short*, short*, int, int, long*, long*, signed char*)
Unexecuted instantiation: void vvenc::calcSaoStatisticsEo90_SIMD<(vvenc::x86_simd::X86_VEXT)4>(int, int, int, int, short*, short*, int, int, long*, long*, signed char*)
1321
template <X86_VEXT vext>
1322
void calcSaoStatisticsEo135_SIMD(int width,int startX,int endX,int endY,Pel*  srcLine,Pel*  orgLine,int srcStride,int orgStride,int64_t  *count, int64_t *diff,int8_t *signUpLine,int8_t *signDownLine)
1323
0
{
1324
0
  if ( width % 16 == 0 )
1325
0
  {
1326
0
    int iNaRight=width-endX;
1327
0
    diff -=2;
1328
0
    count-=2;
1329
0
    __m128i vzero       = _mm_set1_epi8(0);
1330
0
    __m128i vplusone    = _mm_set1_epi8(1);
1331
0
    __m128i vbaseoffset = _mm_set1_epi8(2);
1332
    // store intermediate results in 32bit partial sums for each EO type
1333
0
    __m128i vdiffsum[NUM_SAO_EO_CLASSES];
1334
0
    __m128i vcountsum[NUM_SAO_EO_CLASSES];
1335
0
    __m128i vconst[NUM_SAO_EO_CLASSES];
1336
0
    for ( int i = 0; i < NUM_SAO_EO_CLASSES; i++ )
1337
0
    {
1338
0
      vdiffsum[i]  = _mm_set1_epi32(0);
1339
0
      vcountsum[i] = _mm_set1_epi32(0);
1340
0
      vconst[i]    = _mm_set1_epi16(i);
1341
0
    }
1342
    // create masks for first and last pixel row
1343
0
    const unsigned short mask[16]={0,0,0,0,0,0,0,0,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff};
1344
0
    __m128i vmaskgs = _mm_set1_epi16(0);
1345
0
    __m128i vmaskge = _mm_set1_epi16(0);
1346
0
    if ( startX )
1347
0
    {
1348
0
      vmaskgs = _mm_insert_epi16( vmaskgs, 0xffff, 0);
1349
0
    }
1350
0
    if ( iNaRight )
1351
0
    {
1352
0
      vmaskge = _mm_loadu_si128((__m128i*)&mask[iNaRight]);
1353
0
    }
1354
    /* filter all lines */
1355
0
    for (int j = 1; j < endY; j++)
1356
0
    {
1357
0
      __m128i vmaskga = vmaskgs;
1358
0
      __m128i vmaskgb = vconst[0];
1359
      /* start with first pixel */
1360
      /* filter all pixels of this line */
1361
0
      for (int x = 0; x < width; x+=16)
1362
0
      {
1363
0
        __m128i vsrca,vsrcb;
1364
0
        __m128i vsrcad,vsrcbd;
1365
0
        __m128i vsrcat,vsrcbt;
1366
0
        __m128i vdiffa,vdiffb;
1367
        // set mask for last pixel
1368
0
        if ( x >= width - 16 )
1369
0
        {
1370
0
          vmaskgb = vmaskge;
1371
0
        }
1372
0
        if (sizeof(Pel) == 1)
1373
0
        {
1374
0
          __m128i vsrct = _mm_loadu_si128((__m128i*)&srcLine[ x-srcStride-1 ]);
1375
0
          __m128i vsrc  = _mm_loadu_si128((__m128i*)&srcLine[ x ]);
1376
0
          __m128i vsrcd = _mm_loadu_si128((__m128i*)&srcLine[ x+srcStride+1 ]);
1377
0
          vsrcat = _mm_unpacklo_epi8(vsrct, vzero);
1378
0
          vsrcbt = _mm_unpackhi_epi8(vsrct, vzero);
1379
0
          vsrca = _mm_unpacklo_epi8(vsrc, vzero);
1380
0
          vsrcb = _mm_unpackhi_epi8(vsrc, vzero);
1381
0
          vsrcad = _mm_unpacklo_epi8(vsrcd, vzero);
1382
0
          vsrcbd = _mm_unpackhi_epi8(vsrcd, vzero);
1383
0
          __m128i vorg  = _mm_loadu_si128((__m128i*)&orgLine[x]);
1384
0
          __m128i vorga = _mm_unpacklo_epi8(vorg, vzero);
1385
0
          __m128i vorgb = _mm_unpackhi_epi8(vorg, vzero);
1386
0
          vdiffa = _mm_sub_epi16(vorga, vsrca);
1387
0
          vdiffb = _mm_sub_epi16(vorgb, vsrcb);
1388
0
        }
1389
0
        else
1390
0
        {
1391
0
          vsrcat = _mm_loadu_si128((__m128i*)&srcLine[x - 1 - srcStride ]);
1392
0
          vsrcbt = _mm_loadu_si128((__m128i*)&srcLine[x - 1 + 8 - srcStride]);
1393
0
          vsrca = _mm_loadu_si128((__m128i*)&srcLine[x]);
1394
0
          vsrcb = _mm_loadu_si128((__m128i*)&srcLine[x+8]);
1395
0
          vsrcad = _mm_loadu_si128((__m128i*)&srcLine[x + 1 + srcStride ]);
1396
0
          vsrcbd = _mm_loadu_si128((__m128i*)&srcLine[x + 1 + 8 + srcStride]);
1397
0
          __m128i vorga = _mm_loadu_si128((__m128i*)&orgLine[x]);
1398
0
          __m128i vorgb = _mm_loadu_si128((__m128i*)&orgLine[x+8]);
1399
0
          vdiffa = _mm_sub_epi16(vorga, vsrca);
1400
0
          vdiffb = _mm_sub_epi16(vorgb, vsrcb);
1401
0
        }
1402
        // compute sign and type for 16 pixels
1403
0
        vsrcat = _mm_sub_epi16(vsrca, vsrcat);
1404
0
        vsrcbt = _mm_sub_epi16(vsrcb, vsrcbt);
1405
0
        vsrcad = _mm_sub_epi16(vsrca, vsrcad);
1406
0
        vsrcbd = _mm_sub_epi16(vsrcb, vsrcbd);
1407
0
        __m128i vsignt = _mm_sign_epi8(vplusone, _mm_packs_epi16(vsrcat, vsrcbt));
1408
0
        __m128i vsignd = _mm_sign_epi8(vplusone, _mm_packs_epi16(vsrcad, vsrcbd));
1409
0
        __m128i vtype = _mm_add_epi8(_mm_add_epi8(vsignd, vsignt), vbaseoffset);
1410
0
        __m128i vtypea = _mm_unpacklo_epi8(vtype, vzero);
1411
0
        __m128i vtypeb = _mm_unpackhi_epi8(vtype, vzero);
1412
0
        vtypea = _mm_or_si128(vtypea, vmaskga);
1413
0
        vtypeb = _mm_or_si128(vtypeb, vmaskgb);
1414
        // count occurence of each type and accumulate partial sums for each type
1415
0
        for ( int i = 0; i < NUM_SAO_EO_CLASSES; i++ )
1416
0
        {
1417
0
          __m128i vmaska = _mm_cmpeq_epi16(vtypea, vconst[i]);
1418
0
          __m128i vmaskb = _mm_cmpeq_epi16(vtypeb, vconst[i]);
1419
0
          __m128i vdiffma = _mm_and_si128(vmaska, vdiffa);
1420
0
          __m128i vdiffmb = _mm_and_si128(vmaskb, vdiffb);
1421
0
          vdiffsum[i] = _mm_add_epi32(vdiffsum[i], _mm_madd_epi16(vdiffma, vconst[1]));
1422
0
          vdiffsum[i] = _mm_add_epi32(vdiffsum[i], _mm_madd_epi16(vdiffmb, vconst[1]));
1423
0
          __m128i vcountma = _mm_srli_epi16(vmaska,15);
1424
0
          __m128i vcountmb = _mm_srli_epi16(vmaskb,15);
1425
0
          vcountsum[i] = _mm_add_epi32(vcountsum[i], _mm_madd_epi16(vcountma, vconst[1]));
1426
0
          vcountsum[i] = _mm_add_epi32(vcountsum[i], _mm_madd_epi16(vcountmb, vconst[1]));
1427
0
        }
1428
        // clear mask for first pixel
1429
0
        vmaskga = vconst[0];
1430
0
      }
1431
      // next pixel line
1432
0
      srcLine += srcStride;
1433
0
      orgLine += orgStride;
1434
0
    }
1435
    // horizontal add of four 32 bit partial sums
1436
0
    for ( int i = 0; i < NUM_SAO_EO_CLASSES; i++ )
1437
0
    {
1438
0
      vdiffsum[i] = _mm_add_epi32(vdiffsum[i], _mm_srli_si128(vdiffsum[i], 8));
1439
0
      vdiffsum[i] = _mm_add_epi32(vdiffsum[i], _mm_srli_si128(vdiffsum[i], 4));
1440
0
      diff[i] += _mm_cvtsi128_si32(vdiffsum[i]);
1441
0
      vcountsum[i] = _mm_add_epi32(vcountsum[i], _mm_srli_si128(vcountsum[i], 8));
1442
0
      vcountsum[i] = _mm_add_epi32(vcountsum[i], _mm_srli_si128(vcountsum[i], 4));
1443
0
      count[i] += _mm_cvtsi128_si32(vcountsum[i]);
1444
0
    }
1445
0
  }
1446
0
  else
1447
0
  {
1448
0
    int x,y,edgeType;
1449
0
    int8_t signDown;
1450
    //middle lines
1451
0
     for (y=1; y<endY; y++)
1452
0
     {
1453
0
       int8_t* pTopSign = NULL;
1454
0
       Pel* srcLineBelow = srcLine + srcStride;
1455
0
       int8_t iTmpSign =  (int8_t)sgn( srcLineBelow[startX]   - srcLine[startX-1] );
1456
0
       for ( x=startX,pTopSign = &signUpLine[startX]; x<endX; x++ , pTopSign++ )
1457
0
       {
1458
0
         signDown = (int8_t)sgn(srcLine[x] - srcLineBelow[x+1]);
1459
0
         edgeType = signDown + *pTopSign;
1460
0
         *pTopSign            = iTmpSign;
1461
0
         iTmpSign             = -signDown;
1462
0
         diff [edgeType] += (orgLine[x] - srcLine[x]);
1463
0
         count[edgeType] ++;
1464
0
       }
1465
0
       srcLine += srcStride;
1466
0
       orgLine += orgStride;
1467
0
     }
1468
0
  }
1469
0
}
Unexecuted instantiation: void vvenc::calcSaoStatisticsEo135_SIMD<(vvenc::x86_simd::X86_VEXT)1>(int, int, int, int, short*, short*, int, int, long*, long*, signed char*, signed char*)
Unexecuted instantiation: void vvenc::calcSaoStatisticsEo135_SIMD<(vvenc::x86_simd::X86_VEXT)4>(int, int, int, int, short*, short*, int, int, long*, long*, signed char*, signed char*)
1470
template <X86_VEXT vext>
// Gathers SAO edge-offset statistics for the EO 45-degree class (neighbours are
// top-right at [x+1-srcStride] and bottom-left at [x-1+srcStride]).
// For each pixel an edge type in [-2,2] is derived from the signs of the two
// neighbour differences; the (org - rec) difference and a counter are
// accumulated per type into diff[] / count[].
// NOTE(review): callers apparently pass diff/count pointing at the centre of a
// 5-entry-per-class array — the scalar path indexes with edgeType in [-2,2]
// and the SIMD path rebases with diff-=2 / count-=2 — confirm at call site.
// width      total line width in pixels; SIMD path requires width % 16 == 0
// startX     first pixel to include (1 when the left column is unavailable)
// endX       one past the last pixel to include
// endY       one past the last line to process (processing starts at line 1)
// srcLine    reconstructed samples, orgLine original samples
// signUpLine carry of per-pixel "up" signs between lines (scalar path only)
void calcSaoStatisticsEo45_SIMD(int width,int startX,int endX,int endY,Pel*  srcLine,Pel*  orgLine,int srcStride,int orgStride,int64_t  *count, int64_t *diff,int8_t *signUpLine)
{
  Pel* pRec = srcLine;
  Pel* pOrg = orgLine;
  // only used (after reassignment) by the scalar fallback below
  Pel* srcLineBelow = srcLine + srcStride;
  if (width % 16 == 0 )
  {
    // number of excluded pixels at the right edge (0 or more)
    int iNaRight=width-endX;
    // rebase so the packed edge type (sign sum + 2, range 0..4) can be used
    // as a direct non-negative index
    diff -=2;
    count-=2;
    __m128i vzero       = _mm_set1_epi8(0);
    __m128i vplusone    = _mm_set1_epi8(1);
    __m128i vbaseoffset = _mm_set1_epi8(2);
    // store intermediate results in 32bit partial sums for each EO type
    __m128i vdiffsum[NUM_SAO_EO_CLASSES];
    __m128i vcountsum[NUM_SAO_EO_CLASSES];
    __m128i vconst[NUM_SAO_EO_CLASSES];
    for ( int i = 0; i < NUM_SAO_EO_CLASSES; i++ )
    {
      vdiffsum[i]  = _mm_set1_epi32(0);
      vcountsum[i] = _mm_set1_epi32(0);
      vconst[i]    = _mm_set1_epi16(i);   // per-class compare constant; vconst[1] doubles as madd multiplier
    }
    // create masks for first and last pixel of a row; OR-ing 0xffff into a
    // type lane makes it match none of the class constants, excluding that pixel
    const unsigned short mask[16]={0,0,0,0,0,0,0,0,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff};
    __m128i vmaskgs = _mm_set1_epi16(0);
    __m128i vmaskge = _mm_set1_epi16(0);
    if ( startX )
    {
      // exclude the very first pixel of each line (left neighbour unavailable)
      vmaskgs = _mm_insert_epi16( vmaskgs, 0xffff, 0);
    }
    if ( iNaRight )
    {
      // sliding window into mask[] yields 0xffff in exactly the last
      // iNaRight lanes of the high half
      vmaskge = _mm_loadu_si128((__m128i*)&mask[iNaRight]);
    }
    /* filter all lines */
    for (int j = 1; j < endY; j++)
    {
      __m128i vmaskga = vmaskgs;    // active only in the first 16-pixel chunk
      __m128i vmaskgb = vconst[0];  // vconst[0] is all-zero (no exclusion)
      /* start with first pixel */
      /* filter all pixels of this line */
      for (int x = 0; x < width; x+=16)
      {
        __m128i vsrca,vsrcb;      // current pixels, lanes 0-7 / 8-15 (16 bit)
        __m128i vsrcad,vsrcbd;    // bottom-left neighbours
        __m128i vsrcat,vsrcbt;    // top-right neighbours
        __m128i vdiffa,vdiffb;    // org - rec per pixel
        // set mask for last pixel
        if ( x >= width - 16 )
        {
          vmaskgb = vmaskge;
        }
        if (sizeof(Pel) == 1)
        {
          // 8-bit samples: load 16 pixels and widen to 16 bit
          __m128i vsrct = _mm_loadu_si128((__m128i*)&pRec[ x-srcStride+1 ]);
          __m128i vsrc  = _mm_loadu_si128((__m128i*)&pRec[ x ]);
          __m128i vsrcd = _mm_loadu_si128((__m128i*)&pRec[ x+srcStride-1 ]);
          vsrcat = _mm_unpacklo_epi8(vsrct, vzero);
          vsrcbt = _mm_unpackhi_epi8(vsrct, vzero);
          vsrca = _mm_unpacklo_epi8(vsrc, vzero);
          vsrcb = _mm_unpackhi_epi8(vsrc, vzero);
          vsrcad = _mm_unpacklo_epi8(vsrcd, vzero);
          vsrcbd = _mm_unpackhi_epi8(vsrcd, vzero);
          __m128i vorg  = _mm_loadu_si128((__m128i*)&pOrg[x]);
          __m128i vorga = _mm_unpacklo_epi8(vorg, vzero);
          __m128i vorgb = _mm_unpackhi_epi8(vorg, vzero);
          vdiffa = _mm_sub_epi16(vorga, vsrca);
          vdiffb = _mm_sub_epi16(vorgb, vsrcb);
        }
        else
        {
          // 16-bit samples: two unaligned loads of 8 pixels each
          vsrcat = _mm_loadu_si128((__m128i*)&pRec[x + 1 - srcStride ]);
          vsrcbt = _mm_loadu_si128((__m128i*)&pRec[x + 1 + 8 - srcStride]);
          vsrca = _mm_loadu_si128((__m128i*)&pRec[x]);
          vsrcb = _mm_loadu_si128((__m128i*)&pRec[x+8]);
          vsrcad = _mm_loadu_si128((__m128i*)&pRec[x - 1 + srcStride ]);
          vsrcbd = _mm_loadu_si128((__m128i*)&pRec[x - 1 + 8 + srcStride]);
          __m128i vorga = _mm_loadu_si128((__m128i*)&pOrg[x]);
          __m128i vorgb = _mm_loadu_si128((__m128i*)&pOrg[x+8]);
          vdiffa = _mm_sub_epi16(vorga, vsrca);
          vdiffb = _mm_sub_epi16(vorgb, vsrcb);
        }
        // compute sign and type for 16 pixels:
        // type = sgn(cur - topRight) + sgn(cur - bottomLeft) + 2  -> 0..4
        vsrcat = _mm_sub_epi16(vsrca, vsrcat);
        vsrcbt = _mm_sub_epi16(vsrcb, vsrcbt);
        vsrcad = _mm_sub_epi16(vsrca, vsrcad);
        vsrcbd = _mm_sub_epi16(vsrcb, vsrcbd);
        // _mm_sign_epi8(1, x) yields -1/0/+1 per byte, i.e. sgn(x)
        __m128i vsignt = _mm_sign_epi8(vplusone, _mm_packs_epi16(vsrcat, vsrcbt));
        __m128i vsignd = _mm_sign_epi8(vplusone, _mm_packs_epi16(vsrcad, vsrcbd));
        __m128i vtype = _mm_add_epi8(_mm_add_epi8(vsignd, vsignt), vbaseoffset);
        __m128i vtypea = _mm_unpacklo_epi8(vtype, vzero);
        __m128i vtypeb = _mm_unpackhi_epi8(vtype, vzero);
        // invalidate excluded boundary pixels so they match no class below
        vtypea = _mm_or_si128(vtypea, vmaskga);
        vtypeb = _mm_or_si128(vtypeb, vmaskgb);
        // count occurence of each type and accumulate partial sums for each type
        for ( int i = 0; i < NUM_SAO_EO_CLASSES; i++ )
        {
          __m128i vmaska = _mm_cmpeq_epi16(vtypea, vconst[i]);
          __m128i vmaskb = _mm_cmpeq_epi16(vtypeb, vconst[i]);
          __m128i vdiffma = _mm_and_si128(vmaska, vdiffa);
          __m128i vdiffmb = _mm_and_si128(vmaskb, vdiffb);
          // madd with all-ones (vconst[1]) pair-sums 16-bit lanes into 32 bit
          vdiffsum[i] = _mm_add_epi32(vdiffsum[i], _mm_madd_epi16(vdiffma, vconst[1]));
          vdiffsum[i] = _mm_add_epi32(vdiffsum[i], _mm_madd_epi16(vdiffmb, vconst[1]));
          // compare result 0xffff >> 15 == 1 per matching lane
          __m128i vcountma = _mm_srli_epi16(vmaska,15);
          __m128i vcountmb = _mm_srli_epi16(vmaskb,15);
          vcountsum[i] = _mm_add_epi32(vcountsum[i], _mm_madd_epi16(vcountma, vconst[1]));
          vcountsum[i] = _mm_add_epi32(vcountsum[i], _mm_madd_epi16(vcountmb, vconst[1]));
        }
        // clear mask for first pixel
        vmaskga = vconst[0];
      }
      // next pixel line
      pRec += srcStride;
      pOrg += orgStride;
    }

    // horizontal add of four 32 bit partial sums
    for ( int i = 0; i < NUM_SAO_EO_CLASSES; i++ )
    {
      vdiffsum[i] = _mm_add_epi32(vdiffsum[i], _mm_srli_si128(vdiffsum[i], 8));
      vdiffsum[i] = _mm_add_epi32(vdiffsum[i], _mm_srli_si128(vdiffsum[i], 4));
      diff[i] += _mm_cvtsi128_si32(vdiffsum[i]);
      vcountsum[i] = _mm_add_epi32(vcountsum[i], _mm_srli_si128(vcountsum[i], 8));
      vcountsum[i] = _mm_add_epi32(vcountsum[i], _mm_srli_si128(vcountsum[i], 4));
      count[i] += _mm_cvtsi128_si32(vcountsum[i]);
    }
  }
  else
  {
    // scalar fallback for widths that are not a multiple of 16
    int x,y,edgeType;
    int8_t signDown;
    //middle lines
    for (y=1; y<endY; y++)
    {
      srcLineBelow = srcLine + srcStride;
      for(x=startX; x<endX; x++)
      {
        // sign towards the bottom-left neighbour of the 45-degree pattern
        signDown = (int8_t)sgn(srcLine[x] - srcLineBelow[x-1]);
        // signUpLine[x] holds the sign towards the top-right neighbour,
        // carried over from the previous line; edgeType is in [-2,2]
        edgeType = signDown + signUpLine[x];
        diff [edgeType] += (orgLine[x] - srcLine[x]);
        count[edgeType] ++;
        // propagate: next line sees -signDown as its "up" sign at x-1
        signUpLine[x-1] = -signDown;
      }
      // seed the rightmost carry for the next line
      signUpLine[endX-1] = (int8_t)sgn(srcLineBelow[endX-1] - srcLine[endX]);
      srcLine  += srcStride;
      orgLine  += orgStride;
    }
  }
}
Unexecuted instantiation: void vvenc::calcSaoStatisticsEo45_SIMD<(vvenc::x86_simd::X86_VEXT)1>(int, int, int, int, short*, short*, int, int, long*, long*, signed char*)
Unexecuted instantiation: void vvenc::calcSaoStatisticsEo45_SIMD<(vvenc::x86_simd::X86_VEXT)4>(int, int, int, int, short*, short*, int, int, long*, long*, signed char*)
1622
template <X86_VEXT vext>
1623
void SampleAdaptiveOffset::_initSampleAdaptiveOffsetX86()
1624
0
{
1625
0
  offsetBlock= offsetBlock_SIMD<vext>;
1626
0
  calcSaoStatisticsEo0 =  calcSaoStatisticsEo0_SIMD<vext>;
1627
0
  calcSaoStatisticsEo90 =  calcSaoStatisticsEo90_SIMD<vext>;
1628
0
  calcSaoStatisticsEo135 =  calcSaoStatisticsEo135_SIMD<vext>;
1629
0
  calcSaoStatisticsEo45 =  calcSaoStatisticsEo45_SIMD<vext>;
1630
0
  calcSaoStatisticsBo =  calcSaoStatisticsBo_SIMD<vext>;
1631
1632
0
}
Unexecuted instantiation: void vvenc::SampleAdaptiveOffset::_initSampleAdaptiveOffsetX86<(vvenc::x86_simd::X86_VEXT)1>()
Unexecuted instantiation: void vvenc::SampleAdaptiveOffset::_initSampleAdaptiveOffsetX86<(vvenc::x86_simd::X86_VEXT)4>()
1633
1634
// Explicit instantiation for the SIMD extension level selected at build time.
template void SampleAdaptiveOffset::_initSampleAdaptiveOffsetX86<SIMDX86>();
1635
1636
} // namespace vvenc
1637
1638
//! \}
1639
1640
#endif // TARGET_SIMD_X86
1641