Coverage Report

Created: 2026-04-01 06:18

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libhevc/common/x86/ihevc_sao_ssse3_intr.c
Line
Count
Source
1
/******************************************************************************
2
*
3
* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4
*
5
* Licensed under the Apache License, Version 2.0 (the "License");
6
* you may not use this file except in compliance with the License.
7
* You may obtain a copy of the License at:
8
*
9
* http://www.apache.org/licenses/LICENSE-2.0
10
*
11
* Unless required by applicable law or agreed to in writing, software
12
* distributed under the License is distributed on an "AS IS" BASIS,
13
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
* See the License for the specific language governing permissions and
15
* limitations under the License.
16
*
17
******************************************************************************/
18
/**
19
*******************************************************************************
20
* @file
21
*  ihevc_sao_atom_intr.c
22
*
23
* @brief
24
*  Contains function definitions for Sample adaptive offset(SAO) used in-loop
25
* filtering
26
*
27
* @author
28
* 100592
29
*
30
* @par List of Functions:
31
*   - ihevc_sao_band_offset_luma_ssse3()
32
*   - ihevc_sao_band_offset_chroma_ssse3()
33
*   - ihevc_sao_edge_offset_class0_ssse3()
34
*   - ihevc_sao_edge_offset_class0_chroma_ssse3()
35
*   - ihevc_sao_edge_offset_class1_ssse3()
36
*   - ihevc_sao_edge_offset_class1_chroma_ssse3()
37
*   - ihevc_sao_edge_offset_class2_ssse3()
38
*   - ihevc_sao_edge_offset_class2_chroma_ssse3()
39
*   - ihevc_sao_edge_offset_class3_ssse3()
40
*   - ihevc_sao_edge_offset_class3_chroma_ssse3()
41
*
42
* @remarks
43
*  None
44
*
45
*******************************************************************************
46
*/
47
/*****************************************************************************/
48
/* File Includes                                                             */
49
/*****************************************************************************/
50
#include <stdio.h>
51
52
#include "ihevc_typedefs.h"
53
#include "ihevc_platform_macros.h"
54
#include "ihevc_macros.h"
55
#include "ihevc_func_selector.h"
56
#include "ihevc_defs.h"
57
#include "ihevc_tables_x86_intr.h"
58
#include "ihevc_common_tables.h"
59
#include "ihevc_sao.h"
60
61
#include <immintrin.h>
62
63
#define NUM_BAND_TABLE  32
64
/**
65
*******************************************************************************
66
*
67
* @brief
68
* Has two sets of functions : band offset and edge offset both for luma and chroma
69
* edge offset has horizontal ,vertical, 135 degree and 45 degree
70
*
71
* @par Description:
72
*
73
*
74
* @param[in-out] pu1_src
75
*  Pointer to the source
76
*
77
* @param[in] src_strd
78
*  Source stride
79
*
80
* @param[in-out] pu1_src_left
81
*  source left boundary
82
*
83
* @param[in-out] pu1_src_top
84
* Source top boundary
85
*
86
* @param[in-out] pu1_src_top_left
87
*  Source top left boundary
88
*
89
* @param[in] pu1_src_top_right
90
*  Source top right boundary
91
*
92
* @param[in] pu1_src_bot_left
93
*  Source bottom left boundary
94
*
95
* @param[in] pu1_avail
96
*  boundary availability flags
97
*
98
* @param[in] pi1_sao_offset_u
99
*  Chroma U sao offset values
100
*
101
* @param[in] pi1_sao_offset_v
102
*  Chroma V sao offset values
103
*
104
* @param[in] pi1_sao_offset
105
*  Luma sao offset values
106
*
107
* @param[in] wd
108
*  width of the source
109
110
* @param[in] ht
111
*  height of the source
112
* @returns
113
*
114
* @remarks
115
*  None
116
*
117
*******************************************************************************
118
*/
119
120
121
void ihevc_sao_band_offset_luma_ssse3(UWORD8 *pu1_src,
122
                                      WORD32 src_strd,
123
                                      UWORD8 *pu1_src_left,
124
                                      UWORD8 *pu1_src_top,
125
                                      UWORD8 *pu1_src_top_left,
126
                                      WORD32 sao_band_pos,
127
                                      WORD8 *pi1_sao_offset,
128
                                      WORD32 wd,
129
                                      WORD32 ht)
130
187k
{
131
187k
    WORD32 row, col;
132
187k
    UWORD8 *pu1_src_cpy;
133
187k
    WORD32 wd_rem;
134
187k
    WORD8 offset = 0;
135
136
187k
    __m128i src_temp0_8x16b, src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b;
137
187k
    __m128i band_table0_8x16b, band_table1_8x16b, band_table2_8x16b, band_table3_8x16b;
138
187k
    __m128i tmp_set_128i_1, tmp_set_128i_2, tmp_set_128i_3, tmp_set_128i_4;
139
187k
    __m128i band_pos_16x8b;
140
187k
    __m128i sao_offset;
141
187k
    __m128i cmp_mask, cmp_store;
142
143
    /* Updating left and top-left and top */
144
5.89M
    for(row = 0; row < ht; row++)
145
5.70M
    {
146
5.70M
        pu1_src_left[row] = pu1_src[row * src_strd + (wd - 1)];
147
5.70M
    }
148
187k
    pu1_src_top_left[0] = pu1_src_top[wd - 1];
149
909k
    for(col = 0; col < wd; col += 8)
150
722k
    {
151
722k
        tmp_set_128i_1 = _mm_loadl_epi64((__m128i *)(pu1_src + (ht - 1) * src_strd + offset));
152
722k
        _mm_storel_epi64((__m128i *)(pu1_src_top + offset), tmp_set_128i_1);
153
722k
        offset += 8;
154
722k
    }
155
156
    //replicating sao_band_pos as 8 bit value 16 times
157
158
159
187k
    band_pos_16x8b = _mm_set1_epi16((WORD16)(sao_band_pos << 3));
160
    //value set for sao_offset extraction
161
187k
    tmp_set_128i_1  = _mm_set_epi8(-128, 1, -128, 1, -128, 1, -128, 1, -128, 1, -128, 1, -128, 1, -128, 1);
162
187k
    tmp_set_128i_2  = _mm_set_epi8(-128, 2, -128, 2, -128, 2, -128, 2, -128, 2, -128, 2, -128, 2, -128, 2);
163
187k
    tmp_set_128i_3  = _mm_set_epi8(-128, 3, -128, 3, -128, 3, -128, 3, -128, 3, -128, 3, -128, 3, -128, 3);
164
187k
    tmp_set_128i_4  = _mm_set_epi8(-128, 4, -128, 4, -128, 4, -128, 4, -128, 4, -128, 4, -128, 4, -128, 4);
165
166
    //loaded sao offset values
167
187k
    sao_offset      = _mm_loadl_epi64((__m128i *)pi1_sao_offset);
168
169
    //loading 16bit 32values of gu2_table_band_idx consecutively in 4 registers
170
187k
    band_table0_8x16b = _mm_load_si128((__m128i *)(gu2_table_band_idx));
171
187k
    band_table1_8x16b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 8));
172
187k
    band_table2_8x16b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 16));
173
187k
    band_table3_8x16b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 24));
174
175
    //band_position addition
176
187k
    band_table0_8x16b = _mm_add_epi16(band_table0_8x16b, band_pos_16x8b);
177
187k
    band_table1_8x16b = _mm_add_epi16(band_table1_8x16b, band_pos_16x8b);
178
187k
    band_table2_8x16b = _mm_add_epi16(band_table2_8x16b, band_pos_16x8b);
179
187k
    band_table3_8x16b = _mm_add_epi16(band_table3_8x16b, band_pos_16x8b);
180
    //sao_offset duplication
181
187k
    tmp_set_128i_1  = _mm_shuffle_epi8(sao_offset, tmp_set_128i_1);
182
187k
    tmp_set_128i_2  = _mm_shuffle_epi8(sao_offset, tmp_set_128i_2);
183
187k
    tmp_set_128i_3  = _mm_shuffle_epi8(sao_offset, tmp_set_128i_3);
184
187k
    tmp_set_128i_4  = _mm_shuffle_epi8(sao_offset, tmp_set_128i_4);
185
    //setting for comparison
186
187k
    cmp_mask = _mm_set1_epi16(16);
187
187k
    cmp_store = _mm_set1_epi16(0x00ff);
188
189
    //sao_offset addition
190
187k
    band_table0_8x16b = _mm_add_epi16(band_table0_8x16b, tmp_set_128i_1);
191
187k
    band_table1_8x16b = _mm_add_epi16(band_table1_8x16b, tmp_set_128i_2);
192
187k
    band_table2_8x16b = _mm_add_epi16(band_table2_8x16b, tmp_set_128i_3);
193
187k
    band_table3_8x16b = _mm_add_epi16(band_table3_8x16b, tmp_set_128i_4);
194
    //masking upper 8bit values of each  16 bit band table value
195
187k
    band_table0_8x16b = _mm_and_si128(band_table0_8x16b, cmp_store);
196
187k
    band_table1_8x16b = _mm_and_si128(band_table1_8x16b, cmp_store);
197
187k
    band_table2_8x16b = _mm_and_si128(band_table2_8x16b, cmp_store);
198
187k
    band_table3_8x16b = _mm_and_si128(band_table3_8x16b, cmp_store);
199
200
187k
    switch(sao_band_pos)
201
187k
    {
202
13.3k
        case 0:
203
13.3k
            tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table0_8x16b);
204
13.3k
            band_table0_8x16b = _mm_and_si128(band_table0_8x16b, tmp_set_128i_2);
205
13.3k
            break;
206
2.93k
        case 28:
207
2.93k
            tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table3_8x16b);
208
2.93k
            band_table3_8x16b = _mm_or_si128(band_table3_8x16b, tmp_set_128i_2);
209
2.93k
            break;
210
4.11k
        case 29:
211
4.11k
            tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table2_8x16b);
212
4.11k
            band_table2_8x16b = _mm_or_si128(band_table2_8x16b, tmp_set_128i_2);
213
4.11k
            tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table3_8x16b);
214
4.11k
            band_table3_8x16b = _mm_and_si128(band_table3_8x16b, tmp_set_128i_2);
215
4.11k
            break;
216
6.68k
        case 30:
217
6.68k
            tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table1_8x16b);
218
6.68k
            band_table1_8x16b = _mm_or_si128(band_table1_8x16b, tmp_set_128i_2);
219
6.68k
            tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table2_8x16b);
220
6.68k
            band_table2_8x16b = _mm_and_si128(band_table2_8x16b, tmp_set_128i_2);
221
6.68k
            break;
222
3.92k
        case 31:
223
3.92k
            tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table0_8x16b);
224
3.92k
            band_table0_8x16b = _mm_or_si128(band_table0_8x16b, tmp_set_128i_2);
225
3.92k
            tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table1_8x16b);
226
3.92k
            band_table1_8x16b = _mm_and_si128(band_table1_8x16b, tmp_set_128i_2);
227
3.92k
            break;
228
156k
        default:
229
156k
            break;
230
187k
    }
231
    //sao_offset is reused for zero cmp mask.
232
187k
    sao_offset = _mm_setzero_si128();
233
187k
    tmp_set_128i_1 = _mm_set1_epi8(1);
234
    //tmp_set_128i_2 = _mm_set_epi8 (128,7,128,6,128,5,128,4,128,3,128,2,128,1,128,0);
235
187k
    cmp_mask = _mm_packus_epi16(cmp_mask, cmp_mask); //cmp_msk=dup16(16);
236
237
    //masking upper 8bit values of each  16 bit band table value
238
187k
    band_table0_8x16b = _mm_and_si128(band_table0_8x16b, cmp_store);
239
187k
    band_table1_8x16b = _mm_and_si128(band_table1_8x16b, cmp_store);
240
187k
    band_table2_8x16b = _mm_and_si128(band_table2_8x16b, cmp_store);
241
187k
    band_table3_8x16b = _mm_and_si128(band_table3_8x16b, cmp_store);
242
243
    //band table 8x16 four registers are packed into 16x8 two registers:  band_table0_8x16b and band_table2_8x16b
244
187k
    band_table0_8x16b = _mm_packus_epi16(band_table0_8x16b, band_table1_8x16b);
245
187k
    band_table2_8x16b = _mm_packus_epi16(band_table2_8x16b, band_table3_8x16b);
246
247
187k
    band_table3_8x16b = _mm_slli_epi16(cmp_mask, 1); // to compare if value is greater than 31
248
187k
    band_pos_16x8b = _mm_packus_epi16(band_pos_16x8b, band_pos_16x8b); //band_pos is now 8 bit aligned
249
187k
    band_table3_8x16b = _mm_sub_epi8(band_table3_8x16b, tmp_set_128i_1); // to compare if value is greater than 31
250
251
187k
    cmp_mask = _mm_sub_epi8(cmp_mask, tmp_set_128i_1);
252
    //  band_pos_16x8b = _mm_or_si128(band_pos_16x8b,cmp_store);
253
254
457k
    for(col = wd; col >= 16; col -= 16)
255
270k
    {
256
270k
        pu1_src_cpy = pu1_src;
257
4.58M
        for(row = ht; row > 0; row -= 2)
258
4.31M
        {
259
260
261
            //row = 0 load 8 pixel values from 7:0 pos. relative to cur. pos.
262
4.31M
            src_temp0_8x16b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
263
            // row = 1
264
4.31M
            src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
265
266
267
268
            //saturated subtract 8 bit
269
4.31M
            tmp_set_128i_1 = _mm_sub_epi8(src_temp0_8x16b, band_pos_16x8b);
270
4.31M
            tmp_set_128i_3 = _mm_sub_epi8(src_temp2_8x16b, band_pos_16x8b);
271
            //if the values less than 0 put ff
272
4.31M
            tmp_set_128i_2 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_1);
273
4.31M
            tmp_set_128i_4 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_3);
274
4.31M
            tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
275
4.31M
            tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
276
            //if the values greater than 31 put ff
277
4.31M
            tmp_set_128i_2 = _mm_cmpgt_epi8(tmp_set_128i_1, band_table3_8x16b);
278
4.31M
            tmp_set_128i_4 = _mm_cmpgt_epi8(tmp_set_128i_3, band_table3_8x16b);
279
4.31M
            tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
280
4.31M
            tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
281
282
283
            //row 0 and row1
284
            //if the values >16 then put ff ,cmp_mask = dup16(15)
285
4.31M
            cmp_store = _mm_cmpgt_epi8(tmp_set_128i_1, cmp_mask);
286
            //values 16 to 31 for row 0 & 1 but values <16 ==0
287
4.31M
            tmp_set_128i_2 = _mm_and_si128(tmp_set_128i_1, cmp_store);
288
            // values 0 to 15 for row 0 & 1
289
4.31M
            tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, cmp_store);
290
            //values 16 to 31 for row 0 & 1 but values <16 masked to ff
291
4.31M
            cmp_store = _mm_cmpeq_epi8(tmp_set_128i_2, sao_offset);
292
4.31M
            tmp_set_128i_2 = _mm_or_si128(tmp_set_128i_2, cmp_store);
293
            //row 2 and  row 3
294
            //if the values >16 then put ff ,cmp_mask = dup16(15)
295
4.31M
            cmp_store = _mm_cmpgt_epi8(tmp_set_128i_3, cmp_mask);
296
            //values 16 to 31 for row 2 & 3 but values <16 ==0
297
4.31M
            tmp_set_128i_4 = _mm_and_si128(tmp_set_128i_3, cmp_store);
298
            // values 0 to 15 for row 2 & 3
299
4.31M
            tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, cmp_store);
300
            //values 16 to 31 for row 2 & 3 but values <16 masked to ff
301
4.31M
            cmp_store = _mm_cmpeq_epi8(tmp_set_128i_4, sao_offset);
302
4.31M
            tmp_set_128i_4 = _mm_or_si128(tmp_set_128i_4, cmp_store);
303
304
            //row 0 and row 1
305
            //to preserve pixel values in which no offset needs to be added.
306
4.31M
            cmp_store = _mm_cmpeq_epi8(tmp_set_128i_1, tmp_set_128i_2);
307
4.31M
            src_temp0_8x16b = _mm_and_si128(src_temp0_8x16b, cmp_store);
308
309
            //row 2 and row 3
310
            //to preserve pixel values in which no offset needs to be added.
311
4.31M
            cmp_store = _mm_cmpeq_epi8(tmp_set_128i_3, tmp_set_128i_4);
312
4.31M
            src_temp2_8x16b = _mm_and_si128(src_temp2_8x16b, cmp_store);
313
314
            //indexing 0 - 15 bandtable indexes
315
4.31M
            tmp_set_128i_1 = _mm_shuffle_epi8(band_table0_8x16b, tmp_set_128i_1);
316
4.31M
            tmp_set_128i_3 = _mm_shuffle_epi8(band_table0_8x16b, tmp_set_128i_3);
317
4.31M
            tmp_set_128i_2 = _mm_shuffle_epi8(band_table2_8x16b, tmp_set_128i_2);
318
4.31M
            tmp_set_128i_4 = _mm_shuffle_epi8(band_table2_8x16b, tmp_set_128i_4);
319
            // combining all offsets results
320
4.31M
            tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
321
4.31M
            tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
322
            // combining results with the pixel values
323
4.31M
            src_temp0_8x16b = _mm_or_si128(src_temp0_8x16b, tmp_set_128i_1);
324
4.31M
            src_temp2_8x16b = _mm_or_si128(src_temp2_8x16b, tmp_set_128i_3);
325
326
327
            //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
328
4.31M
            _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_8x16b);
329
            // row = 1
330
4.31M
            _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp2_8x16b);
331
332
4.31M
            pu1_src_cpy += (src_strd << 1);
333
4.31M
        }
334
270k
        pu1_src += 16;
335
270k
    }
336
187k
    wd_rem = wd & 0xF;
337
187k
    if(wd_rem)
338
181k
    {pu1_src_cpy = pu1_src;
339
1.56M
        for(row = ht; row > 0; row -= 4)
340
1.38M
        {
341
342
343
            //row = 0 load 8 pixel values from 7:0 pos. relative to cur. pos.
344
1.38M
            src_temp0_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
345
            // row = 1
346
1.38M
            src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
347
            // row = 2
348
1.38M
            src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));
349
            // row = 3
350
1.38M
            src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd));
351
            //row0 and row1 packed and row2 and row3 packed
352
353
1.38M
            src_temp0_8x16b = _mm_unpacklo_epi64(src_temp0_8x16b, src_temp1_8x16b);
354
1.38M
            src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp3_8x16b);
355
356
            //saturated subtract 8 bit
357
1.38M
            tmp_set_128i_1 = _mm_sub_epi8(src_temp0_8x16b, band_pos_16x8b);
358
1.38M
            tmp_set_128i_3 = _mm_sub_epi8(src_temp2_8x16b, band_pos_16x8b);
359
            //if the values less than 0 put ff
360
1.38M
            tmp_set_128i_2 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_1);
361
1.38M
            tmp_set_128i_4 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_3);
362
1.38M
            tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
363
1.38M
            tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
364
            //if the values greater than 31 put ff
365
1.38M
            tmp_set_128i_2 = _mm_cmpgt_epi8(tmp_set_128i_1, band_table3_8x16b);
366
1.38M
            tmp_set_128i_4 = _mm_cmpgt_epi8(tmp_set_128i_3, band_table3_8x16b);
367
1.38M
            tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
368
1.38M
            tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
369
370
371
372
            //row 0 and row1
373
            //if the values >16 then put ff ,cmp_mask = dup16(15)
374
1.38M
            cmp_store = _mm_cmpgt_epi8(tmp_set_128i_1, cmp_mask);
375
            //values 16 to 31 for row 0 & 1 but values <16 ==0
376
1.38M
            tmp_set_128i_2 = _mm_and_si128(tmp_set_128i_1, cmp_store);
377
            // values 0 to 15 for row 0 & 1
378
1.38M
            tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, cmp_store);
379
            //values 16 to 31 for row 0 & 1 but values <16 masked to ff
380
1.38M
            cmp_store = _mm_cmpeq_epi8(tmp_set_128i_2, sao_offset);
381
1.38M
            tmp_set_128i_2 = _mm_or_si128(tmp_set_128i_2, cmp_store);
382
            //row 2 and  row 3
383
            //if the values >16 then put ff ,cmp_mask = dup16(15)
384
1.38M
            cmp_store = _mm_cmpgt_epi8(tmp_set_128i_3, cmp_mask);
385
            //values 16 to 31 for row 2 & 3 but values <16 ==0
386
1.38M
            tmp_set_128i_4 = _mm_and_si128(tmp_set_128i_3, cmp_store);
387
            // values 0 to 15 for row 2 & 3
388
1.38M
            tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, cmp_store);
389
            //values 16 to 31 for row 2 & 3 but values <16 masked to ff
390
1.38M
            cmp_store = _mm_cmpeq_epi8(tmp_set_128i_4, sao_offset);
391
1.38M
            tmp_set_128i_4 = _mm_or_si128(tmp_set_128i_4, cmp_store);
392
393
            //row 0 and row 1
394
            //to preserve pixel values in which no offset needs to be added.
395
1.38M
            cmp_store = _mm_cmpeq_epi8(tmp_set_128i_1, tmp_set_128i_2);
396
1.38M
            src_temp0_8x16b = _mm_and_si128(src_temp0_8x16b, cmp_store);
397
398
            //row 2 and row 3
399
            //to preserve pixel values in which no offset needs to be added.
400
1.38M
            cmp_store = _mm_cmpeq_epi8(tmp_set_128i_3, tmp_set_128i_4);
401
1.38M
            src_temp2_8x16b = _mm_and_si128(src_temp2_8x16b, cmp_store);
402
403
            //indexing 0 - 15 bandtable indexes
404
1.38M
            tmp_set_128i_1 = _mm_shuffle_epi8(band_table0_8x16b, tmp_set_128i_1);
405
1.38M
            tmp_set_128i_3 = _mm_shuffle_epi8(band_table0_8x16b, tmp_set_128i_3);
406
1.38M
            tmp_set_128i_2 = _mm_shuffle_epi8(band_table2_8x16b, tmp_set_128i_2);
407
1.38M
            tmp_set_128i_4 = _mm_shuffle_epi8(band_table2_8x16b, tmp_set_128i_4);
408
            // combining all offsets results
409
1.38M
            tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
410
1.38M
            tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
411
            // combining results with the pixel values
412
1.38M
            src_temp0_8x16b = _mm_or_si128(src_temp0_8x16b, tmp_set_128i_1);
413
1.38M
            src_temp2_8x16b = _mm_or_si128(src_temp2_8x16b, tmp_set_128i_3);
414
415
            //Getting row1 separately
416
1.38M
            src_temp1_8x16b = _mm_srli_si128(src_temp0_8x16b, 8);
417
            //Getting row3 separately
418
1.38M
            src_temp3_8x16b = _mm_srli_si128(src_temp2_8x16b, 8);
419
420
            //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
421
1.38M
            _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_8x16b);
422
            // row = 1
423
1.38M
            _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), src_temp1_8x16b);
424
            // row = 2
425
1.38M
            _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_temp2_8x16b);
426
            // row = 3
427
1.38M
            _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), src_temp3_8x16b);
428
429
1.38M
            pu1_src_cpy += (src_strd << 2);
430
431
1.38M
        }
432
181k
        pu1_src += 8;
433
181k
    }
434
435
436
187k
}
437
438
void ihevc_sao_band_offset_chroma_ssse3(UWORD8 *pu1_src,
439
                                        WORD32 src_strd,
440
                                        UWORD8 *pu1_src_left,
441
                                        UWORD8 *pu1_src_top,
442
                                        UWORD8 *pu1_src_top_left,
443
                                        WORD32 sao_band_pos_u,
444
                                        WORD32 sao_band_pos_v,
445
                                        WORD8 *pi1_sao_offset_u,
446
                                        WORD8 *pi1_sao_offset_v,
447
                                        WORD32 wd,
448
                                        WORD32 ht)
449
119k
{
450
119k
    WORD32 row, col;
451
119k
    WORD8 offset = 0;
452
453
454
119k
    __m128i src_temp0_8x16b, src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b;
455
119k
    __m128i cmp_msk2;
456
119k
    __m128i band_table0_16x8b, band_table1_16x8b, band_table2_16x8b, band_table3_16x8b;
457
119k
    __m128i tmp_set_128i_1, tmp_set_128i_2, tmp_set_128i_3, tmp_set_128i_4;
458
119k
    __m128i band_pos_u_16x8b, band_pos_v_16x8b;
459
119k
    __m128i sao_offset;
460
119k
    __m128i cmp_mask;
461
462
463
    /* Updating left and top and top-left */
464
1.94M
    for(row = 0; row < ht; row++)
465
1.82M
    {
466
1.82M
        pu1_src_left[2 * row] = pu1_src[row * src_strd + (wd - 2)];
467
1.82M
        pu1_src_left[2 * row + 1] = pu1_src[row * src_strd + (wd - 1)];
468
1.82M
    }
469
119k
    pu1_src_top_left[0] = pu1_src_top[wd - 2];
470
119k
    pu1_src_top_left[1] = pu1_src_top[wd - 1];
471
576k
    for(col = 0; col < wd; col += 8)
472
456k
    {
473
456k
        tmp_set_128i_1 = _mm_loadl_epi64((__m128i *)(pu1_src + (ht - 1) * src_strd + offset));
474
456k
        _mm_storel_epi64((__m128i *)(pu1_src_top + offset), tmp_set_128i_1);
475
456k
        offset += 8;
476
456k
    }
477
478
119k
    { // band _table creation
479
119k
        __m128i temp0_8x16b, temp1_8x16b, temp2_8x16b, temp3_8x16b;
480
        // Band table for U component : band_table0_16x8b and band_table2_16x8b
481
        //replicating sao_band_pos as 8 bit value 16 times
482
119k
        band_pos_u_16x8b = _mm_set1_epi16((WORD16)(sao_band_pos_u << 3));
483
        //value set for sao_offset extraction
484
119k
        tmp_set_128i_1  = _mm_set_epi8(-128, 1, -128, 1, -128, 1, -128, 1, -128, 1, -128, 1, -128, 1, -128, 1);
485
119k
        tmp_set_128i_2  = _mm_set_epi8(-128, 2, -128, 2, -128, 2, -128, 2, -128, 2, -128, 2, -128, 2, -128, 2);
486
119k
        tmp_set_128i_3  = _mm_set_epi8(-128, 3, -128, 3, -128, 3, -128, 3, -128, 3, -128, 3, -128, 3, -128, 3);
487
119k
        tmp_set_128i_4  = _mm_set_epi8(-128, 4, -128, 4, -128, 4, -128, 4, -128, 4, -128, 4, -128, 4, -128, 4);
488
489
        //loaded sao offset values
490
119k
        sao_offset      = _mm_loadl_epi64((__m128i *)pi1_sao_offset_u);
491
492
        //loading 16bit 32values of gu2_table_band_idx consecutively in 4 registers
493
119k
        band_table0_16x8b = _mm_load_si128((__m128i *)(gu2_table_band_idx));
494
119k
        band_table1_16x8b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 8));
495
119k
        band_table2_16x8b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 16));
496
119k
        band_table3_16x8b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 24));
497
498
        //band_position addition
499
119k
        band_table0_16x8b = _mm_add_epi16(band_table0_16x8b, band_pos_u_16x8b);
500
119k
        band_table1_16x8b = _mm_add_epi16(band_table1_16x8b, band_pos_u_16x8b);
501
119k
        band_table2_16x8b = _mm_add_epi16(band_table2_16x8b, band_pos_u_16x8b);
502
119k
        band_table3_16x8b = _mm_add_epi16(band_table3_16x8b, band_pos_u_16x8b);
503
        //sao_offset duplication
504
119k
        temp0_8x16b = _mm_shuffle_epi8(sao_offset, tmp_set_128i_1);
505
119k
        temp1_8x16b = _mm_shuffle_epi8(sao_offset, tmp_set_128i_2);
506
119k
        temp2_8x16b = _mm_shuffle_epi8(sao_offset, tmp_set_128i_3);
507
119k
        temp3_8x16b = _mm_shuffle_epi8(sao_offset, tmp_set_128i_4);
508
509
        //sao_offset addition
510
119k
        band_table0_16x8b = _mm_add_epi16(band_table0_16x8b, temp0_8x16b);
511
119k
        band_table1_16x8b = _mm_add_epi16(band_table1_16x8b, temp1_8x16b);
512
119k
        band_table2_16x8b = _mm_add_epi16(band_table2_16x8b, temp2_8x16b);
513
119k
        band_table3_16x8b = _mm_add_epi16(band_table3_16x8b, temp3_8x16b);
514
        //reuse for clipping
515
119k
        temp1_8x16b = _mm_set1_epi16(0x00ff);
516
        //setting for comparison
517
119k
        cmp_mask = _mm_set1_epi16(16);
518
519
        //masking upper 8bit values of each  16 bit band table value
520
119k
        band_table0_16x8b = _mm_and_si128(band_table0_16x8b, temp1_8x16b);
521
119k
        band_table1_16x8b = _mm_and_si128(band_table1_16x8b, temp1_8x16b);
522
119k
        band_table2_16x8b = _mm_and_si128(band_table2_16x8b, temp1_8x16b);
523
119k
        band_table3_16x8b = _mm_and_si128(band_table3_16x8b, temp1_8x16b);
524
525
        //temp1_8x16b reuse for compare storage
526
119k
        switch(sao_band_pos_u)
527
119k
        {
528
11.3k
            case 0:
529
11.3k
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table0_16x8b);
530
11.3k
                band_table0_16x8b = _mm_and_si128(band_table0_16x8b, temp3_8x16b);
531
11.3k
                break;
532
3.55k
            case 28:
533
3.55k
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table3_16x8b);
534
3.55k
                band_table3_16x8b = _mm_or_si128(band_table3_16x8b, temp3_8x16b);
535
3.55k
                break;
536
3.18k
            case 29:
537
3.18k
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table2_16x8b);
538
3.18k
                band_table2_16x8b = _mm_or_si128(band_table2_16x8b, temp3_8x16b);
539
3.18k
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table3_16x8b);
540
3.18k
                band_table3_16x8b = _mm_and_si128(band_table3_16x8b, temp3_8x16b);
541
3.18k
                break;
542
2.80k
            case 30:
543
2.80k
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table1_16x8b);
544
2.80k
                band_table1_16x8b = _mm_or_si128(band_table1_16x8b, temp3_8x16b);
545
2.80k
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table2_16x8b);
546
2.80k
                band_table2_16x8b = _mm_and_si128(band_table2_16x8b, temp3_8x16b);
547
2.80k
                break;
548
2.23k
            case 31:
549
2.23k
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table0_16x8b);
550
2.23k
                band_table0_16x8b = _mm_or_si128(band_table0_16x8b, temp3_8x16b);
551
2.23k
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table1_16x8b);
552
2.23k
                band_table1_16x8b = _mm_and_si128(band_table1_16x8b, temp3_8x16b);
553
2.23k
                break;
554
96.5k
            default:
555
96.5k
                break;
556
119k
        }
557
        //masking upper 8bit values of each  16 bit band table value
558
119k
        band_table0_16x8b = _mm_and_si128(band_table0_16x8b, temp1_8x16b);
559
119k
        band_table1_16x8b = _mm_and_si128(band_table1_16x8b, temp1_8x16b);
560
119k
        band_table2_16x8b = _mm_and_si128(band_table2_16x8b, temp1_8x16b);
561
119k
        band_table3_16x8b = _mm_and_si128(band_table3_16x8b, temp1_8x16b);
562
        //band table 8x16 four registers are packed into 16x8 two registers:  band_table0_8x16b and band_table2_8x16b
563
119k
        band_table0_16x8b = _mm_packus_epi16(band_table0_16x8b, band_table1_16x8b);
564
119k
        band_table2_16x8b = _mm_packus_epi16(band_table2_16x8b, band_table3_16x8b);
565
        // Band table for U component over
566
567
        // Band table for V component : band_table1_16x8b and band_table3_16x8b
568
        // replicating sao_band_pos as 8 bit value 16 times
569
119k
        band_pos_v_16x8b = _mm_set1_epi16((WORD16)(sao_band_pos_v << 3));
570
571
        //loaded sao offset values
572
119k
        sao_offset      = _mm_loadl_epi64((__m128i *)pi1_sao_offset_v);
573
574
        //loading 16bit 32values of gu2_table_band_idx consecutively in 4 registers
575
119k
        temp0_8x16b = _mm_load_si128((__m128i *)(gu2_table_band_idx));
576
119k
        band_table1_16x8b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 8));
577
119k
        temp2_8x16b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 16));
578
119k
        band_table3_16x8b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 24));
579
580
        //band_position addition
581
119k
        temp0_8x16b = _mm_add_epi16(temp0_8x16b, band_pos_v_16x8b);
582
119k
        band_table1_16x8b = _mm_add_epi16(band_table1_16x8b, band_pos_v_16x8b);
583
119k
        temp2_8x16b = _mm_add_epi16(temp2_8x16b, band_pos_v_16x8b);
584
119k
        band_table3_16x8b = _mm_add_epi16(band_table3_16x8b, band_pos_v_16x8b);
585
        //sao_offset duplication
586
119k
        tmp_set_128i_1  = _mm_shuffle_epi8(sao_offset, tmp_set_128i_1);
587
119k
        tmp_set_128i_2  = _mm_shuffle_epi8(sao_offset, tmp_set_128i_2);
588
119k
        tmp_set_128i_3  = _mm_shuffle_epi8(sao_offset, tmp_set_128i_3);
589
119k
        tmp_set_128i_4  = _mm_shuffle_epi8(sao_offset, tmp_set_128i_4);
590
591
        //sao_offset addition
592
119k
        temp0_8x16b = _mm_add_epi16(temp0_8x16b, tmp_set_128i_1);
593
119k
        band_table1_16x8b = _mm_add_epi16(band_table1_16x8b, tmp_set_128i_2);
594
119k
        temp2_8x16b = _mm_add_epi16(temp2_8x16b, tmp_set_128i_3);
595
119k
        band_table3_16x8b = _mm_add_epi16(band_table3_16x8b, tmp_set_128i_4);
596
597
        //masking upper 8bit values of 16 bit band table value
598
119k
        temp0_8x16b = _mm_and_si128(temp0_8x16b, temp1_8x16b);
599
119k
        band_table1_16x8b = _mm_and_si128(band_table1_16x8b, temp1_8x16b);
600
119k
        temp2_8x16b = _mm_and_si128(temp2_8x16b, temp1_8x16b);
601
119k
        band_table3_16x8b = _mm_and_si128(band_table3_16x8b, temp1_8x16b);
602
        //temp1_8x16b reuse for compare storage
603
604
119k
        switch(sao_band_pos_v)
605
119k
        {
606
11.3k
            case 0:
607
11.3k
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, temp0_8x16b);
608
11.3k
                temp0_8x16b = _mm_and_si128(temp0_8x16b, temp3_8x16b);
609
11.3k
                break;
610
2.79k
            case 28:
611
2.79k
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table3_16x8b);
612
2.79k
                band_table3_16x8b = _mm_or_si128(band_table3_16x8b, temp3_8x16b);
613
2.79k
                break;
614
3.42k
            case 29:
615
3.42k
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, temp2_8x16b);
616
3.42k
                temp2_8x16b = _mm_or_si128(temp2_8x16b, temp3_8x16b);
617
3.42k
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table3_16x8b);
618
3.42k
                band_table3_16x8b = _mm_and_si128(band_table3_16x8b, temp3_8x16b);
619
3.42k
                break;
620
2.85k
            case 30:
621
2.85k
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table1_16x8b);
622
2.85k
                band_table1_16x8b = _mm_or_si128(band_table1_16x8b, temp3_8x16b);
623
2.85k
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, temp2_8x16b);
624
2.85k
                temp2_8x16b = _mm_and_si128(temp2_8x16b, temp3_8x16b);
625
2.85k
                break;
626
2.59k
            case 31:
627
2.59k
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, temp0_8x16b);
628
2.59k
                temp0_8x16b = _mm_or_si128(temp0_8x16b, temp3_8x16b);
629
2.59k
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table1_16x8b);
630
2.59k
                band_table1_16x8b = _mm_and_si128(band_table1_16x8b, temp3_8x16b);
631
2.59k
                break;
632
96.6k
            default:
633
96.6k
                break;
634
119k
        }
635
        //masking upper 8bit values of each  16 bit band table value
636
119k
        temp0_8x16b = _mm_and_si128(temp0_8x16b, temp1_8x16b);
637
119k
        band_table1_16x8b = _mm_and_si128(band_table1_16x8b, temp1_8x16b);
638
119k
        temp2_8x16b = _mm_and_si128(temp2_8x16b, temp1_8x16b);
639
119k
        band_table3_16x8b = _mm_and_si128(band_table3_16x8b, temp1_8x16b);
640
        //band table 8x16 four registers are packed into 16x8 two registers:  band_table0_8x16b and band_table2_8x16b
641
119k
        band_table1_16x8b = _mm_packus_epi16(temp0_8x16b, band_table1_16x8b);
642
119k
        band_table3_16x8b = _mm_packus_epi16(temp2_8x16b, band_table3_16x8b);
643
        //band table for u and v created
644
119k
    }
645
0
    {
646
119k
        UWORD8 *pu1_src_cpy;
647
119k
        WORD32 wd_rem;
648
649
650
        //sao_offset is reused for zero cmp mask.
651
119k
        sao_offset = _mm_setzero_si128();
652
119k
        tmp_set_128i_1 = _mm_set1_epi8(1);
653
        //tmp_set_128i_2 = _mm_set_epi8 (128,7,128,6,128,5,128,4,128,3,128,2,128,1,128,0);
654
119k
        cmp_mask = _mm_packus_epi16(cmp_mask, cmp_mask); //cmp_msk=dup16(16);
655
        //to avoid ffff to be saturated to 0 instead it should be to ff
656
657
119k
        cmp_msk2 = _mm_slli_epi16(cmp_mask, 1); // to compare if value is greater than 31
658
119k
        band_pos_u_16x8b = _mm_packus_epi16(band_pos_u_16x8b, band_pos_u_16x8b); //band_pos_u is now 8 bit aligned
659
119k
        band_pos_v_16x8b = _mm_packus_epi16(band_pos_v_16x8b, band_pos_v_16x8b); //band_pos_v is now 8 bit aligned
660
119k
        cmp_msk2 = _mm_sub_epi8(cmp_msk2, tmp_set_128i_1); // to compare if value is greater than 31
661
662
119k
        cmp_mask = _mm_sub_epi8(cmp_mask, tmp_set_128i_1);
663
664
346k
        for(col = wd; col >= 16; col -= 16)
665
226k
        {
666
226k
            pu1_src_cpy = pu1_src;
667
2.01M
            for(row = ht; row > 0; row -= 2)
668
1.78M
            {
669
                //row = 0 load 8 pixel values from 7:0 pos. relative to cur. pos.
670
1.78M
                src_temp0_8x16b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
671
                // row = 1
672
1.78M
                src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
673
674
675
                //odd values
676
1.78M
                src_temp1_8x16b = _mm_srli_epi16(src_temp0_8x16b, 8);
677
1.78M
                src_temp2_8x16b = _mm_srli_epi16(src_temp3_8x16b, 8);
678
                //even values
679
1.78M
                src_temp0_8x16b = _mm_slli_epi16(src_temp0_8x16b, 8);
680
1.78M
                src_temp3_8x16b = _mm_slli_epi16(src_temp3_8x16b, 8);
681
1.78M
                src_temp0_8x16b = _mm_srli_epi16(src_temp0_8x16b, 8);
682
1.78M
                src_temp3_8x16b = _mm_srli_epi16(src_temp3_8x16b, 8);
683
                //combining odd values
684
1.78M
                src_temp2_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp2_8x16b);
685
                //combining even values
686
1.78M
                src_temp0_8x16b = _mm_packus_epi16(src_temp0_8x16b, src_temp3_8x16b);
687
688
                //saturated subtract 8 bit
689
1.78M
                tmp_set_128i_1 = _mm_sub_epi8(src_temp0_8x16b, band_pos_u_16x8b);
690
1.78M
                tmp_set_128i_3 = _mm_sub_epi8(src_temp2_8x16b, band_pos_v_16x8b);
691
                //if the values less than 0 put ff
692
1.78M
                tmp_set_128i_2 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_1);
693
1.78M
                tmp_set_128i_4 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_3);
694
1.78M
                tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
695
1.78M
                tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
696
                //if the values greater than 31 put ff
697
1.78M
                tmp_set_128i_2 = _mm_cmpgt_epi8(tmp_set_128i_1, cmp_msk2);
698
1.78M
                tmp_set_128i_4 = _mm_cmpgt_epi8(tmp_set_128i_3, cmp_msk2);
699
1.78M
                tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
700
1.78M
                tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
701
                // registers reused to increase performance
702
                //if the values >16 then put ff ,cmp_mask = dup16(15) row 0 and row1
703
1.78M
                src_temp1_8x16b = _mm_cmpgt_epi8(tmp_set_128i_1, cmp_mask);
704
                //if the values >16 then put ff ,cmp_mask = dup16(15) row 2 and  row 3
705
1.78M
                src_temp3_8x16b = _mm_cmpgt_epi8(tmp_set_128i_3, cmp_mask);
706
707
                //values 16 to 31 for row 0 & 1 but values <16 ==0
708
1.78M
                tmp_set_128i_2 = _mm_and_si128(tmp_set_128i_1, src_temp1_8x16b);
709
                // values 0 to 15 for row 0 & 1
710
1.78M
                tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, src_temp1_8x16b);
711
                //values 16 to 31 for row 2 & 3 but values <16 ==0
712
1.78M
                tmp_set_128i_4 = _mm_and_si128(tmp_set_128i_3, src_temp3_8x16b);
713
                // values 0 to 15 for row 2 & 3
714
1.78M
                tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, src_temp3_8x16b);
715
716
                //values 16 to 31 for row 0 & 1 but values <16 masked to ff row 0 and row1
717
1.78M
                src_temp1_8x16b = _mm_cmpeq_epi8(tmp_set_128i_2, sao_offset);
718
                //values 16 to 31 for row 0 & 1 but values <16 masked to ff row 2 and  row 3
719
1.78M
                src_temp3_8x16b = _mm_cmpeq_epi8(tmp_set_128i_4, sao_offset);
720
1.78M
                tmp_set_128i_2 = _mm_or_si128(tmp_set_128i_2, src_temp1_8x16b);
721
1.78M
                tmp_set_128i_4 = _mm_or_si128(tmp_set_128i_4, src_temp3_8x16b);
722
723
724
                //to choose which pixel values to preserve in row 0 and row 1
725
1.78M
                src_temp1_8x16b = _mm_cmpeq_epi8(tmp_set_128i_1, tmp_set_128i_2);
726
                //to choose which pixel values to preserve in row 2 and row 3
727
1.78M
                src_temp3_8x16b = _mm_cmpeq_epi8(tmp_set_128i_3, tmp_set_128i_4);
728
                //values of all rows to which no offset needs to be added preserved.
729
1.78M
                src_temp0_8x16b = _mm_and_si128(src_temp0_8x16b, src_temp1_8x16b);
730
1.78M
                src_temp2_8x16b = _mm_and_si128(src_temp2_8x16b, src_temp3_8x16b);
731
732
                //indexing 0 - 15 bandtable indexes
733
1.78M
                tmp_set_128i_1 = _mm_shuffle_epi8(band_table0_16x8b, tmp_set_128i_1); //U low
734
1.78M
                tmp_set_128i_3 = _mm_shuffle_epi8(band_table1_16x8b, tmp_set_128i_3); //V low
735
                //indexing 16 -31 bandtable indexes
736
1.78M
                tmp_set_128i_2 = _mm_shuffle_epi8(band_table2_16x8b, tmp_set_128i_2); //U high
737
1.78M
                tmp_set_128i_4 = _mm_shuffle_epi8(band_table3_16x8b, tmp_set_128i_4); //V high
738
                // combining all offsets results
739
1.78M
                tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2); //U
740
1.78M
                tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4); //V
741
                // combining results with the pixel values
742
1.78M
                src_temp0_8x16b = _mm_or_si128(src_temp0_8x16b, tmp_set_128i_1);
743
1.78M
                src_temp2_8x16b = _mm_or_si128(src_temp2_8x16b, tmp_set_128i_3);
744
                //reorganising even and odd values
745
1.78M
                src_temp1_8x16b = _mm_unpacklo_epi8(src_temp0_8x16b, src_temp2_8x16b);
746
1.78M
                src_temp3_8x16b = _mm_unpackhi_epi8(src_temp0_8x16b, src_temp2_8x16b);
747
748
749
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
750
1.78M
                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp1_8x16b);
751
                // row = 1
752
1.78M
                _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp3_8x16b);
753
754
755
1.78M
                pu1_src_cpy += (src_strd << 1);
756
757
1.78M
            }
758
226k
            pu1_src += 16;
759
226k
        }
760
761
119k
        wd_rem = wd & 0xF;
762
119k
        if(wd_rem)
763
3.39k
        {
764
3.39k
            pu1_src_cpy = pu1_src;
765
13.0k
            for(row = ht; row > 0; row -= 4)
766
9.65k
            {
767
                //row = 0 load 8 pixel values from 7:0 pos. relative to cur. pos.
768
9.65k
                src_temp0_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
769
                // row = 1
770
9.65k
                src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
771
                // row = 2
772
9.65k
                src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));
773
                // row = 3
774
9.65k
                src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd));
775
                //row0 and row1 packed and row2 and row3 packed
776
777
9.65k
                src_temp0_8x16b = _mm_unpacklo_epi64(src_temp0_8x16b, src_temp1_8x16b);
778
9.65k
                src_temp3_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp3_8x16b);
779
                //odd values
780
9.65k
                src_temp1_8x16b = _mm_srli_epi16(src_temp0_8x16b, 8);
781
9.65k
                src_temp2_8x16b = _mm_srli_epi16(src_temp3_8x16b, 8);
782
                //even values
783
9.65k
                src_temp0_8x16b = _mm_slli_epi16(src_temp0_8x16b, 8);
784
9.65k
                src_temp3_8x16b = _mm_slli_epi16(src_temp3_8x16b, 8);
785
9.65k
                src_temp0_8x16b = _mm_srli_epi16(src_temp0_8x16b, 8);
786
9.65k
                src_temp3_8x16b = _mm_srli_epi16(src_temp3_8x16b, 8);
787
                //combining odd values
788
9.65k
                src_temp2_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp2_8x16b);
789
                //combining even values
790
9.65k
                src_temp0_8x16b = _mm_packus_epi16(src_temp0_8x16b, src_temp3_8x16b);
791
792
                //saturated subtract 8 bit
793
9.65k
                tmp_set_128i_1 = _mm_sub_epi8(src_temp0_8x16b, band_pos_u_16x8b);
794
9.65k
                tmp_set_128i_3 = _mm_sub_epi8(src_temp2_8x16b, band_pos_v_16x8b);
795
                //if the values less than 0 put ff
796
9.65k
                tmp_set_128i_2 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_1);
797
9.65k
                tmp_set_128i_4 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_3);
798
9.65k
                tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
799
9.65k
                tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
800
                //if the values greater than 31 put ff
801
9.65k
                tmp_set_128i_2 = _mm_cmpgt_epi8(tmp_set_128i_1, cmp_msk2);
802
9.65k
                tmp_set_128i_4 = _mm_cmpgt_epi8(tmp_set_128i_3, cmp_msk2);
803
9.65k
                tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
804
9.65k
                tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
805
                // registers reused to increase performance
806
                //if the values >16 then put ff ,cmp_mask = dup16(15) row 0 and row1
807
9.65k
                src_temp1_8x16b = _mm_cmpgt_epi8(tmp_set_128i_1, cmp_mask);
808
                //if the values >16 then put ff ,cmp_mask = dup16(15) row 2 and  row 3
809
9.65k
                src_temp3_8x16b = _mm_cmpgt_epi8(tmp_set_128i_3, cmp_mask);
810
811
                //values 16 to 31 for row 0 & 1 but values <16 ==0
812
9.65k
                tmp_set_128i_2 = _mm_and_si128(tmp_set_128i_1, src_temp1_8x16b);
813
                // values 0 to 15 for row 0 & 1
814
9.65k
                tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, src_temp1_8x16b);
815
                //values 16 to 31 for row 2 & 3 but values <16 ==0
816
9.65k
                tmp_set_128i_4 = _mm_and_si128(tmp_set_128i_3, src_temp3_8x16b);
817
                // values 0 to 15 for row 2 & 3
818
9.65k
                tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, src_temp3_8x16b);
819
820
                //values 16 to 31 for row 0 & 1 but values <16 masked to ff row 0 and row1
821
9.65k
                src_temp1_8x16b = _mm_cmpeq_epi8(tmp_set_128i_2, sao_offset);
822
                //values 16 to 31 for row 0 & 1 but values <16 masked to ff row 2 and  row 3
823
9.65k
                src_temp3_8x16b = _mm_cmpeq_epi8(tmp_set_128i_4, sao_offset);
824
9.65k
                tmp_set_128i_2 = _mm_or_si128(tmp_set_128i_2, src_temp1_8x16b);
825
9.65k
                tmp_set_128i_4 = _mm_or_si128(tmp_set_128i_4, src_temp3_8x16b);
826
827
828
                //to choose which pixel values to preserve in row 0 and row 1
829
9.65k
                src_temp1_8x16b = _mm_cmpeq_epi8(tmp_set_128i_1, tmp_set_128i_2);
830
                //to choose which pixel values to preserve in row 2 and row 3
831
9.65k
                src_temp3_8x16b = _mm_cmpeq_epi8(tmp_set_128i_3, tmp_set_128i_4);
832
                //values of all rows to which no offset needs to be added preserved.
833
9.65k
                src_temp0_8x16b = _mm_and_si128(src_temp0_8x16b, src_temp1_8x16b);
834
9.65k
                src_temp2_8x16b = _mm_and_si128(src_temp2_8x16b, src_temp3_8x16b);
835
836
                //indexing 0 - 15 bandtable indexes
837
9.65k
                tmp_set_128i_1 = _mm_shuffle_epi8(band_table0_16x8b, tmp_set_128i_1); //U low
838
9.65k
                tmp_set_128i_3 = _mm_shuffle_epi8(band_table1_16x8b, tmp_set_128i_3); //V low
839
                //indexing 16 -31 bandtable indexes
840
9.65k
                tmp_set_128i_2 = _mm_shuffle_epi8(band_table2_16x8b, tmp_set_128i_2); //U high
841
9.65k
                tmp_set_128i_4 = _mm_shuffle_epi8(band_table3_16x8b, tmp_set_128i_4); //V high
842
                // combining all offsets results
843
9.65k
                tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2); //U
844
9.65k
                tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4); //V
845
                // combining results with the pixel values
846
9.65k
                src_temp0_8x16b = _mm_or_si128(src_temp0_8x16b, tmp_set_128i_1);
847
9.65k
                src_temp2_8x16b = _mm_or_si128(src_temp2_8x16b, tmp_set_128i_3);
848
                //reorganising even and odd values
849
9.65k
                src_temp1_8x16b = _mm_unpacklo_epi8(src_temp0_8x16b, src_temp2_8x16b);
850
9.65k
                src_temp3_8x16b = _mm_unpackhi_epi8(src_temp0_8x16b, src_temp2_8x16b);
851
                //Getting row1 separately
852
9.65k
                src_temp0_8x16b = _mm_srli_si128(src_temp1_8x16b, 8);
853
                //Getting row3 separately
854
9.65k
                src_temp2_8x16b = _mm_srli_si128(src_temp3_8x16b, 8);
855
856
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
857
9.65k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp1_8x16b);
858
                // row = 1
859
9.65k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), src_temp0_8x16b);
860
                // row = 2
861
9.65k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_temp3_8x16b);
862
                // row = 3
863
9.65k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), src_temp2_8x16b);
864
865
9.65k
                pu1_src_cpy += (src_strd << 2);
866
867
9.65k
            }
868
3.39k
            pu1_src += 16;
869
3.39k
        }
870
871
872
119k
    }
873
119k
}
874
875
876
877
void ihevc_sao_edge_offset_class0_ssse3(UWORD8 *pu1_src,
878
                                        WORD32 src_strd,
879
                                        UWORD8 *pu1_src_left,
880
                                        UWORD8 *pu1_src_top,
881
                                        UWORD8 *pu1_src_top_left,
882
                                        UWORD8 *pu1_src_top_right,
883
                                        UWORD8 *pu1_src_bot_left,
884
                                        UWORD8 *pu1_avail,
885
                                        WORD8 *pi1_sao_offset,
886
                                        WORD32 wd,
887
                                        WORD32 ht)
888
47.2k
{
889
47.2k
    WORD32 row, col;
890
47.2k
    UWORD8 *pu1_src_cpy, *pu1_src_left_cpy, *pu1_src_left_str, *pu1_left_tmp;
891
47.2k
    UWORD8 au1_mask[MAX_CTB_SIZE], *au1_mask_cpy;
892
47.2k
    UWORD8 au1_src_left_tmp[MAX_CTB_SIZE + 8];
893
47.2k
    UWORD8 au1_src_left_tmp1[MAX_CTB_SIZE + 8];
894
47.2k
    UWORD8 u1_avail0, u1_avail1;
895
47.2k
    WORD32 wd_rem;
896
47.2k
    WORD32 offset = 0;
897
47.2k
    __m128i src_temp0_16x8b, src_temp1_16x8b;
898
47.2k
    __m128i left0_16x8b, left1_16x8b;
899
47.2k
    __m128i cmp_gt0_16x8b, cmp_lt0_16x8b, cmp_gt1_16x8b, cmp_lt1_16x8b;
900
47.2k
    __m128i edge0_16x8b, edge1_16x8b;
901
47.2k
    __m128i au1_mask8x16b;
902
47.2k
    __m128i edge_idx_8x16b, sao_offset_8x16b;
903
47.2k
    __m128i const2_16x8b, const0_16x8b;
904
47.2k
    __m128i left_store_16x8b;
905
47.2k
    UNUSED(pu1_src_top_right);
906
47.2k
    UNUSED(pu1_src_bot_left);
907
908
47.2k
    au1_mask8x16b = _mm_set1_epi8(0xff);
909
910
    /* Update  top and top-left arrays */
911
912
47.2k
    *pu1_src_top_left = pu1_src_top[wd - 1];
913
914
116k
    for(col = wd; col >= 16; col -= 16)
915
69.4k
    {
916
69.4k
        const0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + offset + (ht - 1) * src_strd));
917
69.4k
        _mm_storeu_si128((__m128i *)(pu1_src_top + offset), const0_16x8b);
918
69.4k
        offset += 16;
919
69.4k
    }
920
921
    //setting availability mask to ff size MAX_CTB_SIZE
922
236k
    for(col = 0; col < MAX_CTB_SIZE; col += 16)
923
188k
        _mm_storeu_si128((__m128i *)(au1_mask + col), au1_mask8x16b);
924
1.50M
    for(row = 0; row < ht; row++)
925
1.46M
    {
926
1.46M
        au1_src_left_tmp[row] = pu1_src_left[row];
927
1.46M
    }
928
47.2k
    edge_idx_8x16b   = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
929
47.2k
    sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset);
930
931
    //availability mask creation
932
47.2k
    u1_avail0 = pu1_avail[0];
933
47.2k
    u1_avail1 = pu1_avail[1];
934
47.2k
    au1_mask[0] = u1_avail0;
935
47.2k
    au1_mask[wd - 1] = u1_avail1;
936
937
47.2k
    const2_16x8b = _mm_set1_epi8(2);
938
47.2k
    const0_16x8b = _mm_setzero_si128();
939
47.2k
    pu1_src_left_cpy = au1_src_left_tmp;
940
47.2k
    pu1_src_left_str = au1_src_left_tmp1;
941
47.2k
    {
942
47.2k
        au1_mask_cpy = au1_mask;
943
116k
        for(col = wd; col >= 16; col -= 16)
944
69.4k
        {
945
69.4k
            pu1_src_cpy = pu1_src;
946
69.4k
            au1_mask8x16b = _mm_loadu_si128((__m128i *)au1_mask_cpy);
947
            //pu1_src_left_cpy =au1_src_left_tmp;
948
1.18M
            for(row = ht; row > 0; row -= 2)
949
1.11M
            {
950
951
1.11M
                left_store_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_left_cpy));
952
                //row = 0 load 8 pixel values from 7:0 pos. relative to cur. pos.
953
1.11M
                src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
954
                // row = 1
955
1.11M
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
956
957
1.11M
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, left_store_16x8b, 2);
958
                //row 1 left
959
1.11M
                left1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, left_store_16x8b, 15);
960
1.11M
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp1_16x8b, 15);
961
                //row 0 left
962
1.11M
                left0_16x8b = _mm_alignr_epi8(src_temp0_16x8b, left_store_16x8b, 15);
963
1.11M
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
964
965
966
                //separating +ve and -ve values.
967
1.11M
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, left0_16x8b);
968
1.11M
                cmp_lt0_16x8b = _mm_subs_epu8(left0_16x8b, src_temp0_16x8b);
969
1.11M
                cmp_gt1_16x8b = _mm_subs_epu8(src_temp1_16x8b, left1_16x8b);
970
1.11M
                cmp_lt1_16x8b = _mm_subs_epu8(left1_16x8b, src_temp1_16x8b);
971
                //creating mask 00 for +ve and -ve values and FF for zero.
972
1.11M
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
973
1.11M
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
974
1.11M
                cmp_gt1_16x8b = _mm_cmpeq_epi8(cmp_gt1_16x8b, const0_16x8b);
975
1.11M
                cmp_lt1_16x8b = _mm_cmpeq_epi8(cmp_lt1_16x8b, const0_16x8b);
976
                //combining the appropriate sign change
977
1.11M
                left0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
978
1.11M
                left1_16x8b = _mm_sub_epi8(cmp_gt1_16x8b, cmp_lt1_16x8b);
979
980
                //row = 0 right
981
1.11M
                edge0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 1));
982
                // row = 1 right
983
1.11M
                edge1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 1));
984
                //separating +ve and -ve values.
985
1.11M
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, edge0_16x8b);
986
1.11M
                cmp_lt0_16x8b = _mm_subs_epu8(edge0_16x8b, src_temp0_16x8b);
987
1.11M
                cmp_gt1_16x8b = _mm_subs_epu8(src_temp1_16x8b, edge1_16x8b);
988
1.11M
                cmp_lt1_16x8b = _mm_subs_epu8(edge1_16x8b, src_temp1_16x8b);
989
                //creating mask 00 for +ve and -ve values and FF for zero.
990
1.11M
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
991
1.11M
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
992
1.11M
                cmp_gt1_16x8b = _mm_cmpeq_epi8(cmp_gt1_16x8b, const0_16x8b);
993
1.11M
                cmp_lt1_16x8b = _mm_cmpeq_epi8(cmp_lt1_16x8b, const0_16x8b);
994
                //combining the appropriate sign change
995
1.11M
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
996
1.11M
                edge1_16x8b = _mm_sub_epi8(cmp_gt1_16x8b, cmp_lt1_16x8b);
997
998
                //combining sign-left and sign_right
999
1.11M
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, left0_16x8b);
1000
1.11M
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, left1_16x8b);
1001
                //adding constant 2
1002
1.11M
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
1003
1.11M
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
1004
                //shuffle to get sao index
1005
1.11M
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
1006
1.11M
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
1007
                //using availability mask
1008
1.11M
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
1009
1.11M
                edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
1010
1011
                //shuffle to get sao offset
1012
1.11M
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
1013
1.11M
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
1014
                //convert to 16 bit then add and then saturated pack
1015
1.11M
                left0_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
1016
1.11M
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
1017
1.11M
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, left0_16x8b);
1018
1.11M
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
1019
1.11M
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
1020
1.11M
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, left0_16x8b);
1021
1.11M
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
1022
1.11M
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
1023
1024
1.11M
                left0_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
1025
1.11M
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
1026
1.11M
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, left0_16x8b);
1027
1.11M
                src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
1028
1.11M
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
1029
1.11M
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, left0_16x8b);
1030
1.11M
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
1031
1.11M
                src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
1032
1033
1034
1.11M
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
1035
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
1036
1.11M
                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
1037
                // row = 1
1038
1.11M
                _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);
1039
1040
1.11M
                pu1_src_cpy += (src_strd << 1);
1041
1.11M
                pu1_src_left_cpy += 2;
1042
1.11M
                pu1_src_left_str += 2;
1043
1.11M
            }
1044
69.4k
            au1_mask_cpy += 16;
1045
69.4k
            pu1_src += 16;
1046
69.4k
            pu1_src_left_cpy -= ht;
1047
69.4k
            pu1_src_left_str -= ht;
1048
1049
69.4k
            pu1_left_tmp = pu1_src_left_cpy;
1050
69.4k
            pu1_src_left_cpy = pu1_src_left_str;
1051
69.4k
            pu1_src_left_str = pu1_left_tmp;
1052
69.4k
        }
1053
1054
47.2k
        wd_rem = wd & 0xF;
1055
47.2k
        if(wd_rem)
1056
45.6k
        {
1057
1058
45.6k
            cmp_gt1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + (ht - 1) * src_strd));
1059
45.6k
            _mm_storel_epi64((__m128i *)(pu1_src_top + offset), cmp_gt1_16x8b);
1060
1061
45.6k
            au1_mask8x16b = _mm_loadl_epi64((__m128i *)au1_mask_cpy);
1062
45.6k
            pu1_src_cpy = pu1_src;
1063
45.6k
            au1_mask8x16b = _mm_unpacklo_epi64(au1_mask8x16b, au1_mask8x16b);
1064
            //pu1_src_left_cpy =au1_src_left_tmp;
1065
398k
            for(row = ht; row > 0; row -= 4)
1066
353k
            {
1067
353k
                left_store_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_left_cpy));
1068
                //row = 0 load 8 pixel values from 7:0 pos. relative to cur. pos.
1069
353k
                src_temp0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
1070
                // row = 1
1071
353k
                cmp_gt0_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
1072
                // row  = 2
1073
353k
                src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));
1074
                // row = 3
1075
353k
                cmp_gt1_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd));
1076
1077
1078
353k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, left_store_16x8b, 4);
1079
                //row 3 left
1080
353k
                edge0_16x8b = _mm_slli_si128(cmp_gt1_16x8b, 8);
1081
353k
                cmp_lt1_16x8b = _mm_alignr_epi8(cmp_gt1_16x8b, left_store_16x8b, 15);
1082
353k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 15);
1083
                //row 2 left
1084
353k
                edge0_16x8b = _mm_slli_si128(src_temp1_16x8b, 8);
1085
353k
                left1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, left_store_16x8b, 15);
1086
353k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 15);
1087
                //row 1 left
1088
353k
                edge0_16x8b = _mm_slli_si128(cmp_gt0_16x8b, 8);
1089
353k
                cmp_lt0_16x8b = _mm_alignr_epi8(cmp_gt0_16x8b, left_store_16x8b, 15);
1090
353k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 15);
1091
                //row 0 left
1092
353k
                edge0_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
1093
353k
                left0_16x8b = _mm_alignr_epi8(src_temp0_16x8b, left_store_16x8b, 15);
1094
353k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 15);
1095
1096
                // packing rows together for 16 SIMD operations
1097
353k
                src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, cmp_gt0_16x8b);
1098
353k
                src_temp1_16x8b = _mm_unpacklo_epi64(src_temp1_16x8b, cmp_gt1_16x8b);
1099
                // packing rows together for 16 SIMD operations
1100
353k
                left0_16x8b = _mm_unpacklo_epi64(left0_16x8b, cmp_lt0_16x8b);
1101
353k
                left1_16x8b = _mm_unpacklo_epi64(left1_16x8b, cmp_lt1_16x8b);
1102
1103
                //separating +ve and -ve values.
1104
353k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, left0_16x8b);
1105
353k
                cmp_lt0_16x8b = _mm_subs_epu8(left0_16x8b, src_temp0_16x8b);
1106
353k
                cmp_gt1_16x8b = _mm_subs_epu8(src_temp1_16x8b, left1_16x8b);
1107
353k
                cmp_lt1_16x8b = _mm_subs_epu8(left1_16x8b, src_temp1_16x8b);
1108
                //creating mask 00 for +ve and -ve values and FF for zero.
1109
353k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
1110
353k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
1111
353k
                cmp_gt1_16x8b = _mm_cmpeq_epi8(cmp_gt1_16x8b, const0_16x8b);
1112
353k
                cmp_lt1_16x8b = _mm_cmpeq_epi8(cmp_lt1_16x8b, const0_16x8b);
1113
                //combining the appropriate sign change
1114
353k
                left0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
1115
353k
                left1_16x8b = _mm_sub_epi8(cmp_gt1_16x8b, cmp_lt1_16x8b);
1116
1117
                //row = 0 right
1118
353k
                edge0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 1));
1119
                // row = 1 right
1120
353k
                cmp_gt0_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd + 1));
1121
                // row = 2 right
1122
353k
                edge1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd + 1));
1123
                // row = 3 right
1124
353k
                cmp_gt1_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd + 1));
1125
                // packing rows together for 16 SIMD operations
1126
353k
                edge0_16x8b = _mm_unpacklo_epi64(edge0_16x8b, cmp_gt0_16x8b);
1127
353k
                edge1_16x8b = _mm_unpacklo_epi64(edge1_16x8b, cmp_gt1_16x8b);
1128
1129
                //separating +ve and and -ve values.
1130
353k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, edge0_16x8b);
1131
353k
                cmp_lt0_16x8b = _mm_subs_epu8(edge0_16x8b, src_temp0_16x8b);
1132
353k
                cmp_gt1_16x8b = _mm_subs_epu8(src_temp1_16x8b, edge1_16x8b);
1133
353k
                cmp_lt1_16x8b = _mm_subs_epu8(edge1_16x8b, src_temp1_16x8b);
1134
                //creating mask 00 for +ve and -ve values and FF for zero.
1135
353k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
1136
353k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
1137
353k
                cmp_gt1_16x8b = _mm_cmpeq_epi8(cmp_gt1_16x8b, const0_16x8b);
1138
353k
                cmp_lt1_16x8b = _mm_cmpeq_epi8(cmp_lt1_16x8b, const0_16x8b);
1139
                //combining the appropriate sign change
1140
353k
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
1141
353k
                edge1_16x8b = _mm_sub_epi8(cmp_gt1_16x8b, cmp_lt1_16x8b);
1142
1143
                //combining sign-left and sign_right
1144
353k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, left0_16x8b);
1145
353k
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, left1_16x8b);
1146
                //adding constant 2
1147
353k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
1148
353k
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
1149
                //shuffle to get sao index
1150
353k
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
1151
353k
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
1152
                //shuffle to get sao offset
1153
                //using availability mask
1154
353k
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
1155
353k
                edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
1156
1157
353k
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
1158
353k
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
1159
                //cnvert to 16 bit then add and then saturated pack
1160
353k
                left0_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
1161
353k
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
1162
353k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, left0_16x8b);
1163
353k
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
1164
353k
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
1165
353k
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, left0_16x8b);
1166
353k
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
1167
353k
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
1168
1169
353k
                left0_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
1170
353k
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
1171
353k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, left0_16x8b);
1172
353k
                src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
1173
353k
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
1174
353k
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, left0_16x8b);
1175
353k
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
1176
353k
                src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
1177
                //separting row 1 and row 3
1178
353k
                cmp_lt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
1179
353k
                cmp_lt1_16x8b = _mm_srli_si128(src_temp1_16x8b, 8);
1180
1181
353k
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
1182
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
1183
353k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
1184
                // row = 1
1185
353k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_lt0_16x8b);
1186
                // row = 2
1187
353k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_temp1_16x8b);
1188
                // row = 3
1189
353k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt1_16x8b);
1190
1191
353k
                pu1_src_cpy += (src_strd << 2);
1192
353k
                pu1_src_left_cpy += 4;
1193
353k
                pu1_src_left_str += 4;
1194
353k
            }
1195
45.6k
            pu1_src += wd;
1196
45.6k
            pu1_src_left_cpy -= ht;
1197
45.6k
            pu1_src_left_str -= ht;
1198
1199
45.6k
            pu1_left_tmp = pu1_src_left_cpy;
1200
45.6k
            pu1_src_left_cpy = pu1_src_left_str;
1201
45.6k
            pu1_src_left_str = pu1_left_tmp;
1202
45.6k
        }
1203
1.50M
        for(row = 0; row < ht; row++)
1204
1.46M
        {
1205
1.46M
            pu1_src_left[row] = pu1_src_left_cpy[row];
1206
1.46M
        }
1207
47.2k
    }
1208
47.2k
}
1209
1210
1211
void ihevc_sao_edge_offset_class0_chroma_ssse3(UWORD8 *pu1_src,
1212
                                               WORD32 src_strd,
1213
                                               UWORD8 *pu1_src_left,
1214
                                               UWORD8 *pu1_src_top,
1215
                                               UWORD8 *pu1_src_top_left,
1216
                                               UWORD8 *pu1_src_top_right,
1217
                                               UWORD8 *pu1_src_bot_left,
1218
                                               UWORD8 *pu1_avail,
1219
                                               WORD8 *pi1_sao_offset_u,
1220
                                               WORD8 *pi1_sao_offset_v,
1221
                                               WORD32 wd,
1222
                                               WORD32 ht)
1223
27.5k
{
1224
27.5k
    WORD32 row, col;
1225
27.5k
    UWORD8 *pu1_src_cpy, *pu1_src_left_cpy, *pu1_src_left_str, *pu1_left_tmp;
1226
27.5k
    UWORD8 au1_mask[MAX_CTB_SIZE], *au1_mask_cpy;
1227
27.5k
    UWORD8 au1_src_left_tmp[2 * (MAX_CTB_SIZE + 8)];
1228
27.5k
    UWORD8 au1_src_left_tmp1[2 * (MAX_CTB_SIZE + 8)];
1229
27.5k
    UWORD8 u1_avail0, u1_avail1;
1230
27.5k
    WORD32 wd_rem;
1231
27.5k
    WORD32 offset = 0;
1232
1233
27.5k
    __m128i src_temp0_16x8b, src_temp1_16x8b;
1234
27.5k
    __m128i left0_16x8b, left1_16x8b;
1235
27.5k
    __m128i cmp_gt0_16x8b, cmp_lt0_16x8b;
1236
27.5k
    __m128i edge0_16x8b, edge1_16x8b;
1237
27.5k
    __m128i au1_mask8x16b;
1238
27.5k
    __m128i edge_idx_8x16b, sao_offset_8x16b;
1239
27.5k
    __m128i const2_16x8b, const0_16x8b;
1240
27.5k
    __m128i left_store_16x8b;
1241
27.5k
    __m128i chroma_offset_8x16b;
1242
27.5k
    UNUSED(pu1_src_top_right);
1243
27.5k
    UNUSED(pu1_src_bot_left);
1244
1245
27.5k
    au1_mask8x16b = _mm_set1_epi8(0xff);
1246
1247
    /* Update  top and top-left arrays */
1248
27.5k
    pu1_src_top_left[0] = pu1_src_top[wd - 2];
1249
27.5k
    pu1_src_top_left[1] = pu1_src_top[wd - 1];;
1250
1251
77.1k
    for(col = wd; col >= 16; col -= 16)
1252
49.5k
    {
1253
49.5k
        const0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + offset + (ht - 1) * src_strd));
1254
49.5k
        _mm_storeu_si128((__m128i *)(pu1_src_top + offset), const0_16x8b);
1255
49.5k
        offset += 16;
1256
49.5k
    }
1257
822k
    for(row = 0; row < 2 * ht; row++)
1258
794k
    {
1259
794k
        au1_src_left_tmp[row] = pu1_src_left[row];
1260
794k
    }
1261
    //setting availability mask to ff size MAX_CTB_SIZE
1262
137k
    for(col = 0; col < MAX_CTB_SIZE; col += 16)
1263
110k
        _mm_storeu_si128((__m128i *)(au1_mask + col), au1_mask8x16b);
1264
1265
27.5k
    edge_idx_8x16b   = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
1266
27.5k
    sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_u);
1267
27.5k
    const0_16x8b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_v);
1268
27.5k
    chroma_offset_8x16b = _mm_set1_epi16(0x0800);
1269
    //availability mask creation
1270
27.5k
    u1_avail0 = pu1_avail[0];
1271
27.5k
    u1_avail1 = pu1_avail[1];
1272
27.5k
    au1_mask[0] = u1_avail0;
1273
27.5k
    au1_mask[1] = u1_avail0;
1274
27.5k
    au1_mask[wd - 1] = u1_avail1;
1275
27.5k
    au1_mask[wd - 2] = u1_avail1;
1276
27.5k
    sao_offset_8x16b = _mm_unpacklo_epi64(sao_offset_8x16b, const0_16x8b);
1277
27.5k
    const2_16x8b = _mm_set1_epi8(2);
1278
27.5k
    const0_16x8b = _mm_setzero_si128();
1279
1280
27.5k
    {
1281
27.5k
        pu1_src_left_cpy = au1_src_left_tmp;
1282
27.5k
        pu1_src_left_str = au1_src_left_tmp1;
1283
27.5k
        au1_mask_cpy = au1_mask;
1284
77.1k
        for(col = wd; col >= 16; col -= 16)
1285
49.5k
        {
1286
49.5k
            pu1_src_cpy = pu1_src;
1287
49.5k
            au1_mask8x16b = _mm_loadu_si128((__m128i *)au1_mask_cpy);
1288
1289
429k
            for(row = ht; row > 0; row -= 2)
1290
379k
            {
1291
1292
379k
                left_store_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_left_cpy));
1293
                //row = 0 load 8 pixel values from 7:0 pos. relative to cur. pos.
1294
379k
                src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
1295
                // row = 1
1296
379k
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
1297
1298
379k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, left_store_16x8b, 4);
1299
                //row 1 left
1300
379k
                left1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, left_store_16x8b, 14);
1301
379k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp1_16x8b, 14);
1302
                //row 0 left
1303
379k
                left0_16x8b = _mm_alignr_epi8(src_temp0_16x8b, left_store_16x8b, 14);
1304
379k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
1305
1306
1307
                //separating +ve and and -ve values.row 0 left
1308
379k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, left0_16x8b);
1309
379k
                cmp_lt0_16x8b = _mm_subs_epu8(left0_16x8b, src_temp0_16x8b);
1310
                //creating mask 00 for +ve and -ve values and FF for zero.
1311
379k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
1312
379k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
1313
                //combining the appropriate sign change
1314
379k
                left0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
1315
1316
                //separating +ve and and -ve values.row 1 left
1317
379k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, left1_16x8b);
1318
379k
                cmp_lt0_16x8b = _mm_subs_epu8(left1_16x8b, src_temp1_16x8b);
1319
                //creating mask 00 for +ve and -ve values and FF for zero.
1320
379k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
1321
379k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
1322
                //combining the appropriate sign change
1323
379k
                left1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
1324
1325
1326
                //row = 0 right
1327
379k
                edge0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2));
1328
                // row = 1 right
1329
379k
                edge1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 2));
1330
                //separating +ve and and -ve values.row 0 right
1331
379k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, edge0_16x8b);
1332
379k
                cmp_lt0_16x8b = _mm_subs_epu8(edge0_16x8b, src_temp0_16x8b);
1333
                //creating mask 00 for +ve and -ve values and FF for zero.
1334
379k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
1335
379k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
1336
                //combining the appropriate sign change
1337
379k
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
1338
1339
                //separating +ve and and -ve values.row 1 right
1340
379k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, edge1_16x8b);
1341
379k
                cmp_lt0_16x8b = _mm_subs_epu8(edge1_16x8b, src_temp1_16x8b);
1342
                //creating mask 00 for +ve and -ve values and FF for zero.
1343
379k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
1344
379k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
1345
                //combining the appropriate sign change
1346
379k
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
1347
1348
                //combining sign-left and sign_right
1349
379k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, left0_16x8b);
1350
379k
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, left1_16x8b);
1351
                //adding constant 2
1352
379k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
1353
379k
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
1354
                //shuffle to get sao index
1355
379k
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
1356
379k
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
1357
                //using availability mask
1358
379k
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
1359
379k
                edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
1360
                //adding chroma offset to access U and V
1361
379k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
1362
379k
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);
1363
1364
                //shuffle to get sao offset
1365
379k
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
1366
379k
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
1367
                //cnvert to 16 bit then add and then saturated pack
1368
379k
                left0_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
1369
379k
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
1370
379k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, left0_16x8b);
1371
379k
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
1372
379k
                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, left0_16x8b);
1373
379k
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
1374
379k
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
1375
379k
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
1376
1377
379k
                left0_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
1378
379k
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
1379
379k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, left0_16x8b);
1380
379k
                src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
1381
379k
                edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, left0_16x8b);
1382
379k
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
1383
379k
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge1_16x8b);
1384
379k
                src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
1385
1386
379k
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
1387
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
1388
379k
                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
1389
                // row = 1
1390
379k
                _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);
1391
1392
379k
                pu1_src_cpy += (src_strd << 1);
1393
379k
                pu1_src_left_cpy += 4;
1394
379k
                pu1_src_left_str += 4;
1395
379k
            }
1396
49.5k
            au1_mask_cpy += 16;
1397
49.5k
            pu1_src += 16;
1398
49.5k
            pu1_src_left_cpy -= 2 * ht;
1399
49.5k
            pu1_src_left_str -= 2 * ht;
1400
1401
49.5k
            pu1_left_tmp = pu1_src_left_cpy;
1402
49.5k
            pu1_src_left_cpy = pu1_src_left_str;
1403
49.5k
            pu1_src_left_str = pu1_left_tmp;
1404
49.5k
        }
1405
1406
27.5k
        wd_rem = wd & 0xF;
1407
27.5k
        if(wd_rem)
1408
1.18k
        {
1409
1410
1.18k
            cmp_gt0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + (ht - 1) * src_strd));
1411
1.18k
            _mm_storel_epi64((__m128i *)(pu1_src_top + offset), cmp_gt0_16x8b);
1412
1413
1.18k
            au1_mask8x16b = _mm_loadl_epi64((__m128i *)au1_mask_cpy);
1414
1.18k
            pu1_src_cpy = pu1_src;
1415
1.18k
            au1_mask8x16b = _mm_unpacklo_epi64(au1_mask8x16b, au1_mask8x16b);
1416
1417
3.78k
            for(row = ht; row > 0; row -= 4)
1418
2.60k
            {
1419
2.60k
                left_store_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_left_cpy));
1420
                //row = 0 load 8 pixel values from 7:0 pos. relative to cur. pos.
1421
2.60k
                src_temp0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
1422
                // row = 1
1423
2.60k
                cmp_gt0_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
1424
                // row  = 2
1425
2.60k
                src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));
1426
                // row = 3
1427
2.60k
                cmp_lt0_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd));
1428
1429
1430
2.60k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, left_store_16x8b, 8);
1431
                //row 3 left
1432
2.60k
                edge0_16x8b = _mm_slli_si128(cmp_lt0_16x8b, 8);
1433
2.60k
                left0_16x8b = _mm_alignr_epi8(cmp_lt0_16x8b, left_store_16x8b, 14);
1434
2.60k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 14);
1435
                //row 2 left
1436
2.60k
                edge0_16x8b = _mm_slli_si128(src_temp1_16x8b, 8);
1437
2.60k
                left1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, left_store_16x8b, 14);
1438
2.60k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 14);
1439
1440
1441
                // packing rows together for 16 SIMD operations
1442
2.60k
                src_temp1_16x8b = _mm_unpacklo_epi64(src_temp1_16x8b, cmp_lt0_16x8b);
1443
2.60k
                left1_16x8b = _mm_unpacklo_epi64(left1_16x8b, left0_16x8b);
1444
1445
                //row 1 left
1446
2.60k
                edge0_16x8b = _mm_slli_si128(cmp_gt0_16x8b, 8);
1447
2.60k
                edge1_16x8b = _mm_alignr_epi8(cmp_gt0_16x8b, left_store_16x8b, 14);
1448
2.60k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 14);
1449
                //row 0 left
1450
2.60k
                edge0_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
1451
2.60k
                left0_16x8b = _mm_alignr_epi8(src_temp0_16x8b, left_store_16x8b, 14);
1452
2.60k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 14);
1453
                // packing rows together for 16 SIMD operations
1454
2.60k
                src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, cmp_gt0_16x8b);
1455
2.60k
                left0_16x8b = _mm_unpacklo_epi64(left0_16x8b, edge1_16x8b);
1456
1457
                //separating +ve and and -ve values.for row 2 and row 3
1458
2.60k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, left1_16x8b);
1459
2.60k
                cmp_lt0_16x8b = _mm_subs_epu8(left1_16x8b, src_temp1_16x8b);
1460
                //creating mask 00 for +ve and -ve values and FF for zero.
1461
2.60k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
1462
2.60k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
1463
                //combining the appropriate sign change
1464
2.60k
                left1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
1465
1466
1467
1468
1469
1470
                //separating +ve and and -ve values.
1471
2.60k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, left0_16x8b);
1472
2.60k
                cmp_lt0_16x8b = _mm_subs_epu8(left0_16x8b, src_temp0_16x8b);
1473
                //creating mask 00 for +ve and -ve values and FF for zero.
1474
2.60k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
1475
2.60k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
1476
                //combining the appropriate sign change
1477
2.60k
                left0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
1478
1479
1480
                //row = 0 right
1481
2.60k
                edge0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2));
1482
                // row = 1 right
1483
2.60k
                cmp_gt0_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd + 2));
1484
                // row = 2 right
1485
2.60k
                edge1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd + 2));
1486
                // row = 3 right
1487
2.60k
                cmp_lt0_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd + 2));
1488
                // packing rows together for 16 SIMD operations
1489
2.60k
                edge0_16x8b = _mm_unpacklo_epi64(edge0_16x8b, cmp_gt0_16x8b);
1490
2.60k
                edge1_16x8b = _mm_unpacklo_epi64(edge1_16x8b, cmp_lt0_16x8b);
1491
1492
                //separating +ve and and -ve values.
1493
2.60k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, edge0_16x8b);
1494
2.60k
                cmp_lt0_16x8b = _mm_subs_epu8(edge0_16x8b, src_temp0_16x8b);
1495
                //creating mask 00 for +ve and -ve values and FF for zero.
1496
2.60k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
1497
2.60k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
1498
                //combining the appropriate sign change
1499
2.60k
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
1500
1501
2.60k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, edge1_16x8b);
1502
2.60k
                cmp_lt0_16x8b = _mm_subs_epu8(edge1_16x8b, src_temp1_16x8b);
1503
                //creating mask 00 for +ve and -ve values and FF for zero.
1504
2.60k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
1505
2.60k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
1506
                //combining the appropriate sign change
1507
2.60k
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
1508
1509
                //combining sign-left and sign_right
1510
2.60k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, left0_16x8b);
1511
2.60k
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, left1_16x8b);
1512
                //adding constant 2
1513
2.60k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
1514
2.60k
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
1515
                //shuffle to get sao index
1516
2.60k
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
1517
2.60k
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
1518
                //shuffle to get sao offset
1519
                //using availability mask
1520
2.60k
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
1521
2.60k
                edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
1522
                //adding chroma offset to access U and V
1523
2.60k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
1524
2.60k
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);
1525
1526
2.60k
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
1527
2.60k
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
1528
                //cnvert to 16 bit then add and then saturated pack
1529
2.60k
                left0_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
1530
2.60k
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
1531
2.60k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, left0_16x8b);
1532
2.60k
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
1533
2.60k
                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, left0_16x8b);
1534
2.60k
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
1535
2.60k
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
1536
2.60k
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
1537
1538
2.60k
                left0_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
1539
2.60k
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
1540
2.60k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, left0_16x8b);
1541
2.60k
                src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
1542
2.60k
                edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, left0_16x8b);
1543
2.60k
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
1544
2.60k
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge1_16x8b);
1545
2.60k
                src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
1546
1547
                //seaprting row 1 and row 3
1548
2.60k
                cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
1549
2.60k
                cmp_lt0_16x8b = _mm_srli_si128(src_temp1_16x8b, 8);
1550
1551
2.60k
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
1552
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
1553
2.60k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
1554
                // row = 1
1555
2.60k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
1556
                // row = 2
1557
2.60k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_temp1_16x8b);
1558
                // row = 3
1559
2.60k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b);
1560
1561
2.60k
                pu1_src_cpy += (src_strd << 2);
1562
2.60k
                pu1_src_left_cpy += 8;
1563
2.60k
                pu1_src_left_str += 8;
1564
2.60k
            }
1565
1.18k
            pu1_src += wd;
1566
1.18k
            pu1_src_left_cpy -= 2 * ht;
1567
1.18k
            pu1_src_left_str -= 2 * ht;
1568
1569
1.18k
            pu1_left_tmp = pu1_src_left_cpy;
1570
1.18k
            pu1_src_left_cpy = pu1_src_left_str;
1571
1.18k
            pu1_src_left_str = pu1_left_tmp;
1572
1.18k
        }
1573
822k
        for(row = 0; row < 2 * ht; row++)
1574
795k
        {
1575
795k
            pu1_src_left[row] = pu1_src_left_cpy[row];
1576
795k
        }
1577
27.5k
    }
1578
1579
27.5k
}
1580
1581
1582
/**
 * Applies SAO edge offset of class 1 (vertical direction) to a luma block.
 *
 * For every pixel the edge index is formed from the signs of (cur - top) and
 * (cur - bottom), offset by 2, then mapped through gi1_table_edge_idx and the
 * pi1_sao_offset table before being added to the pixel with unsigned-8 saturation.
 * Columns are processed 16 pixels wide, with an 8-wide remainder path that packs
 * two rows per XMM register. Before filtering, the right column is saved to
 * pu1_src_left and the top-left corner to pu1_src_top_left; after each column
 * strip the last processed row is written back to pu1_src_top.
 *
 * pu1_src           - pointer to the current block (modified in place)
 * src_strd          - source stride in bytes
 * pu1_src_left      - left-column context, updated with this block's right column
 * pu1_src_top       - top-row context, updated with this block's bottom row
 * pu1_src_top_left  - top-left corner context, updated
 * pu1_src_top_right - unused in class 1 (vertical) filtering
 * pu1_src_bot_left  - unused in class 1 (vertical) filtering
 * pu1_avail         - neighbour availability flags; [2] and [3] are consumed here
 *                     (NOTE(review): presumably top/bottom availability per the
 *                     HEVC SAO convention — consistent with how ht is shrunk below)
 * pi1_sao_offset    - 5-entry SAO offset table, indexed by edge index
 * wd, ht            - block width and height in pixels
 */
void ihevc_sao_edge_offset_class1_ssse3(UWORD8 *pu1_src,
                                        WORD32 src_strd,
                                        UWORD8 *pu1_src_left,
                                        UWORD8 *pu1_src_top,
                                        UWORD8 *pu1_src_top_left,
                                        UWORD8 *pu1_src_top_right,
                                        UWORD8 *pu1_src_bot_left,
                                        UWORD8 *pu1_avail,
                                        WORD8 *pi1_sao_offset,
                                        WORD32 wd,
                                        WORD32 ht)
{
    WORD32 row, col;
    UWORD8 *pu1_src_top_cpy;
    UWORD8 *pu1_src_cpy;
    WORD32 wd_rem;


    __m128i src_top_16x8b, src_bottom_16x8b;
    __m128i src_temp0_16x8b, src_temp1_16x8b;
    __m128i signup0_16x8b, signdwn1_16x8b;
    __m128i cmp_gt0_16x8b, cmp_lt0_16x8b;
    __m128i edge0_16x8b, edge1_16x8b;
    __m128i edge_idx_8x16b, sao_offset_8x16b;
    __m128i const2_16x8b, const0_16x8b;

    UNUSED(pu1_src_top_right);
    UNUSED(pu1_src_bot_left);


    /* Updating left and top-left: save the unfiltered right column and
     * top-left corner for the neighbouring blocks' context. */
    for(row = 0; row < ht; row++)
    {
        pu1_src_left[row] = pu1_src[row * src_strd + (wd - 1)];
    }
    *pu1_src_top_left = pu1_src_top[wd - 1];



    pu1_src_top_cpy = pu1_src_top;
    /* Edge-index remap table and the per-index SAO offsets, kept in the low
     * 8 bytes of XMM registers for use with _mm_shuffle_epi8. */
    edge_idx_8x16b   = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
    sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset);

    /* Update height and source pointers based on the availability flags:
     * no top neighbour -> first row acts as its own top and is skipped;
     * no bottom neighbour -> last row is skipped. */
    if(0 == pu1_avail[2])
    {
        pu1_src_top_cpy = pu1_src;
        pu1_src += src_strd;
        ht--;
    }
    if(0 == pu1_avail[3])
    {
        ht--;
    }

    const2_16x8b = _mm_set1_epi8(2);
    const0_16x8b = _mm_setzero_si128();

    {
        WORD32 ht_rem;
        /* Main path: 16-pixel-wide column strips, two rows per iteration. */
        for(col = wd; col >= 16; col -= 16)
        {
            pu1_src_cpy = pu1_src;
            src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
            //row = 0
            src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
            //separating +ve and -ve values (saturating subtract leaves 0 on the wrong side).
            cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
            cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
            //creating mask 00 for +ve and -ve values and FF for zero.
            cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
            cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
            //combining the appropriate sign change: sign(row0 - top) in {-1, 0, +1} per byte
            signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);

            for(row = ht; row >= 2; row -= 2)
            {

                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
                // row = 2
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));


                //row 0 -row1
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_temp1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_temp0_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
                //row1-row0 (sign flips for free: row0's "down" sign is row1's "up" sign negated)
                edge1_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);

                //row1 -bottom
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);

                //combining sign-up and sign-down per row
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, signdwn1_16x8b);

                //for the next iteration signup0_16x8b = -signdwn1_16x8b
                signup0_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);
                //adding constant 2 so the index is in [0, 4]
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
                //shuffle to get sao index
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
                //shuffle to get sao offset
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
                //copying the next top
                src_top_16x8b = src_temp1_16x8b;
                //convert to 16 bit (sign-extending offsets via the cmpgt mask),
                //add, then saturated pack back to unsigned 8 bit — row 0
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);

                //same widen-add-pack for row 1
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
                src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
                src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);

                //row = 0 store 16 pixel values relative to cur. pos.
                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
                // row = 1
                _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);

                src_temp0_16x8b = src_bottom_16x8b;
                pu1_src_cpy += (src_strd << 1);
            }
            ht_rem = ht & 0x1;

            /* Odd height: one trailing row, using the carried-over sign_up. */
            if(ht_rem)
            {
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
                //current row - next row
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
                //adding top and bottom signs and constant 2
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);

                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
                //copying the next top
                src_top_16x8b = src_temp0_16x8b;
                //convert to 16 bit then add and then saturated pack
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);

                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
            }
            /* If the bottom neighbour is unavailable the row below the block
             * (still unfiltered) becomes the saved top context. */
            if(0 == pu1_avail[3])
            {
                src_top_16x8b = src_bottom_16x8b;
            }
            //updating top flag
            _mm_storeu_si128((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
            pu1_src += 16;
        }

        /* Remainder path: 8-pixel-wide strip, four rows per iteration with two
         * rows packed per XMM register (low/high 64-bit halves). */
        wd_rem = wd & 0xF;
        if(wd_rem)
        {
            pu1_src_cpy = pu1_src;
            src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_top_cpy + wd - col));
            //row = 0
            src_temp0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
            //separating +ve and -ve values.
            cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
            cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
            //creating mask 00 for +ve and -ve values and FF for zero.
            cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
            cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
            //combining the appropriate sign change
            signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
            //sign_up kept in the high 64 bits so alignr can pair it with the next row's sign
            signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
            for(row = ht; row >= 4; row -= 4)
            {
                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
                src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
                // row = 2
                src_bottom_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));

                //row 0 -row1
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_temp1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_temp0_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);

                //row1-row0
                edge1_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);
                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
                //row1 -row2
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
                edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
                //packing row 0 n row 1
                src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, src_temp1_16x8b);
                //row = 3
                src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd));
                // row = 4
                src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 4 * src_strd));

                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
                signdwn1_16x8b = _mm_slli_si128(signdwn1_16x8b, 8); //align left (1-2)
                //separating +ve and -ve values.(2,3)
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_top_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_bottom_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-3)

                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signdwn1_16x8b, 8); //(2-3) ,(1-2) (subtract with down)
                edge1_16x8b = _mm_slli_si128(edge1_16x8b, 8);
                //separating +ve and -ve values.(3,4)
                cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_top_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-4)
                //combining sign-up and sign-down
                edge1_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge1_16x8b, 8); //(3-4),(2-3)

                //subtracting the down-differences equals adding the up-signs for rows 3 and 2
                edge1_16x8b = _mm_sub_epi8(edge1_16x8b, signup0_16x8b); //(3,2)

                //packing row 2 n row 3
                src_bottom_16x8b = _mm_unpacklo_epi64(src_bottom_16x8b, src_top_16x8b);
                //for the next iteration signup0_16x8b = -signdwn1_16x8b
                signup0_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b); //(4-3)

                //adding constant 2
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
                //shuffle to get sao index
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
                //shuffle to get sao offset
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
                //the next top already in  src_top_16x8b
                //src_top_16x8b = src_temp1_16x8b;
                //convert to 16 bit then add and then saturated pack — rows 0,1
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);

                //convert to 16 bit then add and then saturated pack — rows 2,3
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_bottom_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
                src_bottom_16x8b = _mm_unpackhi_epi8(src_bottom_16x8b, const0_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
                src_bottom_16x8b = _mm_add_epi16(src_bottom_16x8b, cmp_lt0_16x8b);
                src_bottom_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_bottom_16x8b);

                //separating rows 1 and 3 into the high halves for 8-byte stores
                cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
                cmp_lt0_16x8b = _mm_srli_si128(src_bottom_16x8b, 8);
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
                // row = 1
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
                //row = 2
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_bottom_16x8b);
                // row = 3
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b);

                src_temp0_16x8b = src_temp1_16x8b;
                signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
                pu1_src_cpy += (src_strd << 2);

            }
            /* Height remainder of 2 rows: same scheme, two rows packed in one register. */
            ht_rem = ht & 0x2;
            if(ht_rem)
            {

                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
                src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
                // row = 2
                src_bottom_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));

                //row 0 -row1
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_temp1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_temp0_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
                //row1-row0
                edge1_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);
                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
                //row1 -row2
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
                edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
                //adding top and down subtraction
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
                //for the next iteration signup0_16x8b = -signdwn1_16x8b
                signup0_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b); //(2-1) for next
                src_top_16x8b = src_temp1_16x8b;
                //adding constant 2
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);

                //shuffle to get sao index
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);

                //shuffle to get sao offset
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);

                //the next top already in  src_top_16x8b
                //convert to 16 bit then add and then saturated pack
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp1_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, src_temp1_16x8b);

                cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);

                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
                // row = 1
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
                src_temp0_16x8b = src_bottom_16x8b;
                pu1_src_cpy += (src_strd << 1);

            }
            /* Height remainder of 1 row. */
            ht_rem = ht & 0x1;
            if(ht_rem)
            {

                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
                src_bottom_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));

                //row 0 -row1
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
                //adding top and down subtraction
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
                //adding constant 2
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
                //clear the unused high 64 bits (sign_up lives in the high half here)
                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8);
                edge0_16x8b = _mm_srli_si128(edge0_16x8b, 8);
                //shuffle to get sao index
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
                //shuffle to get sao offset
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
                src_top_16x8b = src_temp0_16x8b;
                //convert to 16 bit then add and then saturated pack
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, const0_16x8b);
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
                pu1_src_cpy += (src_strd);

            }
            /* Same bottom-unavailable fix-up as the 16-wide path. */
            if(0 == pu1_avail[3])
            {
                src_top_16x8b = src_bottom_16x8b;
            }
            _mm_storel_epi64((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
            pu1_src += 8;
        }
    }
}
2016
2017
/**
 * Applies SAO edge offset, class 1 (vertical, 90 degree), to an interleaved
 * chroma block (byte pattern U,V,U,V,... — presumably NV12-style; confirm
 * against callers).
 *
 * For each pixel the sign of (cur - above) and (cur - below) is combined into
 * an edge index (0..4 after adding the bias 2), mapped through
 * gi1_table_edge_idx, and the per-category offset is added with unsigned
 * saturation.  U and V use separate offset tables: the U offsets sit in the
 * low 8 bytes of sao_offset_8x16b, the V offsets in the high 8 bytes, and
 * adding 0x0800 per 16-bit lane bumps every odd (V) byte's shuffle index by 8
 * so it selects from the V half.
 *
 * pu1_src            : pointer to the current chroma block (modified in place)
 * src_strd           : source stride in bytes
 * pu1_src_left       : left-column context, updated with this block's last column
 * pu1_src_top        : top-row context, updated with this block's last row
 * pu1_src_top_left   : top-left context pixel pair, updated here
 * pu1_src_top_right  : unused for class 1
 * pu1_src_bot_left   : unused for class 1
 * pu1_avail          : neighbour availability flags; [2] = top row available,
 *                      [3] = bottom row available
 * pi1_sao_offset_u/v : 5-entry SAO offset tables for U and V
 * wd, ht             : block width and height in bytes/rows (wd counts the
 *                      interleaved U+V bytes)
 */
void ihevc_sao_edge_offset_class1_chroma_ssse3(UWORD8 *pu1_src,
                                               WORD32 src_strd,
                                               UWORD8 *pu1_src_left,
                                               UWORD8 *pu1_src_top,
                                               UWORD8 *pu1_src_top_left,
                                               UWORD8 *pu1_src_top_right,
                                               UWORD8 *pu1_src_bot_left,
                                               UWORD8 *pu1_avail,
                                               WORD8 *pi1_sao_offset_u,
                                               WORD8 *pi1_sao_offset_v,
                                               WORD32 wd,
                                               WORD32 ht)
{
    WORD32 row, col;
    UWORD8 *pu1_src_top_cpy;
    UWORD8 *pu1_src_cpy;
    WORD32 wd_rem;


    __m128i src_top_16x8b, src_bottom_16x8b;
    __m128i src_temp0_16x8b, src_temp1_16x8b;
    __m128i signup0_16x8b, signdwn1_16x8b;
    __m128i cmp_gt0_16x8b, cmp_lt0_16x8b;
    __m128i edge0_16x8b, edge1_16x8b;
    __m128i edge_idx_8x16b, sao_offset_8x16b;
    __m128i const2_16x8b, const0_16x8b;
    __m128i chroma_offset_8x16b;

    UNUSED(pu1_src_top_right);
    UNUSED(pu1_src_bot_left);

    /* Update left, top and top-left context before filtering (two bytes per
     * row/column because U and V are interleaved) */
    for(row = 0; row < ht; row++)
    {
        pu1_src_left[2 * row] = pu1_src[row * src_strd + (wd - 2)];
        pu1_src_left[2 * row + 1] = pu1_src[row * src_strd + (wd - 1)];
    }
    pu1_src_top_left[0] = pu1_src_top[wd - 2];
    pu1_src_top_left[1] = pu1_src_top[wd - 1];



    pu1_src_top_cpy = pu1_src_top;
    edge_idx_8x16b   = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
    sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_u);
    /* const0_16x8b is temporarily used to hold the V offsets; it is reset to
     * zero below once both offset tables are packed into sao_offset_8x16b */
    const0_16x8b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_v);
    /* 0x0800 per 16-bit lane = +8 on every odd (V) byte of the shuffle index */
    chroma_offset_8x16b = _mm_set1_epi16(0x0800);
    /* Update height and source pointers based on the availability flags */
    if(0 == pu1_avail[2])
    {
        pu1_src_top_cpy = pu1_src;
        pu1_src += src_strd;
        ht--;
    }
    if(0 == pu1_avail[3])
    {
        ht--;
    }
    /* low 8 bytes = U offsets, high 8 bytes = V offsets */
    sao_offset_8x16b = _mm_unpacklo_epi64(sao_offset_8x16b, const0_16x8b);
    const2_16x8b = _mm_set1_epi8(2);
    const0_16x8b = _mm_setzero_si128();


    {
        WORD32 ht_rem;



        /* Main path: process the block 16 bytes (8 U/V pairs) wide */
        for(col = wd; col >= 16; col -= 16)
        {
            pu1_src_cpy = pu1_src;
            src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
            //row = 0
            src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
            //separating +ve and -ve values.
            cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
            cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
            //creating mask 00 for +ve and -ve values and FF for zero.
            cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
            cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
            //combining the appropriate sign change: sign(cur - top) in [-1,0,1]
            signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);

            /* Two rows per iteration; the bottom row of the pair doubles as
             * the top neighbour of the next pair via signup0_16x8b */
            for(row = ht; row >= 2; row -= 2)
            {

                //row = 1 load 16 pixel values from 15:0 pos. relative to cur. pos.
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
                // row = 2
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));


                //row 0 - row1
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_temp1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_temp0_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
                //row1 - row0 (negation reuses the same masks)
                edge1_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);

                //row1 - bottom
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);

                //combining sign-up and sign-down for both rows
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, signdwn1_16x8b);

                //for the next iteration signup0_16x8b = -signdwn1_16x8b
                signup0_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);
                //adding constant 2 to bias the edge index into [0,4]
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
                //copying the next top
                src_top_16x8b = src_temp1_16x8b;


                //shuffle to get sao index
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
                //adding chroma offset so V lanes index the V half of the table
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);

                //shuffle to get sao offset
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
                //convert to 16 bit (sign-extending the offsets), add, then saturated pack
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);

                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
                src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
                edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge1_16x8b);
                src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
                //row = 0 store 16 pixel values from 15:0 pos. relative to cur. pos.
                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
                // row = 1
                _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);

                src_temp0_16x8b = src_bottom_16x8b;
                pu1_src_cpy += (src_strd << 1);
            }
            /* Leftover single row when ht is odd */
            ht_rem = ht & 0x1;

            if(ht_rem)
            {
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
                //current row - next row
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
                //adding top and bottom signs and constant 2
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
                //copying the next top
                src_top_16x8b = src_temp0_16x8b;

                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
                //adding chroma offset to access U and V halves of the table
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);

                //convert to 16 bit then add and then saturated pack
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);

                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
            }
            if(0 == pu1_avail[3])
            {
                src_top_16x8b = src_bottom_16x8b;
            }
            //updating top flag
            _mm_storeu_si128((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
            pu1_src += 16;
        }

        /* Remainder path: 8-byte-wide column (4 U/V pairs); two rows are
         * packed per XMM register, so four rows are handled per iteration */
        wd_rem = wd & 0xF;
        if(wd_rem)
        {
            pu1_src_cpy = pu1_src;
            src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_top_cpy + wd - col));
            //row = 0
            src_temp0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
            //separating +ve and -ve values.
            cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
            cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
            //creating mask 00 for +ve and -ve values and FF for zero.
            cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
            cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
            //combining the appropriate sign change
            signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
            //keep sign(row0 - top) in the high 8 bytes for the alignr below
            signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
            for(row = ht; row >= 4; row -= 4)
            {
                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
                src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
                // row = 2
                src_bottom_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));

                //row 0 - row1
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_temp1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_temp0_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);

                //row1 - row0
                edge1_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);
                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
                //row1 - row2
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
                edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
                //packing row 0 and row 1
                src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, src_temp1_16x8b);
                //row = 3
                src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd));
                // row = 4
                src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 4 * src_strd));

                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
                signdwn1_16x8b = _mm_slli_si128(signdwn1_16x8b, 8); //align left (1-2)
                //separating +ve and -ve values.(2,3)
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_top_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_bottom_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-3)

                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signdwn1_16x8b, 8); //(2-3),(1-2) (subtract with down)
                edge1_16x8b = _mm_slli_si128(edge1_16x8b, 8);
                //separating +ve and -ve values.(3,4)
                cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_top_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-4)
                //combining sign-up and sign-down
                edge1_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge1_16x8b, 8); //(3-4),(2-3)

                //subtracting since signup0 holds the down-differences here
                edge1_16x8b = _mm_sub_epi8(edge1_16x8b, signup0_16x8b); //(3,2)

                //packing row 2 and row 3
                src_bottom_16x8b = _mm_unpacklo_epi64(src_bottom_16x8b, src_top_16x8b);
                //for the next iteration signup0_16x8b = -signdwn1_16x8b
                signup0_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b); //(4-3)
                //adding constant 2
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
                //shuffle to get sao index
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
                //adding chroma offset to access U and V halves of the table
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);

                //shuffle to get sao offset
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
                //the next top already in src_top_16x8b
                //convert to 16 bit then add and then saturated pack
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);

                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_bottom_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
                src_bottom_16x8b = _mm_unpackhi_epi8(src_bottom_16x8b, const0_16x8b);
                edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                src_bottom_16x8b = _mm_add_epi16(src_bottom_16x8b, edge1_16x8b);
                src_bottom_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_bottom_16x8b);

                cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
                cmp_lt0_16x8b = _mm_srli_si128(src_bottom_16x8b, 8);
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
                // row = 1
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
                //row = 2
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_bottom_16x8b);
                // row = 3
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b);

                src_temp0_16x8b = src_temp1_16x8b;
                signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
                pu1_src_cpy += (src_strd << 2);

            }
            /* Leftover pair of rows when (ht % 4) >= 2 */
            ht_rem = ht & 0x2;
            if(ht_rem)
            {

                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
                src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
                // row = 2
                src_bottom_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));

                //row 0 - row1
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_temp1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_temp0_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
                //row1 - row0
                edge1_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);
                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
                //row1 - row2
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
                edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
                //adding up and down subtraction
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
                //for the next iteration signup0_16x8b = -signdwn1_16x8b
                signup0_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b); //(2-1) for next
                src_top_16x8b = src_temp1_16x8b;

                //adding constant 2
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);

                //shuffle to get sao index
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);

                //adding chroma offset to access U and V halves of the table
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
                //shuffle to get sao offset
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
                //the next top already in src_top_16x8b
                //convert to 16 bit then add and then saturated pack
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp1_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, src_temp1_16x8b);

                cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);

                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
                // row = 1
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
                src_temp0_16x8b = src_bottom_16x8b;
                pu1_src_cpy += (src_strd << 1);

            }
            /* Leftover single row when ht is odd */
            ht_rem = ht & 0x1;
            if(ht_rem)
            {

                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
                src_bottom_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));

                //row 0 - row1
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
                //adding up and down subtraction
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
                //adding constant 2
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
                src_top_16x8b = src_temp0_16x8b;

                //clear the unused high 8 bytes so the shuffle indices are valid
                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8);
                edge0_16x8b = _mm_srli_si128(edge0_16x8b, 8);
                //shuffle to get sao index
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
                //adding chroma offset to access U and V halves of the table
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
                //shuffle to get sao offset
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);

                //convert to 16 bit then add and then saturated pack
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, const0_16x8b);
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
                pu1_src_cpy += (src_strd);

            }
            if(0 == pu1_avail[3])
            {
                src_top_16x8b = src_bottom_16x8b;
            }
            //updating top flag
            _mm_storel_epi64((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
            pu1_src += 8;
        }
    }
}
2475
2476
/* 135 degree filtering */
2477
void ihevc_sao_edge_offset_class2_ssse3(UWORD8 *pu1_src,
2478
                                        WORD32 src_strd,
2479
                                        UWORD8 *pu1_src_left,
2480
                                        UWORD8 *pu1_src_top,
2481
                                        UWORD8 *pu1_src_top_left,
2482
                                        UWORD8 *pu1_src_top_right,
2483
                                        UWORD8 *pu1_src_bot_left,
2484
                                        UWORD8 *pu1_avail,
2485
                                        WORD8 *pi1_sao_offset,
2486
                                        WORD32 wd,
2487
                                        WORD32 ht)
2488
45.6k
{
2489
45.6k
    WORD32 row, col;
2490
45.6k
    UWORD8 *pu1_src_top_cpy, *pu1_src_left_cpy, *pu1_src_left_cpy2;
2491
45.6k
    UWORD8 *pu1_left_tmp, *pu1_src_left_str, *pu1_src_left_str2;
2492
45.6k
    UWORD8 *pu1_firstleft;
2493
45.6k
    UWORD8 *pu1_src_cpy, *pu1_src_org;
2494
45.6k
    UWORD8 au1_mask[MAX_CTB_SIZE], *au1_mask_cpy;
2495
45.6k
    UWORD8 au1_src_left_tmp[MAX_CTB_SIZE + 8];
2496
45.6k
    UWORD8 au1_src_left_tmp1[MAX_CTB_SIZE + 8];
2497
45.6k
    WORD32 wd_rem;
2498
45.6k
    UWORD8 u1_pos_0_0_tmp, u1_pos_wd_ht_tmp;
2499
45.6k
    WORD32 ht_tmp, ht_0;
2500
2501
45.6k
    WORD32 bit_depth;
2502
45.6k
    UWORD8 u1_avail0, u1_avail1;
2503
2504
45.6k
    __m128i src_top_16x8b, src_bottom_16x8b;
2505
45.6k
    __m128i src_temp0_16x8b, src_temp1_16x8b;
2506
45.6k
    __m128i signup0_16x8b, signdwn1_16x8b;
2507
45.6k
    __m128i cmp_gt0_16x8b, cmp_lt0_16x8b;
2508
45.6k
    __m128i edge0_16x8b, edge1_16x8b;
2509
45.6k
    __m128i au1_mask8x16b;
2510
45.6k
    __m128i edge_idx_8x16b, sao_offset_8x16b;
2511
45.6k
    __m128i const2_16x8b, const0_16x8b;
2512
45.6k
    __m128i left_store_16x8b;
2513
45.6k
    UNUSED(pu1_src_top_right);
2514
45.6k
    UNUSED(pu1_src_bot_left);
2515
2516
45.6k
    ht_0 = ht; ht_tmp = ht;
2517
45.6k
    au1_mask8x16b = _mm_set1_epi8(0xff);
2518
2519
    //setting availability mask to 0xFF for MAX_CTB_SIZE bytes
2520
228k
    for(col = 0; col < MAX_CTB_SIZE; col += 16)
2521
182k
        _mm_storeu_si128((__m128i *)(au1_mask + col), au1_mask8x16b);
2522
1.35M
    for(row = 0; row < ht; row++)
2523
1.30M
    {
2524
1.30M
        au1_src_left_tmp[row] = pu1_src_left[row];
2525
1.30M
    }
2526
45.6k
    bit_depth = BIT_DEPTH_LUMA;
2527
45.6k
    pu1_src_org = pu1_src;
2528
45.6k
    pu1_src_top_cpy = pu1_src_top;
2529
45.6k
    pu1_src_left_cpy2 = au1_src_left_tmp;
2530
45.6k
    pu1_src_left_cpy = au1_src_left_tmp;
2531
45.6k
    pu1_src_left_str2 = au1_src_left_tmp1;
2532
45.6k
    pu1_src_left_str = au1_src_left_tmp1;
2533
45.6k
    edge_idx_8x16b   = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
2534
45.6k
    sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset);
2535
2536
2537
    /* If top-left is available, process separately */
2538
45.6k
    if(0 != pu1_avail[4])
2539
35.1k
    {
2540
35.1k
        WORD8 edge_idx;
2541
2542
35.1k
        edge_idx = 2 + SIGN(pu1_src[0] - pu1_src_top_left[0]) +
2543
35.1k
                        SIGN(pu1_src[0] - pu1_src[1 + src_strd]);
2544
2545
35.1k
        edge_idx = gi1_table_edge_idx[edge_idx];
2546
2547
35.1k
        if(0 != edge_idx)
2548
8.82k
        {
2549
8.82k
            u1_pos_0_0_tmp = CLIP3(pu1_src[0] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
2550
8.82k
        }
2551
26.3k
        else
2552
26.3k
        {
2553
26.3k
            u1_pos_0_0_tmp = pu1_src[0];
2554
26.3k
        }
2555
35.1k
    }
2556
10.5k
    else
2557
10.5k
    {
2558
10.5k
        u1_pos_0_0_tmp = pu1_src[0];
2559
10.5k
    }
2560
2561
    /* If bottom-right is available, process separately */
2562
45.6k
    if(0 != pu1_avail[7])
2563
37.5k
    {
2564
37.5k
        WORD8 edge_idx;
2565
2566
37.5k
        edge_idx = 2 + SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd - 1 - src_strd]) +
2567
37.5k
                        SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 1 + src_strd]);
2568
2569
37.5k
        edge_idx = gi1_table_edge_idx[edge_idx];
2570
2571
37.5k
        if(0 != edge_idx)
2572
10.2k
        {
2573
10.2k
            u1_pos_wd_ht_tmp = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
2574
10.2k
        }
2575
27.3k
        else
2576
27.3k
        {
2577
27.3k
            u1_pos_wd_ht_tmp = pu1_src[wd - 1 + (ht - 1) * src_strd];
2578
27.3k
        }
2579
37.5k
    }
2580
8.09k
    else
2581
8.09k
    {
2582
8.09k
        u1_pos_wd_ht_tmp = pu1_src[wd - 1 + (ht - 1) * src_strd];
2583
8.09k
    }
2584
45.6k
    pu1_firstleft = pu1_src_top_left;
2585
2586
    /* Update height and source pointers based on the availability flags */
2587
45.6k
    if(0 == pu1_avail[2])
2588
7.83k
    {
2589
7.83k
        pu1_firstleft = pu1_src_left_cpy2;
2590
7.83k
        pu1_src_left_cpy2++;
2591
7.83k
        pu1_src_left_str2++;
2592
7.83k
        pu1_src_top_cpy = pu1_src;
2593
7.83k
        pu1_src += src_strd;
2594
7.83k
        ht--;
2595
7.83k
    }
2596
45.6k
    if(0 == pu1_avail[3])
2597
5.82k
    {
2598
5.82k
        ht--;
2599
5.82k
        ht_0--;
2600
5.82k
    }
2601
    //storing top left in a mmx register
2602
45.6k
    left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_firstleft);
2603
45.6k
    const2_16x8b = _mm_set1_epi8(2);
2604
45.6k
    const0_16x8b = _mm_setzero_si128();
2605
45.6k
    left_store_16x8b = _mm_slli_si128(left_store_16x8b, 15);
2606
    //update top-left
2607
45.6k
    *pu1_src_top_left = pu1_src_top[wd - 1];
2608
    //availability mask creation
2609
45.6k
    u1_avail0 = pu1_avail[0];
2610
45.6k
    u1_avail1 = pu1_avail[1];
2611
45.6k
    au1_mask[0] = u1_avail0;
2612
45.6k
    au1_mask[wd - 1] = u1_avail1;
2613
45.6k
    {
2614
45.6k
        WORD32 ht_rem;
2615
2616
2617
45.6k
        pu1_src_left_cpy = pu1_src_left_cpy2;
2618
45.6k
        pu1_src_left_str = pu1_src_left_str2;
2619
45.6k
        au1_mask_cpy = au1_mask;
2620
105k
        for(col = wd; col >= 16; col -= 16)
2621
59.8k
        {
2622
59.8k
            pu1_src_cpy = pu1_src;
2623
59.8k
            src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
2624
            //row = 0
2625
59.8k
            src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
2626
59.8k
            src_top_16x8b = _mm_alignr_epi8(src_top_16x8b, left_store_16x8b, 15);
2627
            //loading the mask
2628
59.8k
            au1_mask8x16b = _mm_loadu_si128((__m128i *)au1_mask_cpy);
2629
            //separating +ve and -ve values.
2630
59.8k
            cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
2631
59.8k
            cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
2632
            //creating mask 00 for +ve and -ve values and FF for zero.
2633
59.8k
            cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2634
59.8k
            cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2635
            //combining the appropriate sign change
2636
59.8k
            signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
2637
2638
2639
979k
            for(row = ht; row >= 2; row -= 2)
2640
919k
            {
2641
919k
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
2642
                //row = 1
2643
919k
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
2644
                // row = 1 right
2645
919k
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 1));
2646
                //to insert left in row 0
2647
919k
                signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 15);
2648
                //row 0 -row1
2649
                //separating +ve and -ve values.
2650
919k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
2651
919k
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
2652
2653
                //creating mask 00 for +ve and -ve values and FF for zero.
2654
919k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2655
919k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2656
                //manipulation for row 1 - row 0
2657
919k
                signdwn1_16x8b = _mm_alignr_epi8(src_temp0_16x8b, signdwn1_16x8b, 15);
2658
                //combining the appropriate sign change
2659
919k
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(0-1)
2660
                //row1-row0
2661
                //separating +ve and -ve values.
2662
919k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
2663
919k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
2664
                //creating mask 00 for +ve and -ve values and FF for zero.
2665
919k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2666
919k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2667
                // row = 2 right
2668
919k
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd + 1));
2669
919k
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-0)
2670
2671
2672
                //row1 -bottom
2673
919k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
2674
919k
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
2675
                //creating mask 00 for +ve and -ve values and FF for zero.
2676
919k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2677
919k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2678
                //combining the appropriate sign change
2679
919k
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
2680
                // row = 2
2681
919k
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
2682
2683
                //combining sign-left and sign_right
2684
919k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
2685
2686
                //storing the row 1 left for next row.
2687
919k
                signup0_16x8b = _mm_slli_si128(left_store_16x8b, 14);
2688
2689
                //combining sign-left and sign_right
2690
919k
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, signdwn1_16x8b);
2691
                //manipulation for bottom - row 1
2692
919k
                signup0_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signup0_16x8b, 15);
2693
                //eliminating old left for row 0 and row 1
2694
919k
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
2695
                //bottom - row1
2696
919k
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signup0_16x8b);
2697
919k
                cmp_lt0_16x8b = _mm_subs_epu8(signup0_16x8b, src_bottom_16x8b);
2698
                //creating mask 00 for +ve and -ve values and FF for zero.
2699
919k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2700
919k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2701
                //for the next iteration bottom -row1
2702
919k
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
2703
                //row1  getting it right for left of next block
2704
919k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp1_16x8b, 15);
2705
                //adding constant 2
2706
919k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
2707
919k
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
2708
                //shuffle to get sao index
2709
919k
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
2710
919k
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
2711
                //using availability mask
2712
919k
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
2713
919k
                edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
2714
                //shuffle to get sao offset
2715
919k
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
2716
919k
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
2717
                //row0  getting it right for left of next block
2718
919k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
2719
                //copying the next top
2720
919k
                src_top_16x8b = src_temp1_16x8b;
2721
                //convert to 16 bit then add and then saturated pack
2722
919k
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
2723
919k
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
2724
919k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
2725
919k
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
2726
919k
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
2727
919k
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
2728
919k
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
2729
919k
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
2730
2731
919k
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
2732
919k
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
2733
919k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
2734
919k
                src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
2735
919k
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
2736
919k
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
2737
919k
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
2738
919k
                src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
2739
2740
                //store left boundary
2741
919k
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
2742
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
2743
919k
                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
2744
                // row = 1
2745
919k
                _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);
2746
2747
919k
                src_temp0_16x8b = src_bottom_16x8b;
2748
919k
                pu1_src_cpy += (src_strd << 1);
2749
919k
                pu1_src_left_cpy += 2;
2750
919k
                pu1_src_left_str += 2;
2751
919k
            }
2752
59.8k
            ht_rem = ht & 0x1;
2753
2754
59.8k
            if(ht_rem)
2755
14.4k
            {
2756
14.4k
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
2757
14.4k
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 1));
2758
                //current row -next row
2759
                //separating +ve and -ve values.
2760
14.4k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
2761
14.4k
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
2762
                //creating mask 00 for +ve and -ve values and FF for zero.
2763
14.4k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2764
14.4k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2765
                //combining the appropriate sign change
2766
14.4k
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
2767
                //adding top and bottom and constant 2
2768
14.4k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
2769
14.4k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
2770
                //eliminating old left for row 0 and row 1
2771
14.4k
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 1);
2772
2773
14.4k
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
2774
                //using availability mask
2775
14.4k
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
2776
2777
14.4k
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
2778
2779
                //row0  getting it right for left of next block
2780
14.4k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
2781
                //copying the next top
2782
14.4k
                src_top_16x8b = src_temp0_16x8b;
2783
                //convert to 16 bit then add and then saturated pack
2784
14.4k
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
2785
14.4k
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
2786
14.4k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
2787
14.4k
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
2788
14.4k
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
2789
14.4k
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
2790
14.4k
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
2791
14.4k
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
2792
                //store left boundary
2793
14.4k
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
2794
2795
14.4k
                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
2796
14.4k
                pu1_src_cpy += (src_strd);
2797
14.4k
                pu1_src_left_cpy += 1;
2798
14.4k
                pu1_src_left_str += 1;
2799
14.4k
            }
2800
59.8k
            if(0 == pu1_avail[3])
2801
6.56k
            {
2802
6.56k
                src_top_16x8b = src_bottom_16x8b;
2803
6.56k
                pu1_src_left_str[0] = pu1_src_cpy[15];
2804
6.56k
            }
2805
59.8k
            if(0 == pu1_avail[2])
2806
9.78k
            {
2807
9.78k
                pu1_src_left_str[-ht_0] = pu1_src[15 - src_strd];
2808
9.78k
            }
2809
2810
            //for the top left of next part of the block
2811
59.8k
            left_store_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
2812
            //updating top flag
2813
59.8k
            _mm_storeu_si128((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
2814
59.8k
            pu1_src += 16;
2815
59.8k
            au1_mask_cpy += 16;
2816
2817
2818
59.8k
            pu1_left_tmp = pu1_src_left_cpy2;
2819
59.8k
            pu1_src_left_cpy2 = pu1_src_left_str2;
2820
59.8k
            pu1_src_left_str2 = pu1_left_tmp;
2821
2822
59.8k
            pu1_src_left_cpy = pu1_src_left_cpy2;
2823
59.8k
            pu1_src_left_str = pu1_src_left_str2;
2824
59.8k
        }
2825
2826
45.6k
        wd_rem = wd & 0xF;
2827
45.6k
        if(wd_rem)
2828
44.6k
        {
2829
44.6k
            pu1_src_left_cpy = pu1_src_left_cpy2;
2830
44.6k
            pu1_src_left_str = pu1_src_left_str2;
2831
44.6k
            pu1_src_cpy = pu1_src;
2832
44.6k
            src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_top_cpy + wd - col));
2833
            //row = 0
2834
44.6k
            src_temp0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
2835
44.6k
            src_top_16x8b = _mm_alignr_epi8(src_top_16x8b, left_store_16x8b, 15);
2836
44.6k
            au1_mask8x16b = _mm_loadl_epi64((__m128i *)au1_mask_cpy); //????
2837
            //separating +ve and -ve values.
2838
44.6k
            cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
2839
44.6k
            cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
2840
            //creating mask 00 for +ve and -ve values and FF for zero.
2841
44.6k
            cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2842
44.6k
            cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2843
            //preparing au1_mask
2844
44.6k
            au1_mask8x16b = _mm_unpacklo_epi64(au1_mask8x16b, au1_mask8x16b);
2845
            //combining the appropriate sign change
2846
44.6k
            signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
2847
44.6k
            signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
2848
2849
350k
            for(row = ht; row >= 4; row -= 4)
2850
306k
            {
2851
306k
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
2852
                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
2853
306k
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
2854
                // row = 2
2855
306k
                src_bottom_16x8b =  _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
2856
                //right row1
2857
306k
                signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 1);
2858
                //row 0 -row1
2859
                //separating +ve and and -ve values.
2860
306k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
2861
306k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
2862
                //manipulation for row 1 -row 0
2863
306k
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 15);
2864
                //creating mask 00 for +ve and -ve values and FF for zero.
2865
306k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2866
306k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2867
                //row 0 left
2868
306k
                signdwn1_16x8b = _mm_alignr_epi8(src_temp0_16x8b, signdwn1_16x8b, 15);
2869
                //combining the appropriate sign change
2870
306k
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
2871
                //row 1 -row0
2872
                //separating +ve and -ve values.
2873
306k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
2874
306k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
2875
2876
                //creating mask 00 for +ve and -ve values and FF for zero.
2877
306k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2878
306k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2879
                //row1-row0
2880
306k
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
2881
2882
306k
                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
2883
2884
306k
                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
2885
                //right row2
2886
306k
                signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 1);
2887
                //packing row 0 n row 1
2888
306k
                src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, src_temp1_16x8b);
2889
                //row1 -row2
2890
306k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
2891
306k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
2892
                //creating mask 00 for +ve and -ve values and FF for zero.
2893
306k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2894
306k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2895
                //combining the appropriate sign change
2896
306k
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
2897
306k
                edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
2898
                //manipulation for row 2 -row 1
2899
306k
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 14);
2900
                //row 1 left
2901
306k
                signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 15);
2902
                //row = 3
2903
306k
                src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 3 * src_strd));
2904
2905
                // row = 4
2906
306k
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 4 * src_strd));
2907
2908
306k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
2909
2910
                //separating +ve and -ve values.(2,1)
2911
306k
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
2912
306k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
2913
                //manipulation for row 3 -row 2
2914
306k
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 13);
2915
                //creating mask 00 for +ve and -ve values and FF for zero.
2916
306k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2917
306k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2918
                //row 2 left
2919
306k
                signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 15);
2920
                //combining the appropriate sign change
2921
306k
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1)
2922
2923
                //separating +ve and -ve values.(3,2)
2924
306k
                cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
2925
306k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
2926
306k
                signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8); //aligned left (2-1)
2927
                //creating mask 00 for +ve and -ve values and FF for zero.
2928
306k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2929
306k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2930
                //right row3
2931
306k
                signdwn1_16x8b = _mm_srli_si128(src_top_16x8b, 1);
2932
                //combining the appropriate sign change
2933
306k
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-2)
2934
2935
306k
                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(3-2) ,(2-1)
2936
2937
                //separating +ve and -ve values.(2,3)
2938
306k
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
2939
306k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
2940
                //right row 4
2941
306k
                signdwn1_16x8b =  _mm_srli_si128(src_temp1_16x8b, 1);
2942
                //creating mask 00 for +ve and -ve values and FF for zero.
2943
306k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2944
306k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2945
                //combining the appropriate sign change
2946
306k
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-3)
2947
2948
                //separating +ve and -ve values.(3,bottom)
2949
306k
                cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
2950
306k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
2951
2952
                //creating mask 00 for +ve and -ve values and FF for zero.
2953
306k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2954
306k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2955
306k
                edge1_16x8b = _mm_slli_si128(edge1_16x8b, 8); //aligned left (2-3)
2956
                //combining the appropriate sign change
2957
306k
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-bottom)
2958
306k
                edge1_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge1_16x8b, 8); //(3-bottom),(2-3)
2959
2960
                //manipulation for bottom -row 3
2961
306k
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 12);
2962
                //eliminating old left for row 0,1,2,3
2963
306k
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 4);
2964
                //packing row 2 n row 3
2965
306k
                src_bottom_16x8b = _mm_unpacklo_epi64(src_bottom_16x8b, src_top_16x8b);
2966
                //row 3 left
2967
306k
                signdwn1_16x8b = _mm_alignr_epi8(src_top_16x8b, signdwn1_16x8b, 15);
2968
                //loading row 3 right into left
2969
306k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_bottom_16x8b, 15);
2970
                //adding bottom and top values of row 2 and row 3
2971
306k
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, signup0_16x8b); //(3,2)
2972
                //separating +ve and -ve values.(bottom,3)
2973
306k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
2974
306k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
2975
                //to store right of row 2
2976
306k
                signdwn1_16x8b = _mm_slli_si128(src_bottom_16x8b, 8);
2977
                //creating mask 00 for +ve and -ve values and FF for zero.
2978
306k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2979
306k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2980
306k
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(bottom -3) for next iteration
2981
2982
                //storing right of row 2 into left
2983
306k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
2984
                //to store right of row 0
2985
306k
                signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
2986
                //storing right of row 1 into left
2987
306k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
2988
2989
                //adding constant 2
2990
306k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
2991
306k
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
2992
                //shuffle to get sao index
2993
306k
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
2994
306k
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
2995
                //using availability mask
2996
306k
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
2997
306k
                edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
2998
                //shuffle to get sao offset
2999
306k
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
3000
306k
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
3001
3002
                //storing right of row 0 into left
3003
306k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
3004
                //convert to 16 bit then add and then saturated pack
3005
306k
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
3006
306k
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
3007
306k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
3008
306k
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
3009
306k
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
3010
306k
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
3011
306k
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
3012
306k
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
3013
3014
306k
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
3015
306k
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_bottom_16x8b, const0_16x8b);
3016
306k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
3017
306k
                src_bottom_16x8b = _mm_unpackhi_epi8(src_bottom_16x8b, const0_16x8b);
3018
306k
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
3019
306k
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
3020
306k
                src_bottom_16x8b = _mm_add_epi16(src_bottom_16x8b, cmp_lt0_16x8b);
3021
306k
                src_bottom_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_bottom_16x8b);
3022
3023
306k
                cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
3024
306k
                cmp_lt0_16x8b = _mm_srli_si128(src_bottom_16x8b, 8);
3025
3026
306k
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
3027
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
3028
306k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
3029
                // row = 1
3030
306k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
3031
                //row = 2
3032
306k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_bottom_16x8b);
3033
                // row = 3
3034
306k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b);
3035
3036
306k
                src_temp0_16x8b = src_temp1_16x8b;
3037
306k
                signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
3038
306k
                pu1_src_cpy += (src_strd << 2);
3039
306k
                pu1_src_left_cpy += 4;
3040
306k
                pu1_src_left_str += 4;
3041
306k
            }
3042
44.6k
            ht_rem = ht & 0x2;
3043
44.6k
            if(ht_rem)
3044
12.7k
            {
3045
12.7k
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
3046
                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
3047
12.7k
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
3048
                // row = 2
3049
12.7k
                src_bottom_16x8b =  _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
3050
3051
                //row 0 -row 1
3052
12.7k
                signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 1);
3053
                //separating +ve and and -ve values.
3054
12.7k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
3055
12.7k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
3056
                //manipulation for row 1 -row 0
3057
12.7k
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 15);
3058
                //creating mask 00 for +ve and -ve values and FF for zero.
3059
12.7k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3060
12.7k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3061
                //manipulation for row 1 - row 0
3062
12.7k
                signdwn1_16x8b = _mm_alignr_epi8(src_temp0_16x8b, signdwn1_16x8b, 15);
3063
                //combining the appropriate sign change
3064
12.7k
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
3065
3066
                //row1-row0
3067
                //separating +ve and and -ve values.
3068
12.7k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
3069
12.7k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
3070
3071
                //creating mask 00 for +ve and -ve values and FF for zero.
3072
12.7k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3073
12.7k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3074
                //combining the appropriate sign chang
3075
12.7k
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
3076
                //row 1 -bottom
3077
12.7k
                signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 1);
3078
3079
12.7k
                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
3080
12.7k
                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
3081
                //row1 -bottom
3082
12.7k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
3083
12.7k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
3084
3085
                //creating mask 00 for +ve and -ve values and FF for zero.
3086
12.7k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3087
12.7k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3088
                //combining the appropriate sign change
3089
12.7k
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
3090
12.7k
                edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
3091
                //manipulation for bottom -row1
3092
12.7k
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 14);
3093
                //manipulation for bottom- row 1
3094
12.7k
                signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 15);
3095
                //adding top and down substraction
3096
12.7k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
3097
                //bottom - row 1
3098
12.7k
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
3099
12.7k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
3100
3101
                //eliminating old left for row 0,1
3102
12.7k
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
3103
12.7k
                signdwn1_16x8b = _mm_slli_si128(src_temp1_16x8b, 8);
3104
                //creating mask 00 for +ve and -ve values and FF for zero.
3105
12.7k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3106
12.7k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3107
                //for the next iteration signup0_16x8b
3108
12.7k
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1) for next
3109
3110
                //storing right of row 1 into left
3111
12.7k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
3112
                //for storing right of row 1
3113
12.7k
                signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
3114
3115
12.7k
                src_top_16x8b = src_temp1_16x8b;
3116
                //storing right of row 0 into left
3117
12.7k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
3118
3119
                //adding constant 2
3120
12.7k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
3121
3122
                //shuffle to get sao index
3123
12.7k
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
3124
                //using availability mask
3125
12.7k
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
3126
                //shuffle to get sao offset
3127
12.7k
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
3128
3129
                //the next top already in  src_top_16x8b
3130
                //cnvert to 16 bit then add and then saturated pack
3131
12.7k
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
3132
12.7k
                src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
3133
12.7k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
3134
12.7k
                src_temp1_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
3135
12.7k
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
3136
12.7k
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
3137
12.7k
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
3138
12.7k
                src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, src_temp1_16x8b);
3139
3140
12.7k
                cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
3141
3142
12.7k
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
3143
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
3144
12.7k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
3145
                // row = 1
3146
12.7k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
3147
12.7k
                src_temp0_16x8b = src_bottom_16x8b;
3148
12.7k
                pu1_src_cpy += (src_strd << 1);
3149
12.7k
                pu1_src_left_cpy += 2;
3150
12.7k
                pu1_src_left_str += 2;
3151
12.7k
            }
3152
44.6k
            ht_rem = ht & 0x1;
3153
44.6k
            if(ht_rem)
3154
12.0k
            {
3155
12.0k
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
3156
                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
3157
12.0k
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
3158
                //left store manipulation 1
3159
12.0k
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 1);
3160
                //row 0 -row1
3161
12.0k
                signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 1);
3162
                //separating +ve and and -ve values.
3163
12.0k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
3164
12.0k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
3165
                //creating mask 00 for +ve and -ve values and FF for zero.
3166
12.0k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3167
12.0k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3168
                //combining the appropriate sign change
3169
12.0k
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
3170
                //adding top and down substraction
3171
12.0k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
3172
                //for row 0 right to put into left store
3173
12.0k
                signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
3174
                //adding constant 2
3175
12.0k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
3176
12.0k
                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8);
3177
12.0k
                edge0_16x8b = _mm_srli_si128(edge0_16x8b, 8);
3178
                //filling the left boundary value
3179
12.0k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
3180
3181
                //shuffle to get sao index
3182
12.0k
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
3183
                //using availability mask
3184
12.0k
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
3185
                //shuffle to get sao offset
3186
12.0k
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
3187
12.0k
                src_top_16x8b = src_temp0_16x8b;
3188
                //cnvert to 16 bit then add and then saturated pack
3189
12.0k
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
3190
12.0k
                src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
3191
12.0k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
3192
12.0k
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
3193
12.0k
                src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, const0_16x8b);
3194
3195
12.0k
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
3196
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
3197
12.0k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
3198
12.0k
                pu1_src_cpy += (src_strd);
3199
12.0k
                pu1_src_left_cpy += 1;
3200
12.0k
                pu1_src_left_str += 1;
3201
12.0k
            }
3202
44.6k
            if(0 == pu1_avail[3])
3203
5.75k
            {
3204
5.75k
                src_top_16x8b = src_bottom_16x8b;
3205
5.75k
                pu1_src_left_str[0] = pu1_src_cpy[7];
3206
5.75k
            }
3207
3208
44.6k
            if(0 == pu1_avail[2])
3209
7.62k
            {
3210
7.62k
                pu1_src_left_str[-ht_0] = pu1_src[7 - src_strd];
3211
7.62k
            }
3212
3213
44.6k
            _mm_storel_epi64((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
3214
44.6k
            pu1_src += 8;
3215
44.6k
            au1_mask_cpy += 16;
3216
3217
44.6k
            pu1_left_tmp = pu1_src_left_cpy2;
3218
44.6k
            pu1_src_left_cpy2 = pu1_src_left_str2;
3219
44.6k
            pu1_src_left_str2 = pu1_left_tmp;
3220
3221
44.6k
            pu1_src_left_cpy = pu1_src_left_cpy2;
3222
44.6k
            pu1_src_left_str = pu1_src_left_str2;
3223
44.6k
        }
3224
45.6k
        pu1_src_org[0] = u1_pos_0_0_tmp;
3225
45.6k
        pu1_src_org[wd - 1 + (ht_tmp - 1) * src_strd] = u1_pos_wd_ht_tmp;
3226
45.6k
        pu1_src_left_cpy = (0 == pu1_avail[2]) ? (pu1_src_left_cpy - 1) : pu1_src_left_cpy;
3227
1.35M
        for(row = 0; row < ht_tmp; row++)
3228
1.30M
        {
3229
1.30M
            pu1_src_left[row] = pu1_src_left_cpy[row];
3230
1.30M
        }
3231
45.6k
    }
3232
3233
45.6k
}
3234
3235
/* 135 degree filtering */
3236
void ihevc_sao_edge_offset_class2_chroma_ssse3(UWORD8 *pu1_src,
3237
                                               WORD32 src_strd,
3238
                                               UWORD8 *pu1_src_left,
3239
                                               UWORD8 *pu1_src_top,
3240
                                               UWORD8 *pu1_src_top_left,
3241
                                               UWORD8 *pu1_src_top_right,
3242
                                               UWORD8 *pu1_src_bot_left,
3243
                                               UWORD8 *pu1_avail,
3244
                                               WORD8 *pi1_sao_offset_u,
3245
                                               WORD8 *pi1_sao_offset_v,
3246
                                               WORD32 wd,
3247
                                               WORD32 ht)
3248
38.6k
{
3249
38.6k
    WORD32 row, col;
3250
38.6k
    UWORD8 *pu1_src_top_cpy, *pu1_src_left_cpy, *pu1_src_left_cpy2;
3251
38.6k
    UWORD8 *pu1_left_tmp, *pu1_src_left_str, *pu1_src_left_str2;
3252
38.6k
    UWORD8 *pu1_firstleft;
3253
38.6k
    UWORD8 *pu1_src_cpy, *pu1_src_org;
3254
38.6k
    UWORD8 au1_mask[MAX_CTB_SIZE], *au1_mask_cpy;
3255
38.6k
    UWORD8 au1_src_left_tmp[2 * (MAX_CTB_SIZE + 8)];
3256
38.6k
    UWORD8 au1_src_left_tmp1[2 * (MAX_CTB_SIZE + 8)];
3257
38.6k
    WORD32 wd_rem;
3258
38.6k
    UWORD8 u1_pos_0_0_tmp_u, u1_pos_0_0_tmp_v, u1_pos_wd_ht_tmp_u, u1_pos_wd_ht_tmp_v;
3259
38.6k
    WORD32 ht_tmp;
3260
38.6k
    WORD32 ht_0;
3261
3262
38.6k
    WORD32 bit_depth;
3263
38.6k
    UWORD8 u1_avail0, u1_avail1;
3264
3265
38.6k
    __m128i src_temp0_16x8b, src_temp1_16x8b;
3266
38.6k
    __m128i signup0_16x8b, signdwn1_16x8b;
3267
38.6k
    __m128i cmp_gt0_16x8b, cmp_lt0_16x8b;
3268
38.6k
    __m128i edge0_16x8b, edge1_16x8b;
3269
38.6k
    __m128i src_top_16x8b, src_bottom_16x8b;
3270
38.6k
    __m128i au1_mask8x16b;
3271
38.6k
    __m128i edge_idx_8x16b, sao_offset_8x16b;
3272
38.6k
    __m128i const2_16x8b, const0_16x8b;
3273
38.6k
    __m128i left_store_16x8b;
3274
38.6k
    __m128i chroma_offset_8x16b;
3275
3276
38.6k
    UNUSED(pu1_src_top_right);
3277
38.6k
    UNUSED(pu1_src_bot_left);
3278
3279
38.6k
    ht_0 = ht; ht_tmp = ht;
3280
38.6k
    au1_mask8x16b = _mm_set1_epi8(0xff);
3281
    /* Updating left and top-left  */
3282
1.18M
    for(row = 0; row < 2 * ht; row++)
3283
1.14M
    {
3284
1.14M
        au1_src_left_tmp[row] = pu1_src_left[row];
3285
1.14M
    }
3286
    //setting availability mask to ff size MAX_CTB_SIZE
3287
193k
    for(col = 0; col < MAX_CTB_SIZE; col += 16)
3288
154k
        _mm_storeu_si128((__m128i *)(au1_mask + col), au1_mask8x16b);
3289
38.6k
    bit_depth = BIT_DEPTH_LUMA;
3290
38.6k
    pu1_src_org = pu1_src;
3291
38.6k
    pu1_src_top_cpy = pu1_src_top;
3292
38.6k
    pu1_src_left_cpy2 = au1_src_left_tmp;
3293
38.6k
    pu1_src_left_cpy = au1_src_left_tmp;
3294
38.6k
    pu1_src_left_str2 = au1_src_left_tmp1;
3295
38.6k
    pu1_src_left_str = au1_src_left_tmp1;
3296
38.6k
    edge_idx_8x16b   = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
3297
38.6k
    sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_u);
3298
38.6k
    const0_16x8b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_v);
3299
38.6k
    chroma_offset_8x16b = _mm_set1_epi16(0x0800);
3300
3301
    /* If top-left is available, process separately */
3302
38.6k
    if(0 != pu1_avail[4])
3303
29.0k
    {
3304
29.0k
        WORD32 edge_idx;
3305
3306
        /* U */
3307
29.0k
        edge_idx = 2 + SIGN(pu1_src[0] - pu1_src_top_left[0]) +
3308
29.0k
                        SIGN(pu1_src[0] - pu1_src[2 + src_strd]);
3309
3310
29.0k
        edge_idx = gi1_table_edge_idx[edge_idx];
3311
3312
29.0k
        if(0 != edge_idx)
3313
6.69k
        {
3314
6.69k
            u1_pos_0_0_tmp_u = CLIP3(pu1_src[0] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1);
3315
6.69k
        }
3316
22.3k
        else
3317
22.3k
        {
3318
22.3k
            u1_pos_0_0_tmp_u = pu1_src[0];
3319
22.3k
        }
3320
3321
        /* V */
3322
29.0k
        edge_idx = 2 + SIGN(pu1_src[1] - pu1_src_top_left[1]) +
3323
29.0k
                        SIGN(pu1_src[1] - pu1_src[1 + 2 + src_strd]);
3324
3325
29.0k
        edge_idx = gi1_table_edge_idx[edge_idx];
3326
3327
29.0k
        if(0 != edge_idx)
3328
6.66k
        {
3329
6.66k
            u1_pos_0_0_tmp_v = CLIP3(pu1_src[1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1);
3330
6.66k
        }
3331
22.3k
        else
3332
22.3k
        {
3333
22.3k
            u1_pos_0_0_tmp_v = pu1_src[1];
3334
22.3k
        }
3335
29.0k
    }
3336
9.58k
    else
3337
9.58k
    {
3338
9.58k
        u1_pos_0_0_tmp_u = pu1_src[0];
3339
9.58k
        u1_pos_0_0_tmp_v = pu1_src[1];
3340
9.58k
    }
3341
3342
    /* If bottom-right is available, process separately */
3343
38.6k
    if(0 != pu1_avail[7])
3344
30.0k
    {
3345
30.0k
        WORD32 edge_idx;
3346
3347
        /* U */
3348
30.0k
        edge_idx = 2 + SIGN(pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 2 + (ht - 1) * src_strd - 2 - src_strd]) +
3349
30.0k
                        SIGN(pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 2 + (ht - 1) * src_strd + 2 + src_strd]);
3350
3351
30.0k
        edge_idx = gi1_table_edge_idx[edge_idx];
3352
3353
30.0k
        if(0 != edge_idx)
3354
6.94k
        {
3355
6.94k
            u1_pos_wd_ht_tmp_u = CLIP3(pu1_src[wd - 2 + (ht - 1) * src_strd] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1);
3356
6.94k
        }
3357
23.1k
        else
3358
23.1k
        {
3359
23.1k
            u1_pos_wd_ht_tmp_u = pu1_src[wd - 2 + (ht - 1) * src_strd];
3360
23.1k
        }
3361
3362
        /* V */
3363
30.0k
        edge_idx = 2 + SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd - 2 - src_strd]) +
3364
30.0k
                        SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 2 + src_strd]);
3365
3366
30.0k
        edge_idx = gi1_table_edge_idx[edge_idx];
3367
3368
30.0k
        if(0 != edge_idx)
3369
7.27k
        {
3370
7.27k
            u1_pos_wd_ht_tmp_v = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1);
3371
7.27k
        }
3372
22.7k
        else
3373
22.7k
        {
3374
22.7k
            u1_pos_wd_ht_tmp_v = pu1_src[wd - 1 + (ht - 1) * src_strd];
3375
22.7k
        }
3376
30.0k
    }
3377
8.56k
    else
3378
8.56k
    {
3379
8.56k
        u1_pos_wd_ht_tmp_u = pu1_src[wd - 2 + (ht - 1) * src_strd];
3380
8.56k
        u1_pos_wd_ht_tmp_v = pu1_src[wd - 1 + (ht - 1) * src_strd];
3381
8.56k
    }
3382
38.6k
    pu1_firstleft = pu1_src_top_left;
3383
3384
    /* Update height and source pointers based on the availability flags */
3385
38.6k
    if(0 == pu1_avail[2])
3386
6.72k
    {
3387
6.72k
        pu1_firstleft = pu1_src_left_cpy2;
3388
6.72k
        pu1_src_left_cpy2 += 2;
3389
6.72k
        pu1_src_left_str2 += 2;
3390
6.72k
        pu1_src_top_cpy = pu1_src;
3391
6.72k
        pu1_src += src_strd;
3392
6.72k
        ht--;
3393
6.72k
    }
3394
38.6k
    if(0 == pu1_avail[3])
3395
5.39k
    {
3396
5.39k
        ht--;
3397
5.39k
        ht_0--;
3398
5.39k
    }
3399
    //storing top left in a mmx register
3400
38.6k
    left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_firstleft);
3401
38.6k
    sao_offset_8x16b = _mm_unpacklo_epi64(sao_offset_8x16b, const0_16x8b);
3402
38.6k
    const2_16x8b = _mm_set1_epi8(2);
3403
38.6k
    const0_16x8b = _mm_setzero_si128();
3404
38.6k
    left_store_16x8b = _mm_slli_si128(left_store_16x8b, 14);
3405
3406
    //availability mask creation
3407
38.6k
    u1_avail0 = pu1_avail[0];
3408
38.6k
    u1_avail1 = pu1_avail[1];
3409
38.6k
    au1_mask[0] = u1_avail0;
3410
38.6k
    au1_mask[1] = u1_avail0;
3411
38.6k
    au1_mask[wd - 1] = u1_avail1;
3412
38.6k
    au1_mask[wd - 2] = u1_avail1;
3413
3414
    /* top-left arrays */
3415
38.6k
    pu1_src_top_left[0] = pu1_src_top[wd - 2];
3416
38.6k
    pu1_src_top_left[1] = pu1_src_top[wd - 1];
3417
38.6k
    {
3418
38.6k
        WORD32 ht_rem;
3419
38.6k
        au1_mask_cpy = au1_mask;
3420
3421
38.6k
        pu1_src_left_cpy = pu1_src_left_cpy2;
3422
38.6k
        pu1_src_left_str = pu1_src_left_str2;
3423
112k
        for(col = wd; col >= 16; col -= 16)
3424
73.9k
        {
3425
73.9k
            pu1_src_cpy = pu1_src;
3426
73.9k
            src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
3427
            //row = 0
3428
73.9k
            src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
3429
73.9k
            src_top_16x8b = _mm_alignr_epi8(src_top_16x8b, left_store_16x8b, 14);
3430
            //loading the mask
3431
73.9k
            au1_mask8x16b = _mm_loadu_si128((__m128i *)au1_mask_cpy);
3432
            //separating +ve and and -ve values.
3433
73.9k
            cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
3434
73.9k
            cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
3435
            //creating mask 00 for +ve and -ve values and FF for zero.
3436
73.9k
            cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3437
73.9k
            cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3438
            //combining the appropriate sign change
3439
73.9k
            signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
3440
3441
3442
627k
            for(row = ht; row >= 2; row -= 2)
3443
553k
            {
3444
553k
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
3445
                //row = 1
3446
553k
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
3447
                // row = 1 right
3448
553k
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 2));
3449
                //to insert left in row 0
3450
553k
                signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 14);
3451
                //row 0 -row1
3452
                //separating +ve and and -ve values.
3453
553k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
3454
553k
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
3455
3456
                //creating mask 00 for +ve and -ve values and FF for zero.
3457
553k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3458
553k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3459
                //manipulation for row 1 - row 0
3460
553k
                signdwn1_16x8b = _mm_alignr_epi8(src_temp0_16x8b, signdwn1_16x8b, 14);
3461
                //combining the appropriate sign change
3462
553k
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(0-1)
3463
                //row1-row0
3464
                //separating +ve and and -ve values.
3465
553k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
3466
553k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
3467
                //creating mask 00 for +ve and -ve values and FF for zero.
3468
553k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3469
553k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3470
                 // row = 2 right
3471
553k
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd + 2));
3472
553k
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-0)
3473
3474
3475
                //row1 -bottom
3476
553k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
3477
553k
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
3478
                //creating mask 00 for +ve and -ve values and FF for zero.
3479
553k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3480
553k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3481
                //combining the appropriate sign change
3482
553k
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
3483
                // row = 2
3484
553k
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
3485
3486
                //combining sign-left and sign_right
3487
553k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
3488
3489
                //storing the row 1 left for next row.
3490
553k
                signup0_16x8b = _mm_slli_si128(left_store_16x8b, 12);
3491
3492
                //combining sign-left and sign_right
3493
553k
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, signdwn1_16x8b);
3494
                //manipulation for bottom - row 1
3495
553k
                signup0_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signup0_16x8b, 14);
3496
                //eliminating old left for row 0 and row 1
3497
553k
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 4);
3498
                //bottom - row1
3499
553k
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signup0_16x8b);
3500
553k
                cmp_lt0_16x8b = _mm_subs_epu8(signup0_16x8b, src_bottom_16x8b);
3501
                //creating mask 00 for +ve and -ve values and FF for zero.
3502
553k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3503
553k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3504
                //for the next iteration bottom -row1
3505
553k
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
3506
                //row1  getting it right for left of next iteration
3507
553k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp1_16x8b, 14);
3508
                //copying the next top
3509
553k
                src_top_16x8b = src_temp1_16x8b;
3510
                //row0  getting its right for left of next iteration.
3511
553k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
3512
3513
3514
                //adding constant 2
3515
553k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
3516
553k
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
3517
                //shuffle to get sao index
3518
553k
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
3519
553k
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
3520
                //using availability mask
3521
553k
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
3522
553k
                edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
3523
                //adding chroma offset to access U and V
3524
553k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
3525
553k
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);
3526
3527
3528
                //shuffle to get sao offset
3529
553k
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
3530
553k
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
3531
                //cnvert to 16 bit then add and then saturated pack
3532
553k
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
3533
553k
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
3534
553k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
3535
553k
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
3536
553k
                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
3537
553k
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
3538
553k
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
3539
553k
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
3540
3541
553k
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
3542
553k
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
3543
553k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
3544
553k
                src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
3545
553k
                edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
3546
553k
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
3547
553k
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge1_16x8b);
3548
553k
                src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
3549
3550
                //store left boundary
3551
553k
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
3552
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
3553
553k
                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
3554
                // row = 1
3555
553k
                _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);
3556
3557
553k
                src_temp0_16x8b = src_bottom_16x8b;
3558
553k
                pu1_src_cpy += (src_strd << 1);
3559
553k
                pu1_src_left_cpy += 4;
3560
553k
                pu1_src_left_str += 4;
3561
553k
            }
3562
73.9k
            ht_rem = ht & 0x1;
3563
3564
73.9k
            if(ht_rem)
3565
18.7k
            {
3566
18.7k
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
3567
18.7k
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 2));
3568
                //current row -next row
3569
                //separating +ve and and -ve values.
3570
18.7k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
3571
18.7k
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
3572
                //creating mask 00 for +ve and -ve values and FF for zero.
3573
18.7k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3574
18.7k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3575
                //combining the appropriate sign change
3576
18.7k
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
3577
                //adding top and botton and constant 2
3578
18.7k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
3579
18.7k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
3580
3581
                //eliminating old left for row 0 and row 1
3582
18.7k
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
3583
                //copying the next top
3584
18.7k
                src_top_16x8b = src_temp0_16x8b;
3585
                //row0  getting it right for left of next block
3586
18.7k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
3587
3588
18.7k
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
3589
                //using availability mask
3590
18.7k
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
3591
                //adding chroma offset to access U and V
3592
18.7k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
3593
3594
18.7k
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
3595
3596
                //cnvert to 16 bit then add and then saturated pack
3597
18.7k
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
3598
18.7k
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
3599
18.7k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
3600
18.7k
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
3601
18.7k
                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
3602
18.7k
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
3603
18.7k
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
3604
18.7k
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
3605
3606
18.7k
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
3607
3608
18.7k
                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
3609
18.7k
                pu1_src_cpy += (src_strd);
3610
18.7k
                pu1_src_left_cpy += 2;
3611
18.7k
                pu1_src_left_str += 2;
3612
18.7k
            }
3613
73.9k
            if(0 == pu1_avail[3])
3614
9.63k
            {
3615
9.63k
                src_top_16x8b = src_bottom_16x8b;
3616
9.63k
                pu1_src_left_str[1] = pu1_src_cpy[15];
3617
9.63k
                pu1_src_left_str[0] = pu1_src_cpy[14];
3618
9.63k
            }
3619
73.9k
            if(0 == pu1_avail[2])
3620
11.9k
            {
3621
11.9k
                pu1_src_left_str[-2 * ht_0] = pu1_src[14 - src_strd];
3622
11.9k
                pu1_src_left_str[-2 * ht_0 + 1] = pu1_src[15 - src_strd];
3623
11.9k
            }
3624
3625
            //for the top left of next part of the block
3626
73.9k
            left_store_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
3627
            //updating top flag
3628
73.9k
            _mm_storeu_si128((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
3629
73.9k
            pu1_src += 16;
3630
73.9k
            au1_mask_cpy += 16;
3631
3632
73.9k
            pu1_left_tmp = pu1_src_left_cpy2;
3633
73.9k
            pu1_src_left_cpy2 = pu1_src_left_str2;
3634
73.9k
            pu1_src_left_str2 = pu1_left_tmp;
3635
3636
73.9k
            pu1_src_left_cpy = pu1_src_left_cpy2;
3637
73.9k
            pu1_src_left_str = pu1_src_left_str2;
3638
73.9k
        }
3639
38.6k
        wd_rem = wd & 0xF;
3640
38.6k
        if(wd_rem)
3641
842
        {
3642
842
            pu1_src_left_cpy = pu1_src_left_cpy2;
3643
842
            pu1_src_left_str = pu1_src_left_str2;
3644
842
            pu1_src_cpy = pu1_src;
3645
842
            src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_top_cpy + wd - col));
3646
            //row = 0
3647
842
            src_temp0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
3648
842
            src_top_16x8b = _mm_alignr_epi8(src_top_16x8b, left_store_16x8b, 14);
3649
842
            au1_mask8x16b = _mm_loadl_epi64((__m128i *)au1_mask_cpy); //????
3650
            //separating +ve and and -ve values.
3651
842
            cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
3652
842
            cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
3653
            //creating mask 00 for +ve and -ve values and FF for zero.
3654
842
            cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3655
842
            cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3656
            //preparing au1_mask
3657
842
            au1_mask8x16b = _mm_unpacklo_epi64(au1_mask8x16b, au1_mask8x16b);
3658
            //combining the appropriate sign change
3659
842
            signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
3660
842
            signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
3661
3662
2.59k
            for(row = ht; row >= 4; row -= 4)
3663
1.75k
            {
3664
1.75k
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
3665
                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
3666
1.75k
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
3667
                // row = 2
3668
1.75k
                src_bottom_16x8b =  _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
3669
                //right row1
3670
1.75k
                signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);
3671
                //row 0 -row1
3672
                //separating +ve and and -ve values.
3673
1.75k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
3674
1.75k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
3675
                //manipulation for row 1 -row 0
3676
1.75k
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 14);
3677
                //creating mask 00 for +ve and -ve values and FF for zero.
3678
1.75k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3679
1.75k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3680
                //row 0 left
3681
1.75k
                signdwn1_16x8b = _mm_alignr_epi8(src_temp0_16x8b, signdwn1_16x8b, 14);
3682
                //combining the appropriate sign change
3683
1.75k
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
3684
                //row 1 -row0
3685
                //separating +ve and and -ve values.
3686
1.75k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
3687
1.75k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
3688
3689
                //creating mask 00 for +ve and -ve values and FF for zero.
3690
1.75k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3691
1.75k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3692
                //row1-row0
3693
1.75k
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
3694
3695
1.75k
                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
3696
3697
1.75k
                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
3698
                //right row2
3699
1.75k
                signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 2);
3700
                //packing row 0 n row 1
3701
1.75k
                src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, src_temp1_16x8b);
3702
                //row1 -row2
3703
1.75k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
3704
1.75k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
3705
                //creating mask 00 for +ve and -ve values and FF for zero.
3706
1.75k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3707
1.75k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3708
                //combining the appropriate sign change
3709
1.75k
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
3710
1.75k
                edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
3711
                //manipulation for row 2 -row 1
3712
1.75k
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 12);
3713
                //row 1 left
3714
1.75k
                signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 14);
3715
                //row = 3
3716
1.75k
                src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 3 * src_strd));
3717
3718
                // row = 4
3719
1.75k
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 4 * src_strd));
3720
3721
1.75k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
3722
3723
                //separating +ve and and -ve values.(2,1)
3724
1.75k
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
3725
1.75k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
3726
                //manipulation for row 3 -row 2
3727
1.75k
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 10);
3728
                //creating mask 00 for +ve and -ve values and FF for zero.
3729
1.75k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3730
1.75k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3731
                //row 2 left
3732
1.75k
                signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 14);
3733
                //combining the appropriate sign change
3734
1.75k
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1)
3735
3736
                //separating +ve and and -ve values.(3,2)
3737
1.75k
                cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
3738
1.75k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
3739
1.75k
                signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8); //aligned left (2-1)
3740
                //creating mask 00 for +ve and -ve values and FF for zero.
3741
1.75k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3742
1.75k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3743
                //right row3
3744
1.75k
                signdwn1_16x8b = _mm_srli_si128(src_top_16x8b, 2);
3745
                //combining the appropriate sign change
3746
1.75k
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-2)
3747
3748
1.75k
                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(3-2) ,(2-1)
3749
3750
                //separating +ve and and -ve values.(2,3)
3751
1.75k
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
3752
1.75k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
3753
                //right row 4
3754
1.75k
                signdwn1_16x8b =  _mm_srli_si128(src_temp1_16x8b, 2);
3755
                //creating mask 00 for +ve and -ve values and FF for zero.
3756
1.75k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3757
1.75k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3758
                //combining the appropriate sign change
3759
1.75k
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-3)
3760
3761
                //separating +ve and and -ve values.(3,bottom)
3762
1.75k
                cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
3763
1.75k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
3764
3765
                //creating mask 00 for +ve and -ve values and FF for zero.
3766
1.75k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3767
1.75k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3768
1.75k
                edge1_16x8b = _mm_slli_si128(edge1_16x8b, 8); //aligned left (2-3)
3769
                //combining the appropriate sign change
3770
1.75k
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-bottom)
3771
1.75k
                edge1_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge1_16x8b, 8); //(3-bottom),(2-3)
3772
3773
                //manipulation for bottom -row 3
3774
1.75k
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 8);
3775
                //eliminating old left for row 0,1,2,3
3776
1.75k
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 8);
3777
                //packing row 2 n row 3
3778
1.75k
                src_bottom_16x8b = _mm_unpacklo_epi64(src_bottom_16x8b, src_top_16x8b);
3779
                //row 3 left
3780
1.75k
                signdwn1_16x8b = _mm_alignr_epi8(src_top_16x8b, signdwn1_16x8b, 14);
3781
3782
                //adding bottom and top values of row 2 and row 3
3783
1.75k
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, signup0_16x8b); //(3,2)
3784
                //separating +ve and and -ve values.(botttom,3)
3785
1.75k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
3786
1.75k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
3787
3788
                //creating mask 00 for +ve and -ve values and FF for zero.
3789
1.75k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3790
1.75k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3791
1.75k
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(bottom -3) for next iteration
3792
3793
                //to store right of row 2
3794
1.75k
                signdwn1_16x8b = _mm_slli_si128(src_bottom_16x8b, 8);
3795
                //loading row 3 right into left
3796
1.75k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_bottom_16x8b, 14);
3797
                //storing right of row 2into left
3798
1.75k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
3799
                //to store right of row 0
3800
1.75k
                signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
3801
                //storing right of row 1 into left
3802
1.75k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
3803
                //storing right of row 0 into left
3804
1.75k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
3805
3806
                //adding constant 2
3807
1.75k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
3808
1.75k
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
3809
                //shuffle to get sao index
3810
1.75k
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
3811
1.75k
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
3812
                //using availability mask
3813
1.75k
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
3814
1.75k
                edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
3815
3816
                //adding chroma offset to access U and V
3817
1.75k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
3818
1.75k
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);
3819
3820
                //shuffle to get sao offset
3821
1.75k
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
3822
1.75k
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
3823
                //cnvert to 16 bit then add and then saturated pack
3824
1.75k
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
3825
1.75k
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
3826
1.75k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
3827
1.75k
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
3828
1.75k
                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
3829
1.75k
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
3830
1.75k
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
3831
1.75k
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
3832
3833
1.75k
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
3834
1.75k
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_bottom_16x8b, const0_16x8b);
3835
1.75k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
3836
1.75k
                src_bottom_16x8b = _mm_unpackhi_epi8(src_bottom_16x8b, const0_16x8b);
3837
1.75k
                edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
3838
1.75k
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
3839
1.75k
                src_bottom_16x8b = _mm_add_epi16(src_bottom_16x8b, edge1_16x8b);
3840
1.75k
                src_bottom_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_bottom_16x8b);
3841
3842
1.75k
                cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
3843
1.75k
                cmp_lt0_16x8b = _mm_srli_si128(src_bottom_16x8b, 8);
3844
3845
3846
1.75k
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
3847
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
3848
1.75k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
3849
                // row = 1
3850
1.75k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
3851
                //row = 2
3852
1.75k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_bottom_16x8b);
3853
                // row = 3
3854
1.75k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b);
3855
3856
1.75k
                src_temp0_16x8b = src_temp1_16x8b;
3857
1.75k
                signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
3858
1.75k
                pu1_src_cpy += (src_strd << 2);
3859
1.75k
                pu1_src_left_cpy += 8;
3860
1.75k
                pu1_src_left_str += 8;
3861
1.75k
            }
3862
842
            ht_rem = ht & 0x2;
3863
842
            if(ht_rem)
3864
346
            {
3865
346
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
3866
                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
3867
346
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
3868
                // row = 2
3869
346
                src_bottom_16x8b =  _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
3870
3871
                //row 0 -row 1
3872
346
                signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);
3873
                //separating +ve and and -ve values.
3874
346
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
3875
346
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
3876
                //manipulation for row 1 -row 0
3877
346
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 14);
3878
                //creating mask 00 for +ve and -ve values and FF for zero.
3879
346
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3880
346
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3881
                //manipulation for row 1 - row 0
3882
346
                signdwn1_16x8b = _mm_alignr_epi8(src_temp0_16x8b, signdwn1_16x8b, 14);
3883
                //combining the appropriate sign change
3884
346
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
3885
3886
                //row1-row0
3887
                //separating +ve and and -ve values.
3888
346
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
3889
346
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
3890
3891
                //creating mask 00 for +ve and -ve values and FF for zero.
3892
346
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3893
346
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3894
                //combining the appropriate sign chang
3895
346
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
3896
                //row 1 -bottom
3897
346
                signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 2);
3898
3899
346
                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
3900
346
                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
3901
                //row1 -bottom
3902
346
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
3903
346
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
3904
3905
                //creating mask 00 for +ve and -ve values and FF for zero.
3906
346
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3907
346
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3908
                //combining the appropriate sign change
3909
346
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
3910
346
                edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
3911
                //manipulation for bottom -row1
3912
346
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 12);
3913
                //eliminating old left for row 0,1
3914
346
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 4);
3915
                //manipulation for bottom- row 1
3916
346
                signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 14);
3917
                //adding top and down substraction
3918
346
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
3919
                //bottom - row 1
3920
346
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
3921
346
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
3922
3923
                //shifting row 1
3924
346
                signdwn1_16x8b = _mm_slli_si128(src_temp1_16x8b, 8);
3925
                //creating mask 00 for +ve and -ve values and FF for zero.
3926
346
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3927
346
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3928
                //for the next iteration signup0_16x8b
3929
346
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1) for next
3930
                //storing right of row 1 into left
3931
346
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14); //for storing right of row 0
3932
346
                signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
3933
                //the next top  in  src_top_16x8b
3934
346
                src_top_16x8b = src_temp1_16x8b;
3935
                //storing right of row 0 into left
3936
346
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
3937
3938
3939
                //adding constant 2
3940
346
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
3941
3942
                //shuffle to get sao index
3943
346
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
3944
                //using availability mask
3945
346
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
3946
3947
                //adding chroma offset to access U and V
3948
346
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
3949
3950
                //shuffle to get sao offset
3951
346
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
3952
                //the next top already in  src_top_16x8b
3953
                //cnvert to 16 bit then add and then saturated pack
3954
346
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
3955
346
                src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
3956
346
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
3957
346
                src_temp1_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
3958
346
                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
3959
346
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
3960
346
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge0_16x8b);
3961
346
                src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, src_temp1_16x8b);
3962
3963
346
                cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
3964
3965
346
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
3966
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
3967
346
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
3968
                // row = 1
3969
346
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
3970
346
                src_temp0_16x8b = src_bottom_16x8b;
3971
346
                pu1_src_cpy += (src_strd << 1);
3972
346
                pu1_src_left_cpy += 4;
3973
346
                pu1_src_left_str += 4;
3974
346
            }
3975
842
            ht_rem = ht & 0x1;
3976
842
            if(ht_rem)
3977
190
            {
3978
190
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
3979
                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
3980
190
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
3981
3982
                //row 0 -row1
3983
190
                signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 2);
3984
                //separating +ve and and -ve values.
3985
190
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
3986
190
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
3987
                //creating mask 00 for +ve and -ve values and FF for zero.
3988
190
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3989
190
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3990
                //combining the appropriate sign change
3991
190
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
3992
                //adding top and down substraction
3993
190
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
3994
3995
                //for row 0 right to put into left store
3996
190
                signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
3997
                //left store manipulation 1
3998
190
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
3999
190
                src_top_16x8b = src_temp0_16x8b;
4000
                //filling the left boundary value
4001
190
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
4002
4003
                //adding constant 2
4004
190
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
4005
190
                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8);
4006
190
                edge0_16x8b = _mm_srli_si128(edge0_16x8b, 8);
4007
4008
4009
                //shuffle to get sao index
4010
190
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
4011
                //using availability mask
4012
190
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
4013
                //adding chroma offset to access U and V
4014
190
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
4015
4016
                //shuffle to get sao offset
4017
190
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
4018
4019
                //cnvert to 16 bit then add and then saturated pack
4020
190
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
4021
190
                src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
4022
190
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
4023
190
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
4024
190
                src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, const0_16x8b);
4025
4026
190
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
4027
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
4028
190
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
4029
190
                pu1_src_cpy += (src_strd);
4030
190
                pu1_src_left_cpy += 2;
4031
190
                pu1_src_left_str += 2;
4032
190
            }
4033
842
            if(0 == pu1_avail[3])
4034
229
            {
4035
229
                src_top_16x8b = src_bottom_16x8b;
4036
229
                pu1_src_left_str[1] = pu1_src_cpy[7];
4037
229
                pu1_src_left_str[0] = pu1_src_cpy[6];
4038
229
            }
4039
4040
842
            if(0 == pu1_avail[2])
4041
273
            {
4042
273
                pu1_src_left_str[-2 * ht_0] = pu1_src[6 - src_strd];
4043
273
                pu1_src_left_str[-2 * ht_0 + 1] = pu1_src[7 - src_strd];
4044
273
            }
4045
4046
842
            _mm_storel_epi64((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
4047
842
            pu1_src += 8;
4048
4049
842
            pu1_left_tmp = pu1_src_left_cpy2;
4050
842
            pu1_src_left_cpy2 = pu1_src_left_str2;
4051
842
            pu1_src_left_str2 = pu1_left_tmp;
4052
4053
842
            pu1_src_left_cpy = pu1_src_left_cpy2;
4054
842
            pu1_src_left_str = pu1_src_left_str2;
4055
842
        }
4056
38.6k
        pu1_src_org[0] = u1_pos_0_0_tmp_u;
4057
38.6k
        pu1_src_org[1] = u1_pos_0_0_tmp_v;
4058
38.6k
        pu1_src_org[wd - 2 + (ht_tmp - 1) * src_strd] = u1_pos_wd_ht_tmp_u;
4059
38.6k
        pu1_src_org[wd - 1 + (ht_tmp - 1) * src_strd] = u1_pos_wd_ht_tmp_v;
4060
38.6k
        pu1_src_left_cpy = (0 == pu1_avail[2]) ? (pu1_src_left_cpy - 2) : pu1_src_left_cpy;
4061
1.18M
        for(row = 0; row < 2 * ht_tmp; row++)
4062
1.14M
        {
4063
1.14M
            pu1_src_left[row] = pu1_src_left_cpy[row];
4064
1.14M
        }
4065
38.6k
    }
4066
4067
38.6k
}
4068
4069
void ihevc_sao_edge_offset_class3_ssse3(UWORD8 *pu1_src,
4070
                                        WORD32 src_strd,
4071
                                        UWORD8 *pu1_src_left,
4072
                                        UWORD8 *pu1_src_top,
4073
                                        UWORD8 *pu1_src_top_left,
4074
                                        UWORD8 *pu1_src_top_right,
4075
                                        UWORD8 *pu1_src_bot_left,
4076
                                        UWORD8 *pu1_avail,
4077
                                        WORD8 *pi1_sao_offset,
4078
                                        WORD32 wd,
4079
                                        WORD32 ht)
4080
48.4k
{
4081
48.4k
    WORD32 row, col;
4082
48.4k
    UWORD8 *pu1_src_top_cpy, *pu1_src_left_cpy, *pu1_src_left_cpy2;
4083
48.4k
    UWORD8 *pu1_left_tmp, *pu1_src_left_str, *pu1_src_left_str2;
4084
48.4k
    UWORD8 *pu1_src_cpy, *pu1_src_org;
4085
48.4k
    UWORD8 au1_src_left_tmp[MAX_CTB_SIZE + 8];
4086
48.4k
    UWORD8 au1_src_left_tmp1[MAX_CTB_SIZE + 8];
4087
48.4k
    UWORD8 au1_mask[MAX_CTB_SIZE], *au1_mask_cpy;
4088
48.4k
    WORD32 wd_rem;
4089
48.4k
    UWORD8 u1_pos_wd_0_tmp, u1_pos_0_ht_tmp;
4090
48.4k
    WORD32 ht_tmp;
4091
48.4k
    WORD32 bit_depth;
4092
48.4k
    UWORD8 u1_avail0, u1_avail1;
4093
4094
48.4k
    __m128i src_top_16x8b, src_bottom_16x8b;
4095
48.4k
    __m128i src_temp0_16x8b, src_temp1_16x8b;
4096
48.4k
    __m128i signup0_16x8b, signdwn1_16x8b;
4097
48.4k
    __m128i cmp_gt0_16x8b, cmp_lt0_16x8b;
4098
48.4k
    __m128i edge0_16x8b, edge1_16x8b;
4099
48.4k
    __m128i au1_mask8x16b;
4100
48.4k
    __m128i edge_idx_8x16b, sao_offset_8x16b;
4101
48.4k
    __m128i const2_16x8b, const0_16x8b;
4102
48.4k
    __m128i left_store_16x8b;
4103
4104
48.4k
    ht_tmp = ht;
4105
48.4k
    au1_mask8x16b = _mm_set1_epi8(0xff);
4106
4107
48.4k
    au1_src_left_tmp[0] = pu1_src[(wd - 1)];
4108
    //manipulation for bottom left
4109
1.29M
    for(row = 1; row < ht; row++)
4110
1.24M
    {
4111
1.24M
        au1_src_left_tmp[row] = pu1_src_left[row];
4112
1.24M
    }
4113
48.4k
    au1_src_left_tmp[ht] = pu1_src_bot_left[0];
4114
4115
48.4k
    *pu1_src_top_left = pu1_src_top[wd - 1];
4116
    //setting availability mask to 0xFF for size MAX_CTB_SIZE
4117
241k
    for(col = 0; col < MAX_CTB_SIZE; col += 16)
4118
193k
        _mm_storeu_si128((__m128i *)(au1_mask + col), au1_mask8x16b);
4119
48.4k
    bit_depth = BIT_DEPTH_LUMA;
4120
48.4k
    pu1_src_org = pu1_src;
4121
48.4k
    pu1_src_top_cpy = pu1_src_top;
4122
48.4k
    pu1_src_left_cpy2 = au1_src_left_tmp;
4123
48.4k
    pu1_src_left_cpy = au1_src_left_tmp;
4124
48.4k
    pu1_src_left_str2 = au1_src_left_tmp1;
4125
48.4k
    pu1_src_left_str = au1_src_left_tmp1;
4126
48.4k
    edge_idx_8x16b   = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
4127
48.4k
    sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset);
4128
4129
    /* If top-right is available, process separately */
4130
48.4k
    if(0 != pu1_avail[5])
4131
36.7k
    {
4132
36.7k
        WORD32 edge_idx;
4133
4134
36.7k
        edge_idx = 2 + SIGN(pu1_src[wd - 1] - pu1_src_top_right[0]) +
4135
36.7k
                        SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 1 + src_strd]);
4136
4137
36.7k
        edge_idx = gi1_table_edge_idx[edge_idx];
4138
4139
36.7k
        if(0 != edge_idx)
4140
9.12k
        {
4141
9.12k
            u1_pos_wd_0_tmp = CLIP3(pu1_src[wd - 1] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
4142
9.12k
        }
4143
27.5k
        else
4144
27.5k
        {
4145
27.5k
            u1_pos_wd_0_tmp = pu1_src[wd - 1];
4146
27.5k
        }
4147
36.7k
    }
4148
11.6k
    else
4149
11.6k
    {
4150
11.6k
        u1_pos_wd_0_tmp = pu1_src[wd - 1];
4151
11.6k
    }
4152
4153
    /* If bottom-left is available, process separately */
4154
48.4k
    if(0 != pu1_avail[6])
4155
37.6k
    {
4156
37.6k
        WORD32 edge_idx;
4157
4158
37.6k
        edge_idx = 2 + SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd + 1 - src_strd]) +
4159
37.6k
                        SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0]);
4160
4161
37.6k
        edge_idx = gi1_table_edge_idx[edge_idx];
4162
4163
37.6k
        if(0 != edge_idx)
4164
9.91k
        {
4165
9.91k
            u1_pos_0_ht_tmp = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
4166
9.91k
        }
4167
27.7k
        else
4168
27.7k
        {
4169
27.7k
            u1_pos_0_ht_tmp = pu1_src[(ht - 1) * src_strd];
4170
27.7k
        }
4171
37.6k
    }
4172
10.7k
    else
4173
10.7k
    {
4174
10.7k
        u1_pos_0_ht_tmp = pu1_src[(ht - 1) * src_strd];
4175
10.7k
    }
4176
4177
4178
4179
    /* Update height and source pointers based on the availability flags */
4180
48.4k
    if(0 == pu1_avail[2])
4181
8.51k
    {
4182
8.51k
        pu1_src_left_cpy2++;
4183
8.51k
        pu1_src_left_str2++;
4184
8.51k
        pu1_src_top_cpy = pu1_src;
4185
8.51k
        pu1_src += src_strd;
4186
8.51k
        ht--;
4187
8.51k
    }
4188
48.4k
    if(0 == pu1_avail[3])
4189
6.20k
    {
4190
6.20k
        ht--;
4191
6.20k
    }
4192
4193
4194
48.4k
    const2_16x8b = _mm_set1_epi8(2);
4195
48.4k
    const0_16x8b = _mm_setzero_si128();
4196
4197
4198
    //availability mask creation
4199
48.4k
    u1_avail0 = pu1_avail[0];
4200
48.4k
    u1_avail1 = pu1_avail[1];
4201
48.4k
    au1_mask[0] = u1_avail0;
4202
48.4k
    au1_mask[wd - 1] = u1_avail1;
4203
48.4k
    {
4204
48.4k
        WORD32 ht_rem;
4205
4206
48.4k
        pu1_src_left_cpy = pu1_src_left_cpy2;
4207
48.4k
        pu1_src_left_str = pu1_src_left_str2;
4208
48.4k
        au1_mask_cpy = au1_mask;
4209
107k
        for(col = wd; col >= 16; col -= 16)
4210
58.6k
        {
4211
58.6k
            pu1_src_cpy = pu1_src;
4212
58.6k
            src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col + 1));
4213
            //row = 0
4214
58.6k
            src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
4215
4216
            //loading the mask
4217
58.6k
            au1_mask8x16b = _mm_loadu_si128((__m128i *)au1_mask_cpy);
4218
            //separating +ve and -ve values.
4219
58.6k
            cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
4220
58.6k
            cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
4221
            //creating mask 00 for +ve and -ve values and FF for zero.
4222
58.6k
            cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4223
58.6k
            cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4224
            //combining the appropriate sign change
4225
58.6k
            signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
4226
4227
942k
            for(row = ht; row >= 2; row -= 2)
4228
883k
            {
4229
883k
                left_store_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_left_cpy));
4230
                //row = 1
4231
883k
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
4232
                //to insert left in row 1
4233
883k
                signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 14);
4234
                // row = 0 right
4235
883k
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 1));
4236
4237
                //manipulation for row 1 - row 0
4238
883k
                signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 15);
4239
                //row 0 -row1
4240
                //separating +ve and -ve values.
4241
883k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
4242
883k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
4243
4244
                //creating mask 00 for +ve and -ve values and FF for zero.
4245
883k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4246
883k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4247
4248
                //combining the appropriate sign change
4249
883k
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(0-1)
4250
                //combining sign-left and sign_right
4251
883k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
4252
4253
                //row1-row0
4254
                //separating +ve and -ve values.
4255
883k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
4256
883k
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
4257
                //creating mask 00 for +ve and -ve values and FF for zero.
4258
883k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4259
883k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4260
4261
                // row = 2
4262
883k
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
4263
                // row = 1 right
4264
883k
                signdwn1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 1));
4265
883k
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-0)
4266
4267
                //bottom - row1
4268
883k
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
4269
883k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
4270
                //creating mask 00 for +ve and -ve values and FF for zero.
4271
883k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4272
883k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4273
                //for the next iteration bottom -row1
4274
883k
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
4275
4276
                //to insert left in row 1
4277
883k
                signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 13);
4278
                //manipulation for row 1 - bottom
4279
883k
                signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 15);
4280
4281
                //row1 -bottom
4282
883k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
4283
883k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
4284
                //creating mask 00 for +ve and -ve values and FF for zero.
4285
883k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4286
883k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4287
                //combining the appropriate sign change
4288
883k
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
4289
4290
                //combining sign-left and sign_right
4291
883k
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, signdwn1_16x8b);
4292
4293
                //eliminating old left for row 0 and row 1
4294
883k
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
4295
4296
                //row1  getting it right for left of next block
4297
883k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp1_16x8b, 15);
4298
                //adding constant 2
4299
883k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
4300
883k
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
4301
                //shuffle to get sao index
4302
883k
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
4303
883k
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
4304
                //using availability mask
4305
883k
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
4306
883k
                edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
4307
                //shuffle to get sao offset
4308
883k
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
4309
883k
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
4310
                //row0  getting it right for left of next block
4311
883k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
4312
                //copying the next top
4313
883k
                src_top_16x8b = src_temp1_16x8b;
4314
                //convert to 16 bit then add and then saturated pack
4315
883k
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
4316
883k
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
4317
883k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
4318
883k
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
4319
883k
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
4320
883k
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
4321
883k
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
4322
883k
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
4323
4324
883k
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
4325
883k
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
4326
883k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
4327
883k
                src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
4328
883k
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
4329
883k
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
4330
883k
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
4331
883k
                src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
4332
                //store left boundary
4333
883k
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
4334
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
4335
883k
                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
4336
                // row = 1
4337
883k
                _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);
4338
4339
883k
                src_temp0_16x8b = src_bottom_16x8b;
4340
883k
                pu1_src_cpy += (src_strd << 1);
4341
883k
                pu1_src_left_cpy += 2;
4342
883k
                pu1_src_left_str += 2;
4343
883k
            }
4344
58.6k
            ht_rem = ht & 0x1;
4345
4346
58.6k
            if(ht_rem)
4347
15.9k
            {
4348
15.9k
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
4349
15.9k
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
4350
                //to insert left in row 1
4351
15.9k
                signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 14);
4352
                //manipulation for row 1 - row 0
4353
15.9k
                signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 15);
4354
4355
                //current row -next row
4356
                //separating +ve and and -ve values.
4357
15.9k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
4358
15.9k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
4359
                //creating mask 00 for +ve and -ve values and FF for zero.
4360
15.9k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4361
15.9k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4362
                //combining the appropriate sign change
4363
15.9k
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
4364
                //adding top and bottom and constant 2
4365
15.9k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
4366
15.9k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
4367
                //eliminating old left for row 0 and row 1
4368
15.9k
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 1);
4369
4370
15.9k
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
4371
                //using availability mask
4372
15.9k
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
4373
4374
15.9k
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
4375
4376
                //row0  getting it right for left of next block
4377
15.9k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
4378
                //copying the next top
4379
15.9k
                src_top_16x8b = src_temp0_16x8b;
4380
                //convert to 16 bit then add and then saturated pack
4381
15.9k
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
4382
15.9k
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
4383
15.9k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
4384
15.9k
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
4385
15.9k
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
4386
15.9k
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
4387
15.9k
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
4388
15.9k
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
4389
                //store left boundary
4390
15.9k
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
4391
4392
15.9k
                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
4393
15.9k
                pu1_src_cpy += (src_strd);
4394
15.9k
                src_temp0_16x8b = src_bottom_16x8b;
4395
15.9k
                pu1_src_left_cpy++;
4396
15.9k
                pu1_src_left_str++;
4397
15.9k
            }
4398
58.6k
            {   //for bottom right
4399
58.6k
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
4400
58.6k
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 1);
4401
58.6k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
4402
58.6k
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
4403
58.6k
            }
4404
58.6k
            if(0 == pu1_avail[3])
4405
7.30k
            {
4406
7.30k
                src_top_16x8b = src_bottom_16x8b;
4407
7.30k
            }
4408
            //for the top left of next part of the block
4409
58.6k
            left_store_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
4410
            //updating top flag
4411
58.6k
            _mm_storeu_si128((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
4412
58.6k
            pu1_src += 16;
4413
58.6k
            au1_mask_cpy += 16;
4414
4415
58.6k
            pu1_left_tmp = pu1_src_left_cpy2;
4416
58.6k
            pu1_src_left_cpy2 = pu1_src_left_str2;
4417
58.6k
            pu1_src_left_str2 = pu1_left_tmp;
4418
4419
58.6k
            pu1_src_left_cpy = pu1_src_left_cpy2;
4420
58.6k
            pu1_src_left_str = pu1_src_left_str2;
4421
58.6k
        }
4422
4423
48.4k
        wd_rem = wd & 0xF;
4424
48.4k
        if(wd_rem)
4425
46.4k
        {
4426
46.4k
            pu1_src_cpy = pu1_src;
4427
46.4k
            pu1_src_left_cpy = pu1_src_left_cpy2;
4428
46.4k
            pu1_src_left_str = pu1_src_left_str2;
4429
46.4k
            src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_top_cpy + wd - col + 1));
4430
            //row = 0
4431
46.4k
            src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
4432
46.4k
            au1_mask8x16b = _mm_loadl_epi64((__m128i *)au1_mask_cpy); //????
4433
            //separating +ve and -ve values.
4434
46.4k
            cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
4435
46.4k
            cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
4436
            //creating mask 00 for +ve and -ve values and FF for zero.
4437
46.4k
            cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4438
46.4k
            cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4439
            //preparing au1_mask
4440
46.4k
            au1_mask8x16b = _mm_unpacklo_epi64(au1_mask8x16b, au1_mask8x16b);
4441
            //combining the appropriate sign change
4442
46.4k
            signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
4443
46.4k
            signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
4444
4445
341k
            for(row = ht; row >= 4; row -= 4)
4446
294k
            {
4447
294k
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
4448
                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
4449
294k
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
4450
                // row = 2
4451
294k
                src_bottom_16x8b =  _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
4452
                //manipulation for row 0 -row 1
4453
294k
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 14);
4454
                //row 1 left
4455
294k
                signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 15);
4456
                //row 0 -row1
4457
                //separating +ve and -ve values.
4458
294k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
4459
294k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
4460
4461
                //creating mask 00 for +ve and -ve values and FF for zero.
4462
294k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4463
294k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4464
                //manipulating for row 1 -row 0
4465
294k
                signdwn1_16x8b = _mm_srli_si128(src_temp0_16x8b, 1);
4466
                //combining the appropriate sign change
4467
294k
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
4468
                //row 1 -row0
4469
                //separating +ve and -ve values.
4470
294k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
4471
294k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
4472
4473
                //creating mask 00 for +ve and -ve values and FF for zero.
4474
294k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4475
294k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4476
                //row1-row0
4477
294k
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
4478
4479
294k
                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
4480
4481
294k
                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
4482
                //manipulation for row 1 -row 2
4483
294k
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 13);
4484
                //row 2 left
4485
294k
                signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 15);
4486
                //packing row 0 n row 1
4487
294k
                src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, src_temp1_16x8b);
4488
                //row1 -row2
4489
294k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
4490
294k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
4491
                //creating mask 00 for +ve and -ve values and FF for zero.
4492
294k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4493
294k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4494
                //combining the appropriate sign change
4495
294k
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
4496
294k
                edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
4497
4498
                //row 1 right
4499
294k
                signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 1);
4500
                //row = 3
4501
294k
                src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 3 * src_strd));
4502
4503
                // row = 4
4504
294k
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 4 * src_strd));
4505
4506
294k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
4507
4508
                //separating +ve and -ve values.(2,1)
4509
294k
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
4510
294k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
4511
4512
                //creating mask 00 for +ve and -ve values and FF for zero.
4513
294k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4514
294k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4515
                //row 2 right
4516
294k
                signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 1);
4517
                //combining the appropriate sign change
4518
294k
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1)
4519
4520
                //separating +ve and -ve values.(3,2)
4521
294k
                cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
4522
294k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
4523
294k
                signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8); //aligned left (2-1)
4524
                //creating mask 00 for +ve and -ve values and FF for zero.
4525
294k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4526
294k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4527
                //manipulation for row 2 -row 3
4528
294k
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 12);
4529
                //row 3 left
4530
294k
                signdwn1_16x8b = _mm_alignr_epi8(src_top_16x8b, signdwn1_16x8b, 15);
4531
                //combining the appropriate sign change
4532
294k
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-2)
4533
4534
294k
                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(3-2) ,(2-1)
4535
4536
                //separating +ve and -ve values.(2,3)
4537
294k
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
4538
294k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
4539
4540
                //manipulation for row 3 -bottom
4541
294k
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 11);
4542
                //bottom left
4543
294k
                signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 15);
4544
4545
                //creating mask 00 for +ve and -ve values and FF for zero.
4546
294k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4547
294k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4548
                //combining the appropriate sign change
4549
294k
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-3)
4550
4551
                //separating +ve and -ve values.(3,bottom)
4552
294k
                cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
4553
294k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
4554
4555
                //creating mask 00 for +ve and -ve values and FF for zero.
4556
294k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4557
294k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4558
294k
                edge1_16x8b = _mm_slli_si128(edge1_16x8b, 8); //aligned left (2-3)
4559
                //combining the appropriate sign change
4560
294k
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-bottom)
4561
294k
                edge1_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge1_16x8b, 8); //(3-bottom),(2-3)
4562
4563
4564
                //eliminating old left for row 0,1,2,3
4565
294k
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 4);
4566
                //packing row 2 n row 3
4567
294k
                src_bottom_16x8b = _mm_unpacklo_epi64(src_bottom_16x8b, src_top_16x8b);
4568
                //row 3 right
4569
294k
                signdwn1_16x8b = _mm_srli_si128(src_top_16x8b, 1);
4570
                //loading row 3 right into left
4571
294k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_bottom_16x8b, 15);
4572
                //adding bottom and top values of row 2 and row 3
4573
294k
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, signup0_16x8b); //(3,2)
4574
                //separating +ve and -ve values.(bottom,3)
4575
294k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
4576
294k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
4577
                //to store right of row 2
4578
294k
                signdwn1_16x8b = _mm_slli_si128(src_bottom_16x8b, 8);
4579
                //creating mask 00 for +ve and -ve values and FF for zero.
4580
294k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4581
294k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4582
294k
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(bottom -3) for next iteration
4583
4584
                //storing right of row 2into left
4585
294k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
4586
                //to store right of row 0
4587
294k
                signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
4588
                //storing right of row 1 into left
4589
294k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
4590
4591
                //adding constant 2
4592
294k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
4593
294k
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
4594
                //shuffle to get sao index
4595
294k
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
4596
294k
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
4597
                //using availability mask
4598
294k
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
4599
294k
                edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
4600
                //shuffle to get sao offset
4601
294k
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
4602
294k
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
4603
4604
                //storing right of row 0 into left
4605
294k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
4606
                //convert to 16 bit then add and then saturated pack
4607
294k
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
4608
294k
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
4609
294k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
4610
294k
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
4611
294k
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
4612
294k
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
4613
294k
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
4614
294k
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
4615
4616
294k
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
4617
294k
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_bottom_16x8b, const0_16x8b);
4618
294k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
4619
294k
                src_bottom_16x8b = _mm_unpackhi_epi8(src_bottom_16x8b, const0_16x8b);
4620
294k
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
4621
294k
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
4622
294k
                src_bottom_16x8b = _mm_add_epi16(src_bottom_16x8b, cmp_lt0_16x8b);
4623
294k
                src_bottom_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_bottom_16x8b);
4624
4625
294k
                cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
4626
294k
                cmp_lt0_16x8b = _mm_srli_si128(src_bottom_16x8b, 8);
4627
4628
294k
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
4629
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
4630
294k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
4631
                // row = 1
4632
294k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
4633
                //row = 2
4634
294k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_bottom_16x8b);
4635
                // row = 3
4636
294k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b);
4637
4638
294k
                src_temp0_16x8b = src_temp1_16x8b;
4639
294k
                signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
4640
294k
                pu1_src_cpy += (src_strd << 2);
4641
294k
                pu1_src_left_cpy += 4;
4642
294k
                pu1_src_left_str += 4;
4643
294k
            }
4644
46.4k
            ht_rem = ht & 0x2;
4645
46.4k
            if(ht_rem)
4646
13.8k
            {
4647
13.8k
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
4648
                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
4649
13.8k
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
4650
                // row = 2
4651
13.8k
                src_bottom_16x8b =  _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
4652
4653
                //manipulation for row 0 -row 1
4654
13.8k
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 14);
4655
                //bottom left
4656
13.8k
                signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 15);
4657
                //separating +ve and and -ve values.
4658
13.8k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
4659
13.8k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
4660
4661
                //creating mask 00 for +ve and -ve values and FF for zero.
4662
13.8k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4663
13.8k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4664
                //manipulation for row 1 - row 0
4665
13.8k
                signdwn1_16x8b = _mm_srli_si128(src_temp0_16x8b, 1);
4666
                //combining the appropriate sign change
4667
13.8k
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
4668
4669
                //row1-row0
4670
                //separating +ve and and -ve values.
4671
13.8k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
4672
13.8k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
4673
4674
                //creating mask 00 for +ve and -ve values and FF for zero.
4675
13.8k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4676
13.8k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4677
                //combining the appropriate sign chang
4678
13.8k
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
4679
4680
                //manipulation for row 1 -bottom
4681
13.8k
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 13);
4682
                //bottom left
4683
13.8k
                signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 15);
4684
4685
13.8k
                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
4686
13.8k
                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
4687
                //row1 -bottom
4688
13.8k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
4689
13.8k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
4690
4691
                //creating mask 00 for +ve and -ve values and FF for zero.
4692
13.8k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4693
13.8k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4694
                //combining the appropriate sign change
4695
13.8k
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
4696
13.8k
                edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
4697
                //manipulation for bottom- row 1 (row 1 right)
4698
13.8k
                signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 1);
4699
                //adding top and down substraction
4700
13.8k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
4701
                //bottom - row 1
4702
13.8k
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
4703
13.8k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
4704
4705
                //eliminating old left for row 0,1
4706
13.8k
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
4707
13.8k
                signdwn1_16x8b = _mm_slli_si128(src_temp1_16x8b, 8);
4708
                //creating mask 00 for +ve and -ve values and FF for zero.
4709
13.8k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4710
13.8k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4711
                //for the next iteration signup0_16x8b
4712
13.8k
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1) for next
4713
4714
                //storing right of row 1 into left
4715
13.8k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
4716
                //for storing right of row 1
4717
13.8k
                signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
4718
4719
13.8k
                src_top_16x8b = src_temp1_16x8b;
4720
                //storing right of row 0 into left
4721
13.8k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
4722
4723
                //adding constant 2
4724
13.8k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
4725
4726
                //shuffle to get sao index
4727
13.8k
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
4728
                //using availability mask
4729
13.8k
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
4730
                //shuffle to get sao offset
4731
13.8k
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
4732
4733
                //the next top already in  src_top_16x8b
4734
                //cnvert to 16 bit then add and then saturated pack
4735
13.8k
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
4736
13.8k
                src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
4737
13.8k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
4738
13.8k
                src_temp1_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
4739
13.8k
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
4740
13.8k
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
4741
13.8k
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
4742
13.8k
                src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, src_temp1_16x8b);
4743
4744
13.8k
                cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
4745
4746
13.8k
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
4747
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
4748
13.8k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
4749
                // row = 1
4750
13.8k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
4751
13.8k
                src_temp0_16x8b = src_bottom_16x8b;
4752
13.8k
                pu1_src_cpy += (src_strd << 1);
4753
13.8k
                pu1_src_left_cpy += 2;
4754
13.8k
                pu1_src_left_str += 2;
4755
13.8k
            }
4756
46.4k
            ht_rem = ht & 0x1;
4757
46.4k
            if(ht_rem)
4758
13.2k
            {
4759
13.2k
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
4760
                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
4761
13.2k
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
4762
4763
4764
                //manipulation for row 0 -bottom
4765
13.2k
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 14);
4766
                //bottom left
4767
13.2k
                signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 15);
4768
                //separating +ve and and -ve values.
4769
13.2k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
4770
13.2k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
4771
                //creating mask 00 for +ve and -ve values and FF for zero.
4772
13.2k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4773
13.2k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4774
                //combining the appropriate sign change
4775
13.2k
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
4776
                //adding top and down substraction
4777
13.2k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
4778
                //for row 0 right to put into left store
4779
13.2k
                signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
4780
                //adding constant 2
4781
13.2k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
4782
13.2k
                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8);
4783
13.2k
                edge0_16x8b = _mm_srli_si128(edge0_16x8b, 8);
4784
                //left store manipulation 1
4785
13.2k
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 1);
4786
                //filling the left boundary value
4787
13.2k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
4788
4789
                //shuffle to get sao index
4790
13.2k
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
4791
                //using availability mask
4792
13.2k
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
4793
                //shuffle to get sao offset
4794
13.2k
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
4795
13.2k
                src_top_16x8b = src_temp0_16x8b;
4796
                //cnvert to 16 bit then add and then saturated pack
4797
13.2k
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
4798
13.2k
                src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
4799
13.2k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
4800
13.2k
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
4801
13.2k
                src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, const0_16x8b);
4802
4803
13.2k
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
4804
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
4805
13.2k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
4806
13.2k
                pu1_src_cpy += (src_strd);
4807
13.2k
                src_temp0_16x8b = src_bottom_16x8b;
4808
13.2k
                pu1_src_left_cpy++;
4809
13.2k
                pu1_src_left_str++;
4810
13.2k
            }
4811
46.4k
            {   //for bottom right
4812
46.4k
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
4813
46.4k
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 1);
4814
46.4k
                src_temp0_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
4815
46.4k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
4816
46.4k
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
4817
46.4k
            }
4818
46.4k
            if(0 == pu1_avail[3])
4819
6.07k
            {
4820
6.07k
                src_top_16x8b = src_bottom_16x8b;
4821
6.07k
            }
4822
46.4k
            _mm_storel_epi64((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
4823
46.4k
            pu1_src += 8;
4824
4825
46.4k
            pu1_left_tmp = pu1_src_left_cpy2;
4826
46.4k
            pu1_src_left_cpy2 = pu1_src_left_str2;
4827
46.4k
            pu1_src_left_str2 = pu1_left_tmp;
4828
4829
46.4k
            pu1_src_left_cpy = pu1_src_left_cpy2;
4830
46.4k
            pu1_src_left_str = pu1_src_left_str2;
4831
4832
46.4k
        }
4833
48.4k
        pu1_src_org[wd - 1] = u1_pos_wd_0_tmp;
4834
48.4k
        pu1_src_org[(ht_tmp - 1) * src_strd] = u1_pos_0_ht_tmp;
4835
48.4k
        pu1_src_left_cpy = (0 == pu1_avail[2]) ? (pu1_src_left_cpy - 1) : pu1_src_left_cpy;
4836
48.4k
        pu1_src_left[0] = au1_src_left_tmp[0];
4837
1.29M
        for(row = 1; row < ht_tmp; row++)
4838
1.24M
        {
4839
1.24M
            pu1_src_left[row] = pu1_src_left_cpy[row];
4840
1.24M
        }
4841
48.4k
    }
4842
4843
48.4k
}
4844
4845
void ihevc_sao_edge_offset_class3_chroma_ssse3(UWORD8 *pu1_src,
4846
                                               WORD32 src_strd,
4847
                                               UWORD8 *pu1_src_left,
4848
                                               UWORD8 *pu1_src_top,
4849
                                               UWORD8 *pu1_src_top_left,
4850
                                               UWORD8 *pu1_src_top_right,
4851
                                               UWORD8 *pu1_src_bot_left,
4852
                                               UWORD8 *pu1_avail,
4853
                                               WORD8 *pi1_sao_offset_u,
4854
                                               WORD8 *pi1_sao_offset_v,
4855
                                               WORD32 wd,
4856
                                               WORD32 ht)
4857
50.3k
{
4858
50.3k
    WORD32 row, col;
4859
50.3k
    UWORD8 *pu1_src_top_cpy, *pu1_src_left_cpy, *pu1_src_left_cpy2;
4860
50.3k
    UWORD8 *pu1_src_cpy, *pu1_src_org;
4861
50.3k
    UWORD8 au1_src_left_tmp[2 * (MAX_CTB_SIZE + 8)];
4862
50.3k
    UWORD8 au1_mask[MAX_CTB_SIZE], *au1_mask_cpy;
4863
50.3k
    WORD32 wd_rem;
4864
50.3k
    UWORD8 u1_pos_wd_0_tmp_u, u1_pos_wd_0_tmp_v, u1_pos_0_ht_tmp_u, u1_pos_0_ht_tmp_v;
4865
50.3k
    WORD32 ht_tmp;
4866
50.3k
    WORD32 bit_depth;
4867
50.3k
    UWORD8 u1_avail0, u1_avail1;
4868
4869
50.3k
    __m128i src_top_16x8b, src_bottom_16x8b;
4870
50.3k
    __m128i src_temp0_16x8b, src_temp1_16x8b;
4871
50.3k
    __m128i signup0_16x8b, signdwn1_16x8b;
4872
50.3k
    __m128i cmp_gt0_16x8b, cmp_lt0_16x8b;
4873
50.3k
    __m128i edge0_16x8b, edge1_16x8b;
4874
50.3k
    __m128i au1_mask8x16b;
4875
50.3k
    __m128i edge_idx_8x16b, sao_offset_8x16b;
4876
50.3k
    __m128i left_store_16x8b;
4877
50.3k
    __m128i const0_16x8b, const2_16x8b;
4878
50.3k
    __m128i chroma_offset_8x16b;
4879
4880
50.3k
    ht_tmp = ht;
4881
50.3k
    au1_mask8x16b = _mm_set1_epi8(0xff);
4882
4883
4884
50.3k
    au1_src_left_tmp[0] = pu1_src[(wd - 2)];
4885
50.3k
    au1_src_left_tmp[1] = pu1_src[(wd - 1)];
4886
    //manipulation for bottom left
4887
1.23M
    for(row = 2; row < 2 * ht; row++)
4888
1.18M
    {
4889
1.18M
        au1_src_left_tmp[row] = pu1_src_left[row];
4890
1.18M
    }
4891
50.3k
    au1_src_left_tmp[2 * ht] = pu1_src_bot_left[0];
4892
50.3k
    au1_src_left_tmp[2 * ht + 1] = pu1_src_bot_left[1];
4893
4894
50.3k
    pu1_src_top_left[0] = pu1_src_top[wd - 2];
4895
50.3k
    pu1_src_top_left[1] = pu1_src_top[wd - 1];
4896
    //setting availability mask to ff size MAX_CTB_SIZE
4897
251k
    for(col = 0; col < MAX_CTB_SIZE; col += 16)
4898
201k
        _mm_storeu_si128((__m128i *)(au1_mask + col), au1_mask8x16b);
4899
50.3k
    bit_depth = BIT_DEPTH_LUMA;
4900
50.3k
    pu1_src_org = pu1_src;
4901
50.3k
    pu1_src_top_cpy = pu1_src_top;
4902
50.3k
    pu1_src_left_cpy2 = au1_src_left_tmp;
4903
50.3k
    pu1_src_left_cpy = au1_src_left_tmp;
4904
50.3k
    edge_idx_8x16b   = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
4905
50.3k
    sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_u);
4906
50.3k
    const0_16x8b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_v);
4907
50.3k
    chroma_offset_8x16b = _mm_set1_epi16(0x0800);
4908
    /* If top-right is available, process separately */
4909
50.3k
    if(0 != pu1_avail[5])
4910
35.3k
    {
4911
35.3k
        WORD32 edge_idx;
4912
4913
        /* U */
4914
35.3k
        edge_idx = 2 + SIGN(pu1_src[wd - 2] - pu1_src_top_right[0]) +
4915
35.3k
                        SIGN(pu1_src[wd - 2] - pu1_src[wd - 2 - 2 + src_strd]);
4916
4917
35.3k
        edge_idx = gi1_table_edge_idx[edge_idx];
4918
4919
35.3k
        if(0 != edge_idx)
4920
8.14k
        {
4921
8.14k
            u1_pos_wd_0_tmp_u = CLIP3(pu1_src[wd - 2] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1);
4922
8.14k
        }
4923
27.2k
        else
4924
27.2k
        {
4925
27.2k
            u1_pos_wd_0_tmp_u = pu1_src[wd - 2];
4926
27.2k
        }
4927
4928
        /* V */
4929
35.3k
        edge_idx = 2 + SIGN(pu1_src[wd - 1] - pu1_src_top_right[1]) +
4930
35.3k
                        SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 2 + src_strd]);
4931
4932
35.3k
        edge_idx = gi1_table_edge_idx[edge_idx];
4933
4934
35.3k
        if(0 != edge_idx)
4935
7.92k
        {
4936
7.92k
            u1_pos_wd_0_tmp_v = CLIP3(pu1_src[wd - 1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1);
4937
7.92k
        }
4938
27.4k
        else
4939
27.4k
        {
4940
27.4k
            u1_pos_wd_0_tmp_v = pu1_src[wd - 1];
4941
27.4k
        }
4942
35.3k
    }
4943
14.9k
    else
4944
14.9k
    {
4945
14.9k
        u1_pos_wd_0_tmp_u = pu1_src[wd - 2];
4946
14.9k
        u1_pos_wd_0_tmp_v = pu1_src[wd - 1];
4947
14.9k
    }
4948
4949
    /* If bottom-left is available, process separately */
4950
50.3k
    if(0 != pu1_avail[6])
4951
36.4k
    {
4952
36.4k
        WORD32 edge_idx;
4953
4954
        /* U */
4955
36.4k
        edge_idx = 2 + SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd + 2 - src_strd]) +
4956
36.4k
                        SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0]);
4957
4958
36.4k
        edge_idx = gi1_table_edge_idx[edge_idx];
4959
4960
36.4k
        if(0 != edge_idx)
4961
8.50k
        {
4962
8.50k
            u1_pos_0_ht_tmp_u = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1);
4963
8.50k
        }
4964
27.9k
        else
4965
27.9k
        {
4966
27.9k
            u1_pos_0_ht_tmp_u = pu1_src[(ht - 1) * src_strd];
4967
27.9k
        }
4968
4969
        /* V */
4970
36.4k
        edge_idx = 2 + SIGN(pu1_src[(ht - 1) * src_strd + 1] - pu1_src[(ht - 1) * src_strd + 1 + 2 - src_strd]) +
4971
36.4k
                        SIGN(pu1_src[(ht - 1) * src_strd + 1] - pu1_src_bot_left[1]);
4972
4973
36.4k
        edge_idx = gi1_table_edge_idx[edge_idx];
4974
4975
36.4k
        if(0 != edge_idx)
4976
8.31k
        {
4977
8.31k
            u1_pos_0_ht_tmp_v = CLIP3(pu1_src[(ht - 1) * src_strd + 1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1);
4978
8.31k
        }
4979
28.1k
        else
4980
28.1k
        {
4981
28.1k
            u1_pos_0_ht_tmp_v = pu1_src[(ht - 1) * src_strd + 1];
4982
28.1k
        }
4983
36.4k
    }
4984
13.8k
    else
4985
13.8k
    {
4986
13.8k
        u1_pos_0_ht_tmp_u = pu1_src[(ht - 1) * src_strd];
4987
13.8k
        u1_pos_0_ht_tmp_v = pu1_src[(ht - 1) * src_strd + 1];
4988
13.8k
    }
4989
4990
4991
4992
    /* Update height and source pointers based on the availability flags */
4993
50.3k
    if(0 == pu1_avail[2])
4994
11.7k
    {
4995
11.7k
        pu1_src_left_cpy2 += 2;
4996
11.7k
        pu1_src_top_cpy = pu1_src;
4997
11.7k
        pu1_src += src_strd;
4998
11.7k
        ht--;
4999
11.7k
    }
5000
50.3k
    if(0 == pu1_avail[3])
5001
8.60k
    {
5002
8.60k
        ht--;
5003
8.60k
    }
5004
5005
50.3k
    sao_offset_8x16b = _mm_unpacklo_epi64(sao_offset_8x16b, const0_16x8b);
5006
50.3k
    const2_16x8b = _mm_set1_epi8(2);
5007
50.3k
    const0_16x8b = _mm_setzero_si128();
5008
5009
5010
    //availability mask creation
5011
50.3k
    u1_avail0 = pu1_avail[0];
5012
50.3k
    u1_avail1 = pu1_avail[1];
5013
50.3k
    au1_mask[0] = u1_avail0;
5014
50.3k
    au1_mask[1] = u1_avail0;
5015
50.3k
    au1_mask[wd - 1] = u1_avail1;
5016
50.3k
    au1_mask[wd - 2] = u1_avail1;
5017
50.3k
    {
5018
50.3k
        WORD32 ht_rem;
5019
50.3k
        au1_mask_cpy = au1_mask;
5020
129k
        for(col = wd; col >= 16; col -= 16)
5021
78.7k
        {
5022
78.7k
            pu1_src_cpy = pu1_src;
5023
78.7k
            src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col + 2));
5024
            //row = 0
5025
78.7k
            src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
5026
5027
            //loading the mask
5028
78.7k
            au1_mask8x16b = _mm_loadu_si128((__m128i *)au1_mask_cpy);
5029
            //separating +ve and and -ve values.
5030
78.7k
            cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
5031
78.7k
            cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
5032
            //creating mask 00 for +ve and -ve values and FF for zero.
5033
78.7k
            cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5034
78.7k
            cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5035
            //combining the appropriate sign change
5036
78.7k
            signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
5037
78.7k
            pu1_src_left_cpy = pu1_src_left_cpy2;
5038
5039
610k
            for(row = ht; row >= 2; row -= 2)
5040
531k
            {
5041
531k
                left_store_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_left_cpy));
5042
                //row = 1
5043
531k
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
5044
                //to insert left in row 1
5045
531k
                signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 12);
5046
                // row = 0 right
5047
531k
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2));
5048
5049
                //manipulation for row 1 - row 0
5050
531k
                signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 14);
5051
                //row 0 -row1
5052
                //separating +ve and and -ve values.
5053
531k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
5054
531k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
5055
5056
                //creating mask 00 for +ve and -ve values and FF for zero.
5057
531k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5058
531k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5059
5060
                //combining the appropriate sign change
5061
531k
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(0-1)
5062
                //combining sign-left and sign_right
5063
531k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
5064
5065
                //row1-row0
5066
                //separating +ve and and -ve values.
5067
531k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
5068
531k
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
5069
                //creating mask 00 for +ve and -ve values and FF for zero.
5070
531k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5071
531k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5072
5073
                // row = 2
5074
531k
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
5075
                // row = 1 right
5076
531k
                signdwn1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 2));
5077
531k
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-0)
5078
5079
                //bottom - row1
5080
531k
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
5081
531k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
5082
                //creating mask 00 for +ve and -ve values and FF for zero.
5083
531k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5084
531k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5085
                //for the next iteration bottom -row1
5086
531k
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
5087
5088
                //to insert left in row 1
5089
531k
                signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 10);
5090
                //manipulation for row 1 - bottom
5091
531k
                signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 14);
5092
5093
                //row1 -bottom
5094
531k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
5095
531k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
5096
                //creating mask 00 for +ve and -ve values and FF for zero.
5097
531k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5098
531k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5099
                //combining the appropriate sign change
5100
531k
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
5101
5102
                //combining sign-left and sign_right
5103
531k
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, signdwn1_16x8b);
5104
5105
                //eliminating old left for row 0 and row 1
5106
531k
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 4);
5107
                //row1  getting it right for left of next block
5108
531k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp1_16x8b, 14);
5109
                //row0  getting it right for left of next block
5110
531k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
5111
                //copying the next top
5112
531k
                src_top_16x8b = src_temp1_16x8b;
5113
5114
5115
                //adding constant 2
5116
531k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
5117
531k
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
5118
                //shuffle to get sao index
5119
531k
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
5120
531k
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
5121
                //using availability mask
5122
531k
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
5123
531k
                edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
5124
5125
                //adding chroma offset to access U and V
5126
531k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
5127
531k
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);
5128
5129
                //shuffle to get sao offset
5130
531k
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
5131
531k
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
5132
                //cnvert to 16 bit then add and then saturated pack
5133
531k
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
5134
531k
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
5135
531k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
5136
531k
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
5137
531k
                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
5138
531k
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
5139
531k
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
5140
531k
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
5141
5142
531k
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
5143
531k
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
5144
531k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
5145
531k
                src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
5146
531k
                edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
5147
531k
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
5148
531k
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge1_16x8b);
5149
531k
                src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
5150
                //store left boundary
5151
531k
                _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);
5152
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
5153
531k
                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
5154
                // row = 1
5155
531k
                _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);
5156
5157
531k
                src_temp0_16x8b = src_bottom_16x8b;
5158
531k
                pu1_src_cpy += (src_strd << 1);
5159
531k
                pu1_src_left_cpy += 4;
5160
531k
            }
5161
78.7k
            ht_rem = ht & 0x1;
5162
5163
78.7k
            if(ht_rem)
5164
27.0k
            {
5165
27.0k
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
5166
27.0k
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
5167
                //to insert left in row 1
5168
27.0k
                signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 12);
5169
                //manipulation for row 1 - row 0
5170
27.0k
                signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 14);
5171
5172
                //current row -next row
5173
                //separating +ve and and -ve values.
5174
27.0k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
5175
27.0k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
5176
                //creating mask 00 for +ve and -ve values and FF for zero.
5177
27.0k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5178
27.0k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5179
                //combining the appropriate sign change
5180
27.0k
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
5181
                //adding top and bottom and constant 2
5182
27.0k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
5183
27.0k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
5184
                //eliminating old left for row 0 and row 1
5185
27.0k
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
5186
                //row0  getting it right for left of next block
5187
27.0k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
5188
                //copying the next top
5189
27.0k
                src_top_16x8b = src_temp0_16x8b;
5190
5191
27.0k
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
5192
                //using availability mask
5193
27.0k
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
5194
5195
                //adding chroma offset to access U and V
5196
27.0k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
5197
5198
5199
27.0k
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
5200
5201
                //convert to 16 bit, then add, then saturated pack
5202
27.0k
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
5203
27.0k
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
5204
27.0k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
5205
27.0k
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
5206
27.0k
                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
5207
27.0k
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
5208
27.0k
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
5209
27.0k
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
5210
5211
                //store left boundary
5212
27.0k
                _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);
5213
5214
27.0k
                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
5215
27.0k
                pu1_src_cpy += (src_strd);
5216
27.0k
                src_temp0_16x8b = src_bottom_16x8b;
5217
27.0k
                pu1_src_left_cpy += 2;
5218
27.0k
            }
5219
78.7k
            {   //for bottom right
5220
78.7k
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
5221
78.7k
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
5222
78.7k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
5223
78.7k
                _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);
5224
78.7k
            }
5225
78.7k
            if(0 == pu1_avail[3])
5226
12.8k
            {
5227
12.8k
                src_top_16x8b = src_bottom_16x8b;
5228
12.8k
            }
5229
            //for the top left of next part of the block
5230
78.7k
            left_store_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
5231
            //updating top flag
5232
78.7k
            _mm_storeu_si128((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
5233
78.7k
            pu1_src += 16;
5234
78.7k
            au1_mask_cpy += 16;
5235
78.7k
        }
5236
50.3k
        pu1_src_left_cpy = pu1_src_left_cpy2;
5237
50.3k
        wd_rem = wd & 0xF;
5238
50.3k
        if(wd_rem)
5239
2.59k
        {
5240
2.59k
            pu1_src_cpy = pu1_src;
5241
2.59k
            src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_top_cpy + wd - col + 2));
5242
            //row = 0
5243
2.59k
            src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
5244
2.59k
            au1_mask8x16b = _mm_loadl_epi64((__m128i *)au1_mask_cpy); //????
5245
            //separating +ve and and -ve values.
5246
2.59k
            cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
5247
2.59k
            cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
5248
            //creating mask 00 for +ve and -ve values and FF for zero.
5249
2.59k
            cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5250
2.59k
            cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5251
            //preparing au1_mask
5252
2.59k
            au1_mask8x16b = _mm_unpacklo_epi64(au1_mask8x16b, au1_mask8x16b);
5253
            //combining the appropriate sign change
5254
2.59k
            signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
5255
2.59k
            signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
5256
2.59k
            pu1_src_left_cpy = pu1_src_left_cpy2;
5257
8.04k
            for(row = ht; row >= 4; row -= 4)
5258
5.44k
            {
5259
5.44k
                left_store_16x8b = _mm_loadu_si128((__m128i *)pu1_src_left_cpy);
5260
                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
5261
5.44k
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
5262
                // row = 2
5263
5.44k
                src_bottom_16x8b =  _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
5264
                //manipulation for row 0 -row 1
5265
5.44k
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 12);
5266
                //row 1 left
5267
5.44k
                signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 14);
5268
                //row 0 -row1
5269
                //separating +ve and and -ve values.
5270
5.44k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
5271
5.44k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
5272
5273
                //creating mask 00 for +ve and -ve values and FF for zero.
5274
5.44k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5275
5.44k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5276
                //manipulating for row 1 - row 0
5277
5.44k
                signdwn1_16x8b = _mm_srli_si128(src_temp0_16x8b, 2);
5278
                //combining the appropriate sign change
5279
5.44k
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
5280
                //row 1 -row0
5281
                //separating +ve and and -ve values.
5282
5.44k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
5283
5.44k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
5284
5285
                //creating mask 00 for +ve and -ve values and FF for zero.
5286
5.44k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5287
5.44k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5288
                //row1-row0
5289
5.44k
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
5290
5291
5.44k
                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
5292
5293
5.44k
                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
5294
                //manipulation for row 1 -row 2
5295
5.44k
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 10);
5296
                //row 2 left
5297
5.44k
                signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 14);
5298
                //packing row 0 n row 1
5299
5.44k
                src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, src_temp1_16x8b);
5300
                //row1 -row2
5301
5.44k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
5302
5.44k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
5303
                //creating mask 00 for +ve and -ve values and FF for zero.
5304
5.44k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5305
5.44k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5306
                //combining the appropriate sign change
5307
5.44k
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
5308
5.44k
                edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
5309
5310
                //row 1 right
5311
5.44k
                signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);
5312
                //row = 3
5313
5.44k
                src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 3 * src_strd));
5314
5315
                // row = 4
5316
5.44k
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 4 * src_strd));
5317
5318
5.44k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
5319
5320
                //separating +ve and and -ve values.(2,1)
5321
5.44k
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
5322
5.44k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
5323
5324
                //creating mask 00 for +ve and -ve values and FF for zero.
5325
5.44k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5326
5.44k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5327
                //row 2 right
5328
5.44k
                signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 2);
5329
                //combining the appropriate sign change
5330
5.44k
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1)
5331
5332
                //separating +ve and and -ve values.(3,2)
5333
5.44k
                cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
5334
5.44k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
5335
5.44k
                signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8); //aligned left (2-1)
5336
                //creating mask 00 for +ve and -ve values and FF for zero.
5337
5.44k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5338
5.44k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5339
                //manipulation for row 2 -row 3
5340
5.44k
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 8);
5341
                //row 3 left
5342
5.44k
                signdwn1_16x8b = _mm_alignr_epi8(src_top_16x8b, signdwn1_16x8b, 14);
5343
                //combining the appropriate sign change
5344
5.44k
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-2)
5345
5346
5.44k
                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(3-2) ,(2-1)
5347
5348
                //separating +ve and and -ve values.(2,3)
5349
5.44k
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
5350
5.44k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
5351
5352
                //manipulation for row 3 -bottom
5353
5.44k
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 6);
5354
                //bottom left
5355
5.44k
                signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 14);
5356
5357
                //creating mask 00 for +ve and -ve values and FF for zero.
5358
5.44k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5359
5.44k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5360
                //combining the appropriate sign change
5361
5.44k
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-3)
5362
5363
                //separating +ve and and -ve values.(3,bottom)
5364
5.44k
                cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
5365
5.44k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
5366
5367
                //creating mask 00 for +ve and -ve values and FF for zero.
5368
5.44k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5369
5.44k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5370
5.44k
                edge1_16x8b = _mm_slli_si128(edge1_16x8b, 8); //aligned left (2-3)
5371
                //combining the appropriate sign change
5372
5.44k
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-bottom)
5373
5.44k
                edge1_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge1_16x8b, 8); //(3-bottom),(2-3)
5374
5375
5376
                //eliminating old left for row 0,1,2,3
5377
5.44k
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 8);
5378
                //packing row 2 n row 3
5379
5.44k
                src_bottom_16x8b = _mm_unpacklo_epi64(src_bottom_16x8b, src_top_16x8b);
5380
                //row 3 right
5381
5.44k
                signdwn1_16x8b = _mm_srli_si128(src_top_16x8b, 2);
5382
                //loading row 3 right into left
5383
5.44k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_bottom_16x8b, 14);
5384
                //adding bottom and top values of row 2 and row 3
5385
5.44k
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, signup0_16x8b); //(3,2)
5386
                //separating +ve and -ve values. (bottom,3)
5387
5.44k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
5388
5.44k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
5389
                //to store right of row 2
5390
5.44k
                signdwn1_16x8b = _mm_slli_si128(src_bottom_16x8b, 8);
5391
                //creating mask 00 for +ve and -ve values and FF for zero.
5392
5.44k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5393
5.44k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5394
5.44k
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(bottom -3) for next iteration
5395
5396
                //storing right of row 2into left
5397
5.44k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
5398
                //to store right of row 0
5399
5.44k
                signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
5400
                //storing right of row 1 into left
5401
5.44k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
5402
                //storing right of row 0 into left
5403
5.44k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
5404
5405
5406
                //adding constant 2
5407
5.44k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
5408
5.44k
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
5409
                //shuffle to get sao index
5410
5.44k
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
5411
5.44k
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
5412
                //using availability mask
5413
5.44k
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
5414
5.44k
                edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
5415
                //adding chroma offset to access U and V
5416
5.44k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
5417
5.44k
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);
5418
                //shuffle to get sao offset
5419
5.44k
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
5420
5.44k
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
5421
5422
                //convert to 16 bit, then add, then saturated pack
5423
5.44k
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
5424
5.44k
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
5425
5.44k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
5426
5.44k
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
5427
5.44k
                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
5428
5.44k
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
5429
5.44k
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
5430
5.44k
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
5431
5432
5.44k
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
5433
5.44k
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_bottom_16x8b, const0_16x8b);
5434
5.44k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
5435
5.44k
                src_bottom_16x8b = _mm_unpackhi_epi8(src_bottom_16x8b, const0_16x8b);
5436
5.44k
                edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
5437
5.44k
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
5438
5.44k
                src_bottom_16x8b = _mm_add_epi16(src_bottom_16x8b, edge1_16x8b);
5439
5.44k
                src_bottom_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_bottom_16x8b);
5440
5441
5.44k
                cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
5442
5.44k
                cmp_lt0_16x8b = _mm_srli_si128(src_bottom_16x8b, 8);
5443
5.44k
                _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);
5444
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
5445
5.44k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
5446
                // row = 1
5447
5.44k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
5448
                //row = 2
5449
5.44k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_bottom_16x8b);
5450
                // row = 3
5451
5.44k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b);
5452
5453
5.44k
                src_temp0_16x8b = src_temp1_16x8b;
5454
5.44k
                signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
5455
5.44k
                pu1_src_cpy += (src_strd << 2);
5456
5.44k
                pu1_src_left_cpy += 8;
5457
5.44k
            }
5458
2.59k
            ht_rem = ht & 0x2;
5459
2.59k
            if(ht_rem)
5460
1.07k
            {
5461
1.07k
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
5462
                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
5463
1.07k
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
5464
                // row = 2
5465
1.07k
                src_bottom_16x8b =  _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
5466
5467
                //manipulation for row 0 -row 1
5468
1.07k
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 12);
5469
                //bottom left
5470
1.07k
                signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 14);
5471
                //separating +ve and and -ve values.
5472
1.07k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
5473
1.07k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
5474
5475
                //creating mask 00 for +ve and -ve values and FF for zero.
5476
1.07k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5477
1.07k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5478
                //manipulation for row 1 - row 0
5479
1.07k
                signdwn1_16x8b = _mm_srli_si128(src_temp0_16x8b, 2);
5480
                //combining the appropriate sign change
5481
1.07k
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
5482
5483
                //row1-row0
5484
                //separating +ve and and -ve values.
5485
1.07k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
5486
1.07k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
5487
5488
                //creating mask 00 for +ve and -ve values and FF for zero.
5489
1.07k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5490
1.07k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5491
                //combining the appropriate sign change
5492
1.07k
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
5493
5494
                //manipulation for row 1 -bottom
5495
1.07k
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 10);
5496
                //bottom left
5497
1.07k
                signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 14);
5498
5499
1.07k
                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
5500
1.07k
                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
5501
                //row1 -bottom
5502
1.07k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
5503
1.07k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
5504
5505
                //creating mask 00 for +ve and -ve values and FF for zero.
5506
1.07k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5507
1.07k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5508
                //combining the appropriate sign change
5509
1.07k
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
5510
1.07k
                edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
5511
5512
                //manipulation for bottom- row 1 (row 1 right)
5513
1.07k
                signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);
5514
                //adding top and down subtraction
5515
1.07k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
5516
                //bottom - row 1
5517
1.07k
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
5518
1.07k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
5519
5520
                //eliminating old left for row 0,1
5521
1.07k
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 4);
5522
1.07k
                signdwn1_16x8b = _mm_slli_si128(src_temp1_16x8b, 8);
5523
                //creating mask 00 for +ve and -ve values and FF for zero.
5524
1.07k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5525
1.07k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5526
                //for the next iteration signup0_16x8b
5527
1.07k
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1) for next
5528
5529
                //storing right of row 1 into left
5530
1.07k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
5531
                //for storing right of row 1
5532
1.07k
                signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
5533
5534
1.07k
                src_top_16x8b = src_temp1_16x8b;
5535
                //storing right of row 0 into left
5536
1.07k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
5537
5538
                //adding constant 2
5539
1.07k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
5540
5541
                //shuffle to get sao index
5542
1.07k
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
5543
                //using availability mask
5544
1.07k
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
5545
                //adding chroma offset to access U and V
5546
1.07k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
5547
                //shuffle to get sao offset
5548
1.07k
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
5549
                //the next top already in  src_top_16x8b
5550
                //convert to 16 bit, then add, then saturated pack
5551
1.07k
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
5552
1.07k
                src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
5553
1.07k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
5554
1.07k
                src_temp1_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
5555
1.07k
                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
5556
1.07k
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
5557
1.07k
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge0_16x8b);
5558
1.07k
                src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, src_temp1_16x8b);
5559
5560
1.07k
                cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
5561
5562
1.07k
                _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);
5563
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
5564
1.07k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
5565
                // row = 1
5566
1.07k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
5567
1.07k
                src_temp0_16x8b = src_bottom_16x8b;
5568
1.07k
                pu1_src_cpy += (src_strd << 1);
5569
1.07k
                pu1_src_left_cpy += 4;
5570
1.07k
            }
5571
2.59k
            ht_rem = ht & 0x1;
5572
2.59k
            if(ht_rem)
5573
956
            {
5574
956
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
5575
                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
5576
956
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
5577
5578
5579
                //manipulation for row 0 -bottom
5580
956
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 12);
5581
                //bottom left
5582
956
                signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 14);
5583
                //separating +ve and and -ve values.
5584
956
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
5585
956
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
5586
                //creating mask 00 for +ve and -ve values and FF for zero.
5587
956
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5588
956
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5589
                //combining the appropriate sign change
5590
956
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
5591
                //adding top and down subtraction
5592
956
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
5593
                //for row 0 right to put into left store
5594
956
                signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
5595
                //adding constant 2
5596
956
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
5597
956
                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8);
5598
956
                edge0_16x8b = _mm_srli_si128(edge0_16x8b, 8);
5599
                //left store manipulation 1
5600
956
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
5601
                //filling the left boundary value
5602
956
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
5603
956
                src_top_16x8b = src_temp0_16x8b;
5604
5605
                //shuffle to get sao index
5606
956
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
5607
                //using availability mask
5608
956
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
5609
                //adding chroma offset to access U and V
5610
956
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
5611
                //shuffle to get sao offset
5612
956
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
5613
5614
                //convert to 16 bit, then add, then saturated pack
5615
956
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
5616
956
                src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
5617
956
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
5618
956
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
5619
956
                src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, const0_16x8b);
5620
5621
956
                _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);
5622
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
5623
956
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
5624
956
                pu1_src_cpy += (src_strd);
5625
956
                src_temp0_16x8b = src_bottom_16x8b;
5626
956
                pu1_src_left_cpy += 2;
5627
956
            }
5628
2.59k
            {   //for bottom right
5629
2.59k
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
5630
2.59k
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
5631
2.59k
                src_temp0_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
5632
2.59k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
5633
2.59k
                _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);
5634
2.59k
            }
5635
2.59k
            if(0 == pu1_avail[3])
5636
619
            {
5637
619
                src_top_16x8b = src_bottom_16x8b;
5638
619
            }
5639
5640
2.59k
            _mm_storel_epi64((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
5641
2.59k
            pu1_src += 8;
5642
2.59k
        }
5643
50.3k
        pu1_src_org[wd - 2] = u1_pos_wd_0_tmp_u;
5644
50.3k
        pu1_src_org[wd - 1] = u1_pos_wd_0_tmp_v;
5645
50.3k
        pu1_src_org[(ht_tmp - 1) * src_strd] = u1_pos_0_ht_tmp_u;
5646
50.3k
        pu1_src_org[(ht_tmp - 1) * src_strd + 1] = u1_pos_0_ht_tmp_v;
5647
1.33M
        for(row = 0; row < 2 * ht_tmp; row++)
5648
1.28M
        {
5649
1.28M
            pu1_src_left[row] = au1_src_left_tmp[row];
5650
1.28M
        }
5651
50.3k
    }
5652
5653
50.3k
}