Coverage Report

Created: 2025-08-03 06:13

/src/libhevc/common/x86/ihevc_sao_ssse3_intr.c
Line
Count
Source (jump to first uncovered line)
1
/******************************************************************************
2
*
3
* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4
*
5
* Licensed under the Apache License, Version 2.0 (the "License");
6
* you may not use this file except in compliance with the License.
7
* You may obtain a copy of the License at:
8
*
9
* http://www.apache.org/licenses/LICENSE-2.0
10
*
11
* Unless required by applicable law or agreed to in writing, software
12
* distributed under the License is distributed on an "AS IS" BASIS,
13
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
* See the License for the specific language governing permissions and
15
* limitations under the License.
16
*
17
******************************************************************************/
18
/**
19
*******************************************************************************
20
* @file
21
*  ihevc_sao_atom_intr.c
22
*
23
* @brief
24
*  Contains function definitions for Sample adaptive offset(SAO) used in-loop
25
* filtering
26
*
27
* @author
28
* 100592
29
*
30
* @par List of Functions:
31
*   - ihevc_sao_band_offset_luma_ssse3()
32
*   - ihevc_sao_band_offset_chroma_ssse3()
33
*   - ihevc_sao_edge_offset_class0_ssse3()
34
*   - ihevc_sao_edge_offset_class0_chroma_ssse3()
35
*   - ihevc_sao_edge_offset_class1_ssse3()
36
*   - ihevc_sao_edge_offset_class1_chroma_ssse3()
37
*   - ihevc_sao_edge_offset_class2_ssse3()
38
*   - ihevc_sao_edge_offset_class2_chroma_ssse3()
39
*   - ihevc_sao_edge_offset_class3_ssse3()
40
*   - ihevc_sao_edge_offset_class3_chroma_ssse3()
41
*
42
* @remarks
43
*  None
44
*
45
*******************************************************************************
46
*/
47
/*****************************************************************************/
48
/* File Includes                                                             */
49
/*****************************************************************************/
50
#include <stdio.h>
51
52
#include "ihevc_typedefs.h"
53
#include "ihevc_platform_macros.h"
54
#include "ihevc_macros.h"
55
#include "ihevc_func_selector.h"
56
#include "ihevc_defs.h"
57
#include "ihevc_tables_x86_intr.h"
58
#include "ihevc_common_tables.h"
59
#include "ihevc_sao.h"
60
61
#include <immintrin.h>
62
63
#define NUM_BAND_TABLE  32
64
/**
65
*******************************************************************************
66
*
67
* @brief
68
* Has two sets of functions : band offset and edge offset both for luma and chroma
69
* edge offset has horizontal ,vertical, 135 degree and 45 degree
70
*
71
* @par Description:
72
*
73
*
74
* @param[in-out] pu1_src
75
*  Pointer to the source
76
*
77
* @param[in] src_strd
78
*  Source stride
79
*
80
* @param[in-out] pu1_src_left
81
*  source left boundary
82
*
83
* @param[in-out] pu1_src_top
84
* Source top boundary
85
*
86
* @param[in-out] pu1_src_top_left
87
*  Source top left boundary
88
*
89
* @param[in] pu1_src_top_right
90
*  Source top right boundary
91
*
92
* @param[in] pu1_src_bot_left
93
*  Source bottom left boundary
94
*
95
* @param[in] pu1_avail
96
*  boundary availability flags
97
*
98
* @param[in] pi1_sao_offset_u
99
*  Chroma U sao offset values
100
*
101
* @param[in] pi1_sao_offset_v
102
*  Chroma V sao offset values
103
*
104
* @param[in] pi1_sao_offset
105
*  Luma sao offset values
106
*
107
* @param[in] wd
108
*  width of the source
109
110
* @param[in] ht
111
*  height of the source
112
* @returns
113
*
114
* @remarks
115
*  None
116
*
117
*******************************************************************************
118
*/
119
120
121
void ihevc_sao_band_offset_luma_ssse3(UWORD8 *pu1_src,
122
                                      WORD32 src_strd,
123
                                      UWORD8 *pu1_src_left,
124
                                      UWORD8 *pu1_src_top,
125
                                      UWORD8 *pu1_src_top_left,
126
                                      WORD32 sao_band_pos,
127
                                      WORD8 *pi1_sao_offset,
128
                                      WORD32 wd,
129
                                      WORD32 ht)
130
50.0k
{
131
50.0k
    WORD32 row, col;
132
50.0k
    UWORD8 *pu1_src_cpy;
133
50.0k
    WORD32 wd_rem;
134
50.0k
    WORD8 offset = 0;
135
136
50.0k
    __m128i src_temp0_8x16b, src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b;
137
50.0k
    __m128i band_table0_8x16b, band_table1_8x16b, band_table2_8x16b, band_table3_8x16b;
138
50.0k
    __m128i tmp_set_128i_1, tmp_set_128i_2, tmp_set_128i_3, tmp_set_128i_4;
139
50.0k
    __m128i band_pos_16x8b;
140
50.0k
    __m128i sao_offset;
141
50.0k
    __m128i cmp_mask, cmp_store;
142
143
    /* Updating left and top-left and top */
144
1.67M
    for(row = 0; row < ht; row++)
145
1.62M
    {
146
1.62M
        pu1_src_left[row] = pu1_src[row * src_strd + (wd - 1)];
147
1.62M
    }
148
50.0k
    pu1_src_top_left[0] = pu1_src_top[wd - 1];
149
258k
    for(col = 0; col < wd; col += 8)
150
208k
    {
151
208k
        tmp_set_128i_1 = _mm_loadl_epi64((__m128i *)(pu1_src + (ht - 1) * src_strd + offset));
152
208k
        _mm_storel_epi64((__m128i *)(pu1_src_top + offset), tmp_set_128i_1);
153
208k
        offset += 8;
154
208k
    }
155
156
    //replicating sao_band_pos as 8 bit value 16 times
157
158
159
50.0k
    band_pos_16x8b = _mm_set1_epi16((WORD16)(sao_band_pos << 3));
160
    //value set for sao_offset extraction
161
50.0k
    tmp_set_128i_1  = _mm_set_epi8(-128, 1, -128, 1, -128, 1, -128, 1, -128, 1, -128, 1, -128, 1, -128, 1);
162
50.0k
    tmp_set_128i_2  = _mm_set_epi8(-128, 2, -128, 2, -128, 2, -128, 2, -128, 2, -128, 2, -128, 2, -128, 2);
163
50.0k
    tmp_set_128i_3  = _mm_set_epi8(-128, 3, -128, 3, -128, 3, -128, 3, -128, 3, -128, 3, -128, 3, -128, 3);
164
50.0k
    tmp_set_128i_4  = _mm_set_epi8(-128, 4, -128, 4, -128, 4, -128, 4, -128, 4, -128, 4, -128, 4, -128, 4);
165
166
    //loaded sao offset values
167
50.0k
    sao_offset      = _mm_loadl_epi64((__m128i *)pi1_sao_offset);
168
169
    //loading 16bit 32values of gu2_table_band_idx consecutively in 4 registers
170
50.0k
    band_table0_8x16b = _mm_load_si128((__m128i *)(gu2_table_band_idx));
171
50.0k
    band_table1_8x16b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 8));
172
50.0k
    band_table2_8x16b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 16));
173
50.0k
    band_table3_8x16b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 24));
174
175
    //band_position addition
176
50.0k
    band_table0_8x16b = _mm_add_epi16(band_table0_8x16b, band_pos_16x8b);
177
50.0k
    band_table1_8x16b = _mm_add_epi16(band_table1_8x16b, band_pos_16x8b);
178
50.0k
    band_table2_8x16b = _mm_add_epi16(band_table2_8x16b, band_pos_16x8b);
179
50.0k
    band_table3_8x16b = _mm_add_epi16(band_table3_8x16b, band_pos_16x8b);
180
    //sao_offset duplication
181
50.0k
    tmp_set_128i_1  = _mm_shuffle_epi8(sao_offset, tmp_set_128i_1);
182
50.0k
    tmp_set_128i_2  = _mm_shuffle_epi8(sao_offset, tmp_set_128i_2);
183
50.0k
    tmp_set_128i_3  = _mm_shuffle_epi8(sao_offset, tmp_set_128i_3);
184
50.0k
    tmp_set_128i_4  = _mm_shuffle_epi8(sao_offset, tmp_set_128i_4);
185
    //setting for comparison
186
50.0k
    cmp_mask = _mm_set1_epi16(16);
187
50.0k
    cmp_store = _mm_set1_epi16(0x00ff);
188
189
    //sao_offset addition
190
50.0k
    band_table0_8x16b = _mm_add_epi16(band_table0_8x16b, tmp_set_128i_1);
191
50.0k
    band_table1_8x16b = _mm_add_epi16(band_table1_8x16b, tmp_set_128i_2);
192
50.0k
    band_table2_8x16b = _mm_add_epi16(band_table2_8x16b, tmp_set_128i_3);
193
50.0k
    band_table3_8x16b = _mm_add_epi16(band_table3_8x16b, tmp_set_128i_4);
194
    //masking upper 8bit values of each  16 bit band table value
195
50.0k
    band_table0_8x16b = _mm_and_si128(band_table0_8x16b, cmp_store);
196
50.0k
    band_table1_8x16b = _mm_and_si128(band_table1_8x16b, cmp_store);
197
50.0k
    band_table2_8x16b = _mm_and_si128(band_table2_8x16b, cmp_store);
198
50.0k
    band_table3_8x16b = _mm_and_si128(band_table3_8x16b, cmp_store);
199
200
50.0k
    switch(sao_band_pos)
201
50.0k
    {
202
1.22k
        case 0:
203
1.22k
            tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table0_8x16b);
204
1.22k
            band_table0_8x16b = _mm_and_si128(band_table0_8x16b, tmp_set_128i_2);
205
1.22k
            break;
206
548
        case 28:
207
548
            tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table3_8x16b);
208
548
            band_table3_8x16b = _mm_or_si128(band_table3_8x16b, tmp_set_128i_2);
209
548
            break;
210
1.12k
        case 29:
211
1.12k
            tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table2_8x16b);
212
1.12k
            band_table2_8x16b = _mm_or_si128(band_table2_8x16b, tmp_set_128i_2);
213
1.12k
            tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table3_8x16b);
214
1.12k
            band_table3_8x16b = _mm_and_si128(band_table3_8x16b, tmp_set_128i_2);
215
1.12k
            break;
216
496
        case 30:
217
496
            tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table1_8x16b);
218
496
            band_table1_8x16b = _mm_or_si128(band_table1_8x16b, tmp_set_128i_2);
219
496
            tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table2_8x16b);
220
496
            band_table2_8x16b = _mm_and_si128(band_table2_8x16b, tmp_set_128i_2);
221
496
            break;
222
798
        case 31:
223
798
            tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table0_8x16b);
224
798
            band_table0_8x16b = _mm_or_si128(band_table0_8x16b, tmp_set_128i_2);
225
798
            tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table1_8x16b);
226
798
            band_table1_8x16b = _mm_and_si128(band_table1_8x16b, tmp_set_128i_2);
227
798
            break;
228
45.9k
        default:
229
45.9k
            break;
230
50.0k
    }
231
    //sao_offset is reused for zero cmp mask.
232
50.0k
    sao_offset = _mm_setzero_si128();
233
50.0k
    tmp_set_128i_1 = _mm_set1_epi8(1);
234
    //tmp_set_128i_2 = _mm_set_epi8 (128,7,128,6,128,5,128,4,128,3,128,2,128,1,128,0);
235
50.0k
    cmp_mask = _mm_packus_epi16(cmp_mask, cmp_mask); //cmp_msk=dup16(16);
236
237
    //masking upper 8bit values of each  16 bit band table value
238
50.0k
    band_table0_8x16b = _mm_and_si128(band_table0_8x16b, cmp_store);
239
50.0k
    band_table1_8x16b = _mm_and_si128(band_table1_8x16b, cmp_store);
240
50.0k
    band_table2_8x16b = _mm_and_si128(band_table2_8x16b, cmp_store);
241
50.0k
    band_table3_8x16b = _mm_and_si128(band_table3_8x16b, cmp_store);
242
243
    //band table 8x16 four registers are packed into 16x8 two registers:  band_table0_8x16b and band_table2_8x16b
244
50.0k
    band_table0_8x16b = _mm_packus_epi16(band_table0_8x16b, band_table1_8x16b);
245
50.0k
    band_table2_8x16b = _mm_packus_epi16(band_table2_8x16b, band_table3_8x16b);
246
247
50.0k
    band_table3_8x16b = _mm_slli_epi16(cmp_mask, 1); // to compare if value is greater than 31
248
50.0k
    band_pos_16x8b = _mm_packus_epi16(band_pos_16x8b, band_pos_16x8b); //band_pos is now 8 bit aligned
249
50.0k
    band_table3_8x16b = _mm_sub_epi8(band_table3_8x16b, tmp_set_128i_1); // to compare if value is greater than 31
250
251
50.0k
    cmp_mask = _mm_sub_epi8(cmp_mask, tmp_set_128i_1);
252
    //  band_pos_16x8b = _mm_or_si128(band_pos_16x8b,cmp_store);
253
254
130k
    for(col = wd; col >= 16; col -= 16)
255
80.3k
    {
256
80.3k
        pu1_src_cpy = pu1_src;
257
1.38M
        for(row = ht; row > 0; row -= 2)
258
1.30M
        {
259
260
261
            //row = 0 load 8 pixel values from 7:0 pos. relative to cur. pos.
262
1.30M
            src_temp0_8x16b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
263
            // row = 1
264
1.30M
            src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
265
266
267
268
            //saturated subtract 8 bit
269
1.30M
            tmp_set_128i_1 = _mm_sub_epi8(src_temp0_8x16b, band_pos_16x8b);
270
1.30M
            tmp_set_128i_3 = _mm_sub_epi8(src_temp2_8x16b, band_pos_16x8b);
271
            //if the values less than 0 put ff
272
1.30M
            tmp_set_128i_2 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_1);
273
1.30M
            tmp_set_128i_4 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_3);
274
1.30M
            tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
275
1.30M
            tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
276
            //if the values greater than 31 put ff
277
1.30M
            tmp_set_128i_2 = _mm_cmpgt_epi8(tmp_set_128i_1, band_table3_8x16b);
278
1.30M
            tmp_set_128i_4 = _mm_cmpgt_epi8(tmp_set_128i_3, band_table3_8x16b);
279
1.30M
            tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
280
1.30M
            tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
281
282
283
            //row 0 and row1
284
            //if the values >16 then put ff ,cmp_mask = dup16(15)
285
1.30M
            cmp_store = _mm_cmpgt_epi8(tmp_set_128i_1, cmp_mask);
286
            //values 16 to 31 for row 0 & 1 but values <16 ==0
287
1.30M
            tmp_set_128i_2 = _mm_and_si128(tmp_set_128i_1, cmp_store);
288
            // values 0 to 15 for row 0 & 1
289
1.30M
            tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, cmp_store);
290
            //values 16 to 31 for row 0 & 1 but values <16 masked to ff
291
1.30M
            cmp_store = _mm_cmpeq_epi8(tmp_set_128i_2, sao_offset);
292
1.30M
            tmp_set_128i_2 = _mm_or_si128(tmp_set_128i_2, cmp_store);
293
            //row 2 and  row 3
294
            //if the values >16 then put ff ,cmp_mask = dup16(15)
295
1.30M
            cmp_store = _mm_cmpgt_epi8(tmp_set_128i_3, cmp_mask);
296
            //values 16 to 31 for row 2 & 3 but values <16 ==0
297
1.30M
            tmp_set_128i_4 = _mm_and_si128(tmp_set_128i_3, cmp_store);
298
            // values 0 to 15 for row 2 & 3
299
1.30M
            tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, cmp_store);
300
            //values 16 to 31 for row 2 & 3 but values <16 masked to ff
301
1.30M
            cmp_store = _mm_cmpeq_epi8(tmp_set_128i_4, sao_offset);
302
1.30M
            tmp_set_128i_4 = _mm_or_si128(tmp_set_128i_4, cmp_store);
303
304
            //row 0 and row 1
305
            //to preserve pixel values in which no offset needs to be added.
306
1.30M
            cmp_store = _mm_cmpeq_epi8(tmp_set_128i_1, tmp_set_128i_2);
307
1.30M
            src_temp0_8x16b = _mm_and_si128(src_temp0_8x16b, cmp_store);
308
309
            //row 2 and row 3
310
            //to preserve pixel values in which no offset needs to be added.
311
1.30M
            cmp_store = _mm_cmpeq_epi8(tmp_set_128i_3, tmp_set_128i_4);
312
1.30M
            src_temp2_8x16b = _mm_and_si128(src_temp2_8x16b, cmp_store);
313
314
            //indexing 0 - 15 bandtable indexes
315
1.30M
            tmp_set_128i_1 = _mm_shuffle_epi8(band_table0_8x16b, tmp_set_128i_1);
316
1.30M
            tmp_set_128i_3 = _mm_shuffle_epi8(band_table0_8x16b, tmp_set_128i_3);
317
1.30M
            tmp_set_128i_2 = _mm_shuffle_epi8(band_table2_8x16b, tmp_set_128i_2);
318
1.30M
            tmp_set_128i_4 = _mm_shuffle_epi8(band_table2_8x16b, tmp_set_128i_4);
319
            // combining all offsets results
320
1.30M
            tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
321
1.30M
            tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
322
            // combining results with the pixel values
323
1.30M
            src_temp0_8x16b = _mm_or_si128(src_temp0_8x16b, tmp_set_128i_1);
324
1.30M
            src_temp2_8x16b = _mm_or_si128(src_temp2_8x16b, tmp_set_128i_3);
325
326
327
            //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
328
1.30M
            _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_8x16b);
329
            // row = 1
330
1.30M
            _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp2_8x16b);
331
332
1.30M
            pu1_src_cpy += (src_strd << 1);
333
1.30M
        }
334
80.3k
        pu1_src += 16;
335
80.3k
    }
336
50.0k
    wd_rem = wd & 0xF;
337
50.0k
    if(wd_rem)
338
47.9k
    {pu1_src_cpy = pu1_src;
339
435k
        for(row = ht; row > 0; row -= 4)
340
387k
        {
341
342
343
            //row = 0 load 8 pixel values from 7:0 pos. relative to cur. pos.
344
387k
            src_temp0_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
345
            // row = 1
346
387k
            src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
347
            // row = 2
348
387k
            src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));
349
            // row = 3
350
387k
            src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd));
351
            //row0 and row1 packed and row2 and row3 packed
352
353
387k
            src_temp0_8x16b = _mm_unpacklo_epi64(src_temp0_8x16b, src_temp1_8x16b);
354
387k
            src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp3_8x16b);
355
356
            //saturated subtract 8 bit
357
387k
            tmp_set_128i_1 = _mm_sub_epi8(src_temp0_8x16b, band_pos_16x8b);
358
387k
            tmp_set_128i_3 = _mm_sub_epi8(src_temp2_8x16b, band_pos_16x8b);
359
            //if the values less than 0 put ff
360
387k
            tmp_set_128i_2 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_1);
361
387k
            tmp_set_128i_4 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_3);
362
387k
            tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
363
387k
            tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
364
            //if the values greater than 31 put ff
365
387k
            tmp_set_128i_2 = _mm_cmpgt_epi8(tmp_set_128i_1, band_table3_8x16b);
366
387k
            tmp_set_128i_4 = _mm_cmpgt_epi8(tmp_set_128i_3, band_table3_8x16b);
367
387k
            tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
368
387k
            tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
369
370
371
372
            //row 0 and row1
373
            //if the values >16 then put ff ,cmp_mask = dup16(15)
374
387k
            cmp_store = _mm_cmpgt_epi8(tmp_set_128i_1, cmp_mask);
375
            //values 16 to 31 for row 0 & 1 but values <16 ==0
376
387k
            tmp_set_128i_2 = _mm_and_si128(tmp_set_128i_1, cmp_store);
377
            // values 0 to 15 for row 0 & 1
378
387k
            tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, cmp_store);
379
            //values 16 to 31 for row 0 & 1 but values <16 masked to ff
380
387k
            cmp_store = _mm_cmpeq_epi8(tmp_set_128i_2, sao_offset);
381
387k
            tmp_set_128i_2 = _mm_or_si128(tmp_set_128i_2, cmp_store);
382
            //row 2 and  row 3
383
            //if the values >16 then put ff ,cmp_mask = dup16(15)
384
387k
            cmp_store = _mm_cmpgt_epi8(tmp_set_128i_3, cmp_mask);
385
            //values 16 to 31 for row 2 & 3 but values <16 ==0
386
387k
            tmp_set_128i_4 = _mm_and_si128(tmp_set_128i_3, cmp_store);
387
            // values 0 to 15 for row 2 & 3
388
387k
            tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, cmp_store);
389
            //values 16 to 31 for row 2 & 3 but values <16 masked to ff
390
387k
            cmp_store = _mm_cmpeq_epi8(tmp_set_128i_4, sao_offset);
391
387k
            tmp_set_128i_4 = _mm_or_si128(tmp_set_128i_4, cmp_store);
392
393
            //row 0 and row 1
394
            //to preserve pixel values in which no offset needs to be added.
395
387k
            cmp_store = _mm_cmpeq_epi8(tmp_set_128i_1, tmp_set_128i_2);
396
387k
            src_temp0_8x16b = _mm_and_si128(src_temp0_8x16b, cmp_store);
397
398
            //row 2 and row 3
399
            //to preserve pixel values in which no offset needs to be added.
400
387k
            cmp_store = _mm_cmpeq_epi8(tmp_set_128i_3, tmp_set_128i_4);
401
387k
            src_temp2_8x16b = _mm_and_si128(src_temp2_8x16b, cmp_store);
402
403
            //indexing 0 - 15 bandtable indexes
404
387k
            tmp_set_128i_1 = _mm_shuffle_epi8(band_table0_8x16b, tmp_set_128i_1);
405
387k
            tmp_set_128i_3 = _mm_shuffle_epi8(band_table0_8x16b, tmp_set_128i_3);
406
387k
            tmp_set_128i_2 = _mm_shuffle_epi8(band_table2_8x16b, tmp_set_128i_2);
407
387k
            tmp_set_128i_4 = _mm_shuffle_epi8(band_table2_8x16b, tmp_set_128i_4);
408
            // combining all offsets results
409
387k
            tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
410
387k
            tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
411
            // combining results with the pixel values
412
387k
            src_temp0_8x16b = _mm_or_si128(src_temp0_8x16b, tmp_set_128i_1);
413
387k
            src_temp2_8x16b = _mm_or_si128(src_temp2_8x16b, tmp_set_128i_3);
414
415
            //Getting row1 separately
416
387k
            src_temp1_8x16b = _mm_srli_si128(src_temp0_8x16b, 8);
417
            //Getting row3 separately
418
387k
            src_temp3_8x16b = _mm_srli_si128(src_temp2_8x16b, 8);
419
420
            //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
421
387k
            _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_8x16b);
422
            // row = 1
423
387k
            _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), src_temp1_8x16b);
424
            // row = 2
425
387k
            _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_temp2_8x16b);
426
            // row = 3
427
387k
            _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), src_temp3_8x16b);
428
429
387k
            pu1_src_cpy += (src_strd << 2);
430
431
387k
        }
432
47.9k
        pu1_src += 8;
433
47.9k
    }
434
435
436
50.0k
}
437
438
void ihevc_sao_band_offset_chroma_ssse3(UWORD8 *pu1_src,
439
                                        WORD32 src_strd,
440
                                        UWORD8 *pu1_src_left,
441
                                        UWORD8 *pu1_src_top,
442
                                        UWORD8 *pu1_src_top_left,
443
                                        WORD32 sao_band_pos_u,
444
                                        WORD32 sao_band_pos_v,
445
                                        WORD8 *pi1_sao_offset_u,
446
                                        WORD8 *pi1_sao_offset_v,
447
                                        WORD32 wd,
448
                                        WORD32 ht)
449
24.5k
{
450
24.5k
    WORD32 row, col;
451
24.5k
    WORD8 offset = 0;
452
453
454
24.5k
    __m128i src_temp0_8x16b, src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b;
455
24.5k
    __m128i cmp_msk2;
456
24.5k
    __m128i band_table0_16x8b, band_table1_16x8b, band_table2_16x8b, band_table3_16x8b;
457
24.5k
    __m128i tmp_set_128i_1, tmp_set_128i_2, tmp_set_128i_3, tmp_set_128i_4;
458
24.5k
    __m128i band_pos_u_16x8b, band_pos_v_16x8b;
459
24.5k
    __m128i sao_offset;
460
24.5k
    __m128i cmp_mask;
461
462
463
    /* Updating left and top and top-left */
464
422k
    for(row = 0; row < ht; row++)
465
397k
    {
466
397k
        pu1_src_left[2 * row] = pu1_src[row * src_strd + (wd - 2)];
467
397k
        pu1_src_left[2 * row + 1] = pu1_src[row * src_strd + (wd - 1)];
468
397k
    }
469
24.5k
    pu1_src_top_left[0] = pu1_src_top[wd - 2];
470
24.5k
    pu1_src_top_left[1] = pu1_src_top[wd - 1];
471
126k
    for(col = 0; col < wd; col += 8)
472
101k
    {
473
101k
        tmp_set_128i_1 = _mm_loadl_epi64((__m128i *)(pu1_src + (ht - 1) * src_strd + offset));
474
101k
        _mm_storel_epi64((__m128i *)(pu1_src_top + offset), tmp_set_128i_1);
475
101k
        offset += 8;
476
101k
    }
477
478
24.5k
    { // band table creation
479
24.5k
        __m128i temp0_8x16b, temp1_8x16b, temp2_8x16b, temp3_8x16b;
480
        // Band table for U component : band_table0_16x8b and band_table2_16x8b
481
        //replicating sao_band_pos as 8 bit value 16 times
482
24.5k
        band_pos_u_16x8b = _mm_set1_epi16((WORD16)(sao_band_pos_u << 3));
483
        //value set for sao_offset extraction
484
24.5k
        tmp_set_128i_1  = _mm_set_epi8(-128, 1, -128, 1, -128, 1, -128, 1, -128, 1, -128, 1, -128, 1, -128, 1);
485
24.5k
        tmp_set_128i_2  = _mm_set_epi8(-128, 2, -128, 2, -128, 2, -128, 2, -128, 2, -128, 2, -128, 2, -128, 2);
486
24.5k
        tmp_set_128i_3  = _mm_set_epi8(-128, 3, -128, 3, -128, 3, -128, 3, -128, 3, -128, 3, -128, 3, -128, 3);
487
24.5k
        tmp_set_128i_4  = _mm_set_epi8(-128, 4, -128, 4, -128, 4, -128, 4, -128, 4, -128, 4, -128, 4, -128, 4);
488
489
        //loaded sao offset values
490
24.5k
        sao_offset      = _mm_loadl_epi64((__m128i *)pi1_sao_offset_u);
491
492
        //loading 16bit 32values of gu2_table_band_idx consecutively in 4 registers
493
24.5k
        band_table0_16x8b = _mm_load_si128((__m128i *)(gu2_table_band_idx));
494
24.5k
        band_table1_16x8b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 8));
495
24.5k
        band_table2_16x8b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 16));
496
24.5k
        band_table3_16x8b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 24));
497
498
        //band_position addition
499
24.5k
        band_table0_16x8b = _mm_add_epi16(band_table0_16x8b, band_pos_u_16x8b);
500
24.5k
        band_table1_16x8b = _mm_add_epi16(band_table1_16x8b, band_pos_u_16x8b);
501
24.5k
        band_table2_16x8b = _mm_add_epi16(band_table2_16x8b, band_pos_u_16x8b);
502
24.5k
        band_table3_16x8b = _mm_add_epi16(band_table3_16x8b, band_pos_u_16x8b);
503
        //sao_offset duplication
504
24.5k
        temp0_8x16b = _mm_shuffle_epi8(sao_offset, tmp_set_128i_1);
505
24.5k
        temp1_8x16b = _mm_shuffle_epi8(sao_offset, tmp_set_128i_2);
506
24.5k
        temp2_8x16b = _mm_shuffle_epi8(sao_offset, tmp_set_128i_3);
507
24.5k
        temp3_8x16b = _mm_shuffle_epi8(sao_offset, tmp_set_128i_4);
508
509
        //sao_offset addition
510
24.5k
        band_table0_16x8b = _mm_add_epi16(band_table0_16x8b, temp0_8x16b);
511
24.5k
        band_table1_16x8b = _mm_add_epi16(band_table1_16x8b, temp1_8x16b);
512
24.5k
        band_table2_16x8b = _mm_add_epi16(band_table2_16x8b, temp2_8x16b);
513
24.5k
        band_table3_16x8b = _mm_add_epi16(band_table3_16x8b, temp3_8x16b);
514
        //reuse for clipping
515
24.5k
        temp1_8x16b = _mm_set1_epi16(0x00ff);
516
        //setting for comparison
517
24.5k
        cmp_mask = _mm_set1_epi16(16);
518
519
        //masking upper 8bit values of each  16 bit band table value
520
24.5k
        band_table0_16x8b = _mm_and_si128(band_table0_16x8b, temp1_8x16b);
521
24.5k
        band_table1_16x8b = _mm_and_si128(band_table1_16x8b, temp1_8x16b);
522
24.5k
        band_table2_16x8b = _mm_and_si128(band_table2_16x8b, temp1_8x16b);
523
24.5k
        band_table3_16x8b = _mm_and_si128(band_table3_16x8b, temp1_8x16b);
524
525
        //temp1_8x16b reuse for compare storage
526
24.5k
        switch(sao_band_pos_u)
527
24.5k
        {
528
1.16k
            case 0:
529
1.16k
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table0_16x8b);
530
1.16k
                band_table0_16x8b = _mm_and_si128(band_table0_16x8b, temp3_8x16b);
531
1.16k
                break;
532
284
            case 28:
533
284
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table3_16x8b);
534
284
                band_table3_16x8b = _mm_or_si128(band_table3_16x8b, temp3_8x16b);
535
284
                break;
536
981
            case 29:
537
981
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table2_16x8b);
538
981
                band_table2_16x8b = _mm_or_si128(band_table2_16x8b, temp3_8x16b);
539
981
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table3_16x8b);
540
981
                band_table3_16x8b = _mm_and_si128(band_table3_16x8b, temp3_8x16b);
541
981
                break;
542
616
            case 30:
543
616
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table1_16x8b);
544
616
                band_table1_16x8b = _mm_or_si128(band_table1_16x8b, temp3_8x16b);
545
616
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table2_16x8b);
546
616
                band_table2_16x8b = _mm_and_si128(band_table2_16x8b, temp3_8x16b);
547
616
                break;
548
544
            case 31:
549
544
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table0_16x8b);
550
544
                band_table0_16x8b = _mm_or_si128(band_table0_16x8b, temp3_8x16b);
551
544
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table1_16x8b);
552
544
                band_table1_16x8b = _mm_and_si128(band_table1_16x8b, temp3_8x16b);
553
544
                break;
554
20.9k
            default:
555
20.9k
                break;
556
24.5k
        }
557
        //masking upper 8bit values of each  16 bit band table value
558
24.5k
        band_table0_16x8b = _mm_and_si128(band_table0_16x8b, temp1_8x16b);
559
24.5k
        band_table1_16x8b = _mm_and_si128(band_table1_16x8b, temp1_8x16b);
560
24.5k
        band_table2_16x8b = _mm_and_si128(band_table2_16x8b, temp1_8x16b);
561
24.5k
        band_table3_16x8b = _mm_and_si128(band_table3_16x8b, temp1_8x16b);
562
        //band table 8x16 four registers are packed into 16x8 two registers:  band_table0_8x16b and band_table2_8x16b
563
24.5k
        band_table0_16x8b = _mm_packus_epi16(band_table0_16x8b, band_table1_16x8b);
564
24.5k
        band_table2_16x8b = _mm_packus_epi16(band_table2_16x8b, band_table3_16x8b);
565
        // Band table for U component over
566
567
        // Band table for V component : band_table1_16x8b and band_table3_16x8b
568
        // replicating sao_band_pos as 8 bit value 16 times
569
24.5k
        band_pos_v_16x8b = _mm_set1_epi16((WORD16)(sao_band_pos_v << 3));
570
571
        //loaded sao offset values
572
24.5k
        sao_offset      = _mm_loadl_epi64((__m128i *)pi1_sao_offset_v);
573
574
        //loading 16bit 32values of gu2_table_band_idx consecutively in 4 registers
575
24.5k
        temp0_8x16b = _mm_load_si128((__m128i *)(gu2_table_band_idx));
576
24.5k
        band_table1_16x8b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 8));
577
24.5k
        temp2_8x16b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 16));
578
24.5k
        band_table3_16x8b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 24));
579
580
        //band_position addition
581
24.5k
        temp0_8x16b = _mm_add_epi16(temp0_8x16b, band_pos_v_16x8b);
582
24.5k
        band_table1_16x8b = _mm_add_epi16(band_table1_16x8b, band_pos_v_16x8b);
583
24.5k
        temp2_8x16b = _mm_add_epi16(temp2_8x16b, band_pos_v_16x8b);
584
24.5k
        band_table3_16x8b = _mm_add_epi16(band_table3_16x8b, band_pos_v_16x8b);
585
        //sao_offset duplication
586
24.5k
        tmp_set_128i_1  = _mm_shuffle_epi8(sao_offset, tmp_set_128i_1);
587
24.5k
        tmp_set_128i_2  = _mm_shuffle_epi8(sao_offset, tmp_set_128i_2);
588
24.5k
        tmp_set_128i_3  = _mm_shuffle_epi8(sao_offset, tmp_set_128i_3);
589
24.5k
        tmp_set_128i_4  = _mm_shuffle_epi8(sao_offset, tmp_set_128i_4);
590
591
        //sao_offset addition
592
24.5k
        temp0_8x16b = _mm_add_epi16(temp0_8x16b, tmp_set_128i_1);
593
24.5k
        band_table1_16x8b = _mm_add_epi16(band_table1_16x8b, tmp_set_128i_2);
594
24.5k
        temp2_8x16b = _mm_add_epi16(temp2_8x16b, tmp_set_128i_3);
595
24.5k
        band_table3_16x8b = _mm_add_epi16(band_table3_16x8b, tmp_set_128i_4);
596
597
        //masking upper 8bit values of 16 bit band table value
598
24.5k
        temp0_8x16b = _mm_and_si128(temp0_8x16b, temp1_8x16b);
599
24.5k
        band_table1_16x8b = _mm_and_si128(band_table1_16x8b, temp1_8x16b);
600
24.5k
        temp2_8x16b = _mm_and_si128(temp2_8x16b, temp1_8x16b);
601
24.5k
        band_table3_16x8b = _mm_and_si128(band_table3_16x8b, temp1_8x16b);
602
        //temp1_8x16b reuse for compare storage
603
604
24.5k
        switch(sao_band_pos_v)
605
24.5k
        {
606
1.12k
            case 0:
607
1.12k
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, temp0_8x16b);
608
1.12k
                temp0_8x16b = _mm_and_si128(temp0_8x16b, temp3_8x16b);
609
1.12k
                break;
610
732
            case 28:
611
732
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table3_16x8b);
612
732
                band_table3_16x8b = _mm_or_si128(band_table3_16x8b, temp3_8x16b);
613
732
                break;
614
347
            case 29:
615
347
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, temp2_8x16b);
616
347
                temp2_8x16b = _mm_or_si128(temp2_8x16b, temp3_8x16b);
617
347
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table3_16x8b);
618
347
                band_table3_16x8b = _mm_and_si128(band_table3_16x8b, temp3_8x16b);
619
347
                break;
620
710
            case 30:
621
710
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table1_16x8b);
622
710
                band_table1_16x8b = _mm_or_si128(band_table1_16x8b, temp3_8x16b);
623
710
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, temp2_8x16b);
624
710
                temp2_8x16b = _mm_and_si128(temp2_8x16b, temp3_8x16b);
625
710
                break;
626
546
            case 31:
627
546
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, temp0_8x16b);
628
546
                temp0_8x16b = _mm_or_si128(temp0_8x16b, temp3_8x16b);
629
546
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table1_16x8b);
630
546
                band_table1_16x8b = _mm_and_si128(band_table1_16x8b, temp3_8x16b);
631
546
                break;
632
21.0k
            default:
633
21.0k
                break;
634
24.5k
        }
635
        //masking upper 8bit values of each  16 bit band table value
636
24.5k
        temp0_8x16b = _mm_and_si128(temp0_8x16b, temp1_8x16b);
637
24.5k
        band_table1_16x8b = _mm_and_si128(band_table1_16x8b, temp1_8x16b);
638
24.5k
        temp2_8x16b = _mm_and_si128(temp2_8x16b, temp1_8x16b);
639
24.5k
        band_table3_16x8b = _mm_and_si128(band_table3_16x8b, temp1_8x16b);
640
        //band table 8x16 four registers are packed into 16x8 two registers:  band_table0_8x16b and band_table2_8x16b
641
24.5k
        band_table1_16x8b = _mm_packus_epi16(temp0_8x16b, band_table1_16x8b);
642
24.5k
        band_table3_16x8b = _mm_packus_epi16(temp2_8x16b, band_table3_16x8b);
643
        //band table for u and v created
644
24.5k
    }
645
0
    {
646
24.5k
        UWORD8 *pu1_src_cpy;
647
24.5k
        WORD32 wd_rem;
648
649
650
        //sao_offset is reused for zero cmp mask.
651
24.5k
        sao_offset = _mm_setzero_si128();
652
24.5k
        tmp_set_128i_1 = _mm_set1_epi8(1);
653
        //tmp_set_128i_2 = _mm_set_epi8 (128,7,128,6,128,5,128,4,128,3,128,2,128,1,128,0);
654
24.5k
        cmp_mask = _mm_packus_epi16(cmp_mask, cmp_mask); //cmp_msk=dup16(16);
655
        //to avoid 0xffff being saturated to 0; it should saturate to 0xff instead
656
657
24.5k
        cmp_msk2 = _mm_slli_epi16(cmp_mask, 1); // to compare if value is greater than 31
658
24.5k
        band_pos_u_16x8b = _mm_packus_epi16(band_pos_u_16x8b, band_pos_u_16x8b); //band_pos_u is now 8 bit aligned
659
24.5k
        band_pos_v_16x8b = _mm_packus_epi16(band_pos_v_16x8b, band_pos_v_16x8b); //band_pos_v is now 8 bit aligned
660
24.5k
        cmp_msk2 = _mm_sub_epi8(cmp_msk2, tmp_set_128i_1); // to compare if value is greater than 31
661
662
24.5k
        cmp_mask = _mm_sub_epi8(cmp_mask, tmp_set_128i_1);
663
664
75.2k
        for(col = wd; col >= 16; col -= 16)
665
50.7k
        {
666
50.7k
            pu1_src_cpy = pu1_src;
667
461k
            for(row = ht; row > 0; row -= 2)
668
411k
            {
669
                //row = 0 load 8 pixel values from 7:0 pos. relative to cur. pos.
670
411k
                src_temp0_8x16b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
671
                // row = 1
672
411k
                src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
673
674
675
                //odd values
676
411k
                src_temp1_8x16b = _mm_srli_epi16(src_temp0_8x16b, 8);
677
411k
                src_temp2_8x16b = _mm_srli_epi16(src_temp3_8x16b, 8);
678
                //even values
679
411k
                src_temp0_8x16b = _mm_slli_epi16(src_temp0_8x16b, 8);
680
411k
                src_temp3_8x16b = _mm_slli_epi16(src_temp3_8x16b, 8);
681
411k
                src_temp0_8x16b = _mm_srli_epi16(src_temp0_8x16b, 8);
682
411k
                src_temp3_8x16b = _mm_srli_epi16(src_temp3_8x16b, 8);
683
                //combining odd values
684
411k
                src_temp2_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp2_8x16b);
685
                //combining even values
686
411k
                src_temp0_8x16b = _mm_packus_epi16(src_temp0_8x16b, src_temp3_8x16b);
687
688
                //saturated subtract 8 bit
689
411k
                tmp_set_128i_1 = _mm_sub_epi8(src_temp0_8x16b, band_pos_u_16x8b);
690
411k
                tmp_set_128i_3 = _mm_sub_epi8(src_temp2_8x16b, band_pos_v_16x8b);
691
                //if the values less than 0 put ff
692
411k
                tmp_set_128i_2 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_1);
693
411k
                tmp_set_128i_4 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_3);
694
411k
                tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
695
411k
                tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
696
                //if the values greater than 31 put ff
697
411k
                tmp_set_128i_2 = _mm_cmpgt_epi8(tmp_set_128i_1, cmp_msk2);
698
411k
                tmp_set_128i_4 = _mm_cmpgt_epi8(tmp_set_128i_3, cmp_msk2);
699
411k
                tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
700
411k
                tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
701
                // registers reused to increase performance
702
                //if the values >16 then put ff ,cmp_mask = dup16(15) row 0 and row1
703
411k
                src_temp1_8x16b = _mm_cmpgt_epi8(tmp_set_128i_1, cmp_mask);
704
                //if the values >16 then put ff ,cmp_mask = dup16(15) row 2 and  row 3
705
411k
                src_temp3_8x16b = _mm_cmpgt_epi8(tmp_set_128i_3, cmp_mask);
706
707
                //values 16 to 31 for row 0 & 1 but values <16 ==0
708
411k
                tmp_set_128i_2 = _mm_and_si128(tmp_set_128i_1, src_temp1_8x16b);
709
                // values 0 to 15 for row 0 & 1
710
411k
                tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, src_temp1_8x16b);
711
                //values 16 to 31 for row 2 & 3 but values <16 ==0
712
411k
                tmp_set_128i_4 = _mm_and_si128(tmp_set_128i_3, src_temp3_8x16b);
713
                // values 0 to 15 for row 2 & 3
714
411k
                tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, src_temp3_8x16b);
715
716
                //values 16 to 31 for row 0 & 1 but values <16 masked to ff row 0 and row1
717
411k
                src_temp1_8x16b = _mm_cmpeq_epi8(tmp_set_128i_2, sao_offset);
718
                //values 16 to 31 for row 2 & 3 but values <16 masked to ff row 2 and row 3
719
411k
                src_temp3_8x16b = _mm_cmpeq_epi8(tmp_set_128i_4, sao_offset);
720
411k
                tmp_set_128i_2 = _mm_or_si128(tmp_set_128i_2, src_temp1_8x16b);
721
411k
                tmp_set_128i_4 = _mm_or_si128(tmp_set_128i_4, src_temp3_8x16b);
722
723
724
                //to choose which pixel values to preserve in row 0 and row 1
725
411k
                src_temp1_8x16b = _mm_cmpeq_epi8(tmp_set_128i_1, tmp_set_128i_2);
726
                //to choose which pixel values to preserve in row 2 and row 3
727
411k
                src_temp3_8x16b = _mm_cmpeq_epi8(tmp_set_128i_3, tmp_set_128i_4);
728
                //values of all rows to which no offset needs to be added preserved.
729
411k
                src_temp0_8x16b = _mm_and_si128(src_temp0_8x16b, src_temp1_8x16b);
730
411k
                src_temp2_8x16b = _mm_and_si128(src_temp2_8x16b, src_temp3_8x16b);
731
732
                //indexing 0 - 15 bandtable indexes
733
411k
                tmp_set_128i_1 = _mm_shuffle_epi8(band_table0_16x8b, tmp_set_128i_1); //U low
734
411k
                tmp_set_128i_3 = _mm_shuffle_epi8(band_table1_16x8b, tmp_set_128i_3); //V low
735
                //indexing 16 -31 bandtable indexes
736
411k
                tmp_set_128i_2 = _mm_shuffle_epi8(band_table2_16x8b, tmp_set_128i_2); //U high
737
411k
                tmp_set_128i_4 = _mm_shuffle_epi8(band_table3_16x8b, tmp_set_128i_4); //V high
738
                // combining all offsets results
739
411k
                tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2); //U
740
411k
                tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4); //V
741
                // combining results with the pixel values
742
411k
                src_temp0_8x16b = _mm_or_si128(src_temp0_8x16b, tmp_set_128i_1);
743
411k
                src_temp2_8x16b = _mm_or_si128(src_temp2_8x16b, tmp_set_128i_3);
744
                //reorganising even and odd values
745
411k
                src_temp1_8x16b = _mm_unpacklo_epi8(src_temp0_8x16b, src_temp2_8x16b);
746
411k
                src_temp3_8x16b = _mm_unpackhi_epi8(src_temp0_8x16b, src_temp2_8x16b);
747
748
749
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
750
411k
                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp1_8x16b);
751
                // row = 1
752
411k
                _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp3_8x16b);
753
754
755
411k
                pu1_src_cpy += (src_strd << 1);
756
757
411k
            }
758
50.7k
            pu1_src += 16;
759
50.7k
        }
760
761
24.5k
        wd_rem = wd & 0xF;
762
24.5k
        if(wd_rem)
763
154
        {
764
154
            pu1_src_cpy = pu1_src;
765
770
            for(row = ht; row > 0; row -= 4)
766
616
            {
767
                //row = 0 load 8 pixel values from 7:0 pos. relative to cur. pos.
768
616
                src_temp0_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
769
                // row = 1
770
616
                src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
771
                // row = 2
772
616
                src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));
773
                // row = 3
774
616
                src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd));
775
                //row0 and row1 packed and row2 and row3 packed
776
777
616
                src_temp0_8x16b = _mm_unpacklo_epi64(src_temp0_8x16b, src_temp1_8x16b);
778
616
                src_temp3_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp3_8x16b);
779
                //odd values
780
616
                src_temp1_8x16b = _mm_srli_epi16(src_temp0_8x16b, 8);
781
616
                src_temp2_8x16b = _mm_srli_epi16(src_temp3_8x16b, 8);
782
                //even values
783
616
                src_temp0_8x16b = _mm_slli_epi16(src_temp0_8x16b, 8);
784
616
                src_temp3_8x16b = _mm_slli_epi16(src_temp3_8x16b, 8);
785
616
                src_temp0_8x16b = _mm_srli_epi16(src_temp0_8x16b, 8);
786
616
                src_temp3_8x16b = _mm_srli_epi16(src_temp3_8x16b, 8);
787
                //combining odd values
788
616
                src_temp2_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp2_8x16b);
789
                //combining even values
790
616
                src_temp0_8x16b = _mm_packus_epi16(src_temp0_8x16b, src_temp3_8x16b);
791
792
                //saturated subtract 8 bit
793
616
                tmp_set_128i_1 = _mm_sub_epi8(src_temp0_8x16b, band_pos_u_16x8b);
794
616
                tmp_set_128i_3 = _mm_sub_epi8(src_temp2_8x16b, band_pos_v_16x8b);
795
                //if the values less than 0 put ff
796
616
                tmp_set_128i_2 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_1);
797
616
                tmp_set_128i_4 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_3);
798
616
                tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
799
616
                tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
800
                //if the values greater than 31 put ff
801
616
                tmp_set_128i_2 = _mm_cmpgt_epi8(tmp_set_128i_1, cmp_msk2);
802
616
                tmp_set_128i_4 = _mm_cmpgt_epi8(tmp_set_128i_3, cmp_msk2);
803
616
                tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
804
616
                tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
805
                // registers reused to increase performance
806
                //if the values >16 then put ff ,cmp_mask = dup16(15) row 0 and row1
807
616
                src_temp1_8x16b = _mm_cmpgt_epi8(tmp_set_128i_1, cmp_mask);
808
                //if the values >16 then put ff ,cmp_mask = dup16(15) row 2 and  row 3
809
616
                src_temp3_8x16b = _mm_cmpgt_epi8(tmp_set_128i_3, cmp_mask);
810
811
                //values 16 to 31 for row 0 & 1 but values <16 ==0
812
616
                tmp_set_128i_2 = _mm_and_si128(tmp_set_128i_1, src_temp1_8x16b);
813
                // values 0 to 15 for row 0 & 1
814
616
                tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, src_temp1_8x16b);
815
                //values 16 to 31 for row 2 & 3 but values <16 ==0
816
616
                tmp_set_128i_4 = _mm_and_si128(tmp_set_128i_3, src_temp3_8x16b);
817
                // values 0 to 15 for row 2 & 3
818
616
                tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, src_temp3_8x16b);
819
820
                //values 16 to 31 for row 0 & 1 but values <16 masked to ff row 0 and row1
821
616
                src_temp1_8x16b = _mm_cmpeq_epi8(tmp_set_128i_2, sao_offset);
822
                //values 16 to 31 for row 2 & 3 but values <16 masked to ff row 2 and row 3
823
616
                src_temp3_8x16b = _mm_cmpeq_epi8(tmp_set_128i_4, sao_offset);
824
616
                tmp_set_128i_2 = _mm_or_si128(tmp_set_128i_2, src_temp1_8x16b);
825
616
                tmp_set_128i_4 = _mm_or_si128(tmp_set_128i_4, src_temp3_8x16b);
826
827
828
                //to choose which pixel values to preserve in row 0 and row 1
829
616
                src_temp1_8x16b = _mm_cmpeq_epi8(tmp_set_128i_1, tmp_set_128i_2);
830
                //to choose which pixel values to preserve in row 2 and row 3
831
616
                src_temp3_8x16b = _mm_cmpeq_epi8(tmp_set_128i_3, tmp_set_128i_4);
832
                //values of all rows to which no offset needs to be added preserved.
833
616
                src_temp0_8x16b = _mm_and_si128(src_temp0_8x16b, src_temp1_8x16b);
834
616
                src_temp2_8x16b = _mm_and_si128(src_temp2_8x16b, src_temp3_8x16b);
835
836
                //indexing 0 - 15 bandtable indexes
837
616
                tmp_set_128i_1 = _mm_shuffle_epi8(band_table0_16x8b, tmp_set_128i_1); //U low
838
616
                tmp_set_128i_3 = _mm_shuffle_epi8(band_table1_16x8b, tmp_set_128i_3); //V low
839
                //indexing 16 -31 bandtable indexes
840
616
                tmp_set_128i_2 = _mm_shuffle_epi8(band_table2_16x8b, tmp_set_128i_2); //U high
841
616
                tmp_set_128i_4 = _mm_shuffle_epi8(band_table3_16x8b, tmp_set_128i_4); //V high
842
                // combining all offsets results
843
616
                tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2); //U
844
616
                tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4); //V
845
                // combining results with the pixel values
846
616
                src_temp0_8x16b = _mm_or_si128(src_temp0_8x16b, tmp_set_128i_1);
847
616
                src_temp2_8x16b = _mm_or_si128(src_temp2_8x16b, tmp_set_128i_3);
848
                //reorganising even and odd values
849
616
                src_temp1_8x16b = _mm_unpacklo_epi8(src_temp0_8x16b, src_temp2_8x16b);
850
616
                src_temp3_8x16b = _mm_unpackhi_epi8(src_temp0_8x16b, src_temp2_8x16b);
851
                //Getting row1 separately
852
616
                src_temp0_8x16b = _mm_srli_si128(src_temp1_8x16b, 8);
853
                //Getting row3 separately
854
616
                src_temp2_8x16b = _mm_srli_si128(src_temp3_8x16b, 8);
855
856
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
857
616
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp1_8x16b);
858
                // row = 1
859
616
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), src_temp0_8x16b);
860
                // row = 2
861
616
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_temp3_8x16b);
862
                // row = 3
863
616
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), src_temp2_8x16b);
864
865
616
                pu1_src_cpy += (src_strd << 2);
866
867
616
            }
868
154
            pu1_src += 16;
869
154
        }
870
871
872
24.5k
    }
873
24.5k
}
874
875
876
877
void ihevc_sao_edge_offset_class0_ssse3(UWORD8 *pu1_src,
878
                                        WORD32 src_strd,
879
                                        UWORD8 *pu1_src_left,
880
                                        UWORD8 *pu1_src_top,
881
                                        UWORD8 *pu1_src_top_left,
882
                                        UWORD8 *pu1_src_top_right,
883
                                        UWORD8 *pu1_src_bot_left,
884
                                        UWORD8 *pu1_avail,
885
                                        WORD8 *pi1_sao_offset,
886
                                        WORD32 wd,
887
                                        WORD32 ht)
888
15.3k
{
889
15.3k
    WORD32 row, col;
890
15.3k
    UWORD8 *pu1_src_cpy, *pu1_src_left_cpy, *pu1_src_left_str, *pu1_left_tmp;
891
15.3k
    UWORD8 au1_mask[MAX_CTB_SIZE], *au1_mask_cpy;
892
15.3k
    UWORD8 au1_src_left_tmp[MAX_CTB_SIZE + 8];
893
15.3k
    UWORD8 au1_src_left_tmp1[MAX_CTB_SIZE + 8];
894
15.3k
    UWORD8 u1_avail0, u1_avail1;
895
15.3k
    WORD32 wd_rem;
896
15.3k
    WORD32 offset = 0;
897
15.3k
    __m128i src_temp0_16x8b, src_temp1_16x8b;
898
15.3k
    __m128i left0_16x8b, left1_16x8b;
899
15.3k
    __m128i cmp_gt0_16x8b, cmp_lt0_16x8b, cmp_gt1_16x8b, cmp_lt1_16x8b;
900
15.3k
    __m128i edge0_16x8b, edge1_16x8b;
901
15.3k
    __m128i au1_mask8x16b;
902
15.3k
    __m128i edge_idx_8x16b, sao_offset_8x16b;
903
15.3k
    __m128i const2_16x8b, const0_16x8b;
904
15.3k
    __m128i left_store_16x8b;
905
15.3k
    UNUSED(pu1_src_top_right);
906
15.3k
    UNUSED(pu1_src_bot_left);
907
908
15.3k
    au1_mask8x16b = _mm_set1_epi8(0xff);
909
910
    /* Update  top and top-left arrays */
911
912
15.3k
    *pu1_src_top_left = pu1_src_top[wd - 1];
913
914
39.5k
    for(col = wd; col >= 16; col -= 16)
915
24.1k
    {
916
24.1k
        const0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + offset + (ht - 1) * src_strd));
917
24.1k
        _mm_storeu_si128((__m128i *)(pu1_src_top + offset), const0_16x8b);
918
24.1k
        offset += 16;
919
24.1k
    }
920
921
    //setting availability mask to ff size MAX_CTB_SIZE
922
76.8k
    for(col = 0; col < MAX_CTB_SIZE; col += 16)
923
61.5k
        _mm_storeu_si128((__m128i *)(au1_mask + col), au1_mask8x16b);
924
512k
    for(row = 0; row < ht; row++)
925
497k
    {
926
497k
        au1_src_left_tmp[row] = pu1_src_left[row];
927
497k
    }
928
15.3k
    edge_idx_8x16b   = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
929
15.3k
    sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset);
930
931
    //availability mask creation
932
15.3k
    u1_avail0 = pu1_avail[0];
933
15.3k
    u1_avail1 = pu1_avail[1];
934
15.3k
    au1_mask[0] = u1_avail0;
935
15.3k
    au1_mask[wd - 1] = u1_avail1;
936
937
15.3k
    const2_16x8b = _mm_set1_epi8(2);
938
15.3k
    const0_16x8b = _mm_setzero_si128();
939
15.3k
    pu1_src_left_cpy = au1_src_left_tmp;
940
15.3k
    pu1_src_left_str = au1_src_left_tmp1;
941
15.3k
    {
942
15.3k
        au1_mask_cpy = au1_mask;
943
39.5k
        for(col = wd; col >= 16; col -= 16)
944
24.1k
        {
945
24.1k
            pu1_src_cpy = pu1_src;
946
24.1k
            au1_mask8x16b = _mm_loadu_si128((__m128i *)au1_mask_cpy);
947
            //pu1_src_left_cpy =au1_src_left_tmp;
948
414k
            for(row = ht; row > 0; row -= 2)
949
390k
            {
950
951
390k
                left_store_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_left_cpy));
952
                //row = 0 load 8 pixel values from 7:0 pos. relative to cur. pos.
953
390k
                src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
954
                // row = 1
955
390k
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
956
957
390k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, left_store_16x8b, 2);
958
                //row 1 left
959
390k
                left1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, left_store_16x8b, 15);
960
390k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp1_16x8b, 15);
961
                //row 0 left
962
390k
                left0_16x8b = _mm_alignr_epi8(src_temp0_16x8b, left_store_16x8b, 15);
963
390k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
964
965
966
                //separating +ve and -ve values.
967
390k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, left0_16x8b);
968
390k
                cmp_lt0_16x8b = _mm_subs_epu8(left0_16x8b, src_temp0_16x8b);
969
390k
                cmp_gt1_16x8b = _mm_subs_epu8(src_temp1_16x8b, left1_16x8b);
970
390k
                cmp_lt1_16x8b = _mm_subs_epu8(left1_16x8b, src_temp1_16x8b);
971
                //creating mask 00 for +ve and -ve values and FF for zero.
972
390k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
973
390k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
974
390k
                cmp_gt1_16x8b = _mm_cmpeq_epi8(cmp_gt1_16x8b, const0_16x8b);
975
390k
                cmp_lt1_16x8b = _mm_cmpeq_epi8(cmp_lt1_16x8b, const0_16x8b);
976
                //combining the appropriate sign change
977
390k
                left0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
978
390k
                left1_16x8b = _mm_sub_epi8(cmp_gt1_16x8b, cmp_lt1_16x8b);
979
980
                //row = 0 right
981
390k
                edge0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 1));
982
                // row = 1 right
983
390k
                edge1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 1));
984
                //separating +ve and -ve values.
985
390k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, edge0_16x8b);
986
390k
                cmp_lt0_16x8b = _mm_subs_epu8(edge0_16x8b, src_temp0_16x8b);
987
390k
                cmp_gt1_16x8b = _mm_subs_epu8(src_temp1_16x8b, edge1_16x8b);
988
390k
                cmp_lt1_16x8b = _mm_subs_epu8(edge1_16x8b, src_temp1_16x8b);
989
                //creating mask 00 for +ve and -ve values and FF for zero.
990
390k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
991
390k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
992
390k
                cmp_gt1_16x8b = _mm_cmpeq_epi8(cmp_gt1_16x8b, const0_16x8b);
993
390k
                cmp_lt1_16x8b = _mm_cmpeq_epi8(cmp_lt1_16x8b, const0_16x8b);
994
                //combining the appropriate sign change
995
390k
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
996
390k
                edge1_16x8b = _mm_sub_epi8(cmp_gt1_16x8b, cmp_lt1_16x8b);
997
998
                //combining sign-left and sign_right
999
390k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, left0_16x8b);
1000
390k
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, left1_16x8b);
1001
                //adding constant 2
1002
390k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
1003
390k
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
1004
                //shuffle to get sao index
1005
390k
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
1006
390k
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
1007
                //using availability mask
1008
390k
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
1009
390k
                edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
1010
1011
                //shuffle to get sao offset
1012
390k
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
1013
390k
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
1014
                //convert to 16 bit, then add, then saturated pack
1015
390k
                left0_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
1016
390k
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
1017
390k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, left0_16x8b);
1018
390k
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
1019
390k
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
1020
390k
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, left0_16x8b);
1021
390k
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
1022
390k
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
1023
1024
390k
                left0_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
1025
390k
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
1026
390k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, left0_16x8b);
1027
390k
                src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
1028
390k
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
1029
390k
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, left0_16x8b);
1030
390k
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
1031
390k
                src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
1032
1033
1034
390k
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
1035
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
1036
390k
                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
1037
                // row = 1
1038
390k
                _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);
1039
1040
390k
                pu1_src_cpy += (src_strd << 1);
1041
390k
                pu1_src_left_cpy += 2;
1042
390k
                pu1_src_left_str += 2;
1043
390k
            }
1044
24.1k
            au1_mask_cpy += 16;
1045
24.1k
            pu1_src += 16;
1046
24.1k
            pu1_src_left_cpy -= ht;
1047
24.1k
            pu1_src_left_str -= ht;
1048
1049
24.1k
            pu1_left_tmp = pu1_src_left_cpy;
1050
24.1k
            pu1_src_left_cpy = pu1_src_left_str;
1051
24.1k
            pu1_src_left_str = pu1_left_tmp;
1052
24.1k
        }
1053
1054
15.3k
        wd_rem = wd & 0xF;
1055
15.3k
        if(wd_rem)
1056
14.9k
        {
1057
1058
14.9k
            cmp_gt1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + (ht - 1) * src_strd));
1059
14.9k
            _mm_storel_epi64((__m128i *)(pu1_src_top + offset), cmp_gt1_16x8b);
1060
1061
14.9k
            au1_mask8x16b = _mm_loadl_epi64((__m128i *)au1_mask_cpy);
1062
14.9k
            pu1_src_cpy = pu1_src;
1063
14.9k
            au1_mask8x16b = _mm_unpacklo_epi64(au1_mask8x16b, au1_mask8x16b);
1064
            //pu1_src_left_cpy =au1_src_left_tmp;
1065
135k
            for(row = ht; row > 0; row -= 4)
1066
120k
            {
1067
120k
                left_store_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_left_cpy));
1068
                //row = 0 load 8 pixel values from 7:0 pos. relative to cur. pos.
1069
120k
                src_temp0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
1070
                // row = 1
1071
120k
                cmp_gt0_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
1072
                // row  = 2
1073
120k
                src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));
1074
                // row = 3
1075
120k
                cmp_gt1_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd));
1076
1077
1078
120k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, left_store_16x8b, 4);
1079
                //row 3 left
1080
120k
                edge0_16x8b = _mm_slli_si128(cmp_gt1_16x8b, 8);
1081
120k
                cmp_lt1_16x8b = _mm_alignr_epi8(cmp_gt1_16x8b, left_store_16x8b, 15);
1082
120k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 15);
1083
                //row 2 left
1084
120k
                edge0_16x8b = _mm_slli_si128(src_temp1_16x8b, 8);
1085
120k
                left1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, left_store_16x8b, 15);
1086
120k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 15);
1087
                //row 1 left
1088
120k
                edge0_16x8b = _mm_slli_si128(cmp_gt0_16x8b, 8);
1089
120k
                cmp_lt0_16x8b = _mm_alignr_epi8(cmp_gt0_16x8b, left_store_16x8b, 15);
1090
120k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 15);
1091
                //row 0 left
1092
120k
                edge0_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
1093
120k
                left0_16x8b = _mm_alignr_epi8(src_temp0_16x8b, left_store_16x8b, 15);
1094
120k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 15);
1095
1096
                // packing rows together for 16 SIMD operations
1097
120k
                src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, cmp_gt0_16x8b);
1098
120k
                src_temp1_16x8b = _mm_unpacklo_epi64(src_temp1_16x8b, cmp_gt1_16x8b);
1099
                // packing rows together for 16 SIMD operations
1100
120k
                left0_16x8b = _mm_unpacklo_epi64(left0_16x8b, cmp_lt0_16x8b);
1101
120k
                left1_16x8b = _mm_unpacklo_epi64(left1_16x8b, cmp_lt1_16x8b);
1102
1103
                //separating +ve and -ve values.
1104
120k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, left0_16x8b);
1105
120k
                cmp_lt0_16x8b = _mm_subs_epu8(left0_16x8b, src_temp0_16x8b);
1106
120k
                cmp_gt1_16x8b = _mm_subs_epu8(src_temp1_16x8b, left1_16x8b);
1107
120k
                cmp_lt1_16x8b = _mm_subs_epu8(left1_16x8b, src_temp1_16x8b);
1108
                //creating mask 00 for +ve and -ve values and FF for zero.
1109
120k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
1110
120k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
1111
120k
                cmp_gt1_16x8b = _mm_cmpeq_epi8(cmp_gt1_16x8b, const0_16x8b);
1112
120k
                cmp_lt1_16x8b = _mm_cmpeq_epi8(cmp_lt1_16x8b, const0_16x8b);
1113
                //combining the appropriate sign change
1114
120k
                left0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
1115
120k
                left1_16x8b = _mm_sub_epi8(cmp_gt1_16x8b, cmp_lt1_16x8b);
1116
1117
                //row = 0 right
1118
120k
                edge0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 1));
1119
                // row = 1 right
1120
120k
                cmp_gt0_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd + 1));
1121
                // row = 2 right
1122
120k
                edge1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd + 1));
1123
                // row = 3 right
1124
120k
                cmp_gt1_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd + 1));
1125
                // packing rows together for 16 SIMD operations
1126
120k
                edge0_16x8b = _mm_unpacklo_epi64(edge0_16x8b, cmp_gt0_16x8b);
1127
120k
                edge1_16x8b = _mm_unpacklo_epi64(edge1_16x8b, cmp_gt1_16x8b);
1128
1129
                //separating +ve and and -ve values.
1130
120k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, edge0_16x8b);
1131
120k
                cmp_lt0_16x8b = _mm_subs_epu8(edge0_16x8b, src_temp0_16x8b);
1132
120k
                cmp_gt1_16x8b = _mm_subs_epu8(src_temp1_16x8b, edge1_16x8b);
1133
120k
                cmp_lt1_16x8b = _mm_subs_epu8(edge1_16x8b, src_temp1_16x8b);
1134
                //creating mask 00 for +ve and -ve values and FF for zero.
1135
120k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
1136
120k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
1137
120k
                cmp_gt1_16x8b = _mm_cmpeq_epi8(cmp_gt1_16x8b, const0_16x8b);
1138
120k
                cmp_lt1_16x8b = _mm_cmpeq_epi8(cmp_lt1_16x8b, const0_16x8b);
1139
                //combining the appropriate sign change
1140
120k
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
1141
120k
                edge1_16x8b = _mm_sub_epi8(cmp_gt1_16x8b, cmp_lt1_16x8b);
1142
1143
                //combining sign-left and sign_right
1144
120k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, left0_16x8b);
1145
120k
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, left1_16x8b);
1146
                //adding constant 2
1147
120k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
1148
120k
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
1149
                //shuffle to get sao index
1150
120k
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
1151
120k
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
1152
                //shuffle to get sao offset
1153
                //using availability mask
1154
120k
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
1155
120k
                edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
1156
1157
120k
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
1158
120k
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
1159
                //cnvert to 16 bit then add and then saturated pack
1160
120k
                left0_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
1161
120k
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
1162
120k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, left0_16x8b);
1163
120k
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
1164
120k
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
1165
120k
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, left0_16x8b);
1166
120k
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
1167
120k
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
1168
1169
120k
                left0_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
1170
120k
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
1171
120k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, left0_16x8b);
1172
120k
                src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
1173
120k
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
1174
120k
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, left0_16x8b);
1175
120k
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
1176
120k
                src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
1177
                //separting row 1 and row 3
1178
120k
                cmp_lt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
1179
120k
                cmp_lt1_16x8b = _mm_srli_si128(src_temp1_16x8b, 8);
1180
1181
120k
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
1182
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
1183
120k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
1184
                // row = 1
1185
120k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_lt0_16x8b);
1186
                // row = 2
1187
120k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_temp1_16x8b);
1188
                // row = 3
1189
120k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt1_16x8b);
1190
1191
120k
                pu1_src_cpy += (src_strd << 2);
1192
120k
                pu1_src_left_cpy += 4;
1193
120k
                pu1_src_left_str += 4;
1194
120k
            }
1195
14.9k
            pu1_src += wd;
1196
14.9k
            pu1_src_left_cpy -= ht;
1197
14.9k
            pu1_src_left_str -= ht;
1198
1199
14.9k
            pu1_left_tmp = pu1_src_left_cpy;
1200
14.9k
            pu1_src_left_cpy = pu1_src_left_str;
1201
14.9k
            pu1_src_left_str = pu1_left_tmp;
1202
14.9k
        }
1203
512k
        for(row = 0; row < ht; row++)
1204
497k
        {
1205
497k
            pu1_src_left[row] = pu1_src_left_cpy[row];
1206
497k
        }
1207
15.3k
    }
1208
15.3k
}
1209
1210
1211
void ihevc_sao_edge_offset_class0_chroma_ssse3(UWORD8 *pu1_src,
1212
                                               WORD32 src_strd,
1213
                                               UWORD8 *pu1_src_left,
1214
                                               UWORD8 *pu1_src_top,
1215
                                               UWORD8 *pu1_src_top_left,
1216
                                               UWORD8 *pu1_src_top_right,
1217
                                               UWORD8 *pu1_src_bot_left,
1218
                                               UWORD8 *pu1_avail,
1219
                                               WORD8 *pi1_sao_offset_u,
1220
                                               WORD8 *pi1_sao_offset_v,
1221
                                               WORD32 wd,
1222
                                               WORD32 ht)
1223
6.58k
{
1224
6.58k
    WORD32 row, col;
1225
6.58k
    UWORD8 *pu1_src_cpy, *pu1_src_left_cpy, *pu1_src_left_str, *pu1_left_tmp;
1226
6.58k
    UWORD8 au1_mask[MAX_CTB_SIZE], *au1_mask_cpy;
1227
6.58k
    UWORD8 au1_src_left_tmp[2 * (MAX_CTB_SIZE + 8)];
1228
6.58k
    UWORD8 au1_src_left_tmp1[2 * (MAX_CTB_SIZE + 8)];
1229
6.58k
    UWORD8 u1_avail0, u1_avail1;
1230
6.58k
    WORD32 wd_rem;
1231
6.58k
    WORD32 offset = 0;
1232
1233
6.58k
    __m128i src_temp0_16x8b, src_temp1_16x8b;
1234
6.58k
    __m128i left0_16x8b, left1_16x8b;
1235
6.58k
    __m128i cmp_gt0_16x8b, cmp_lt0_16x8b;
1236
6.58k
    __m128i edge0_16x8b, edge1_16x8b;
1237
6.58k
    __m128i au1_mask8x16b;
1238
6.58k
    __m128i edge_idx_8x16b, sao_offset_8x16b;
1239
6.58k
    __m128i const2_16x8b, const0_16x8b;
1240
6.58k
    __m128i left_store_16x8b;
1241
6.58k
    __m128i chroma_offset_8x16b;
1242
6.58k
    UNUSED(pu1_src_top_right);
1243
6.58k
    UNUSED(pu1_src_bot_left);
1244
1245
6.58k
    au1_mask8x16b = _mm_set1_epi8(0xff);
1246
1247
    /* Update  top and top-left arrays */
1248
6.58k
    pu1_src_top_left[0] = pu1_src_top[wd - 2];
1249
6.58k
    pu1_src_top_left[1] = pu1_src_top[wd - 1];;
1250
1251
20.3k
    for(col = wd; col >= 16; col -= 16)
1252
13.7k
    {
1253
13.7k
        const0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + offset + (ht - 1) * src_strd));
1254
13.7k
        _mm_storeu_si128((__m128i *)(pu1_src_top + offset), const0_16x8b);
1255
13.7k
        offset += 16;
1256
13.7k
    }
1257
219k
    for(row = 0; row < 2 * ht; row++)
1258
213k
    {
1259
213k
        au1_src_left_tmp[row] = pu1_src_left[row];
1260
213k
    }
1261
    //setting availability mask to ff size MAX_CTB_SIZE
1262
32.9k
    for(col = 0; col < MAX_CTB_SIZE; col += 16)
1263
26.3k
        _mm_storeu_si128((__m128i *)(au1_mask + col), au1_mask8x16b);
1264
1265
6.58k
    edge_idx_8x16b   = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
1266
6.58k
    sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_u);
1267
6.58k
    const0_16x8b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_v);
1268
6.58k
    chroma_offset_8x16b = _mm_set1_epi16(0x0800);
1269
    //availability mask creation
1270
6.58k
    u1_avail0 = pu1_avail[0];
1271
6.58k
    u1_avail1 = pu1_avail[1];
1272
6.58k
    au1_mask[0] = u1_avail0;
1273
6.58k
    au1_mask[1] = u1_avail0;
1274
6.58k
    au1_mask[wd - 1] = u1_avail1;
1275
6.58k
    au1_mask[wd - 2] = u1_avail1;
1276
6.58k
    sao_offset_8x16b = _mm_unpacklo_epi64(sao_offset_8x16b, const0_16x8b);
1277
6.58k
    const2_16x8b = _mm_set1_epi8(2);
1278
6.58k
    const0_16x8b = _mm_setzero_si128();
1279
1280
6.58k
    {
1281
6.58k
        pu1_src_left_cpy = au1_src_left_tmp;
1282
6.58k
        pu1_src_left_str = au1_src_left_tmp1;
1283
6.58k
        au1_mask_cpy = au1_mask;
1284
20.3k
        for(col = wd; col >= 16; col -= 16)
1285
13.7k
        {
1286
13.7k
            pu1_src_cpy = pu1_src;
1287
13.7k
            au1_mask8x16b = _mm_loadu_si128((__m128i *)au1_mask_cpy);
1288
1289
125k
            for(row = ht; row > 0; row -= 2)
1290
111k
            {
1291
1292
111k
                left_store_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_left_cpy));
1293
                //row = 0 load 8 pixel values from 7:0 pos. relative to cur. pos.
1294
111k
                src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
1295
                // row = 1
1296
111k
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
1297
1298
111k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, left_store_16x8b, 4);
1299
                //row 1 left
1300
111k
                left1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, left_store_16x8b, 14);
1301
111k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp1_16x8b, 14);
1302
                //row 0 left
1303
111k
                left0_16x8b = _mm_alignr_epi8(src_temp0_16x8b, left_store_16x8b, 14);
1304
111k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
1305
1306
1307
                //separating +ve and and -ve values.row 0 left
1308
111k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, left0_16x8b);
1309
111k
                cmp_lt0_16x8b = _mm_subs_epu8(left0_16x8b, src_temp0_16x8b);
1310
                //creating mask 00 for +ve and -ve values and FF for zero.
1311
111k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
1312
111k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
1313
                //combining the appropriate sign change
1314
111k
                left0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
1315
1316
                //separating +ve and and -ve values.row 1 left
1317
111k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, left1_16x8b);
1318
111k
                cmp_lt0_16x8b = _mm_subs_epu8(left1_16x8b, src_temp1_16x8b);
1319
                //creating mask 00 for +ve and -ve values and FF for zero.
1320
111k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
1321
111k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
1322
                //combining the appropriate sign change
1323
111k
                left1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
1324
1325
1326
                //row = 0 right
1327
111k
                edge0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2));
1328
                // row = 1 right
1329
111k
                edge1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 2));
1330
                //separating +ve and and -ve values.row 0 right
1331
111k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, edge0_16x8b);
1332
111k
                cmp_lt0_16x8b = _mm_subs_epu8(edge0_16x8b, src_temp0_16x8b);
1333
                //creating mask 00 for +ve and -ve values and FF for zero.
1334
111k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
1335
111k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
1336
                //combining the appropriate sign change
1337
111k
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
1338
1339
                //separating +ve and and -ve values.row 1 right
1340
111k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, edge1_16x8b);
1341
111k
                cmp_lt0_16x8b = _mm_subs_epu8(edge1_16x8b, src_temp1_16x8b);
1342
                //creating mask 00 for +ve and -ve values and FF for zero.
1343
111k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
1344
111k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
1345
                //combining the appropriate sign change
1346
111k
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
1347
1348
                //combining sign-left and sign_right
1349
111k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, left0_16x8b);
1350
111k
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, left1_16x8b);
1351
                //adding constant 2
1352
111k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
1353
111k
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
1354
                //shuffle to get sao index
1355
111k
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
1356
111k
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
1357
                //using availability mask
1358
111k
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
1359
111k
                edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
1360
                //adding chroma offset to access U and V
1361
111k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
1362
111k
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);
1363
1364
                //shuffle to get sao offset
1365
111k
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
1366
111k
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
1367
                //cnvert to 16 bit then add and then saturated pack
1368
111k
                left0_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
1369
111k
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
1370
111k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, left0_16x8b);
1371
111k
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
1372
111k
                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, left0_16x8b);
1373
111k
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
1374
111k
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
1375
111k
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
1376
1377
111k
                left0_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
1378
111k
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
1379
111k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, left0_16x8b);
1380
111k
                src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
1381
111k
                edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, left0_16x8b);
1382
111k
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
1383
111k
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge1_16x8b);
1384
111k
                src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
1385
1386
111k
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
1387
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
1388
111k
                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
1389
                // row = 1
1390
111k
                _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);
1391
1392
111k
                pu1_src_cpy += (src_strd << 1);
1393
111k
                pu1_src_left_cpy += 4;
1394
111k
                pu1_src_left_str += 4;
1395
111k
            }
1396
13.7k
            au1_mask_cpy += 16;
1397
13.7k
            pu1_src += 16;
1398
13.7k
            pu1_src_left_cpy -= 2 * ht;
1399
13.7k
            pu1_src_left_str -= 2 * ht;
1400
1401
13.7k
            pu1_left_tmp = pu1_src_left_cpy;
1402
13.7k
            pu1_src_left_cpy = pu1_src_left_str;
1403
13.7k
            pu1_src_left_str = pu1_left_tmp;
1404
13.7k
        }
1405
1406
6.58k
        wd_rem = wd & 0xF;
1407
6.58k
        if(wd_rem)
1408
55
        {
1409
1410
55
            cmp_gt0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + (ht - 1) * src_strd));
1411
55
            _mm_storel_epi64((__m128i *)(pu1_src_top + offset), cmp_gt0_16x8b);
1412
1413
55
            au1_mask8x16b = _mm_loadl_epi64((__m128i *)au1_mask_cpy);
1414
55
            pu1_src_cpy = pu1_src;
1415
55
            au1_mask8x16b = _mm_unpacklo_epi64(au1_mask8x16b, au1_mask8x16b);
1416
1417
272
            for(row = ht; row > 0; row -= 4)
1418
217
            {
1419
217
                left_store_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_left_cpy));
1420
                //row = 0 load 8 pixel values from 7:0 pos. relative to cur. pos.
1421
217
                src_temp0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
1422
                // row = 1
1423
217
                cmp_gt0_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
1424
                // row  = 2
1425
217
                src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));
1426
                // row = 3
1427
217
                cmp_lt0_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd));
1428
1429
1430
217
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, left_store_16x8b, 8);
1431
                //row 3 left
1432
217
                edge0_16x8b = _mm_slli_si128(cmp_lt0_16x8b, 8);
1433
217
                left0_16x8b = _mm_alignr_epi8(cmp_lt0_16x8b, left_store_16x8b, 14);
1434
217
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 14);
1435
                //row 2 left
1436
217
                edge0_16x8b = _mm_slli_si128(src_temp1_16x8b, 8);
1437
217
                left1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, left_store_16x8b, 14);
1438
217
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 14);
1439
1440
1441
                // packing rows together for 16 SIMD operations
1442
217
                src_temp1_16x8b = _mm_unpacklo_epi64(src_temp1_16x8b, cmp_lt0_16x8b);
1443
217
                left1_16x8b = _mm_unpacklo_epi64(left1_16x8b, left0_16x8b);
1444
1445
                //row 1 left
1446
217
                edge0_16x8b = _mm_slli_si128(cmp_gt0_16x8b, 8);
1447
217
                edge1_16x8b = _mm_alignr_epi8(cmp_gt0_16x8b, left_store_16x8b, 14);
1448
217
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 14);
1449
                //row 0 left
1450
217
                edge0_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
1451
217
                left0_16x8b = _mm_alignr_epi8(src_temp0_16x8b, left_store_16x8b, 14);
1452
217
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 14);
1453
                // packing rows together for 16 SIMD operations
1454
217
                src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, cmp_gt0_16x8b);
1455
217
                left0_16x8b = _mm_unpacklo_epi64(left0_16x8b, edge1_16x8b);
1456
1457
                //separating +ve and and -ve values.for row 2 and row 3
1458
217
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, left1_16x8b);
1459
217
                cmp_lt0_16x8b = _mm_subs_epu8(left1_16x8b, src_temp1_16x8b);
1460
                //creating mask 00 for +ve and -ve values and FF for zero.
1461
217
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
1462
217
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
1463
                //combining the appropriate sign change
1464
217
                left1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
1465
1466
1467
1468
1469
1470
                //separating +ve and and -ve values.
1471
217
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, left0_16x8b);
1472
217
                cmp_lt0_16x8b = _mm_subs_epu8(left0_16x8b, src_temp0_16x8b);
1473
                //creating mask 00 for +ve and -ve values and FF for zero.
1474
217
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
1475
217
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
1476
                //combining the appropriate sign change
1477
217
                left0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
1478
1479
1480
                //row = 0 right
1481
217
                edge0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2));
1482
                // row = 1 right
1483
217
                cmp_gt0_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd + 2));
1484
                // row = 2 right
1485
217
                edge1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd + 2));
1486
                // row = 3 right
1487
217
                cmp_lt0_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd + 2));
1488
                // packing rows together for 16 SIMD operations
1489
217
                edge0_16x8b = _mm_unpacklo_epi64(edge0_16x8b, cmp_gt0_16x8b);
1490
217
                edge1_16x8b = _mm_unpacklo_epi64(edge1_16x8b, cmp_lt0_16x8b);
1491
1492
                //separating +ve and and -ve values.
1493
217
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, edge0_16x8b);
1494
217
                cmp_lt0_16x8b = _mm_subs_epu8(edge0_16x8b, src_temp0_16x8b);
1495
                //creating mask 00 for +ve and -ve values and FF for zero.
1496
217
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
1497
217
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
1498
                //combining the appropriate sign change
1499
217
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
1500
1501
217
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, edge1_16x8b);
1502
217
                cmp_lt0_16x8b = _mm_subs_epu8(edge1_16x8b, src_temp1_16x8b);
1503
                //creating mask 00 for +ve and -ve values and FF for zero.
1504
217
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
1505
217
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
1506
                //combining the appropriate sign change
1507
217
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
1508
1509
                //combining sign-left and sign_right
1510
217
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, left0_16x8b);
1511
217
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, left1_16x8b);
1512
                //adding constant 2
1513
217
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
1514
217
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
1515
                //shuffle to get sao index
1516
217
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
1517
217
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
1518
                //shuffle to get sao offset
1519
                //using availability mask
1520
217
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
1521
217
                edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
1522
                //adding chroma offset to access U and V
1523
217
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
1524
217
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);
1525
1526
217
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
1527
217
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
1528
                //cnvert to 16 bit then add and then saturated pack
1529
217
                left0_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
1530
217
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
1531
217
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, left0_16x8b);
1532
217
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
1533
217
                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, left0_16x8b);
1534
217
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
1535
217
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
1536
217
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
1537
1538
217
                left0_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
1539
217
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
1540
217
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, left0_16x8b);
1541
217
                src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
1542
217
                edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, left0_16x8b);
1543
217
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
1544
217
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge1_16x8b);
1545
217
                src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
1546
1547
                //seaprting row 1 and row 3
1548
217
                cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
1549
217
                cmp_lt0_16x8b = _mm_srli_si128(src_temp1_16x8b, 8);
1550
1551
217
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
1552
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
1553
217
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
1554
                // row = 1
1555
217
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
1556
                // row = 2
1557
217
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_temp1_16x8b);
1558
                // row = 3
1559
217
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b);
1560
1561
217
                pu1_src_cpy += (src_strd << 2);
1562
217
                pu1_src_left_cpy += 8;
1563
217
                pu1_src_left_str += 8;
1564
217
            }
1565
55
            pu1_src += wd;
1566
55
            pu1_src_left_cpy -= 2 * ht;
1567
55
            pu1_src_left_str -= 2 * ht;
1568
1569
55
            pu1_left_tmp = pu1_src_left_cpy;
1570
55
            pu1_src_left_cpy = pu1_src_left_str;
1571
55
            pu1_src_left_str = pu1_left_tmp;
1572
55
        }
1573
219k
        for(row = 0; row < 2 * ht; row++)
1574
213k
        {
1575
213k
            pu1_src_left[row] = pu1_src_left_cpy[row];
1576
213k
        }
1577
6.58k
    }
1578
1579
6.58k
}
1580
1581
1582
/**
 *******************************************************************************
 *
 * @brief
 *  Applies HEVC SAO edge offset of class 1 (vertical / 90-degree direction)
 *  to a luma block, in place. For every pixel the signs of the differences
 *  with the top and bottom neighbours select an edge-index (via
 *  gi1_table_edge_idx), which in turn selects one of the signalled offsets
 *  (pi1_sao_offset) to add with unsigned-8-bit saturation.
 *
 * @param[in,out] pu1_src           Pointer to the block to be filtered (modified in place)
 * @param[in]     src_strd          Source stride in bytes
 * @param[out]    pu1_src_left      Updated with the rightmost column of the block (for the next CTB)
 * @param[in,out] pu1_src_top       Top-row context; updated with this block's last processed row
 * @param[out]    pu1_src_top_left  Updated with the top-right context pixel
 * @param[in]     pu1_src_top_right Unused for class 1
 * @param[in]     pu1_src_bot_left  Unused for class 1
 * @param[in]     pu1_avail         Neighbour-availability flags; [2] = top row, [3] = bottom row
 * @param[in]     pi1_sao_offset    Table of SAO offsets indexed by edge index
 * @param[in]     wd                Block width in pixels
 * @param[in]     ht                Block height in pixels
 *
 * @returns None
 *
 *******************************************************************************
 */
void ihevc_sao_edge_offset_class1_ssse3(UWORD8 *pu1_src,
                                        WORD32 src_strd,
                                        UWORD8 *pu1_src_left,
                                        UWORD8 *pu1_src_top,
                                        UWORD8 *pu1_src_top_left,
                                        UWORD8 *pu1_src_top_right,
                                        UWORD8 *pu1_src_bot_left,
                                        UWORD8 *pu1_avail,
                                        WORD8 *pi1_sao_offset,
                                        WORD32 wd,
                                        WORD32 ht)
{
    WORD32 row, col;
    UWORD8 *pu1_src_top_cpy;
    UWORD8 *pu1_src_cpy;
    WORD32 wd_rem;


    __m128i src_top_16x8b, src_bottom_16x8b;
    __m128i src_temp0_16x8b, src_temp1_16x8b;
    __m128i signup0_16x8b, signdwn1_16x8b;
    __m128i cmp_gt0_16x8b, cmp_lt0_16x8b;
    __m128i edge0_16x8b, edge1_16x8b;
    __m128i edge_idx_8x16b, sao_offset_8x16b;
    __m128i const2_16x8b, const0_16x8b;

    UNUSED(pu1_src_top_right);
    UNUSED(pu1_src_bot_left);


    /* Updating left and top-left: save the rightmost column and the
     * top-right context pixel before the block is overwritten */
    for(row = 0; row < ht; row++)
    {
        pu1_src_left[row] = pu1_src[row * src_strd + (wd - 1)];
    }
    *pu1_src_top_left = pu1_src_top[wd - 1];



    pu1_src_top_cpy = pu1_src_top;
    edge_idx_8x16b   = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
    sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset);

    /* Update height and source pointers based on the availability flags:
     * if the top row is unavailable the first row keeps its own value as
     * "top" context and filtering starts one row lower; if the bottom row
     * is unavailable the last row is left unfiltered */
    if(0 == pu1_avail[2])
    {
        pu1_src_top_cpy = pu1_src;
        pu1_src += src_strd;
        ht--;
    }
    if(0 == pu1_avail[3])
    {
        ht--;
    }

    const2_16x8b = _mm_set1_epi8(2);
    const0_16x8b = _mm_setzero_si128();

    {
        WORD32 ht_rem;
        /* Main path: process the block 16 columns at a time, two rows per
         * iteration.  sign(up) is carried between iterations so each
         * row-difference is computed only once. */
        for(col = wd; col >= 16; col -= 16)
        {
            pu1_src_cpy = pu1_src;
            src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
            //row = 0
            src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
            //separating +ve and -ve values.
            cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
            cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
            //creating mask 00 for +ve and -ve values and FF for zero.
            cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
            cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
            //combining the appropriate sign change: sign(row0 - top) in {-1,0,+1} per byte
            signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);

            for(row = ht; row >= 2; row -= 2)
            {

                //row = 1 load 16 pixel values from 15:0 pos. relative to cur. pos.
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
                // row = 2
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));


                //row 0 -row1
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_temp1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_temp0_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
                //row1-row0 (negated, reused as row1's sign-up)
                edge1_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);

                //row1 -bottom
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);

                //combining sign-up and sign-down for each of the two rows
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, signdwn1_16x8b);

                //for the next iteration signup0_16x8b = -signdwn1_16x8b
                signup0_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);
                //adding constant 2 to bias edge sum from [-2,2] into table range [0,4]
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
                //shuffle to get sao index
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
                //shuffle to get sao offset
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
                //copying the next top
                src_top_16x8b = src_temp1_16x8b;
                //convert to 16 bit (sign-extending the offsets), add, then saturated pack
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);

                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
                src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
                src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);

                //row = 0 store 16 pixel values from 15:0 pos. relative to cur. pos.
                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
                // row = 1
                _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);

                src_temp0_16x8b = src_bottom_16x8b;
                pu1_src_cpy += (src_strd << 1);
            }
            ht_rem = ht & 0x1;

            /* One leftover row (odd ht) */
            if(ht_rem)
            {
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
                //current row -next row
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
                //adding top and bottom and constant 2
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);

                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
                //copying the next top
                src_top_16x8b = src_temp0_16x8b;
                //convert to 16 bit then add and then saturated pack
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);

                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
            }
            /* When the bottom row was skipped, the unfiltered row below is
             * the correct "top" context for the CTB row underneath */
            if(0 == pu1_avail[3])
            {
                src_top_16x8b = src_bottom_16x8b;
            }
            //updating top flag
            _mm_storeu_si128((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
            pu1_src += 16;
        }

        /* Residual path: remaining 8 columns, four rows per iteration.
         * Two 8-pixel rows are packed into one 128-bit register, so the
         * sign-up history is kept in the upper half and rotated through
         * with palignr. */
        wd_rem = wd & 0xF;
        if(wd_rem)
        {
            pu1_src_cpy = pu1_src;
            src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_top_cpy + wd - col));
            //row = 0
            src_temp0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
            //separating +ve and -ve values.
            cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
            cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
            //creating mask 00 for +ve and -ve values and FF for zero.
            cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
            cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
            //combining the appropriate sign change
            signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
            //park sign(0-top) in the upper 8 bytes for the alignr rotation below
            signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
            for(row = ht; row >= 4; row -= 4)
            {
                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
                src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
                // row = 2
                src_bottom_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));

                //row 0 -row1
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_temp1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_temp0_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);

                //row1-row0
                edge1_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);
                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
                //row1 -row2
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
                edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
                //packing row 0 n row 1
                src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, src_temp1_16x8b);
                //row = 3
                src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd));
                // row = 4
                src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 4 * src_strd));

                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
                signdwn1_16x8b = _mm_slli_si128(signdwn1_16x8b, 8); //align left (1-2)
                //separating +ve and -ve values.(2,3)
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_top_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_bottom_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-3)

                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signdwn1_16x8b, 8); //(2-3) ,(1-2) (subtract with down)
                edge1_16x8b = _mm_slli_si128(edge1_16x8b, 8);
                //separating +ve and -ve values.(3,4)
                cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_top_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-4)
                //combining sign-up and sign-down
                edge1_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge1_16x8b, 8); //(3-4),(2-3)

                edge1_16x8b = _mm_sub_epi8(edge1_16x8b, signup0_16x8b); //(3,2)

                //packing row 2 n row 3
                src_bottom_16x8b = _mm_unpacklo_epi64(src_bottom_16x8b, src_top_16x8b);
                //for the next iteration signup0_16x8b = -signdwn1_16x8b
                signup0_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b); //(4-3)

                //adding constant 2
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
                //shuffle to get sao index
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
                //shuffle to get sao offset
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
                //the next top already in  src_top_16x8b
                //src_top_16x8b = src_temp1_16x8b;
                //convert to 16 bit then add and then saturated pack
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);

                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_bottom_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
                src_bottom_16x8b = _mm_unpackhi_epi8(src_bottom_16x8b, const0_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
                src_bottom_16x8b = _mm_add_epi16(src_bottom_16x8b, cmp_lt0_16x8b);
                src_bottom_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_bottom_16x8b);

                //separating row 1 and row 3 from the packed results
                cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
                cmp_lt0_16x8b = _mm_srli_si128(src_bottom_16x8b, 8);
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
                // row = 1
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
                //row = 2
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_bottom_16x8b);
                // row = 3
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b);

                src_temp0_16x8b = src_temp1_16x8b;
                signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
                pu1_src_cpy += (src_strd << 2);

            }
            /* Two leftover rows */
            ht_rem = ht & 0x2;
            if(ht_rem)
            {

                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
                src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
                // row = 2
                src_bottom_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));

                //row 0 -row1
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_temp1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_temp0_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
                //row1-row0
                edge1_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);
                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
                //row1 -row2
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
                edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
                //adding top and down subtraction
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
                //for the next iteration signup0_16x8b = -signdwn1_16x8b
                signup0_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b); //(2-1) for next
                src_top_16x8b = src_temp1_16x8b;
                //adding constant 2
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);

                //shuffle to get sao index
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);

                //shuffle to get sao offset
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);

                //the next top already in  src_top_16x8b
                //convert to 16 bit then add and then saturated pack
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp1_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, src_temp1_16x8b);

                cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);

                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
                // row = 1
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
                src_temp0_16x8b = src_bottom_16x8b;
                pu1_src_cpy += (src_strd << 1);

            }
            /* One leftover row */
            ht_rem = ht & 0x1;
            if(ht_rem)
            {

                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
                src_bottom_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));

                //row 0 -row1
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
                //adding top and down subtraction
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
                //adding constant 2
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
                //clear the unused upper 8 bytes before the table lookup
                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8);
                edge0_16x8b = _mm_srli_si128(edge0_16x8b, 8);
                //shuffle to get sao index
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
                //shuffle to get sao offset
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
                src_top_16x8b = src_temp0_16x8b;
                //convert to 16 bit then add and then saturated pack
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, const0_16x8b);
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
                pu1_src_cpy += (src_strd);

            }
            /* When the bottom row was skipped, the unfiltered row below is
             * the correct "top" context for the CTB row underneath */
            if(0 == pu1_avail[3])
            {
                src_top_16x8b = src_bottom_16x8b;
            }
            //updating top flag
            _mm_storel_epi64((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
            pu1_src += 8;
        }
    }
}
2016
2017
/**
*******************************************************************************
*
* @brief
*  Applies SAO edge offset, class 1 (vertical / 90 degree), on an interleaved
*  UV chroma block. For every pixel the sign of the difference against the
*  pixel directly above and directly below selects an edge category, which is
*  mapped through gi1_table_edge_idx to an SAO offset that is added to the
*  pixel with unsigned-8-bit saturation. U and V samples are interleaved, so
*  separate offset tables are packed into one register and selected per byte.
*
* @param[in,out] pu1_src
*  Pointer to the top-left of the current (interleaved UV) source block
* @param[in] src_strd
*  Source stride in bytes
* @param[out] pu1_src_left
*  Buffer updated with the rightmost UV column of the block (2 bytes per row)
* @param[in,out] pu1_src_top
*  Row buffer holding the pixels above the block; updated for the CTB below
* @param[out] pu1_src_top_left
*  Updated with the top-left UV pair (last 2 bytes of pu1_src_top)
* @param[in] pu1_src_top_right
*  Unused in class 1 (vertical filtering needs no diagonal neighbours)
* @param[in] pu1_src_bot_left
*  Unused in class 1
* @param[in] pu1_avail
*  Availability flags; [2] = top row available, [3] = bottom row available
* @param[in] pi1_sao_offset_u
*  5 SAO offsets for the U component
* @param[in] pi1_sao_offset_v
*  5 SAO offsets for the V component
* @param[in] wd
*  Width of the block in bytes (interleaved UV, so 2x chroma width)
* @param[in] ht
*  Height of the block in rows
*
* @returns None
*
*******************************************************************************
*/
void ihevc_sao_edge_offset_class1_chroma_ssse3(UWORD8 *pu1_src,
                                               WORD32 src_strd,
                                               UWORD8 *pu1_src_left,
                                               UWORD8 *pu1_src_top,
                                               UWORD8 *pu1_src_top_left,
                                               UWORD8 *pu1_src_top_right,
                                               UWORD8 *pu1_src_bot_left,
                                               UWORD8 *pu1_avail,
                                               WORD8 *pi1_sao_offset_u,
                                               WORD8 *pi1_sao_offset_v,
                                               WORD32 wd,
                                               WORD32 ht)
{
    WORD32 row, col;
    UWORD8 *pu1_src_top_cpy;
    UWORD8 *pu1_src_cpy;
    WORD32 wd_rem;


    __m128i src_top_16x8b, src_bottom_16x8b;
    __m128i src_temp0_16x8b, src_temp1_16x8b;
    __m128i signup0_16x8b, signdwn1_16x8b;
    __m128i cmp_gt0_16x8b, cmp_lt0_16x8b;
    __m128i edge0_16x8b, edge1_16x8b;
    __m128i edge_idx_8x16b, sao_offset_8x16b;
    __m128i const2_16x8b, const0_16x8b;
    __m128i chroma_offset_8x16b;

    UNUSED(pu1_src_top_right);
    UNUSED(pu1_src_bot_left);

    /* Updating left and top and top-left */
    /* Save the rightmost UV column of the unfiltered block for the neighbouring CTB */
    for(row = 0; row < ht; row++)
    {
        pu1_src_left[2 * row] = pu1_src[row * src_strd + (wd - 2)];
        pu1_src_left[2 * row + 1] = pu1_src[row * src_strd + (wd - 1)];
    }
    pu1_src_top_left[0] = pu1_src_top[wd - 2];
    pu1_src_top_left[1] = pu1_src_top[wd - 1];



    pu1_src_top_cpy = pu1_src_top;
    edge_idx_8x16b   = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
    /* const0_16x8b temporarily carries the V offsets so both tables can be
     * packed into one register: U offsets in the low 8 bytes, V in the high 8 */
    sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_u);
    const0_16x8b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_v);
    /* per-16-bit lane offset 0x0800: adds 8 to every odd (V) byte index so V
     * samples select from the high half of sao_offset_8x16b after the shuffle */
    chroma_offset_8x16b = _mm_set1_epi16(0x0800);
    /* Update height and source pointers based on the availability flags */
    if(0 == pu1_avail[2])
    {
        /* Top neighbours unavailable: use row 0 as its own top and skip filtering it */
        pu1_src_top_cpy = pu1_src;
        pu1_src += src_strd;
        ht--;
    }
    if(0 == pu1_avail[3])
    {
        /* Bottom neighbours unavailable: last row is not filtered */
        ht--;
    }
    /* combined table: bytes 0-7 = U offsets, bytes 8-15 = V offsets */
    sao_offset_8x16b = _mm_unpacklo_epi64(sao_offset_8x16b, const0_16x8b);
    const2_16x8b = _mm_set1_epi8(2);
    const0_16x8b = _mm_setzero_si128();


    {
        WORD32 ht_rem;


        /* Main path: process the block 16 bytes (8 UV pairs) wide at a time */
        for(col = wd; col >= 16; col -= 16)
        {
            pu1_src_cpy = pu1_src;
            src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
            //row = 0
            src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
            //separating +ve and and -ve values.
            cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
            cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
            //creating mask 00 for +ve and -ve values and FF for zero.
            cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
            cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
            //combining the appropriate sign change
            /* signup0 holds SIGN(cur - top) per byte: -1, 0 or +1 */
            signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);

            /* Two rows per iteration; the sign against the lower row of one pair
             * is negated and reused as the "up" sign of the next pair */
            for(row = ht; row >= 2; row -= 2)
            {

                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
                // row = 2
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));


                //row 0 -row1
                //separating +ve and and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_temp1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_temp0_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
                //row1-row0
                edge1_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);

                //row1 -bottom
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);

                //combining sign-left and sign_right
                /* edge category = sign(cur - up) + sign(cur - down) per row */
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, signdwn1_16x8b);

                //for the next iteration signup0_16x8b = -signdwn1_16x8b
                signup0_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);
                //adding constant 2
                /* bias by 2 so categories map to table indices 0..4 */
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
                //copying the next top
                src_top_16x8b = src_temp1_16x8b;


                //shuffle to get sao index
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
                //adding chroma offset to access U and V
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);

                //shuffle to get sao offset
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
                //cnvert to 16 bit then add and then saturated pack
                /* sign-extend the 8-bit offsets to 16 bits, add, and pack with
                 * unsigned saturation back to 8 bits (clips to [0,255]) */
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);

                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
                src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
                edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge1_16x8b);
                src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
                // row = 1
                _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);

                src_temp0_16x8b = src_bottom_16x8b;
                pu1_src_cpy += (src_strd << 1);
            }
            ht_rem = ht & 0x1;

            /* Odd leftover row: filter it using the carried signup0 */
            if(ht_rem)
            {
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
                //current row -next row
                //separating +ve and and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
                //adding top and botton and constant 2
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
                //copying the next top
                src_top_16x8b = src_temp0_16x8b;

                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
                //adding chroma offset to access U and V
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);

                //cnvert to 16 bit then add and then saturated pack
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);

                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
            }
            if(0 == pu1_avail[3])
            {
                /* last row was skipped; the unfiltered bottom row becomes the new top */
                src_top_16x8b = src_bottom_16x8b;
            }
            //updating top flag
            _mm_storeu_si128((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
            pu1_src += 16;
        }

        wd_rem = wd & 0xF;
        /* Tail path: remaining 8-byte (4 UV pairs) column; processes 4 rows per
         * iteration by pairing two 8-byte rows in each 128-bit register */
        if(wd_rem)
        {
            pu1_src_cpy = pu1_src;
            src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_top_cpy + wd - col));
            //row = 0
            src_temp0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
            //separating +ve and and -ve values.
            cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
            cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
            //creating mask 00 for +ve and -ve values and FF for zero.
            cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
            cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
            //combining the appropriate sign change
            signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
            /* keep the "up" sign in the high 8 bytes; low 8 filled via alignr below */
            signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
            for(row = ht; row >= 4; row -= 4)
            {
                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
                src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
                // row = 2
                src_bottom_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));

                //row 0 -row1
                //separating +ve and and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_temp1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_temp0_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);

                //row1-row0
                edge1_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);
                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
                //row1 -row2
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
                edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
                //packing row 0 n row 1
                src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, src_temp1_16x8b);
                //row = 3
                src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd));
                // row = 4
                src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 4 * src_strd));

                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
                signdwn1_16x8b = _mm_slli_si128(signdwn1_16x8b, 8); //allign left (1-2)
                //separating +ve and and -ve values.(2,3)
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_top_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_bottom_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-3)

                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signdwn1_16x8b, 8); //(2-3) ,(1-2) (substract with down)
                edge1_16x8b = _mm_slli_si128(edge1_16x8b, 8);
                //separating +ve and and -ve values.(3,4)
                cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_top_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-4)
                //combining sign-left and sign_right
                edge1_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge1_16x8b, 8); //(3-4),(2-3)

                /* subtract: signup for rows 3,2 is held negated, so (sign_down - (-sign_up)) */
                edge1_16x8b = _mm_sub_epi8(edge1_16x8b, signup0_16x8b); //(3,2)

                //packing row 2 n row 3
                src_bottom_16x8b = _mm_unpacklo_epi64(src_bottom_16x8b, src_top_16x8b);
                //for the next iteration signup0_16x8b = -signdwn1_16x8b
                signup0_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b); //(4-3)
                //adding constant 2
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
                //shuffle to get sao index
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
                //adding chroma offset to access U and V
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);

                //shuffle to get sao offset
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
                //the next top already in  src_top_16x8b
                //cnvert to 16 bit then add and then saturated pack
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);

                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_bottom_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
                src_bottom_16x8b = _mm_unpackhi_epi8(src_bottom_16x8b, const0_16x8b);
                edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                src_bottom_16x8b = _mm_add_epi16(src_bottom_16x8b, edge1_16x8b);
                src_bottom_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_bottom_16x8b);

                /* split the two packed rows back into 8-byte halves for storing */
                cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
                cmp_lt0_16x8b = _mm_srli_si128(src_bottom_16x8b, 8);
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
                // row = 1
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
                //row = 2
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_bottom_16x8b);
                // row = 3
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b);

                src_temp0_16x8b = src_temp1_16x8b;
                signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
                pu1_src_cpy += (src_strd << 2);

            }
            ht_rem = ht & 0x2;
            /* Two leftover rows */
            if(ht_rem)
            {

                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
                src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
                // row = 2
                src_bottom_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));

                //row 0 -row1
                //separating +ve and and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_temp1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_temp0_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
                //row1-row0
                edge1_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);
                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
                //row1 -row2
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
                edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
                //adding top and down substraction
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
                //for the next iteration signup0_16x8b = -signdwn1_16x8b
                signup0_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b); //(2-1) for next
                src_top_16x8b = src_temp1_16x8b;

                //adding constant 2
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);

                //shuffle to get sao index
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);

                //adding chroma offset to access U and V
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
                //shuffle to get sao offset
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
                //the next top already in  src_top_16x8b
                //cnvert to 16 bit then add and then saturated pack
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp1_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, src_temp1_16x8b);

                cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);

                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
                // row = 1
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
                src_temp0_16x8b = src_bottom_16x8b;
                pu1_src_cpy += (src_strd << 1);

            }
            ht_rem = ht & 0x1;
            /* Final single leftover row */
            if(ht_rem)
            {

                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
                src_bottom_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));

                //row 0 -row1
                //separating +ve and and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
                //adding top and down substraction
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
                //adding constant 2
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
                src_top_16x8b = src_temp0_16x8b;

                /* zero the unused high 8 bytes so the shuffle indices stay valid */
                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8);
                edge0_16x8b = _mm_srli_si128(edge0_16x8b, 8);
                //shuffle to get sao index
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
                //adding chroma offset to access U and V
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
                //shuffle to get sao offset
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);

                //cnvert to 16 bit then add and then saturated pack
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, const0_16x8b);
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
                pu1_src_cpy += (src_strd);

            }
            if(0 == pu1_avail[3])
            {
                src_top_16x8b = src_bottom_16x8b;
            }
            /* update the top-row buffer for the CTB row below */
            _mm_storel_epi64((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
            pu1_src += 8;
        }
    }
}
2475
2476
/* 135 degree filtering */
2477
void ihevc_sao_edge_offset_class2_ssse3(UWORD8 *pu1_src,
2478
                                        WORD32 src_strd,
2479
                                        UWORD8 *pu1_src_left,
2480
                                        UWORD8 *pu1_src_top,
2481
                                        UWORD8 *pu1_src_top_left,
2482
                                        UWORD8 *pu1_src_top_right,
2483
                                        UWORD8 *pu1_src_bot_left,
2484
                                        UWORD8 *pu1_avail,
2485
                                        WORD8 *pi1_sao_offset,
2486
                                        WORD32 wd,
2487
                                        WORD32 ht)
2488
11.8k
{
2489
11.8k
    WORD32 row, col;
2490
11.8k
    UWORD8 *pu1_src_top_cpy, *pu1_src_left_cpy, *pu1_src_left_cpy2;
2491
11.8k
    UWORD8 *pu1_left_tmp, *pu1_src_left_str, *pu1_src_left_str2;
2492
11.8k
    UWORD8 *pu1_firstleft;
2493
11.8k
    UWORD8 *pu1_src_cpy, *pu1_src_org;
2494
11.8k
    UWORD8 au1_mask[MAX_CTB_SIZE], *au1_mask_cpy;
2495
11.8k
    UWORD8 au1_src_left_tmp[MAX_CTB_SIZE + 8];
2496
11.8k
    UWORD8 au1_src_left_tmp1[MAX_CTB_SIZE + 8];
2497
11.8k
    WORD32 wd_rem;
2498
11.8k
    UWORD8 u1_pos_0_0_tmp, u1_pos_wd_ht_tmp;
2499
11.8k
    WORD32 ht_tmp, ht_0;
2500
2501
11.8k
    WORD32 bit_depth;
2502
11.8k
    UWORD8 u1_avail0, u1_avail1;
2503
2504
11.8k
    __m128i src_top_16x8b, src_bottom_16x8b;
2505
11.8k
    __m128i src_temp0_16x8b, src_temp1_16x8b;
2506
11.8k
    __m128i signup0_16x8b, signdwn1_16x8b;
2507
11.8k
    __m128i cmp_gt0_16x8b, cmp_lt0_16x8b;
2508
11.8k
    __m128i edge0_16x8b, edge1_16x8b;
2509
11.8k
    __m128i au1_mask8x16b;
2510
11.8k
    __m128i edge_idx_8x16b, sao_offset_8x16b;
2511
11.8k
    __m128i const2_16x8b, const0_16x8b;
2512
11.8k
    __m128i left_store_16x8b;
2513
11.8k
    UNUSED(pu1_src_top_right);
2514
11.8k
    UNUSED(pu1_src_bot_left);
2515
2516
11.8k
    ht_0 = ht; ht_tmp = ht;
2517
11.8k
    au1_mask8x16b = _mm_set1_epi8(0xff);
2518
2519
    //setting availability mask to 0xFF for size MAX_CTB_SIZE
2520
59.3k
    for(col = 0; col < MAX_CTB_SIZE; col += 16)
2521
47.4k
        _mm_storeu_si128((__m128i *)(au1_mask + col), au1_mask8x16b);
2522
395k
    for(row = 0; row < ht; row++)
2523
383k
    {
2524
383k
        au1_src_left_tmp[row] = pu1_src_left[row];
2525
383k
    }
2526
11.8k
    bit_depth = BIT_DEPTH_LUMA;
2527
11.8k
    pu1_src_org = pu1_src;
2528
11.8k
    pu1_src_top_cpy = pu1_src_top;
2529
11.8k
    pu1_src_left_cpy2 = au1_src_left_tmp;
2530
11.8k
    pu1_src_left_cpy = au1_src_left_tmp;
2531
11.8k
    pu1_src_left_str2 = au1_src_left_tmp1;
2532
11.8k
    pu1_src_left_str = au1_src_left_tmp1;
2533
11.8k
    edge_idx_8x16b   = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
2534
11.8k
    sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset);
2535
2536
2537
    /* If top-left is available, process separately */
2538
11.8k
    if(0 != pu1_avail[4])
2539
9.29k
    {
2540
9.29k
        WORD8 edge_idx;
2541
2542
9.29k
        edge_idx = 2 + SIGN(pu1_src[0] - pu1_src_top_left[0]) +
2543
9.29k
                        SIGN(pu1_src[0] - pu1_src[1 + src_strd]);
2544
2545
9.29k
        edge_idx = gi1_table_edge_idx[edge_idx];
2546
2547
9.29k
        if(0 != edge_idx)
2548
3.72k
        {
2549
3.72k
            u1_pos_0_0_tmp = CLIP3(pu1_src[0] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
2550
3.72k
        }
2551
5.57k
        else
2552
5.57k
        {
2553
5.57k
            u1_pos_0_0_tmp = pu1_src[0];
2554
5.57k
        }
2555
9.29k
    }
2556
2.56k
    else
2557
2.56k
    {
2558
2.56k
        u1_pos_0_0_tmp = pu1_src[0];
2559
2.56k
    }
2560
2561
    /* If bottom-right is available, process separately */
2562
11.8k
    if(0 != pu1_avail[7])
2563
9.88k
    {
2564
9.88k
        WORD8 edge_idx;
2565
2566
9.88k
        edge_idx = 2 + SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd - 1 - src_strd]) +
2567
9.88k
                        SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 1 + src_strd]);
2568
2569
9.88k
        edge_idx = gi1_table_edge_idx[edge_idx];
2570
2571
9.88k
        if(0 != edge_idx)
2572
4.25k
        {
2573
4.25k
            u1_pos_wd_ht_tmp = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
2574
4.25k
        }
2575
5.63k
        else
2576
5.63k
        {
2577
5.63k
            u1_pos_wd_ht_tmp = pu1_src[wd - 1 + (ht - 1) * src_strd];
2578
5.63k
        }
2579
9.88k
    }
2580
1.97k
    else
2581
1.97k
    {
2582
1.97k
        u1_pos_wd_ht_tmp = pu1_src[wd - 1 + (ht - 1) * src_strd];
2583
1.97k
    }
2584
11.8k
    pu1_firstleft = pu1_src_top_left;
2585
2586
    /* Update height and source pointers based on the availability flags */
2587
11.8k
    if(0 == pu1_avail[2])
2588
1.82k
    {
2589
1.82k
        pu1_firstleft = pu1_src_left_cpy2;
2590
1.82k
        pu1_src_left_cpy2++;
2591
1.82k
        pu1_src_left_str2++;
2592
1.82k
        pu1_src_top_cpy = pu1_src;
2593
1.82k
        pu1_src += src_strd;
2594
1.82k
        ht--;
2595
1.82k
    }
2596
11.8k
    if(0 == pu1_avail[3])
2597
1.38k
    {
2598
1.38k
        ht--;
2599
1.38k
        ht_0--;
2600
1.38k
    }
2601
    //storing top left in a mmx register
2602
11.8k
    left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_firstleft);
2603
11.8k
    const2_16x8b = _mm_set1_epi8(2);
2604
11.8k
    const0_16x8b = _mm_setzero_si128();
2605
11.8k
    left_store_16x8b = _mm_slli_si128(left_store_16x8b, 15);
2606
    //update top -left
2607
11.8k
    *pu1_src_top_left = pu1_src_top[wd - 1];
2608
    //availability mask creation
2609
11.8k
    u1_avail0 = pu1_avail[0];
2610
11.8k
    u1_avail1 = pu1_avail[1];
2611
11.8k
    au1_mask[0] = u1_avail0;
2612
11.8k
    au1_mask[wd - 1] = u1_avail1;
2613
11.8k
    {
2614
11.8k
        WORD32 ht_rem;
2615
2616
2617
11.8k
        pu1_src_left_cpy = pu1_src_left_cpy2;
2618
11.8k
        pu1_src_left_str = pu1_src_left_str2;
2619
11.8k
        au1_mask_cpy = au1_mask;
2620
31.1k
        for(col = wd; col >= 16; col -= 16)
2621
19.2k
        {
2622
19.2k
            pu1_src_cpy = pu1_src;
2623
19.2k
            src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
2624
            //row = 0
2625
19.2k
            src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
2626
19.2k
            src_top_16x8b = _mm_alignr_epi8(src_top_16x8b, left_store_16x8b, 15);
2627
            //loading the mask
2628
19.2k
            au1_mask8x16b = _mm_loadu_si128((__m128i *)au1_mask_cpy);
2629
            //separating +ve and -ve values.
2630
19.2k
            cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
2631
19.2k
            cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
2632
            //creating mask 00 for +ve and -ve values and FF for zero.
2633
19.2k
            cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2634
19.2k
            cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2635
            //combining the appropriate sign change
2636
19.2k
            signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
2637
2638
2639
325k
            for(row = ht; row >= 2; row -= 2)
2640
306k
            {
2641
306k
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
2642
                //row = 1
2643
306k
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
2644
                // row = 1 right
2645
306k
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 1));
2646
                //to insert left in row 0
2647
306k
                signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 15);
2648
                //row 0 -row1
2649
                //separating +ve and -ve values.
2650
306k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
2651
306k
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
2652
2653
                //creating mask 00 for +ve and -ve values and FF for zero.
2654
306k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2655
306k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2656
                //manipulation for row 1 - row 0
2657
306k
                signdwn1_16x8b = _mm_alignr_epi8(src_temp0_16x8b, signdwn1_16x8b, 15);
2658
                //combining the appropriate sign change
2659
306k
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(0-1)
2660
                //row1-row0
2661
                //separating +ve and -ve values.
2662
306k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
2663
306k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
2664
                //creating mask 00 for +ve and -ve values and FF for zero.
2665
306k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2666
306k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2667
                // row = 2 right
2668
306k
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd + 1));
2669
306k
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-0)
2670
2671
2672
                //row1 -bottom
2673
306k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
2674
306k
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
2675
                //creating mask 00 for +ve and -ve values and FF for zero.
2676
306k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2677
306k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2678
                //combining the appropriate sign change
2679
306k
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
2680
                // row = 2
2681
306k
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
2682
2683
                //combining sign-left and sign_right
2684
306k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
2685
2686
                //storing the row 1 left for next row.
2687
306k
                signup0_16x8b = _mm_slli_si128(left_store_16x8b, 14);
2688
2689
                //combining sign-left and sign_right
2690
306k
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, signdwn1_16x8b);
2691
                //manipulation for bottom - row 1
2692
306k
                signup0_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signup0_16x8b, 15);
2693
                //eliminating old left for row 0 and row 1
2694
306k
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
2695
                //bottom - row1
2696
306k
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signup0_16x8b);
2697
306k
                cmp_lt0_16x8b = _mm_subs_epu8(signup0_16x8b, src_bottom_16x8b);
2698
                //creating mask 00 for +ve and -ve values and FF for zero.
2699
306k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2700
306k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2701
                //for the next iteration bottom -row1
2702
306k
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
2703
                //row1  getting it right for left of next block
2704
306k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp1_16x8b, 15);
2705
                //adding constant 2
2706
306k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
2707
306k
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
2708
                //shuffle to get sao index
2709
306k
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
2710
306k
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
2711
                //using availability mask
2712
306k
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
2713
306k
                edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
2714
                //shuffle to get sao offset
2715
306k
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
2716
306k
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
2717
                //row0  getting it right for left of next block
2718
306k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
2719
                //copying the next top
2720
306k
                src_top_16x8b = src_temp1_16x8b;
2721
                //convert to 16 bit then add and then saturated pack
2722
306k
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
2723
306k
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
2724
306k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
2725
306k
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
2726
306k
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
2727
306k
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
2728
306k
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
2729
306k
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
2730
2731
306k
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
2732
306k
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
2733
306k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
2734
306k
                src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
2735
306k
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
2736
306k
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
2737
306k
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
2738
306k
                src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
2739
2740
                //store left boundary
2741
306k
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
2742
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
2743
306k
                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
2744
                // row = 1
2745
306k
                _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);
2746
2747
306k
                src_temp0_16x8b = src_bottom_16x8b;
2748
306k
                pu1_src_cpy += (src_strd << 1);
2749
306k
                pu1_src_left_cpy += 2;
2750
306k
                pu1_src_left_str += 2;
2751
306k
            }
2752
19.2k
            ht_rem = ht & 0x1;
2753
2754
19.2k
            if(ht_rem)
2755
5.03k
            {
2756
5.03k
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
2757
5.03k
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 1));
2758
                //current row -next row
2759
                //separating +ve and and -ve values.
2760
5.03k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
2761
5.03k
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
2762
                //creating mask 00 for +ve and -ve values and FF for zero.
2763
5.03k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2764
5.03k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2765
                //combining the appropriate sign change
2766
5.03k
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
2767
                //adding top and bottom and constant 2
2768
5.03k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
2769
5.03k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
2770
                //eliminating old left for row 0 and row 1
2771
5.03k
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 1);
2772
2773
5.03k
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
2774
                //using availability mask
2775
5.03k
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
2776
2777
5.03k
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
2778
2779
                //row0  getting it right for left of next block
2780
5.03k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
2781
                //copying the next top
2782
5.03k
                src_top_16x8b = src_temp0_16x8b;
2783
                //convert to 16 bit then add and then saturated pack
2784
5.03k
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
2785
5.03k
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
2786
5.03k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
2787
5.03k
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
2788
5.03k
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
2789
5.03k
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
2790
5.03k
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
2791
5.03k
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
2792
                //store left boundary
2793
5.03k
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
2794
2795
5.03k
                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
2796
5.03k
                pu1_src_cpy += (src_strd);
2797
5.03k
                pu1_src_left_cpy += 1;
2798
5.03k
                pu1_src_left_str += 1;
2799
5.03k
            }
2800
19.2k
            if(0 == pu1_avail[3])
2801
2.22k
            {
2802
2.22k
                src_top_16x8b = src_bottom_16x8b;
2803
2.22k
                pu1_src_left_str[0] = pu1_src_cpy[15];
2804
2.22k
            }
2805
19.2k
            if(0 == pu1_avail[2])
2806
3.08k
            {
2807
3.08k
                pu1_src_left_str[-ht_0] = pu1_src[15 - src_strd];
2808
3.08k
            }
2809
2810
            //for the top left of next part of the block
2811
19.2k
            left_store_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
2812
            //updating top flag
2813
19.2k
            _mm_storeu_si128((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
2814
19.2k
            pu1_src += 16;
2815
19.2k
            au1_mask_cpy += 16;
2816
2817
2818
19.2k
            pu1_left_tmp = pu1_src_left_cpy2;
2819
19.2k
            pu1_src_left_cpy2 = pu1_src_left_str2;
2820
19.2k
            pu1_src_left_str2 = pu1_left_tmp;
2821
2822
19.2k
            pu1_src_left_cpy = pu1_src_left_cpy2;
2823
19.2k
            pu1_src_left_str = pu1_src_left_str2;
2824
19.2k
        }
2825
2826
11.8k
        wd_rem = wd & 0xF;
2827
11.8k
        if(wd_rem)
2828
11.2k
        {
2829
11.2k
            pu1_src_left_cpy = pu1_src_left_cpy2;
2830
11.2k
            pu1_src_left_str = pu1_src_left_str2;
2831
11.2k
            pu1_src_cpy = pu1_src;
2832
11.2k
            src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_top_cpy + wd - col));
2833
            //row = 0
2834
11.2k
            src_temp0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
2835
11.2k
            src_top_16x8b = _mm_alignr_epi8(src_top_16x8b, left_store_16x8b, 15);
2836
11.2k
            au1_mask8x16b = _mm_loadl_epi64((__m128i *)au1_mask_cpy); //????
2837
            //separating +ve and -ve values.
2838
11.2k
            cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
2839
11.2k
            cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
2840
            //creating mask 00 for +ve and -ve values and FF for zero.
2841
11.2k
            cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2842
11.2k
            cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2843
            //preparing au1_mask
2844
11.2k
            au1_mask8x16b = _mm_unpacklo_epi64(au1_mask8x16b, au1_mask8x16b);
2845
            //combining the appropriate sign change
2846
11.2k
            signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
2847
11.2k
            signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
2848
2849
99.3k
            for(row = ht; row >= 4; row -= 4)
2850
88.1k
            {
2851
88.1k
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
2852
                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
2853
88.1k
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
2854
                // row = 2
2855
88.1k
                src_bottom_16x8b =  _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
2856
                //right row1
2857
88.1k
                signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 1);
2858
                //row 0 -row1
2859
                //separating +ve and -ve values.
2860
88.1k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
2861
88.1k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
2862
                //manipulation for row 1 -row 0
2863
88.1k
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 15);
2864
                //creating mask 00 for +ve and -ve values and FF for zero.
2865
88.1k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2866
88.1k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2867
                //row 0 left
2868
88.1k
                signdwn1_16x8b = _mm_alignr_epi8(src_temp0_16x8b, signdwn1_16x8b, 15);
2869
                //combining the appropriate sign change
2870
88.1k
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
2871
                //row 1 -row0
2872
                //separating +ve and -ve values.
2873
88.1k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
2874
88.1k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
2875
2876
                //creating mask 00 for +ve and -ve values and FF for zero.
2877
88.1k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2878
88.1k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2879
                //row1-row0
2880
88.1k
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
2881
2882
88.1k
                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
2883
2884
88.1k
                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
2885
                //right row2
2886
88.1k
                signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 1);
2887
                //packing row 0 n row 1
2888
88.1k
                src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, src_temp1_16x8b);
2889
                //row1 -row2
2890
88.1k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
2891
88.1k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
2892
                //creating mask 00 for +ve and -ve values and FF for zero.
2893
88.1k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2894
88.1k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2895
                //combining the appropriate sign change
2896
88.1k
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
2897
88.1k
                edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
2898
                //manipulation for row 2 -row 1
2899
88.1k
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 14);
2900
                //row 1 left
2901
88.1k
                signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 15);
2902
                //row = 3
2903
88.1k
                src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 3 * src_strd));
2904
2905
                // row = 4
2906
88.1k
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 4 * src_strd));
2907
2908
88.1k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
2909
2910
                //separating +ve and -ve values.(2,1)
2911
88.1k
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
2912
88.1k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
2913
                //manipulation for row 3 -row 2
2914
88.1k
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 13);
2915
                //creating mask 00 for +ve and -ve values and FF for zero.
2916
88.1k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2917
88.1k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2918
                //row 2 left
2919
88.1k
                signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 15);
2920
                //combining the appropriate sign change
2921
88.1k
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1)
2922
2923
                //separating +ve and -ve values.(3,2)
2924
88.1k
                cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
2925
88.1k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
2926
88.1k
                signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8); //aligned left (2-1)
2927
                //creating mask 00 for +ve and -ve values and FF for zero.
2928
88.1k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2929
88.1k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2930
                //right row3
2931
88.1k
                signdwn1_16x8b = _mm_srli_si128(src_top_16x8b, 1);
2932
                //combining the appropriate sign change
2933
88.1k
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-2)
2934
2935
88.1k
                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(3-2) ,(2-1)
2936
2937
                //separating +ve and -ve values.(2,3)
2938
88.1k
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
2939
88.1k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
2940
                //right row 4
2941
88.1k
                signdwn1_16x8b =  _mm_srli_si128(src_temp1_16x8b, 1);
2942
                //creating mask 00 for +ve and -ve values and FF for zero.
2943
88.1k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2944
88.1k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2945
                //combining the appropriate sign change
2946
88.1k
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-3)
2947
2948
                //separating +ve and -ve values.(3,bottom)
2949
88.1k
                cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
2950
88.1k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
2951
2952
                //creating mask 00 for +ve and -ve values and FF for zero.
2953
88.1k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2954
88.1k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2955
88.1k
                edge1_16x8b = _mm_slli_si128(edge1_16x8b, 8); //aligned left (2-3)
2956
                //combining the appropriate sign change
2957
88.1k
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-bottom)
2958
88.1k
                edge1_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge1_16x8b, 8); //(3-bottom),(2-3)
2959
2960
                //manipulation for bottom -row 3
2961
88.1k
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 12);
2962
                //eliminating old left for row 0,1,2,3
2963
88.1k
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 4);
2964
                //packing row 2 n row 3
2965
88.1k
                src_bottom_16x8b = _mm_unpacklo_epi64(src_bottom_16x8b, src_top_16x8b);
2966
                //row 3 left
2967
88.1k
                signdwn1_16x8b = _mm_alignr_epi8(src_top_16x8b, signdwn1_16x8b, 15);
2968
                //loading row 3 right into left
2969
88.1k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_bottom_16x8b, 15);
2970
                //adding bottom and top values of row 2 and row 3
2971
88.1k
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, signup0_16x8b); //(3,2)
2972
                //separating +ve and -ve values.(bottom,3)
2973
88.1k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
2974
88.1k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
2975
                //to store right of row 2
2976
88.1k
                signdwn1_16x8b = _mm_slli_si128(src_bottom_16x8b, 8);
2977
                //creating mask 00 for +ve and -ve values and FF for zero.
2978
88.1k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2979
88.1k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2980
88.1k
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(bottom -3) for next iteration
2981
2982
                //storing right of row 2 into left
2983
88.1k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
2984
                //to store right of row 0
2985
88.1k
                signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
2986
                //storing right of row 1 into left
2987
88.1k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
2988
2989
                //adding constant 2
2990
88.1k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
2991
88.1k
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
2992
                //shuffle to get sao index
2993
88.1k
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
2994
88.1k
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
2995
                //using availability mask
2996
88.1k
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
2997
88.1k
                edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
2998
                //shuffle to get sao offset
2999
88.1k
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
3000
88.1k
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
3001
3002
                //storing right of row 0 into left
3003
88.1k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
3004
                //convert to 16 bit then add and then saturated pack
3005
88.1k
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
3006
88.1k
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
3007
88.1k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
3008
88.1k
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
3009
88.1k
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
3010
88.1k
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
3011
88.1k
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
3012
88.1k
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
3013
3014
88.1k
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
3015
88.1k
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_bottom_16x8b, const0_16x8b);
3016
88.1k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
3017
88.1k
                src_bottom_16x8b = _mm_unpackhi_epi8(src_bottom_16x8b, const0_16x8b);
3018
88.1k
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
3019
88.1k
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
3020
88.1k
                src_bottom_16x8b = _mm_add_epi16(src_bottom_16x8b, cmp_lt0_16x8b);
3021
88.1k
                src_bottom_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_bottom_16x8b);
3022
3023
88.1k
                cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
3024
88.1k
                cmp_lt0_16x8b = _mm_srli_si128(src_bottom_16x8b, 8);
3025
3026
88.1k
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
3027
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
3028
88.1k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
3029
                // row = 1
3030
88.1k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
3031
                //row = 2
3032
88.1k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_bottom_16x8b);
3033
                // row = 3
3034
88.1k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b);
3035
3036
88.1k
                src_temp0_16x8b = src_temp1_16x8b;
3037
88.1k
                signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
3038
88.1k
                pu1_src_cpy += (src_strd << 2);
3039
88.1k
                pu1_src_left_cpy += 4;
3040
88.1k
                pu1_src_left_str += 4;
3041
88.1k
            }
3042
11.2k
            ht_rem = ht & 0x2;
3043
11.2k
            if(ht_rem)
3044
2.91k
            {
3045
2.91k
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
3046
                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
3047
2.91k
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
3048
                // row = 2
3049
2.91k
                src_bottom_16x8b =  _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
3050
3051
                //row 0 -row 1
3052
2.91k
                signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 1);
3053
                //separating +ve and -ve values.
3054
2.91k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
3055
2.91k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
3056
                //manipulation for row 1 -row 0
3057
2.91k
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 15);
3058
                //creating mask 00 for +ve and -ve values and FF for zero.
3059
2.91k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3060
2.91k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3061
                //manipulation for row 1 - row 0
3062
2.91k
                signdwn1_16x8b = _mm_alignr_epi8(src_temp0_16x8b, signdwn1_16x8b, 15);
3063
                //combining the appropriate sign change
3064
2.91k
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
3065
3066
                //row1-row0
3067
                //separating +ve and -ve values.
3068
2.91k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
3069
2.91k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
3070
3071
                //creating mask 00 for +ve and -ve values and FF for zero.
3072
2.91k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3073
2.91k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3074
                //combining the appropriate sign change
3075
2.91k
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
3076
                //row 1 -bottom
3077
2.91k
                signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 1);
3078
3079
2.91k
                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
3080
2.91k
                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
3081
                //row1 -bottom
3082
2.91k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
3083
2.91k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
3084
3085
                //creating mask 00 for +ve and -ve values and FF for zero.
3086
2.91k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3087
2.91k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3088
                //combining the appropriate sign change
3089
2.91k
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
3090
2.91k
                edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
3091
                //manipulation for bottom -row1
3092
2.91k
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 14);
3093
                //manipulation for bottom- row 1
3094
2.91k
                signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 15);
3095
                //adding top and down subtraction
3096
2.91k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
3097
                //bottom - row 1
3098
2.91k
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
3099
2.91k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
3100
3101
                //eliminating old left for row 0,1
3102
2.91k
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
3103
2.91k
                signdwn1_16x8b = _mm_slli_si128(src_temp1_16x8b, 8);
3104
                //creating mask 00 for +ve and -ve values and FF for zero.
3105
2.91k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3106
2.91k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3107
                //for the next iteration signup0_16x8b
3108
2.91k
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1) for next
3109
3110
                //storing right of row 1 into left
3111
2.91k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
3112
                //for storing right of row 1
3113
2.91k
                signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
3114
3115
2.91k
                src_top_16x8b = src_temp1_16x8b;
3116
                //storing right of row 0 into left
3117
2.91k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
3118
3119
                //adding constant 2
3120
2.91k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
3121
3122
                //shuffle to get sao index
3123
2.91k
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
3124
                //using availability mask
3125
2.91k
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
3126
                //shuffle to get sao offset
3127
2.91k
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
3128
3129
                //the next top already in  src_top_16x8b
3130
                //convert to 16 bit then add and then saturated pack
3131
2.91k
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
3132
2.91k
                src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
3133
2.91k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
3134
2.91k
                src_temp1_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
3135
2.91k
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
3136
2.91k
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
3137
2.91k
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
3138
2.91k
                src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, src_temp1_16x8b);
3139
3140
2.91k
                cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
3141
3142
2.91k
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
3143
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
3144
2.91k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
3145
                // row = 1
3146
2.91k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
3147
2.91k
                src_temp0_16x8b = src_bottom_16x8b;
3148
2.91k
                pu1_src_cpy += (src_strd << 1);
3149
2.91k
                pu1_src_left_cpy += 2;
3150
2.91k
                pu1_src_left_str += 2;
3151
2.91k
            }
3152
11.2k
            ht_rem = ht & 0x1;
3153
11.2k
            if(ht_rem)
3154
2.81k
            {
3155
2.81k
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
3156
                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
3157
2.81k
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
3158
                //left store manipulation 1
3159
2.81k
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 1);
3160
                //row 0 -row1
3161
2.81k
                signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 1);
3162
                //separating +ve and -ve values.
3163
2.81k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
3164
2.81k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
3165
                //creating mask 00 for +ve and -ve values and FF for zero.
3166
2.81k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3167
2.81k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3168
                //combining the appropriate sign change
3169
2.81k
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
3170
                //adding top and down subtraction
3171
2.81k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
3172
                //for row 0 right to put into left store
3173
2.81k
                signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
3174
                //adding constant 2
3175
2.81k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
3176
2.81k
                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8);
3177
2.81k
                edge0_16x8b = _mm_srli_si128(edge0_16x8b, 8);
3178
                //filling the left boundary value
3179
2.81k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
3180
3181
                //shuffle to get sao index
3182
2.81k
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
3183
                //using availability mask
3184
2.81k
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
3185
                //shuffle to get sao offset
3186
2.81k
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
3187
2.81k
                src_top_16x8b = src_temp0_16x8b;
3188
                //convert to 16 bit then add and then saturated pack
3189
2.81k
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
3190
2.81k
                src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
3191
2.81k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
3192
2.81k
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
3193
2.81k
                src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, const0_16x8b);
3194
3195
2.81k
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
3196
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
3197
2.81k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
3198
2.81k
                pu1_src_cpy += (src_strd);
3199
2.81k
                pu1_src_left_cpy += 1;
3200
2.81k
                pu1_src_left_str += 1;
3201
2.81k
            }
3202
11.2k
            if(0 == pu1_avail[3])
3203
1.32k
            {
3204
1.32k
                src_top_16x8b = src_bottom_16x8b;
3205
1.32k
                pu1_src_left_str[0] = pu1_src_cpy[7];
3206
1.32k
            }
3207
3208
11.2k
            if(0 == pu1_avail[2])
3209
1.68k
            {
3210
1.68k
                pu1_src_left_str[-ht_0] = pu1_src[7 - src_strd];
3211
1.68k
            }
3212
3213
11.2k
            _mm_storel_epi64((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
3214
11.2k
            pu1_src += 8;
3215
11.2k
            au1_mask_cpy += 16;
3216
3217
11.2k
            pu1_left_tmp = pu1_src_left_cpy2;
3218
11.2k
            pu1_src_left_cpy2 = pu1_src_left_str2;
3219
11.2k
            pu1_src_left_str2 = pu1_left_tmp;
3220
3221
11.2k
            pu1_src_left_cpy = pu1_src_left_cpy2;
3222
11.2k
            pu1_src_left_str = pu1_src_left_str2;
3223
11.2k
        }
3224
11.8k
        pu1_src_org[0] = u1_pos_0_0_tmp;
3225
11.8k
        pu1_src_org[wd - 1 + (ht_tmp - 1) * src_strd] = u1_pos_wd_ht_tmp;
3226
11.8k
        pu1_src_left_cpy = (0 == pu1_avail[2]) ? (pu1_src_left_cpy - 1) : pu1_src_left_cpy;
3227
395k
        for(row = 0; row < ht_tmp; row++)
3228
383k
        {
3229
383k
            pu1_src_left[row] = pu1_src_left_cpy[row];
3230
383k
        }
3231
11.8k
    }
3232
3233
11.8k
}
3234
3235
/* 135 degree filtering */
3236
void ihevc_sao_edge_offset_class2_chroma_ssse3(UWORD8 *pu1_src,
3237
                                               WORD32 src_strd,
3238
                                               UWORD8 *pu1_src_left,
3239
                                               UWORD8 *pu1_src_top,
3240
                                               UWORD8 *pu1_src_top_left,
3241
                                               UWORD8 *pu1_src_top_right,
3242
                                               UWORD8 *pu1_src_bot_left,
3243
                                               UWORD8 *pu1_avail,
3244
                                               WORD8 *pi1_sao_offset_u,
3245
                                               WORD8 *pi1_sao_offset_v,
3246
                                               WORD32 wd,
3247
                                               WORD32 ht)
3248
8.57k
{
3249
8.57k
    WORD32 row, col;
3250
8.57k
    UWORD8 *pu1_src_top_cpy, *pu1_src_left_cpy, *pu1_src_left_cpy2;
3251
8.57k
    UWORD8 *pu1_left_tmp, *pu1_src_left_str, *pu1_src_left_str2;
3252
8.57k
    UWORD8 *pu1_firstleft;
3253
8.57k
    UWORD8 *pu1_src_cpy, *pu1_src_org;
3254
8.57k
    UWORD8 au1_mask[MAX_CTB_SIZE], *au1_mask_cpy;
3255
8.57k
    UWORD8 au1_src_left_tmp[2 * (MAX_CTB_SIZE + 8)];
3256
8.57k
    UWORD8 au1_src_left_tmp1[2 * (MAX_CTB_SIZE + 8)];
3257
8.57k
    WORD32 wd_rem;
3258
8.57k
    UWORD8 u1_pos_0_0_tmp_u, u1_pos_0_0_tmp_v, u1_pos_wd_ht_tmp_u, u1_pos_wd_ht_tmp_v;
3259
8.57k
    WORD32 ht_tmp;
3260
8.57k
    WORD32 ht_0;
3261
3262
8.57k
    WORD32 bit_depth;
3263
8.57k
    UWORD8 u1_avail0, u1_avail1;
3264
3265
8.57k
    __m128i src_temp0_16x8b, src_temp1_16x8b;
3266
8.57k
    __m128i signup0_16x8b, signdwn1_16x8b;
3267
8.57k
    __m128i cmp_gt0_16x8b, cmp_lt0_16x8b;
3268
8.57k
    __m128i edge0_16x8b, edge1_16x8b;
3269
8.57k
    __m128i src_top_16x8b, src_bottom_16x8b;
3270
8.57k
    __m128i au1_mask8x16b;
3271
8.57k
    __m128i edge_idx_8x16b, sao_offset_8x16b;
3272
8.57k
    __m128i const2_16x8b, const0_16x8b;
3273
8.57k
    __m128i left_store_16x8b;
3274
8.57k
    __m128i chroma_offset_8x16b;
3275
3276
8.57k
    UNUSED(pu1_src_top_right);
3277
8.57k
    UNUSED(pu1_src_bot_left);
3278
3279
8.57k
    ht_0 = ht; ht_tmp = ht;
3280
8.57k
    au1_mask8x16b = _mm_set1_epi8(0xff);
3281
    /* Updating left and top-left  */
3282
285k
    for(row = 0; row < 2 * ht; row++)
3283
276k
    {
3284
276k
        au1_src_left_tmp[row] = pu1_src_left[row];
3285
276k
    }
3286
    //setting availability mask to ff size MAX_CTB_SIZE
3287
42.8k
    for(col = 0; col < MAX_CTB_SIZE; col += 16)
3288
34.2k
        _mm_storeu_si128((__m128i *)(au1_mask + col), au1_mask8x16b);
3289
8.57k
    bit_depth = BIT_DEPTH_LUMA;
3290
8.57k
    pu1_src_org = pu1_src;
3291
8.57k
    pu1_src_top_cpy = pu1_src_top;
3292
8.57k
    pu1_src_left_cpy2 = au1_src_left_tmp;
3293
8.57k
    pu1_src_left_cpy = au1_src_left_tmp;
3294
8.57k
    pu1_src_left_str2 = au1_src_left_tmp1;
3295
8.57k
    pu1_src_left_str = au1_src_left_tmp1;
3296
8.57k
    edge_idx_8x16b   = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
3297
8.57k
    sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_u);
3298
8.57k
    const0_16x8b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_v);
3299
8.57k
    chroma_offset_8x16b = _mm_set1_epi16(0x0800);
3300
3301
    /* If top-left is available, process separately */
3302
8.57k
    if(0 != pu1_avail[4])
3303
6.95k
    {
3304
6.95k
        WORD32 edge_idx;
3305
3306
        /* U */
3307
6.95k
        edge_idx = 2 + SIGN(pu1_src[0] - pu1_src_top_left[0]) +
3308
6.95k
                        SIGN(pu1_src[0] - pu1_src[2 + src_strd]);
3309
3310
6.95k
        edge_idx = gi1_table_edge_idx[edge_idx];
3311
3312
6.95k
        if(0 != edge_idx)
3313
2.12k
        {
3314
2.12k
            u1_pos_0_0_tmp_u = CLIP3(pu1_src[0] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1);
3315
2.12k
        }
3316
4.83k
        else
3317
4.83k
        {
3318
4.83k
            u1_pos_0_0_tmp_u = pu1_src[0];
3319
4.83k
        }
3320
3321
        /* V */
3322
6.95k
        edge_idx = 2 + SIGN(pu1_src[1] - pu1_src_top_left[1]) +
3323
6.95k
                        SIGN(pu1_src[1] - pu1_src[1 + 2 + src_strd]);
3324
3325
6.95k
        edge_idx = gi1_table_edge_idx[edge_idx];
3326
3327
6.95k
        if(0 != edge_idx)
3328
2.22k
        {
3329
2.22k
            u1_pos_0_0_tmp_v = CLIP3(pu1_src[1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1);
3330
2.22k
        }
3331
4.72k
        else
3332
4.72k
        {
3333
4.72k
            u1_pos_0_0_tmp_v = pu1_src[1];
3334
4.72k
        }
3335
6.95k
    }
3336
1.62k
    else
3337
1.62k
    {
3338
1.62k
        u1_pos_0_0_tmp_u = pu1_src[0];
3339
1.62k
        u1_pos_0_0_tmp_v = pu1_src[1];
3340
1.62k
    }
3341
3342
    /* If bottom-right is available, process separately */
3343
8.57k
    if(0 != pu1_avail[7])
3344
7.07k
    {
3345
7.07k
        WORD32 edge_idx;
3346
3347
        /* U */
3348
7.07k
        edge_idx = 2 + SIGN(pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 2 + (ht - 1) * src_strd - 2 - src_strd]) +
3349
7.07k
                        SIGN(pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 2 + (ht - 1) * src_strd + 2 + src_strd]);
3350
3351
7.07k
        edge_idx = gi1_table_edge_idx[edge_idx];
3352
3353
7.07k
        if(0 != edge_idx)
3354
2.10k
        {
3355
2.10k
            u1_pos_wd_ht_tmp_u = CLIP3(pu1_src[wd - 2 + (ht - 1) * src_strd] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1);
3356
2.10k
        }
3357
4.96k
        else
3358
4.96k
        {
3359
4.96k
            u1_pos_wd_ht_tmp_u = pu1_src[wd - 2 + (ht - 1) * src_strd];
3360
4.96k
        }
3361
3362
        /* V */
3363
7.07k
        edge_idx = 2 + SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd - 2 - src_strd]) +
3364
7.07k
                        SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 2 + src_strd]);
3365
3366
7.07k
        edge_idx = gi1_table_edge_idx[edge_idx];
3367
3368
7.07k
        if(0 != edge_idx)
3369
1.97k
        {
3370
1.97k
            u1_pos_wd_ht_tmp_v = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1);
3371
1.97k
        }
3372
5.09k
        else
3373
5.09k
        {
3374
5.09k
            u1_pos_wd_ht_tmp_v = pu1_src[wd - 1 + (ht - 1) * src_strd];
3375
5.09k
        }
3376
7.07k
    }
3377
1.50k
    else
3378
1.50k
    {
3379
1.50k
        u1_pos_wd_ht_tmp_u = pu1_src[wd - 2 + (ht - 1) * src_strd];
3380
1.50k
        u1_pos_wd_ht_tmp_v = pu1_src[wd - 1 + (ht - 1) * src_strd];
3381
1.50k
    }
3382
8.57k
    pu1_firstleft = pu1_src_top_left;
3383
3384
    /* Update height and source pointers based on the availability flags */
3385
8.57k
    if(0 == pu1_avail[2])
3386
1.02k
    {
3387
1.02k
        pu1_firstleft = pu1_src_left_cpy2;
3388
1.02k
        pu1_src_left_cpy2 += 2;
3389
1.02k
        pu1_src_left_str2 += 2;
3390
1.02k
        pu1_src_top_cpy = pu1_src;
3391
1.02k
        pu1_src += src_strd;
3392
1.02k
        ht--;
3393
1.02k
    }
3394
8.57k
    if(0 == pu1_avail[3])
3395
917
    {
3396
917
        ht--;
3397
917
        ht_0--;
3398
917
    }
3399
    //storing top left in a mmx register
3400
8.57k
    left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_firstleft);
3401
8.57k
    sao_offset_8x16b = _mm_unpacklo_epi64(sao_offset_8x16b, const0_16x8b);
3402
8.57k
    const2_16x8b = _mm_set1_epi8(2);
3403
8.57k
    const0_16x8b = _mm_setzero_si128();
3404
8.57k
    left_store_16x8b = _mm_slli_si128(left_store_16x8b, 14);
3405
3406
    //availability mask creation
3407
8.57k
    u1_avail0 = pu1_avail[0];
3408
8.57k
    u1_avail1 = pu1_avail[1];
3409
8.57k
    au1_mask[0] = u1_avail0;
3410
8.57k
    au1_mask[1] = u1_avail0;
3411
8.57k
    au1_mask[wd - 1] = u1_avail1;
3412
8.57k
    au1_mask[wd - 2] = u1_avail1;
3413
3414
    /* top-left arrays */
3415
8.57k
    pu1_src_top_left[0] = pu1_src_top[wd - 2];
3416
8.57k
    pu1_src_top_left[1] = pu1_src_top[wd - 1];
3417
8.57k
    {
3418
8.57k
        WORD32 ht_rem;
3419
8.57k
        au1_mask_cpy = au1_mask;
3420
3421
8.57k
        pu1_src_left_cpy = pu1_src_left_cpy2;
3422
8.57k
        pu1_src_left_str = pu1_src_left_str2;
3423
26.8k
        for(col = wd; col >= 16; col -= 16)
3424
18.2k
        {
3425
18.2k
            pu1_src_cpy = pu1_src;
3426
18.2k
            src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
3427
            //row = 0
3428
18.2k
            src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
3429
18.2k
            src_top_16x8b = _mm_alignr_epi8(src_top_16x8b, left_store_16x8b, 14);
3430
            //loading the mask
3431
18.2k
            au1_mask8x16b = _mm_loadu_si128((__m128i *)au1_mask_cpy);
3432
            //separating +ve and -ve values.
3433
18.2k
            cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
3434
18.2k
            cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
3435
            //creating mask 00 for +ve and -ve values and FF for zero.
3436
18.2k
            cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3437
18.2k
            cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3438
            //combining the appropriate sign change
3439
18.2k
            signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
3440
3441
3442
161k
            for(row = ht; row >= 2; row -= 2)
3443
143k
            {
3444
143k
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
3445
                //row = 1
3446
143k
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
3447
                // row = 1 right
3448
143k
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 2));
3449
                //to insert left in row 0
3450
143k
                signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 14);
3451
                //row 0 -row1
3452
                //separating +ve and -ve values.
3453
143k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
3454
143k
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
3455
3456
                //creating mask 00 for +ve and -ve values and FF for zero.
3457
143k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3458
143k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3459
                //manipulation for row 1 - row 0
3460
143k
                signdwn1_16x8b = _mm_alignr_epi8(src_temp0_16x8b, signdwn1_16x8b, 14);
3461
                //combining the appropriate sign change
3462
143k
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(0-1)
3463
                //row1-row0
3464
                //separating +ve and -ve values.
3465
143k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
3466
143k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
3467
                //creating mask 00 for +ve and -ve values and FF for zero.
3468
143k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3469
143k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3470
                 // row = 2 right
3471
143k
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd + 2));
3472
143k
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-0)
3473
3474
3475
                //row1 -bottom
3476
143k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
3477
143k
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
3478
                //creating mask 00 for +ve and -ve values and FF for zero.
3479
143k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3480
143k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3481
                //combining the appropriate sign change
3482
143k
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
3483
                // row = 2
3484
143k
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
3485
3486
                //combining sign-left and sign_right
3487
143k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
3488
3489
                //storing the row 1 left for next row.
3490
143k
                signup0_16x8b = _mm_slli_si128(left_store_16x8b, 12);
3491
3492
                //combining sign-left and sign_right
3493
143k
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, signdwn1_16x8b);
3494
                //manipulation for bottom - row 1
3495
143k
                signup0_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signup0_16x8b, 14);
3496
                //eliminating old left for row 0 and row 1
3497
143k
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 4);
3498
                //bottom - row1
3499
143k
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signup0_16x8b);
3500
143k
                cmp_lt0_16x8b = _mm_subs_epu8(signup0_16x8b, src_bottom_16x8b);
3501
                //creating mask 00 for +ve and -ve values and FF for zero.
3502
143k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3503
143k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3504
                //for the next iteration bottom -row1
3505
143k
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
3506
                //row1  getting it right for left of next iteration
3507
143k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp1_16x8b, 14);
3508
                //copying the next top
3509
143k
                src_top_16x8b = src_temp1_16x8b;
3510
                //row0  getting its right for left of next iteration.
3511
143k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
3512
3513
3514
                //adding constant 2
3515
143k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
3516
143k
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
3517
                //shuffle to get sao index
3518
143k
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
3519
143k
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
3520
                //using availability mask
3521
143k
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
3522
143k
                edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
3523
                //adding chroma offset to access U and V
3524
143k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
3525
143k
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);
3526
3527
3528
                //shuffle to get sao offset
3529
143k
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
3530
143k
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
3531
                //convert to 16 bit then add and then saturated pack
3532
143k
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
3533
143k
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
3534
143k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
3535
143k
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
3536
143k
                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
3537
143k
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
3538
143k
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
3539
143k
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
3540
3541
143k
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
3542
143k
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
3543
143k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
3544
143k
                src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
3545
143k
                edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
3546
143k
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
3547
143k
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge1_16x8b);
3548
143k
                src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
3549
3550
                //store left boundary
3551
143k
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
3552
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
3553
143k
                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
3554
                // row = 1
3555
143k
                _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);
3556
3557
143k
                src_temp0_16x8b = src_bottom_16x8b;
3558
143k
                pu1_src_cpy += (src_strd << 1);
3559
143k
                pu1_src_left_cpy += 4;
3560
143k
                pu1_src_left_str += 4;
3561
143k
            }
3562
18.2k
            ht_rem = ht & 0x1;
3563
3564
18.2k
            if(ht_rem)
3565
3.86k
            {
3566
3.86k
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
3567
3.86k
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 2));
3568
                //current row -next row
3569
                //separating +ve and and -ve values.
3570
3.86k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
3571
3.86k
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
3572
                //creating mask 00 for +ve and -ve values and FF for zero.
3573
3.86k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3574
3.86k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3575
                //combining the appropriate sign change
3576
3.86k
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
3577
                //adding top and botton and constant 2
3578
3.86k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
3579
3.86k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
3580
3581
                //eliminating old left for row 0 and row 1
3582
3.86k
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
3583
                //copying the next top
3584
3.86k
                src_top_16x8b = src_temp0_16x8b;
3585
                //row0  getting it right for left of next block
3586
3.86k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
3587
3588
3.86k
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
3589
                //using availability mask
3590
3.86k
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
3591
                //adding chroma offset to access U and V
3592
3.86k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
3593
3594
3.86k
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
3595
3596
                //cnvert to 16 bit then add and then saturated pack
3597
3.86k
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
3598
3.86k
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
3599
3.86k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
3600
3.86k
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
3601
3.86k
                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
3602
3.86k
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
3603
3.86k
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
3604
3.86k
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
3605
3606
3.86k
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
3607
3608
3.86k
                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
3609
3.86k
                pu1_src_cpy += (src_strd);
3610
3.86k
                pu1_src_left_cpy += 2;
3611
3.86k
                pu1_src_left_str += 2;
3612
3.86k
            }
3613
18.2k
            if(0 == pu1_avail[3])
3614
1.93k
            {
3615
1.93k
                src_top_16x8b = src_bottom_16x8b;
3616
1.93k
                pu1_src_left_str[1] = pu1_src_cpy[15];
3617
1.93k
                pu1_src_left_str[0] = pu1_src_cpy[14];
3618
1.93k
            }
3619
18.2k
            if(0 == pu1_avail[2])
3620
2.20k
            {
3621
2.20k
                pu1_src_left_str[-2 * ht_0] = pu1_src[14 - src_strd];
3622
2.20k
                pu1_src_left_str[-2 * ht_0 + 1] = pu1_src[15 - src_strd];
3623
2.20k
            }
3624
3625
            //for the top left of next part of the block
3626
18.2k
            left_store_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
3627
            //updating top flag
3628
18.2k
            _mm_storeu_si128((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
3629
18.2k
            pu1_src += 16;
3630
18.2k
            au1_mask_cpy += 16;
3631
3632
18.2k
            pu1_left_tmp = pu1_src_left_cpy2;
3633
18.2k
            pu1_src_left_cpy2 = pu1_src_left_str2;
3634
18.2k
            pu1_src_left_str2 = pu1_left_tmp;
3635
3636
18.2k
            pu1_src_left_cpy = pu1_src_left_cpy2;
3637
18.2k
            pu1_src_left_str = pu1_src_left_str2;
3638
18.2k
        }
3639
8.57k
        wd_rem = wd & 0xF;
3640
8.57k
        if(wd_rem)
3641
2
        {
3642
2
            pu1_src_left_cpy = pu1_src_left_cpy2;
3643
2
            pu1_src_left_str = pu1_src_left_str2;
3644
2
            pu1_src_cpy = pu1_src;
3645
2
            src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_top_cpy + wd - col));
3646
            //row = 0
3647
2
            src_temp0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
3648
2
            src_top_16x8b = _mm_alignr_epi8(src_top_16x8b, left_store_16x8b, 14);
3649
2
            au1_mask8x16b = _mm_loadl_epi64((__m128i *)au1_mask_cpy); //????
3650
            //separating +ve and and -ve values.
3651
2
            cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
3652
2
            cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
3653
            //creating mask 00 for +ve and -ve values and FF for zero.
3654
2
            cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3655
2
            cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3656
            //preparing au1_mask
3657
2
            au1_mask8x16b = _mm_unpacklo_epi64(au1_mask8x16b, au1_mask8x16b);
3658
            //combining the appropriate sign change
3659
2
            signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
3660
2
            signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
3661
3662
9
            for(row = ht; row >= 4; row -= 4)
3663
7
            {
3664
7
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
3665
                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
3666
7
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
3667
                // row = 2
3668
7
                src_bottom_16x8b =  _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
3669
                //right row1
3670
7
                signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);
3671
                //row 0 -row1
3672
                //separating +ve and and -ve values.
3673
7
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
3674
7
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
3675
                //manipulation for row 1 -row 0
3676
7
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 14);
3677
                //creating mask 00 for +ve and -ve values and FF for zero.
3678
7
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3679
7
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3680
                //row 0 left
3681
7
                signdwn1_16x8b = _mm_alignr_epi8(src_temp0_16x8b, signdwn1_16x8b, 14);
3682
                //combining the appropriate sign change
3683
7
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
3684
                //row 1 -row0
3685
                //separating +ve and and -ve values.
3686
7
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
3687
7
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
3688
3689
                //creating mask 00 for +ve and -ve values and FF for zero.
3690
7
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3691
7
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3692
                //row1-row0
3693
7
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
3694
3695
7
                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
3696
3697
7
                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
3698
                //right row2
3699
7
                signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 2);
3700
                //packing row 0 n row 1
3701
7
                src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, src_temp1_16x8b);
3702
                //row1 -row2
3703
7
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
3704
7
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
3705
                //creating mask 00 for +ve and -ve values and FF for zero.
3706
7
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3707
7
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3708
                //combining the appropriate sign change
3709
7
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
3710
7
                edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
3711
                //manipulation for row 2 -row 1
3712
7
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 12);
3713
                //row 1 left
3714
7
                signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 14);
3715
                //row = 3
3716
7
                src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 3 * src_strd));
3717
3718
                // row = 4
3719
7
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 4 * src_strd));
3720
3721
7
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
3722
3723
                //separating +ve and and -ve values.(2,1)
3724
7
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
3725
7
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
3726
                //manipulation for row 3 -row 2
3727
7
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 10);
3728
                //creating mask 00 for +ve and -ve values and FF for zero.
3729
7
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3730
7
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3731
                //row 2 left
3732
7
                signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 14);
3733
                //combining the appropriate sign change
3734
7
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1)
3735
3736
                //separating +ve and and -ve values.(3,2)
3737
7
                cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
3738
7
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
3739
7
                signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8); //aligned left (2-1)
3740
                //creating mask 00 for +ve and -ve values and FF for zero.
3741
7
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3742
7
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3743
                //right row3
3744
7
                signdwn1_16x8b = _mm_srli_si128(src_top_16x8b, 2);
3745
                //combining the appropriate sign change
3746
7
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-2)
3747
3748
7
                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(3-2) ,(2-1)
3749
3750
                //separating +ve and and -ve values.(2,3)
3751
7
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
3752
7
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
3753
                //right row 4
3754
7
                signdwn1_16x8b =  _mm_srli_si128(src_temp1_16x8b, 2);
3755
                //creating mask 00 for +ve and -ve values and FF for zero.
3756
7
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3757
7
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3758
                //combining the appropriate sign change
3759
7
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-3)
3760
3761
                //separating +ve and and -ve values.(3,bottom)
3762
7
                cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
3763
7
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
3764
3765
                //creating mask 00 for +ve and -ve values and FF for zero.
3766
7
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3767
7
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3768
7
                edge1_16x8b = _mm_slli_si128(edge1_16x8b, 8); //aligned left (2-3)
3769
                //combining the appropriate sign change
3770
7
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-bottom)
3771
7
                edge1_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge1_16x8b, 8); //(3-bottom),(2-3)
3772
3773
                //manipulation for bottom -row 3
3774
7
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 8);
3775
                //eliminating old left for row 0,1,2,3
3776
7
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 8);
3777
                //packing row 2 n row 3
3778
7
                src_bottom_16x8b = _mm_unpacklo_epi64(src_bottom_16x8b, src_top_16x8b);
3779
                //row 3 left
3780
7
                signdwn1_16x8b = _mm_alignr_epi8(src_top_16x8b, signdwn1_16x8b, 14);
3781
3782
                //adding bottom and top values of row 2 and row 3
3783
7
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, signup0_16x8b); //(3,2)
3784
                //separating +ve and and -ve values.(botttom,3)
3785
7
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
3786
7
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
3787
3788
                //creating mask 00 for +ve and -ve values and FF for zero.
3789
7
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3790
7
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3791
7
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(bottom -3) for next iteration
3792
3793
                //to store right of row 2
3794
7
                signdwn1_16x8b = _mm_slli_si128(src_bottom_16x8b, 8);
3795
                //loading row 3 right into left
3796
7
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_bottom_16x8b, 14);
3797
                //storing right of row 2into left
3798
7
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
3799
                //to store right of row 0
3800
7
                signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
3801
                //storing right of row 1 into left
3802
7
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
3803
                //storing right of row 0 into left
3804
7
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
3805
3806
                //adding constant 2
3807
7
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
3808
7
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
3809
                //shuffle to get sao index
3810
7
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
3811
7
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
3812
                //using availability mask
3813
7
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
3814
7
                edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
3815
3816
                //adding chroma offset to access U and V
3817
7
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
3818
7
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);
3819
3820
                //shuffle to get sao offset
3821
7
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
3822
7
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
3823
                //cnvert to 16 bit then add and then saturated pack
3824
7
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
3825
7
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
3826
7
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
3827
7
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
3828
7
                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
3829
7
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
3830
7
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
3831
7
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
3832
3833
7
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
3834
7
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_bottom_16x8b, const0_16x8b);
3835
7
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
3836
7
                src_bottom_16x8b = _mm_unpackhi_epi8(src_bottom_16x8b, const0_16x8b);
3837
7
                edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
3838
7
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
3839
7
                src_bottom_16x8b = _mm_add_epi16(src_bottom_16x8b, edge1_16x8b);
3840
7
                src_bottom_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_bottom_16x8b);
3841
3842
7
                cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
3843
7
                cmp_lt0_16x8b = _mm_srli_si128(src_bottom_16x8b, 8);
3844
3845
3846
7
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
3847
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
3848
7
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
3849
                // row = 1
3850
7
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
3851
                //row = 2
3852
7
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_bottom_16x8b);
3853
                // row = 3
3854
7
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b);
3855
3856
7
                src_temp0_16x8b = src_temp1_16x8b;
3857
7
                signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
3858
7
                pu1_src_cpy += (src_strd << 2);
3859
7
                pu1_src_left_cpy += 8;
3860
7
                pu1_src_left_str += 8;
3861
7
            }
3862
2
            ht_rem = ht & 0x2;
3863
2
            if(ht_rem)
3864
1
            {
3865
1
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
3866
                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
3867
1
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
3868
                // row = 2
3869
1
                src_bottom_16x8b =  _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
3870
3871
                //row 0 -row 1
3872
1
                signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);
3873
                //separating +ve and and -ve values.
3874
1
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
3875
1
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
3876
                //manipulation for row 1 -row 0
3877
1
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 14);
3878
                //creating mask 00 for +ve and -ve values and FF for zero.
3879
1
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3880
1
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3881
                //manipulation for row 1 - row 0
3882
1
                signdwn1_16x8b = _mm_alignr_epi8(src_temp0_16x8b, signdwn1_16x8b, 14);
3883
                //combining the appropriate sign change
3884
1
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
3885
3886
                //row1-row0
3887
                //separating +ve and and -ve values.
3888
1
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
3889
1
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
3890
3891
                //creating mask 00 for +ve and -ve values and FF for zero.
3892
1
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3893
1
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3894
                //combining the appropriate sign chang
3895
1
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
3896
                //row 1 -bottom
3897
1
                signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 2);
3898
3899
1
                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
3900
1
                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
3901
                //row1 -bottom
3902
1
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
3903
1
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
3904
3905
                //creating mask 00 for +ve and -ve values and FF for zero.
3906
1
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3907
1
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3908
                //combining the appropriate sign change
3909
1
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
3910
1
                edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
3911
                //manipulation for bottom -row1
3912
1
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 12);
3913
                //eliminating old left for row 0,1
3914
1
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 4);
3915
                //manipulation for bottom- row 1
3916
1
                signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 14);
3917
                //adding top and down substraction
3918
1
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
3919
                //bottom - row 1
3920
1
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
3921
1
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
3922
3923
                //shifting row 1
3924
1
                signdwn1_16x8b = _mm_slli_si128(src_temp1_16x8b, 8);
3925
                //creating mask 00 for +ve and -ve values and FF for zero.
3926
1
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3927
1
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3928
                //for the next iteration signup0_16x8b
3929
1
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1) for next
3930
                //storing right of row 1 into left
3931
1
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14); //for storing right of row 0
3932
1
                signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
3933
                //the next top  in  src_top_16x8b
3934
1
                src_top_16x8b = src_temp1_16x8b;
3935
                //storing right of row 0 into left
3936
1
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
3937
3938
3939
                //adding constant 2
3940
1
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
3941
3942
                //shuffle to get sao index
3943
1
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
3944
                //using availability mask
3945
1
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
3946
3947
                //adding chroma offset to access U and V
3948
1
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
3949
3950
                //shuffle to get sao offset
3951
1
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
3952
                //the next top already in  src_top_16x8b
3953
                //cnvert to 16 bit then add and then saturated pack
3954
1
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
3955
1
                src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
3956
1
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
3957
1
                src_temp1_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
3958
1
                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
3959
1
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
3960
1
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge0_16x8b);
3961
1
                src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, src_temp1_16x8b);
3962
3963
1
                cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
3964
3965
1
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
3966
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
3967
1
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
3968
                // row = 1
3969
1
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
3970
1
                src_temp0_16x8b = src_bottom_16x8b;
3971
1
                pu1_src_cpy += (src_strd << 1);
3972
1
                pu1_src_left_cpy += 4;
3973
1
                pu1_src_left_str += 4;
3974
1
            }
3975
2
            ht_rem = ht & 0x1;
3976
2
            if(ht_rem)
3977
1
            {
3978
1
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
3979
                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
3980
1
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
3981
3982
                //row 0 -row1
3983
1
                signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 2);
3984
                //separating +ve and and -ve values.
3985
1
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
3986
1
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
3987
                //creating mask 00 for +ve and -ve values and FF for zero.
3988
1
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3989
1
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3990
                //combining the appropriate sign change
3991
1
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
3992
                //adding top and down substraction
3993
1
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
3994
3995
                //for row 0 right to put into left store
3996
1
                signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
3997
                //left store manipulation 1
3998
1
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
3999
1
                src_top_16x8b = src_temp0_16x8b;
4000
                //filling the left boundary value
4001
1
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
4002
4003
                //adding constant 2
4004
1
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
4005
1
                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8);
4006
1
                edge0_16x8b = _mm_srli_si128(edge0_16x8b, 8);
4007
4008
4009
                //shuffle to get sao index
4010
1
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
4011
                //using availability mask
4012
1
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
4013
                //adding chroma offset to access U and V
4014
1
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
4015
4016
                //shuffle to get sao offset
4017
1
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
4018
4019
                //cnvert to 16 bit then add and then saturated pack
4020
1
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
4021
1
                src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
4022
1
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
4023
1
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
4024
1
                src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, const0_16x8b);
4025
4026
1
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
4027
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
4028
1
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
4029
1
                pu1_src_cpy += (src_strd);
4030
1
                pu1_src_left_cpy += 2;
4031
1
                pu1_src_left_str += 2;
4032
1
            }
4033
2
            if(0 == pu1_avail[3])
4034
1
            {
4035
1
                src_top_16x8b = src_bottom_16x8b;
4036
1
                pu1_src_left_str[1] = pu1_src_cpy[7];
4037
1
                pu1_src_left_str[0] = pu1_src_cpy[6];
4038
1
            }
4039
4040
2
            if(0 == pu1_avail[2])
4041
0
            {
4042
0
                pu1_src_left_str[-2 * ht_0] = pu1_src[6 - src_strd];
4043
0
                pu1_src_left_str[-2 * ht_0 + 1] = pu1_src[7 - src_strd];
4044
0
            }
4045
4046
2
            _mm_storel_epi64((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
4047
2
            pu1_src += 8;
4048
4049
2
            pu1_left_tmp = pu1_src_left_cpy2;
4050
2
            pu1_src_left_cpy2 = pu1_src_left_str2;
4051
2
            pu1_src_left_str2 = pu1_left_tmp;
4052
4053
2
            pu1_src_left_cpy = pu1_src_left_cpy2;
4054
2
            pu1_src_left_str = pu1_src_left_str2;
4055
2
        }
4056
8.57k
        pu1_src_org[0] = u1_pos_0_0_tmp_u;
4057
8.57k
        pu1_src_org[1] = u1_pos_0_0_tmp_v;
4058
8.57k
        pu1_src_org[wd - 2 + (ht_tmp - 1) * src_strd] = u1_pos_wd_ht_tmp_u;
4059
8.57k
        pu1_src_org[wd - 1 + (ht_tmp - 1) * src_strd] = u1_pos_wd_ht_tmp_v;
4060
8.57k
        pu1_src_left_cpy = (0 == pu1_avail[2]) ? (pu1_src_left_cpy - 2) : pu1_src_left_cpy;
4061
285k
        for(row = 0; row < 2 * ht_tmp; row++)
4062
276k
        {
4063
276k
            pu1_src_left[row] = pu1_src_left_cpy[row];
4064
276k
        }
4065
8.57k
    }
4066
4067
8.57k
}
4068
4069
void ihevc_sao_edge_offset_class3_ssse3(UWORD8 *pu1_src,
4070
                                        WORD32 src_strd,
4071
                                        UWORD8 *pu1_src_left,
4072
                                        UWORD8 *pu1_src_top,
4073
                                        UWORD8 *pu1_src_top_left,
4074
                                        UWORD8 *pu1_src_top_right,
4075
                                        UWORD8 *pu1_src_bot_left,
4076
                                        UWORD8 *pu1_avail,
4077
                                        WORD8 *pi1_sao_offset,
4078
                                        WORD32 wd,
4079
                                        WORD32 ht)
4080
9.68k
{
4081
9.68k
    WORD32 row, col;
4082
9.68k
    UWORD8 *pu1_src_top_cpy, *pu1_src_left_cpy, *pu1_src_left_cpy2;
4083
9.68k
    UWORD8 *pu1_left_tmp, *pu1_src_left_str, *pu1_src_left_str2;
4084
9.68k
    UWORD8 *pu1_src_cpy, *pu1_src_org;
4085
9.68k
    UWORD8 au1_src_left_tmp[MAX_CTB_SIZE + 8];
4086
9.68k
    UWORD8 au1_src_left_tmp1[MAX_CTB_SIZE + 8];
4087
9.68k
    UWORD8 au1_mask[MAX_CTB_SIZE], *au1_mask_cpy;
4088
9.68k
    WORD32 wd_rem;
4089
9.68k
    UWORD8 u1_pos_wd_0_tmp, u1_pos_0_ht_tmp;
4090
9.68k
    WORD32 ht_tmp;
4091
9.68k
    WORD32 bit_depth;
4092
9.68k
    UWORD8 u1_avail0, u1_avail1;
4093
4094
9.68k
    __m128i src_top_16x8b, src_bottom_16x8b;
4095
9.68k
    __m128i src_temp0_16x8b, src_temp1_16x8b;
4096
9.68k
    __m128i signup0_16x8b, signdwn1_16x8b;
4097
9.68k
    __m128i cmp_gt0_16x8b, cmp_lt0_16x8b;
4098
9.68k
    __m128i edge0_16x8b, edge1_16x8b;
4099
9.68k
    __m128i au1_mask8x16b;
4100
9.68k
    __m128i edge_idx_8x16b, sao_offset_8x16b;
4101
9.68k
    __m128i const2_16x8b, const0_16x8b;
4102
9.68k
    __m128i left_store_16x8b;
4103
4104
9.68k
    ht_tmp = ht;
4105
9.68k
    au1_mask8x16b = _mm_set1_epi8(0xff);
4106
4107
9.68k
    au1_src_left_tmp[0] = pu1_src[(wd - 1)];
4108
    //manipulation for bottom left
4109
312k
    for(row = 1; row < ht; row++)
4110
302k
    {
4111
302k
        au1_src_left_tmp[row] = pu1_src_left[row];
4112
302k
    }
4113
9.68k
    au1_src_left_tmp[ht] = pu1_src_bot_left[0];
4114
4115
9.68k
    *pu1_src_top_left = pu1_src_top[wd - 1];
4116
    //setting availability mask to ff size MAX_CTB_SIZE
4117
48.3k
    for(col = 0; col < MAX_CTB_SIZE; col += 16)
4118
38.7k
        _mm_storeu_si128((__m128i *)(au1_mask + col), au1_mask8x16b);
4119
9.68k
    bit_depth = BIT_DEPTH_LUMA;
4120
9.68k
    pu1_src_org = pu1_src;
4121
9.68k
    pu1_src_top_cpy = pu1_src_top;
4122
9.68k
    pu1_src_left_cpy2 = au1_src_left_tmp;
4123
9.68k
    pu1_src_left_cpy = au1_src_left_tmp;
4124
9.68k
    pu1_src_left_str2 = au1_src_left_tmp1;
4125
9.68k
    pu1_src_left_str = au1_src_left_tmp1;
4126
9.68k
    edge_idx_8x16b   = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
4127
9.68k
    sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset);
4128
4129
    /* If top-right is available, process separately */
4130
9.68k
    if(0 != pu1_avail[5])
4131
7.32k
    {
4132
7.32k
        WORD32 edge_idx;
4133
4134
7.32k
        edge_idx = 2 + SIGN(pu1_src[wd - 1] - pu1_src_top_right[0]) +
4135
7.32k
                        SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 1 + src_strd]);
4136
4137
7.32k
        edge_idx = gi1_table_edge_idx[edge_idx];
4138
4139
7.32k
        if(0 != edge_idx)
4140
2.62k
        {
4141
2.62k
            u1_pos_wd_0_tmp = CLIP3(pu1_src[wd - 1] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
4142
2.62k
        }
4143
4.70k
        else
4144
4.70k
        {
4145
4.70k
            u1_pos_wd_0_tmp = pu1_src[wd - 1];
4146
4.70k
        }
4147
7.32k
    }
4148
2.35k
    else
4149
2.35k
    {
4150
2.35k
        u1_pos_wd_0_tmp = pu1_src[wd - 1];
4151
2.35k
    }
4152
4153
    /* If bottom-left is available, process separately */
4154
9.68k
    if(0 != pu1_avail[6])
4155
7.97k
    {
4156
7.97k
        WORD32 edge_idx;
4157
4158
7.97k
        edge_idx = 2 + SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd + 1 - src_strd]) +
4159
7.97k
                        SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0]);
4160
4161
7.97k
        edge_idx = gi1_table_edge_idx[edge_idx];
4162
4163
7.97k
        if(0 != edge_idx)
4164
3.08k
        {
4165
3.08k
            u1_pos_0_ht_tmp = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
4166
3.08k
        }
4167
4.89k
        else
4168
4.89k
        {
4169
4.89k
            u1_pos_0_ht_tmp = pu1_src[(ht - 1) * src_strd];
4170
4.89k
        }
4171
7.97k
    }
4172
1.70k
    else
4173
1.70k
    {
4174
1.70k
        u1_pos_0_ht_tmp = pu1_src[(ht - 1) * src_strd];
4175
1.70k
    }
4176
4177
4178
4179
    /* Update height and source pointers based on the availability flags */
4180
9.68k
    if(0 == pu1_avail[2])
4181
1.96k
    {
4182
1.96k
        pu1_src_left_cpy2++;
4183
1.96k
        pu1_src_left_str2++;
4184
1.96k
        pu1_src_top_cpy = pu1_src;
4185
1.96k
        pu1_src += src_strd;
4186
1.96k
        ht--;
4187
1.96k
    }
4188
9.68k
    if(0 == pu1_avail[3])
4189
1.09k
    {
4190
1.09k
        ht--;
4191
1.09k
    }
4192
4193
4194
9.68k
    const2_16x8b = _mm_set1_epi8(2);
4195
9.68k
    const0_16x8b = _mm_setzero_si128();
4196
4197
4198
    //availability mask creation
4199
9.68k
    u1_avail0 = pu1_avail[0];
4200
9.68k
    u1_avail1 = pu1_avail[1];
4201
9.68k
    au1_mask[0] = u1_avail0;
4202
9.68k
    au1_mask[wd - 1] = u1_avail1;
4203
9.68k
    {
4204
9.68k
        WORD32 ht_rem;
4205
4206
9.68k
        pu1_src_left_cpy = pu1_src_left_cpy2;
4207
9.68k
        pu1_src_left_str = pu1_src_left_str2;
4208
9.68k
        au1_mask_cpy = au1_mask;
4209
25.2k
        for(col = wd; col >= 16; col -= 16)
4210
15.5k
        {
4211
15.5k
            pu1_src_cpy = pu1_src;
4212
15.5k
            src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col + 1));
4213
            //row = 0
4214
15.5k
            src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
4215
4216
            //loading the mask
4217
15.5k
            au1_mask8x16b = _mm_loadu_si128((__m128i *)au1_mask_cpy);
4218
            //separating +ve and and -ve values.
4219
15.5k
            cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
4220
15.5k
            cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
4221
            //creating mask 00 for +ve and -ve values and FF for zero.
4222
15.5k
            cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4223
15.5k
            cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4224
            //combining the appropriate sign change
4225
15.5k
            signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
4226
4227
261k
            for(row = ht; row >= 2; row -= 2)
4228
246k
            {
4229
246k
                left_store_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_left_cpy));
4230
                //row = 1
4231
246k
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
4232
                //to insert left in row 1
4233
246k
                signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 14);
4234
                // row = 0 right
4235
246k
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 1));
4236
4237
                //manipulation for row 1 - row 0
4238
246k
                signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 15);
4239
                //row 0 -row1
4240
                //separating +ve and and -ve values.
4241
246k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
4242
246k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
4243
4244
                //creating mask 00 for +ve and -ve values and FF for zero.
4245
246k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4246
246k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4247
4248
                //combining the appropriate sign change
4249
246k
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(0-1)
4250
                //combining sign-left and sign_right
4251
246k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
4252
4253
                //row1-row0
4254
                //separating +ve and and -ve values.
4255
246k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
4256
246k
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
4257
                //creating mask 00 for +ve and -ve values and FF for zero.
4258
246k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4259
246k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4260
4261
                // row = 2
4262
246k
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
4263
                // row = 1 right
4264
246k
                signdwn1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 1));
4265
246k
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-0)
4266
4267
                //bottom - row1
4268
246k
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
4269
246k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
4270
                //creating mask 00 for +ve and -ve values and FF for zero.
4271
246k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4272
246k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4273
                //for the next iteration bottom -row1
4274
246k
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
4275
4276
                //to insert left in row 1
4277
246k
                signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 13);
4278
                //manipulation for row 1 - bottom
4279
246k
                signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 15);
4280
4281
                //row1 -bottom
4282
246k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
4283
246k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
4284
                //creating mask 00 for +ve and -ve values and FF for zero.
4285
246k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4286
246k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4287
                //combining the appropriate sign change
4288
246k
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
4289
4290
                //combining sign-left and sign_right
4291
246k
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, signdwn1_16x8b);
4292
4293
                //eliminating old left for row 0 and row 1
4294
246k
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
4295
4296
                //row1  getting it right for left of next block
4297
246k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp1_16x8b, 15);
4298
                //adding constant 2
4299
246k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
4300
246k
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
4301
                //shuffle to get sao index
4302
246k
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
4303
246k
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
4304
                //using availability mask
4305
246k
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
4306
246k
                edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
4307
                //shuffle to get sao offset
4308
246k
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
4309
246k
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
4310
                //row0  getting it right for left of next block
4311
246k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
4312
                //copying the next top
4313
246k
                src_top_16x8b = src_temp1_16x8b;
4314
                //cnvert to 16 bit then add and then saturated pack
4315
246k
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
4316
246k
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
4317
246k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
4318
246k
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
4319
246k
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
4320
246k
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
4321
246k
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
4322
246k
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
4323
4324
246k
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
4325
246k
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
4326
246k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
4327
246k
                src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
4328
246k
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
4329
246k
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
4330
246k
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
4331
246k
                src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
4332
                //store left boundary
4333
246k
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
4334
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
4335
246k
                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
4336
                // row = 1
4337
246k
                _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);
4338
4339
246k
                src_temp0_16x8b = src_bottom_16x8b;
4340
246k
                pu1_src_cpy += (src_strd << 1);
4341
246k
                pu1_src_left_cpy += 2;
4342
246k
                pu1_src_left_str += 2;
4343
246k
            }
4344
15.5k
            ht_rem = ht & 0x1;
4345
4346
15.5k
            if(ht_rem)
4347
4.71k
            {
4348
4.71k
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
4349
4.71k
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
4350
                //to insert left in row 1
4351
4.71k
                signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 14);
4352
                //manipulation for row 1 - row 0
4353
4.71k
                signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 15);
4354
4355
                //current row -next row
4356
                //separating +ve and and -ve values.
4357
4.71k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
4358
4.71k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
4359
                //creating mask 00 for +ve and -ve values and FF for zero.
4360
4.71k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4361
4.71k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4362
                //combining the appropriate sign change
4363
4.71k
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
4364
                //adding top and bottom and constant 2
4365
4.71k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
4366
4.71k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
4367
                //eliminating old left for row 0 and row 1
4368
4.71k
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 1);
4369
4370
4.71k
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
4371
                //using availability mask
4372
4.71k
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
4373
4374
4.71k
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
4375
4376
                //row0  getting it right for left of next block
4377
4.71k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
4378
                //copying the next top
4379
4.71k
                src_top_16x8b = src_temp0_16x8b;
4380
                //cnvert to 16 bit then add and then saturated pack
4381
4.71k
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
4382
4.71k
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
4383
4.71k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
4384
4.71k
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
4385
4.71k
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
4386
4.71k
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
4387
4.71k
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
4388
4.71k
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
4389
                //store left boundary
4390
4.71k
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
4391
4392
4.71k
                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
4393
4.71k
                pu1_src_cpy += (src_strd);
4394
4.71k
                src_temp0_16x8b = src_bottom_16x8b;
4395
4.71k
                pu1_src_left_cpy++;
4396
4.71k
                pu1_src_left_str++;
4397
4.71k
            }
4398
15.5k
            {   //for bottom right
4399
15.5k
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
4400
15.5k
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 1);
4401
15.5k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
4402
15.5k
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
4403
15.5k
            }
4404
15.5k
            if(0 == pu1_avail[3])
4405
1.70k
            {
4406
1.70k
                src_top_16x8b = src_bottom_16x8b;
4407
1.70k
            }
4408
            //for the top left of next part of the block
4409
15.5k
            left_store_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
4410
            //updating top flag
4411
15.5k
            _mm_storeu_si128((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
4412
15.5k
            pu1_src += 16;
4413
15.5k
            au1_mask_cpy += 16;
4414
4415
15.5k
            pu1_left_tmp = pu1_src_left_cpy2;
4416
15.5k
            pu1_src_left_cpy2 = pu1_src_left_str2;
4417
15.5k
            pu1_src_left_str2 = pu1_left_tmp;
4418
4419
15.5k
            pu1_src_left_cpy = pu1_src_left_cpy2;
4420
15.5k
            pu1_src_left_str = pu1_src_left_str2;
4421
15.5k
        }
4422
4423
9.68k
        wd_rem = wd & 0xF;
4424
9.68k
        if(wd_rem)
4425
9.24k
        {
4426
9.24k
            pu1_src_cpy = pu1_src;
4427
9.24k
            pu1_src_left_cpy = pu1_src_left_cpy2;
4428
9.24k
            pu1_src_left_str = pu1_src_left_str2;
4429
9.24k
            src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_top_cpy + wd - col + 1));
4430
            //row = 0
4431
9.24k
            src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
4432
9.24k
            au1_mask8x16b = _mm_loadl_epi64((__m128i *)au1_mask_cpy); //????
4433
            //separating +ve and and -ve values.
4434
9.24k
            cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
4435
9.24k
            cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
4436
            //creating mask 00 for +ve and -ve values and FF for zero.
4437
9.24k
            cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4438
9.24k
            cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4439
            //preparing au1_mask
4440
9.24k
            au1_mask8x16b = _mm_unpacklo_epi64(au1_mask8x16b, au1_mask8x16b);
4441
            //combining the appropriate sign change
4442
9.24k
            signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
4443
9.24k
            signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
4444
4445
80.8k
            for(row = ht; row >= 4; row -= 4)
4446
71.6k
            {
4447
71.6k
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
4448
                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
4449
71.6k
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
4450
                // row = 2
4451
71.6k
                src_bottom_16x8b =  _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
4452
                //manipulation for row 0 -row 1
4453
71.6k
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 14);
4454
                //row 1 left
4455
71.6k
                signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 15);
4456
                //row 0 -row1
4457
                //separating +ve and and -ve values.
4458
71.6k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
4459
71.6k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
4460
4461
                //creating mask 00 for +ve and -ve values and FF for zero.
4462
71.6k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4463
71.6k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4464
                //manipulatiing for row 1 -row 0
4465
71.6k
                signdwn1_16x8b = _mm_srli_si128(src_temp0_16x8b, 1);
4466
                //combining the appropriate sign change
4467
71.6k
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
4468
                //row 1 -row0
4469
                //separating +ve and and -ve values.
4470
71.6k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
4471
71.6k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
4472
4473
                //creating mask 00 for +ve and -ve values and FF for zero.
4474
71.6k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4475
71.6k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4476
                //row1-row0
4477
71.6k
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
4478
4479
71.6k
                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
4480
4481
71.6k
                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
4482
                //manipulation for row 1 -row 2
4483
71.6k
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 13);
4484
                //row 2 left
4485
71.6k
                signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 15);
4486
                //packing row 0 n row 1
4487
71.6k
                src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, src_temp1_16x8b);
4488
                //row1 -row2
4489
71.6k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
4490
71.6k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
4491
                //creating mask 00 for +ve and -ve values and FF for zero.
4492
71.6k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4493
71.6k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4494
                //combining the appropriate sign change
4495
71.6k
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
4496
71.6k
                edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
4497
4498
                //row 1 right
4499
71.6k
                signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 1);
4500
                //row = 3
4501
71.6k
                src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 3 * src_strd));
4502
4503
                // row = 4
4504
71.6k
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 4 * src_strd));
4505
4506
71.6k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
4507
4508
                //separating +ve and and -ve values.(2,1)
4509
71.6k
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
4510
71.6k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
4511
4512
                //creating mask 00 for +ve and -ve values and FF for zero.
4513
71.6k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4514
71.6k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4515
                //row 2 right
4516
71.6k
                signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 1);
4517
                //combining the appropriate sign change
4518
71.6k
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1)
4519
4520
                //separating +ve and and -ve values.(3,2)
4521
71.6k
                cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
4522
71.6k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
4523
71.6k
                signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8); //aligned left (2-1)
4524
                //creating mask 00 for +ve and -ve values and FF for zero.
4525
71.6k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4526
71.6k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4527
                //manipulation for row 2 -row 3
4528
71.6k
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 12);
4529
                //row 3 left
4530
71.6k
                signdwn1_16x8b = _mm_alignr_epi8(src_top_16x8b, signdwn1_16x8b, 15);
4531
                //combining the appropriate sign change
4532
71.6k
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-2)
4533
4534
71.6k
                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(3-2) ,(2-1)
4535
4536
                //separating +ve and and -ve values.(2,3)
4537
71.6k
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
4538
71.6k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
4539
4540
                //manipulation for row 3 -bottom
4541
71.6k
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 11);
4542
                //bottom left
4543
71.6k
                signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 15);
4544
4545
                //creating mask 00 for +ve and -ve values and FF for zero.
4546
71.6k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4547
71.6k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4548
                //combining the appropriate sign change
4549
71.6k
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-3)
4550
4551
                //separating +ve and and -ve values.(3,bottom)
4552
71.6k
                cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
4553
71.6k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
4554
4555
                //creating mask 00 for +ve and -ve values and FF for zero.
4556
71.6k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4557
71.6k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4558
71.6k
                edge1_16x8b = _mm_slli_si128(edge1_16x8b, 8); //aligned left (2-3)
4559
                //combining the appropriate sign change
4560
71.6k
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-bottom)
4561
71.6k
                edge1_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge1_16x8b, 8); //(3-bottom),(2-3)
4562
4563
4564
                //eliminating old left for row 0,1,2,3
4565
71.6k
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 4);
4566
                //packing row 2 n row 3
4567
71.6k
                src_bottom_16x8b = _mm_unpacklo_epi64(src_bottom_16x8b, src_top_16x8b);
4568
                //row 3 right
4569
71.6k
                signdwn1_16x8b = _mm_srli_si128(src_top_16x8b, 1);
4570
                //loading row 3 right into left
4571
71.6k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_bottom_16x8b, 15);
4572
                //adding bottom and top values of row 2 and row 3
4573
71.6k
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, signup0_16x8b); //(3,2)
4574
                //separating +ve and and -ve values.(botttom,3)
4575
71.6k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
4576
71.6k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
4577
                //to store right of row 2
4578
71.6k
                signdwn1_16x8b = _mm_slli_si128(src_bottom_16x8b, 8);
4579
                //creating mask 00 for +ve and -ve values and FF for zero.
4580
71.6k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4581
71.6k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4582
71.6k
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(bottom -3) for next iteration
4583
4584
                //storing right of row 2into left
4585
71.6k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
4586
                //to store right of row 0
4587
71.6k
                signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
4588
                //storing right of row 1 into left
4589
71.6k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
4590
4591
                //adding constant 2
4592
71.6k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
4593
71.6k
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
4594
                //shuffle to get sao index
4595
71.6k
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
4596
71.6k
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
4597
                //using availability mask
4598
71.6k
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
4599
71.6k
                edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
4600
                //shuffle to get sao offset
4601
71.6k
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
4602
71.6k
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
4603
4604
                //storing right of row 0 into left
4605
71.6k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
4606
                //cnvert to 16 bit then add and then saturated pack
4607
71.6k
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
4608
71.6k
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
4609
71.6k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
4610
71.6k
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
4611
71.6k
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
4612
71.6k
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
4613
71.6k
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
4614
71.6k
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
4615
4616
71.6k
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
4617
71.6k
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_bottom_16x8b, const0_16x8b);
4618
71.6k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
4619
71.6k
                src_bottom_16x8b = _mm_unpackhi_epi8(src_bottom_16x8b, const0_16x8b);
4620
71.6k
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
4621
71.6k
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
4622
71.6k
                src_bottom_16x8b = _mm_add_epi16(src_bottom_16x8b, cmp_lt0_16x8b);
4623
71.6k
                src_bottom_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_bottom_16x8b);
4624
4625
71.6k
                cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
4626
71.6k
                cmp_lt0_16x8b = _mm_srli_si128(src_bottom_16x8b, 8);
4627
4628
71.6k
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
4629
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
4630
71.6k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
4631
                // row = 1
4632
71.6k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
4633
                //row = 2
4634
71.6k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_bottom_16x8b);
4635
                // row = 3
4636
71.6k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b);
4637
4638
71.6k
                src_temp0_16x8b = src_temp1_16x8b;
4639
71.6k
                signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
4640
71.6k
                pu1_src_cpy += (src_strd << 2);
4641
71.6k
                pu1_src_left_cpy += 4;
4642
71.6k
                pu1_src_left_str += 4;
4643
71.6k
            }
4644
9.24k
            ht_rem = ht & 0x2;
4645
9.24k
            if(ht_rem)
4646
2.91k
            {
4647
2.91k
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
4648
                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
4649
2.91k
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
4650
                // row = 2
4651
2.91k
                src_bottom_16x8b =  _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
4652
4653
                //manipulation for row 0 -row 1
4654
2.91k
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 14);
4655
                //bottom left
4656
2.91k
                signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 15);
4657
                //separating +ve and -ve values.
4658
2.91k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
4659
2.91k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
4660
4661
                //creating mask 00 for +ve and -ve values and FF for zero.
4662
2.91k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4663
2.91k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4664
                //manipulation for row 1 - row 0
4665
2.91k
                signdwn1_16x8b = _mm_srli_si128(src_temp0_16x8b, 1);
4666
                //combining the appropriate sign change
4667
2.91k
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
4668
4669
                //row1-row0
4670
                //separating +ve and -ve values.
4671
2.91k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
4672
2.91k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
4673
4674
                //creating mask 00 for +ve and -ve values and FF for zero.
4675
2.91k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4676
2.91k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4677
                //combining the appropriate sign change
4678
2.91k
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
4679
4680
                //manipulation for row 1 -bottom
4681
2.91k
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 13);
4682
                //bottom left
4683
2.91k
                signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 15);
4684
4685
2.91k
                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
4686
2.91k
                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
4687
                //row1 -bottom
4688
2.91k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
4689
2.91k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
4690
4691
                //creating mask 00 for +ve and -ve values and FF for zero.
4692
2.91k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4693
2.91k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4694
                //combining the appropriate sign change
4695
2.91k
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
4696
2.91k
                edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
4697
                //manipulation for bottom- row 1 (row 1 right)
4698
2.91k
                signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 1);
4699
                //adding top and bottom subtraction
4700
2.91k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
4701
                //bottom - row 1
4702
2.91k
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
4703
2.91k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
4704
4705
                //eliminating old left for row 0,1
4706
2.91k
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
4707
2.91k
                signdwn1_16x8b = _mm_slli_si128(src_temp1_16x8b, 8);
4708
                //creating mask 00 for +ve and -ve values and FF for zero.
4709
2.91k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4710
2.91k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4711
                //for the next iteration signup0_16x8b
4712
2.91k
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1) for next
4713
4714
                //storing right of row 1 into left
4715
2.91k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
4716
                //for storing right of row 1
4717
2.91k
                signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
4718
4719
2.91k
                src_top_16x8b = src_temp1_16x8b;
4720
                //storing right of row 0 into left
4721
2.91k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
4722
4723
                //adding constant 2
4724
2.91k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
4725
4726
                //shuffle to get sao index
4727
2.91k
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
4728
                //using availability mask
4729
2.91k
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
4730
                //shuffle to get sao offset
4731
2.91k
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
4732
4733
                //the next top already in  src_top_16x8b
4734
                //convert to 16 bit, then add, then saturated pack
4735
2.91k
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
4736
2.91k
                src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
4737
2.91k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
4738
2.91k
                src_temp1_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
4739
2.91k
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
4740
2.91k
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
4741
2.91k
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
4742
2.91k
                src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, src_temp1_16x8b);
4743
4744
2.91k
                cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
4745
4746
2.91k
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
4747
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
4748
2.91k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
4749
                // row = 1
4750
2.91k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
4751
2.91k
                src_temp0_16x8b = src_bottom_16x8b;
4752
2.91k
                pu1_src_cpy += (src_strd << 1);
4753
2.91k
                pu1_src_left_cpy += 2;
4754
2.91k
                pu1_src_left_str += 2;
4755
2.91k
            }
4756
9.24k
            ht_rem = ht & 0x1;
4757
9.24k
            if(ht_rem)
4758
2.87k
            {
4759
2.87k
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
4760
                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
4761
2.87k
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
4762
4763
4764
                //manipulation for row 0 -bottom
4765
2.87k
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 14);
4766
                //bottom left
4767
2.87k
                signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 15);
4768
                //separating +ve and -ve values.
4769
2.87k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
4770
2.87k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
4771
                //creating mask 00 for +ve and -ve values and FF for zero.
4772
2.87k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4773
2.87k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4774
                //combining the appropriate sign change
4775
2.87k
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
4776
                //adding top and bottom subtraction
4777
2.87k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
4778
                //for row 0 right to put into left store
4779
2.87k
                signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
4780
                //adding constant 2
4781
2.87k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
4782
2.87k
                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8);
4783
2.87k
                edge0_16x8b = _mm_srli_si128(edge0_16x8b, 8);
4784
                //left store manipulation 1
4785
2.87k
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 1);
4786
                //filling the left boundary value
4787
2.87k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
4788
4789
                //shuffle to get sao index
4790
2.87k
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
4791
                //using availability mask
4792
2.87k
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
4793
                //shuffle to get sao offset
4794
2.87k
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
4795
2.87k
                src_top_16x8b = src_temp0_16x8b;
4796
                //convert to 16 bit, then add, then saturated pack
4797
2.87k
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
4798
2.87k
                src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
4799
2.87k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
4800
2.87k
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
4801
2.87k
                src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, const0_16x8b);
4802
4803
2.87k
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
4804
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
4805
2.87k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
4806
2.87k
                pu1_src_cpy += (src_strd);
4807
2.87k
                src_temp0_16x8b = src_bottom_16x8b;
4808
2.87k
                pu1_src_left_cpy++;
4809
2.87k
                pu1_src_left_str++;
4810
2.87k
            }
4811
9.24k
            {   //for bottom right
4812
9.24k
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
4813
9.24k
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 1);
4814
9.24k
                src_temp0_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
4815
9.24k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
4816
9.24k
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
4817
9.24k
            }
4818
9.24k
            if(0 == pu1_avail[3])
4819
1.06k
            {
4820
1.06k
                src_top_16x8b = src_bottom_16x8b;
4821
1.06k
            }
4822
9.24k
            _mm_storel_epi64((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
4823
9.24k
            pu1_src += 8;
4824
4825
9.24k
            pu1_left_tmp = pu1_src_left_cpy2;
4826
9.24k
            pu1_src_left_cpy2 = pu1_src_left_str2;
4827
9.24k
            pu1_src_left_str2 = pu1_left_tmp;
4828
4829
9.24k
            pu1_src_left_cpy = pu1_src_left_cpy2;
4830
9.24k
            pu1_src_left_str = pu1_src_left_str2;
4831
4832
9.24k
        }
4833
9.68k
        pu1_src_org[wd - 1] = u1_pos_wd_0_tmp;
4834
9.68k
        pu1_src_org[(ht_tmp - 1) * src_strd] = u1_pos_0_ht_tmp;
4835
9.68k
        pu1_src_left_cpy = (0 == pu1_avail[2]) ? (pu1_src_left_cpy - 1) : pu1_src_left_cpy;
4836
9.68k
        pu1_src_left[0] = au1_src_left_tmp[0];
4837
312k
        for(row = 1; row < ht_tmp; row++)
4838
302k
        {
4839
302k
            pu1_src_left[row] = pu1_src_left_cpy[row];
4840
302k
        }
4841
9.68k
    }
4842
4843
9.68k
}
4844
4845
void ihevc_sao_edge_offset_class3_chroma_ssse3(UWORD8 *pu1_src,
4846
                                               WORD32 src_strd,
4847
                                               UWORD8 *pu1_src_left,
4848
                                               UWORD8 *pu1_src_top,
4849
                                               UWORD8 *pu1_src_top_left,
4850
                                               UWORD8 *pu1_src_top_right,
4851
                                               UWORD8 *pu1_src_bot_left,
4852
                                               UWORD8 *pu1_avail,
4853
                                               WORD8 *pi1_sao_offset_u,
4854
                                               WORD8 *pi1_sao_offset_v,
4855
                                               WORD32 wd,
4856
                                               WORD32 ht)
4857
7.64k
{
4858
7.64k
    WORD32 row, col;
4859
7.64k
    UWORD8 *pu1_src_top_cpy, *pu1_src_left_cpy, *pu1_src_left_cpy2;
4860
7.64k
    UWORD8 *pu1_src_cpy, *pu1_src_org;
4861
7.64k
    UWORD8 au1_src_left_tmp[2 * (MAX_CTB_SIZE + 8)];
4862
7.64k
    UWORD8 au1_mask[MAX_CTB_SIZE], *au1_mask_cpy;
4863
7.64k
    WORD32 wd_rem;
4864
7.64k
    UWORD8 u1_pos_wd_0_tmp_u, u1_pos_wd_0_tmp_v, u1_pos_0_ht_tmp_u, u1_pos_0_ht_tmp_v;
4865
7.64k
    WORD32 ht_tmp;
4866
7.64k
    WORD32 bit_depth;
4867
7.64k
    UWORD8 u1_avail0, u1_avail1;
4868
4869
7.64k
    __m128i src_top_16x8b, src_bottom_16x8b;
4870
7.64k
    __m128i src_temp0_16x8b, src_temp1_16x8b;
4871
7.64k
    __m128i signup0_16x8b, signdwn1_16x8b;
4872
7.64k
    __m128i cmp_gt0_16x8b, cmp_lt0_16x8b;
4873
7.64k
    __m128i edge0_16x8b, edge1_16x8b;
4874
7.64k
    __m128i au1_mask8x16b;
4875
7.64k
    __m128i edge_idx_8x16b, sao_offset_8x16b;
4876
7.64k
    __m128i left_store_16x8b;
4877
7.64k
    __m128i const0_16x8b, const2_16x8b;
4878
7.64k
    __m128i chroma_offset_8x16b;
4879
4880
7.64k
    ht_tmp = ht;
4881
7.64k
    au1_mask8x16b = _mm_set1_epi8(0xff);
4882
4883
4884
7.64k
    au1_src_left_tmp[0] = pu1_src[(wd - 2)];
4885
7.64k
    au1_src_left_tmp[1] = pu1_src[(wd - 1)];
4886
    //manipulation for bottom left
4887
239k
    for(row = 2; row < 2 * ht; row++)
4888
231k
    {
4889
231k
        au1_src_left_tmp[row] = pu1_src_left[row];
4890
231k
    }
4891
7.64k
    au1_src_left_tmp[2 * ht] = pu1_src_bot_left[0];
4892
7.64k
    au1_src_left_tmp[2 * ht + 1] = pu1_src_bot_left[1];
4893
4894
7.64k
    pu1_src_top_left[0] = pu1_src_top[wd - 2];
4895
7.64k
    pu1_src_top_left[1] = pu1_src_top[wd - 1];
4896
    //setting availability mask to ff size MAX_CTB_SIZE
4897
38.2k
    for(col = 0; col < MAX_CTB_SIZE; col += 16)
4898
30.5k
        _mm_storeu_si128((__m128i *)(au1_mask + col), au1_mask8x16b);
4899
7.64k
    bit_depth = BIT_DEPTH_LUMA;
4900
7.64k
    pu1_src_org = pu1_src;
4901
7.64k
    pu1_src_top_cpy = pu1_src_top;
4902
7.64k
    pu1_src_left_cpy2 = au1_src_left_tmp;
4903
7.64k
    pu1_src_left_cpy = au1_src_left_tmp;
4904
7.64k
    edge_idx_8x16b   = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
4905
7.64k
    sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_u);
4906
7.64k
    const0_16x8b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_v);
4907
7.64k
    chroma_offset_8x16b = _mm_set1_epi16(0x0800);
4908
    /* If top-right is available, process separately */
4909
7.64k
    if(0 != pu1_avail[5])
4910
6.53k
    {
4911
6.53k
        WORD32 edge_idx;
4912
4913
        /* U */
4914
6.53k
        edge_idx = 2 + SIGN(pu1_src[wd - 2] - pu1_src_top_right[0]) +
4915
6.53k
                        SIGN(pu1_src[wd - 2] - pu1_src[wd - 2 - 2 + src_strd]);
4916
4917
6.53k
        edge_idx = gi1_table_edge_idx[edge_idx];
4918
4919
6.53k
        if(0 != edge_idx)
4920
1.82k
        {
4921
1.82k
            u1_pos_wd_0_tmp_u = CLIP3(pu1_src[wd - 2] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1);
4922
1.82k
        }
4923
4.71k
        else
4924
4.71k
        {
4925
4.71k
            u1_pos_wd_0_tmp_u = pu1_src[wd - 2];
4926
4.71k
        }
4927
4928
        /* V */
4929
6.53k
        edge_idx = 2 + SIGN(pu1_src[wd - 1] - pu1_src_top_right[1]) +
4930
6.53k
                        SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 2 + src_strd]);
4931
4932
6.53k
        edge_idx = gi1_table_edge_idx[edge_idx];
4933
4934
6.53k
        if(0 != edge_idx)
4935
1.93k
        {
4936
1.93k
            u1_pos_wd_0_tmp_v = CLIP3(pu1_src[wd - 1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1);
4937
1.93k
        }
4938
4.60k
        else
4939
4.60k
        {
4940
4.60k
            u1_pos_wd_0_tmp_v = pu1_src[wd - 1];
4941
4.60k
        }
4942
6.53k
    }
4943
1.11k
    else
4944
1.11k
    {
4945
1.11k
        u1_pos_wd_0_tmp_u = pu1_src[wd - 2];
4946
1.11k
        u1_pos_wd_0_tmp_v = pu1_src[wd - 1];
4947
1.11k
    }
4948
4949
    /* If bottom-left is available, process separately */
4950
7.64k
    if(0 != pu1_avail[6])
4951
6.42k
    {
4952
6.42k
        WORD32 edge_idx;
4953
4954
        /* U */
4955
6.42k
        edge_idx = 2 + SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd + 2 - src_strd]) +
4956
6.42k
                        SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0]);
4957
4958
6.42k
        edge_idx = gi1_table_edge_idx[edge_idx];
4959
4960
6.42k
        if(0 != edge_idx)
4961
1.73k
        {
4962
1.73k
            u1_pos_0_ht_tmp_u = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1);
4963
1.73k
        }
4964
4.69k
        else
4965
4.69k
        {
4966
4.69k
            u1_pos_0_ht_tmp_u = pu1_src[(ht - 1) * src_strd];
4967
4.69k
        }
4968
4969
        /* V */
4970
6.42k
        edge_idx = 2 + SIGN(pu1_src[(ht - 1) * src_strd + 1] - pu1_src[(ht - 1) * src_strd + 1 + 2 - src_strd]) +
4971
6.42k
                        SIGN(pu1_src[(ht - 1) * src_strd + 1] - pu1_src_bot_left[1]);
4972
4973
6.42k
        edge_idx = gi1_table_edge_idx[edge_idx];
4974
4975
6.42k
        if(0 != edge_idx)
4976
1.73k
        {
4977
1.73k
            u1_pos_0_ht_tmp_v = CLIP3(pu1_src[(ht - 1) * src_strd + 1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1);
4978
1.73k
        }
4979
4.68k
        else
4980
4.68k
        {
4981
4.68k
            u1_pos_0_ht_tmp_v = pu1_src[(ht - 1) * src_strd + 1];
4982
4.68k
        }
4983
6.42k
    }
4984
1.22k
    else
4985
1.22k
    {
4986
1.22k
        u1_pos_0_ht_tmp_u = pu1_src[(ht - 1) * src_strd];
4987
1.22k
        u1_pos_0_ht_tmp_v = pu1_src[(ht - 1) * src_strd + 1];
4988
1.22k
    }
4989
4990
4991
4992
    /* Update height and source pointers based on the availability flags */
4993
7.64k
    if(0 == pu1_avail[2])
4994
765
    {
4995
765
        pu1_src_left_cpy2 += 2;
4996
765
        pu1_src_top_cpy = pu1_src;
4997
765
        pu1_src += src_strd;
4998
765
        ht--;
4999
765
    }
5000
7.64k
    if(0 == pu1_avail[3])
5001
823
    {
5002
823
        ht--;
5003
823
    }
5004
5005
7.64k
    sao_offset_8x16b = _mm_unpacklo_epi64(sao_offset_8x16b, const0_16x8b);
5006
7.64k
    const2_16x8b = _mm_set1_epi8(2);
5007
7.64k
    const0_16x8b = _mm_setzero_si128();
5008
5009
5010
    //availability mask creation
5011
7.64k
    u1_avail0 = pu1_avail[0];
5012
7.64k
    u1_avail1 = pu1_avail[1];
5013
7.64k
    au1_mask[0] = u1_avail0;
5014
7.64k
    au1_mask[1] = u1_avail0;
5015
7.64k
    au1_mask[wd - 1] = u1_avail1;
5016
7.64k
    au1_mask[wd - 2] = u1_avail1;
5017
7.64k
    {
5018
7.64k
        WORD32 ht_rem;
5019
7.64k
        au1_mask_cpy = au1_mask;
5020
23.4k
        for(col = wd; col >= 16; col -= 16)
5021
15.8k
        {
5022
15.8k
            pu1_src_cpy = pu1_src;
5023
15.8k
            src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col + 2));
5024
            //row = 0
5025
15.8k
            src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
5026
5027
            //loading the mask
5028
15.8k
            au1_mask8x16b = _mm_loadu_si128((__m128i *)au1_mask_cpy);
5029
            //separating +ve and -ve values.
5030
15.8k
            cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
5031
15.8k
            cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
5032
            //creating mask 00 for +ve and -ve values and FF for zero.
5033
15.8k
            cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5034
15.8k
            cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5035
            //combining the appropriate sign change
5036
15.8k
            signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
5037
15.8k
            pu1_src_left_cpy = pu1_src_left_cpy2;
5038
5039
140k
            for(row = ht; row >= 2; row -= 2)
5040
124k
            {
5041
124k
                left_store_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_left_cpy));
5042
                //row = 1
5043
124k
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
5044
                //to insert left in row 1
5045
124k
                signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 12);
5046
                // row = 0 right
5047
124k
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2));
5048
5049
                //manipulation for row 1 - row 0
5050
124k
                signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 14);
5051
                //row 0 -row1
5052
                //separating +ve and -ve values.
5053
124k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
5054
124k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
5055
5056
                //creating mask 00 for +ve and -ve values and FF for zero.
5057
124k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5058
124k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5059
5060
                //combining the appropriate sign change
5061
124k
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(0-1)
5062
                //combining sign-left and sign_right
5063
124k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
5064
5065
                //row1-row0
5066
                //separating +ve and -ve values.
5067
124k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
5068
124k
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
5069
                //creating mask 00 for +ve and -ve values and FF for zero.
5070
124k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5071
124k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5072
5073
                // row = 2
5074
124k
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
5075
                // row = 1 right
5076
124k
                signdwn1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 2));
5077
124k
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-0)
5078
5079
                //bottom - row1
5080
124k
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
5081
124k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
5082
                //creating mask 00 for +ve and -ve values and FF for zero.
5083
124k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5084
124k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5085
                //for the next iteration bottom -row1
5086
124k
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
5087
5088
                //to insert left in row 1
5089
124k
                signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 10);
5090
                //manipulation for row 1 - bottom
5091
124k
                signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 14);
5092
5093
                //row1 -bottom
5094
124k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
5095
124k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
5096
                //creating mask 00 for +ve and -ve values and FF for zero.
5097
124k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5098
124k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5099
                //combining the appropriate sign change
5100
124k
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
5101
5102
                //combining sign-left and sign_right
5103
124k
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, signdwn1_16x8b);
5104
5105
                //eliminating old left for row 0 and row 1
5106
124k
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 4);
5107
                //row1  getting it right for left of next block
5108
124k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp1_16x8b, 14);
5109
                //row0  getting it right for left of next block
5110
124k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
5111
                //copying the next top
5112
124k
                src_top_16x8b = src_temp1_16x8b;
5113
5114
5115
                //adding constant 2
5116
124k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
5117
124k
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
5118
                //shuffle to get sao index
5119
124k
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
5120
124k
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
5121
                //using availability mask
5122
124k
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
5123
124k
                edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
5124
5125
                //adding chroma offset to access U and V
5126
124k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
5127
124k
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);
5128
5129
                //shuffle to get sao offset
5130
124k
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
5131
124k
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
5132
                //cnvert to 16 bit then add and then saturated pack
5133
124k
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
5134
124k
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
5135
124k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
5136
124k
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
5137
124k
                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
5138
124k
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
5139
124k
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
5140
124k
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
5141
5142
124k
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
5143
124k
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
5144
124k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
5145
124k
                src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
5146
124k
                edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
5147
124k
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
5148
124k
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge1_16x8b);
5149
124k
                src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
5150
                //store left boundary
5151
124k
                _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);
5152
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
5153
124k
                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
5154
                // row = 1
5155
124k
                _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);
5156
5157
124k
                src_temp0_16x8b = src_bottom_16x8b;
5158
124k
                pu1_src_cpy += (src_strd << 1);
5159
124k
                pu1_src_left_cpy += 4;
5160
124k
            }
5161
15.8k
            ht_rem = ht & 0x1;
5162
5163
15.8k
            if(ht_rem)
5164
3.15k
            {
5165
3.15k
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
5166
3.15k
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
5167
                //to insert left in row 1
5168
3.15k
                signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 12);
5169
                //manipulation for row 1 - row 0
5170
3.15k
                signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 14);
5171
5172
                //current row -next row
5173
                //separating +ve and and -ve values.
5174
3.15k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
5175
3.15k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
5176
                //creating mask 00 for +ve and -ve values and FF for zero.
5177
3.15k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5178
3.15k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5179
                //combining the appropriate sign change
5180
3.15k
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
5181
                //adding top and bottom and constant 2
5182
3.15k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
5183
3.15k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
5184
                //eliminating old left for row 0 and row 1
5185
3.15k
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
5186
                //row0  getting it right for left of next block
5187
3.15k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
5188
                //copying the next top
5189
3.15k
                src_top_16x8b = src_temp0_16x8b;
5190
5191
3.15k
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
5192
                //using availability mask
5193
3.15k
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
5194
5195
                //adding chroma offset to access U and V
5196
3.15k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
5197
5198
5199
3.15k
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
5200
5201
                //cnvert to 16 bit then add and then saturated pack
5202
3.15k
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
5203
3.15k
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
5204
3.15k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
5205
3.15k
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
5206
3.15k
                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
5207
3.15k
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
5208
3.15k
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
5209
3.15k
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
5210
5211
                //store left boundary
5212
3.15k
                _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);
5213
5214
3.15k
                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
5215
3.15k
                pu1_src_cpy += (src_strd);
5216
3.15k
                src_temp0_16x8b = src_bottom_16x8b;
5217
3.15k
                pu1_src_left_cpy += 2;
5218
3.15k
            }
5219
15.8k
            {   //for bottom right
5220
15.8k
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
5221
15.8k
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
5222
15.8k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
5223
15.8k
                _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);
5224
15.8k
            }
5225
15.8k
            if(0 == pu1_avail[3])
5226
1.65k
            {
5227
1.65k
                src_top_16x8b = src_bottom_16x8b;
5228
1.65k
            }
5229
            //for the top left of next part of the block
5230
15.8k
            left_store_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
5231
            //updating top flag
5232
15.8k
            _mm_storeu_si128((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
5233
15.8k
            pu1_src += 16;
5234
15.8k
            au1_mask_cpy += 16;
5235
15.8k
        }
5236
7.64k
        pu1_src_left_cpy = pu1_src_left_cpy2;
5237
7.64k
        wd_rem = wd & 0xF;
5238
7.64k
        if(wd_rem)
5239
66
        {
5240
66
            pu1_src_cpy = pu1_src;
5241
66
            src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_top_cpy + wd - col + 2));
5242
            //row = 0
5243
66
            src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
5244
66
            au1_mask8x16b = _mm_loadl_epi64((__m128i *)au1_mask_cpy); //????
5245
            //separating +ve and and -ve values.
5246
66
            cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
5247
66
            cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
5248
            //creating mask 00 for +ve and -ve values and FF for zero.
5249
66
            cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5250
66
            cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5251
            //preparing au1_mask
5252
66
            au1_mask8x16b = _mm_unpacklo_epi64(au1_mask8x16b, au1_mask8x16b);
5253
            //combining the appropriate sign change
5254
66
            signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
5255
66
            signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
5256
66
            pu1_src_left_cpy = pu1_src_left_cpy2;
5257
296
            for(row = ht; row >= 4; row -= 4)
5258
230
            {
5259
230
                left_store_16x8b = _mm_loadu_si128((__m128i *)pu1_src_left_cpy);
5260
                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
5261
230
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
5262
                // row = 2
5263
230
                src_bottom_16x8b =  _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
5264
                //manipulation for row 0 -row 1
5265
230
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 12);
5266
                //row 1 left
5267
230
                signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 14);
5268
                //row 0 -row1
5269
                //separating +ve and and -ve values.
5270
230
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
5271
230
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
5272
5273
                //creating mask 00 for +ve and -ve values and FF for zero.
5274
230
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5275
230
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5276
                //manipulatiing for row 1 -row 0
5277
230
                signdwn1_16x8b = _mm_srli_si128(src_temp0_16x8b, 2);
5278
                //combining the appropriate sign change
5279
230
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
5280
                //row 1 -row0
5281
                //separating +ve and and -ve values.
5282
230
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
5283
230
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
5284
5285
                //creating mask 00 for +ve and -ve values and FF for zero.
5286
230
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5287
230
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5288
                //row1-row0
5289
230
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
5290
5291
230
                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
5292
5293
230
                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
5294
                //manipulation for row 1 -row 2
5295
230
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 10);
5296
                //row 2 left
5297
230
                signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 14);
5298
                //packing row 0 n row 1
5299
230
                src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, src_temp1_16x8b);
5300
                //row1 -row2
5301
230
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
5302
230
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
5303
                //creating mask 00 for +ve and -ve values and FF for zero.
5304
230
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5305
230
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5306
                //combining the appropriate sign change
5307
230
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
5308
230
                edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
5309
5310
                //row 1 right
5311
230
                signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);
5312
                //row = 3
5313
230
                src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 3 * src_strd));
5314
5315
                // row = 4
5316
230
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 4 * src_strd));
5317
5318
230
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
5319
5320
                //separating +ve and and -ve values.(2,1)
5321
230
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
5322
230
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
5323
5324
                //creating mask 00 for +ve and -ve values and FF for zero.
5325
230
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5326
230
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5327
                //row 2 right
5328
230
                signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 2);
5329
                //combining the appropriate sign change
5330
230
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1)
5331
5332
                //separating +ve and and -ve values.(3,2)
5333
230
                cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
5334
230
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
5335
230
                signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8); //aligned left (2-1)
5336
                //creating mask 00 for +ve and -ve values and FF for zero.
5337
230
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5338
230
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5339
                //manipulation for row 2 -row 3
5340
230
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 8);
5341
                //row 3 left
5342
230
                signdwn1_16x8b = _mm_alignr_epi8(src_top_16x8b, signdwn1_16x8b, 14);
5343
                //combining the appropriate sign change
5344
230
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-2)
5345
5346
230
                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(3-2) ,(2-1)
5347
5348
                //separating +ve and and -ve values.(2,3)
5349
230
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
5350
230
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
5351
5352
                //manipulation for row 3 -bottom
5353
230
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 6);
5354
                //bottom left
5355
230
                signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 14);
5356
5357
                //creating mask 00 for +ve and -ve values and FF for zero.
5358
230
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5359
230
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5360
                //combining the appropriate sign change
5361
230
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-3)
5362
5363
                //separating +ve and and -ve values.(3,bottom)
5364
230
                cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
5365
230
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
5366
5367
                //creating mask 00 for +ve and -ve values and FF for zero.
5368
230
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5369
230
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5370
230
                edge1_16x8b = _mm_slli_si128(edge1_16x8b, 8); //aligned left (2-3)
5371
                //combining the appropriate sign change
5372
230
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-bottom)
5373
230
                edge1_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge1_16x8b, 8); //(3-bottom),(2-3)
5374
5375
5376
                //eliminating old left for row 0,1,2,3
5377
230
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 8);
5378
                //packing row 2 n row 3
5379
230
                src_bottom_16x8b = _mm_unpacklo_epi64(src_bottom_16x8b, src_top_16x8b);
5380
                //row 3 right
5381
230
                signdwn1_16x8b = _mm_srli_si128(src_top_16x8b, 2);
5382
                //loading row 3 right into left
5383
230
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_bottom_16x8b, 14);
5384
                //adding bottom and top values of row 2 and row 3
5385
230
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, signup0_16x8b); //(3,2)
5386
                //separating +ve and and -ve values.(botttom,3)
5387
230
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
5388
230
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
5389
                //to store right of row 2
5390
230
                signdwn1_16x8b = _mm_slli_si128(src_bottom_16x8b, 8);
5391
                //creating mask 00 for +ve and -ve values and FF for zero.
5392
230
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5393
230
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5394
230
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(bottom -3) for next iteration
5395
5396
                //storing right of row 2into left
5397
230
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
5398
                //to store right of row 0
5399
230
                signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
5400
                //storing right of row 1 into left
5401
230
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
5402
                //storing right of row 0 into left
5403
230
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
5404
5405
5406
                //adding constant 2
5407
230
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
5408
230
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
5409
                //shuffle to get sao index
5410
230
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
5411
230
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
5412
                //using availability mask
5413
230
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
5414
230
                edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
5415
                //adding chroma offset to access U and V
5416
230
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
5417
230
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);
5418
                //shuffle to get sao offset
5419
230
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
5420
230
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
5421
5422
                //cnvert to 16 bit then add and then saturated pack
5423
230
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
5424
230
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
5425
230
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
5426
230
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
5427
230
                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
5428
230
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
5429
230
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
5430
230
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
5431
5432
230
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
5433
230
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_bottom_16x8b, const0_16x8b);
5434
230
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
5435
230
                src_bottom_16x8b = _mm_unpackhi_epi8(src_bottom_16x8b, const0_16x8b);
5436
230
                edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
5437
230
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
5438
230
                src_bottom_16x8b = _mm_add_epi16(src_bottom_16x8b, edge1_16x8b);
5439
230
                src_bottom_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_bottom_16x8b);
5440
5441
230
                cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
5442
230
                cmp_lt0_16x8b = _mm_srli_si128(src_bottom_16x8b, 8);
5443
230
                _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);
5444
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
5445
230
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
5446
                // row = 1
5447
230
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
5448
                //row = 2
5449
230
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_bottom_16x8b);
5450
                // row = 3
5451
230
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b);
5452
5453
230
                src_temp0_16x8b = src_temp1_16x8b;
5454
230
                signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
5455
230
                pu1_src_cpy += (src_strd << 2);
5456
230
                pu1_src_left_cpy += 8;
5457
230
            }
5458
66
            ht_rem = ht & 0x2;
5459
66
            if(ht_rem)
5460
34
            {
5461
34
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
5462
                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
5463
34
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
5464
                // row = 2
5465
34
                src_bottom_16x8b =  _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
5466
5467
                //manipulation for row 0 -row 1
5468
34
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 12);
5469
                //bottom left
5470
34
                signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 14);
5471
                //separating +ve and and -ve values.
5472
34
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
5473
34
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
5474
5475
                //creating mask 00 for +ve and -ve values and FF for zero.
5476
34
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5477
34
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5478
                //manipulation for row 1 - row 0
5479
34
                signdwn1_16x8b = _mm_srli_si128(src_temp0_16x8b, 2);
5480
                //combining the appropriate sign change
5481
34
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
5482
5483
                //row1-row0
5484
                //separating +ve and and -ve values.
5485
34
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
5486
34
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
5487
5488
                //creating mask 00 for +ve and -ve values and FF for zero.
5489
34
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5490
34
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5491
                //combining the appropriate sign chang
5492
34
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
5493
5494
                //manipulation for row 1 -bottom
5495
34
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 10);
5496
                //bottom left
5497
34
                signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 14);
5498
5499
34
                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
5500
34
                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
5501
                //row1 -bottom
5502
34
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
5503
34
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
5504
5505
                //creating mask 00 for +ve and -ve values and FF for zero.
5506
34
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5507
34
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5508
                //combining the appropriate sign change
5509
34
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
5510
34
                edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
5511
5512
                //manipulation for bottom- row 1 (row 1 right)
5513
34
                signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);
5514
                //adding top and down substraction
5515
34
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
5516
                //bottom - row 1
5517
34
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
5518
34
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
5519
5520
                //eliminating old left for row 0,1
5521
34
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 4);
5522
34
                signdwn1_16x8b = _mm_slli_si128(src_temp1_16x8b, 8);
5523
                //creating mask 00 for +ve and -ve values and FF for zero.
5524
34
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5525
34
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5526
                //for the next iteration signup0_16x8b
5527
34
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1) for next
5528
5529
                //storing right of row 1 into left
5530
34
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
5531
                //for storing right of row 1
5532
34
                signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
5533
5534
34
                src_top_16x8b = src_temp1_16x8b;
5535
                //storing right of row 0 into left
5536
34
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
5537
5538
                //adding constant 2
5539
34
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
5540
5541
                //shuffle to get sao index
5542
34
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
5543
                //using availability mask
5544
34
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
5545
                //adding chroma offset to access U and V
5546
34
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
5547
                //shuffle to get sao offset
5548
34
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
5549
                //the next top already in  src_top_16x8b
5550
                //cnvert to 16 bit then add and then saturated pack
5551
34
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
5552
34
                src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
5553
34
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
5554
34
                src_temp1_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
5555
34
                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
5556
34
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
5557
34
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge0_16x8b);
5558
34
                src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, src_temp1_16x8b);
5559
5560
34
                cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
5561
5562
34
                _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);
5563
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
5564
34
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
5565
                // row = 1
5566
34
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
5567
34
                src_temp0_16x8b = src_bottom_16x8b;
5568
34
                pu1_src_cpy += (src_strd << 1);
5569
34
                pu1_src_left_cpy += 4;
5570
34
            }
5571
66
            ht_rem = ht & 0x1;
5572
66
            if(ht_rem)
5573
32
            {
5574
32
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
5575
                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
5576
32
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
5577
5578
5579
                //manipulation for row 0 -bottom
5580
32
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 12);
5581
                //bottom left
5582
32
                signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 14);
5583
                //separating +ve and and -ve values.
5584
32
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
5585
32
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
5586
                //creating mask 00 for +ve and -ve values and FF for zero.
5587
32
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5588
32
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5589
                //combining the appropriate sign change
5590
32
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
5591
                //adding top and down substraction
5592
32
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
5593
                //for row 0 right to put into left store
5594
32
                signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
5595
                //adding constant 2
5596
32
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
5597
32
                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8);
5598
32
                edge0_16x8b = _mm_srli_si128(edge0_16x8b, 8);
5599
                //left store manipulation 1
5600
32
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
5601
                //filling the left boundary value
5602
32
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
5603
32
                src_top_16x8b = src_temp0_16x8b;
5604
5605
                //shuffle to get sao index
5606
32
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
5607
                //using availability mask
5608
32
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
5609
                //adding chroma offset to access U and V
5610
32
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
5611
                //shuffle to get sao offset
5612
32
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
5613
5614
                //cnvert to 16 bit then add and then saturated pack
5615
32
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
5616
32
                src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
5617
32
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
5618
32
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
5619
32
                src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, const0_16x8b);
5620
5621
32
                _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);
5622
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
5623
32
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
5624
32
                pu1_src_cpy += (src_strd);
5625
32
                src_temp0_16x8b = src_bottom_16x8b;
5626
32
                pu1_src_left_cpy += 2;
5627
32
            }
5628
66
            {   //for bottom right
5629
66
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
5630
66
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
5631
66
                src_temp0_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
5632
66
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
5633
66
                _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);
5634
66
            }
5635
66
            if(0 == pu1_avail[3])
5636
16
            {
5637
16
                src_top_16x8b = src_bottom_16x8b;
5638
16
            }
5639
5640
66
            _mm_storel_epi64((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
5641
66
            pu1_src += 8;
5642
66
        }
5643
7.64k
        pu1_src_org[wd - 2] = u1_pos_wd_0_tmp_u;
5644
7.64k
        pu1_src_org[wd - 1] = u1_pos_wd_0_tmp_v;
5645
7.64k
        pu1_src_org[(ht_tmp - 1) * src_strd] = u1_pos_0_ht_tmp_u;
5646
7.64k
        pu1_src_org[(ht_tmp - 1) * src_strd + 1] = u1_pos_0_ht_tmp_v;
5647
254k
        for(row = 0; row < 2 * ht_tmp; row++)
5648
247k
        {
5649
247k
            pu1_src_left[row] = au1_src_left_tmp[row];
5650
247k
        }
5651
7.64k
    }
5652
5653
7.64k
}