Coverage Report

Created: 2023-09-25 07:43

/src/libhevc/common/x86/ihevc_sao_ssse3_intr.c
Line
Count
Source (jump to first uncovered line)
1
/******************************************************************************
2
*
3
* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4
*
5
* Licensed under the Apache License, Version 2.0 (the "License");
6
* you may not use this file except in compliance with the License.
7
* You may obtain a copy of the License at:
8
*
9
* http://www.apache.org/licenses/LICENSE-2.0
10
*
11
* Unless required by applicable law or agreed to in writing, software
12
* distributed under the License is distributed on an "AS IS" BASIS,
13
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
* See the License for the specific language governing permissions and
15
* limitations under the License.
16
*
17
******************************************************************************/
18
/**
19
*******************************************************************************
20
* @file
21
*  ihevc_sao_atom_intr.c
22
*
23
* @brief
24
*  Contains function definitions for Sample adaptive offset(SAO) used in-loop
25
* filtering
26
*
27
* @author
28
* 100592
29
*
30
* @par List of Functions:
31
*   - ihevc_sao_band_offset_luma_ssse3()
32
*   - ihevc_sao_band_offset_chroma_ssse3()
33
*   - ihevc_sao_edge_offset_class0_ssse3()
34
*   - ihevc_sao_edge_offset_class0_chroma_ssse3()
35
*   - ihevc_sao_edge_offset_class1_ssse3()
36
*   - ihevc_sao_edge_offset_class1_chroma_ssse3()
37
*   - ihevc_sao_edge_offset_class2_ssse3()
38
*   - ihevc_sao_edge_offset_class2_chroma_ssse3()
39
*   - ihevc_sao_edge_offset_class3_ssse3()
40
*   - ihevc_sao_edge_offset_class3_chroma_ssse3()
41
*
42
* @remarks
43
*  None
44
*
45
*******************************************************************************
46
*/
47
/*****************************************************************************/
48
/* File Includes                                                             */
49
/*****************************************************************************/
50
#include <stdio.h>
51
52
#include "ihevc_typedefs.h"
53
#include "ihevc_platform_macros.h"
54
#include "ihevc_macros.h"
55
#include "ihevc_func_selector.h"
56
#include "ihevc_defs.h"
57
#include "ihevc_tables_x86_intr.h"
58
#include "ihevc_common_tables.h"
59
#include "ihevc_sao.h"
60
61
#include <immintrin.h>
62
63
#define NUM_BAND_TABLE  32
64
/**
65
*******************************************************************************
66
*
67
* @brief
68
* Has two sets of functions : band offset and edge offset both for luma and chroma
69
* edge offset has horizontal ,vertical, 135 degree and 45 degree
70
*
71
* @par Description:
72
*
73
*
74
* @param[in-out] pu1_src
75
*  Pointer to the source
76
*
77
* @param[in] src_strd
78
*  Source stride
79
*
80
* @param[in-out] pu1_src_left
81
*  source left boundary
82
*
83
* @param[in-out] pu1_src_top
84
* Source top boundary
85
*
86
* @param[in-out] pu1_src_top_left
87
*  Source top left boundary
88
*
89
* @param[in] pu1_src_top_right
90
*  Source top right boundary
91
*
92
* @param[in] pu1_src_bot_left
93
*  Source bottom left boundary
94
*
95
* @param[in] pu1_avail
96
*  boundary availability flags
97
*
98
* @param[in] pi1_sao_offset_u
99
*  Chroma U sao offset values
100
*
101
* @param[in] pi1_sao_offset_v
102
*  Chroma V sao offset values
103
*
104
* @param[in] pi1_sao_offset
105
*  Luma sao offset values
106
*
107
* @param[in] wd
108
*  width of the source
109
110
* @param[in] ht
111
*  height of the source
112
* @returns
113
*
114
* @remarks
115
*  None
116
*
117
*******************************************************************************
118
*/
119
120
121
void ihevc_sao_band_offset_luma_ssse3(UWORD8 *pu1_src,
122
                                      WORD32 src_strd,
123
                                      UWORD8 *pu1_src_left,
124
                                      UWORD8 *pu1_src_top,
125
                                      UWORD8 *pu1_src_top_left,
126
                                      WORD32 sao_band_pos,
127
                                      WORD8 *pi1_sao_offset,
128
                                      WORD32 wd,
129
                                      WORD32 ht)
130
295k
{
131
295k
    WORD32 row, col;
132
295k
    UWORD8 *pu1_src_cpy;
133
295k
    WORD32 wd_rem;
134
295k
    WORD8 offset = 0;
135
136
295k
    __m128i src_temp0_8x16b, src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b;
137
295k
    __m128i band_table0_8x16b, band_table1_8x16b, band_table2_8x16b, band_table3_8x16b;
138
295k
    __m128i tmp_set_128i_1, tmp_set_128i_2, tmp_set_128i_3, tmp_set_128i_4;
139
295k
    __m128i band_pos_16x8b;
140
295k
    __m128i sao_offset;
141
295k
    __m128i cmp_mask, cmp_store;
142
143
    /* Updating left and top-left and top */
144
9.54M
    for(row = 0; row < ht; row++)
145
9.25M
    {
146
9.25M
        pu1_src_left[row] = pu1_src[row * src_strd + (wd - 1)];
147
9.25M
    }
148
295k
    pu1_src_top_left[0] = pu1_src_top[wd - 1];
149
1.47M
    for(col = 0; col < wd; col += 8)
150
1.17M
    {
151
1.17M
        tmp_set_128i_1 = _mm_loadl_epi64((__m128i *)(pu1_src + (ht - 1) * src_strd + offset));
152
1.17M
        _mm_storel_epi64((__m128i *)(pu1_src_top + offset), tmp_set_128i_1);
153
1.17M
        offset += 8;
154
1.17M
    }
155
156
    //replicating sao_band_pos as 8 bit value 16 times
157
158
159
295k
    band_pos_16x8b = _mm_set1_epi16((WORD16)(sao_band_pos << 3));
160
    //value set for sao_offset extraction
161
295k
    tmp_set_128i_1  = _mm_set_epi8(-128, 1, -128, 1, -128, 1, -128, 1, -128, 1, -128, 1, -128, 1, -128, 1);
162
295k
    tmp_set_128i_2  = _mm_set_epi8(-128, 2, -128, 2, -128, 2, -128, 2, -128, 2, -128, 2, -128, 2, -128, 2);
163
295k
    tmp_set_128i_3  = _mm_set_epi8(-128, 3, -128, 3, -128, 3, -128, 3, -128, 3, -128, 3, -128, 3, -128, 3);
164
295k
    tmp_set_128i_4  = _mm_set_epi8(-128, 4, -128, 4, -128, 4, -128, 4, -128, 4, -128, 4, -128, 4, -128, 4);
165
166
    //loaded sao offset values
167
295k
    sao_offset      = _mm_loadl_epi64((__m128i *)pi1_sao_offset);
168
169
    //loading 16bit 32values of gu2_table_band_idx consecutively in 4 registers
170
295k
    band_table0_8x16b = _mm_load_si128((__m128i *)(gu2_table_band_idx));
171
295k
    band_table1_8x16b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 8));
172
295k
    band_table2_8x16b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 16));
173
295k
    band_table3_8x16b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 24));
174
175
    //band_position addition
176
295k
    band_table0_8x16b = _mm_add_epi16(band_table0_8x16b, band_pos_16x8b);
177
295k
    band_table1_8x16b = _mm_add_epi16(band_table1_8x16b, band_pos_16x8b);
178
295k
    band_table2_8x16b = _mm_add_epi16(band_table2_8x16b, band_pos_16x8b);
179
295k
    band_table3_8x16b = _mm_add_epi16(band_table3_8x16b, band_pos_16x8b);
180
    //sao_offset duplication
181
295k
    tmp_set_128i_1  = _mm_shuffle_epi8(sao_offset, tmp_set_128i_1);
182
295k
    tmp_set_128i_2  = _mm_shuffle_epi8(sao_offset, tmp_set_128i_2);
183
295k
    tmp_set_128i_3  = _mm_shuffle_epi8(sao_offset, tmp_set_128i_3);
184
295k
    tmp_set_128i_4  = _mm_shuffle_epi8(sao_offset, tmp_set_128i_4);
185
    //setting for comparison
186
295k
    cmp_mask = _mm_set1_epi16(16);
187
295k
    cmp_store = _mm_set1_epi16(0x00ff);
188
189
    //sao_offset addition
190
295k
    band_table0_8x16b = _mm_add_epi16(band_table0_8x16b, tmp_set_128i_1);
191
295k
    band_table1_8x16b = _mm_add_epi16(band_table1_8x16b, tmp_set_128i_2);
192
295k
    band_table2_8x16b = _mm_add_epi16(band_table2_8x16b, tmp_set_128i_3);
193
295k
    band_table3_8x16b = _mm_add_epi16(band_table3_8x16b, tmp_set_128i_4);
194
    //masking upper 8bit values of each  16 bit band table value
195
295k
    band_table0_8x16b = _mm_and_si128(band_table0_8x16b, cmp_store);
196
295k
    band_table1_8x16b = _mm_and_si128(band_table1_8x16b, cmp_store);
197
295k
    band_table2_8x16b = _mm_and_si128(band_table2_8x16b, cmp_store);
198
295k
    band_table3_8x16b = _mm_and_si128(band_table3_8x16b, cmp_store);
199
200
295k
    switch(sao_band_pos)
201
295k
    {
202
21.9k
        case 0:
203
21.9k
            tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table0_8x16b);
204
21.9k
            band_table0_8x16b = _mm_and_si128(band_table0_8x16b, tmp_set_128i_2);
205
21.9k
            break;
206
6.37k
        case 28:
207
6.37k
            tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table3_8x16b);
208
6.37k
            band_table3_8x16b = _mm_or_si128(band_table3_8x16b, tmp_set_128i_2);
209
6.37k
            break;
210
6.33k
        case 29:
211
6.33k
            tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table2_8x16b);
212
6.33k
            band_table2_8x16b = _mm_or_si128(band_table2_8x16b, tmp_set_128i_2);
213
6.33k
            tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table3_8x16b);
214
6.33k
            band_table3_8x16b = _mm_and_si128(band_table3_8x16b, tmp_set_128i_2);
215
6.33k
            break;
216
5.15k
        case 30:
217
5.15k
            tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table1_8x16b);
218
5.15k
            band_table1_8x16b = _mm_or_si128(band_table1_8x16b, tmp_set_128i_2);
219
5.15k
            tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table2_8x16b);
220
5.15k
            band_table2_8x16b = _mm_and_si128(band_table2_8x16b, tmp_set_128i_2);
221
5.15k
            break;
222
9.08k
        case 31:
223
9.08k
            tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table0_8x16b);
224
9.08k
            band_table0_8x16b = _mm_or_si128(band_table0_8x16b, tmp_set_128i_2);
225
9.08k
            tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table1_8x16b);
226
9.08k
            band_table1_8x16b = _mm_and_si128(band_table1_8x16b, tmp_set_128i_2);
227
9.08k
            break;
228
246k
        default:
229
246k
            break;
230
295k
    }
231
    //sao_offset is reused for zero cmp mask.
232
294k
    sao_offset = _mm_setzero_si128();
233
294k
    tmp_set_128i_1 = _mm_set1_epi8(1);
234
    //tmp_set_128i_2 = _mm_set_epi8 (128,7,128,6,128,5,128,4,128,3,128,2,128,1,128,0);
235
294k
    cmp_mask = _mm_packus_epi16(cmp_mask, cmp_mask); //cmp_msk=dup16(16);
236
237
    //masking upper 8bit values of each  16 bit band table value
238
294k
    band_table0_8x16b = _mm_and_si128(band_table0_8x16b, cmp_store);
239
294k
    band_table1_8x16b = _mm_and_si128(band_table1_8x16b, cmp_store);
240
294k
    band_table2_8x16b = _mm_and_si128(band_table2_8x16b, cmp_store);
241
294k
    band_table3_8x16b = _mm_and_si128(band_table3_8x16b, cmp_store);
242
243
    //band table 8x16 four registers are packed into 16x8 two registers:  band_table0_8x16b and band_table2_8x16b
244
294k
    band_table0_8x16b = _mm_packus_epi16(band_table0_8x16b, band_table1_8x16b);
245
294k
    band_table2_8x16b = _mm_packus_epi16(band_table2_8x16b, band_table3_8x16b);
246
247
294k
    band_table3_8x16b = _mm_slli_epi16(cmp_mask, 1); // to compare if value is greater than 31
248
294k
    band_pos_16x8b = _mm_packus_epi16(band_pos_16x8b, band_pos_16x8b); //band_pos is now 8 bit aligned
249
294k
    band_table3_8x16b = _mm_sub_epi8(band_table3_8x16b, tmp_set_128i_1); // to compare if value is greater than 31
250
251
294k
    cmp_mask = _mm_sub_epi8(cmp_mask, tmp_set_128i_1);
252
    //  band_pos_16x8b = _mm_or_si128(band_pos_16x8b,cmp_store);
253
254
739k
    for(col = wd; col >= 16; col -= 16)
255
444k
    {
256
444k
        pu1_src_cpy = pu1_src;
257
7.53M
        for(row = ht; row > 0; row -= 2)
258
7.09M
        {
259
260
261
            //row = 0 load 8 pixel values from 7:0 pos. relative to cur. pos.
262
7.09M
            src_temp0_8x16b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
263
            // row = 1
264
7.09M
            src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
265
266
267
268
            //saturated subtract 8 bit
269
7.09M
            tmp_set_128i_1 = _mm_sub_epi8(src_temp0_8x16b, band_pos_16x8b);
270
7.09M
            tmp_set_128i_3 = _mm_sub_epi8(src_temp2_8x16b, band_pos_16x8b);
271
            //if the values less than 0 put ff
272
7.09M
            tmp_set_128i_2 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_1);
273
7.09M
            tmp_set_128i_4 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_3);
274
7.09M
            tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
275
7.09M
            tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
276
            //if the values greater than 31 put ff
277
7.09M
            tmp_set_128i_2 = _mm_cmpgt_epi8(tmp_set_128i_1, band_table3_8x16b);
278
7.09M
            tmp_set_128i_4 = _mm_cmpgt_epi8(tmp_set_128i_3, band_table3_8x16b);
279
7.09M
            tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
280
7.09M
            tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
281
282
283
            //row 0 and row1
284
            //if the values >16 then put ff ,cmp_mask = dup16(15)
285
7.09M
            cmp_store = _mm_cmpgt_epi8(tmp_set_128i_1, cmp_mask);
286
            //values 16 to 31 for row 0 & 1 but values <16 ==0
287
7.09M
            tmp_set_128i_2 = _mm_and_si128(tmp_set_128i_1, cmp_store);
288
            // values 0 to 15 for row 0 & 1
289
7.09M
            tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, cmp_store);
290
            //values 16 to 31 for row 0 & 1 but values <16 masked to ff
291
7.09M
            cmp_store = _mm_cmpeq_epi8(tmp_set_128i_2, sao_offset);
292
7.09M
            tmp_set_128i_2 = _mm_or_si128(tmp_set_128i_2, cmp_store);
293
            //row 2 and  row 3
294
            //if the values >16 then put ff ,cmp_mask = dup16(15)
295
7.09M
            cmp_store = _mm_cmpgt_epi8(tmp_set_128i_3, cmp_mask);
296
            //values 16 to 31 for row 2 & 3 but values <16 ==0
297
7.09M
            tmp_set_128i_4 = _mm_and_si128(tmp_set_128i_3, cmp_store);
298
            // values 0 to 15 for row 2 & 3
299
7.09M
            tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, cmp_store);
300
            //values 16 to 31 for row 2 & 3 but values <16 masked to ff
301
7.09M
            cmp_store = _mm_cmpeq_epi8(tmp_set_128i_4, sao_offset);
302
7.09M
            tmp_set_128i_4 = _mm_or_si128(tmp_set_128i_4, cmp_store);
303
304
            //row 0 and row 1
305
            //to preserve pixel values in which no offset needs to be added.
306
7.09M
            cmp_store = _mm_cmpeq_epi8(tmp_set_128i_1, tmp_set_128i_2);
307
7.09M
            src_temp0_8x16b = _mm_and_si128(src_temp0_8x16b, cmp_store);
308
309
            //row 2 and row 3
310
            //to preserve pixel values in which no offset needs to be added.
311
7.09M
            cmp_store = _mm_cmpeq_epi8(tmp_set_128i_3, tmp_set_128i_4);
312
7.09M
            src_temp2_8x16b = _mm_and_si128(src_temp2_8x16b, cmp_store);
313
314
            //indexing 0 - 15 bandtable indexes
315
7.09M
            tmp_set_128i_1 = _mm_shuffle_epi8(band_table0_8x16b, tmp_set_128i_1);
316
7.09M
            tmp_set_128i_3 = _mm_shuffle_epi8(band_table0_8x16b, tmp_set_128i_3);
317
7.09M
            tmp_set_128i_2 = _mm_shuffle_epi8(band_table2_8x16b, tmp_set_128i_2);
318
7.09M
            tmp_set_128i_4 = _mm_shuffle_epi8(band_table2_8x16b, tmp_set_128i_4);
319
            // combining all offsets results
320
7.09M
            tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
321
7.09M
            tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
322
            // combining results with the pixel values
323
7.09M
            src_temp0_8x16b = _mm_or_si128(src_temp0_8x16b, tmp_set_128i_1);
324
7.09M
            src_temp2_8x16b = _mm_or_si128(src_temp2_8x16b, tmp_set_128i_3);
325
326
327
            //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
328
7.09M
            _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_8x16b);
329
            // row = 1
330
7.09M
            _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp2_8x16b);
331
332
7.09M
            pu1_src_cpy += (src_strd << 1);
333
7.09M
        }
334
444k
        pu1_src += 16;
335
444k
    }
336
294k
    wd_rem = wd & 0xF;
337
294k
    if(wd_rem)
338
289k
    {pu1_src_cpy = pu1_src;
339
2.55M
        for(row = ht; row > 0; row -= 4)
340
2.26M
        {
341
342
343
            //row = 0 load 8 pixel values from 7:0 pos. relative to cur. pos.
344
2.26M
            src_temp0_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
345
            // row = 1
346
2.26M
            src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
347
            // row = 2
348
2.26M
            src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));
349
            // row = 3
350
2.26M
            src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd));
351
            //row0 and row1 packed and row2 and row3 packed
352
353
2.26M
            src_temp0_8x16b = _mm_unpacklo_epi64(src_temp0_8x16b, src_temp1_8x16b);
354
2.26M
            src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp3_8x16b);
355
356
            //saturated subtract 8 bit
357
2.26M
            tmp_set_128i_1 = _mm_sub_epi8(src_temp0_8x16b, band_pos_16x8b);
358
2.26M
            tmp_set_128i_3 = _mm_sub_epi8(src_temp2_8x16b, band_pos_16x8b);
359
            //if the values less than 0 put ff
360
2.26M
            tmp_set_128i_2 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_1);
361
2.26M
            tmp_set_128i_4 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_3);
362
2.26M
            tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
363
2.26M
            tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
364
            //if the values greater than 31 put ff
365
2.26M
            tmp_set_128i_2 = _mm_cmpgt_epi8(tmp_set_128i_1, band_table3_8x16b);
366
2.26M
            tmp_set_128i_4 = _mm_cmpgt_epi8(tmp_set_128i_3, band_table3_8x16b);
367
2.26M
            tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
368
2.26M
            tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
369
370
371
372
            //row 0 and row1
373
            //if the values >16 then put ff ,cmp_mask = dup16(15)
374
2.26M
            cmp_store = _mm_cmpgt_epi8(tmp_set_128i_1, cmp_mask);
375
            //values 16 to 31 for row 0 & 1 but values <16 ==0
376
2.26M
            tmp_set_128i_2 = _mm_and_si128(tmp_set_128i_1, cmp_store);
377
            // values 0 to 15 for row 0 & 1
378
2.26M
            tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, cmp_store);
379
            //values 16 to 31 for row 0 & 1 but values <16 masked to ff
380
2.26M
            cmp_store = _mm_cmpeq_epi8(tmp_set_128i_2, sao_offset);
381
2.26M
            tmp_set_128i_2 = _mm_or_si128(tmp_set_128i_2, cmp_store);
382
            //row 2 and  row 3
383
            //if the values >16 then put ff ,cmp_mask = dup16(15)
384
2.26M
            cmp_store = _mm_cmpgt_epi8(tmp_set_128i_3, cmp_mask);
385
            //values 16 to 31 for row 2 & 3 but values <16 ==0
386
2.26M
            tmp_set_128i_4 = _mm_and_si128(tmp_set_128i_3, cmp_store);
387
            // values 0 to 15 for row 2 & 3
388
2.26M
            tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, cmp_store);
389
            //values 16 to 31 for row 2 & 3 but values <16 masked to ff
390
2.26M
            cmp_store = _mm_cmpeq_epi8(tmp_set_128i_4, sao_offset);
391
2.26M
            tmp_set_128i_4 = _mm_or_si128(tmp_set_128i_4, cmp_store);
392
393
            //row 0 and row 1
394
            //to preserve pixel values in which no offset needs to be added.
395
2.26M
            cmp_store = _mm_cmpeq_epi8(tmp_set_128i_1, tmp_set_128i_2);
396
2.26M
            src_temp0_8x16b = _mm_and_si128(src_temp0_8x16b, cmp_store);
397
398
            //row 2 and row 3
399
            //to preserve pixel values in which no offset needs to be added.
400
2.26M
            cmp_store = _mm_cmpeq_epi8(tmp_set_128i_3, tmp_set_128i_4);
401
2.26M
            src_temp2_8x16b = _mm_and_si128(src_temp2_8x16b, cmp_store);
402
403
            //indexing 0 - 15 bandtable indexes
404
2.26M
            tmp_set_128i_1 = _mm_shuffle_epi8(band_table0_8x16b, tmp_set_128i_1);
405
2.26M
            tmp_set_128i_3 = _mm_shuffle_epi8(band_table0_8x16b, tmp_set_128i_3);
406
2.26M
            tmp_set_128i_2 = _mm_shuffle_epi8(band_table2_8x16b, tmp_set_128i_2);
407
2.26M
            tmp_set_128i_4 = _mm_shuffle_epi8(band_table2_8x16b, tmp_set_128i_4);
408
            // combining all offsets results
409
2.26M
            tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
410
2.26M
            tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
411
            // combining results with the pixel values
412
2.26M
            src_temp0_8x16b = _mm_or_si128(src_temp0_8x16b, tmp_set_128i_1);
413
2.26M
            src_temp2_8x16b = _mm_or_si128(src_temp2_8x16b, tmp_set_128i_3);
414
415
            //Getting row1 separately
416
2.26M
            src_temp1_8x16b = _mm_srli_si128(src_temp0_8x16b, 8);
417
            //Getting row3 separately
418
2.26M
            src_temp3_8x16b = _mm_srli_si128(src_temp2_8x16b, 8);
419
420
            //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
421
2.26M
            _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_8x16b);
422
            // row = 1
423
2.26M
            _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), src_temp1_8x16b);
424
            // row = 2
425
2.26M
            _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_temp2_8x16b);
426
            // row = 3
427
2.26M
            _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), src_temp3_8x16b);
428
429
2.26M
            pu1_src_cpy += (src_strd << 2);
430
431
2.26M
        }
432
289k
        pu1_src += 8;
433
289k
    }
434
435
436
294k
}
437
438
void ihevc_sao_band_offset_chroma_ssse3(UWORD8 *pu1_src,
439
                                        WORD32 src_strd,
440
                                        UWORD8 *pu1_src_left,
441
                                        UWORD8 *pu1_src_top,
442
                                        UWORD8 *pu1_src_top_left,
443
                                        WORD32 sao_band_pos_u,
444
                                        WORD32 sao_band_pos_v,
445
                                        WORD8 *pi1_sao_offset_u,
446
                                        WORD8 *pi1_sao_offset_v,
447
                                        WORD32 wd,
448
                                        WORD32 ht)
449
301k
{
450
301k
    WORD32 row, col;
451
301k
    WORD8 offset = 0;
452
453
454
301k
    __m128i src_temp0_8x16b, src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b;
455
301k
    __m128i cmp_msk2;
456
301k
    __m128i band_table0_16x8b, band_table1_16x8b, band_table2_16x8b, band_table3_16x8b;
457
301k
    __m128i tmp_set_128i_1, tmp_set_128i_2, tmp_set_128i_3, tmp_set_128i_4;
458
301k
    __m128i band_pos_u_16x8b, band_pos_v_16x8b;
459
301k
    __m128i sao_offset;
460
301k
    __m128i cmp_mask;
461
462
463
    /* Updating left and top and top-left */
464
5.09M
    for(row = 0; row < ht; row++)
465
4.78M
    {
466
4.78M
        pu1_src_left[2 * row] = pu1_src[row * src_strd + (wd - 2)];
467
4.78M
        pu1_src_left[2 * row + 1] = pu1_src[row * src_strd + (wd - 1)];
468
4.78M
    }
469
301k
    pu1_src_top_left[0] = pu1_src_top[wd - 2];
470
301k
    pu1_src_top_left[1] = pu1_src_top[wd - 1];
471
1.52M
    for(col = 0; col < wd; col += 8)
472
1.22M
    {
473
1.22M
        tmp_set_128i_1 = _mm_loadl_epi64((__m128i *)(pu1_src + (ht - 1) * src_strd + offset));
474
1.22M
        _mm_storel_epi64((__m128i *)(pu1_src_top + offset), tmp_set_128i_1);
475
1.22M
        offset += 8;
476
1.22M
    }
477
478
301k
    { // band _table creation
479
301k
        __m128i temp0_8x16b, temp1_8x16b, temp2_8x16b, temp3_8x16b;
480
        // Band table for U component : band_table0_16x8b and band_table2_16x8b
481
        //replicating sao_band_pos as 8 bit value 16 times
482
301k
        band_pos_u_16x8b = _mm_set1_epi16((WORD16)(sao_band_pos_u << 3));
483
        //value set for sao_offset extraction
484
301k
        tmp_set_128i_1  = _mm_set_epi8(-128, 1, -128, 1, -128, 1, -128, 1, -128, 1, -128, 1, -128, 1, -128, 1);
485
301k
        tmp_set_128i_2  = _mm_set_epi8(-128, 2, -128, 2, -128, 2, -128, 2, -128, 2, -128, 2, -128, 2, -128, 2);
486
301k
        tmp_set_128i_3  = _mm_set_epi8(-128, 3, -128, 3, -128, 3, -128, 3, -128, 3, -128, 3, -128, 3, -128, 3);
487
301k
        tmp_set_128i_4  = _mm_set_epi8(-128, 4, -128, 4, -128, 4, -128, 4, -128, 4, -128, 4, -128, 4, -128, 4);
488
489
        //loaded sao offset values
490
301k
        sao_offset      = _mm_loadl_epi64((__m128i *)pi1_sao_offset_u);
491
492
        //loading 16bit 32values of gu2_table_band_idx consecutively in 4 registers
493
301k
        band_table0_16x8b = _mm_load_si128((__m128i *)(gu2_table_band_idx));
494
301k
        band_table1_16x8b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 8));
495
301k
        band_table2_16x8b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 16));
496
301k
        band_table3_16x8b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 24));
497
498
        //band_position addition
499
301k
        band_table0_16x8b = _mm_add_epi16(band_table0_16x8b, band_pos_u_16x8b);
500
301k
        band_table1_16x8b = _mm_add_epi16(band_table1_16x8b, band_pos_u_16x8b);
501
301k
        band_table2_16x8b = _mm_add_epi16(band_table2_16x8b, band_pos_u_16x8b);
502
301k
        band_table3_16x8b = _mm_add_epi16(band_table3_16x8b, band_pos_u_16x8b);
503
        //sao_offset duplication
504
301k
        temp0_8x16b = _mm_shuffle_epi8(sao_offset, tmp_set_128i_1);
505
301k
        temp1_8x16b = _mm_shuffle_epi8(sao_offset, tmp_set_128i_2);
506
301k
        temp2_8x16b = _mm_shuffle_epi8(sao_offset, tmp_set_128i_3);
507
301k
        temp3_8x16b = _mm_shuffle_epi8(sao_offset, tmp_set_128i_4);
508
509
        //sao_offset addition
510
301k
        band_table0_16x8b = _mm_add_epi16(band_table0_16x8b, temp0_8x16b);
511
301k
        band_table1_16x8b = _mm_add_epi16(band_table1_16x8b, temp1_8x16b);
512
301k
        band_table2_16x8b = _mm_add_epi16(band_table2_16x8b, temp2_8x16b);
513
301k
        band_table3_16x8b = _mm_add_epi16(band_table3_16x8b, temp3_8x16b);
514
        //reuse for clipping
515
301k
        temp1_8x16b = _mm_set1_epi16(0x00ff);
516
        //setting for comparison
517
301k
        cmp_mask = _mm_set1_epi16(16);
518
519
        //masking upper 8bit values of each  16 bit band table value
520
301k
        band_table0_16x8b = _mm_and_si128(band_table0_16x8b, temp1_8x16b);
521
301k
        band_table1_16x8b = _mm_and_si128(band_table1_16x8b, temp1_8x16b);
522
301k
        band_table2_16x8b = _mm_and_si128(band_table2_16x8b, temp1_8x16b);
523
301k
        band_table3_16x8b = _mm_and_si128(band_table3_16x8b, temp1_8x16b);
524
525
        //temp1_8x16b reuse for compare storage
526
301k
        switch(sao_band_pos_u)
527
301k
        {
528
8.67k
            case 0:
529
8.67k
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table0_16x8b);
530
8.67k
                band_table0_16x8b = _mm_and_si128(band_table0_16x8b, temp3_8x16b);
531
8.67k
                break;
532
6.40k
            case 28:
533
6.40k
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table3_16x8b);
534
6.40k
                band_table3_16x8b = _mm_or_si128(band_table3_16x8b, temp3_8x16b);
535
6.40k
                break;
536
2.03k
            case 29:
537
2.03k
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table2_16x8b);
538
2.03k
                band_table2_16x8b = _mm_or_si128(band_table2_16x8b, temp3_8x16b);
539
2.03k
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table3_16x8b);
540
2.03k
                band_table3_16x8b = _mm_and_si128(band_table3_16x8b, temp3_8x16b);
541
2.03k
                break;
542
4.64k
            case 30:
543
4.64k
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table1_16x8b);
544
4.64k
                band_table1_16x8b = _mm_or_si128(band_table1_16x8b, temp3_8x16b);
545
4.64k
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table2_16x8b);
546
4.64k
                band_table2_16x8b = _mm_and_si128(band_table2_16x8b, temp3_8x16b);
547
4.64k
                break;
548
5.38k
            case 31:
549
5.38k
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table0_16x8b);
550
5.38k
                band_table0_16x8b = _mm_or_si128(band_table0_16x8b, temp3_8x16b);
551
5.38k
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table1_16x8b);
552
5.38k
                band_table1_16x8b = _mm_and_si128(band_table1_16x8b, temp3_8x16b);
553
5.38k
                break;
554
274k
            default:
555
274k
                break;
556
301k
        }
557
        //masking upper 8bit values of each  16 bit band table value
558
301k
        band_table0_16x8b = _mm_and_si128(band_table0_16x8b, temp1_8x16b);
559
301k
        band_table1_16x8b = _mm_and_si128(band_table1_16x8b, temp1_8x16b);
560
301k
        band_table2_16x8b = _mm_and_si128(band_table2_16x8b, temp1_8x16b);
561
301k
        band_table3_16x8b = _mm_and_si128(band_table3_16x8b, temp1_8x16b);
562
        //band table 8x16 four registers are packed into 16x8 two registers:  band_table0_8x16b and band_table2_8x16b
563
301k
        band_table0_16x8b = _mm_packus_epi16(band_table0_16x8b, band_table1_16x8b);
564
301k
        band_table2_16x8b = _mm_packus_epi16(band_table2_16x8b, band_table3_16x8b);
565
        // Band table for U component over
566
567
        // Band table for V component : band_table1_16x8b and band_table3_16x8b
568
        // replicating sao_band_pos as 8 bit value 16 times
569
301k
        band_pos_v_16x8b = _mm_set1_epi16((WORD16)(sao_band_pos_v << 3));
570
571
        //loaded sao offset values
572
301k
        sao_offset      = _mm_loadl_epi64((__m128i *)pi1_sao_offset_v);
573
574
        //loading 16bit 32values of gu2_table_band_idx consecutively in 4 registers
575
301k
        temp0_8x16b = _mm_load_si128((__m128i *)(gu2_table_band_idx));
576
301k
        band_table1_16x8b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 8));
577
301k
        temp2_8x16b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 16));
578
301k
        band_table3_16x8b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 24));
579
580
        //band_position addition
581
301k
        temp0_8x16b = _mm_add_epi16(temp0_8x16b, band_pos_v_16x8b);
582
301k
        band_table1_16x8b = _mm_add_epi16(band_table1_16x8b, band_pos_v_16x8b);
583
301k
        temp2_8x16b = _mm_add_epi16(temp2_8x16b, band_pos_v_16x8b);
584
301k
        band_table3_16x8b = _mm_add_epi16(band_table3_16x8b, band_pos_v_16x8b);
585
        //sao_offset duplication
586
301k
        tmp_set_128i_1  = _mm_shuffle_epi8(sao_offset, tmp_set_128i_1);
587
301k
        tmp_set_128i_2  = _mm_shuffle_epi8(sao_offset, tmp_set_128i_2);
588
301k
        tmp_set_128i_3  = _mm_shuffle_epi8(sao_offset, tmp_set_128i_3);
589
301k
        tmp_set_128i_4  = _mm_shuffle_epi8(sao_offset, tmp_set_128i_4);
590
591
        //sao_offset addition
592
301k
        temp0_8x16b = _mm_add_epi16(temp0_8x16b, tmp_set_128i_1);
593
301k
        band_table1_16x8b = _mm_add_epi16(band_table1_16x8b, tmp_set_128i_2);
594
301k
        temp2_8x16b = _mm_add_epi16(temp2_8x16b, tmp_set_128i_3);
595
301k
        band_table3_16x8b = _mm_add_epi16(band_table3_16x8b, tmp_set_128i_4);
596
597
        //masking upper 8bit values of 16 bit band table value
598
301k
        temp0_8x16b = _mm_and_si128(temp0_8x16b, temp1_8x16b);
599
301k
        band_table1_16x8b = _mm_and_si128(band_table1_16x8b, temp1_8x16b);
600
301k
        temp2_8x16b = _mm_and_si128(temp2_8x16b, temp1_8x16b);
601
301k
        band_table3_16x8b = _mm_and_si128(band_table3_16x8b, temp1_8x16b);
602
        //temp1_8x16b reuse for compare storage
603
604
301k
        switch(sao_band_pos_v)
605
301k
        {
606
12.3k
            case 0:
607
12.3k
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, temp0_8x16b);
608
12.3k
                temp0_8x16b = _mm_and_si128(temp0_8x16b, temp3_8x16b);
609
12.3k
                break;
610
25.7k
            case 28:
611
25.7k
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table3_16x8b);
612
25.7k
                band_table3_16x8b = _mm_or_si128(band_table3_16x8b, temp3_8x16b);
613
25.7k
                break;
614
2.81k
            case 29:
615
2.81k
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, temp2_8x16b);
616
2.81k
                temp2_8x16b = _mm_or_si128(temp2_8x16b, temp3_8x16b);
617
2.81k
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table3_16x8b);
618
2.81k
                band_table3_16x8b = _mm_and_si128(band_table3_16x8b, temp3_8x16b);
619
2.81k
                break;
620
2.85k
            case 30:
621
2.85k
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table1_16x8b);
622
2.85k
                band_table1_16x8b = _mm_or_si128(band_table1_16x8b, temp3_8x16b);
623
2.85k
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, temp2_8x16b);
624
2.85k
                temp2_8x16b = _mm_and_si128(temp2_8x16b, temp3_8x16b);
625
2.85k
                break;
626
18.3k
            case 31:
627
18.3k
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, temp0_8x16b);
628
18.3k
                temp0_8x16b = _mm_or_si128(temp0_8x16b, temp3_8x16b);
629
18.3k
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table1_16x8b);
630
18.3k
                band_table1_16x8b = _mm_and_si128(band_table1_16x8b, temp3_8x16b);
631
18.3k
                break;
632
239k
            default:
633
239k
                break;
634
301k
        }
635
        //masking upper 8bit values of each  16 bit band table value
636
301k
        temp0_8x16b = _mm_and_si128(temp0_8x16b, temp1_8x16b);
637
301k
        band_table1_16x8b = _mm_and_si128(band_table1_16x8b, temp1_8x16b);
638
301k
        temp2_8x16b = _mm_and_si128(temp2_8x16b, temp1_8x16b);
639
301k
        band_table3_16x8b = _mm_and_si128(band_table3_16x8b, temp1_8x16b);
640
        //band table 8x16 four registers are packed into 16x8 two registers:  band_table0_8x16b and band_table2_8x16b
641
301k
        band_table1_16x8b = _mm_packus_epi16(temp0_8x16b, band_table1_16x8b);
642
301k
        band_table3_16x8b = _mm_packus_epi16(temp2_8x16b, band_table3_16x8b);
643
        //band table for u and v created
644
301k
    }
645
0
    {
646
301k
        UWORD8 *pu1_src_cpy;
647
301k
        WORD32 wd_rem;
648
649
650
        //sao_offset is reused for zero cmp mask.
651
301k
        sao_offset = _mm_setzero_si128();
652
301k
        tmp_set_128i_1 = _mm_set1_epi8(1);
653
        //tmp_set_128i_2 = _mm_set_epi8 (128,7,128,6,128,5,128,4,128,3,128,2,128,1,128,0);
654
301k
        cmp_mask = _mm_packus_epi16(cmp_mask, cmp_mask); //cmp_msk=dup16(16);
655
        //to avoid ffff being saturated to 0; it should saturate to ff instead
656
657
301k
        cmp_msk2 = _mm_slli_epi16(cmp_mask, 1); // to compare if value is greater than 31
658
301k
        band_pos_u_16x8b = _mm_packus_epi16(band_pos_u_16x8b, band_pos_u_16x8b); //band_pos_u is now 8 bit aligned
659
301k
        band_pos_v_16x8b = _mm_packus_epi16(band_pos_v_16x8b, band_pos_v_16x8b); //band_pos_v is now 8 bit aligned
660
301k
        cmp_msk2 = _mm_sub_epi8(cmp_msk2, tmp_set_128i_1); // to compare if value is greater than 31
661
662
301k
        cmp_mask = _mm_sub_epi8(cmp_mask, tmp_set_128i_1);
663
664
914k
        for(col = wd; col >= 16; col -= 16)
665
613k
        {
666
613k
            pu1_src_cpy = pu1_src;
667
5.50M
            for(row = ht; row > 0; row -= 2)
668
4.88M
            {
669
                //row = 0 load 8 pixel values from 7:0 pos. relative to cur. pos.
670
4.88M
                src_temp0_8x16b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
671
                // row = 1
672
4.88M
                src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
673
674
675
                //odd values
676
4.88M
                src_temp1_8x16b = _mm_srli_epi16(src_temp0_8x16b, 8);
677
4.88M
                src_temp2_8x16b = _mm_srli_epi16(src_temp3_8x16b, 8);
678
                //even values
679
4.88M
                src_temp0_8x16b = _mm_slli_epi16(src_temp0_8x16b, 8);
680
4.88M
                src_temp3_8x16b = _mm_slli_epi16(src_temp3_8x16b, 8);
681
4.88M
                src_temp0_8x16b = _mm_srli_epi16(src_temp0_8x16b, 8);
682
4.88M
                src_temp3_8x16b = _mm_srli_epi16(src_temp3_8x16b, 8);
683
                //combining odd values
684
4.88M
                src_temp2_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp2_8x16b);
685
                //combining even values
686
4.88M
                src_temp0_8x16b = _mm_packus_epi16(src_temp0_8x16b, src_temp3_8x16b);
687
688
                //saturated subtract 8 bit
689
4.88M
                tmp_set_128i_1 = _mm_sub_epi8(src_temp0_8x16b, band_pos_u_16x8b);
690
4.88M
                tmp_set_128i_3 = _mm_sub_epi8(src_temp2_8x16b, band_pos_v_16x8b);
691
                //if the values less than 0 put ff
692
4.88M
                tmp_set_128i_2 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_1);
693
4.88M
                tmp_set_128i_4 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_3);
694
4.88M
                tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
695
4.88M
                tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
696
                //if the values greater than 31 put ff
697
4.88M
                tmp_set_128i_2 = _mm_cmpgt_epi8(tmp_set_128i_1, cmp_msk2);
698
4.88M
                tmp_set_128i_4 = _mm_cmpgt_epi8(tmp_set_128i_3, cmp_msk2);
699
4.88M
                tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
700
4.88M
                tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
701
                // registers reused to increase performance
702
                //if the values >16 then put ff ,cmp_mask = dup16(15) row 0 and row1
703
4.88M
                src_temp1_8x16b = _mm_cmpgt_epi8(tmp_set_128i_1, cmp_mask);
704
                //if the values >16 then put ff ,cmp_mask = dup16(15) row 2 and  row 3
705
4.88M
                src_temp3_8x16b = _mm_cmpgt_epi8(tmp_set_128i_3, cmp_mask);
706
707
                //values 16 to 31 for row 0 & 1 but values <16 ==0
708
4.88M
                tmp_set_128i_2 = _mm_and_si128(tmp_set_128i_1, src_temp1_8x16b);
709
                // values 0 to 15 for row 0 & 1
710
4.88M
                tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, src_temp1_8x16b);
711
                //values 16 to 31 for row 2 & 3 but values <16 ==0
712
4.88M
                tmp_set_128i_4 = _mm_and_si128(tmp_set_128i_3, src_temp3_8x16b);
713
                // values 0 to 15 for row 2 & 3
714
4.88M
                tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, src_temp3_8x16b);
715
716
                //values 16 to 31 for row 0 & 1 but values <16 masked to ff row 0 and row1
717
4.88M
                src_temp1_8x16b = _mm_cmpeq_epi8(tmp_set_128i_2, sao_offset);
718
                //values 16 to 31 for row 0 & 1 but values <16 masked to ff row 2 and  row 3
719
4.88M
                src_temp3_8x16b = _mm_cmpeq_epi8(tmp_set_128i_4, sao_offset);
720
4.88M
                tmp_set_128i_2 = _mm_or_si128(tmp_set_128i_2, src_temp1_8x16b);
721
4.88M
                tmp_set_128i_4 = _mm_or_si128(tmp_set_128i_4, src_temp3_8x16b);
722
723
724
                //to choose which pixel values to preserve in row 0 and row 1
725
4.88M
                src_temp1_8x16b = _mm_cmpeq_epi8(tmp_set_128i_1, tmp_set_128i_2);
726
                //to choose which pixel values to preserve in row 2 and row 3
727
4.88M
                src_temp3_8x16b = _mm_cmpeq_epi8(tmp_set_128i_3, tmp_set_128i_4);
728
                //values of all rows to which no offset needs to be added preserved.
729
4.88M
                src_temp0_8x16b = _mm_and_si128(src_temp0_8x16b, src_temp1_8x16b);
730
4.88M
                src_temp2_8x16b = _mm_and_si128(src_temp2_8x16b, src_temp3_8x16b);
731
732
                //indexing 0 - 15 bandtable indexes
733
4.88M
                tmp_set_128i_1 = _mm_shuffle_epi8(band_table0_16x8b, tmp_set_128i_1); //U low
734
4.88M
                tmp_set_128i_3 = _mm_shuffle_epi8(band_table1_16x8b, tmp_set_128i_3); //V low
735
                //indexing 16 -31 bandtable indexes
736
4.88M
                tmp_set_128i_2 = _mm_shuffle_epi8(band_table2_16x8b, tmp_set_128i_2); //U high
737
4.88M
                tmp_set_128i_4 = _mm_shuffle_epi8(band_table3_16x8b, tmp_set_128i_4); //V high
738
                // combining all offsets results
739
4.88M
                tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2); //U
740
4.88M
                tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4); //V
741
                // combining results with the pixel values
742
4.88M
                src_temp0_8x16b = _mm_or_si128(src_temp0_8x16b, tmp_set_128i_1);
743
4.88M
                src_temp2_8x16b = _mm_or_si128(src_temp2_8x16b, tmp_set_128i_3);
744
                //reorganising even and odd values
745
4.88M
                src_temp1_8x16b = _mm_unpacklo_epi8(src_temp0_8x16b, src_temp2_8x16b);
746
4.88M
                src_temp3_8x16b = _mm_unpackhi_epi8(src_temp0_8x16b, src_temp2_8x16b);
747
748
749
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
750
4.88M
                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp1_8x16b);
751
                // row = 1
752
4.88M
                _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp3_8x16b);
753
754
755
4.88M
                pu1_src_cpy += (src_strd << 1);
756
757
4.88M
            }
758
613k
            pu1_src += 16;
759
613k
        }
760
761
301k
        wd_rem = wd & 0xF;
762
301k
        if(wd_rem)
763
541
        {
764
541
            pu1_src_cpy = pu1_src;
765
1.15k
            for(row = ht; row > 0; row -= 4)
766
610
            {
767
                //row = 0 load 8 pixel values from 7:0 pos. relative to cur. pos.
768
610
                src_temp0_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
769
                // row = 1
770
610
                src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
771
                // row = 2
772
610
                src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));
773
                // row = 3
774
610
                src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd));
775
                //row0 and row1 packed and row2 and row3 packed
776
777
610
                src_temp0_8x16b = _mm_unpacklo_epi64(src_temp0_8x16b, src_temp1_8x16b);
778
610
                src_temp3_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp3_8x16b);
779
                //odd values
780
610
                src_temp1_8x16b = _mm_srli_epi16(src_temp0_8x16b, 8);
781
610
                src_temp2_8x16b = _mm_srli_epi16(src_temp3_8x16b, 8);
782
                //even values
783
610
                src_temp0_8x16b = _mm_slli_epi16(src_temp0_8x16b, 8);
784
610
                src_temp3_8x16b = _mm_slli_epi16(src_temp3_8x16b, 8);
785
610
                src_temp0_8x16b = _mm_srli_epi16(src_temp0_8x16b, 8);
786
610
                src_temp3_8x16b = _mm_srli_epi16(src_temp3_8x16b, 8);
787
                //combining odd values
788
610
                src_temp2_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp2_8x16b);
789
                //combining even values
790
610
                src_temp0_8x16b = _mm_packus_epi16(src_temp0_8x16b, src_temp3_8x16b);
791
792
                //saturated subtract 8 bit
793
610
                tmp_set_128i_1 = _mm_sub_epi8(src_temp0_8x16b, band_pos_u_16x8b);
794
610
                tmp_set_128i_3 = _mm_sub_epi8(src_temp2_8x16b, band_pos_v_16x8b);
795
                //if the values less than 0 put ff
796
610
                tmp_set_128i_2 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_1);
797
610
                tmp_set_128i_4 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_3);
798
610
                tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
799
610
                tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
800
                //if the values greater than 31 put ff
801
610
                tmp_set_128i_2 = _mm_cmpgt_epi8(tmp_set_128i_1, cmp_msk2);
802
610
                tmp_set_128i_4 = _mm_cmpgt_epi8(tmp_set_128i_3, cmp_msk2);
803
610
                tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
804
610
                tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
805
                // registers reused to increase performance
806
                //if the values >16 then put ff ,cmp_mask = dup16(15) row 0 and row1
807
610
                src_temp1_8x16b = _mm_cmpgt_epi8(tmp_set_128i_1, cmp_mask);
808
                //if the values >16 then put ff ,cmp_mask = dup16(15) row 2 and  row 3
809
610
                src_temp3_8x16b = _mm_cmpgt_epi8(tmp_set_128i_3, cmp_mask);
810
811
                //values 16 to 31 for row 0 & 1 but values <16 ==0
812
610
                tmp_set_128i_2 = _mm_and_si128(tmp_set_128i_1, src_temp1_8x16b);
813
                // values 0 to 15 for row 0 & 1
814
610
                tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, src_temp1_8x16b);
815
                //values 16 to 31 for row 2 & 3 but values <16 ==0
816
610
                tmp_set_128i_4 = _mm_and_si128(tmp_set_128i_3, src_temp3_8x16b);
817
                // values 0 to 15 for row 2 & 3
818
610
                tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, src_temp3_8x16b);
819
820
                //values 16 to 31 for row 0 & 1 but values <16 masked to ff row 0 and row1
821
610
                src_temp1_8x16b = _mm_cmpeq_epi8(tmp_set_128i_2, sao_offset);
822
                //values 16 to 31 for row 0 & 1 but values <16 masked to ff row 2 and  row 3
823
610
                src_temp3_8x16b = _mm_cmpeq_epi8(tmp_set_128i_4, sao_offset);
824
610
                tmp_set_128i_2 = _mm_or_si128(tmp_set_128i_2, src_temp1_8x16b);
825
610
                tmp_set_128i_4 = _mm_or_si128(tmp_set_128i_4, src_temp3_8x16b);
826
827
828
                //to choose which pixel values to preserve in row 0 and row 1
829
610
                src_temp1_8x16b = _mm_cmpeq_epi8(tmp_set_128i_1, tmp_set_128i_2);
830
                //to choose which pixel values to preserve in row 2 and row 3
831
610
                src_temp3_8x16b = _mm_cmpeq_epi8(tmp_set_128i_3, tmp_set_128i_4);
832
                //values of all rows to which no offset needs to be added preserved.
833
610
                src_temp0_8x16b = _mm_and_si128(src_temp0_8x16b, src_temp1_8x16b);
834
610
                src_temp2_8x16b = _mm_and_si128(src_temp2_8x16b, src_temp3_8x16b);
835
836
                //indexing 0 - 15 bandtable indexes
837
610
                tmp_set_128i_1 = _mm_shuffle_epi8(band_table0_16x8b, tmp_set_128i_1); //U low
838
610
                tmp_set_128i_3 = _mm_shuffle_epi8(band_table1_16x8b, tmp_set_128i_3); //V low
839
                //indexing 16 -31 bandtable indexes
840
610
                tmp_set_128i_2 = _mm_shuffle_epi8(band_table2_16x8b, tmp_set_128i_2); //U high
841
610
                tmp_set_128i_4 = _mm_shuffle_epi8(band_table3_16x8b, tmp_set_128i_4); //V high
842
                // combining all offsets results
843
610
                tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2); //U
844
610
                tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4); //V
845
                // combining results with the pixel values
846
610
                src_temp0_8x16b = _mm_or_si128(src_temp0_8x16b, tmp_set_128i_1);
847
610
                src_temp2_8x16b = _mm_or_si128(src_temp2_8x16b, tmp_set_128i_3);
848
                //reorganising even and odd values
849
610
                src_temp1_8x16b = _mm_unpacklo_epi8(src_temp0_8x16b, src_temp2_8x16b);
850
610
                src_temp3_8x16b = _mm_unpackhi_epi8(src_temp0_8x16b, src_temp2_8x16b);
851
                //Getting row1 separately
852
610
                src_temp0_8x16b = _mm_srli_si128(src_temp1_8x16b, 8);
853
                //Getting row3 separately
854
610
                src_temp2_8x16b = _mm_srli_si128(src_temp3_8x16b, 8);
855
856
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
857
610
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp1_8x16b);
858
                // row = 1
859
610
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), src_temp0_8x16b);
860
                // row = 2
861
610
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_temp3_8x16b);
862
                // row = 3
863
610
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), src_temp2_8x16b);
864
865
610
                pu1_src_cpy += (src_strd << 2);
866
867
610
            }
868
541
            pu1_src += 16;
869
541
        }
870
871
872
301k
    }
873
301k
}
874
875
876
877
void ihevc_sao_edge_offset_class0_ssse3(UWORD8 *pu1_src,
878
                                        WORD32 src_strd,
879
                                        UWORD8 *pu1_src_left,
880
                                        UWORD8 *pu1_src_top,
881
                                        UWORD8 *pu1_src_top_left,
882
                                        UWORD8 *pu1_src_top_right,
883
                                        UWORD8 *pu1_src_bot_left,
884
                                        UWORD8 *pu1_avail,
885
                                        WORD8 *pi1_sao_offset,
886
                                        WORD32 wd,
887
                                        WORD32 ht)
888
106k
{
889
106k
    WORD32 row, col;
890
106k
    UWORD8 *pu1_src_cpy, *pu1_src_left_cpy, *pu1_src_left_str, *pu1_left_tmp;
891
106k
    UWORD8 au1_mask[MAX_CTB_SIZE], *au1_mask_cpy;
892
106k
    UWORD8 au1_src_left_tmp[MAX_CTB_SIZE + 8];
893
106k
    UWORD8 au1_src_left_tmp1[MAX_CTB_SIZE + 8];
894
106k
    UWORD8 u1_avail0, u1_avail1;
895
106k
    WORD32 wd_rem;
896
106k
    WORD32 offset = 0;
897
106k
    __m128i src_temp0_16x8b, src_temp1_16x8b;
898
106k
    __m128i left0_16x8b, left1_16x8b;
899
106k
    __m128i cmp_gt0_16x8b, cmp_lt0_16x8b, cmp_gt1_16x8b, cmp_lt1_16x8b;
900
106k
    __m128i edge0_16x8b, edge1_16x8b;
901
106k
    __m128i au1_mask8x16b;
902
106k
    __m128i edge_idx_8x16b, sao_offset_8x16b;
903
106k
    __m128i const2_16x8b, const0_16x8b;
904
106k
    __m128i left_store_16x8b;
905
106k
    UNUSED(pu1_src_top_right);
906
106k
    UNUSED(pu1_src_bot_left);
907
908
106k
    au1_mask8x16b = _mm_set1_epi8(0xff);
909
910
    /* Update  top and top-left arrays */
911
912
106k
    *pu1_src_top_left = pu1_src_top[wd - 1];
913
914
282k
    for(col = wd; col >= 16; col -= 16)
915
176k
    {
916
176k
        const0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + offset + (ht - 1) * src_strd));
917
176k
        _mm_storeu_si128((__m128i *)(pu1_src_top + offset), const0_16x8b);
918
176k
        offset += 16;
919
176k
    }
920
921
    //setting availability mask to ff size MAX_CTB_SIZE
922
530k
    for(col = 0; col < MAX_CTB_SIZE; col += 16)
923
424k
        _mm_storeu_si128((__m128i *)(au1_mask + col), au1_mask8x16b);
924
3.49M
    for(row = 0; row < ht; row++)
925
3.39M
    {
926
3.39M
        au1_src_left_tmp[row] = pu1_src_left[row];
927
3.39M
    }
928
106k
    edge_idx_8x16b   = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
929
106k
    sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset);
930
931
    //availability mask creation
932
106k
    u1_avail0 = pu1_avail[0];
933
106k
    u1_avail1 = pu1_avail[1];
934
106k
    au1_mask[0] = u1_avail0;
935
106k
    au1_mask[wd - 1] = u1_avail1;
936
937
106k
    const2_16x8b = _mm_set1_epi8(2);
938
106k
    const0_16x8b = _mm_setzero_si128();
939
106k
    pu1_src_left_cpy = au1_src_left_tmp;
940
106k
    pu1_src_left_str = au1_src_left_tmp1;
941
106k
    {
942
106k
        au1_mask_cpy = au1_mask;
943
282k
        for(col = wd; col >= 16; col -= 16)
944
176k
        {
945
176k
            pu1_src_cpy = pu1_src;
946
176k
            au1_mask8x16b = _mm_loadu_si128((__m128i *)au1_mask_cpy);
947
            //pu1_src_left_cpy =au1_src_left_tmp;
948
2.98M
            for(row = ht; row > 0; row -= 2)
949
2.81M
            {
950
951
2.81M
                left_store_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_left_cpy));
952
                //row = 0 load 8 pixel values from 7:0 pos. relative to cur. pos.
953
2.81M
                src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
954
                // row = 1
955
2.81M
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
956
957
2.81M
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, left_store_16x8b, 2);
958
                //row 1 left
959
2.81M
                left1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, left_store_16x8b, 15);
960
2.81M
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp1_16x8b, 15);
961
                //row 0 left
962
2.81M
                left0_16x8b = _mm_alignr_epi8(src_temp0_16x8b, left_store_16x8b, 15);
963
2.81M
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
964
965
966
                //separating +ve and -ve values.
967
2.81M
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, left0_16x8b);
968
2.81M
                cmp_lt0_16x8b = _mm_subs_epu8(left0_16x8b, src_temp0_16x8b);
969
2.81M
                cmp_gt1_16x8b = _mm_subs_epu8(src_temp1_16x8b, left1_16x8b);
970
2.81M
                cmp_lt1_16x8b = _mm_subs_epu8(left1_16x8b, src_temp1_16x8b);
971
                //creating mask 00 for +ve and -ve values and FF for zero.
972
2.81M
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
973
2.81M
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
974
2.81M
                cmp_gt1_16x8b = _mm_cmpeq_epi8(cmp_gt1_16x8b, const0_16x8b);
975
2.81M
                cmp_lt1_16x8b = _mm_cmpeq_epi8(cmp_lt1_16x8b, const0_16x8b);
976
                //combining the appropriate sign change
977
2.81M
                left0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
978
2.81M
                left1_16x8b = _mm_sub_epi8(cmp_gt1_16x8b, cmp_lt1_16x8b);
979
980
                //row = 0 right
981
2.81M
                edge0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 1));
982
                // row = 1 right
983
2.81M
                edge1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 1));
984
                //separating +ve and -ve values.
985
2.81M
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, edge0_16x8b);
986
2.81M
                cmp_lt0_16x8b = _mm_subs_epu8(edge0_16x8b, src_temp0_16x8b);
987
2.81M
                cmp_gt1_16x8b = _mm_subs_epu8(src_temp1_16x8b, edge1_16x8b);
988
2.81M
                cmp_lt1_16x8b = _mm_subs_epu8(edge1_16x8b, src_temp1_16x8b);
989
                //creating mask 00 for +ve and -ve values and FF for zero.
990
2.81M
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
991
2.81M
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
992
2.81M
                cmp_gt1_16x8b = _mm_cmpeq_epi8(cmp_gt1_16x8b, const0_16x8b);
993
2.81M
                cmp_lt1_16x8b = _mm_cmpeq_epi8(cmp_lt1_16x8b, const0_16x8b);
994
                //combining the appropriate sign change
995
2.81M
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
996
2.81M
                edge1_16x8b = _mm_sub_epi8(cmp_gt1_16x8b, cmp_lt1_16x8b);
997
998
                //combining sign-left and sign_right
999
2.81M
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, left0_16x8b);
1000
2.81M
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, left1_16x8b);
1001
                //adding constant 2
1002
2.81M
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
1003
2.81M
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
1004
                //shuffle to get sao index
1005
2.81M
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
1006
2.81M
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
1007
                //using availability mask
1008
2.81M
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
1009
2.81M
                edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
1010
1011
                //shuffle to get sao offset
1012
2.81M
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
1013
2.81M
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
1014
                //convert to 16 bit, then add, then saturated pack
1015
2.81M
                left0_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
1016
2.81M
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
1017
2.81M
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, left0_16x8b);
1018
2.81M
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
1019
2.81M
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
1020
2.81M
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, left0_16x8b);
1021
2.81M
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
1022
2.81M
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
1023
1024
2.81M
                left0_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
1025
2.81M
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
1026
2.81M
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, left0_16x8b);
1027
2.81M
                src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
1028
2.81M
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
1029
2.81M
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, left0_16x8b);
1030
2.81M
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
1031
2.81M
                src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
1032
1033
1034
2.81M
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
1035
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
1036
2.81M
                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
1037
                // row = 1
1038
2.81M
                _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);
1039
1040
2.81M
                pu1_src_cpy += (src_strd << 1);
1041
2.81M
                pu1_src_left_cpy += 2;
1042
2.81M
                pu1_src_left_str += 2;
1043
2.81M
            }
1044
176k
            au1_mask_cpy += 16;
1045
176k
            pu1_src += 16;
1046
176k
            pu1_src_left_cpy -= ht;
1047
176k
            pu1_src_left_str -= ht;
1048
1049
176k
            pu1_left_tmp = pu1_src_left_cpy;
1050
176k
            pu1_src_left_cpy = pu1_src_left_str;
1051
176k
            pu1_src_left_str = pu1_left_tmp;
1052
176k
        }
1053
1054
106k
        wd_rem = wd & 0xF;
1055
106k
        if(wd_rem)
1056
99.1k
        {
1057
1058
99.1k
            cmp_gt1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + (ht - 1) * src_strd));
1059
99.1k
            _mm_storel_epi64((__m128i *)(pu1_src_top + offset), cmp_gt1_16x8b);
1060
1061
99.1k
            au1_mask8x16b = _mm_loadl_epi64((__m128i *)au1_mask_cpy);
1062
99.1k
            pu1_src_cpy = pu1_src;
1063
99.1k
            au1_mask8x16b = _mm_unpacklo_epi64(au1_mask8x16b, au1_mask8x16b);
1064
            //pu1_src_left_cpy =au1_src_left_tmp;
1065
890k
            for(row = ht; row > 0; row -= 4)
1066
791k
            {
1067
791k
                left_store_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_left_cpy));
1068
                //row = 0 load 8 pixel values from 7:0 pos. relative to cur. pos.
1069
791k
                src_temp0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
1070
                // row = 1
1071
791k
                cmp_gt0_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
1072
                // row  = 2
1073
791k
                src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));
1074
                // row = 3
1075
791k
                cmp_gt1_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd));
1076
1077
1078
791k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, left_store_16x8b, 4);
1079
                //row 3 left
1080
791k
                edge0_16x8b = _mm_slli_si128(cmp_gt1_16x8b, 8);
1081
791k
                cmp_lt1_16x8b = _mm_alignr_epi8(cmp_gt1_16x8b, left_store_16x8b, 15);
1082
791k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 15);
1083
                //row 2 left
1084
791k
                edge0_16x8b = _mm_slli_si128(src_temp1_16x8b, 8);
1085
791k
                left1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, left_store_16x8b, 15);
1086
791k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 15);
1087
                //row 1 left
1088
791k
                edge0_16x8b = _mm_slli_si128(cmp_gt0_16x8b, 8);
1089
791k
                cmp_lt0_16x8b = _mm_alignr_epi8(cmp_gt0_16x8b, left_store_16x8b, 15);
1090
791k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 15);
1091
                //row 0 left
1092
791k
                edge0_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
1093
791k
                left0_16x8b = _mm_alignr_epi8(src_temp0_16x8b, left_store_16x8b, 15);
1094
791k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 15);
1095
1096
                // packing rows together for 16 SIMD operations
1097
791k
                src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, cmp_gt0_16x8b);
1098
791k
                src_temp1_16x8b = _mm_unpacklo_epi64(src_temp1_16x8b, cmp_gt1_16x8b);
1099
                // packing rows together for 16 SIMD operations
1100
791k
                left0_16x8b = _mm_unpacklo_epi64(left0_16x8b, cmp_lt0_16x8b);
1101
791k
                left1_16x8b = _mm_unpacklo_epi64(left1_16x8b, cmp_lt1_16x8b);
1102
1103
                //separating +ve and -ve values.
1104
791k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, left0_16x8b);
1105
791k
                cmp_lt0_16x8b = _mm_subs_epu8(left0_16x8b, src_temp0_16x8b);
1106
791k
                cmp_gt1_16x8b = _mm_subs_epu8(src_temp1_16x8b, left1_16x8b);
1107
791k
                cmp_lt1_16x8b = _mm_subs_epu8(left1_16x8b, src_temp1_16x8b);
1108
                //creating mask 00 for +ve and -ve values and FF for zero.
1109
791k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
1110
791k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
1111
791k
                cmp_gt1_16x8b = _mm_cmpeq_epi8(cmp_gt1_16x8b, const0_16x8b);
1112
791k
                cmp_lt1_16x8b = _mm_cmpeq_epi8(cmp_lt1_16x8b, const0_16x8b);
1113
                //combining the appropriate sign change
1114
791k
                left0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
1115
791k
                left1_16x8b = _mm_sub_epi8(cmp_gt1_16x8b, cmp_lt1_16x8b);
1116
1117
                //row = 0 right
1118
791k
                edge0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 1));
1119
                // row = 1 right
1120
791k
                cmp_gt0_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd + 1));
1121
                // row = 2 right
1122
791k
                edge1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd + 1));
1123
                // row = 3 right
1124
791k
                cmp_gt1_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd + 1));
1125
                // packing rows together for 16 SIMD operations
1126
791k
                edge0_16x8b = _mm_unpacklo_epi64(edge0_16x8b, cmp_gt0_16x8b);
1127
791k
                edge1_16x8b = _mm_unpacklo_epi64(edge1_16x8b, cmp_gt1_16x8b);
1128
1129
                //separating +ve and and -ve values.
1130
791k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, edge0_16x8b);
1131
791k
                cmp_lt0_16x8b = _mm_subs_epu8(edge0_16x8b, src_temp0_16x8b);
1132
791k
                cmp_gt1_16x8b = _mm_subs_epu8(src_temp1_16x8b, edge1_16x8b);
1133
791k
                cmp_lt1_16x8b = _mm_subs_epu8(edge1_16x8b, src_temp1_16x8b);
1134
                //creating mask 00 for +ve and -ve values and FF for zero.
1135
791k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
1136
791k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
1137
791k
                cmp_gt1_16x8b = _mm_cmpeq_epi8(cmp_gt1_16x8b, const0_16x8b);
1138
791k
                cmp_lt1_16x8b = _mm_cmpeq_epi8(cmp_lt1_16x8b, const0_16x8b);
1139
                //combining the appropriate sign change
1140
791k
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
1141
791k
                edge1_16x8b = _mm_sub_epi8(cmp_gt1_16x8b, cmp_lt1_16x8b);
1142
1143
                //combining sign-left and sign_right
1144
791k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, left0_16x8b);
1145
791k
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, left1_16x8b);
1146
                //adding constant 2
1147
791k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
1148
791k
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
1149
                //shuffle to get sao index
1150
791k
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
1151
791k
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
1152
                //shuffle to get sao offset
1153
                //using availability mask
1154
791k
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
1155
791k
                edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
1156
1157
791k
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
1158
791k
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
1159
                //cnvert to 16 bit then add and then saturated pack
1160
791k
                left0_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
1161
791k
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
1162
791k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, left0_16x8b);
1163
791k
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
1164
791k
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
1165
791k
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, left0_16x8b);
1166
791k
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
1167
791k
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
1168
1169
791k
                left0_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
1170
791k
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
1171
791k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, left0_16x8b);
1172
791k
                src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
1173
791k
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
1174
791k
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, left0_16x8b);
1175
791k
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
1176
791k
                src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
1177
                //separting row 1 and row 3
1178
791k
                cmp_lt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
1179
791k
                cmp_lt1_16x8b = _mm_srli_si128(src_temp1_16x8b, 8);
1180
1181
791k
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
1182
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
1183
791k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
1184
                // row = 1
1185
791k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_lt0_16x8b);
1186
                // row = 2
1187
791k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_temp1_16x8b);
1188
                // row = 3
1189
791k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt1_16x8b);
1190
1191
791k
                pu1_src_cpy += (src_strd << 2);
1192
791k
                pu1_src_left_cpy += 4;
1193
791k
                pu1_src_left_str += 4;
1194
791k
            }
1195
99.1k
            pu1_src += wd;
1196
99.1k
            pu1_src_left_cpy -= ht;
1197
99.1k
            pu1_src_left_str -= ht;
1198
1199
99.1k
            pu1_left_tmp = pu1_src_left_cpy;
1200
99.1k
            pu1_src_left_cpy = pu1_src_left_str;
1201
99.1k
            pu1_src_left_str = pu1_left_tmp;
1202
99.1k
        }
1203
3.49M
        for(row = 0; row < ht; row++)
1204
3.39M
        {
1205
3.39M
            pu1_src_left[row] = pu1_src_left_cpy[row];
1206
3.39M
        }
1207
106k
    }
1208
106k
}
1209
1210
1211
void ihevc_sao_edge_offset_class0_chroma_ssse3(UWORD8 *pu1_src,
1212
                                               WORD32 src_strd,
1213
                                               UWORD8 *pu1_src_left,
1214
                                               UWORD8 *pu1_src_top,
1215
                                               UWORD8 *pu1_src_top_left,
1216
                                               UWORD8 *pu1_src_top_right,
1217
                                               UWORD8 *pu1_src_bot_left,
1218
                                               UWORD8 *pu1_avail,
1219
                                               WORD8 *pi1_sao_offset_u,
1220
                                               WORD8 *pi1_sao_offset_v,
1221
                                               WORD32 wd,
1222
                                               WORD32 ht)
1223
71.3k
{
1224
71.3k
    WORD32 row, col;
1225
71.3k
    UWORD8 *pu1_src_cpy, *pu1_src_left_cpy, *pu1_src_left_str, *pu1_left_tmp;
1226
71.3k
    UWORD8 au1_mask[MAX_CTB_SIZE], *au1_mask_cpy;
1227
71.3k
    UWORD8 au1_src_left_tmp[2 * (MAX_CTB_SIZE + 8)];
1228
71.3k
    UWORD8 au1_src_left_tmp1[2 * (MAX_CTB_SIZE + 8)];
1229
71.3k
    UWORD8 u1_avail0, u1_avail1;
1230
71.3k
    WORD32 wd_rem;
1231
71.3k
    WORD32 offset = 0;
1232
1233
71.3k
    __m128i src_temp0_16x8b, src_temp1_16x8b;
1234
71.3k
    __m128i left0_16x8b, left1_16x8b;
1235
71.3k
    __m128i cmp_gt0_16x8b, cmp_lt0_16x8b;
1236
71.3k
    __m128i edge0_16x8b, edge1_16x8b;
1237
71.3k
    __m128i au1_mask8x16b;
1238
71.3k
    __m128i edge_idx_8x16b, sao_offset_8x16b;
1239
71.3k
    __m128i const2_16x8b, const0_16x8b;
1240
71.3k
    __m128i left_store_16x8b;
1241
71.3k
    __m128i chroma_offset_8x16b;
1242
71.3k
    UNUSED(pu1_src_top_right);
1243
71.3k
    UNUSED(pu1_src_bot_left);
1244
1245
71.3k
    au1_mask8x16b = _mm_set1_epi8(0xff);
1246
1247
    /* Update  top and top-left arrays */
1248
71.3k
    pu1_src_top_left[0] = pu1_src_top[wd - 2];
1249
71.3k
    pu1_src_top_left[1] = pu1_src_top[wd - 1];;
1250
1251
222k
    for(col = wd; col >= 16; col -= 16)
1252
150k
    {
1253
150k
        const0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + offset + (ht - 1) * src_strd));
1254
150k
        _mm_storeu_si128((__m128i *)(pu1_src_top + offset), const0_16x8b);
1255
150k
        offset += 16;
1256
150k
    }
1257
2.34M
    for(row = 0; row < 2 * ht; row++)
1258
2.27M
    {
1259
2.27M
        au1_src_left_tmp[row] = pu1_src_left[row];
1260
2.27M
    }
1261
    //setting availability mask to ff size MAX_CTB_SIZE
1262
356k
    for(col = 0; col < MAX_CTB_SIZE; col += 16)
1263
285k
        _mm_storeu_si128((__m128i *)(au1_mask + col), au1_mask8x16b);
1264
1265
71.3k
    edge_idx_8x16b   = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
1266
71.3k
    sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_u);
1267
71.3k
    const0_16x8b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_v);
1268
71.3k
    chroma_offset_8x16b = _mm_set1_epi16(0x0800);
1269
    //availability mask creation
1270
71.3k
    u1_avail0 = pu1_avail[0];
1271
71.3k
    u1_avail1 = pu1_avail[1];
1272
71.3k
    au1_mask[0] = u1_avail0;
1273
71.3k
    au1_mask[1] = u1_avail0;
1274
71.3k
    au1_mask[wd - 1] = u1_avail1;
1275
71.3k
    au1_mask[wd - 2] = u1_avail1;
1276
71.3k
    sao_offset_8x16b = _mm_unpacklo_epi64(sao_offset_8x16b, const0_16x8b);
1277
71.3k
    const2_16x8b = _mm_set1_epi8(2);
1278
71.3k
    const0_16x8b = _mm_setzero_si128();
1279
1280
71.3k
    {
1281
71.3k
        pu1_src_left_cpy = au1_src_left_tmp;
1282
71.3k
        pu1_src_left_str = au1_src_left_tmp1;
1283
71.3k
        au1_mask_cpy = au1_mask;
1284
222k
        for(col = wd; col >= 16; col -= 16)
1285
150k
        {
1286
150k
            pu1_src_cpy = pu1_src;
1287
150k
            au1_mask8x16b = _mm_loadu_si128((__m128i *)au1_mask_cpy);
1288
1289
1.35M
            for(row = ht; row > 0; row -= 2)
1290
1.20M
            {
1291
1292
1.20M
                left_store_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_left_cpy));
1293
                //row = 0 load 8 pixel values from 7:0 pos. relative to cur. pos.
1294
1.20M
                src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
1295
                // row = 1
1296
1.20M
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
1297
1298
1.20M
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, left_store_16x8b, 4);
1299
                //row 1 left
1300
1.20M
                left1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, left_store_16x8b, 14);
1301
1.20M
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp1_16x8b, 14);
1302
                //row 0 left
1303
1.20M
                left0_16x8b = _mm_alignr_epi8(src_temp0_16x8b, left_store_16x8b, 14);
1304
1.20M
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
1305
1306
1307
                //separating +ve and and -ve values.row 0 left
1308
1.20M
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, left0_16x8b);
1309
1.20M
                cmp_lt0_16x8b = _mm_subs_epu8(left0_16x8b, src_temp0_16x8b);
1310
                //creating mask 00 for +ve and -ve values and FF for zero.
1311
1.20M
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
1312
1.20M
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
1313
                //combining the appropriate sign change
1314
1.20M
                left0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
1315
1316
                //separating +ve and and -ve values.row 1 left
1317
1.20M
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, left1_16x8b);
1318
1.20M
                cmp_lt0_16x8b = _mm_subs_epu8(left1_16x8b, src_temp1_16x8b);
1319
                //creating mask 00 for +ve and -ve values and FF for zero.
1320
1.20M
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
1321
1.20M
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
1322
                //combining the appropriate sign change
1323
1.20M
                left1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
1324
1325
1326
                //row = 0 right
1327
1.20M
                edge0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2));
1328
                // row = 1 right
1329
1.20M
                edge1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 2));
1330
                //separating +ve and and -ve values.row 0 right
1331
1.20M
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, edge0_16x8b);
1332
1.20M
                cmp_lt0_16x8b = _mm_subs_epu8(edge0_16x8b, src_temp0_16x8b);
1333
                //creating mask 00 for +ve and -ve values and FF for zero.
1334
1.20M
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
1335
1.20M
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
1336
                //combining the appropriate sign change
1337
1.20M
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
1338
1339
                //separating +ve and and -ve values.row 1 right
1340
1.20M
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, edge1_16x8b);
1341
1.20M
                cmp_lt0_16x8b = _mm_subs_epu8(edge1_16x8b, src_temp1_16x8b);
1342
                //creating mask 00 for +ve and -ve values and FF for zero.
1343
1.20M
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
1344
1.20M
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
1345
                //combining the appropriate sign change
1346
1.20M
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
1347
1348
                //combining sign-left and sign_right
1349
1.20M
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, left0_16x8b);
1350
1.20M
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, left1_16x8b);
1351
                //adding constant 2
1352
1.20M
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
1353
1.20M
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
1354
                //shuffle to get sao index
1355
1.20M
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
1356
1.20M
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
1357
                //using availability mask
1358
1.20M
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
1359
1.20M
                edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
1360
                //adding chroma offset to access U and V
1361
1.20M
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
1362
1.20M
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);
1363
1364
                //shuffle to get sao offset
1365
1.20M
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
1366
1.20M
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
1367
                //cnvert to 16 bit then add and then saturated pack
1368
1.20M
                left0_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
1369
1.20M
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
1370
1.20M
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, left0_16x8b);
1371
1.20M
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
1372
1.20M
                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, left0_16x8b);
1373
1.20M
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
1374
1.20M
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
1375
1.20M
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
1376
1377
1.20M
                left0_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
1378
1.20M
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
1379
1.20M
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, left0_16x8b);
1380
1.20M
                src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
1381
1.20M
                edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, left0_16x8b);
1382
1.20M
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
1383
1.20M
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge1_16x8b);
1384
1.20M
                src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
1385
1386
1.20M
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
1387
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
1388
1.20M
                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
1389
                // row = 1
1390
1.20M
                _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);
1391
1392
1.20M
                pu1_src_cpy += (src_strd << 1);
1393
1.20M
                pu1_src_left_cpy += 4;
1394
1.20M
                pu1_src_left_str += 4;
1395
1.20M
            }
1396
150k
            au1_mask_cpy += 16;
1397
150k
            pu1_src += 16;
1398
150k
            pu1_src_left_cpy -= 2 * ht;
1399
150k
            pu1_src_left_str -= 2 * ht;
1400
1401
150k
            pu1_left_tmp = pu1_src_left_cpy;
1402
150k
            pu1_src_left_cpy = pu1_src_left_str;
1403
150k
            pu1_src_left_str = pu1_left_tmp;
1404
150k
        }
1405
1406
71.3k
        wd_rem = wd & 0xF;
1407
71.3k
        if(wd_rem)
1408
228
        {
1409
1410
228
            cmp_gt0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + (ht - 1) * src_strd));
1411
228
            _mm_storel_epi64((__m128i *)(pu1_src_top + offset), cmp_gt0_16x8b);
1412
1413
228
            au1_mask8x16b = _mm_loadl_epi64((__m128i *)au1_mask_cpy);
1414
228
            pu1_src_cpy = pu1_src;
1415
228
            au1_mask8x16b = _mm_unpacklo_epi64(au1_mask8x16b, au1_mask8x16b);
1416
1417
700
            for(row = ht; row > 0; row -= 4)
1418
472
            {
1419
472
                left_store_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_left_cpy));
1420
                //row = 0 load 8 pixel values from 7:0 pos. relative to cur. pos.
1421
472
                src_temp0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
1422
                // row = 1
1423
472
                cmp_gt0_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
1424
                // row  = 2
1425
472
                src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));
1426
                // row = 3
1427
472
                cmp_lt0_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd));
1428
1429
1430
472
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, left_store_16x8b, 8);
1431
                //row 3 left
1432
472
                edge0_16x8b = _mm_slli_si128(cmp_lt0_16x8b, 8);
1433
472
                left0_16x8b = _mm_alignr_epi8(cmp_lt0_16x8b, left_store_16x8b, 14);
1434
472
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 14);
1435
                //row 2 left
1436
472
                edge0_16x8b = _mm_slli_si128(src_temp1_16x8b, 8);
1437
472
                left1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, left_store_16x8b, 14);
1438
472
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 14);
1439
1440
1441
                // packing rows together for 16 SIMD operations
1442
472
                src_temp1_16x8b = _mm_unpacklo_epi64(src_temp1_16x8b, cmp_lt0_16x8b);
1443
472
                left1_16x8b = _mm_unpacklo_epi64(left1_16x8b, left0_16x8b);
1444
1445
                //row 1 left
1446
472
                edge0_16x8b = _mm_slli_si128(cmp_gt0_16x8b, 8);
1447
472
                edge1_16x8b = _mm_alignr_epi8(cmp_gt0_16x8b, left_store_16x8b, 14);
1448
472
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 14);
1449
                //row 0 left
1450
472
                edge0_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
1451
472
                left0_16x8b = _mm_alignr_epi8(src_temp0_16x8b, left_store_16x8b, 14);
1452
472
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 14);
1453
                // packing rows together for 16 SIMD operations
1454
472
                src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, cmp_gt0_16x8b);
1455
472
                left0_16x8b = _mm_unpacklo_epi64(left0_16x8b, edge1_16x8b);
1456
1457
                //separating +ve and and -ve values.for row 2 and row 3
1458
472
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, left1_16x8b);
1459
472
                cmp_lt0_16x8b = _mm_subs_epu8(left1_16x8b, src_temp1_16x8b);
1460
                //creating mask 00 for +ve and -ve values and FF for zero.
1461
472
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
1462
472
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
1463
                //combining the appropriate sign change
1464
472
                left1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
1465
1466
1467
1468
1469
1470
                //separating +ve and and -ve values.
1471
472
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, left0_16x8b);
1472
472
                cmp_lt0_16x8b = _mm_subs_epu8(left0_16x8b, src_temp0_16x8b);
1473
                //creating mask 00 for +ve and -ve values and FF for zero.
1474
472
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
1475
472
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
1476
                //combining the appropriate sign change
1477
472
                left0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
1478
1479
1480
                //row = 0 right
1481
472
                edge0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2));
1482
                // row = 1 right
1483
472
                cmp_gt0_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd + 2));
1484
                // row = 2 right
1485
472
                edge1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd + 2));
1486
                // row = 3 right
1487
472
                cmp_lt0_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd + 2));
1488
                // packing rows together for 16 SIMD operations
1489
472
                edge0_16x8b = _mm_unpacklo_epi64(edge0_16x8b, cmp_gt0_16x8b);
1490
472
                edge1_16x8b = _mm_unpacklo_epi64(edge1_16x8b, cmp_lt0_16x8b);
1491
1492
                //separating +ve and and -ve values.
1493
472
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, edge0_16x8b);
1494
472
                cmp_lt0_16x8b = _mm_subs_epu8(edge0_16x8b, src_temp0_16x8b);
1495
                //creating mask 00 for +ve and -ve values and FF for zero.
1496
472
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
1497
472
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
1498
                //combining the appropriate sign change
1499
472
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
1500
1501
472
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, edge1_16x8b);
1502
472
                cmp_lt0_16x8b = _mm_subs_epu8(edge1_16x8b, src_temp1_16x8b);
1503
                //creating mask 00 for +ve and -ve values and FF for zero.
1504
472
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
1505
472
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
1506
                //combining the appropriate sign change
1507
472
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
1508
1509
                //combining sign-left and sign_right
1510
472
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, left0_16x8b);
1511
472
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, left1_16x8b);
1512
                //adding constant 2
1513
472
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
1514
472
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
1515
                //shuffle to get sao index
1516
472
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
1517
472
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
1518
                //shuffle to get sao offset
1519
                //using availability mask
1520
472
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
1521
472
                edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
1522
                //adding chroma offset to access U and V
1523
472
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
1524
472
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);
1525
1526
472
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
1527
472
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
1528
                //cnvert to 16 bit then add and then saturated pack
1529
472
                left0_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
1530
472
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
1531
472
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, left0_16x8b);
1532
472
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
1533
472
                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, left0_16x8b);
1534
472
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
1535
472
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
1536
472
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
1537
1538
472
                left0_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
1539
472
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
1540
472
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, left0_16x8b);
1541
472
                src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
1542
472
                edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, left0_16x8b);
1543
472
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
1544
472
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge1_16x8b);
1545
472
                src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
1546
1547
                //seaprting row 1 and row 3
1548
472
                cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
1549
472
                cmp_lt0_16x8b = _mm_srli_si128(src_temp1_16x8b, 8);
1550
1551
472
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
1552
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
1553
472
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
1554
                // row = 1
1555
472
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
1556
                // row = 2
1557
472
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_temp1_16x8b);
1558
                // row = 3
1559
472
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b);
1560
1561
472
                pu1_src_cpy += (src_strd << 2);
1562
472
                pu1_src_left_cpy += 8;
1563
472
                pu1_src_left_str += 8;
1564
472
            }
1565
228
            pu1_src += wd;
1566
228
            pu1_src_left_cpy -= 2 * ht;
1567
228
            pu1_src_left_str -= 2 * ht;
1568
1569
228
            pu1_left_tmp = pu1_src_left_cpy;
1570
228
            pu1_src_left_cpy = pu1_src_left_str;
1571
228
            pu1_src_left_str = pu1_left_tmp;
1572
228
        }
1573
2.34M
        for(row = 0; row < 2 * ht; row++)
1574
2.27M
        {
1575
2.27M
            pu1_src_left[row] = pu1_src_left_cpy[row];
1576
2.27M
        }
1577
71.3k
    }
1578
1579
71.3k
}
1580
1581
1582
/**
 * Applies SAO edge offset of class 1 (vertical direction) to a luma block.
 *
 * For each pixel, the sign of the difference against the pixel directly
 * above and directly below is computed; the two signs select an edge index
 * (via gi1_table_edge_idx) which in turn selects an offset from
 * pi1_sao_offset that is added to the pixel with unsigned 8-bit saturation.
 * Columns are processed 16 at a time with full-width SSE registers, a
 * trailing wd&0xF remainder 8 at a time (two/four rows packed per register).
 *
 * pu1_src            : pointer to the block to be filtered (modified in place)
 * src_strd           : source buffer stride
 * pu1_src_left       : left-column context, updated for the next CTB
 * pu1_src_top        : top-row context, updated for the next CTB row
 * pu1_src_top_left   : top-left context pixel, updated here
 * pu1_src_top_right  : unused for class 1
 * pu1_src_bot_left   : unused for class 1
 * pu1_avail          : neighbour availability flags; [2] = top row valid,
 *                      [3] = bottom row valid
 * pi1_sao_offset     : the 5 SAO offsets indexed by edge class
 * wd, ht             : block width and height in pixels
 */
void ihevc_sao_edge_offset_class1_ssse3(UWORD8 *pu1_src,
                                        WORD32 src_strd,
                                        UWORD8 *pu1_src_left,
                                        UWORD8 *pu1_src_top,
                                        UWORD8 *pu1_src_top_left,
                                        UWORD8 *pu1_src_top_right,
                                        UWORD8 *pu1_src_bot_left,
                                        UWORD8 *pu1_avail,
                                        WORD8 *pi1_sao_offset,
                                        WORD32 wd,
                                        WORD32 ht)
{
    WORD32 row, col;
    UWORD8 *pu1_src_top_cpy;
    UWORD8 *pu1_src_cpy;
    WORD32 wd_rem;


    __m128i src_top_16x8b, src_bottom_16x8b;
    __m128i src_temp0_16x8b, src_temp1_16x8b;
    __m128i signup0_16x8b, signdwn1_16x8b;
    __m128i cmp_gt0_16x8b, cmp_lt0_16x8b;
    __m128i edge0_16x8b, edge1_16x8b;
    __m128i edge_idx_8x16b, sao_offset_8x16b;
    __m128i const2_16x8b, const0_16x8b;

    UNUSED(pu1_src_top_right);
    UNUSED(pu1_src_bot_left);


    /* Updating left and top-left context for the neighbouring CTB
       before the block is modified */
    for(row = 0; row < ht; row++)
    {
        pu1_src_left[row] = pu1_src[row * src_strd + (wd - 1)];
    }
    *pu1_src_top_left = pu1_src_top[wd - 1];



    pu1_src_top_cpy = pu1_src_top;
    edge_idx_8x16b   = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
    sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset);

    /* Update height and source pointers based on the availability flags:
       skip the first row when the top neighbour is unavailable and the
       last row when the bottom neighbour is unavailable */
    if(0 == pu1_avail[2])
    {
        pu1_src_top_cpy = pu1_src;
        pu1_src += src_strd;
        ht--;
    }
    if(0 == pu1_avail[3])
    {
        ht--;
    }

    const2_16x8b = _mm_set1_epi8(2);
    const0_16x8b = _mm_setzero_si128();

    {
        WORD32 ht_rem;
        /* Main path: 16 columns per iteration, 2 rows per inner iteration */
        for(col = wd; col >= 16; col -= 16)
        {
            pu1_src_cpy = pu1_src;
            src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
            //row = 0
            src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
            //separating +ve and -ve values.
            cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
            cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
            //creating mask 00 for +ve and -ve values and FF for zero.
            cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
            cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
            //combining the appropriate sign change: sign(row0 - top) per lane
            signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);

            for(row = ht; row >= 2; row -= 2)
            {

                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
                // row = 2
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));


                //row 0 -row1
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_temp1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_temp0_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
                //row1-row0 (same comparison reused with the sign flipped)
                edge1_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);

                //row1 -bottom
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);

                //combining sign-up and sign-down for each of the two rows
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, signdwn1_16x8b);

                //for the next iteration signup0_16x8b = -signdwn1_16x8b
                signup0_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);
                //adding constant 2 to bias edge sum into table range [0..4]
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
                //shuffle to get sao index
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
                //shuffle to get sao offset
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
                //copying the next top
                src_top_16x8b = src_temp1_16x8b;
                //convert to 16 bit then add and then saturated pack
                //(signdwn1 here is the sign-extension mask of the offsets)
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);

                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
                src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
                src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);

                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
                // row = 1
                _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);

                src_temp0_16x8b = src_bottom_16x8b;
                pu1_src_cpy += (src_strd << 1);
            }
            ht_rem = ht & 0x1;

            /* Leftover single row for odd ht */
            if(ht_rem)
            {
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
                //current row -next row
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
                //adding top and bottom and constant 2
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);

                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
                //copying the next top
                src_top_16x8b = src_temp0_16x8b;
                //convert to 16 bit then add and then saturated pack
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);

                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
            }
            if(0 == pu1_avail[3])
            {
                src_top_16x8b = src_bottom_16x8b;
            }
            //updating top flag
            _mm_storeu_si128((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
            pu1_src += 16;
        }

        /* Remainder path: 8-pixel-wide column strip, 4 rows per iteration
           packed two rows per 128-bit register (low/high 64-bit halves) */
        wd_rem = wd & 0xF;
        if(wd_rem)
        {
            pu1_src_cpy = pu1_src;
            src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_top_cpy + wd - col));
            //row = 0
            src_temp0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
            //separating +ve and -ve values.
            cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
            cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
            //creating mask 00 for +ve and -ve values and FF for zero.
            cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
            cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
            //combining the appropriate sign change; kept in the upper half so
            //it pairs with the next row's sign via alignr below
            signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
            signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
            for(row = ht; row >= 4; row -= 4)
            {
                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
                src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
                // row = 2
                src_bottom_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));

                //row 0 -row1
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_temp1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_temp0_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);

                //row1-row0
                edge1_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);
                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
                //row1 -row2
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
                edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
                //packing row 0 n row 1
                src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, src_temp1_16x8b);
                //row = 3
                src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd));
                // row = 4
                src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 4 * src_strd));

                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
                signdwn1_16x8b = _mm_slli_si128(signdwn1_16x8b, 8); //align left (1-2)
                //separating +ve and -ve values.(2,3)
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_top_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_bottom_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-3)

                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signdwn1_16x8b, 8); //(2-3) ,(1-2) (subtract with down)
                edge1_16x8b = _mm_slli_si128(edge1_16x8b, 8);
                //separating +ve and -ve values.(3,4)
                cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_top_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-4)
                //combining sign-up and sign-down
                edge1_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge1_16x8b, 8); //(3-4),(2-3)

                edge1_16x8b = _mm_sub_epi8(edge1_16x8b, signup0_16x8b); //(3,2)

                //packing row 2 n row 3
                src_bottom_16x8b = _mm_unpacklo_epi64(src_bottom_16x8b, src_top_16x8b);
                //for the next iteration signup0_16x8b = -signdwn1_16x8b
                signup0_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b); //(4-3)

                //adding constant 2
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
                //shuffle to get sao index
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
                //shuffle to get sao offset
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
                //the next top already in  src_top_16x8b
                //src_top_16x8b = src_temp1_16x8b;
                //convert to 16 bit then add and then saturated pack
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);

                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_bottom_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
                src_bottom_16x8b = _mm_unpackhi_epi8(src_bottom_16x8b, const0_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
                src_bottom_16x8b = _mm_add_epi16(src_bottom_16x8b, cmp_lt0_16x8b);
                src_bottom_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_bottom_16x8b);

                cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
                cmp_lt0_16x8b = _mm_srli_si128(src_bottom_16x8b, 8);
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
                // row = 1
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
                //row = 2
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_bottom_16x8b);
                // row = 3
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b);

                src_temp0_16x8b = src_temp1_16x8b;
                signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
                pu1_src_cpy += (src_strd << 2);

            }
            ht_rem = ht & 0x2;
            /* Leftover pair of rows */
            if(ht_rem)
            {

                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
                src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
                // row = 2
                src_bottom_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));

                //row 0 -row1
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_temp1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_temp0_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
                //row1-row0
                edge1_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);
                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
                //row1 -row2
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
                edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
                //adding top and down subtraction
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
                //for the next iteration signup0_16x8b = -signdwn1_16x8b
                signup0_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b); //(2-1) for next
                src_top_16x8b = src_temp1_16x8b;
                //adding constant 2
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);

                //shuffle to get sao index
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);

                //shuffle to get sao offset
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);

                //the next top already in  src_top_16x8b
                //convert to 16 bit then add and then saturated pack
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp1_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, src_temp1_16x8b);

                cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);

                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
                // row = 1
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
                src_temp0_16x8b = src_bottom_16x8b;
                pu1_src_cpy += (src_strd << 1);

            }
            ht_rem = ht & 0x1;
            /* Leftover single row */
            if(ht_rem)
            {

                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
                src_bottom_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));

                //row 0 -row1
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
                //adding top and down subtraction
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
                //adding constant 2
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
                //zero the unused upper 8 lanes before the table lookup
                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8);
                edge0_16x8b = _mm_srli_si128(edge0_16x8b, 8);
                //shuffle to get sao index
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
                //shuffle to get sao offset
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
                src_top_16x8b = src_temp0_16x8b;
                //convert to 16 bit then add and then saturated pack
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, const0_16x8b);
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
                pu1_src_cpy += (src_strd);

            }
            if(0 == pu1_avail[3])
            {
                src_top_16x8b = src_bottom_16x8b;
            }
            //updating top flag
            _mm_storel_epi64((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
            pu1_src += 8;
        }
    }
}
2016
2017
/**
*******************************************************************************
*
* @brief
*  SAO edge-offset filtering, class 1 (vertical / 90-degree edges), for
*  interleaved UV chroma. Each pixel is compared against its top and bottom
*  neighbours; the resulting edge index selects a signed offset which is
*  added to the pixel with unsigned-8-bit saturation.
*
* @param pu1_src            Pointer to the CTB's interleaved UV source pixels
*                           (filtered in place).
* @param src_strd           Source stride in bytes.
* @param pu1_src_left       Left-column context; updated here with this CTB's
*                           last two columns (one U, one V per row).
* @param pu1_src_top        Top-row context; updated here with this CTB's
*                           bottom row (or second-to-last, see pu1_avail[3]).
* @param pu1_src_top_left   Top-left context; updated with the two bytes at
*                           pu1_src_top[wd-2], pu1_src_top[wd-1].
* @param pu1_src_top_right  Unused for class 1.
* @param pu1_src_bot_left   Unused for class 1.
* @param pu1_avail          Neighbour-availability flags; [2] = top row
*                           available, [3] = bottom row available.
* @param pi1_sao_offset_u   5-entry signed offset table for U.
* @param pi1_sao_offset_v   5-entry signed offset table for V.
* @param wd                 Width in bytes (U and V interleaved, so 2x pixels).
* @param ht                 Height in rows.
*
* @returns  None (pu1_src and the context buffers are updated in place).
*
* NOTE(review): gi1_table_edge_idx is defined elsewhere in this file; it is
* assumed to map edge sums 0..4 to SAO edge-index categories — confirm there.
*
*******************************************************************************
*/
void ihevc_sao_edge_offset_class1_chroma_ssse3(UWORD8 *pu1_src,
                                               WORD32 src_strd,
                                               UWORD8 *pu1_src_left,
                                               UWORD8 *pu1_src_top,
                                               UWORD8 *pu1_src_top_left,
                                               UWORD8 *pu1_src_top_right,
                                               UWORD8 *pu1_src_bot_left,
                                               UWORD8 *pu1_avail,
                                               WORD8 *pi1_sao_offset_u,
                                               WORD8 *pi1_sao_offset_v,
                                               WORD32 wd,
                                               WORD32 ht)
{
    WORD32 row, col;
    UWORD8 *pu1_src_top_cpy;
    UWORD8 *pu1_src_cpy;
    WORD32 wd_rem;


    __m128i src_top_16x8b, src_bottom_16x8b;
    __m128i src_temp0_16x8b, src_temp1_16x8b;
    __m128i signup0_16x8b, signdwn1_16x8b;       // sign of (pixel - top), (pixel - bottom)
    __m128i cmp_gt0_16x8b, cmp_lt0_16x8b;        // also reused as generic scratch below
    __m128i edge0_16x8b, edge1_16x8b;
    __m128i edge_idx_8x16b, sao_offset_8x16b;
    __m128i const2_16x8b, const0_16x8b;
    __m128i chroma_offset_8x16b;

    UNUSED(pu1_src_top_right);
    UNUSED(pu1_src_bot_left);

    /* Updating left and top and top-left */
    for(row = 0; row < ht; row++)
    {
        /* last U and V samples of each row become the left context */
        pu1_src_left[2 * row] = pu1_src[row * src_strd + (wd - 2)];
        pu1_src_left[2 * row + 1] = pu1_src[row * src_strd + (wd - 1)];
    }
    pu1_src_top_left[0] = pu1_src_top[wd - 2];
    pu1_src_top_left[1] = pu1_src_top[wd - 1];



    pu1_src_top_cpy = pu1_src_top;
    edge_idx_8x16b   = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
    sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_u);
    /* const0_16x8b is temporarily used to hold the V offset table */
    const0_16x8b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_v);
    /* 0x0800 per 16-bit lane = +8 on every odd (V) byte, +0 on even (U)
     * bytes: after unpacklo_epi64 below, U offsets sit in bytes 0..7 and V
     * offsets in bytes 8..15 of sao_offset_8x16b, so this bias steers the
     * pshufb lookup of V lanes into the upper half of the table. */
    chroma_offset_8x16b = _mm_set1_epi16(0x0800);
    /* Update height and source pointers based on the availability flags */
    if(0 == pu1_avail[2])
    {
        /* top row unavailable: first row becomes its own top neighbour */
        pu1_src_top_cpy = pu1_src;
        pu1_src += src_strd;
        ht--;
    }
    if(0 == pu1_avail[3])
    {
        /* bottom row unavailable: skip filtering the last row */
        ht--;
    }
    /* merge U (low 8 bytes) and V (high 8 bytes) offset tables */
    sao_offset_8x16b = _mm_unpacklo_epi64(sao_offset_8x16b, const0_16x8b);
    const2_16x8b = _mm_set1_epi8(2);
    const0_16x8b = _mm_setzero_si128();


    {
        WORD32 ht_rem;


        /* process the width 16 bytes (8 U + 8 V) per column iteration */
        for(col = wd; col >= 16; col -= 16)
        {
            pu1_src_cpy = pu1_src;
            src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
            //row = 0
            src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
            //separating +ve and -ve values.
            cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
            cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
            //creating mask 00 for +ve and -ve values and FF for zero.
            cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
            cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
            //combining the appropriate sign change: sign(row0 - top) per byte
            signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);

            /* two rows per iteration; sign(row - rowAbove) is carried in
             * signup0_16x8b so each row pair costs only one new compare */
            for(row = ht; row >= 2; row -= 2)
            {
                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
                // row = 2
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));


                //row 0 -row1
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_temp1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_temp0_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
                //row1-row0
                edge1_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);

                //row1 -bottom
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);

                //combining sign-left and sign_right
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, signdwn1_16x8b);

                //for the next iteration signup0_16x8b = -signdwn1_16x8b
                signup0_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);
                //adding constant 2 (biases edge sum -2..2 into table index 0..4)
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
                //copying the next top
                src_top_16x8b = src_temp1_16x8b;


                //shuffle to get sao index
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
                //adding chroma offset to access U and V
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);

                //shuffle to get sao offset
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
                //convert to 16 bit then add and then saturated pack
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);  // sign-extend mask for offsets
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);

                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
                src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
                edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge1_16x8b);
                src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
                // row = 1
                _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);

                src_temp0_16x8b = src_bottom_16x8b;
                pu1_src_cpy += (src_strd << 1);
            }
            ht_rem = ht & 0x1;

            /* one leftover row when ht is odd */
            if(ht_rem)
            {
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
                //current row -next row
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
                //adding top and bottom and constant 2
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
                //copying the next top
                src_top_16x8b = src_temp0_16x8b;

                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
                //adding chroma offset to access U and V
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);

                //convert to 16 bit then add and then saturated pack
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);

                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
            }
            if(0 == pu1_avail[3])
            {
                /* bottom unavailable: the unfiltered last row is the top
                 * context for the CTB below */
                src_top_16x8b = src_bottom_16x8b;
            }
            //updating top flag
            _mm_storeu_si128((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
            pu1_src += 16;
        }

        wd_rem = wd & 0xF;
        /* 8-byte-wide tail: process four rows at a time by packing two
         * 8-byte rows into one 128-bit register (low/high halves) */
        if(wd_rem)
        {
            pu1_src_cpy = pu1_src;
            src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_top_cpy + wd - col));
            //row = 0
            src_temp0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
            //separating +ve and -ve values.
            cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
            cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
            //creating mask 00 for +ve and -ve values and FF for zero.
            cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
            cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
            //combining the appropriate sign change
            signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
            /* keep the sign-up term in the high 8 bytes; alignr below pairs
             * it with the following row's sign in one register */
            signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
            for(row = ht; row >= 4; row -= 4)
            {
                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
                src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
                // row = 2
                src_bottom_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));

                //row 0 -row1
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_temp1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_temp0_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);

                //row1-row0
                edge1_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);
                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
                //row1 -row2
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
                edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
                //packing row 0 n row 1
                src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, src_temp1_16x8b);
                //row = 3
                src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd));
                // row = 4
                src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 4 * src_strd));

                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
                signdwn1_16x8b = _mm_slli_si128(signdwn1_16x8b, 8); //align left (1-2)
                //separating +ve and -ve values.(2,3)
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_top_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_bottom_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-3)

                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signdwn1_16x8b, 8); //(2-3) ,(1-2) (subtract with down)
                edge1_16x8b = _mm_slli_si128(edge1_16x8b, 8);
                //separating +ve and -ve values.(3,4)
                cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_top_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-4)
                //combining sign-left and sign_right
                edge1_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge1_16x8b, 8); //(3-4),(2-3)

                edge1_16x8b = _mm_sub_epi8(edge1_16x8b, signup0_16x8b); //(3,2)

                //packing row 2 n row 3
                src_bottom_16x8b = _mm_unpacklo_epi64(src_bottom_16x8b, src_top_16x8b);
                //for the next iteration signup0_16x8b = -signdwn1_16x8b
                signup0_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b); //(4-3)
                //adding constant 2
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
                //shuffle to get sao index
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
                //adding chroma offset to access U and V
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);

                //shuffle to get sao offset
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
                //the next top already in  src_top_16x8b
                //convert to 16 bit then add and then saturated pack
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);

                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_bottom_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
                src_bottom_16x8b = _mm_unpackhi_epi8(src_bottom_16x8b, const0_16x8b);
                edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                src_bottom_16x8b = _mm_add_epi16(src_bottom_16x8b, edge1_16x8b);
                src_bottom_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_bottom_16x8b);

                cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
                cmp_lt0_16x8b = _mm_srli_si128(src_bottom_16x8b, 8);
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
                // row = 1
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
                //row = 2
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_bottom_16x8b);
                // row = 3
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b);

                src_temp0_16x8b = src_temp1_16x8b;
                signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
                pu1_src_cpy += (src_strd << 2);

            }
            ht_rem = ht & 0x2;
            /* two leftover rows (ht % 4 >= 2) */
            if(ht_rem)
            {

                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
                src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
                // row = 2
                src_bottom_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));

                //row 0 -row1
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_temp1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_temp0_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
                //row1-row0
                edge1_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);
                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
                //row1 -row2
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
                edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
                //adding top and down subtraction
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
                //for the next iteration signup0_16x8b = -signdwn1_16x8b
                signup0_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b); //(2-1) for next
                src_top_16x8b = src_temp1_16x8b;

                //adding constant 2
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);

                //shuffle to get sao index
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);

                //adding chroma offset to access U and V
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
                //shuffle to get sao offset
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
                //the next top already in  src_top_16x8b
                //convert to 16 bit then add and then saturated pack
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp1_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, src_temp1_16x8b);

                cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);

                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
                // row = 1
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
                src_temp0_16x8b = src_bottom_16x8b;
                pu1_src_cpy += (src_strd << 1);

            }
            ht_rem = ht & 0x1;
            /* final leftover row (ht odd) */
            if(ht_rem)
            {

                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
                src_bottom_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));

                //row 0 -row1
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
                //adding top and down subtraction
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
                //adding constant 2
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
                src_top_16x8b = src_temp0_16x8b;

                /* zero the high 8 bytes so only the valid low half indexes the tables */
                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8);
                edge0_16x8b = _mm_srli_si128(edge0_16x8b, 8);
                //shuffle to get sao index
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
                //adding chroma offset to access U and V
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
                //shuffle to get sao offset
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);

                //convert to 16 bit then add and then saturated pack
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, const0_16x8b);
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
                pu1_src_cpy += (src_strd);

            }
            if(0 == pu1_avail[3])
            {
                src_top_16x8b = src_bottom_16x8b;
            }
            _mm_storel_epi64((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
            pu1_src += 8;
        }
    }
}
2475
2476
/* 135 degree filtering */
2477
void ihevc_sao_edge_offset_class2_ssse3(UWORD8 *pu1_src,
2478
                                        WORD32 src_strd,
2479
                                        UWORD8 *pu1_src_left,
2480
                                        UWORD8 *pu1_src_top,
2481
                                        UWORD8 *pu1_src_top_left,
2482
                                        UWORD8 *pu1_src_top_right,
2483
                                        UWORD8 *pu1_src_bot_left,
2484
                                        UWORD8 *pu1_avail,
2485
                                        WORD8 *pi1_sao_offset,
2486
                                        WORD32 wd,
2487
                                        WORD32 ht)
2488
56.8k
{
2489
56.8k
    WORD32 row, col;
2490
56.8k
    UWORD8 *pu1_src_top_cpy, *pu1_src_left_cpy, *pu1_src_left_cpy2;
2491
56.8k
    UWORD8 *pu1_left_tmp, *pu1_src_left_str, *pu1_src_left_str2;
2492
56.8k
    UWORD8 *pu1_firstleft;
2493
56.8k
    UWORD8 *pu1_src_cpy, *pu1_src_org;
2494
56.8k
    UWORD8 au1_mask[MAX_CTB_SIZE], *au1_mask_cpy;
2495
56.8k
    UWORD8 au1_src_left_tmp[MAX_CTB_SIZE + 8];
2496
56.8k
    UWORD8 au1_src_left_tmp1[MAX_CTB_SIZE + 8];
2497
56.8k
    WORD32 wd_rem;
2498
56.8k
    UWORD8 u1_pos_0_0_tmp, u1_pos_wd_ht_tmp;
2499
56.8k
    WORD32 ht_tmp, ht_0;
2500
2501
56.8k
    WORD32 bit_depth;
2502
56.8k
    UWORD8 u1_avail0, u1_avail1;
2503
2504
56.8k
    __m128i src_top_16x8b, src_bottom_16x8b;
2505
56.8k
    __m128i src_temp0_16x8b, src_temp1_16x8b;
2506
56.8k
    __m128i signup0_16x8b, signdwn1_16x8b;
2507
56.8k
    __m128i cmp_gt0_16x8b, cmp_lt0_16x8b;
2508
56.8k
    __m128i edge0_16x8b, edge1_16x8b;
2509
56.8k
    __m128i au1_mask8x16b;
2510
56.8k
    __m128i edge_idx_8x16b, sao_offset_8x16b;
2511
56.8k
    __m128i const2_16x8b, const0_16x8b;
2512
56.8k
    __m128i left_store_16x8b;
2513
56.8k
    UNUSED(pu1_src_top_right);
2514
56.8k
    UNUSED(pu1_src_bot_left);
2515
2516
56.8k
    ht_0 = ht; ht_tmp = ht;
2517
56.8k
    au1_mask8x16b = _mm_set1_epi8(0xff);
2518
2519
    //setting availability mask to ff size MAX_CTB_SIZE
2520
284k
    for(col = 0; col < MAX_CTB_SIZE; col += 16)
2521
227k
        _mm_storeu_si128((__m128i *)(au1_mask + col), au1_mask8x16b);
2522
1.85M
    for(row = 0; row < ht; row++)
2523
1.79M
    {
2524
1.79M
        au1_src_left_tmp[row] = pu1_src_left[row];
2525
1.79M
    }
2526
56.8k
    bit_depth = BIT_DEPTH_LUMA;
2527
56.8k
    pu1_src_org = pu1_src;
2528
56.8k
    pu1_src_top_cpy = pu1_src_top;
2529
56.8k
    pu1_src_left_cpy2 = au1_src_left_tmp;
2530
56.8k
    pu1_src_left_cpy = au1_src_left_tmp;
2531
56.8k
    pu1_src_left_str2 = au1_src_left_tmp1;
2532
56.8k
    pu1_src_left_str = au1_src_left_tmp1;
2533
56.8k
    edge_idx_8x16b   = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
2534
56.8k
    sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset);
2535
2536
2537
    /* If top-left is available, process separately */
2538
56.8k
    if(0 != pu1_avail[4])
2539
53.5k
    {
2540
53.5k
        WORD8 edge_idx;
2541
2542
53.5k
        edge_idx = 2 + SIGN(pu1_src[0] - pu1_src_top_left[0]) +
2543
53.5k
                        SIGN(pu1_src[0] - pu1_src[1 + src_strd]);
2544
2545
53.5k
        edge_idx = gi1_table_edge_idx[edge_idx];
2546
2547
53.5k
        if(0 != edge_idx)
2548
15.0k
        {
2549
15.0k
            u1_pos_0_0_tmp = CLIP3(pu1_src[0] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
2550
15.0k
        }
2551
38.5k
        else
2552
38.5k
        {
2553
38.5k
            u1_pos_0_0_tmp = pu1_src[0];
2554
38.5k
        }
2555
53.5k
    }
2556
3.30k
    else
2557
3.30k
    {
2558
3.30k
        u1_pos_0_0_tmp = pu1_src[0];
2559
3.30k
    }
2560
2561
    /* If bottom-right is available, process separately */
2562
56.8k
    if(0 != pu1_avail[7])
2563
54.6k
    {
2564
54.6k
        WORD8 edge_idx;
2565
2566
54.6k
        edge_idx = 2 + SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd - 1 - src_strd]) +
2567
54.6k
                        SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 1 + src_strd]);
2568
2569
54.6k
        edge_idx = gi1_table_edge_idx[edge_idx];
2570
2571
54.6k
        if(0 != edge_idx)
2572
16.2k
        {
2573
16.2k
            u1_pos_wd_ht_tmp = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
2574
16.2k
        }
2575
38.3k
        else
2576
38.3k
        {
2577
38.3k
            u1_pos_wd_ht_tmp = pu1_src[wd - 1 + (ht - 1) * src_strd];
2578
38.3k
        }
2579
54.6k
    }
2580
2.21k
    else
2581
2.21k
    {
2582
2.21k
        u1_pos_wd_ht_tmp = pu1_src[wd - 1 + (ht - 1) * src_strd];
2583
2.21k
    }
2584
56.8k
    pu1_firstleft = pu1_src_top_left;
2585
2586
    /* Update height and source pointers based on the availability flags */
2587
56.8k
    if(0 == pu1_avail[2])
2588
2.64k
    {
2589
2.64k
        pu1_firstleft = pu1_src_left_cpy2;
2590
2.64k
        pu1_src_left_cpy2++;
2591
2.64k
        pu1_src_left_str2++;
2592
2.64k
        pu1_src_top_cpy = pu1_src;
2593
2.64k
        pu1_src += src_strd;
2594
2.64k
        ht--;
2595
2.64k
    }
2596
56.8k
    if(0 == pu1_avail[3])
2597
1.20k
    {
2598
1.20k
        ht--;
2599
1.20k
        ht_0--;
2600
1.20k
    }
2601
    //storing top left in an xmm register
2602
56.8k
    left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_firstleft);
2603
56.8k
    const2_16x8b = _mm_set1_epi8(2);
2604
56.8k
    const0_16x8b = _mm_setzero_si128();
2605
56.8k
    left_store_16x8b = _mm_slli_si128(left_store_16x8b, 15);
2606
    //update top-left
2607
56.8k
    *pu1_src_top_left = pu1_src_top[wd - 1];
2608
    //availability mask creation
2609
56.8k
    u1_avail0 = pu1_avail[0];
2610
56.8k
    u1_avail1 = pu1_avail[1];
2611
56.8k
    au1_mask[0] = u1_avail0;
2612
56.8k
    au1_mask[wd - 1] = u1_avail1;
2613
56.8k
    {
2614
56.8k
        WORD32 ht_rem;
2615
2616
2617
56.8k
        pu1_src_left_cpy = pu1_src_left_cpy2;
2618
56.8k
        pu1_src_left_str = pu1_src_left_str2;
2619
56.8k
        au1_mask_cpy = au1_mask;
2620
143k
        for(col = wd; col >= 16; col -= 16)
2621
87.1k
        {
2622
87.1k
            pu1_src_cpy = pu1_src;
2623
87.1k
            src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
2624
            //row = 0
2625
87.1k
            src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
2626
87.1k
            src_top_16x8b = _mm_alignr_epi8(src_top_16x8b, left_store_16x8b, 15);
2627
            //loading the mask
2628
87.1k
            au1_mask8x16b = _mm_loadu_si128((__m128i *)au1_mask_cpy);
2629
            //separating +ve and -ve values.
2630
87.1k
            cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
2631
87.1k
            cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
2632
            //creating mask 00 for +ve and -ve values and FF for zero.
2633
87.1k
            cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2634
87.1k
            cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2635
            //combining the appropriate sign change
2636
87.1k
            signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
2637
2638
2639
1.46M
            for(row = ht; row >= 2; row -= 2)
2640
1.37M
            {
2641
1.37M
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
2642
                //row = 1
2643
1.37M
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
2644
                // row = 1 right
2645
1.37M
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 1));
2646
                //to insert left in row 0
2647
1.37M
                signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 15);
2648
                //row 0 -row1
2649
                //separating +ve and -ve values.
2650
1.37M
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
2651
1.37M
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
2652
2653
                //creating mask 00 for +ve and -ve values and FF for zero.
2654
1.37M
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2655
1.37M
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2656
                //manipulation for row 1 - row 0
2657
1.37M
                signdwn1_16x8b = _mm_alignr_epi8(src_temp0_16x8b, signdwn1_16x8b, 15);
2658
                //combining the appropriate sign change
2659
1.37M
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(0-1)
2660
                //row1-row0
2661
                //separating +ve and -ve values.
2662
1.37M
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
2663
1.37M
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
2664
                //creating mask 00 for +ve and -ve values and FF for zero.
2665
1.37M
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2666
1.37M
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2667
                // row = 2 right
2668
1.37M
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd + 1));
2669
1.37M
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-0)
2670
2671
2672
                //row1 -bottom
2673
1.37M
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
2674
1.37M
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
2675
                //creating mask 00 for +ve and -ve values and FF for zero.
2676
1.37M
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2677
1.37M
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2678
                //combining the appropriate sign change
2679
1.37M
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
2680
                // row = 2
2681
1.37M
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
2682
2683
                //combining sign-left and sign_right
2684
1.37M
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
2685
2686
                //storing the row 1 left for next row.
2687
1.37M
                signup0_16x8b = _mm_slli_si128(left_store_16x8b, 14);
2688
2689
                //combining sign-left and sign_right
2690
1.37M
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, signdwn1_16x8b);
2691
                //manipulation for bottom - row 1
2692
1.37M
                signup0_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signup0_16x8b, 15);
2693
                //eliminating old left for row 0 and row 1
2694
1.37M
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
2695
                //bottom - row1
2696
1.37M
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signup0_16x8b);
2697
1.37M
                cmp_lt0_16x8b = _mm_subs_epu8(signup0_16x8b, src_bottom_16x8b);
2698
                //creating mask 00 for +ve and -ve values and FF for zero.
2699
1.37M
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2700
1.37M
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2701
                //for the next iteration bottom -row1
2702
1.37M
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
2703
                //row1  getting it right for left of next block
2704
1.37M
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp1_16x8b, 15);
2705
                //adding constant 2
2706
1.37M
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
2707
1.37M
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
2708
                //shuffle to get sao index
2709
1.37M
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
2710
1.37M
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
2711
                //using availability mask
2712
1.37M
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
2713
1.37M
                edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
2714
                //shuffle to get sao offset
2715
1.37M
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
2716
1.37M
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
2717
                //row0  getting it right for left of next block
2718
1.37M
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
2719
                //copying the next top
2720
1.37M
                src_top_16x8b = src_temp1_16x8b;
2721
                //convert to 16 bit then add and then saturated pack
2722
1.37M
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
2723
1.37M
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
2724
1.37M
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
2725
1.37M
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
2726
1.37M
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
2727
1.37M
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
2728
1.37M
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
2729
1.37M
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
2730
2731
1.37M
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
2732
1.37M
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
2733
1.37M
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
2734
1.37M
                src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
2735
1.37M
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
2736
1.37M
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
2737
1.37M
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
2738
1.37M
                src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
2739
2740
                //store left boundary
2741
1.37M
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
2742
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
2743
1.37M
                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
2744
                // row = 1
2745
1.37M
                _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);
2746
2747
1.37M
                src_temp0_16x8b = src_bottom_16x8b;
2748
1.37M
                pu1_src_cpy += (src_strd << 1);
2749
1.37M
                pu1_src_left_cpy += 2;
2750
1.37M
                pu1_src_left_str += 2;
2751
1.37M
            }
2752
87.1k
            ht_rem = ht & 0x1;
2753
2754
87.1k
            if(ht_rem)
2755
5.71k
            {
2756
5.71k
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
2757
5.71k
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 1));
2758
                //current row -next row
2759
                //separating +ve and -ve values.
2760
5.71k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
2761
5.71k
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
2762
                //creating mask 00 for +ve and -ve values and FF for zero.
2763
5.71k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2764
5.71k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2765
                //combining the appropriate sign change
2766
5.71k
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
2767
                //adding top and bottom and constant 2
2768
5.71k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
2769
5.71k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
2770
                //eliminating old left for row 0 and row 1
2771
5.71k
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 1);
2772
2773
5.71k
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
2774
                //using availability mask
2775
5.71k
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
2776
2777
5.71k
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
2778
2779
                //row0  getting it right for left of next block
2780
5.71k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
2781
                //copying the next top
2782
5.71k
                src_top_16x8b = src_temp0_16x8b;
2783
                //convert to 16 bit then add and then saturated pack
2784
5.71k
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
2785
5.71k
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
2786
5.71k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
2787
5.71k
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
2788
5.71k
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
2789
5.71k
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
2790
5.71k
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
2791
5.71k
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
2792
                //store left boundary
2793
5.71k
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
2794
2795
5.71k
                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
2796
5.71k
                pu1_src_cpy += (src_strd);
2797
5.71k
                pu1_src_left_cpy += 1;
2798
5.71k
                pu1_src_left_str += 1;
2799
5.71k
            }
2800
87.1k
            if(0 == pu1_avail[3])
2801
1.79k
            {
2802
1.79k
                src_top_16x8b = src_bottom_16x8b;
2803
1.79k
                pu1_src_left_str[0] = pu1_src_cpy[15];
2804
1.79k
            }
2805
87.1k
            if(0 == pu1_avail[2])
2806
3.92k
            {
2807
3.92k
                pu1_src_left_str[-ht_0] = pu1_src[15 - src_strd];
2808
3.92k
            }
2809
2810
            //for the top left of next part of the block
2811
87.1k
            left_store_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
2812
            //updating top flag
2813
87.1k
            _mm_storeu_si128((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
2814
87.1k
            pu1_src += 16;
2815
87.1k
            au1_mask_cpy += 16;
2816
2817
2818
87.1k
            pu1_left_tmp = pu1_src_left_cpy2;
2819
87.1k
            pu1_src_left_cpy2 = pu1_src_left_str2;
2820
87.1k
            pu1_src_left_str2 = pu1_left_tmp;
2821
2822
87.1k
            pu1_src_left_cpy = pu1_src_left_cpy2;
2823
87.1k
            pu1_src_left_str = pu1_src_left_str2;
2824
87.1k
        }
2825
2826
56.8k
        wd_rem = wd & 0xF;
2827
56.8k
        if(wd_rem)
2828
55.8k
        {
2829
55.8k
            pu1_src_left_cpy = pu1_src_left_cpy2;
2830
55.8k
            pu1_src_left_str = pu1_src_left_str2;
2831
55.8k
            pu1_src_cpy = pu1_src;
2832
55.8k
            src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_top_cpy + wd - col));
2833
            //row = 0
2834
55.8k
            src_temp0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
2835
55.8k
            src_top_16x8b = _mm_alignr_epi8(src_top_16x8b, left_store_16x8b, 15);
2836
55.8k
            au1_mask8x16b = _mm_loadl_epi64((__m128i *)au1_mask_cpy); //????
2837
            //separating +ve and -ve values.
2838
55.8k
            cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
2839
55.8k
            cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
2840
            //creating mask 00 for +ve and -ve values and FF for zero.
2841
55.8k
            cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2842
55.8k
            cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2843
            //preparing au1_mask
2844
55.8k
            au1_mask8x16b = _mm_unpacklo_epi64(au1_mask8x16b, au1_mask8x16b);
2845
            //combining the appropriate sign change
2846
55.8k
            signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
2847
55.8k
            signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
2848
2849
493k
            for(row = ht; row >= 4; row -= 4)
2850
437k
            {
2851
437k
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
2852
                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
2853
437k
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
2854
                // row = 2
2855
437k
                src_bottom_16x8b =  _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
2856
                //right row1
2857
437k
                signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 1);
2858
                //row 0 -row1
2859
                //separating +ve and -ve values.
2860
437k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
2861
437k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
2862
                //manipulation for row 1 -row 0
2863
437k
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 15);
2864
                //creating mask 00 for +ve and -ve values and FF for zero.
2865
437k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2866
437k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2867
                //row 0 left
2868
437k
                signdwn1_16x8b = _mm_alignr_epi8(src_temp0_16x8b, signdwn1_16x8b, 15);
2869
                //combining the appropriate sign change
2870
437k
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
2871
                //row 1 -row0
2872
                //separating +ve and -ve values.
2873
437k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
2874
437k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
2875
2876
                //creating mask 00 for +ve and -ve values and FF for zero.
2877
437k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2878
437k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2879
                //row1-row0
2880
437k
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
2881
2882
437k
                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
2883
2884
437k
                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
2885
                //right row2
2886
437k
                signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 1);
2887
                //packing row 0 n row 1
2888
437k
                src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, src_temp1_16x8b);
2889
                //row1 -row2
2890
437k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
2891
437k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
2892
                //creating mask 00 for +ve and -ve values and FF for zero.
2893
437k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2894
437k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2895
                //combining the appropriate sign change
2896
437k
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
2897
437k
                edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
2898
                //manipulation for row 2 -row 1
2899
437k
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 14);
2900
                //row 1 left
2901
437k
                signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 15);
2902
                //row = 3
2903
437k
                src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 3 * src_strd));
2904
2905
                // row = 4
2906
437k
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 4 * src_strd));
2907
2908
437k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
2909
2910
                //separating +ve and -ve values.(2,1)
2911
437k
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
2912
437k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
2913
                //manipulation for row 3 -row 2
2914
437k
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 13);
2915
                //creating mask 00 for +ve and -ve values and FF for zero.
2916
437k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2917
437k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2918
                //row 2 left
2919
437k
                signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 15);
2920
                //combining the appropriate sign change
2921
437k
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1)
2922
2923
                //separating +ve and -ve values.(3,2)
2924
437k
                cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
2925
437k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
2926
437k
                signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8); //aligned left (2-1)
2927
                //creating mask 00 for +ve and -ve values and FF for zero.
2928
437k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2929
437k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2930
                //right row3
2931
437k
                signdwn1_16x8b = _mm_srli_si128(src_top_16x8b, 1);
2932
                //combining the appropriate sign change
2933
437k
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-2)
2934
2935
437k
                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(3-2) ,(2-1)
2936
2937
                //separating +ve and -ve values.(2,3)
2938
437k
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
2939
437k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
2940
                //right row 4
2941
437k
                signdwn1_16x8b =  _mm_srli_si128(src_temp1_16x8b, 1);
2942
                //creating mask 00 for +ve and -ve values and FF for zero.
2943
437k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2944
437k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2945
                //combining the appropriate sign change
2946
437k
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-3)
2947
2948
                //separating +ve and -ve values.(3,bottom)
2949
437k
                cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
2950
437k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
2951
2952
                //creating mask 00 for +ve and -ve values and FF for zero.
2953
437k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2954
437k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2955
437k
                edge1_16x8b = _mm_slli_si128(edge1_16x8b, 8); //aligned left (2-3)
2956
                //combining the appropriate sign change
2957
437k
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-bottom)
2958
437k
                edge1_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge1_16x8b, 8); //(3-bottom),(2-3)
2959
2960
                //manipulation for bottom -row 3
2961
437k
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 12);
2962
                //eliminating old left for row 0,1,2,3
2963
437k
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 4);
2964
                //packing row 2 n row 3
2965
437k
                src_bottom_16x8b = _mm_unpacklo_epi64(src_bottom_16x8b, src_top_16x8b);
2966
                //row 3 left
2967
437k
                signdwn1_16x8b = _mm_alignr_epi8(src_top_16x8b, signdwn1_16x8b, 15);
2968
                //loading row 3 right into left
2969
437k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_bottom_16x8b, 15);
2970
                //adding bottom and top values of row 2 and row 3
2971
437k
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, signup0_16x8b); //(3,2)
2972
                //separating +ve and -ve values.(bottom,3)
2973
437k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
2974
437k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
2975
                //to store right of row 2
2976
437k
                signdwn1_16x8b = _mm_slli_si128(src_bottom_16x8b, 8);
2977
                //creating mask 00 for +ve and -ve values and FF for zero.
2978
437k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2979
437k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2980
437k
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(bottom -3) for next iteration
2981
2982
                //storing right of row 2into left
2983
437k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
2984
                //to store right of row 0
2985
437k
                signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
2986
                //storing right of row 1 into left
2987
437k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
2988
2989
                //adding constant 2
2990
437k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
2991
437k
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
2992
                //shuffle to get sao index
2993
437k
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
2994
437k
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
2995
                //using availability mask
2996
437k
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
2997
437k
                edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
2998
                //shuffle to get sao offset
2999
437k
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
3000
437k
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
3001
3002
                //storing right of row 0 into left
3003
437k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
3004
                //convert to 16 bit then add and then saturated pack
3005
437k
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
3006
437k
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
3007
437k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
3008
437k
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
3009
437k
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
3010
437k
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
3011
437k
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
3012
437k
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
3013
3014
437k
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
3015
437k
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_bottom_16x8b, const0_16x8b);
3016
437k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
3017
437k
                src_bottom_16x8b = _mm_unpackhi_epi8(src_bottom_16x8b, const0_16x8b);
3018
437k
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
3019
437k
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
3020
437k
                src_bottom_16x8b = _mm_add_epi16(src_bottom_16x8b, cmp_lt0_16x8b);
3021
437k
                src_bottom_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_bottom_16x8b);
3022
3023
437k
                cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
3024
437k
                cmp_lt0_16x8b = _mm_srli_si128(src_bottom_16x8b, 8);
3025
3026
437k
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
3027
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
3028
437k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
3029
                // row = 1
3030
437k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
3031
                //row = 2
3032
437k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_bottom_16x8b);
3033
                // row = 3
3034
437k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b);
3035
3036
437k
                src_temp0_16x8b = src_temp1_16x8b;
3037
437k
                signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
3038
437k
                pu1_src_cpy += (src_strd << 2);
3039
437k
                pu1_src_left_cpy += 4;
3040
437k
                pu1_src_left_str += 4;
3041
437k
            }
3042
55.8k
            ht_rem = ht & 0x2;
3043
55.8k
            if(ht_rem)
3044
3.79k
            {
3045
3.79k
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
3046
                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
3047
3.79k
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
3048
                // row = 2
3049
3.79k
                src_bottom_16x8b =  _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
3050
3051
                //row 0 -row 1
3052
3.79k
                signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 1);
3053
                //separating +ve and and -ve values.
3054
3.79k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
3055
3.79k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
3056
                //manipulation for row 1 -row 0
3057
3.79k
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 15);
3058
                //creating mask 00 for +ve and -ve values and FF for zero.
3059
3.79k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3060
3.79k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3061
                //manipulation for row 1 - row 0
3062
3.79k
                signdwn1_16x8b = _mm_alignr_epi8(src_temp0_16x8b, signdwn1_16x8b, 15);
3063
                //combining the appropriate sign change
3064
3.79k
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
3065
3066
                //row1-row0
3067
                //separating +ve and and -ve values.
3068
3.79k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
3069
3.79k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
3070
3071
                //creating mask 00 for +ve and -ve values and FF for zero.
3072
3.79k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3073
3.79k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3074
                //combining the appropriate sign chang
3075
3.79k
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
3076
                //row 1 -bottom
3077
3.79k
                signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 1);
3078
3079
3.79k
                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
3080
3.79k
                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
3081
                //row1 -bottom
3082
3.79k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
3083
3.79k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
3084
3085
                //creating mask 00 for +ve and -ve values and FF for zero.
3086
3.79k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3087
3.79k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3088
                //combining the appropriate sign change
3089
3.79k
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
3090
3.79k
                edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
3091
                //manipulation for bottom -row1
3092
3.79k
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 14);
3093
                //manipulation for bottom- row 1
3094
3.79k
                signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 15);
3095
                //adding top and down substraction
3096
3.79k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
3097
                //bottom - row 1
3098
3.79k
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
3099
3.79k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
3100
3101
                //eliminating old left for row 0,1
3102
3.79k
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
3103
3.79k
                signdwn1_16x8b = _mm_slli_si128(src_temp1_16x8b, 8);
3104
                //creating mask 00 for +ve and -ve values and FF for zero.
3105
3.79k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3106
3.79k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3107
                //for the next iteration signup0_16x8b
3108
3.79k
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1) for next
3109
3110
                //storing right of row 1 into left
3111
3.79k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
3112
                //for storing right of row 1
3113
3.79k
                signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
3114
3115
3.79k
                src_top_16x8b = src_temp1_16x8b;
3116
                //storing right of row 0 into left
3117
3.79k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
3118
3119
                //adding constant 2
3120
3.79k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
3121
3122
                //shuffle to get sao index
3123
3.79k
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
3124
                //using availability mask
3125
3.79k
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
3126
                //shuffle to get sao offset
3127
3.79k
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
3128
3129
                //the next top already in  src_top_16x8b
3130
                //cnvert to 16 bit then add and then saturated pack
3131
3.79k
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
3132
3.79k
                src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
3133
3.79k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
3134
3.79k
                src_temp1_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
3135
3.79k
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
3136
3.79k
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
3137
3.79k
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
3138
3.79k
                src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, src_temp1_16x8b);
3139
3140
3.79k
                cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
3141
3142
3.79k
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
3143
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
3144
3.79k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
3145
                // row = 1
3146
3.79k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
3147
3.79k
                src_temp0_16x8b = src_bottom_16x8b;
3148
3.79k
                pu1_src_cpy += (src_strd << 1);
3149
3.79k
                pu1_src_left_cpy += 2;
3150
3.79k
                pu1_src_left_str += 2;
3151
3.79k
            }
3152
55.8k
            ht_rem = ht & 0x1;
3153
55.8k
            if(ht_rem)
3154
3.79k
            {
3155
3.79k
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
3156
                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
3157
3.79k
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
3158
                //left store manipulation 1
3159
3.79k
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 1);
3160
                //row 0 -row1
3161
3.79k
                signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 1);
3162
                //separating +ve and and -ve values.
3163
3.79k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
3164
3.79k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
3165
                //creating mask 00 for +ve and -ve values and FF for zero.
3166
3.79k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3167
3.79k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3168
                //combining the appropriate sign change
3169
3.79k
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
3170
                //adding top and down substraction
3171
3.79k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
3172
                //for row 0 right to put into left store
3173
3.79k
                signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
3174
                //adding constant 2
3175
3.79k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
3176
3.79k
                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8);
3177
3.79k
                edge0_16x8b = _mm_srli_si128(edge0_16x8b, 8);
3178
                //filling the left boundary value
3179
3.79k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
3180
3181
                //shuffle to get sao index
3182
3.79k
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
3183
                //using availability mask
3184
3.79k
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
3185
                //shuffle to get sao offset
3186
3.79k
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
3187
3.79k
                src_top_16x8b = src_temp0_16x8b;
3188
                //cnvert to 16 bit then add and then saturated pack
3189
3.79k
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
3190
3.79k
                src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
3191
3.79k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
3192
3.79k
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
3193
3.79k
                src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, const0_16x8b);
3194
3195
3.79k
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
3196
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
3197
3.79k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
3198
3.79k
                pu1_src_cpy += (src_strd);
3199
3.79k
                pu1_src_left_cpy += 1;
3200
3.79k
                pu1_src_left_str += 1;
3201
3.79k
            }
3202
55.8k
            if(0 == pu1_avail[3])
3203
1.19k
            {
3204
1.19k
                src_top_16x8b = src_bottom_16x8b;
3205
1.19k
                pu1_src_left_str[0] = pu1_src_cpy[7];
3206
1.19k
            }
3207
3208
55.8k
            if(0 == pu1_avail[2])
3209
2.60k
            {
3210
2.60k
                pu1_src_left_str[-ht_0] = pu1_src[7 - src_strd];
3211
2.60k
            }
3212
3213
55.8k
            _mm_storel_epi64((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
3214
55.8k
            pu1_src += 8;
3215
55.8k
            au1_mask_cpy += 16;
3216
3217
55.8k
            pu1_left_tmp = pu1_src_left_cpy2;
3218
55.8k
            pu1_src_left_cpy2 = pu1_src_left_str2;
3219
55.8k
            pu1_src_left_str2 = pu1_left_tmp;
3220
3221
55.8k
            pu1_src_left_cpy = pu1_src_left_cpy2;
3222
55.8k
            pu1_src_left_str = pu1_src_left_str2;
3223
55.8k
        }
3224
56.8k
        pu1_src_org[0] = u1_pos_0_0_tmp;
3225
56.8k
        pu1_src_org[wd - 1 + (ht_tmp - 1) * src_strd] = u1_pos_wd_ht_tmp;
3226
56.8k
        pu1_src_left_cpy = (0 == pu1_avail[2]) ? (pu1_src_left_cpy - 1) : pu1_src_left_cpy;
3227
1.85M
        for(row = 0; row < ht_tmp; row++)
3228
1.79M
        {
3229
1.79M
            pu1_src_left[row] = pu1_src_left_cpy[row];
3230
1.79M
        }
3231
56.8k
    }
3232
3233
56.8k
}
3234
3235
/* 135 degree filtering */
3236
void ihevc_sao_edge_offset_class2_chroma_ssse3(UWORD8 *pu1_src,
3237
                                               WORD32 src_strd,
3238
                                               UWORD8 *pu1_src_left,
3239
                                               UWORD8 *pu1_src_top,
3240
                                               UWORD8 *pu1_src_top_left,
3241
                                               UWORD8 *pu1_src_top_right,
3242
                                               UWORD8 *pu1_src_bot_left,
3243
                                               UWORD8 *pu1_avail,
3244
                                               WORD8 *pi1_sao_offset_u,
3245
                                               WORD8 *pi1_sao_offset_v,
3246
                                               WORD32 wd,
3247
                                               WORD32 ht)
3248
63.1k
{
3249
63.1k
    WORD32 row, col;
3250
63.1k
    UWORD8 *pu1_src_top_cpy, *pu1_src_left_cpy, *pu1_src_left_cpy2;
3251
63.1k
    UWORD8 *pu1_left_tmp, *pu1_src_left_str, *pu1_src_left_str2;
3252
63.1k
    UWORD8 *pu1_firstleft;
3253
63.1k
    UWORD8 *pu1_src_cpy, *pu1_src_org;
3254
63.1k
    UWORD8 au1_mask[MAX_CTB_SIZE], *au1_mask_cpy;
3255
63.1k
    UWORD8 au1_src_left_tmp[2 * (MAX_CTB_SIZE + 8)];
3256
63.1k
    UWORD8 au1_src_left_tmp1[2 * (MAX_CTB_SIZE + 8)];
3257
63.1k
    WORD32 wd_rem;
3258
63.1k
    UWORD8 u1_pos_0_0_tmp_u, u1_pos_0_0_tmp_v, u1_pos_wd_ht_tmp_u, u1_pos_wd_ht_tmp_v;
3259
63.1k
    WORD32 ht_tmp;
3260
63.1k
    WORD32 ht_0;
3261
3262
63.1k
    WORD32 bit_depth;
3263
63.1k
    UWORD8 u1_avail0, u1_avail1;
3264
3265
63.1k
    __m128i src_temp0_16x8b, src_temp1_16x8b;
3266
63.1k
    __m128i signup0_16x8b, signdwn1_16x8b;
3267
63.1k
    __m128i cmp_gt0_16x8b, cmp_lt0_16x8b;
3268
63.1k
    __m128i edge0_16x8b, edge1_16x8b;
3269
63.1k
    __m128i src_top_16x8b, src_bottom_16x8b;
3270
63.1k
    __m128i au1_mask8x16b;
3271
63.1k
    __m128i edge_idx_8x16b, sao_offset_8x16b;
3272
63.1k
    __m128i const2_16x8b, const0_16x8b;
3273
63.1k
    __m128i left_store_16x8b;
3274
63.1k
    __m128i chroma_offset_8x16b;
3275
3276
63.1k
    UNUSED(pu1_src_top_right);
3277
63.1k
    UNUSED(pu1_src_bot_left);
3278
3279
63.1k
    ht_0 = ht; ht_tmp = ht;
3280
63.1k
    au1_mask8x16b = _mm_set1_epi8(0xff);
3281
    /* Updating left and top-left  */
3282
2.06M
    for(row = 0; row < 2 * ht; row++)
3283
2.00M
    {
3284
2.00M
        au1_src_left_tmp[row] = pu1_src_left[row];
3285
2.00M
    }
3286
    //setting availability mask to ff size MAX_CTB_SIZE
3287
315k
    for(col = 0; col < MAX_CTB_SIZE; col += 16)
3288
252k
        _mm_storeu_si128((__m128i *)(au1_mask + col), au1_mask8x16b);
3289
63.1k
    bit_depth = BIT_DEPTH_LUMA;
3290
63.1k
    pu1_src_org = pu1_src;
3291
63.1k
    pu1_src_top_cpy = pu1_src_top;
3292
63.1k
    pu1_src_left_cpy2 = au1_src_left_tmp;
3293
63.1k
    pu1_src_left_cpy = au1_src_left_tmp;
3294
63.1k
    pu1_src_left_str2 = au1_src_left_tmp1;
3295
63.1k
    pu1_src_left_str = au1_src_left_tmp1;
3296
63.1k
    edge_idx_8x16b   = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
3297
63.1k
    sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_u);
3298
63.1k
    const0_16x8b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_v);
3299
63.1k
    chroma_offset_8x16b = _mm_set1_epi16(0x0800);
3300
3301
    /* If top-left is available, process separately */
3302
63.1k
    if(0 != pu1_avail[4])
3303
59.5k
    {
3304
59.5k
        WORD32 edge_idx;
3305
3306
        /* U */
3307
59.5k
        edge_idx = 2 + SIGN(pu1_src[0] - pu1_src_top_left[0]) +
3308
59.5k
                        SIGN(pu1_src[0] - pu1_src[2 + src_strd]);
3309
3310
59.5k
        edge_idx = gi1_table_edge_idx[edge_idx];
3311
3312
59.5k
        if(0 != edge_idx)
3313
14.5k
        {
3314
14.5k
            u1_pos_0_0_tmp_u = CLIP3(pu1_src[0] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1);
3315
14.5k
        }
3316
45.0k
        else
3317
45.0k
        {
3318
45.0k
            u1_pos_0_0_tmp_u = pu1_src[0];
3319
45.0k
        }
3320
3321
        /* V */
3322
59.5k
        edge_idx = 2 + SIGN(pu1_src[1] - pu1_src_top_left[1]) +
3323
59.5k
                        SIGN(pu1_src[1] - pu1_src[1 + 2 + src_strd]);
3324
3325
59.5k
        edge_idx = gi1_table_edge_idx[edge_idx];
3326
3327
59.5k
        if(0 != edge_idx)
3328
13.1k
        {
3329
13.1k
            u1_pos_0_0_tmp_v = CLIP3(pu1_src[1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1);
3330
13.1k
        }
3331
46.3k
        else
3332
46.3k
        {
3333
46.3k
            u1_pos_0_0_tmp_v = pu1_src[1];
3334
46.3k
        }
3335
59.5k
    }
3336
3.64k
    else
3337
3.64k
    {
3338
3.64k
        u1_pos_0_0_tmp_u = pu1_src[0];
3339
3.64k
        u1_pos_0_0_tmp_v = pu1_src[1];
3340
3.64k
    }
3341
3342
    /* If bottom-right is available, process separately */
3343
63.1k
    if(0 != pu1_avail[7])
3344
60.6k
    {
3345
60.6k
        WORD32 edge_idx;
3346
3347
        /* U */
3348
60.6k
        edge_idx = 2 + SIGN(pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 2 + (ht - 1) * src_strd - 2 - src_strd]) +
3349
60.6k
                        SIGN(pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 2 + (ht - 1) * src_strd + 2 + src_strd]);
3350
3351
60.6k
        edge_idx = gi1_table_edge_idx[edge_idx];
3352
3353
60.6k
        if(0 != edge_idx)
3354
12.9k
        {
3355
12.9k
            u1_pos_wd_ht_tmp_u = CLIP3(pu1_src[wd - 2 + (ht - 1) * src_strd] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1);
3356
12.9k
        }
3357
47.7k
        else
3358
47.7k
        {
3359
47.7k
            u1_pos_wd_ht_tmp_u = pu1_src[wd - 2 + (ht - 1) * src_strd];
3360
47.7k
        }
3361
3362
        /* V */
3363
60.6k
        edge_idx = 2 + SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd - 2 - src_strd]) +
3364
60.6k
                        SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 2 + src_strd]);
3365
3366
60.6k
        edge_idx = gi1_table_edge_idx[edge_idx];
3367
3368
60.6k
        if(0 != edge_idx)
3369
15.6k
        {
3370
15.6k
            u1_pos_wd_ht_tmp_v = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1);
3371
15.6k
        }
3372
44.9k
        else
3373
44.9k
        {
3374
44.9k
            u1_pos_wd_ht_tmp_v = pu1_src[wd - 1 + (ht - 1) * src_strd];
3375
44.9k
        }
3376
60.6k
    }
3377
2.50k
    else
3378
2.50k
    {
3379
2.50k
        u1_pos_wd_ht_tmp_u = pu1_src[wd - 2 + (ht - 1) * src_strd];
3380
2.50k
        u1_pos_wd_ht_tmp_v = pu1_src[wd - 1 + (ht - 1) * src_strd];
3381
2.50k
    }
3382
63.1k
    pu1_firstleft = pu1_src_top_left;
3383
3384
    /* Update height and source pointers based on the availability flags */
3385
63.1k
    if(0 == pu1_avail[2])
3386
1.69k
    {
3387
1.69k
        pu1_firstleft = pu1_src_left_cpy2;
3388
1.69k
        pu1_src_left_cpy2 += 2;
3389
1.69k
        pu1_src_left_str2 += 2;
3390
1.69k
        pu1_src_top_cpy = pu1_src;
3391
1.69k
        pu1_src += src_strd;
3392
1.69k
        ht--;
3393
1.69k
    }
3394
63.1k
    if(0 == pu1_avail[3])
3395
1.52k
    {
3396
1.52k
        ht--;
3397
1.52k
        ht_0--;
3398
1.52k
    }
3399
    //storing top left in a mmx register
3400
63.1k
    left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_firstleft);
3401
63.1k
    sao_offset_8x16b = _mm_unpacklo_epi64(sao_offset_8x16b, const0_16x8b);
3402
63.1k
    const2_16x8b = _mm_set1_epi8(2);
3403
63.1k
    const0_16x8b = _mm_setzero_si128();
3404
63.1k
    left_store_16x8b = _mm_slli_si128(left_store_16x8b, 14);
3405
3406
    //availability mask creation
3407
63.1k
    u1_avail0 = pu1_avail[0];
3408
63.1k
    u1_avail1 = pu1_avail[1];
3409
63.1k
    au1_mask[0] = u1_avail0;
3410
63.1k
    au1_mask[1] = u1_avail0;
3411
63.1k
    au1_mask[wd - 1] = u1_avail1;
3412
63.1k
    au1_mask[wd - 2] = u1_avail1;
3413
3414
    /* top-left arrays */
3415
63.1k
    pu1_src_top_left[0] = pu1_src_top[wd - 2];
3416
63.1k
    pu1_src_top_left[1] = pu1_src_top[wd - 1];
3417
63.1k
    {
3418
63.1k
        WORD32 ht_rem;
3419
63.1k
        au1_mask_cpy = au1_mask;
3420
3421
63.1k
        pu1_src_left_cpy = pu1_src_left_cpy2;
3422
63.1k
        pu1_src_left_str = pu1_src_left_str2;
3423
191k
        for(col = wd; col >= 16; col -= 16)
3424
128k
        {
3425
128k
            pu1_src_cpy = pu1_src;
3426
128k
            src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
3427
            //row = 0
3428
128k
            src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
3429
128k
            src_top_16x8b = _mm_alignr_epi8(src_top_16x8b, left_store_16x8b, 14);
3430
            //loading the mask
3431
128k
            au1_mask8x16b = _mm_loadu_si128((__m128i *)au1_mask_cpy);
3432
            //separating +ve and and -ve values.
3433
128k
            cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
3434
128k
            cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
3435
            //creating mask 00 for +ve and -ve values and FF for zero.
3436
128k
            cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3437
128k
            cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3438
            //combining the appropriate sign change
3439
128k
            signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
3440
3441
3442
1.13M
            for(row = ht; row >= 2; row -= 2)
3443
1.00M
            {
3444
1.00M
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
3445
                //row = 1
3446
1.00M
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
3447
                // row = 1 right
3448
1.00M
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 2));
3449
                //to insert left in row 0
3450
1.00M
                signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 14);
3451
                //row 0 -row1
3452
                //separating +ve and and -ve values.
3453
1.00M
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
3454
1.00M
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
3455
3456
                //creating mask 00 for +ve and -ve values and FF for zero.
3457
1.00M
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3458
1.00M
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3459
                //manipulation for row 1 - row 0
3460
1.00M
                signdwn1_16x8b = _mm_alignr_epi8(src_temp0_16x8b, signdwn1_16x8b, 14);
3461
                //combining the appropriate sign change
3462
1.00M
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(0-1)
3463
                //row1-row0
3464
                //separating +ve and and -ve values.
3465
1.00M
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
3466
1.00M
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
3467
                //creating mask 00 for +ve and -ve values and FF for zero.
3468
1.00M
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3469
1.00M
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3470
                 // row = 2 right
3471
1.00M
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd + 2));
3472
1.00M
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-0)
3473
3474
3475
                //row1 -bottom
3476
1.00M
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
3477
1.00M
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
3478
                //creating mask 00 for +ve and -ve values and FF for zero.
3479
1.00M
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3480
1.00M
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3481
                //combining the appropriate sign change
3482
1.00M
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
3483
                // row = 2
3484
1.00M
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
3485
3486
                //combining sign-left and sign_right
3487
1.00M
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
3488
3489
                //storing the row 1 left for next row.
3490
1.00M
                signup0_16x8b = _mm_slli_si128(left_store_16x8b, 12);
3491
3492
                //combining sign-left and sign_right
3493
1.00M
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, signdwn1_16x8b);
3494
                //manipulation for bottom - row 1
3495
1.00M
                signup0_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signup0_16x8b, 14);
3496
                //eliminating old left for row 0 and row 1
3497
1.00M
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 4);
3498
                //bottom - row1
3499
1.00M
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signup0_16x8b);
3500
1.00M
                cmp_lt0_16x8b = _mm_subs_epu8(signup0_16x8b, src_bottom_16x8b);
3501
                //creating mask 00 for +ve and -ve values and FF for zero.
3502
1.00M
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3503
1.00M
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3504
                //for the next iteration bottom -row1
3505
1.00M
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
3506
                //row1  getting it right for left of next iteration
3507
1.00M
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp1_16x8b, 14);
3508
                //copying the next top
3509
1.00M
                src_top_16x8b = src_temp1_16x8b;
3510
                //row0  getting its right for left of next iteration.
3511
1.00M
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
3512
3513
3514
                //adding constant 2
3515
1.00M
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
3516
1.00M
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
3517
                //shuffle to get sao index
3518
1.00M
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
3519
1.00M
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
3520
                //using availability mask
3521
1.00M
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
3522
1.00M
                edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
3523
                //adding chroma offset to access U and V
3524
1.00M
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
3525
1.00M
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);
3526
3527
3528
                //shuffle to get sao offset
3529
1.00M
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
3530
1.00M
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
3531
                //cnvert to 16 bit then add and then saturated pack
3532
1.00M
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
3533
1.00M
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
3534
1.00M
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
3535
1.00M
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
3536
1.00M
                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
3537
1.00M
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
3538
1.00M
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
3539
1.00M
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
3540
3541
1.00M
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
3542
1.00M
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
3543
1.00M
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
3544
1.00M
                src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
3545
1.00M
                edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
3546
1.00M
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
3547
1.00M
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge1_16x8b);
3548
1.00M
                src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
3549
3550
                //store left boundary
3551
1.00M
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
3552
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
3553
1.00M
                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
3554
                // row = 1
3555
1.00M
                _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);
3556
3557
1.00M
                src_temp0_16x8b = src_bottom_16x8b;
3558
1.00M
                pu1_src_cpy += (src_strd << 1);
3559
1.00M
                pu1_src_left_cpy += 4;
3560
1.00M
                pu1_src_left_str += 4;
3561
1.00M
            }
3562
128k
            ht_rem = ht & 0x1;
3563
3564
128k
            if(ht_rem)
3565
6.47k
            {
3566
6.47k
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
3567
6.47k
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 2));
3568
                //current row -next row
3569
                //separating +ve and and -ve values.
3570
6.47k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
3571
6.47k
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
3572
                //creating mask 00 for +ve and -ve values and FF for zero.
3573
6.47k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3574
6.47k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3575
                //combining the appropriate sign change
3576
6.47k
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
3577
                //adding top and botton and constant 2
3578
6.47k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
3579
6.47k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
3580
3581
                //eliminating old left for row 0 and row 1
3582
6.47k
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
3583
                //copying the next top
3584
6.47k
                src_top_16x8b = src_temp0_16x8b;
3585
                //row0  getting it right for left of next block
3586
6.47k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
3587
3588
6.47k
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
3589
                //using availability mask
3590
6.47k
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
3591
                //adding chroma offset to access U and V
3592
6.47k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
3593
3594
6.47k
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
3595
3596
                //cnvert to 16 bit then add and then saturated pack
3597
6.47k
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
3598
6.47k
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
3599
6.47k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
3600
6.47k
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
3601
6.47k
                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
3602
6.47k
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
3603
6.47k
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
3604
6.47k
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
3605
3606
6.47k
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
3607
3608
6.47k
                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
3609
6.47k
                pu1_src_cpy += (src_strd);
3610
6.47k
                pu1_src_left_cpy += 2;
3611
6.47k
                pu1_src_left_str += 2;
3612
6.47k
            }
3613
128k
            if(0 == pu1_avail[3])
3614
3.04k
            {
3615
3.04k
                src_top_16x8b = src_bottom_16x8b;
3616
3.04k
                pu1_src_left_str[1] = pu1_src_cpy[15];
3617
3.04k
                pu1_src_left_str[0] = pu1_src_cpy[14];
3618
3.04k
            }
3619
128k
            if(0 == pu1_avail[2])
3620
3.43k
            {
3621
3.43k
                pu1_src_left_str[-2 * ht_0] = pu1_src[14 - src_strd];
3622
3.43k
                pu1_src_left_str[-2 * ht_0 + 1] = pu1_src[15 - src_strd];
3623
3.43k
            }
3624
3625
            //for the top left of next part of the block
3626
128k
            left_store_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
3627
            //updating top flag
3628
128k
            _mm_storeu_si128((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
3629
128k
            pu1_src += 16;
3630
128k
            au1_mask_cpy += 16;
3631
3632
128k
            pu1_left_tmp = pu1_src_left_cpy2;
3633
128k
            pu1_src_left_cpy2 = pu1_src_left_str2;
3634
128k
            pu1_src_left_str2 = pu1_left_tmp;
3635
3636
128k
            pu1_src_left_cpy = pu1_src_left_cpy2;
3637
128k
            pu1_src_left_str = pu1_src_left_str2;
3638
128k
        }
3639
63.1k
        wd_rem = wd & 0xF;
3640
63.1k
        if(wd_rem)
3641
6
        {
3642
6
            pu1_src_left_cpy = pu1_src_left_cpy2;
3643
6
            pu1_src_left_str = pu1_src_left_str2;
3644
6
            pu1_src_cpy = pu1_src;
3645
6
            src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_top_cpy + wd - col));
3646
            //row = 0
3647
6
            src_temp0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
3648
6
            src_top_16x8b = _mm_alignr_epi8(src_top_16x8b, left_store_16x8b, 14);
3649
6
            au1_mask8x16b = _mm_loadl_epi64((__m128i *)au1_mask_cpy); //????
3650
            //separating +ve and and -ve values.
3651
6
            cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
3652
6
            cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
3653
            //creating mask 00 for +ve and -ve values and FF for zero.
3654
6
            cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3655
6
            cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3656
            //preparing au1_mask
3657
6
            au1_mask8x16b = _mm_unpacklo_epi64(au1_mask8x16b, au1_mask8x16b);
3658
            //combining the appropriate sign change
3659
6
            signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
3660
6
            signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
3661
3662
10
            for(row = ht; row >= 4; row -= 4)
3663
4
            {
3664
4
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
3665
                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
3666
4
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
3667
                // row = 2
3668
4
                src_bottom_16x8b =  _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
3669
                //right row1
3670
4
                signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);
3671
                //row 0 -row1
3672
                //separating +ve and and -ve values.
3673
4
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
3674
4
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
3675
                //manipulation for row 1 -row 0
3676
4
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 14);
3677
                //creating mask 00 for +ve and -ve values and FF for zero.
3678
4
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3679
4
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3680
                //row 0 left
3681
4
                signdwn1_16x8b = _mm_alignr_epi8(src_temp0_16x8b, signdwn1_16x8b, 14);
3682
                //combining the appropriate sign change
3683
4
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
3684
                //row 1 -row0
3685
                //separating +ve and and -ve values.
3686
4
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
3687
4
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
3688
3689
                //creating mask 00 for +ve and -ve values and FF for zero.
3690
4
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3691
4
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3692
                //row1-row0
3693
4
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
3694
3695
4
                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
3696
3697
4
                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
3698
                //right row2
3699
4
                signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 2);
3700
                //packing row 0 n row 1
3701
4
                src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, src_temp1_16x8b);
3702
                //row1 -row2
3703
4
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
3704
4
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
3705
                //creating mask 00 for +ve and -ve values and FF for zero.
3706
4
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3707
4
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3708
                //combining the appropriate sign change
3709
4
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
3710
4
                edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
3711
                //manipulation for row 2 -row 1
3712
4
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 12);
3713
                //row 1 left
3714
4
                signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 14);
3715
                //row = 3
3716
4
                src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 3 * src_strd));
3717
3718
                // row = 4
3719
4
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 4 * src_strd));
3720
3721
4
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
3722
3723
                //separating +ve and and -ve values.(2,1)
3724
4
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
3725
4
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
3726
                //manipulation for row 3 -row 2
3727
4
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 10);
3728
                //creating mask 00 for +ve and -ve values and FF for zero.
3729
4
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3730
4
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3731
                //row 2 left
3732
4
                signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 14);
3733
                //combining the appropriate sign change
3734
4
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1)
3735
3736
                //separating +ve and and -ve values.(3,2)
3737
4
                cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
3738
4
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
3739
4
                signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8); //aligned left (2-1)
3740
                //creating mask 00 for +ve and -ve values and FF for zero.
3741
4
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3742
4
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3743
                //right row3
3744
4
                signdwn1_16x8b = _mm_srli_si128(src_top_16x8b, 2);
3745
                //combining the appropriate sign change
3746
4
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-2)
3747
3748
4
                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(3-2) ,(2-1)
3749
3750
                //separating +ve and and -ve values.(2,3)
3751
4
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
3752
4
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
3753
                //right row 4
3754
4
                signdwn1_16x8b =  _mm_srli_si128(src_temp1_16x8b, 2);
3755
                //creating mask 00 for +ve and -ve values and FF for zero.
3756
4
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3757
4
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3758
                //combining the appropriate sign change
3759
4
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-3)
3760
3761
                //separating +ve and and -ve values.(3,bottom)
3762
4
                cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
3763
4
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
3764
3765
                //creating mask 00 for +ve and -ve values and FF for zero.
3766
4
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3767
4
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3768
4
                edge1_16x8b = _mm_slli_si128(edge1_16x8b, 8); //aligned left (2-3)
3769
                //combining the appropriate sign change
3770
4
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-bottom)
3771
4
                edge1_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge1_16x8b, 8); //(3-bottom),(2-3)
3772
3773
                //manipulation for bottom -row 3
3774
4
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 8);
3775
                //eliminating old left for row 0,1,2,3
3776
4
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 8);
3777
                //packing row 2 n row 3
3778
4
                src_bottom_16x8b = _mm_unpacklo_epi64(src_bottom_16x8b, src_top_16x8b);
3779
                //row 3 left
3780
4
                signdwn1_16x8b = _mm_alignr_epi8(src_top_16x8b, signdwn1_16x8b, 14);
3781
3782
                //adding bottom and top values of row 2 and row 3
3783
4
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, signup0_16x8b); //(3,2)
3784
                //separating +ve and and -ve values.(botttom,3)
3785
4
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
3786
4
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
3787
3788
                //creating mask 00 for +ve and -ve values and FF for zero.
3789
4
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3790
4
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3791
4
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(bottom -3) for next iteration
3792
3793
                //to store right of row 2
3794
4
                signdwn1_16x8b = _mm_slli_si128(src_bottom_16x8b, 8);
3795
                //loading row 3 right into left
3796
4
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_bottom_16x8b, 14);
3797
                //storing right of row 2into left
3798
4
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
3799
                //to store right of row 0
3800
4
                signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
3801
                //storing right of row 1 into left
3802
4
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
3803
                //storing right of row 0 into left
3804
4
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
3805
3806
                //adding constant 2
3807
4
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
3808
4
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
3809
                //shuffle to get sao index
3810
4
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
3811
4
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
3812
                //using availability mask
3813
4
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
3814
4
                edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
3815
3816
                //adding chroma offset to access U and V
3817
4
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
3818
4
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);
3819
3820
                //shuffle to get sao offset
3821
4
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
3822
4
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
3823
                //cnvert to 16 bit then add and then saturated pack
3824
4
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
3825
4
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
3826
4
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
3827
4
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
3828
4
                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
3829
4
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
3830
4
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
3831
4
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
3832
3833
4
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
3834
4
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_bottom_16x8b, const0_16x8b);
3835
4
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
3836
4
                src_bottom_16x8b = _mm_unpackhi_epi8(src_bottom_16x8b, const0_16x8b);
3837
4
                edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
3838
4
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
3839
4
                src_bottom_16x8b = _mm_add_epi16(src_bottom_16x8b, edge1_16x8b);
3840
4
                src_bottom_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_bottom_16x8b);
3841
3842
4
                cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
3843
4
                cmp_lt0_16x8b = _mm_srli_si128(src_bottom_16x8b, 8);
3844
3845
3846
4
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
3847
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
3848
4
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
3849
                // row = 1
3850
4
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
3851
                //row = 2
3852
4
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_bottom_16x8b);
3853
                // row = 3
3854
4
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b);
3855
3856
4
                src_temp0_16x8b = src_temp1_16x8b;
3857
4
                signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
3858
4
                pu1_src_cpy += (src_strd << 2);
3859
4
                pu1_src_left_cpy += 8;
3860
4
                pu1_src_left_str += 8;
3861
4
            }
3862
6
            ht_rem = ht & 0x2;
3863
6
            if(ht_rem)
3864
4
            {
3865
4
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
3866
                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
3867
4
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
3868
                // row = 2
3869
4
                src_bottom_16x8b =  _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
3870
3871
                //row 0 -row 1
3872
4
                signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);
3873
                //separating +ve and and -ve values.
3874
4
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
3875
4
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
3876
                //manipulation for row 1 -row 0
3877
4
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 14);
3878
                //creating mask 00 for +ve and -ve values and FF for zero.
3879
4
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3880
4
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3881
                //manipulation for row 1 - row 0
3882
4
                signdwn1_16x8b = _mm_alignr_epi8(src_temp0_16x8b, signdwn1_16x8b, 14);
3883
                //combining the appropriate sign change
3884
4
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
3885
3886
                //row1-row0
3887
                //separating +ve and and -ve values.
3888
4
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
3889
4
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
3890
3891
                //creating mask 00 for +ve and -ve values and FF for zero.
3892
4
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3893
4
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3894
                //combining the appropriate sign chang
3895
4
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
3896
                //row 1 -bottom
3897
4
                signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 2);
3898
3899
4
                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
3900
4
                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
3901
                //row1 -bottom
3902
4
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
3903
4
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
3904
3905
                //creating mask 00 for +ve and -ve values and FF for zero.
3906
4
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3907
4
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3908
                //combining the appropriate sign change
3909
4
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
3910
4
                edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
3911
                //manipulation for bottom -row1
3912
4
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 12);
3913
                //eliminating old left for row 0,1
3914
4
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 4);
3915
                //manipulation for bottom- row 1
3916
4
                signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 14);
3917
                //adding top and down substraction
3918
4
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
3919
                //bottom - row 1
3920
4
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
3921
4
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
3922
3923
                //shifting row 1
3924
4
                signdwn1_16x8b = _mm_slli_si128(src_temp1_16x8b, 8);
3925
                //creating mask 00 for +ve and -ve values and FF for zero.
3926
4
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3927
4
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3928
                //for the next iteration signup0_16x8b
3929
4
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1) for next
3930
                //storing right of row 1 into left
3931
4
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14); //for storing right of row 0
3932
4
                signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
3933
                //the next top  in  src_top_16x8b
3934
4
                src_top_16x8b = src_temp1_16x8b;
3935
                //storing right of row 0 into left
3936
4
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
3937
3938
3939
                //adding constant 2
3940
4
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
3941
3942
                //shuffle to get sao index
3943
4
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
3944
                //using availability mask
3945
4
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
3946
3947
                //adding chroma offset to access U and V
3948
4
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
3949
3950
                //shuffle to get sao offset
3951
4
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
3952
                //the next top already in  src_top_16x8b
3953
                //cnvert to 16 bit then add and then saturated pack
3954
4
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
3955
4
                src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
3956
4
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
3957
4
                src_temp1_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
3958
4
                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
3959
4
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
3960
4
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge0_16x8b);
3961
4
                src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, src_temp1_16x8b);
3962
3963
4
                cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
3964
3965
4
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
3966
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
3967
4
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
3968
                // row = 1
3969
4
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
3970
4
                src_temp0_16x8b = src_bottom_16x8b;
3971
4
                pu1_src_cpy += (src_strd << 1);
3972
4
                pu1_src_left_cpy += 4;
3973
4
                pu1_src_left_str += 4;
3974
4
            }
3975
6
            ht_rem = ht & 0x1;
3976
6
            if(ht_rem)
3977
0
            {
3978
0
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
3979
                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
3980
0
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
3981
3982
                //row 0 -row1
3983
0
                signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 2);
3984
                //separating +ve and and -ve values.
3985
0
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
3986
0
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
3987
                //creating mask 00 for +ve and -ve values and FF for zero.
3988
0
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3989
0
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3990
                //combining the appropriate sign change
3991
0
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
3992
                //adding top and down substraction
3993
0
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
3994
3995
                //for row 0 right to put into left store
3996
0
                signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
3997
                //left store manipulation 1
3998
0
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
3999
0
                src_top_16x8b = src_temp0_16x8b;
4000
                //filling the left boundary value
4001
0
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
4002
4003
                //adding constant 2
4004
0
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
4005
0
                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8);
4006
0
                edge0_16x8b = _mm_srli_si128(edge0_16x8b, 8);
4007
4008
4009
                //shuffle to get sao index
4010
0
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
4011
                //using availability mask
4012
0
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
4013
                //adding chroma offset to access U and V
4014
0
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
4015
4016
                //shuffle to get sao offset
4017
0
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
4018
4019
                //cnvert to 16 bit then add and then saturated pack
4020
0
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
4021
0
                src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
4022
0
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
4023
0
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
4024
0
                src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, const0_16x8b);
4025
4026
0
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
4027
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
4028
0
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
4029
0
                pu1_src_cpy += (src_strd);
4030
0
                pu1_src_left_cpy += 2;
4031
0
                pu1_src_left_str += 2;
4032
0
            }
4033
6
            if(0 == pu1_avail[3])
4034
4
            {
4035
4
                src_top_16x8b = src_bottom_16x8b;
4036
4
                pu1_src_left_str[1] = pu1_src_cpy[7];
4037
4
                pu1_src_left_str[0] = pu1_src_cpy[6];
4038
4
            }
4039
4040
6
            if(0 == pu1_avail[2])
4041
4
            {
4042
4
                pu1_src_left_str[-2 * ht_0] = pu1_src[6 - src_strd];
4043
4
                pu1_src_left_str[-2 * ht_0 + 1] = pu1_src[7 - src_strd];
4044
4
            }
4045
4046
6
            _mm_storel_epi64((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
4047
6
            pu1_src += 8;
4048
4049
6
            pu1_left_tmp = pu1_src_left_cpy2;
4050
6
            pu1_src_left_cpy2 = pu1_src_left_str2;
4051
6
            pu1_src_left_str2 = pu1_left_tmp;
4052
4053
6
            pu1_src_left_cpy = pu1_src_left_cpy2;
4054
6
            pu1_src_left_str = pu1_src_left_str2;
4055
6
        }
4056
63.1k
        pu1_src_org[0] = u1_pos_0_0_tmp_u;
4057
63.1k
        pu1_src_org[1] = u1_pos_0_0_tmp_v;
4058
63.1k
        pu1_src_org[wd - 2 + (ht_tmp - 1) * src_strd] = u1_pos_wd_ht_tmp_u;
4059
63.1k
        pu1_src_org[wd - 1 + (ht_tmp - 1) * src_strd] = u1_pos_wd_ht_tmp_v;
4060
63.1k
        pu1_src_left_cpy = (0 == pu1_avail[2]) ? (pu1_src_left_cpy - 2) : pu1_src_left_cpy;
4061
2.06M
        for(row = 0; row < 2 * ht_tmp; row++)
4062
2.00M
        {
4063
2.00M
            pu1_src_left[row] = pu1_src_left_cpy[row];
4064
2.00M
        }
4065
63.1k
    }
4066
4067
63.1k
}
4068
4069
void ihevc_sao_edge_offset_class3_ssse3(UWORD8 *pu1_src,
4070
                                        WORD32 src_strd,
4071
                                        UWORD8 *pu1_src_left,
4072
                                        UWORD8 *pu1_src_top,
4073
                                        UWORD8 *pu1_src_top_left,
4074
                                        UWORD8 *pu1_src_top_right,
4075
                                        UWORD8 *pu1_src_bot_left,
4076
                                        UWORD8 *pu1_avail,
4077
                                        WORD8 *pi1_sao_offset,
4078
                                        WORD32 wd,
4079
                                        WORD32 ht)
4080
56.9k
{
4081
56.9k
    WORD32 row, col;
4082
56.9k
    UWORD8 *pu1_src_top_cpy, *pu1_src_left_cpy, *pu1_src_left_cpy2;
4083
56.9k
    UWORD8 *pu1_left_tmp, *pu1_src_left_str, *pu1_src_left_str2;
4084
56.9k
    UWORD8 *pu1_src_cpy, *pu1_src_org;
4085
56.9k
    UWORD8 au1_src_left_tmp[MAX_CTB_SIZE + 8];
4086
56.9k
    UWORD8 au1_src_left_tmp1[MAX_CTB_SIZE + 8];
4087
56.9k
    UWORD8 au1_mask[MAX_CTB_SIZE], *au1_mask_cpy;
4088
56.9k
    WORD32 wd_rem;
4089
56.9k
    UWORD8 u1_pos_wd_0_tmp, u1_pos_0_ht_tmp;
4090
56.9k
    WORD32 ht_tmp;
4091
56.9k
    WORD32 bit_depth;
4092
56.9k
    UWORD8 u1_avail0, u1_avail1;
4093
4094
56.9k
    __m128i src_top_16x8b, src_bottom_16x8b;
4095
56.9k
    __m128i src_temp0_16x8b, src_temp1_16x8b;
4096
56.9k
    __m128i signup0_16x8b, signdwn1_16x8b;
4097
56.9k
    __m128i cmp_gt0_16x8b, cmp_lt0_16x8b;
4098
56.9k
    __m128i edge0_16x8b, edge1_16x8b;
4099
56.9k
    __m128i au1_mask8x16b;
4100
56.9k
    __m128i edge_idx_8x16b, sao_offset_8x16b;
4101
56.9k
    __m128i const2_16x8b, const0_16x8b;
4102
56.9k
    __m128i left_store_16x8b;
4103
4104
56.9k
    ht_tmp = ht;
4105
56.9k
    au1_mask8x16b = _mm_set1_epi8(0xff);
4106
4107
56.9k
    au1_src_left_tmp[0] = pu1_src[(wd - 1)];
4108
    //manipulation for bottom left
4109
1.79M
    for(row = 1; row < ht; row++)
4110
1.74M
    {
4111
1.74M
        au1_src_left_tmp[row] = pu1_src_left[row];
4112
1.74M
    }
4113
56.9k
    au1_src_left_tmp[ht] = pu1_src_bot_left[0];
4114
4115
56.9k
    *pu1_src_top_left = pu1_src_top[wd - 1];
4116
    //setting availability mask to ff size MAX_CTB_SIZE
4117
284k
    for(col = 0; col < MAX_CTB_SIZE; col += 16)
4118
227k
        _mm_storeu_si128((__m128i *)(au1_mask + col), au1_mask8x16b);
4119
56.9k
    bit_depth = BIT_DEPTH_LUMA;
4120
56.9k
    pu1_src_org = pu1_src;
4121
56.9k
    pu1_src_top_cpy = pu1_src_top;
4122
56.9k
    pu1_src_left_cpy2 = au1_src_left_tmp;
4123
56.9k
    pu1_src_left_cpy = au1_src_left_tmp;
4124
56.9k
    pu1_src_left_str2 = au1_src_left_tmp1;
4125
56.9k
    pu1_src_left_str = au1_src_left_tmp1;
4126
56.9k
    edge_idx_8x16b   = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
4127
56.9k
    sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset);
4128
4129
    /* If top-right is available, process separately */
4130
56.9k
    if(0 != pu1_avail[5])
4131
54.5k
    {
4132
54.5k
        WORD32 edge_idx;
4133
4134
54.5k
        edge_idx = 2 + SIGN(pu1_src[wd - 1] - pu1_src_top_right[0]) +
4135
54.5k
                        SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 1 + src_strd]);
4136
4137
54.5k
        edge_idx = gi1_table_edge_idx[edge_idx];
4138
4139
54.5k
        if(0 != edge_idx)
4140
15.9k
        {
4141
15.9k
            u1_pos_wd_0_tmp = CLIP3(pu1_src[wd - 1] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
4142
15.9k
        }
4143
38.6k
        else
4144
38.6k
        {
4145
38.6k
            u1_pos_wd_0_tmp = pu1_src[wd - 1];
4146
38.6k
        }
4147
54.5k
    }
4148
2.40k
    else
4149
2.40k
    {
4150
2.40k
        u1_pos_wd_0_tmp = pu1_src[wd - 1];
4151
2.40k
    }
4152
4153
    /* If bottom-left is available, process separately */
4154
56.9k
    if(0 != pu1_avail[6])
4155
52.7k
    {
4156
52.7k
        WORD32 edge_idx;
4157
4158
52.7k
        edge_idx = 2 + SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd + 1 - src_strd]) +
4159
52.7k
                        SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0]);
4160
4161
52.7k
        edge_idx = gi1_table_edge_idx[edge_idx];
4162
4163
52.7k
        if(0 != edge_idx)
4164
18.7k
        {
4165
18.7k
            u1_pos_0_ht_tmp = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
4166
18.7k
        }
4167
33.9k
        else
4168
33.9k
        {
4169
33.9k
            u1_pos_0_ht_tmp = pu1_src[(ht - 1) * src_strd];
4170
33.9k
        }
4171
52.7k
    }
4172
4.22k
    else
4173
4.22k
    {
4174
4.22k
        u1_pos_0_ht_tmp = pu1_src[(ht - 1) * src_strd];
4175
4.22k
    }
4176
4177
4178
4179
    /* Update height and source pointers based on the availability flags */
4180
56.9k
    if(0 == pu1_avail[2])
4181
1.34k
    {
4182
1.34k
        pu1_src_left_cpy2++;
4183
1.34k
        pu1_src_left_str2++;
4184
1.34k
        pu1_src_top_cpy = pu1_src;
4185
1.34k
        pu1_src += src_strd;
4186
1.34k
        ht--;
4187
1.34k
    }
4188
56.9k
    if(0 == pu1_avail[3])
4189
1.99k
    {
4190
1.99k
        ht--;
4191
1.99k
    }
4192
4193
4194
56.9k
    const2_16x8b = _mm_set1_epi8(2);
4195
56.9k
    const0_16x8b = _mm_setzero_si128();
4196
4197
4198
    //availability mask creation
4199
56.9k
    u1_avail0 = pu1_avail[0];
4200
56.9k
    u1_avail1 = pu1_avail[1];
4201
56.9k
    au1_mask[0] = u1_avail0;
4202
56.9k
    au1_mask[wd - 1] = u1_avail1;
4203
56.9k
    {
4204
56.9k
        WORD32 ht_rem;
4205
4206
56.9k
        pu1_src_left_cpy = pu1_src_left_cpy2;
4207
56.9k
        pu1_src_left_str = pu1_src_left_str2;
4208
56.9k
        au1_mask_cpy = au1_mask;
4209
144k
        for(col = wd; col >= 16; col -= 16)
4210
87.8k
        {
4211
87.8k
            pu1_src_cpy = pu1_src;
4212
87.8k
            src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col + 1));
4213
            //row = 0
4214
87.8k
            src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
4215
4216
            //loading the mask
4217
87.8k
            au1_mask8x16b = _mm_loadu_si128((__m128i *)au1_mask_cpy);
4218
            //separating +ve and and -ve values.
4219
87.8k
            cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
4220
87.8k
            cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
4221
            //creating mask 00 for +ve and -ve values and FF for zero.
4222
87.8k
            cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4223
87.8k
            cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4224
            //combining the appropriate sign change
4225
87.8k
            signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
4226
4227
1.46M
            for(row = ht; row >= 2; row -= 2)
4228
1.37M
            {
4229
1.37M
                left_store_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_left_cpy));
4230
                //row = 1
4231
1.37M
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
4232
                //to insert left in row 1
4233
1.37M
                signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 14);
4234
                // row = 0 right
4235
1.37M
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 1));
4236
4237
                //manipulation for row 1 - row 0
4238
1.37M
                signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 15);
4239
                //row 0 -row1
4240
                //separating +ve and and -ve values.
4241
1.37M
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
4242
1.37M
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
4243
4244
                //creating mask 00 for +ve and -ve values and FF for zero.
4245
1.37M
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4246
1.37M
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4247
4248
                //combining the appropriate sign change
4249
1.37M
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(0-1)
4250
                //combining sign-left and sign_right
4251
1.37M
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
4252
4253
                //row1-row0
4254
                //separating +ve and and -ve values.
4255
1.37M
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
4256
1.37M
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
4257
                //creating mask 00 for +ve and -ve values and FF for zero.
4258
1.37M
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4259
1.37M
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4260
4261
                // row = 2
4262
1.37M
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
4263
                // row = 1 right
4264
1.37M
                signdwn1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 1));
4265
1.37M
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-0)
4266
4267
                //bottom - row1
4268
1.37M
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
4269
1.37M
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
4270
                //creating mask 00 for +ve and -ve values and FF for zero.
4271
1.37M
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4272
1.37M
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4273
                //for the next iteration bottom -row1
4274
1.37M
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
4275
4276
                //to insert left in row 1
4277
1.37M
                signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 13);
4278
                //manipulation for row 1 - bottom
4279
1.37M
                signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 15);
4280
4281
                //row1 -bottom
4282
1.37M
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
4283
1.37M
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
4284
                //creating mask 00 for +ve and -ve values and FF for zero.
4285
1.37M
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4286
1.37M
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4287
                //combining the appropriate sign change
4288
1.37M
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
4289
4290
                //combining sign-left and sign_right
4291
1.37M
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, signdwn1_16x8b);
4292
4293
                //eliminating old left for row 0 and row 1
4294
1.37M
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
4295
4296
                //row1  getting it right for left of next block
4297
1.37M
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp1_16x8b, 15);
4298
                //adding constant 2
4299
1.37M
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
4300
1.37M
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
4301
                //shuffle to get sao index
4302
1.37M
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
4303
1.37M
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
4304
                //using availability mask
4305
1.37M
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
4306
1.37M
                edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
4307
                //shuffle to get sao offset
4308
1.37M
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
4309
1.37M
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
4310
                //row0  getting it right for left of next block
4311
1.37M
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
4312
                //copying the next top
4313
1.37M
                src_top_16x8b = src_temp1_16x8b;
4314
                //cnvert to 16 bit then add and then saturated pack
4315
1.37M
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
4316
1.37M
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
4317
1.37M
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
4318
1.37M
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
4319
1.37M
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
4320
1.37M
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
4321
1.37M
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
4322
1.37M
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
4323
4324
1.37M
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
4325
1.37M
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
4326
1.37M
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
4327
1.37M
                src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
4328
1.37M
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
4329
1.37M
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
4330
1.37M
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
4331
1.37M
                src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
4332
                //store left boundary
4333
1.37M
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
4334
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
4335
1.37M
                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
4336
                // row = 1
4337
1.37M
                _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);
4338
4339
1.37M
                src_temp0_16x8b = src_bottom_16x8b;
4340
1.37M
                pu1_src_cpy += (src_strd << 1);
4341
1.37M
                pu1_src_left_cpy += 2;
4342
1.37M
                pu1_src_left_str += 2;
4343
1.37M
            }
4344
87.8k
            ht_rem = ht & 0x1;
4345
4346
87.8k
            if(ht_rem)
4347
5.12k
            {
4348
5.12k
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
4349
5.12k
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
4350
                //to insert left in row 1
4351
5.12k
                signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 14);
4352
                //manipulation for row 1 - row 0
4353
5.12k
                signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 15);
4354
4355
                //current row -next row
4356
                //separating +ve and and -ve values.
4357
5.12k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
4358
5.12k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
4359
                //creating mask 00 for +ve and -ve values and FF for zero.
4360
5.12k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4361
5.12k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4362
                //combining the appropriate sign change
4363
5.12k
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
4364
                //adding top and bottom and constant 2
4365
5.12k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
4366
5.12k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
4367
                //eliminating old left for row 0 and row 1
4368
5.12k
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 1);
4369
4370
5.12k
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
4371
                //using availability mask
4372
5.12k
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
4373
4374
5.12k
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
4375
4376
                //row0  getting it right for left of next block
4377
5.12k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
4378
                //copying the next top
4379
5.12k
                src_top_16x8b = src_temp0_16x8b;
4380
                //cnvert to 16 bit then add and then saturated pack
4381
5.12k
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
4382
5.12k
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
4383
5.12k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
4384
5.12k
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
4385
5.12k
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
4386
5.12k
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
4387
5.12k
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
4388
5.12k
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
4389
                //store left boundary
4390
5.12k
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
4391
4392
5.12k
                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
4393
5.12k
                pu1_src_cpy += (src_strd);
4394
5.12k
                src_temp0_16x8b = src_bottom_16x8b;
4395
5.12k
                pu1_src_left_cpy++;
4396
5.12k
                pu1_src_left_str++;
4397
5.12k
            }
4398
87.8k
            {   //for bottom right
4399
87.8k
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
4400
87.8k
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 1);
4401
87.8k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
4402
87.8k
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
4403
87.8k
            }
4404
87.8k
            if(0 == pu1_avail[3])
4405
3.03k
            {
4406
3.03k
                src_top_16x8b = src_bottom_16x8b;
4407
3.03k
            }
4408
            //for the top left of next part of the block
4409
87.8k
            left_store_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
4410
            //updating top flag
4411
87.8k
            _mm_storeu_si128((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
4412
87.8k
            pu1_src += 16;
4413
87.8k
            au1_mask_cpy += 16;
4414
4415
87.8k
            pu1_left_tmp = pu1_src_left_cpy2;
4416
87.8k
            pu1_src_left_cpy2 = pu1_src_left_str2;
4417
87.8k
            pu1_src_left_str2 = pu1_left_tmp;
4418
4419
87.8k
            pu1_src_left_cpy = pu1_src_left_cpy2;
4420
87.8k
            pu1_src_left_str = pu1_src_left_str2;
4421
87.8k
        }
4422
4423
56.9k
        wd_rem = wd & 0xF;
4424
56.9k
        if(wd_rem)
4425
55.8k
        {
4426
55.8k
            pu1_src_cpy = pu1_src;
4427
55.8k
            pu1_src_left_cpy = pu1_src_left_cpy2;
4428
55.8k
            pu1_src_left_str = pu1_src_left_str2;
4429
55.8k
            src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_top_cpy + wd - col + 1));
4430
            //row = 0
4431
55.8k
            src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
4432
55.8k
            au1_mask8x16b = _mm_loadl_epi64((__m128i *)au1_mask_cpy); //????
4433
            //separating +ve and and -ve values.
4434
55.8k
            cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
4435
55.8k
            cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
4436
            //creating mask 00 for +ve and -ve values and FF for zero.
4437
55.8k
            cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4438
55.8k
            cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4439
            //preparing au1_mask
4440
55.8k
            au1_mask8x16b = _mm_unpacklo_epi64(au1_mask8x16b, au1_mask8x16b);
4441
            //combining the appropriate sign change
4442
55.8k
            signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
4443
55.8k
            signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
4444
4445
492k
            for(row = ht; row >= 4; row -= 4)
4446
436k
            {
4447
436k
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
4448
                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
4449
436k
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
4450
                // row = 2
4451
436k
                src_bottom_16x8b =  _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
4452
                //manipulation for row 0 -row 1
4453
436k
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 14);
4454
                //row 1 left
4455
436k
                signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 15);
4456
                //row 0 -row1
4457
                //separating +ve and and -ve values.
4458
436k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
4459
436k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
4460
4461
                //creating mask 00 for +ve and -ve values and FF for zero.
4462
436k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4463
436k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4464
                //manipulatiing for row 1 -row 0
4465
436k
                signdwn1_16x8b = _mm_srli_si128(src_temp0_16x8b, 1);
4466
                //combining the appropriate sign change
4467
436k
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
4468
                //row 1 -row0
4469
                //separating +ve and and -ve values.
4470
436k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
4471
436k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
4472
4473
                //creating mask 00 for +ve and -ve values and FF for zero.
4474
436k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4475
436k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4476
                //row1-row0
4477
436k
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
4478
4479
436k
                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
4480
4481
436k
                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
4482
                //manipulation for row 1 -row 2
4483
436k
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 13);
4484
                //row 2 left
4485
436k
                signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 15);
4486
                //packing row 0 n row 1
4487
436k
                src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, src_temp1_16x8b);
4488
                //row1 -row2
4489
436k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
4490
436k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
4491
                //creating mask 00 for +ve and -ve values and FF for zero.
4492
436k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4493
436k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4494
                //combining the appropriate sign change
4495
436k
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
4496
436k
                edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
4497
4498
                //row 1 right
4499
436k
                signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 1);
4500
                //row = 3
4501
436k
                src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 3 * src_strd));
4502
4503
                // row = 4
4504
436k
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 4 * src_strd));
4505
4506
436k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
4507
4508
                //separating +ve and and -ve values.(2,1)
4509
436k
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
4510
436k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
4511
4512
                //creating mask 00 for +ve and -ve values and FF for zero.
4513
436k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4514
436k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4515
                //row 2 right
4516
436k
                signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 1);
4517
                //combining the appropriate sign change
4518
436k
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1)
4519
4520
                //separating +ve and and -ve values.(3,2)
4521
436k
                cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
4522
436k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
4523
436k
                signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8); //aligned left (2-1)
4524
                //creating mask 00 for +ve and -ve values and FF for zero.
4525
436k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4526
436k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4527
                //manipulation for row 2 -row 3
4528
436k
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 12);
4529
                //row 3 left
4530
436k
                signdwn1_16x8b = _mm_alignr_epi8(src_top_16x8b, signdwn1_16x8b, 15);
4531
                //combining the appropriate sign change
4532
436k
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-2)
4533
4534
436k
                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(3-2) ,(2-1)
4535
4536
                //separating +ve and and -ve values.(2,3)
4537
436k
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
4538
436k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
4539
4540
                //manipulation for row 3 -bottom
4541
436k
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 11);
4542
                //bottom left
4543
436k
                signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 15);
4544
4545
                //creating mask 00 for +ve and -ve values and FF for zero.
4546
436k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4547
436k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4548
                //combining the appropriate sign change
4549
436k
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-3)
4550
4551
                //separating +ve and and -ve values.(3,bottom)
4552
436k
                cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
4553
436k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
4554
4555
                //creating mask 00 for +ve and -ve values and FF for zero.
4556
436k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4557
436k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4558
436k
                edge1_16x8b = _mm_slli_si128(edge1_16x8b, 8); //aligned left (2-3)
4559
                //combining the appropriate sign change
4560
436k
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-bottom)
4561
436k
                edge1_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge1_16x8b, 8); //(3-bottom),(2-3)
4562
4563
4564
                //eliminating old left for row 0,1,2,3
4565
436k
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 4);
4566
                //packing row 2 n row 3
4567
436k
                src_bottom_16x8b = _mm_unpacklo_epi64(src_bottom_16x8b, src_top_16x8b);
4568
                //row 3 right
4569
436k
                signdwn1_16x8b = _mm_srli_si128(src_top_16x8b, 1);
4570
                //loading row 3 right into left
4571
436k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_bottom_16x8b, 15);
4572
                //adding bottom and top values of row 2 and row 3
4573
436k
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, signup0_16x8b); //(3,2)
4574
                //separating +ve and and -ve values.(botttom,3)
4575
436k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
4576
436k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
4577
                //to store right of row 2
4578
436k
                signdwn1_16x8b = _mm_slli_si128(src_bottom_16x8b, 8);
4579
                //creating mask 00 for +ve and -ve values and FF for zero.
4580
436k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4581
436k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4582
436k
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(bottom -3) for next iteration
4583
4584
                //storing right of row 2into left
4585
436k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
4586
                //to store right of row 0
4587
436k
                signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
4588
                //storing right of row 1 into left
4589
436k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
4590
4591
                //adding constant 2
4592
436k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
4593
436k
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
4594
                //shuffle to get sao index
4595
436k
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
4596
436k
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
4597
                //using availability mask
4598
436k
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
4599
436k
                edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
4600
                //shuffle to get sao offset
4601
436k
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
4602
436k
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
4603
4604
                //storing right of row 0 into left
4605
436k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
4606
                //cnvert to 16 bit then add and then saturated pack
4607
436k
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
4608
436k
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
4609
436k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
4610
436k
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
4611
436k
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
4612
436k
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
4613
436k
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
4614
436k
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
4615
4616
436k
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
4617
436k
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_bottom_16x8b, const0_16x8b);
4618
436k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
4619
436k
                src_bottom_16x8b = _mm_unpackhi_epi8(src_bottom_16x8b, const0_16x8b);
4620
436k
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
4621
436k
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
4622
436k
                src_bottom_16x8b = _mm_add_epi16(src_bottom_16x8b, cmp_lt0_16x8b);
4623
436k
                src_bottom_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_bottom_16x8b);
4624
4625
436k
                cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
4626
436k
                cmp_lt0_16x8b = _mm_srli_si128(src_bottom_16x8b, 8);
4627
4628
436k
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
4629
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
4630
436k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
4631
                // row = 1
4632
436k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
4633
                //row = 2
4634
436k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_bottom_16x8b);
4635
                // row = 3
4636
436k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b);
4637
4638
436k
                src_temp0_16x8b = src_temp1_16x8b;
4639
436k
                signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
4640
436k
                pu1_src_cpy += (src_strd << 2);
4641
436k
                pu1_src_left_cpy += 4;
4642
436k
                pu1_src_left_str += 4;
4643
436k
            }
4644
55.8k
            ht_rem = ht & 0x2;
4645
55.8k
            if(ht_rem)
4646
3.24k
            {
4647
3.24k
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
4648
                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
4649
3.24k
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
4650
                // row = 2
4651
3.24k
                src_bottom_16x8b =  _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
4652
4653
                //manipulation for row 0 -row 1
4654
3.24k
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 14);
4655
                //bottom left
4656
3.24k
                signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 15);
4657
                //separating +ve and -ve values.
4658
3.24k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
4659
3.24k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
4660
4661
                //creating mask 00 for +ve and -ve values and FF for zero.
4662
3.24k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4663
3.24k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4664
                //manipulation for row 1 - row 0
4665
3.24k
                signdwn1_16x8b = _mm_srli_si128(src_temp0_16x8b, 1);
4666
                //combining the appropriate sign change
4667
3.24k
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
4668
4669
                //row1-row0
4670
                //separating +ve and -ve values.
4671
3.24k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
4672
3.24k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
4673
4674
                //creating mask 00 for +ve and -ve values and FF for zero.
4675
3.24k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4676
3.24k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4677
                //combining the appropriate sign change
4678
3.24k
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
4679
4680
                //manipulation for row 1 -bottom
4681
3.24k
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 13);
4682
                //bottom left
4683
3.24k
                signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 15);
4684
4685
3.24k
                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
4686
3.24k
                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
4687
                //row1 -bottom
4688
3.24k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
4689
3.24k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
4690
4691
                //creating mask 00 for +ve and -ve values and FF for zero.
4692
3.24k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4693
3.24k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4694
                //combining the appropriate sign change
4695
3.24k
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
4696
3.24k
                edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
4697
                //manipulation for bottom- row 1 (row 1 right)
4698
3.24k
                signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 1);
4699
                //adding top and down subtraction
4700
3.24k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
4701
                //bottom - row 1
4702
3.24k
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
4703
3.24k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
4704
4705
                //eliminating old left for row 0,1
4706
3.24k
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
4707
3.24k
                signdwn1_16x8b = _mm_slli_si128(src_temp1_16x8b, 8);
4708
                //creating mask 00 for +ve and -ve values and FF for zero.
4709
3.24k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4710
3.24k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4711
                //for the next iteration signup0_16x8b
4712
3.24k
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1) for next
4713
4714
                //storing right of row 1 into left
4715
3.24k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
4716
                //for storing right of row 1
4717
3.24k
                signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
4718
4719
3.24k
                src_top_16x8b = src_temp1_16x8b;
4720
                //storing right of row 0 into left
4721
3.24k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
4722
4723
                //adding constant 2
4724
3.24k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
4725
4726
                //shuffle to get sao index
4727
3.24k
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
4728
                //using availability mask
4729
3.24k
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
4730
                //shuffle to get sao offset
4731
3.24k
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
4732
4733
                //the next top already in  src_top_16x8b
4734
                //convert to 16 bit then add and then saturated pack
4735
3.24k
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
4736
3.24k
                src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
4737
3.24k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
4738
3.24k
                src_temp1_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
4739
3.24k
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
4740
3.24k
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
4741
3.24k
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
4742
3.24k
                src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, src_temp1_16x8b);
4743
4744
3.24k
                cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
4745
4746
3.24k
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
4747
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
4748
3.24k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
4749
                // row = 1
4750
3.24k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
4751
3.24k
                src_temp0_16x8b = src_bottom_16x8b;
4752
3.24k
                pu1_src_cpy += (src_strd << 1);
4753
3.24k
                pu1_src_left_cpy += 2;
4754
3.24k
                pu1_src_left_str += 2;
4755
3.24k
            }
4756
55.8k
            ht_rem = ht & 0x1;
4757
55.8k
            if(ht_rem)
4758
3.24k
            {
4759
3.24k
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
4760
                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
4761
3.24k
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
4762
4763
4764
                //manipulation for row 0 -bottom
4765
3.24k
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 14);
4766
                //bottom left
4767
3.24k
                signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 15);
4768
                //separating +ve and -ve values.
4769
3.24k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
4770
3.24k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
4771
                //creating mask 00 for +ve and -ve values and FF for zero.
4772
3.24k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4773
3.24k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4774
                //combining the appropriate sign change
4775
3.24k
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
4776
                //adding top and down subtraction
4777
3.24k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
4778
                //for row 0 right to put into left store
4779
3.24k
                signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
4780
                //adding constant 2
4781
3.24k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
4782
3.24k
                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8);
4783
3.24k
                edge0_16x8b = _mm_srli_si128(edge0_16x8b, 8);
4784
                //left store manipulation 1
4785
3.24k
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 1);
4786
                //filling the left boundary value
4787
3.24k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
4788
4789
                //shuffle to get sao index
4790
3.24k
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
4791
                //using availability mask
4792
3.24k
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
4793
                //shuffle to get sao offset
4794
3.24k
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
4795
3.24k
                src_top_16x8b = src_temp0_16x8b;
4796
                //convert to 16 bit then add and then saturated pack
4797
3.24k
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
4798
3.24k
                src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
4799
3.24k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
4800
3.24k
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
4801
3.24k
                src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, const0_16x8b);
4802
4803
3.24k
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
4804
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
4805
3.24k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
4806
3.24k
                pu1_src_cpy += (src_strd);
4807
3.24k
                src_temp0_16x8b = src_bottom_16x8b;
4808
3.24k
                pu1_src_left_cpy++;
4809
3.24k
                pu1_src_left_str++;
4810
3.24k
            }
4811
55.8k
            {   //for bottom right
4812
55.8k
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
4813
55.8k
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 1);
4814
55.8k
                src_temp0_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
4815
55.8k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
4816
55.8k
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
4817
55.8k
            }
4818
55.8k
            if(0 == pu1_avail[3])
4819
1.96k
            {
4820
1.96k
                src_top_16x8b = src_bottom_16x8b;
4821
1.96k
            }
4822
55.8k
            _mm_storel_epi64((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
4823
55.8k
            pu1_src += 8;
4824
4825
55.8k
            pu1_left_tmp = pu1_src_left_cpy2;
4826
55.8k
            pu1_src_left_cpy2 = pu1_src_left_str2;
4827
55.8k
            pu1_src_left_str2 = pu1_left_tmp;
4828
4829
55.8k
            pu1_src_left_cpy = pu1_src_left_cpy2;
4830
55.8k
            pu1_src_left_str = pu1_src_left_str2;
4831
4832
55.8k
        }
4833
56.9k
        pu1_src_org[wd - 1] = u1_pos_wd_0_tmp;
4834
56.9k
        pu1_src_org[(ht_tmp - 1) * src_strd] = u1_pos_0_ht_tmp;
4835
56.9k
        pu1_src_left_cpy = (0 == pu1_avail[2]) ? (pu1_src_left_cpy - 1) : pu1_src_left_cpy;
4836
56.9k
        pu1_src_left[0] = au1_src_left_tmp[0];
4837
1.79M
        for(row = 1; row < ht_tmp; row++)
4838
1.74M
        {
4839
1.74M
            pu1_src_left[row] = pu1_src_left_cpy[row];
4840
1.74M
        }
4841
56.9k
    }
4842
4843
56.9k
}
4844
4845
void ihevc_sao_edge_offset_class3_chroma_ssse3(UWORD8 *pu1_src,
4846
                                               WORD32 src_strd,
4847
                                               UWORD8 *pu1_src_left,
4848
                                               UWORD8 *pu1_src_top,
4849
                                               UWORD8 *pu1_src_top_left,
4850
                                               UWORD8 *pu1_src_top_right,
4851
                                               UWORD8 *pu1_src_bot_left,
4852
                                               UWORD8 *pu1_avail,
4853
                                               WORD8 *pi1_sao_offset_u,
4854
                                               WORD8 *pi1_sao_offset_v,
4855
                                               WORD32 wd,
4856
                                               WORD32 ht)
4857
62.9k
{
4858
62.9k
    WORD32 row, col;
4859
62.9k
    UWORD8 *pu1_src_top_cpy, *pu1_src_left_cpy, *pu1_src_left_cpy2;
4860
62.9k
    UWORD8 *pu1_src_cpy, *pu1_src_org;
4861
62.9k
    UWORD8 au1_src_left_tmp[2 * (MAX_CTB_SIZE + 8)];
4862
62.9k
    UWORD8 au1_mask[MAX_CTB_SIZE], *au1_mask_cpy;
4863
62.9k
    WORD32 wd_rem;
4864
62.9k
    UWORD8 u1_pos_wd_0_tmp_u, u1_pos_wd_0_tmp_v, u1_pos_0_ht_tmp_u, u1_pos_0_ht_tmp_v;
4865
62.9k
    WORD32 ht_tmp;
4866
62.9k
    WORD32 bit_depth;
4867
62.9k
    UWORD8 u1_avail0, u1_avail1;
4868
4869
62.9k
    __m128i src_top_16x8b, src_bottom_16x8b;
4870
62.9k
    __m128i src_temp0_16x8b, src_temp1_16x8b;
4871
62.9k
    __m128i signup0_16x8b, signdwn1_16x8b;
4872
62.9k
    __m128i cmp_gt0_16x8b, cmp_lt0_16x8b;
4873
62.9k
    __m128i edge0_16x8b, edge1_16x8b;
4874
62.9k
    __m128i au1_mask8x16b;
4875
62.9k
    __m128i edge_idx_8x16b, sao_offset_8x16b;
4876
62.9k
    __m128i left_store_16x8b;
4877
62.9k
    __m128i const0_16x8b, const2_16x8b;
4878
62.9k
    __m128i chroma_offset_8x16b;
4879
4880
62.9k
    ht_tmp = ht;
4881
62.9k
    au1_mask8x16b = _mm_set1_epi8(0xff);
4882
4883
4884
62.9k
    au1_src_left_tmp[0] = pu1_src[(wd - 2)];
4885
62.9k
    au1_src_left_tmp[1] = pu1_src[(wd - 1)];
4886
    //manipulation for bottom left
4887
1.92M
    for(row = 2; row < 2 * ht; row++)
4888
1.86M
    {
4889
1.86M
        au1_src_left_tmp[row] = pu1_src_left[row];
4890
1.86M
    }
4891
62.9k
    au1_src_left_tmp[2 * ht] = pu1_src_bot_left[0];
4892
62.9k
    au1_src_left_tmp[2 * ht + 1] = pu1_src_bot_left[1];
4893
4894
62.9k
    pu1_src_top_left[0] = pu1_src_top[wd - 2];
4895
62.9k
    pu1_src_top_left[1] = pu1_src_top[wd - 1];
4896
    //setting availability mask to ff size MAX_CTB_SIZE
4897
314k
    for(col = 0; col < MAX_CTB_SIZE; col += 16)
4898
251k
        _mm_storeu_si128((__m128i *)(au1_mask + col), au1_mask8x16b);
4899
62.9k
    bit_depth = BIT_DEPTH_LUMA;
4900
62.9k
    pu1_src_org = pu1_src;
4901
62.9k
    pu1_src_top_cpy = pu1_src_top;
4902
62.9k
    pu1_src_left_cpy2 = au1_src_left_tmp;
4903
62.9k
    pu1_src_left_cpy = au1_src_left_tmp;
4904
62.9k
    edge_idx_8x16b   = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
4905
62.9k
    sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_u);
4906
62.9k
    const0_16x8b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_v);
4907
62.9k
    chroma_offset_8x16b = _mm_set1_epi16(0x0800);
4908
    /* If top-right is available, process separately */
4909
62.9k
    if(0 != pu1_avail[5])
4910
59.3k
    {
4911
59.3k
        WORD32 edge_idx;
4912
4913
        /* U */
4914
59.3k
        edge_idx = 2 + SIGN(pu1_src[wd - 2] - pu1_src_top_right[0]) +
4915
59.3k
                        SIGN(pu1_src[wd - 2] - pu1_src[wd - 2 - 2 + src_strd]);
4916
4917
59.3k
        edge_idx = gi1_table_edge_idx[edge_idx];
4918
4919
59.3k
        if(0 != edge_idx)
4920
20.0k
        {
4921
20.0k
            u1_pos_wd_0_tmp_u = CLIP3(pu1_src[wd - 2] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1);
4922
20.0k
        }
4923
39.3k
        else
4924
39.3k
        {
4925
39.3k
            u1_pos_wd_0_tmp_u = pu1_src[wd - 2];
4926
39.3k
        }
4927
4928
        /* V */
4929
59.3k
        edge_idx = 2 + SIGN(pu1_src[wd - 1] - pu1_src_top_right[1]) +
4930
59.3k
                        SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 2 + src_strd]);
4931
4932
59.3k
        edge_idx = gi1_table_edge_idx[edge_idx];
4933
4934
59.3k
        if(0 != edge_idx)
4935
16.0k
        {
4936
16.0k
            u1_pos_wd_0_tmp_v = CLIP3(pu1_src[wd - 1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1);
4937
16.0k
        }
4938
43.3k
        else
4939
43.3k
        {
4940
43.3k
            u1_pos_wd_0_tmp_v = pu1_src[wd - 1];
4941
43.3k
        }
4942
59.3k
    }
4943
3.57k
    else
4944
3.57k
    {
4945
3.57k
        u1_pos_wd_0_tmp_u = pu1_src[wd - 2];
4946
3.57k
        u1_pos_wd_0_tmp_v = pu1_src[wd - 1];
4947
3.57k
    }
4948
4949
    /* If bottom-left is available, process separately */
4950
62.9k
    if(0 != pu1_avail[6])
4951
60.4k
    {
4952
60.4k
        WORD32 edge_idx;
4953
4954
        /* U */
4955
60.4k
        edge_idx = 2 + SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd + 2 - src_strd]) +
4956
60.4k
                        SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0]);
4957
4958
60.4k
        edge_idx = gi1_table_edge_idx[edge_idx];
4959
4960
60.4k
        if(0 != edge_idx)
4961
14.3k
        {
4962
14.3k
            u1_pos_0_ht_tmp_u = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1);
4963
14.3k
        }
4964
46.0k
        else
4965
46.0k
        {
4966
46.0k
            u1_pos_0_ht_tmp_u = pu1_src[(ht - 1) * src_strd];
4967
46.0k
        }
4968
4969
        /* V */
4970
60.4k
        edge_idx = 2 + SIGN(pu1_src[(ht - 1) * src_strd + 1] - pu1_src[(ht - 1) * src_strd + 1 + 2 - src_strd]) +
4971
60.4k
                        SIGN(pu1_src[(ht - 1) * src_strd + 1] - pu1_src_bot_left[1]);
4972
4973
60.4k
        edge_idx = gi1_table_edge_idx[edge_idx];
4974
4975
60.4k
        if(0 != edge_idx)
4976
16.1k
        {
4977
16.1k
            u1_pos_0_ht_tmp_v = CLIP3(pu1_src[(ht - 1) * src_strd + 1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1);
4978
16.1k
        }
4979
44.2k
        else
4980
44.2k
        {
4981
44.2k
            u1_pos_0_ht_tmp_v = pu1_src[(ht - 1) * src_strd + 1];
4982
44.2k
        }
4983
60.4k
    }
4984
2.56k
    else
4985
2.56k
    {
4986
2.56k
        u1_pos_0_ht_tmp_u = pu1_src[(ht - 1) * src_strd];
4987
2.56k
        u1_pos_0_ht_tmp_v = pu1_src[(ht - 1) * src_strd + 1];
4988
2.56k
    }
4989
4990
4991
4992
    /* Update height and source pointers based on the availability flags */
4993
62.9k
    if(0 == pu1_avail[2])
4994
2.02k
    {
4995
2.02k
        pu1_src_left_cpy2 += 2;
4996
2.02k
        pu1_src_top_cpy = pu1_src;
4997
2.02k
        pu1_src += src_strd;
4998
2.02k
        ht--;
4999
2.02k
    }
5000
62.9k
    if(0 == pu1_avail[3])
5001
1.74k
    {
5002
1.74k
        ht--;
5003
1.74k
    }
5004
5005
62.9k
    sao_offset_8x16b = _mm_unpacklo_epi64(sao_offset_8x16b, const0_16x8b);
5006
62.9k
    const2_16x8b = _mm_set1_epi8(2);
5007
62.9k
    const0_16x8b = _mm_setzero_si128();
5008
5009
5010
    //availability mask creation
5011
62.9k
    u1_avail0 = pu1_avail[0];
5012
62.9k
    u1_avail1 = pu1_avail[1];
5013
62.9k
    au1_mask[0] = u1_avail0;
5014
62.9k
    au1_mask[1] = u1_avail0;
5015
62.9k
    au1_mask[wd - 1] = u1_avail1;
5016
62.9k
    au1_mask[wd - 2] = u1_avail1;
5017
62.9k
    {
5018
62.9k
        WORD32 ht_rem;
5019
62.9k
        au1_mask_cpy = au1_mask;
5020
191k
        for(col = wd; col >= 16; col -= 16)
5021
128k
        {
5022
128k
            pu1_src_cpy = pu1_src;
5023
128k
            src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col + 2));
5024
            //row = 0
5025
128k
            src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
5026
5027
            //loading the mask
5028
128k
            au1_mask8x16b = _mm_loadu_si128((__m128i *)au1_mask_cpy);
5029
            //separating +ve and -ve values.
5030
128k
            cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
5031
128k
            cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
5032
            //creating mask 00 for +ve and -ve values and FF for zero.
5033
128k
            cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5034
128k
            cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5035
            //combining the appropriate sign change
5036
128k
            signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
5037
128k
            pu1_src_left_cpy = pu1_src_left_cpy2;
5038
5039
1.14M
            for(row = ht; row >= 2; row -= 2)
5040
1.01M
            {
5041
1.01M
                left_store_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_left_cpy));
5042
                //row = 1
5043
1.01M
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
5044
                //to insert left in row 1
5045
1.01M
                signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 12);
5046
                // row = 0 right
5047
1.01M
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2));
5048
5049
                //manipulation for row 1 - row 0
5050
1.01M
                signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 14);
5051
                //row 0 -row1
5052
                //separating +ve and -ve values.
5053
1.01M
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
5054
1.01M
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
5055
5056
                //creating mask 00 for +ve and -ve values and FF for zero.
5057
1.01M
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5058
1.01M
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5059
5060
                //combining the appropriate sign change
5061
1.01M
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(0-1)
5062
                //combining sign-left and sign_right
5063
1.01M
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
5064
5065
                //row1-row0
5066
                //separating +ve and -ve values.
5067
1.01M
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
5068
1.01M
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
5069
                //creating mask 00 for +ve and -ve values and FF for zero.
5070
1.01M
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5071
1.01M
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5072
5073
                // row = 2
5074
1.01M
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
5075
                // row = 1 right
5076
1.01M
                signdwn1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 2));
5077
1.01M
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-0)
5078
5079
                //bottom - row1
5080
1.01M
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
5081
1.01M
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
5082
                //creating mask 00 for +ve and -ve values and FF for zero.
5083
1.01M
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5084
1.01M
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5085
                //for the next iteration bottom -row1
5086
1.01M
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
5087
5088
                //to insert left in row 1
5089
1.01M
                signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 10);
5090
                //manipulation for row 1 - bottom
5091
1.01M
                signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 14);
5092
5093
                //row1 -bottom
5094
1.01M
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
5095
1.01M
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
5096
                //creating mask 00 for +ve and -ve values and FF for zero.
5097
1.01M
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5098
1.01M
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5099
                //combining the appropriate sign change
5100
1.01M
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
5101
5102
                //combining sign-left and sign_right
5103
1.01M
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, signdwn1_16x8b);
5104
5105
                //eliminating old left for row 0 and row 1
5106
1.01M
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 4);
5107
                //row1  getting it right for left of next block
5108
1.01M
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp1_16x8b, 14);
5109
                //row0  getting it right for left of next block
5110
1.01M
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
5111
                //copying the next top
5112
1.01M
                src_top_16x8b = src_temp1_16x8b;
5113
5114
5115
                //adding constant 2
5116
1.01M
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
5117
1.01M
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
5118
                //shuffle to get sao index
5119
1.01M
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
5120
1.01M
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
5121
                //using availability mask
5122
1.01M
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
5123
1.01M
                edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
5124
5125
                //adding chroma offset to access U and V
5126
1.01M
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
5127
1.01M
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);
5128
5129
                //shuffle to get sao offset
5130
1.01M
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
5131
1.01M
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
5132
                //convert to 16 bit then add and then saturated pack
5133
1.01M
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
5134
1.01M
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
5135
1.01M
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
5136
1.01M
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
5137
1.01M
                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
5138
1.01M
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
5139
1.01M
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
5140
1.01M
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
5141
5142
1.01M
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
5143
1.01M
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
5144
1.01M
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
5145
1.01M
                src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
5146
1.01M
                edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
5147
1.01M
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
5148
1.01M
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge1_16x8b);
5149
1.01M
                src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
5150
                //store left boundary
5151
1.01M
                _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);
5152
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
5153
1.01M
                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
5154
                // row = 1
5155
1.01M
                _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);
5156
5157
1.01M
                src_temp0_16x8b = src_bottom_16x8b;
5158
1.01M
                pu1_src_cpy += (src_strd << 1);
5159
1.01M
                pu1_src_left_cpy += 4;
5160
1.01M
            }
5161
128k
            ht_rem = ht & 0x1;
5162
5163
128k
            if(ht_rem)
5164
7.49k
            {
5165
7.49k
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
5166
7.49k
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
5167
                //to insert left in row 1
5168
7.49k
                signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 12);
5169
                //manipulation for row 1 - row 0
5170
7.49k
                signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 14);
5171
5172
                //current row -next row
5173
                //separating +ve and and -ve values.
5174
7.49k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
5175
7.49k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
5176
                //creating mask 00 for +ve and -ve values and FF for zero.
5177
7.49k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5178
7.49k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5179
                //combining the appropriate sign change
5180
7.49k
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
5181
                //adding top and bottom and constant 2
5182
7.49k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
5183
7.49k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
5184
                //eliminating old left for row 0 and row 1
5185
7.49k
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
5186
                //row0  getting it right for left of next block
5187
7.49k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
5188
                //copying the next top
5189
7.49k
                src_top_16x8b = src_temp0_16x8b;
5190
5191
7.49k
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
5192
                //using availability mask
5193
7.49k
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
5194
5195
                //adding chroma offset to access U and V
5196
7.49k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
5197
5198
5199
7.49k
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
5200
5201
                //cnvert to 16 bit then add and then saturated pack
5202
7.49k
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
5203
7.49k
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
5204
7.49k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
5205
7.49k
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
5206
7.49k
                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
5207
7.49k
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
5208
7.49k
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
5209
7.49k
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
5210
5211
                //store left boundary
5212
7.49k
                _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);
5213
5214
7.49k
                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
5215
7.49k
                pu1_src_cpy += (src_strd);
5216
7.49k
                src_temp0_16x8b = src_bottom_16x8b;
5217
7.49k
                pu1_src_left_cpy += 2;
5218
7.49k
            }
5219
128k
            {   //for bottom right
5220
128k
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
5221
128k
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
5222
128k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
5223
128k
                _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);
5224
128k
            }
5225
128k
            if(0 == pu1_avail[3])
5226
3.49k
            {
5227
3.49k
                src_top_16x8b = src_bottom_16x8b;
5228
3.49k
            }
5229
            //for the top left of next part of the block
5230
128k
            left_store_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
5231
            //updating top flag
5232
128k
            _mm_storeu_si128((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
5233
128k
            pu1_src += 16;
5234
128k
            au1_mask_cpy += 16;
5235
128k
        }
5236
62.9k
        pu1_src_left_cpy = pu1_src_left_cpy2;
5237
62.9k
        wd_rem = wd & 0xF;
5238
62.9k
        if(wd_rem)
5239
16
        {
5240
16
            pu1_src_cpy = pu1_src;
5241
16
            src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_top_cpy + wd - col + 2));
5242
            //row = 0
5243
16
            src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
5244
16
            au1_mask8x16b = _mm_loadl_epi64((__m128i *)au1_mask_cpy); //????
5245
            //separating +ve and and -ve values.
5246
16
            cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
5247
16
            cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
5248
            //creating mask 00 for +ve and -ve values and FF for zero.
5249
16
            cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5250
16
            cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5251
            //preparing au1_mask
5252
16
            au1_mask8x16b = _mm_unpacklo_epi64(au1_mask8x16b, au1_mask8x16b);
5253
            //combining the appropriate sign change
5254
16
            signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
5255
16
            signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
5256
16
            pu1_src_left_cpy = pu1_src_left_cpy2;
5257
18
            for(row = ht; row >= 4; row -= 4)
5258
2
            {
5259
2
                left_store_16x8b = _mm_loadu_si128((__m128i *)pu1_src_left_cpy);
5260
                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
5261
2
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
5262
                // row = 2
5263
2
                src_bottom_16x8b =  _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
5264
                //manipulation for row 0 -row 1
5265
2
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 12);
5266
                //row 1 left
5267
2
                signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 14);
5268
                //row 0 -row1
5269
                //separating +ve and and -ve values.
5270
2
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
5271
2
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
5272
5273
                //creating mask 00 for +ve and -ve values and FF for zero.
5274
2
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5275
2
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5276
                //manipulatiing for row 1 -row 0
5277
2
                signdwn1_16x8b = _mm_srli_si128(src_temp0_16x8b, 2);
5278
                //combining the appropriate sign change
5279
2
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
5280
                //row 1 -row0
5281
                //separating +ve and and -ve values.
5282
2
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
5283
2
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
5284
5285
                //creating mask 00 for +ve and -ve values and FF for zero.
5286
2
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5287
2
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5288
                //row1-row0
5289
2
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
5290
5291
2
                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
5292
5293
2
                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
5294
                //manipulation for row 1 -row 2
5295
2
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 10);
5296
                //row 2 left
5297
2
                signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 14);
5298
                //packing row 0 n row 1
5299
2
                src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, src_temp1_16x8b);
5300
                //row1 -row2
5301
2
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
5302
2
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
5303
                //creating mask 00 for +ve and -ve values and FF for zero.
5304
2
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5305
2
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5306
                //combining the appropriate sign change
5307
2
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
5308
2
                edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
5309
5310
                //row 1 right
5311
2
                signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);
5312
                //row = 3
5313
2
                src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 3 * src_strd));
5314
5315
                // row = 4
5316
2
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 4 * src_strd));
5317
5318
2
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
5319
5320
                //separating +ve and and -ve values.(2,1)
5321
2
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
5322
2
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
5323
5324
                //creating mask 00 for +ve and -ve values and FF for zero.
5325
2
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5326
2
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5327
                //row 2 right
5328
2
                signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 2);
5329
                //combining the appropriate sign change
5330
2
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1)
5331
5332
                //separating +ve and and -ve values.(3,2)
5333
2
                cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
5334
2
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
5335
2
                signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8); //aligned left (2-1)
5336
                //creating mask 00 for +ve and -ve values and FF for zero.
5337
2
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5338
2
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5339
                //manipulation for row 2 -row 3
5340
2
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 8);
5341
                //row 3 left
5342
2
                signdwn1_16x8b = _mm_alignr_epi8(src_top_16x8b, signdwn1_16x8b, 14);
5343
                //combining the appropriate sign change
5344
2
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-2)
5345
5346
2
                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(3-2) ,(2-1)
5347
5348
                //separating +ve and and -ve values.(2,3)
5349
2
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
5350
2
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
5351
5352
                //manipulation for row 3 -bottom
5353
2
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 6);
5354
                //bottom left
5355
2
                signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 14);
5356
5357
                //creating mask 00 for +ve and -ve values and FF for zero.
5358
2
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5359
2
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5360
                //combining the appropriate sign change
5361
2
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-3)
5362
5363
                //separating +ve and and -ve values.(3,bottom)
5364
2
                cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
5365
2
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
5366
5367
                //creating mask 00 for +ve and -ve values and FF for zero.
5368
2
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5369
2
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5370
2
                edge1_16x8b = _mm_slli_si128(edge1_16x8b, 8); //aligned left (2-3)
5371
                //combining the appropriate sign change
5372
2
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-bottom)
5373
2
                edge1_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge1_16x8b, 8); //(3-bottom),(2-3)
5374
5375
5376
                //eliminating old left for row 0,1,2,3
5377
2
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 8);
5378
                //packing row 2 n row 3
5379
2
                src_bottom_16x8b = _mm_unpacklo_epi64(src_bottom_16x8b, src_top_16x8b);
5380
                //row 3 right
5381
2
                signdwn1_16x8b = _mm_srli_si128(src_top_16x8b, 2);
5382
                //loading row 3 right into left
5383
2
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_bottom_16x8b, 14);
5384
                //adding bottom and top values of row 2 and row 3
5385
2
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, signup0_16x8b); //(3,2)
5386
                //separating +ve and and -ve values.(botttom,3)
5387
2
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
5388
2
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
5389
                //to store right of row 2
5390
2
                signdwn1_16x8b = _mm_slli_si128(src_bottom_16x8b, 8);
5391
                //creating mask 00 for +ve and -ve values and FF for zero.
5392
2
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5393
2
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5394
2
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(bottom -3) for next iteration
5395
5396
                //storing right of row 2into left
5397
2
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
5398
                //to store right of row 0
5399
2
                signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
5400
                //storing right of row 1 into left
5401
2
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
5402
                //storing right of row 0 into left
5403
2
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
5404
5405
5406
                //adding constant 2
5407
2
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
5408
2
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
5409
                //shuffle to get sao index
5410
2
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
5411
2
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
5412
                //using availability mask
5413
2
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
5414
2
                edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
5415
                //adding chroma offset to access U and V
5416
2
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
5417
2
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);
5418
                //shuffle to get sao offset
5419
2
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
5420
2
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
5421
5422
                //cnvert to 16 bit then add and then saturated pack
5423
2
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
5424
2
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
5425
2
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
5426
2
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
5427
2
                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
5428
2
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
5429
2
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
5430
2
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
5431
5432
2
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
5433
2
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_bottom_16x8b, const0_16x8b);
5434
2
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
5435
2
                src_bottom_16x8b = _mm_unpackhi_epi8(src_bottom_16x8b, const0_16x8b);
5436
2
                edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
5437
2
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
5438
2
                src_bottom_16x8b = _mm_add_epi16(src_bottom_16x8b, edge1_16x8b);
5439
2
                src_bottom_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_bottom_16x8b);
5440
5441
2
                cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
5442
2
                cmp_lt0_16x8b = _mm_srli_si128(src_bottom_16x8b, 8);
5443
2
                _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);
5444
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
5445
2
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
5446
                // row = 1
5447
2
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
5448
                //row = 2
5449
2
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_bottom_16x8b);
5450
                // row = 3
5451
2
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b);
5452
5453
2
                src_temp0_16x8b = src_temp1_16x8b;
5454
2
                signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
5455
2
                pu1_src_cpy += (src_strd << 2);
5456
2
                pu1_src_left_cpy += 8;
5457
2
            }
5458
16
            ht_rem = ht & 0x2;
5459
16
            if(ht_rem)
5460
16
            {
5461
16
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
5462
                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
5463
16
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
5464
                // row = 2
5465
16
                src_bottom_16x8b =  _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
5466
5467
                //manipulation for row 0 -row 1
5468
16
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 12);
5469
                //bottom left
5470
16
                signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 14);
5471
                //separating +ve and and -ve values.
5472
16
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
5473
16
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
5474
5475
                //creating mask 00 for +ve and -ve values and FF for zero.
5476
16
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5477
16
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5478
                //manipulation for row 1 - row 0
5479
16
                signdwn1_16x8b = _mm_srli_si128(src_temp0_16x8b, 2);
5480
                //combining the appropriate sign change
5481
16
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
5482
5483
                //row1-row0
5484
                //separating +ve and and -ve values.
5485
16
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
5486
16
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
5487
5488
                //creating mask 00 for +ve and -ve values and FF for zero.
5489
16
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5490
16
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5491
                //combining the appropriate sign chang
5492
16
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
5493
5494
                //manipulation for row 1 -bottom
5495
16
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 10);
5496
                //bottom left
5497
16
                signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 14);
5498
5499
16
                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
5500
16
                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
5501
                //row1 -bottom
5502
16
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
5503
16
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
5504
5505
                //creating mask 00 for +ve and -ve values and FF for zero.
5506
16
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5507
16
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5508
                //combining the appropriate sign change
5509
16
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
5510
16
                edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
5511
5512
                //manipulation for bottom- row 1 (row 1 right)
5513
16
                signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);
5514
                //adding top and down substraction
5515
16
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
5516
                //bottom - row 1
5517
16
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
5518
16
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
5519
5520
                //eliminating old left for row 0,1
5521
16
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 4);
5522
16
                signdwn1_16x8b = _mm_slli_si128(src_temp1_16x8b, 8);
5523
                //creating mask 00 for +ve and -ve values and FF for zero.
5524
16
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5525
16
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5526
                //for the next iteration signup0_16x8b
5527
16
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1) for next
5528
5529
                //storing right of row 1 into left
5530
16
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
5531
                //for storing right of row 1
5532
16
                signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
5533
5534
16
                src_top_16x8b = src_temp1_16x8b;
5535
                //storing right of row 0 into left
5536
16
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
5537
5538
                //adding constant 2
5539
16
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
5540
5541
                //shuffle to get sao index
5542
16
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
5543
                //using availability mask
5544
16
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
5545
                //adding chroma offset to access U and V
5546
16
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
5547
                //shuffle to get sao offset
5548
16
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
5549
                //the next top already in  src_top_16x8b
5550
                //cnvert to 16 bit then add and then saturated pack
5551
16
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
5552
16
                src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
5553
16
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
5554
16
                src_temp1_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
5555
16
                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
5556
16
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
5557
16
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge0_16x8b);
5558
16
                src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, src_temp1_16x8b);
5559
5560
16
                cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
5561
5562
16
                _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);
5563
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
5564
16
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
5565
                // row = 1
5566
16
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
5567
16
                src_temp0_16x8b = src_bottom_16x8b;
5568
16
                pu1_src_cpy += (src_strd << 1);
5569
16
                pu1_src_left_cpy += 4;
5570
16
            }
5571
16
            ht_rem = ht & 0x1;
5572
16
            if(ht_rem)
5573
4
            {
5574
4
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
5575
                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
5576
4
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
5577
5578
5579
                //manipulation for row 0 -bottom
5580
4
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 12);
5581
                //bottom left
5582
4
                signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 14);
5583
                //separating +ve and and -ve values.
5584
4
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
5585
4
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
5586
                //creating mask 00 for +ve and -ve values and FF for zero.
5587
4
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5588
4
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5589
                //combining the appropriate sign change
5590
4
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
5591
                //adding top and down substraction
5592
4
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
5593
                //for row 0 right to put into left store
5594
4
                signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
5595
                //adding constant 2
5596
4
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
5597
4
                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8);
5598
4
                edge0_16x8b = _mm_srli_si128(edge0_16x8b, 8);
5599
                //left store manipulation 1
5600
4
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
5601
                //filling the left boundary value
5602
4
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
5603
4
                src_top_16x8b = src_temp0_16x8b;
5604
5605
                //shuffle to get sao index
5606
4
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
5607
                //using availability mask
5608
4
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
5609
                //adding chroma offset to access U and V
5610
4
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
5611
                //shuffle to get sao offset
5612
4
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
5613
5614
                //cnvert to 16 bit then add and then saturated pack
5615
4
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
5616
4
                src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
5617
4
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
5618
4
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
5619
4
                src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, const0_16x8b);
5620
5621
4
                _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);
5622
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
5623
4
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
5624
4
                pu1_src_cpy += (src_strd);
5625
4
                src_temp0_16x8b = src_bottom_16x8b;
5626
4
                pu1_src_left_cpy += 2;
5627
4
            }
5628
16
            {   //for bottom right
5629
16
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
5630
16
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
5631
16
                src_temp0_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
5632
16
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
5633
16
                _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);
5634
16
            }
5635
16
            if(0 == pu1_avail[3])
5636
14
            {
5637
14
                src_top_16x8b = src_bottom_16x8b;
5638
14
            }
5639
5640
16
            _mm_storel_epi64((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
5641
16
            pu1_src += 8;
5642
16
        }
5643
62.9k
        pu1_src_org[wd - 2] = u1_pos_wd_0_tmp_u;
5644
62.9k
        pu1_src_org[wd - 1] = u1_pos_wd_0_tmp_v;
5645
62.9k
        pu1_src_org[(ht_tmp - 1) * src_strd] = u1_pos_0_ht_tmp_u;
5646
62.9k
        pu1_src_org[(ht_tmp - 1) * src_strd + 1] = u1_pos_0_ht_tmp_v;
5647
2.05M
        for(row = 0; row < 2 * ht_tmp; row++)
5648
1.99M
        {
5649
1.99M
            pu1_src_left[row] = au1_src_left_tmp[row];
5650
1.99M
        }
5651
62.9k
    }
5652
5653
62.9k
}