Coverage Report

Created: 2025-10-10 06:51

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libhevc/common/x86/ihevc_sao_ssse3_intr.c
Line
Count
Source
1
/******************************************************************************
2
*
3
* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4
*
5
* Licensed under the Apache License, Version 2.0 (the "License");
6
* you may not use this file except in compliance with the License.
7
* You may obtain a copy of the License at:
8
*
9
* http://www.apache.org/licenses/LICENSE-2.0
10
*
11
* Unless required by applicable law or agreed to in writing, software
12
* distributed under the License is distributed on an "AS IS" BASIS,
13
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
* See the License for the specific language governing permissions and
15
* limitations under the License.
16
*
17
******************************************************************************/
18
/**
19
*******************************************************************************
20
* @file
21
*  ihevc_sao_atom_intr.c
22
*
23
* @brief
24
*  Contains function definitions for Sample adaptive offset(SAO) used in-loop
25
* filtering
26
*
27
* @author
28
* 100592
29
*
30
* @par List of Functions:
31
*   - ihevc_sao_band_offset_luma_ssse3()
32
*   - ihevc_sao_band_offset_chroma_ssse3()
33
*   - ihevc_sao_edge_offset_class0_ssse3()
34
*   - ihevc_sao_edge_offset_class0_chroma_ssse3()
35
*   - ihevc_sao_edge_offset_class1_ssse3()
36
*   - ihevc_sao_edge_offset_class1_chroma_ssse3()
37
*   - ihevc_sao_edge_offset_class2_ssse3()
38
*   - ihevc_sao_edge_offset_class2_chroma_ssse3()
39
*   - ihevc_sao_edge_offset_class3_ssse3()
40
*   - ihevc_sao_edge_offset_class3_chroma_ssse3()
41
*
42
* @remarks
43
*  None
44
*
45
*******************************************************************************
46
*/
47
/*****************************************************************************/
48
/* File Includes                                                             */
49
/*****************************************************************************/
50
#include <stdio.h>
51
52
#include "ihevc_typedefs.h"
53
#include "ihevc_platform_macros.h"
54
#include "ihevc_macros.h"
55
#include "ihevc_func_selector.h"
56
#include "ihevc_defs.h"
57
#include "ihevc_tables_x86_intr.h"
58
#include "ihevc_common_tables.h"
59
#include "ihevc_sao.h"
60
61
#include <immintrin.h>
62
63
#define NUM_BAND_TABLE  32
64
/**
65
*******************************************************************************
66
*
67
* @brief
68
* Has two sets of functions : band offset and edge offset both for luma and chroma
69
* edge offset has horizontal ,vertical, 135 degree and 45 degree
70
*
71
* @par Description:
72
*
73
*
74
* @param[in-out] pu1_src
75
*  Pointer to the source
76
*
77
* @param[in] src_strd
78
*  Source stride
79
*
80
* @param[in-out] pu1_src_left
81
*  source left boundary
82
*
83
* @param[in-out] pu1_src_top
84
* Source top boundary
85
*
86
* @param[in-out] pu1_src_top_left
87
*  Source top left boundary
88
*
89
* @param[in] pu1_src_top_right
90
*  Source top right boundary
91
*
92
* @param[in] pu1_src_bot_left
93
*  Source bottom left boundary
94
*
95
* @param[in] pu1_avail
96
*  boundary availability flags
97
*
98
* @param[in] pi1_sao_offset_u
99
*  Chroma U sao offset values
100
*
101
* @param[in] pi1_sao_offset_v
102
*  Chroma V sao offset values
103
*
104
* @param[in] pi1_sao_offset
105
*  Luma sao offset values
106
*
107
* @param[in] wd
108
*  width of the source
109
110
* @param[in] ht
111
*  height of the source
112
* @returns
113
*
114
* @remarks
115
*  None
116
*
117
*******************************************************************************
118
*/
119
120
121
void ihevc_sao_band_offset_luma_ssse3(UWORD8 *pu1_src,
122
                                      WORD32 src_strd,
123
                                      UWORD8 *pu1_src_left,
124
                                      UWORD8 *pu1_src_top,
125
                                      UWORD8 *pu1_src_top_left,
126
                                      WORD32 sao_band_pos,
127
                                      WORD8 *pi1_sao_offset,
128
                                      WORD32 wd,
129
                                      WORD32 ht)
130
51.3k
{
131
51.3k
    WORD32 row, col;
132
51.3k
    UWORD8 *pu1_src_cpy;
133
51.3k
    WORD32 wd_rem;
134
51.3k
    WORD8 offset = 0;
135
136
51.3k
    __m128i src_temp0_8x16b, src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b;
137
51.3k
    __m128i band_table0_8x16b, band_table1_8x16b, band_table2_8x16b, band_table3_8x16b;
138
51.3k
    __m128i tmp_set_128i_1, tmp_set_128i_2, tmp_set_128i_3, tmp_set_128i_4;
139
51.3k
    __m128i band_pos_16x8b;
140
51.3k
    __m128i sao_offset;
141
51.3k
    __m128i cmp_mask, cmp_store;
142
143
    /* Updating left and top-left and top */
144
1.41M
    for(row = 0; row < ht; row++)
145
1.36M
    {
146
1.36M
        pu1_src_left[row] = pu1_src[row * src_strd + (wd - 1)];
147
1.36M
    }
148
51.3k
    pu1_src_top_left[0] = pu1_src_top[wd - 1];
149
226k
    for(col = 0; col < wd; col += 8)
150
174k
    {
151
174k
        tmp_set_128i_1 = _mm_loadl_epi64((__m128i *)(pu1_src + (ht - 1) * src_strd + offset));
152
174k
        _mm_storel_epi64((__m128i *)(pu1_src_top + offset), tmp_set_128i_1);
153
174k
        offset += 8;
154
174k
    }
155
156
    //replicating sao_band_pos as 8 bit value 16 times
157
158
159
51.3k
    band_pos_16x8b = _mm_set1_epi16((WORD16)(sao_band_pos << 3));
160
    //value set for sao_offset extraction
161
51.3k
    tmp_set_128i_1  = _mm_set_epi8(-128, 1, -128, 1, -128, 1, -128, 1, -128, 1, -128, 1, -128, 1, -128, 1);
162
51.3k
    tmp_set_128i_2  = _mm_set_epi8(-128, 2, -128, 2, -128, 2, -128, 2, -128, 2, -128, 2, -128, 2, -128, 2);
163
51.3k
    tmp_set_128i_3  = _mm_set_epi8(-128, 3, -128, 3, -128, 3, -128, 3, -128, 3, -128, 3, -128, 3, -128, 3);
164
51.3k
    tmp_set_128i_4  = _mm_set_epi8(-128, 4, -128, 4, -128, 4, -128, 4, -128, 4, -128, 4, -128, 4, -128, 4);
165
166
    //loaded sao offset values
167
51.3k
    sao_offset      = _mm_loadl_epi64((__m128i *)pi1_sao_offset);
168
169
    //loading 16bit 32values of gu2_table_band_idx consecutively in 4 registers
170
51.3k
    band_table0_8x16b = _mm_load_si128((__m128i *)(gu2_table_band_idx));
171
51.3k
    band_table1_8x16b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 8));
172
51.3k
    band_table2_8x16b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 16));
173
51.3k
    band_table3_8x16b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 24));
174
175
    //band_position addition
176
51.3k
    band_table0_8x16b = _mm_add_epi16(band_table0_8x16b, band_pos_16x8b);
177
51.3k
    band_table1_8x16b = _mm_add_epi16(band_table1_8x16b, band_pos_16x8b);
178
51.3k
    band_table2_8x16b = _mm_add_epi16(band_table2_8x16b, band_pos_16x8b);
179
51.3k
    band_table3_8x16b = _mm_add_epi16(band_table3_8x16b, band_pos_16x8b);
180
    //sao_offset duplication
181
51.3k
    tmp_set_128i_1  = _mm_shuffle_epi8(sao_offset, tmp_set_128i_1);
182
51.3k
    tmp_set_128i_2  = _mm_shuffle_epi8(sao_offset, tmp_set_128i_2);
183
51.3k
    tmp_set_128i_3  = _mm_shuffle_epi8(sao_offset, tmp_set_128i_3);
184
51.3k
    tmp_set_128i_4  = _mm_shuffle_epi8(sao_offset, tmp_set_128i_4);
185
    //setting for comparison
186
51.3k
    cmp_mask = _mm_set1_epi16(16);
187
51.3k
    cmp_store = _mm_set1_epi16(0x00ff);
188
189
    //sao_offset addition
190
51.3k
    band_table0_8x16b = _mm_add_epi16(band_table0_8x16b, tmp_set_128i_1);
191
51.3k
    band_table1_8x16b = _mm_add_epi16(band_table1_8x16b, tmp_set_128i_2);
192
51.3k
    band_table2_8x16b = _mm_add_epi16(band_table2_8x16b, tmp_set_128i_3);
193
51.3k
    band_table3_8x16b = _mm_add_epi16(band_table3_8x16b, tmp_set_128i_4);
194
    //masking upper 8bit values of each  16 bit band table value
195
51.3k
    band_table0_8x16b = _mm_and_si128(band_table0_8x16b, cmp_store);
196
51.3k
    band_table1_8x16b = _mm_and_si128(band_table1_8x16b, cmp_store);
197
51.3k
    band_table2_8x16b = _mm_and_si128(band_table2_8x16b, cmp_store);
198
51.3k
    band_table3_8x16b = _mm_and_si128(band_table3_8x16b, cmp_store);
199
200
51.3k
    switch(sao_band_pos)
201
51.3k
    {
202
4.28k
        case 0:
203
4.28k
            tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table0_8x16b);
204
4.28k
            band_table0_8x16b = _mm_and_si128(band_table0_8x16b, tmp_set_128i_2);
205
4.28k
            break;
206
856
        case 28:
207
856
            tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table3_8x16b);
208
856
            band_table3_8x16b = _mm_or_si128(band_table3_8x16b, tmp_set_128i_2);
209
856
            break;
210
3.17k
        case 29:
211
3.17k
            tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table2_8x16b);
212
3.17k
            band_table2_8x16b = _mm_or_si128(band_table2_8x16b, tmp_set_128i_2);
213
3.17k
            tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table3_8x16b);
214
3.17k
            band_table3_8x16b = _mm_and_si128(band_table3_8x16b, tmp_set_128i_2);
215
3.17k
            break;
216
757
        case 30:
217
757
            tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table1_8x16b);
218
757
            band_table1_8x16b = _mm_or_si128(band_table1_8x16b, tmp_set_128i_2);
219
757
            tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table2_8x16b);
220
757
            band_table2_8x16b = _mm_and_si128(band_table2_8x16b, tmp_set_128i_2);
221
757
            break;
222
664
        case 31:
223
664
            tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table0_8x16b);
224
664
            band_table0_8x16b = _mm_or_si128(band_table0_8x16b, tmp_set_128i_2);
225
664
            tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table1_8x16b);
226
664
            band_table1_8x16b = _mm_and_si128(band_table1_8x16b, tmp_set_128i_2);
227
664
            break;
228
41.6k
        default:
229
41.6k
            break;
230
51.3k
    }
231
    //sao_offset is reused for zero cmp mask.
232
51.3k
    sao_offset = _mm_setzero_si128();
233
51.3k
    tmp_set_128i_1 = _mm_set1_epi8(1);
234
    //tmp_set_128i_2 = _mm_set_epi8 (128,7,128,6,128,5,128,4,128,3,128,2,128,1,128,0);
235
51.3k
    cmp_mask = _mm_packus_epi16(cmp_mask, cmp_mask); //cmp_msk=dup16(16);
236
237
    //masking upper 8bit values of each  16 bit band table value
238
51.3k
    band_table0_8x16b = _mm_and_si128(band_table0_8x16b, cmp_store);
239
51.3k
    band_table1_8x16b = _mm_and_si128(band_table1_8x16b, cmp_store);
240
51.3k
    band_table2_8x16b = _mm_and_si128(band_table2_8x16b, cmp_store);
241
51.3k
    band_table3_8x16b = _mm_and_si128(band_table3_8x16b, cmp_store);
242
243
    //band table 8x16 four registers are packed into 16x8 two registers:  band_table0_8x16b and band_table2_8x16b
244
51.3k
    band_table0_8x16b = _mm_packus_epi16(band_table0_8x16b, band_table1_8x16b);
245
51.3k
    band_table2_8x16b = _mm_packus_epi16(band_table2_8x16b, band_table3_8x16b);
246
247
51.3k
    band_table3_8x16b = _mm_slli_epi16(cmp_mask, 1); // to compare if value is greater than 31
248
51.3k
    band_pos_16x8b = _mm_packus_epi16(band_pos_16x8b, band_pos_16x8b); //band_pos is now 8 bit aligned
249
51.3k
    band_table3_8x16b = _mm_sub_epi8(band_table3_8x16b, tmp_set_128i_1); // to compare if value is greater than 31
250
251
51.3k
    cmp_mask = _mm_sub_epi8(cmp_mask, tmp_set_128i_1);
252
    //  band_pos_16x8b = _mm_or_si128(band_pos_16x8b,cmp_store);
253
254
114k
    for(col = wd; col >= 16; col -= 16)
255
62.7k
    {
256
62.7k
        pu1_src_cpy = pu1_src;
257
1.05M
        for(row = ht; row > 0; row -= 2)
258
989k
        {
259
260
261
            //row = 0 load 8 pixel values from 7:0 pos. relative to cur. pos.
262
989k
            src_temp0_8x16b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
263
            // row = 1
264
989k
            src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
265
266
267
268
            //saturated subtract 8 bit
269
989k
            tmp_set_128i_1 = _mm_sub_epi8(src_temp0_8x16b, band_pos_16x8b);
270
989k
            tmp_set_128i_3 = _mm_sub_epi8(src_temp2_8x16b, band_pos_16x8b);
271
            //if the values less than 0 put ff
272
989k
            tmp_set_128i_2 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_1);
273
989k
            tmp_set_128i_4 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_3);
274
989k
            tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
275
989k
            tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
276
            //if the values greater than 31 put ff
277
989k
            tmp_set_128i_2 = _mm_cmpgt_epi8(tmp_set_128i_1, band_table3_8x16b);
278
989k
            tmp_set_128i_4 = _mm_cmpgt_epi8(tmp_set_128i_3, band_table3_8x16b);
279
989k
            tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
280
989k
            tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
281
282
283
            //row 0 and row1
284
            //if the values >16 then put ff ,cmp_mask = dup16(15)
285
989k
            cmp_store = _mm_cmpgt_epi8(tmp_set_128i_1, cmp_mask);
286
            //values 16 to 31 for row 0 & 1 but values <16 ==0
287
989k
            tmp_set_128i_2 = _mm_and_si128(tmp_set_128i_1, cmp_store);
288
            // values 0 to 15 for row 0 & 1
289
989k
            tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, cmp_store);
290
            //values 16 to 31 for row 0 & 1 but values <16 masked to ff
291
989k
            cmp_store = _mm_cmpeq_epi8(tmp_set_128i_2, sao_offset);
292
989k
            tmp_set_128i_2 = _mm_or_si128(tmp_set_128i_2, cmp_store);
293
            //row 2 and  row 3
294
            //if the values >16 then put ff ,cmp_mask = dup16(15)
295
989k
            cmp_store = _mm_cmpgt_epi8(tmp_set_128i_3, cmp_mask);
296
            //values 16 to 31 for row 2 & 3 but values <16 ==0
297
989k
            tmp_set_128i_4 = _mm_and_si128(tmp_set_128i_3, cmp_store);
298
            // values 0 to 15 for row 2 & 3
299
989k
            tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, cmp_store);
300
            //values 16 to 31 for row 2 & 3 but values <16 masked to ff
301
989k
            cmp_store = _mm_cmpeq_epi8(tmp_set_128i_4, sao_offset);
302
989k
            tmp_set_128i_4 = _mm_or_si128(tmp_set_128i_4, cmp_store);
303
304
            //row 0 and row 1
305
            //to preserve pixel values in which no offset needs to be added.
306
989k
            cmp_store = _mm_cmpeq_epi8(tmp_set_128i_1, tmp_set_128i_2);
307
989k
            src_temp0_8x16b = _mm_and_si128(src_temp0_8x16b, cmp_store);
308
309
            //row 2 and row 3
310
            //to preserve pixel values in which no offset needs to be added.
311
989k
            cmp_store = _mm_cmpeq_epi8(tmp_set_128i_3, tmp_set_128i_4);
312
989k
            src_temp2_8x16b = _mm_and_si128(src_temp2_8x16b, cmp_store);
313
314
            //indexing 0 - 15 bandtable indexes
315
989k
            tmp_set_128i_1 = _mm_shuffle_epi8(band_table0_8x16b, tmp_set_128i_1);
316
989k
            tmp_set_128i_3 = _mm_shuffle_epi8(band_table0_8x16b, tmp_set_128i_3);
317
989k
            tmp_set_128i_2 = _mm_shuffle_epi8(band_table2_8x16b, tmp_set_128i_2);
318
989k
            tmp_set_128i_4 = _mm_shuffle_epi8(band_table2_8x16b, tmp_set_128i_4);
319
            // combining all offsets results
320
989k
            tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
321
989k
            tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
322
            // combining results with the pixel values
323
989k
            src_temp0_8x16b = _mm_or_si128(src_temp0_8x16b, tmp_set_128i_1);
324
989k
            src_temp2_8x16b = _mm_or_si128(src_temp2_8x16b, tmp_set_128i_3);
325
326
327
            //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
328
989k
            _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_8x16b);
329
            // row = 1
330
989k
            _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp2_8x16b);
331
332
989k
            pu1_src_cpy += (src_strd << 1);
333
989k
        }
334
62.7k
        pu1_src += 16;
335
62.7k
    }
336
51.3k
    wd_rem = wd & 0xF;
337
51.3k
    if(wd_rem)
338
49.3k
    {pu1_src_cpy = pu1_src;
339
376k
        for(row = ht; row > 0; row -= 4)
340
327k
        {
341
342
343
            //row = 0 load 8 pixel values from 7:0 pos. relative to cur. pos.
344
327k
            src_temp0_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
345
            // row = 1
346
327k
            src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
347
            // row = 2
348
327k
            src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));
349
            // row = 3
350
327k
            src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd));
351
            //row0 and row1 packed and row2 and row3 packed
352
353
327k
            src_temp0_8x16b = _mm_unpacklo_epi64(src_temp0_8x16b, src_temp1_8x16b);
354
327k
            src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp3_8x16b);
355
356
            //saturated subtract 8 bit
357
327k
            tmp_set_128i_1 = _mm_sub_epi8(src_temp0_8x16b, band_pos_16x8b);
358
327k
            tmp_set_128i_3 = _mm_sub_epi8(src_temp2_8x16b, band_pos_16x8b);
359
            //if the values less than 0 put ff
360
327k
            tmp_set_128i_2 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_1);
361
327k
            tmp_set_128i_4 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_3);
362
327k
            tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
363
327k
            tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
364
            //if the values greater than 31 put ff
365
327k
            tmp_set_128i_2 = _mm_cmpgt_epi8(tmp_set_128i_1, band_table3_8x16b);
366
327k
            tmp_set_128i_4 = _mm_cmpgt_epi8(tmp_set_128i_3, band_table3_8x16b);
367
327k
            tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
368
327k
            tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
369
370
371
372
            //row 0 and row1
373
            //if the values >16 then put ff ,cmp_mask = dup16(15)
374
327k
            cmp_store = _mm_cmpgt_epi8(tmp_set_128i_1, cmp_mask);
375
            //values 16 to 31 for row 0 & 1 but values <16 ==0
376
327k
            tmp_set_128i_2 = _mm_and_si128(tmp_set_128i_1, cmp_store);
377
            // values 0 to 15 for row 0 & 1
378
327k
            tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, cmp_store);
379
            //values 16 to 31 for row 0 & 1 but values <16 masked to ff
380
327k
            cmp_store = _mm_cmpeq_epi8(tmp_set_128i_2, sao_offset);
381
327k
            tmp_set_128i_2 = _mm_or_si128(tmp_set_128i_2, cmp_store);
382
            //row 2 and  row 3
383
            //if the values >16 then put ff ,cmp_mask = dup16(15)
384
327k
            cmp_store = _mm_cmpgt_epi8(tmp_set_128i_3, cmp_mask);
385
            //values 16 to 31 for row 2 & 3 but values <16 ==0
386
327k
            tmp_set_128i_4 = _mm_and_si128(tmp_set_128i_3, cmp_store);
387
            // values 0 to 15 for row 2 & 3
388
327k
            tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, cmp_store);
389
            //values 16 to 31 for row 2 & 3 but values <16 masked to ff
390
327k
            cmp_store = _mm_cmpeq_epi8(tmp_set_128i_4, sao_offset);
391
327k
            tmp_set_128i_4 = _mm_or_si128(tmp_set_128i_4, cmp_store);
392
393
            //row 0 and row 1
394
            //to preserve pixel values in which no offset needs to be added.
395
327k
            cmp_store = _mm_cmpeq_epi8(tmp_set_128i_1, tmp_set_128i_2);
396
327k
            src_temp0_8x16b = _mm_and_si128(src_temp0_8x16b, cmp_store);
397
398
            //row 2 and row 3
399
            //to preserve pixel values in which no offset needs to be added.
400
327k
            cmp_store = _mm_cmpeq_epi8(tmp_set_128i_3, tmp_set_128i_4);
401
327k
            src_temp2_8x16b = _mm_and_si128(src_temp2_8x16b, cmp_store);
402
403
            //indexing 0 - 15 bandtable indexes
404
327k
            tmp_set_128i_1 = _mm_shuffle_epi8(band_table0_8x16b, tmp_set_128i_1);
405
327k
            tmp_set_128i_3 = _mm_shuffle_epi8(band_table0_8x16b, tmp_set_128i_3);
406
327k
            tmp_set_128i_2 = _mm_shuffle_epi8(band_table2_8x16b, tmp_set_128i_2);
407
327k
            tmp_set_128i_4 = _mm_shuffle_epi8(band_table2_8x16b, tmp_set_128i_4);
408
            // combining all offsets results
409
327k
            tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
410
327k
            tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
411
            // combining results with the pixel values
412
327k
            src_temp0_8x16b = _mm_or_si128(src_temp0_8x16b, tmp_set_128i_1);
413
327k
            src_temp2_8x16b = _mm_or_si128(src_temp2_8x16b, tmp_set_128i_3);
414
415
            //Getting row1 separately
416
327k
            src_temp1_8x16b = _mm_srli_si128(src_temp0_8x16b, 8);
417
            //Getting row3 separately
418
327k
            src_temp3_8x16b = _mm_srli_si128(src_temp2_8x16b, 8);
419
420
            //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
421
327k
            _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_8x16b);
422
            // row = 1
423
327k
            _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), src_temp1_8x16b);
424
            // row = 2
425
327k
            _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_temp2_8x16b);
426
            // row = 3
427
327k
            _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), src_temp3_8x16b);
428
429
327k
            pu1_src_cpy += (src_strd << 2);
430
431
327k
        }
432
49.3k
        pu1_src += 8;
433
49.3k
    }
434
435
436
51.3k
}
437
438
void ihevc_sao_band_offset_chroma_ssse3(UWORD8 *pu1_src,
439
                                        WORD32 src_strd,
440
                                        UWORD8 *pu1_src_left,
441
                                        UWORD8 *pu1_src_top,
442
                                        UWORD8 *pu1_src_top_left,
443
                                        WORD32 sao_band_pos_u,
444
                                        WORD32 sao_band_pos_v,
445
                                        WORD8 *pi1_sao_offset_u,
446
                                        WORD8 *pi1_sao_offset_v,
447
                                        WORD32 wd,
448
                                        WORD32 ht)
449
23.4k
{
450
23.4k
    WORD32 row, col;
451
23.4k
    WORD8 offset = 0;
452
453
454
23.4k
    __m128i src_temp0_8x16b, src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b;
455
23.4k
    __m128i cmp_msk2;
456
23.4k
    __m128i band_table0_16x8b, band_table1_16x8b, band_table2_16x8b, band_table3_16x8b;
457
23.4k
    __m128i tmp_set_128i_1, tmp_set_128i_2, tmp_set_128i_3, tmp_set_128i_4;
458
23.4k
    __m128i band_pos_u_16x8b, band_pos_v_16x8b;
459
23.4k
    __m128i sao_offset;
460
23.4k
    __m128i cmp_mask;
461
462
463
    /* Updating left and top and top-left */
464
367k
    for(row = 0; row < ht; row++)
465
344k
    {
466
344k
        pu1_src_left[2 * row] = pu1_src[row * src_strd + (wd - 2)];
467
344k
        pu1_src_left[2 * row + 1] = pu1_src[row * src_strd + (wd - 1)];
468
344k
    }
469
23.4k
    pu1_src_top_left[0] = pu1_src_top[wd - 2];
470
23.4k
    pu1_src_top_left[1] = pu1_src_top[wd - 1];
471
110k
    for(col = 0; col < wd; col += 8)
472
87.3k
    {
473
87.3k
        tmp_set_128i_1 = _mm_loadl_epi64((__m128i *)(pu1_src + (ht - 1) * src_strd + offset));
474
87.3k
        _mm_storel_epi64((__m128i *)(pu1_src_top + offset), tmp_set_128i_1);
475
87.3k
        offset += 8;
476
87.3k
    }
477
478
23.4k
    { // band _table creation
479
23.4k
        __m128i temp0_8x16b, temp1_8x16b, temp2_8x16b, temp3_8x16b;
480
        // Band table for U component : band_table0_16x8b and band_table2_16x8b
481
        //replicating sao_band_pos as 8 bit value 16 times
482
23.4k
        band_pos_u_16x8b = _mm_set1_epi16((WORD16)(sao_band_pos_u << 3));
483
        //value set for sao_offset extraction
484
23.4k
        tmp_set_128i_1  = _mm_set_epi8(-128, 1, -128, 1, -128, 1, -128, 1, -128, 1, -128, 1, -128, 1, -128, 1);
485
23.4k
        tmp_set_128i_2  = _mm_set_epi8(-128, 2, -128, 2, -128, 2, -128, 2, -128, 2, -128, 2, -128, 2, -128, 2);
486
23.4k
        tmp_set_128i_3  = _mm_set_epi8(-128, 3, -128, 3, -128, 3, -128, 3, -128, 3, -128, 3, -128, 3, -128, 3);
487
23.4k
        tmp_set_128i_4  = _mm_set_epi8(-128, 4, -128, 4, -128, 4, -128, 4, -128, 4, -128, 4, -128, 4, -128, 4);
488
489
        //loaded sao offset values
490
23.4k
        sao_offset      = _mm_loadl_epi64((__m128i *)pi1_sao_offset_u);
491
492
        //loading 16bit 32values of gu2_table_band_idx consecutively in 4 registers
493
23.4k
        band_table0_16x8b = _mm_load_si128((__m128i *)(gu2_table_band_idx));
494
23.4k
        band_table1_16x8b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 8));
495
23.4k
        band_table2_16x8b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 16));
496
23.4k
        band_table3_16x8b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 24));
497
498
        //band_position addition
499
23.4k
        band_table0_16x8b = _mm_add_epi16(band_table0_16x8b, band_pos_u_16x8b);
500
23.4k
        band_table1_16x8b = _mm_add_epi16(band_table1_16x8b, band_pos_u_16x8b);
501
23.4k
        band_table2_16x8b = _mm_add_epi16(band_table2_16x8b, band_pos_u_16x8b);
502
23.4k
        band_table3_16x8b = _mm_add_epi16(band_table3_16x8b, band_pos_u_16x8b);
503
        //sao_offset duplication
504
23.4k
        temp0_8x16b = _mm_shuffle_epi8(sao_offset, tmp_set_128i_1);
505
23.4k
        temp1_8x16b = _mm_shuffle_epi8(sao_offset, tmp_set_128i_2);
506
23.4k
        temp2_8x16b = _mm_shuffle_epi8(sao_offset, tmp_set_128i_3);
507
23.4k
        temp3_8x16b = _mm_shuffle_epi8(sao_offset, tmp_set_128i_4);
508
509
        //sao_offset addition
510
23.4k
        band_table0_16x8b = _mm_add_epi16(band_table0_16x8b, temp0_8x16b);
511
23.4k
        band_table1_16x8b = _mm_add_epi16(band_table1_16x8b, temp1_8x16b);
512
23.4k
        band_table2_16x8b = _mm_add_epi16(band_table2_16x8b, temp2_8x16b);
513
23.4k
        band_table3_16x8b = _mm_add_epi16(band_table3_16x8b, temp3_8x16b);
514
        //reuse for clipping
515
23.4k
        temp1_8x16b = _mm_set1_epi16(0x00ff);
516
        //setting for comparison
517
23.4k
        cmp_mask = _mm_set1_epi16(16);
518
519
        //masking upper 8bit values of each  16 bit band table value
520
23.4k
        band_table0_16x8b = _mm_and_si128(band_table0_16x8b, temp1_8x16b);
521
23.4k
        band_table1_16x8b = _mm_and_si128(band_table1_16x8b, temp1_8x16b);
522
23.4k
        band_table2_16x8b = _mm_and_si128(band_table2_16x8b, temp1_8x16b);
523
23.4k
        band_table3_16x8b = _mm_and_si128(band_table3_16x8b, temp1_8x16b);
524
525
        //temp1_8x16b reuse for compare storage
526
23.4k
        switch(sao_band_pos_u)
527
23.4k
        {
528
1.87k
            case 0:
529
1.87k
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table0_16x8b);
530
1.87k
                band_table0_16x8b = _mm_and_si128(band_table0_16x8b, temp3_8x16b);
531
1.87k
                break;
532
562
            case 28:
533
562
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table3_16x8b);
534
562
                band_table3_16x8b = _mm_or_si128(band_table3_16x8b, temp3_8x16b);
535
562
                break;
536
456
            case 29:
537
456
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table2_16x8b);
538
456
                band_table2_16x8b = _mm_or_si128(band_table2_16x8b, temp3_8x16b);
539
456
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table3_16x8b);
540
456
                band_table3_16x8b = _mm_and_si128(band_table3_16x8b, temp3_8x16b);
541
456
                break;
542
490
            case 30:
543
490
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table1_16x8b);
544
490
                band_table1_16x8b = _mm_or_si128(band_table1_16x8b, temp3_8x16b);
545
490
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table2_16x8b);
546
490
                band_table2_16x8b = _mm_and_si128(band_table2_16x8b, temp3_8x16b);
547
490
                break;
548
840
            case 31:
549
840
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table0_16x8b);
550
840
                band_table0_16x8b = _mm_or_si128(band_table0_16x8b, temp3_8x16b);
551
840
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table1_16x8b);
552
840
                band_table1_16x8b = _mm_and_si128(band_table1_16x8b, temp3_8x16b);
553
840
                break;
554
19.1k
            default:
555
19.1k
                break;
556
23.4k
        }
557
        //masking upper 8bit values of each  16 bit band table value
558
23.4k
        band_table0_16x8b = _mm_and_si128(band_table0_16x8b, temp1_8x16b);
559
23.4k
        band_table1_16x8b = _mm_and_si128(band_table1_16x8b, temp1_8x16b);
560
23.4k
        band_table2_16x8b = _mm_and_si128(band_table2_16x8b, temp1_8x16b);
561
23.4k
        band_table3_16x8b = _mm_and_si128(band_table3_16x8b, temp1_8x16b);
562
        //band table 8x16 four registers are packed into 16x8 two registers:  band_table0_8x16b and band_table2_8x16b
563
23.4k
        band_table0_16x8b = _mm_packus_epi16(band_table0_16x8b, band_table1_16x8b);
564
23.4k
        band_table2_16x8b = _mm_packus_epi16(band_table2_16x8b, band_table3_16x8b);
565
        // Band table for U component over
566
567
        // Band table for V component : band_table1_16x8b and band_table3_16x8b
568
        // replicating sao_band_pos as 8 bit value 16 times
569
23.4k
        band_pos_v_16x8b = _mm_set1_epi16((WORD16)(sao_band_pos_v << 3));
570
571
        //loaded sao offset values
572
23.4k
        sao_offset      = _mm_loadl_epi64((__m128i *)pi1_sao_offset_v);
573
574
        //loading 16bit 32values of gu2_table_band_idx consecutively in 4 registers
575
23.4k
        temp0_8x16b = _mm_load_si128((__m128i *)(gu2_table_band_idx));
576
23.4k
        band_table1_16x8b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 8));
577
23.4k
        temp2_8x16b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 16));
578
23.4k
        band_table3_16x8b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 24));
579
580
        //band_position addition
581
23.4k
        temp0_8x16b = _mm_add_epi16(temp0_8x16b, band_pos_v_16x8b);
582
23.4k
        band_table1_16x8b = _mm_add_epi16(band_table1_16x8b, band_pos_v_16x8b);
583
23.4k
        temp2_8x16b = _mm_add_epi16(temp2_8x16b, band_pos_v_16x8b);
584
23.4k
        band_table3_16x8b = _mm_add_epi16(band_table3_16x8b, band_pos_v_16x8b);
585
        //sao_offset duplication
586
23.4k
        tmp_set_128i_1  = _mm_shuffle_epi8(sao_offset, tmp_set_128i_1);
587
23.4k
        tmp_set_128i_2  = _mm_shuffle_epi8(sao_offset, tmp_set_128i_2);
588
23.4k
        tmp_set_128i_3  = _mm_shuffle_epi8(sao_offset, tmp_set_128i_3);
589
23.4k
        tmp_set_128i_4  = _mm_shuffle_epi8(sao_offset, tmp_set_128i_4);
590
591
        //sao_offset addition
592
23.4k
        temp0_8x16b = _mm_add_epi16(temp0_8x16b, tmp_set_128i_1);
593
23.4k
        band_table1_16x8b = _mm_add_epi16(band_table1_16x8b, tmp_set_128i_2);
594
23.4k
        temp2_8x16b = _mm_add_epi16(temp2_8x16b, tmp_set_128i_3);
595
23.4k
        band_table3_16x8b = _mm_add_epi16(band_table3_16x8b, tmp_set_128i_4);
596
597
        //masking upper 8bit values of 16 bit band table value
598
23.4k
        temp0_8x16b = _mm_and_si128(temp0_8x16b, temp1_8x16b);
599
23.4k
        band_table1_16x8b = _mm_and_si128(band_table1_16x8b, temp1_8x16b);
600
23.4k
        temp2_8x16b = _mm_and_si128(temp2_8x16b, temp1_8x16b);
601
23.4k
        band_table3_16x8b = _mm_and_si128(band_table3_16x8b, temp1_8x16b);
602
        //temp1_8x16b reuse for compare storage
603
604
23.4k
        switch(sao_band_pos_v)
605
23.4k
        {
606
2.52k
            case 0:
607
2.52k
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, temp0_8x16b);
608
2.52k
                temp0_8x16b = _mm_and_si128(temp0_8x16b, temp3_8x16b);
609
2.52k
                break;
610
739
            case 28:
611
739
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table3_16x8b);
612
739
                band_table3_16x8b = _mm_or_si128(band_table3_16x8b, temp3_8x16b);
613
739
                break;
614
606
            case 29:
615
606
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, temp2_8x16b);
616
606
                temp2_8x16b = _mm_or_si128(temp2_8x16b, temp3_8x16b);
617
606
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table3_16x8b);
618
606
                band_table3_16x8b = _mm_and_si128(band_table3_16x8b, temp3_8x16b);
619
606
                break;
620
430
            case 30:
621
430
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table1_16x8b);
622
430
                band_table1_16x8b = _mm_or_si128(band_table1_16x8b, temp3_8x16b);
623
430
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, temp2_8x16b);
624
430
                temp2_8x16b = _mm_and_si128(temp2_8x16b, temp3_8x16b);
625
430
                break;
626
594
            case 31:
627
594
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, temp0_8x16b);
628
594
                temp0_8x16b = _mm_or_si128(temp0_8x16b, temp3_8x16b);
629
594
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table1_16x8b);
630
594
                band_table1_16x8b = _mm_and_si128(band_table1_16x8b, temp3_8x16b);
631
594
                break;
632
18.5k
            default:
633
18.5k
                break;
634
23.4k
        }
635
        //masking upper 8bit values of each  16 bit band table value
636
23.4k
        temp0_8x16b = _mm_and_si128(temp0_8x16b, temp1_8x16b);
637
23.4k
        band_table1_16x8b = _mm_and_si128(band_table1_16x8b, temp1_8x16b);
638
23.4k
        temp2_8x16b = _mm_and_si128(temp2_8x16b, temp1_8x16b);
639
23.4k
        band_table3_16x8b = _mm_and_si128(band_table3_16x8b, temp1_8x16b);
640
        //band table 8x16 four registers are packed into 16x8 two registers:  band_table0_8x16b and band_table2_8x16b
641
23.4k
        band_table1_16x8b = _mm_packus_epi16(temp0_8x16b, band_table1_16x8b);
642
23.4k
        band_table3_16x8b = _mm_packus_epi16(temp2_8x16b, band_table3_16x8b);
643
        //band table for u and v created
644
23.4k
    }
645
0
    {
646
23.4k
        UWORD8 *pu1_src_cpy;
647
23.4k
        WORD32 wd_rem;
648
649
650
        //sao_offset is reused for zero cmp mask.
651
23.4k
        sao_offset = _mm_setzero_si128();
652
23.4k
        tmp_set_128i_1 = _mm_set1_epi8(1);
653
        //tmp_set_128i_2 = _mm_set_epi8 (128,7,128,6,128,5,128,4,128,3,128,2,128,1,128,0);
654
23.4k
        cmp_mask = _mm_packus_epi16(cmp_mask, cmp_mask); //cmp_msk=dup16(16);
655
        //to avoid ffff to be saturated to 0 instead it should be to ff
656
657
23.4k
        cmp_msk2 = _mm_slli_epi16(cmp_mask, 1); // to compare if value is greater than 31
658
23.4k
        band_pos_u_16x8b = _mm_packus_epi16(band_pos_u_16x8b, band_pos_u_16x8b); //band_pos_u is now 8 bit aligned
659
23.4k
        band_pos_v_16x8b = _mm_packus_epi16(band_pos_v_16x8b, band_pos_v_16x8b); //band_pos_v is now 8 bit aligned
660
23.4k
        cmp_msk2 = _mm_sub_epi8(cmp_msk2, tmp_set_128i_1); // to compare if value is greater than 31
661
662
23.4k
        cmp_mask = _mm_sub_epi8(cmp_mask, tmp_set_128i_1);
663
664
66.6k
        for(col = wd; col >= 16; col -= 16)
665
43.2k
        {
666
43.2k
            pu1_src_cpy = pu1_src;
667
378k
            for(row = ht; row > 0; row -= 2)
668
335k
            {
669
                //row = 0 load 8 pixel values from 7:0 pos. relative to cur. pos.
670
335k
                src_temp0_8x16b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
671
                // row = 1
672
335k
                src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
673
674
675
                //odd values
676
335k
                src_temp1_8x16b = _mm_srli_epi16(src_temp0_8x16b, 8);
677
335k
                src_temp2_8x16b = _mm_srli_epi16(src_temp3_8x16b, 8);
678
                //even values
679
335k
                src_temp0_8x16b = _mm_slli_epi16(src_temp0_8x16b, 8);
680
335k
                src_temp3_8x16b = _mm_slli_epi16(src_temp3_8x16b, 8);
681
335k
                src_temp0_8x16b = _mm_srli_epi16(src_temp0_8x16b, 8);
682
335k
                src_temp3_8x16b = _mm_srli_epi16(src_temp3_8x16b, 8);
683
                //combining odd values
684
335k
                src_temp2_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp2_8x16b);
685
                //combining even values
686
335k
                src_temp0_8x16b = _mm_packus_epi16(src_temp0_8x16b, src_temp3_8x16b);
687
688
                //saturated substract 8 bit
689
335k
                tmp_set_128i_1 = _mm_sub_epi8(src_temp0_8x16b, band_pos_u_16x8b);
690
335k
                tmp_set_128i_3 = _mm_sub_epi8(src_temp2_8x16b, band_pos_v_16x8b);
691
                //if the values less than 0 put ff
692
335k
                tmp_set_128i_2 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_1);
693
335k
                tmp_set_128i_4 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_3);
694
335k
                tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
695
335k
                tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
696
                //if the values greater than 31 put ff
697
335k
                tmp_set_128i_2 = _mm_cmpgt_epi8(tmp_set_128i_1, cmp_msk2);
698
335k
                tmp_set_128i_4 = _mm_cmpgt_epi8(tmp_set_128i_3, cmp_msk2);
699
335k
                tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
700
335k
                tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
701
                // registers reused to increase performance
702
                //if the values >16 then put ff ,cmp_mask = dup16(15) row 0 and row1
703
335k
                src_temp1_8x16b = _mm_cmpgt_epi8(tmp_set_128i_1, cmp_mask);
704
                //if the values >16 then put ff ,cmp_mask = dup16(15) row 2 and  row 3
705
335k
                src_temp3_8x16b = _mm_cmpgt_epi8(tmp_set_128i_3, cmp_mask);
706
707
                //values 16 to 31 for row 0 & 1 but values <16 ==0
708
335k
                tmp_set_128i_2 = _mm_and_si128(tmp_set_128i_1, src_temp1_8x16b);
709
                // values 0 to 15 for row 0 & 1
710
335k
                tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, src_temp1_8x16b);
711
                //values 16 to 31 for row 2 & 3 but values <16 ==0
712
335k
                tmp_set_128i_4 = _mm_and_si128(tmp_set_128i_3, src_temp3_8x16b);
713
                // values 0 to 15 for row 2 & 3
714
335k
                tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, src_temp3_8x16b);
715
716
                //values 16 to 31 for row 0 & 1 but values <16 masked to ff row 0 and row1
717
335k
                src_temp1_8x16b = _mm_cmpeq_epi8(tmp_set_128i_2, sao_offset);
718
                //values 16 to 31 for row 0 & 1 but values <16 masked to ff row 2 and  row 3
719
335k
                src_temp3_8x16b = _mm_cmpeq_epi8(tmp_set_128i_4, sao_offset);
720
335k
                tmp_set_128i_2 = _mm_or_si128(tmp_set_128i_2, src_temp1_8x16b);
721
335k
                tmp_set_128i_4 = _mm_or_si128(tmp_set_128i_4, src_temp3_8x16b);
722
723
724
                //to choose which pixel values to preserve in row 0 and row 1
725
335k
                src_temp1_8x16b = _mm_cmpeq_epi8(tmp_set_128i_1, tmp_set_128i_2);
726
                //to choose which pixel values to preserve in row 2 and row 3
727
335k
                src_temp3_8x16b = _mm_cmpeq_epi8(tmp_set_128i_3, tmp_set_128i_4);
728
                //values of all rows to which no offset needs to be added preserved.
729
335k
                src_temp0_8x16b = _mm_and_si128(src_temp0_8x16b, src_temp1_8x16b);
730
335k
                src_temp2_8x16b = _mm_and_si128(src_temp2_8x16b, src_temp3_8x16b);
731
732
                //indexing 0 - 15 bandtable indexes
733
335k
                tmp_set_128i_1 = _mm_shuffle_epi8(band_table0_16x8b, tmp_set_128i_1); //U low
734
335k
                tmp_set_128i_3 = _mm_shuffle_epi8(band_table1_16x8b, tmp_set_128i_3); //V low
735
                //indexing 16 -31 bandtable indexes
736
335k
                tmp_set_128i_2 = _mm_shuffle_epi8(band_table2_16x8b, tmp_set_128i_2); //U high
737
335k
                tmp_set_128i_4 = _mm_shuffle_epi8(band_table3_16x8b, tmp_set_128i_4); //V high
738
                // combining all offsets results
739
335k
                tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2); //U
740
335k
                tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4); //V
741
                // combing results with the pixel values
742
335k
                src_temp0_8x16b = _mm_or_si128(src_temp0_8x16b, tmp_set_128i_1);
743
335k
                src_temp2_8x16b = _mm_or_si128(src_temp2_8x16b, tmp_set_128i_3);
744
                //reorganising even and odd values
745
335k
                src_temp1_8x16b = _mm_unpacklo_epi8(src_temp0_8x16b, src_temp2_8x16b);
746
335k
                src_temp3_8x16b = _mm_unpackhi_epi8(src_temp0_8x16b, src_temp2_8x16b);
747
748
749
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
750
335k
                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp1_8x16b);
751
                // row = 1
752
335k
                _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp3_8x16b);
753
754
755
335k
                pu1_src_cpy += (src_strd << 1);
756
757
335k
            }
758
43.2k
            pu1_src += 16;
759
43.2k
        }
760
761
23.4k
        wd_rem = wd & 0xF;
762
23.4k
        if(wd_rem)
763
911
        {
764
911
            pu1_src_cpy = pu1_src;
765
2.72k
            for(row = ht; row > 0; row -= 4)
766
1.81k
            {
767
                //row = 0 load 8 pixel values from 7:0 pos. relative to cur. pos.
768
1.81k
                src_temp0_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
769
                // row = 1
770
1.81k
                src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
771
                // row = 2
772
1.81k
                src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));
773
                // row = 3
774
1.81k
                src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd));
775
                //row0 and row1 packed and row2 and row3 packed
776
777
1.81k
                src_temp0_8x16b = _mm_unpacklo_epi64(src_temp0_8x16b, src_temp1_8x16b);
778
1.81k
                src_temp3_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp3_8x16b);
779
                //odd values
780
1.81k
                src_temp1_8x16b = _mm_srli_epi16(src_temp0_8x16b, 8);
781
1.81k
                src_temp2_8x16b = _mm_srli_epi16(src_temp3_8x16b, 8);
782
                //even values
783
1.81k
                src_temp0_8x16b = _mm_slli_epi16(src_temp0_8x16b, 8);
784
1.81k
                src_temp3_8x16b = _mm_slli_epi16(src_temp3_8x16b, 8);
785
1.81k
                src_temp0_8x16b = _mm_srli_epi16(src_temp0_8x16b, 8);
786
1.81k
                src_temp3_8x16b = _mm_srli_epi16(src_temp3_8x16b, 8);
787
                //combining odd values
788
1.81k
                src_temp2_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp2_8x16b);
789
                //combining even values
790
1.81k
                src_temp0_8x16b = _mm_packus_epi16(src_temp0_8x16b, src_temp3_8x16b);
791
792
                //saturated substract 8 bit
793
1.81k
                tmp_set_128i_1 = _mm_sub_epi8(src_temp0_8x16b, band_pos_u_16x8b);
794
1.81k
                tmp_set_128i_3 = _mm_sub_epi8(src_temp2_8x16b, band_pos_v_16x8b);
795
                //if the values less than 0 put ff
796
1.81k
                tmp_set_128i_2 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_1);
797
1.81k
                tmp_set_128i_4 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_3);
798
1.81k
                tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
799
1.81k
                tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
800
                //if the values greater than 31 put ff
801
1.81k
                tmp_set_128i_2 = _mm_cmpgt_epi8(tmp_set_128i_1, cmp_msk2);
802
1.81k
                tmp_set_128i_4 = _mm_cmpgt_epi8(tmp_set_128i_3, cmp_msk2);
803
1.81k
                tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
804
1.81k
                tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
805
                // registers reused to increase performance
806
                //if the values >16 then put ff ,cmp_mask = dup16(15) row 0 and row1
807
1.81k
                src_temp1_8x16b = _mm_cmpgt_epi8(tmp_set_128i_1, cmp_mask);
808
                //if the values >16 then put ff ,cmp_mask = dup16(15) row 2 and  row 3
809
1.81k
                src_temp3_8x16b = _mm_cmpgt_epi8(tmp_set_128i_3, cmp_mask);
810
811
                //values 16 to 31 for row 0 & 1 but values <16 ==0
812
1.81k
                tmp_set_128i_2 = _mm_and_si128(tmp_set_128i_1, src_temp1_8x16b);
813
                // values 0 to 15 for row 0 & 1
814
1.81k
                tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, src_temp1_8x16b);
815
                //values 16 to 31 for row 2 & 3 but values <16 ==0
816
1.81k
                tmp_set_128i_4 = _mm_and_si128(tmp_set_128i_3, src_temp3_8x16b);
817
                // values 0 to 15 for row 2 & 3
818
1.81k
                tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, src_temp3_8x16b);
819
820
                //values 16 to 31 for row 0 & 1 but values <16 masked to ff row 0 and row1
821
1.81k
                src_temp1_8x16b = _mm_cmpeq_epi8(tmp_set_128i_2, sao_offset);
822
                //values 16 to 31 for row 0 & 1 but values <16 masked to ff row 2 and  row 3
823
1.81k
                src_temp3_8x16b = _mm_cmpeq_epi8(tmp_set_128i_4, sao_offset);
824
1.81k
                tmp_set_128i_2 = _mm_or_si128(tmp_set_128i_2, src_temp1_8x16b);
825
1.81k
                tmp_set_128i_4 = _mm_or_si128(tmp_set_128i_4, src_temp3_8x16b);
826
827
828
                //to choose which pixel values to preserve in row 0 and row 1
829
1.81k
                src_temp1_8x16b = _mm_cmpeq_epi8(tmp_set_128i_1, tmp_set_128i_2);
830
                //to choose which pixel values to preserve in row 2 and row 3
831
1.81k
                src_temp3_8x16b = _mm_cmpeq_epi8(tmp_set_128i_3, tmp_set_128i_4);
832
                //values of all rows to which no offset needs to be added preserved.
833
1.81k
                src_temp0_8x16b = _mm_and_si128(src_temp0_8x16b, src_temp1_8x16b);
834
1.81k
                src_temp2_8x16b = _mm_and_si128(src_temp2_8x16b, src_temp3_8x16b);
835
836
                //indexing 0 - 15 bandtable indexes
837
1.81k
                tmp_set_128i_1 = _mm_shuffle_epi8(band_table0_16x8b, tmp_set_128i_1); //U low
838
1.81k
                tmp_set_128i_3 = _mm_shuffle_epi8(band_table1_16x8b, tmp_set_128i_3); //V low
839
                //indexing 16 -31 bandtable indexes
840
1.81k
                tmp_set_128i_2 = _mm_shuffle_epi8(band_table2_16x8b, tmp_set_128i_2); //U high
841
1.81k
                tmp_set_128i_4 = _mm_shuffle_epi8(band_table3_16x8b, tmp_set_128i_4); //V high
842
                // combining all offsets results
843
1.81k
                tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2); //U
844
1.81k
                tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4); //V
845
                // combing results with the pixel values
846
1.81k
                src_temp0_8x16b = _mm_or_si128(src_temp0_8x16b, tmp_set_128i_1);
847
1.81k
                src_temp2_8x16b = _mm_or_si128(src_temp2_8x16b, tmp_set_128i_3);
848
                //reorganising even and odd values
849
1.81k
                src_temp1_8x16b = _mm_unpacklo_epi8(src_temp0_8x16b, src_temp2_8x16b);
850
1.81k
                src_temp3_8x16b = _mm_unpackhi_epi8(src_temp0_8x16b, src_temp2_8x16b);
851
                //Getting row1 separately
852
1.81k
                src_temp0_8x16b = _mm_srli_si128(src_temp1_8x16b, 8);
853
                //Getting row3 separately
854
1.81k
                src_temp2_8x16b = _mm_srli_si128(src_temp3_8x16b, 8);
855
856
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
857
1.81k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp1_8x16b);
858
                // row = 1
859
1.81k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), src_temp0_8x16b);
860
                // row = 2
861
1.81k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_temp3_8x16b);
862
                // row = 3
863
1.81k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), src_temp2_8x16b);
864
865
1.81k
                pu1_src_cpy += (src_strd << 2);
866
867
1.81k
            }
868
911
            pu1_src += 16;
869
911
        }
870
871
872
23.4k
    }
873
23.4k
}
874
875
876
877
void ihevc_sao_edge_offset_class0_ssse3(UWORD8 *pu1_src,
878
                                        WORD32 src_strd,
879
                                        UWORD8 *pu1_src_left,
880
                                        UWORD8 *pu1_src_top,
881
                                        UWORD8 *pu1_src_top_left,
882
                                        UWORD8 *pu1_src_top_right,
883
                                        UWORD8 *pu1_src_bot_left,
884
                                        UWORD8 *pu1_avail,
885
                                        WORD8 *pi1_sao_offset,
886
                                        WORD32 wd,
887
                                        WORD32 ht)
888
16.8k
{
889
16.8k
    WORD32 row, col;
890
16.8k
    UWORD8 *pu1_src_cpy, *pu1_src_left_cpy, *pu1_src_left_str, *pu1_left_tmp;
891
16.8k
    UWORD8 au1_mask[MAX_CTB_SIZE], *au1_mask_cpy;
892
16.8k
    UWORD8 au1_src_left_tmp[MAX_CTB_SIZE + 8];
893
16.8k
    UWORD8 au1_src_left_tmp1[MAX_CTB_SIZE + 8];
894
16.8k
    UWORD8 u1_avail0, u1_avail1;
895
16.8k
    WORD32 wd_rem;
896
16.8k
    WORD32 offset = 0;
897
16.8k
    __m128i src_temp0_16x8b, src_temp1_16x8b;
898
16.8k
    __m128i left0_16x8b, left1_16x8b;
899
16.8k
    __m128i cmp_gt0_16x8b, cmp_lt0_16x8b, cmp_gt1_16x8b, cmp_lt1_16x8b;
900
16.8k
    __m128i edge0_16x8b, edge1_16x8b;
901
16.8k
    __m128i au1_mask8x16b;
902
16.8k
    __m128i edge_idx_8x16b, sao_offset_8x16b;
903
16.8k
    __m128i const2_16x8b, const0_16x8b;
904
16.8k
    __m128i left_store_16x8b;
905
16.8k
    UNUSED(pu1_src_top_right);
906
16.8k
    UNUSED(pu1_src_bot_left);
907
908
16.8k
    au1_mask8x16b = _mm_set1_epi8(0xff);
909
910
    /* Update  top and top-left arrays */
911
912
16.8k
    *pu1_src_top_left = pu1_src_top[wd - 1];
913
914
40.3k
    for(col = wd; col >= 16; col -= 16)
915
23.5k
    {
916
23.5k
        const0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + offset + (ht - 1) * src_strd));
917
23.5k
        _mm_storeu_si128((__m128i *)(pu1_src_top + offset), const0_16x8b);
918
23.5k
        offset += 16;
919
23.5k
    }
920
921
    //setting availability mask to ff size MAX_CTB_SIZE
922
84.2k
    for(col = 0; col < MAX_CTB_SIZE; col += 16)
923
67.4k
        _mm_storeu_si128((__m128i *)(au1_mask + col), au1_mask8x16b);
924
521k
    for(row = 0; row < ht; row++)
925
504k
    {
926
504k
        au1_src_left_tmp[row] = pu1_src_left[row];
927
504k
    }
928
16.8k
    edge_idx_8x16b   = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
929
16.8k
    sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset);
930
931
    //availability mask creation
932
16.8k
    u1_avail0 = pu1_avail[0];
933
16.8k
    u1_avail1 = pu1_avail[1];
934
16.8k
    au1_mask[0] = u1_avail0;
935
16.8k
    au1_mask[wd - 1] = u1_avail1;
936
937
16.8k
    const2_16x8b = _mm_set1_epi8(2);
938
16.8k
    const0_16x8b = _mm_setzero_si128();
939
16.8k
    pu1_src_left_cpy = au1_src_left_tmp;
940
16.8k
    pu1_src_left_str = au1_src_left_tmp1;
941
16.8k
    {
942
16.8k
        au1_mask_cpy = au1_mask;
943
40.3k
        for(col = wd; col >= 16; col -= 16)
944
23.5k
        {
945
23.5k
            pu1_src_cpy = pu1_src;
946
23.5k
            au1_mask8x16b = _mm_loadu_si128((__m128i *)au1_mask_cpy);
947
            //pu1_src_left_cpy =au1_src_left_tmp;
948
394k
            for(row = ht; row > 0; row -= 2)
949
371k
            {
950
951
371k
                left_store_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_left_cpy));
952
                //row = 0 load 8 pixel values from 7:0 pos. relative to cur. pos.
953
371k
                src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
954
                // row = 1
955
371k
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
956
957
371k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, left_store_16x8b, 2);
958
                //row 1 left
959
371k
                left1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, left_store_16x8b, 15);
960
371k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp1_16x8b, 15);
961
                //row 0 left
962
371k
                left0_16x8b = _mm_alignr_epi8(src_temp0_16x8b, left_store_16x8b, 15);
963
371k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
964
965
966
                //separating +ve and and -ve values.
967
371k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, left0_16x8b);
968
371k
                cmp_lt0_16x8b = _mm_subs_epu8(left0_16x8b, src_temp0_16x8b);
969
371k
                cmp_gt1_16x8b = _mm_subs_epu8(src_temp1_16x8b, left1_16x8b);
970
371k
                cmp_lt1_16x8b = _mm_subs_epu8(left1_16x8b, src_temp1_16x8b);
971
                //creating mask 00 for +ve and -ve values and FF for zero.
972
371k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
973
371k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
974
371k
                cmp_gt1_16x8b = _mm_cmpeq_epi8(cmp_gt1_16x8b, const0_16x8b);
975
371k
                cmp_lt1_16x8b = _mm_cmpeq_epi8(cmp_lt1_16x8b, const0_16x8b);
976
                //combining the appropriate sign change
977
371k
                left0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
978
371k
                left1_16x8b = _mm_sub_epi8(cmp_gt1_16x8b, cmp_lt1_16x8b);
979
980
                //row = 0 right
981
371k
                edge0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 1));
982
                // row = 1 right
983
371k
                edge1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 1));
984
                //separating +ve and and -ve values.
985
371k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, edge0_16x8b);
986
371k
                cmp_lt0_16x8b = _mm_subs_epu8(edge0_16x8b, src_temp0_16x8b);
987
371k
                cmp_gt1_16x8b = _mm_subs_epu8(src_temp1_16x8b, edge1_16x8b);
988
371k
                cmp_lt1_16x8b = _mm_subs_epu8(edge1_16x8b, src_temp1_16x8b);
989
                //creating mask 00 for +ve and -ve values and FF for zero.
990
371k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
991
371k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
992
371k
                cmp_gt1_16x8b = _mm_cmpeq_epi8(cmp_gt1_16x8b, const0_16x8b);
993
371k
                cmp_lt1_16x8b = _mm_cmpeq_epi8(cmp_lt1_16x8b, const0_16x8b);
994
                //combining the appropriate sign change
995
371k
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
996
371k
                edge1_16x8b = _mm_sub_epi8(cmp_gt1_16x8b, cmp_lt1_16x8b);
997
998
                //combining sign-left and sign_right
999
371k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, left0_16x8b);
1000
371k
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, left1_16x8b);
1001
                //adding constant 2
1002
371k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
1003
371k
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
1004
                //shuffle to get sao index
1005
371k
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
1006
371k
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
1007
                //using availability mask
1008
371k
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
1009
371k
                edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
1010
1011
                //shuffle to get sao offset
1012
371k
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
1013
371k
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
1014
                //cnvert to 16 bit then add and then saturated pack
1015
371k
                left0_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
1016
371k
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
1017
371k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, left0_16x8b);
1018
371k
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
1019
371k
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
1020
371k
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, left0_16x8b);
1021
371k
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
1022
371k
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
1023
1024
371k
                left0_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
1025
371k
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
1026
371k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, left0_16x8b);
1027
371k
                src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
1028
371k
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
1029
371k
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, left0_16x8b);
1030
371k
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
1031
371k
                src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
1032
1033
1034
371k
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
1035
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
1036
371k
                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
1037
                // row = 1
1038
371k
                _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);
1039
1040
371k
                pu1_src_cpy += (src_strd << 1);
1041
371k
                pu1_src_left_cpy += 2;
1042
371k
                pu1_src_left_str += 2;
1043
371k
            }
1044
23.5k
            au1_mask_cpy += 16;
1045
23.5k
            pu1_src += 16;
1046
23.5k
            pu1_src_left_cpy -= ht;
1047
23.5k
            pu1_src_left_str -= ht;
1048
1049
23.5k
            pu1_left_tmp = pu1_src_left_cpy;
1050
23.5k
            pu1_src_left_cpy = pu1_src_left_str;
1051
23.5k
            pu1_src_left_str = pu1_left_tmp;
1052
23.5k
        }
1053
1054
16.8k
        wd_rem = wd & 0xF;
1055
16.8k
        if(wd_rem)
1056
16.5k
        {
1057
1058
16.5k
            cmp_gt1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + (ht - 1) * src_strd));
1059
16.5k
            _mm_storel_epi64((__m128i *)(pu1_src_top + offset), cmp_gt1_16x8b);
1060
1061
16.5k
            au1_mask8x16b = _mm_loadl_epi64((__m128i *)au1_mask_cpy);
1062
16.5k
            pu1_src_cpy = pu1_src;
1063
16.5k
            au1_mask8x16b = _mm_unpacklo_epi64(au1_mask8x16b, au1_mask8x16b);
1064
            //pu1_src_left_cpy =au1_src_left_tmp;
1065
139k
            for(row = ht; row > 0; row -= 4)
1066
123k
            {
1067
123k
                left_store_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_left_cpy));
1068
                //row = 0 load 8 pixel values from 7:0 pos. relative to cur. pos.
1069
123k
                src_temp0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
1070
                // row = 1
1071
123k
                cmp_gt0_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
1072
                // row  = 2
1073
123k
                src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));
1074
                // row = 3
1075
123k
                cmp_gt1_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd));
1076
1077
1078
123k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, left_store_16x8b, 4);
1079
                //row 3 left
1080
123k
                edge0_16x8b = _mm_slli_si128(cmp_gt1_16x8b, 8);
1081
123k
                cmp_lt1_16x8b = _mm_alignr_epi8(cmp_gt1_16x8b, left_store_16x8b, 15);
1082
123k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 15);
1083
                //row 2 left
1084
123k
                edge0_16x8b = _mm_slli_si128(src_temp1_16x8b, 8);
1085
123k
                left1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, left_store_16x8b, 15);
1086
123k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 15);
1087
                //row 1 left
1088
123k
                edge0_16x8b = _mm_slli_si128(cmp_gt0_16x8b, 8);
1089
123k
                cmp_lt0_16x8b = _mm_alignr_epi8(cmp_gt0_16x8b, left_store_16x8b, 15);
1090
123k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 15);
1091
                //row 0 left
1092
123k
                edge0_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
1093
123k
                left0_16x8b = _mm_alignr_epi8(src_temp0_16x8b, left_store_16x8b, 15);
1094
123k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 15);
1095
1096
                // packing rows together for 16 SIMD operations
1097
123k
                src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, cmp_gt0_16x8b);
1098
123k
                src_temp1_16x8b = _mm_unpacklo_epi64(src_temp1_16x8b, cmp_gt1_16x8b);
1099
                // packing rows together for 16 SIMD operations
1100
123k
                left0_16x8b = _mm_unpacklo_epi64(left0_16x8b, cmp_lt0_16x8b);
1101
123k
                left1_16x8b = _mm_unpacklo_epi64(left1_16x8b, cmp_lt1_16x8b);
1102
1103
                //separating +ve and and -ve values.
1104
123k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, left0_16x8b);
1105
123k
                cmp_lt0_16x8b = _mm_subs_epu8(left0_16x8b, src_temp0_16x8b);
1106
123k
                cmp_gt1_16x8b = _mm_subs_epu8(src_temp1_16x8b, left1_16x8b);
1107
123k
                cmp_lt1_16x8b = _mm_subs_epu8(left1_16x8b, src_temp1_16x8b);
1108
                //creating mask 00 for +ve and -ve values and FF for zero.
1109
123k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
1110
123k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
1111
123k
                cmp_gt1_16x8b = _mm_cmpeq_epi8(cmp_gt1_16x8b, const0_16x8b);
1112
123k
                cmp_lt1_16x8b = _mm_cmpeq_epi8(cmp_lt1_16x8b, const0_16x8b);
1113
                //combining the appropriate sign change
1114
123k
                left0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
1115
123k
                left1_16x8b = _mm_sub_epi8(cmp_gt1_16x8b, cmp_lt1_16x8b);
1116
1117
                //row = 0 right
1118
123k
                edge0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 1));
1119
                // row = 1 right
1120
123k
                cmp_gt0_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd + 1));
1121
                // row = 2 right
1122
123k
                edge1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd + 1));
1123
                // row = 3 right
1124
123k
                cmp_gt1_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd + 1));
1125
                // packing rows together for 16 SIMD operations
1126
123k
                edge0_16x8b = _mm_unpacklo_epi64(edge0_16x8b, cmp_gt0_16x8b);
1127
123k
                edge1_16x8b = _mm_unpacklo_epi64(edge1_16x8b, cmp_gt1_16x8b);
1128
1129
                //separating +ve and and -ve values.
1130
123k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, edge0_16x8b);
1131
123k
                cmp_lt0_16x8b = _mm_subs_epu8(edge0_16x8b, src_temp0_16x8b);
1132
123k
                cmp_gt1_16x8b = _mm_subs_epu8(src_temp1_16x8b, edge1_16x8b);
1133
123k
                cmp_lt1_16x8b = _mm_subs_epu8(edge1_16x8b, src_temp1_16x8b);
1134
                //creating mask 00 for +ve and -ve values and FF for zero.
1135
123k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
1136
123k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
1137
123k
                cmp_gt1_16x8b = _mm_cmpeq_epi8(cmp_gt1_16x8b, const0_16x8b);
1138
123k
                cmp_lt1_16x8b = _mm_cmpeq_epi8(cmp_lt1_16x8b, const0_16x8b);
1139
                //combining the appropriate sign change
1140
123k
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
1141
123k
                edge1_16x8b = _mm_sub_epi8(cmp_gt1_16x8b, cmp_lt1_16x8b);
1142
1143
                //combining sign-left and sign_right
1144
123k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, left0_16x8b);
1145
123k
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, left1_16x8b);
1146
                //adding constant 2
1147
123k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
1148
123k
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
1149
                //shuffle to get sao index
1150
123k
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
1151
123k
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
1152
                //shuffle to get sao offset
1153
                //using availability mask
1154
123k
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
1155
123k
                edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
1156
1157
123k
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
1158
123k
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
1159
                //cnvert to 16 bit then add and then saturated pack
1160
123k
                left0_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
1161
123k
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
1162
123k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, left0_16x8b);
1163
123k
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
1164
123k
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
1165
123k
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, left0_16x8b);
1166
123k
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
1167
123k
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
1168
1169
123k
                left0_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
1170
123k
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
1171
123k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, left0_16x8b);
1172
123k
                src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
1173
123k
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
1174
123k
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, left0_16x8b);
1175
123k
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
1176
123k
                src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
1177
                //separting row 1 and row 3
1178
123k
                cmp_lt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
1179
123k
                cmp_lt1_16x8b = _mm_srli_si128(src_temp1_16x8b, 8);
1180
1181
123k
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
1182
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
1183
123k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
1184
                // row = 1
1185
123k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_lt0_16x8b);
1186
                // row = 2
1187
123k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_temp1_16x8b);
1188
                // row = 3
1189
123k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt1_16x8b);
1190
1191
123k
                pu1_src_cpy += (src_strd << 2);
1192
123k
                pu1_src_left_cpy += 4;
1193
123k
                pu1_src_left_str += 4;
1194
123k
            }
1195
16.5k
            pu1_src += wd;
1196
16.5k
            pu1_src_left_cpy -= ht;
1197
16.5k
            pu1_src_left_str -= ht;
1198
1199
16.5k
            pu1_left_tmp = pu1_src_left_cpy;
1200
16.5k
            pu1_src_left_cpy = pu1_src_left_str;
1201
16.5k
            pu1_src_left_str = pu1_left_tmp;
1202
16.5k
        }
1203
521k
        for(row = 0; row < ht; row++)
1204
504k
        {
1205
504k
            pu1_src_left[row] = pu1_src_left_cpy[row];
1206
504k
        }
1207
16.8k
    }
1208
16.8k
}
1209
1210
1211
void ihevc_sao_edge_offset_class0_chroma_ssse3(UWORD8 *pu1_src,
1212
                                               WORD32 src_strd,
1213
                                               UWORD8 *pu1_src_left,
1214
                                               UWORD8 *pu1_src_top,
1215
                                               UWORD8 *pu1_src_top_left,
1216
                                               UWORD8 *pu1_src_top_right,
1217
                                               UWORD8 *pu1_src_bot_left,
1218
                                               UWORD8 *pu1_avail,
1219
                                               WORD8 *pi1_sao_offset_u,
1220
                                               WORD8 *pi1_sao_offset_v,
1221
                                               WORD32 wd,
1222
                                               WORD32 ht)
1223
5.47k
{
1224
5.47k
    WORD32 row, col;
1225
5.47k
    UWORD8 *pu1_src_cpy, *pu1_src_left_cpy, *pu1_src_left_str, *pu1_left_tmp;
1226
5.47k
    UWORD8 au1_mask[MAX_CTB_SIZE], *au1_mask_cpy;
1227
5.47k
    UWORD8 au1_src_left_tmp[2 * (MAX_CTB_SIZE + 8)];
1228
5.47k
    UWORD8 au1_src_left_tmp1[2 * (MAX_CTB_SIZE + 8)];
1229
5.47k
    UWORD8 u1_avail0, u1_avail1;
1230
5.47k
    WORD32 wd_rem;
1231
5.47k
    WORD32 offset = 0;
1232
1233
5.47k
    __m128i src_temp0_16x8b, src_temp1_16x8b;
1234
5.47k
    __m128i left0_16x8b, left1_16x8b;
1235
5.47k
    __m128i cmp_gt0_16x8b, cmp_lt0_16x8b;
1236
5.47k
    __m128i edge0_16x8b, edge1_16x8b;
1237
5.47k
    __m128i au1_mask8x16b;
1238
5.47k
    __m128i edge_idx_8x16b, sao_offset_8x16b;
1239
5.47k
    __m128i const2_16x8b, const0_16x8b;
1240
5.47k
    __m128i left_store_16x8b;
1241
5.47k
    __m128i chroma_offset_8x16b;
1242
5.47k
    UNUSED(pu1_src_top_right);
1243
5.47k
    UNUSED(pu1_src_bot_left);
1244
1245
5.47k
    au1_mask8x16b = _mm_set1_epi8(0xff);
1246
1247
    /* Update  top and top-left arrays */
1248
5.47k
    pu1_src_top_left[0] = pu1_src_top[wd - 2];
1249
5.47k
    pu1_src_top_left[1] = pu1_src_top[wd - 1];;
1250
1251
15.6k
    for(col = wd; col >= 16; col -= 16)
1252
10.1k
    {
1253
10.1k
        const0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + offset + (ht - 1) * src_strd));
1254
10.1k
        _mm_storeu_si128((__m128i *)(pu1_src_top + offset), const0_16x8b);
1255
10.1k
        offset += 16;
1256
10.1k
    }
1257
165k
    for(row = 0; row < 2 * ht; row++)
1258
159k
    {
1259
159k
        au1_src_left_tmp[row] = pu1_src_left[row];
1260
159k
    }
1261
    //setting availability mask to ff size MAX_CTB_SIZE
1262
27.3k
    for(col = 0; col < MAX_CTB_SIZE; col += 16)
1263
21.9k
        _mm_storeu_si128((__m128i *)(au1_mask + col), au1_mask8x16b);
1264
1265
5.47k
    edge_idx_8x16b   = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
1266
5.47k
    sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_u);
1267
5.47k
    const0_16x8b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_v);
1268
5.47k
    chroma_offset_8x16b = _mm_set1_epi16(0x0800);
1269
    //availability mask creation
1270
5.47k
    u1_avail0 = pu1_avail[0];
1271
5.47k
    u1_avail1 = pu1_avail[1];
1272
5.47k
    au1_mask[0] = u1_avail0;
1273
5.47k
    au1_mask[1] = u1_avail0;
1274
5.47k
    au1_mask[wd - 1] = u1_avail1;
1275
5.47k
    au1_mask[wd - 2] = u1_avail1;
1276
5.47k
    sao_offset_8x16b = _mm_unpacklo_epi64(sao_offset_8x16b, const0_16x8b);
1277
5.47k
    const2_16x8b = _mm_set1_epi8(2);
1278
5.47k
    const0_16x8b = _mm_setzero_si128();
1279
1280
5.47k
    {
1281
5.47k
        pu1_src_left_cpy = au1_src_left_tmp;
1282
5.47k
        pu1_src_left_str = au1_src_left_tmp1;
1283
5.47k
        au1_mask_cpy = au1_mask;
1284
15.6k
        for(col = wd; col >= 16; col -= 16)
1285
10.1k
        {
1286
10.1k
            pu1_src_cpy = pu1_src;
1287
10.1k
            au1_mask8x16b = _mm_loadu_si128((__m128i *)au1_mask_cpy);
1288
1289
89.6k
            for(row = ht; row > 0; row -= 2)
1290
79.5k
            {
1291
1292
79.5k
                left_store_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_left_cpy));
1293
                //row = 0 load 8 pixel values from 7:0 pos. relative to cur. pos.
1294
79.5k
                src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
1295
                // row = 1
1296
79.5k
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
1297
1298
79.5k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, left_store_16x8b, 4);
1299
                //row 1 left
1300
79.5k
                left1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, left_store_16x8b, 14);
1301
79.5k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp1_16x8b, 14);
1302
                //row 0 left
1303
79.5k
                left0_16x8b = _mm_alignr_epi8(src_temp0_16x8b, left_store_16x8b, 14);
1304
79.5k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
1305
1306
1307
                //separating +ve and and -ve values.row 0 left
1308
79.5k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, left0_16x8b);
1309
79.5k
                cmp_lt0_16x8b = _mm_subs_epu8(left0_16x8b, src_temp0_16x8b);
1310
                //creating mask 00 for +ve and -ve values and FF for zero.
1311
79.5k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
1312
79.5k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
1313
                //combining the appropriate sign change
1314
79.5k
                left0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
1315
1316
                //separating +ve and and -ve values.row 1 left
1317
79.5k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, left1_16x8b);
1318
79.5k
                cmp_lt0_16x8b = _mm_subs_epu8(left1_16x8b, src_temp1_16x8b);
1319
                //creating mask 00 for +ve and -ve values and FF for zero.
1320
79.5k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
1321
79.5k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
1322
                //combining the appropriate sign change
1323
79.5k
                left1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
1324
1325
1326
                //row = 0 right
1327
79.5k
                edge0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2));
1328
                // row = 1 right
1329
79.5k
                edge1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 2));
1330
                //separating +ve and and -ve values.row 0 right
1331
79.5k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, edge0_16x8b);
1332
79.5k
                cmp_lt0_16x8b = _mm_subs_epu8(edge0_16x8b, src_temp0_16x8b);
1333
                //creating mask 00 for +ve and -ve values and FF for zero.
1334
79.5k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
1335
79.5k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
1336
                //combining the appropriate sign change
1337
79.5k
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
1338
1339
                //separating +ve and and -ve values.row 1 right
1340
79.5k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, edge1_16x8b);
1341
79.5k
                cmp_lt0_16x8b = _mm_subs_epu8(edge1_16x8b, src_temp1_16x8b);
1342
                //creating mask 00 for +ve and -ve values and FF for zero.
1343
79.5k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
1344
79.5k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
1345
                //combining the appropriate sign change
1346
79.5k
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
1347
1348
                //combining sign-left and sign_right
1349
79.5k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, left0_16x8b);
1350
79.5k
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, left1_16x8b);
1351
                //adding constant 2
1352
79.5k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
1353
79.5k
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
1354
                //shuffle to get sao index
1355
79.5k
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
1356
79.5k
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
1357
                //using availability mask
1358
79.5k
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
1359
79.5k
                edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
1360
                //adding chroma offset to access U and V
1361
79.5k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
1362
79.5k
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);
1363
1364
                //shuffle to get sao offset
1365
79.5k
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
1366
79.5k
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
1367
                //cnvert to 16 bit then add and then saturated pack
1368
79.5k
                left0_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
1369
79.5k
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
1370
79.5k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, left0_16x8b);
1371
79.5k
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
1372
79.5k
                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, left0_16x8b);
1373
79.5k
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
1374
79.5k
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
1375
79.5k
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
1376
1377
79.5k
                left0_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
1378
79.5k
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
1379
79.5k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, left0_16x8b);
1380
79.5k
                src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
1381
79.5k
                edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, left0_16x8b);
1382
79.5k
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
1383
79.5k
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge1_16x8b);
1384
79.5k
                src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
1385
1386
79.5k
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
1387
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
1388
79.5k
                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
1389
                // row = 1
1390
79.5k
                _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);
1391
1392
79.5k
                pu1_src_cpy += (src_strd << 1);
1393
79.5k
                pu1_src_left_cpy += 4;
1394
79.5k
                pu1_src_left_str += 4;
1395
79.5k
            }
1396
10.1k
            au1_mask_cpy += 16;
1397
10.1k
            pu1_src += 16;
1398
10.1k
            pu1_src_left_cpy -= 2 * ht;
1399
10.1k
            pu1_src_left_str -= 2 * ht;
1400
1401
10.1k
            pu1_left_tmp = pu1_src_left_cpy;
1402
10.1k
            pu1_src_left_cpy = pu1_src_left_str;
1403
10.1k
            pu1_src_left_str = pu1_left_tmp;
1404
10.1k
        }
1405
1406
5.47k
        wd_rem = wd & 0xF;
1407
5.47k
        if(wd_rem)
1408
374
        {
1409
1410
374
            cmp_gt0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + (ht - 1) * src_strd));
1411
374
            _mm_storel_epi64((__m128i *)(pu1_src_top + offset), cmp_gt0_16x8b);
1412
1413
374
            au1_mask8x16b = _mm_loadl_epi64((__m128i *)au1_mask_cpy);
1414
374
            pu1_src_cpy = pu1_src;
1415
374
            au1_mask8x16b = _mm_unpacklo_epi64(au1_mask8x16b, au1_mask8x16b);
1416
1417
885
            for(row = ht; row > 0; row -= 4)
1418
511
            {
1419
511
                left_store_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_left_cpy));
1420
                //row = 0 load 8 pixel values from 7:0 pos. relative to cur. pos.
1421
511
                src_temp0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
1422
                // row = 1
1423
511
                cmp_gt0_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
1424
                // row  = 2
1425
511
                src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));
1426
                // row = 3
1427
511
                cmp_lt0_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd));
1428
1429
1430
511
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, left_store_16x8b, 8);
1431
                //row 3 left
1432
511
                edge0_16x8b = _mm_slli_si128(cmp_lt0_16x8b, 8);
1433
511
                left0_16x8b = _mm_alignr_epi8(cmp_lt0_16x8b, left_store_16x8b, 14);
1434
511
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 14);
1435
                //row 2 left
1436
511
                edge0_16x8b = _mm_slli_si128(src_temp1_16x8b, 8);
1437
511
                left1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, left_store_16x8b, 14);
1438
511
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 14);
1439
1440
1441
                // packing rows together for 16 SIMD operations
1442
511
                src_temp1_16x8b = _mm_unpacklo_epi64(src_temp1_16x8b, cmp_lt0_16x8b);
1443
511
                left1_16x8b = _mm_unpacklo_epi64(left1_16x8b, left0_16x8b);
1444
1445
                //row 1 left
1446
511
                edge0_16x8b = _mm_slli_si128(cmp_gt0_16x8b, 8);
1447
511
                edge1_16x8b = _mm_alignr_epi8(cmp_gt0_16x8b, left_store_16x8b, 14);
1448
511
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 14);
1449
                //row 0 left
1450
511
                edge0_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
1451
511
                left0_16x8b = _mm_alignr_epi8(src_temp0_16x8b, left_store_16x8b, 14);
1452
511
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 14);
1453
                // packing rows together for 16 SIMD operations
1454
511
                src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, cmp_gt0_16x8b);
1455
511
                left0_16x8b = _mm_unpacklo_epi64(left0_16x8b, edge1_16x8b);
1456
1457
                //separating +ve and and -ve values.for row 2 and row 3
1458
511
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, left1_16x8b);
1459
511
                cmp_lt0_16x8b = _mm_subs_epu8(left1_16x8b, src_temp1_16x8b);
1460
                //creating mask 00 for +ve and -ve values and FF for zero.
1461
511
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
1462
511
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
1463
                //combining the appropriate sign change
1464
511
                left1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
1465
1466
1467
1468
1469
1470
                //separating +ve and and -ve values.
1471
511
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, left0_16x8b);
1472
511
                cmp_lt0_16x8b = _mm_subs_epu8(left0_16x8b, src_temp0_16x8b);
1473
                //creating mask 00 for +ve and -ve values and FF for zero.
1474
511
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
1475
511
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
1476
                //combining the appropriate sign change
1477
511
                left0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
1478
1479
1480
                //row = 0 right
1481
511
                edge0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2));
1482
                // row = 1 right
1483
511
                cmp_gt0_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd + 2));
1484
                // row = 2 right
1485
511
                edge1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd + 2));
1486
                // row = 3 right
1487
511
                cmp_lt0_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd + 2));
1488
                // packing rows together for 16 SIMD operations
1489
511
                edge0_16x8b = _mm_unpacklo_epi64(edge0_16x8b, cmp_gt0_16x8b);
1490
511
                edge1_16x8b = _mm_unpacklo_epi64(edge1_16x8b, cmp_lt0_16x8b);
1491
1492
                //separating +ve and and -ve values.
1493
511
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, edge0_16x8b);
1494
511
                cmp_lt0_16x8b = _mm_subs_epu8(edge0_16x8b, src_temp0_16x8b);
1495
                //creating mask 00 for +ve and -ve values and FF for zero.
1496
511
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
1497
511
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
1498
                //combining the appropriate sign change
1499
511
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
1500
1501
511
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, edge1_16x8b);
1502
511
                cmp_lt0_16x8b = _mm_subs_epu8(edge1_16x8b, src_temp1_16x8b);
1503
                //creating mask 00 for +ve and -ve values and FF for zero.
1504
511
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
1505
511
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
1506
                //combining the appropriate sign change
1507
511
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
1508
1509
                //combining sign-left and sign_right
1510
511
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, left0_16x8b);
1511
511
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, left1_16x8b);
1512
                //adding constant 2
1513
511
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
1514
511
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
1515
                //shuffle to get sao index
1516
511
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
1517
511
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
1518
                //shuffle to get sao offset
1519
                //using availability mask
1520
511
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
1521
511
                edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
1522
                //adding chroma offset to access U and V
1523
511
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
1524
511
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);
1525
1526
511
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
1527
511
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
1528
                //cnvert to 16 bit then add and then saturated pack
1529
511
                left0_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
1530
511
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
1531
511
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, left0_16x8b);
1532
511
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
1533
511
                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, left0_16x8b);
1534
511
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
1535
511
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
1536
511
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
1537
1538
511
                left0_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
1539
511
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
1540
511
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, left0_16x8b);
1541
511
                src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
1542
511
                edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, left0_16x8b);
1543
511
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
1544
511
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge1_16x8b);
1545
511
                src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
1546
1547
                //seaprting row 1 and row 3
1548
511
                cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
1549
511
                cmp_lt0_16x8b = _mm_srli_si128(src_temp1_16x8b, 8);
1550
1551
511
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
1552
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
1553
511
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
1554
                // row = 1
1555
511
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
1556
                // row = 2
1557
511
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_temp1_16x8b);
1558
                // row = 3
1559
511
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b);
1560
1561
511
                pu1_src_cpy += (src_strd << 2);
1562
511
                pu1_src_left_cpy += 8;
1563
511
                pu1_src_left_str += 8;
1564
511
            }
1565
374
            pu1_src += wd;
1566
374
            pu1_src_left_cpy -= 2 * ht;
1567
374
            pu1_src_left_str -= 2 * ht;
1568
1569
374
            pu1_left_tmp = pu1_src_left_cpy;
1570
374
            pu1_src_left_cpy = pu1_src_left_str;
1571
374
            pu1_src_left_str = pu1_left_tmp;
1572
374
        }
1573
165k
        for(row = 0; row < 2 * ht; row++)
1574
159k
        {
1575
159k
            pu1_src_left[row] = pu1_src_left_cpy[row];
1576
159k
        }
1577
5.47k
    }
1578
1579
5.47k
}
1580
1581
1582
/**
*******************************************************************************
*
* @brief
*  SAO edge-offset filtering, class 1 (vertical / 90-degree pattern), for a
*  luma block, using SSSE3 intrinsics.  Each pixel is compared against its
*  top and bottom neighbours; the pair of sign results selects an edge index
*  through gi1_table_edge_idx, which in turn selects an offset from
*  pi1_sao_offset that is added to the pixel with unsigned saturation.
*
* @param[in,out] pu1_src       Pointer to the block to be filtered (modified in place)
* @param[in] src_strd          Source stride in bytes
* @param[out] pu1_src_left     Updated with the last column of the unfiltered block
* @param[in,out] pu1_src_top   Top row context; updated with the last processed row
* @param[out] pu1_src_top_left Updated with the unfiltered top-right-most context pixel
* @param[in] pu1_src_top_right Unused in class 1
* @param[in] pu1_src_bot_left  Unused in class 1
* @param[in] pu1_avail         Availability flags; [2] governs the top row,
*                              [3] the bottom row (0 => that edge row is skipped)
* @param[in] pi1_sao_offset    SAO offset table (5 entries, indexed by edge class)
* @param[in] wd                Block width in pixels
* @param[in] ht                Block height in pixels
*
* @returns None
*
*******************************************************************************
*/
void ihevc_sao_edge_offset_class1_ssse3(UWORD8 *pu1_src,
                                        WORD32 src_strd,
                                        UWORD8 *pu1_src_left,
                                        UWORD8 *pu1_src_top,
                                        UWORD8 *pu1_src_top_left,
                                        UWORD8 *pu1_src_top_right,
                                        UWORD8 *pu1_src_bot_left,
                                        UWORD8 *pu1_avail,
                                        WORD8 *pi1_sao_offset,
                                        WORD32 wd,
                                        WORD32 ht)
{
    WORD32 row, col;
    UWORD8 *pu1_src_top_cpy;
    UWORD8 *pu1_src_cpy;
    WORD32 wd_rem;


    __m128i src_top_16x8b, src_bottom_16x8b;
    __m128i src_temp0_16x8b, src_temp1_16x8b;
    __m128i signup0_16x8b, signdwn1_16x8b;
    __m128i cmp_gt0_16x8b, cmp_lt0_16x8b;
    __m128i edge0_16x8b, edge1_16x8b;
    __m128i edge_idx_8x16b, sao_offset_8x16b;
    __m128i const2_16x8b, const0_16x8b;

    UNUSED(pu1_src_top_right);
    UNUSED(pu1_src_bot_left);


    /* Save the unfiltered last column and top-left pixel as context for the
     * neighbouring blocks, before this block is modified in place. */
    for(row = 0; row < ht; row++)
    {
        pu1_src_left[row] = pu1_src[row * src_strd + (wd - 1)];
    }
    *pu1_src_top_left = pu1_src_top[wd - 1];



    pu1_src_top_cpy = pu1_src_top;
    edge_idx_8x16b   = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
    sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset);

    /* Update height and source pointers based on the availability flags:
     * no top neighbour  => first row is its own "top" and is skipped;
     * no bottom neighbour => last row is skipped. */
    if(0 == pu1_avail[2])
    {
        pu1_src_top_cpy = pu1_src;
        pu1_src += src_strd;
        ht--;
    }
    if(0 == pu1_avail[3])
    {
        ht--;
    }

    const2_16x8b = _mm_set1_epi8(2);
    const0_16x8b = _mm_setzero_si128();

    {
        WORD32 ht_rem;
        /* Main path: process the block in 16-pixel-wide columns. */
        for(col = wd; col >= 16; col -= 16)
        {
            pu1_src_cpy = pu1_src;
            src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
            //row = 0
            src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
            /* sign(row0 - top): unsigned saturating subtract in both directions
             * gives non-zero only on the larger side ... */
            cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
            cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
            /* ... then cmpeq-with-zero turns each into a mask: FF where the
             * difference was zero/negative, 00 where it was positive. */
            cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
            cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
            /* Combine the two masks into per-pixel sign values (-1, 0, +1). */
            signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);

            /* Two rows per iteration; sign(row - row_above) is carried across
             * iterations in signup0_16x8b so each row pair is compared once. */
            for(row = ht; row >= 2; row -= 2)
            {

                //row = 1 load 16 pixel values from 15:0 pos. relative to cur. pos.
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
                // row = 2
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));


                //row0 - row1
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_temp1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_temp0_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change: sign(row0 - row1)
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
                //row1 - row0 is simply the negation: sign(row1 - row0)
                edge1_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);

                //row1 - bottom (row2)
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change: sign(row1 - row2)
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);

                //combining sign-up and sign-down for each row
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, signdwn1_16x8b);

                //for the next iteration signup0_16x8b = -signdwn1_16x8b = sign(row2 - row1)
                signup0_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);
                //bias by 2 so the edge category lies in [0..4] for the table lookup
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
                //shuffle to get sao edge index
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
                //shuffle to get sao offset
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
                //row1 becomes the "top" context for the next iteration
                src_top_16x8b = src_temp1_16x8b;
                /* Widen pixels to 16 bit, add the (sign-extended) offsets, then
                 * saturating-pack back to unsigned 8 bit (clamps to [0,255]). */
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);

                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
                src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
                src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);

                //row = 0 store 16 pixel values from 15:0 pos. relative to cur. pos.
                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
                // row = 1
                _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);

                src_temp0_16x8b = src_bottom_16x8b;
                pu1_src_cpy += (src_strd << 1);
            }
            ht_rem = ht & 0x1;

            /* One leftover row (odd ht): same computation for a single row. */
            if(ht_rem)
            {
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
                //current row - next row
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
                //adding the top sign and the constant 2 bias
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);

                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
                //this row becomes the stored "top" context
                src_top_16x8b = src_temp0_16x8b;
                //convert to 16 bit, add offsets, then saturated pack back to u8
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);

                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
            }
            /* When the bottom row was skipped (no bottom neighbour), the row
             * below the last processed one is the correct top context. */
            if(0 == pu1_avail[3])
            {
                src_top_16x8b = src_bottom_16x8b;
            }
            //updating top context for the block below
            _mm_storeu_si128((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
            pu1_src += 16;
        }

        /* Tail path: remaining 8-pixel-wide column (wd not a multiple of 16).
         * Rows are processed four at a time by packing two 8-pixel rows into
         * one 128-bit register (low/high halves). */
        wd_rem = wd & 0xF;
        if(wd_rem)
        {
            pu1_src_cpy = pu1_src;
            src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_top_cpy + wd - col));
            //row = 0
            src_temp0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
            //separating +ve and -ve values.
            cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
            cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
            //creating mask 00 for +ve and -ve values and FF for zero.
            cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
            cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
            //combining the appropriate sign change; keep (row0 - top) in the
            //high half so alignr can pair it with the next row's sign below
            signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
            signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
            for(row = ht; row >= 4; row -= 4)
            {
                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
                src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
                // row = 2
                src_bottom_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));

                //row0 - row1
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_temp1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_temp0_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);

                //row1 - row0 (negation of the above)
                edge1_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);
                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //align left (0-1)
                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
                //row1 - row2
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
                edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
                //packing row 0 and row 1 into one register (low/high halves)
                src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, src_temp1_16x8b);
                //row = 3
                src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd));
                // row = 4
                src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 4 * src_strd));

                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up consumed
                signdwn1_16x8b = _mm_slli_si128(signdwn1_16x8b, 8); //align left (1-2)
                //separating +ve and -ve values (rows 2,3)
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_top_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_bottom_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-3)

                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signdwn1_16x8b, 8); //(2-3),(1-2) (subtract-with-down terms)
                edge1_16x8b = _mm_slli_si128(edge1_16x8b, 8);
                //separating +ve and -ve values (rows 3,4)
                cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_top_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-4)
                //combining sign-up and sign-down for rows 2 and 3
                edge1_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge1_16x8b, 8); //(3-4),(2-3)

                edge1_16x8b = _mm_sub_epi8(edge1_16x8b, signup0_16x8b); //(3,2)

                //packing row 2 and row 3 into one register
                src_bottom_16x8b = _mm_unpacklo_epi64(src_bottom_16x8b, src_top_16x8b);
                //for the next iteration signup0_16x8b = -signdwn1_16x8b
                signup0_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b); //(4-3)

                //bias by 2 for the table lookup
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
                //shuffle to get sao edge index
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
                //shuffle to get sao offset
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
                //the next top context is already in src_top_16x8b (row 3)
                //src_top_16x8b = src_temp1_16x8b;
                //convert to 16 bit, add offsets, then saturated pack back to u8
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);

                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_bottom_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
                src_bottom_16x8b = _mm_unpackhi_epi8(src_bottom_16x8b, const0_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
                src_bottom_16x8b = _mm_add_epi16(src_bottom_16x8b, cmp_lt0_16x8b);
                src_bottom_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_bottom_16x8b);

                //split the packed row pairs back into single rows for the stores
                cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
                cmp_lt0_16x8b = _mm_srli_si128(src_bottom_16x8b, 8);
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
                // row = 1
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
                //row = 2
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_bottom_16x8b);
                // row = 3
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b);

                src_temp0_16x8b = src_temp1_16x8b;
                signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
                pu1_src_cpy += (src_strd << 2);

            }
            /* Two leftover rows (ht & 2): same pairing scheme for one row pair. */
            ht_rem = ht & 0x2;
            if(ht_rem)
            {

                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
                src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
                // row = 2
                src_bottom_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));

                //row0 - row1
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_temp1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_temp0_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
                //row1 - row0
                edge1_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);
                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //align left (0-1)
                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
                //row1 - row2
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
                edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
                //adding top and down subtraction
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up consumed
                //for the next iteration signup0_16x8b = -signdwn1_16x8b
                signup0_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b); //(2-1) for next
                src_top_16x8b = src_temp1_16x8b;
                //bias by 2 for the table lookup
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);

                //shuffle to get sao edge index
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);

                //shuffle to get sao offset
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);

                //the next top context is already in src_top_16x8b
                //convert to 16 bit, add offsets, then saturated pack back to u8
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp1_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, src_temp1_16x8b);

                cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);

                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
                // row = 1
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
                src_temp0_16x8b = src_bottom_16x8b;
                pu1_src_cpy += (src_strd << 1);

            }
            /* One leftover row (ht & 1): low half of the registers only. */
            ht_rem = ht & 0x1;
            if(ht_rem)
            {

                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
                src_bottom_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));

                //row0 - row1
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
                //adding top and down subtraction
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
                //bias by 2 for the table lookup
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
                //clear the (unused) high half before the shuffle
                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8);
                edge0_16x8b = _mm_srli_si128(edge0_16x8b, 8);
                //shuffle to get sao edge index
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
                //shuffle to get sao offset
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
                src_top_16x8b = src_temp0_16x8b;
                //convert to 16 bit, add offsets, then saturated pack back to u8
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, const0_16x8b);
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
                pu1_src_cpy += (src_strd);

            }
            /* When the bottom row was skipped, the row below the last
             * processed one is the correct top context for the block below. */
            if(0 == pu1_avail[3])
            {
                src_top_16x8b = src_bottom_16x8b;
            }
            _mm_storel_epi64((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
            pu1_src += 8;
        }
    }
}
2016
2017
/**
*******************************************************************************
*
* @brief
*  SSSE3 SAO edge-offset filter, class1 (vertical / 90-degree direction), for
*  interleaved chroma (UV) samples. Each sample is compared against its top
*  and bottom neighbours; the resulting edge category indexes
*  gi1_table_edge_idx and then the per-plane SAO offset table, and the chosen
*  offset is added to the sample with unsigned saturation.
*
* @param[in,out] pu1_src       Pointer to the block's interleaved UV samples.
* @param[in] src_strd          Source stride in bytes.
* @param[out] pu1_src_left     Left context: 2 bytes (U,V) per row, taken from
*                              the rightmost sample pair of each row.
* @param[in,out] pu1_src_top   Top context row; updated with this block's
*                              bottom row for the block below.
* @param[out] pu1_src_top_left Top-left U/V pair context.
* @param[in] pu1_src_top_right Unused for the vertical direction.
* @param[in] pu1_src_bot_left  Unused for the vertical direction.
* @param[in] pu1_avail         Neighbour availability flags; [2] gates the top
*                              row, [3] gates the bottom row.
* @param[in] pi1_sao_offset_u  SAO offset table for the U plane.
* @param[in] pi1_sao_offset_v  SAO offset table for the V plane.
* @param[in] wd                Width in bytes (U and V interleaved).
* @param[in] ht                Height in rows.
*
* @returns None
*
*******************************************************************************
*/
void ihevc_sao_edge_offset_class1_chroma_ssse3(UWORD8 *pu1_src,
2018
                                               WORD32 src_strd,
2019
                                               UWORD8 *pu1_src_left,
2020
                                               UWORD8 *pu1_src_top,
2021
                                               UWORD8 *pu1_src_top_left,
2022
                                               UWORD8 *pu1_src_top_right,
2023
                                               UWORD8 *pu1_src_bot_left,
2024
                                               UWORD8 *pu1_avail,
2025
                                               WORD8 *pi1_sao_offset_u,
2026
                                               WORD8 *pi1_sao_offset_v,
2027
                                               WORD32 wd,
2028
                                               WORD32 ht)
2029
6.30k
{
2030
6.30k
    WORD32 row, col;
2031
6.30k
    UWORD8 *pu1_src_top_cpy;
2032
6.30k
    UWORD8 *pu1_src_cpy;
2033
6.30k
    WORD32 wd_rem;
2034
2035
2036
6.30k
    __m128i src_top_16x8b, src_bottom_16x8b;
2037
6.30k
    __m128i src_temp0_16x8b, src_temp1_16x8b;
2038
6.30k
    __m128i signup0_16x8b, signdwn1_16x8b;
2039
6.30k
    __m128i cmp_gt0_16x8b, cmp_lt0_16x8b;
2040
6.30k
    __m128i edge0_16x8b, edge1_16x8b;
2041
6.30k
    __m128i edge_idx_8x16b, sao_offset_8x16b;
2042
6.30k
    __m128i const2_16x8b, const0_16x8b;
2043
6.30k
    __m128i chroma_offset_8x16b;
2044
2045
6.30k
    UNUSED(pu1_src_top_right);
2046
6.30k
    UNUSED(pu1_src_bot_left);
2047
2048
    /* Updating left and top and top-left */
    /* Save the rightmost U/V pair of every row as left context before it is overwritten */
2049
104k
    for(row = 0; row < ht; row++)
2050
98.5k
    {
2051
98.5k
        pu1_src_left[2 * row] = pu1_src[row * src_strd + (wd - 2)];
2052
98.5k
        pu1_src_left[2 * row + 1] = pu1_src[row * src_strd + (wd - 1)];
2053
98.5k
    }
2054
6.30k
    pu1_src_top_left[0] = pu1_src_top[wd - 2];
2055
6.30k
    pu1_src_top_left[1] = pu1_src_top[wd - 1];
2056
2057
2058
2059
6.30k
    pu1_src_top_cpy = pu1_src_top;
2060
6.30k
    edge_idx_8x16b   = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
2061
6.30k
    sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_u);
2062
6.30k
    //const0_16x8b temporarily holds the V offset table; it is merged into
    //sao_offset_8x16b below and only then reset to zero
    const0_16x8b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_v);
2063
6.30k
    //0x0800 adds 8 to every second byte so V samples index the upper (V) half
    //of the combined U|V offset table after the edge-index shuffle
    chroma_offset_8x16b = _mm_set1_epi16(0x0800);
2064
    /* Update height and source pointers based on the availability flags */
2065
6.30k
    if(0 == pu1_avail[2])
2066
738
    {
2067
738
        pu1_src_top_cpy = pu1_src;
2068
738
        pu1_src += src_strd;
2069
738
        ht--;
2070
738
    }
2071
6.30k
    if(0 == pu1_avail[3])
2072
439
    {
2073
439
        ht--;
2074
439
    }
2075
6.30k
    //combined table: U offsets in the low 8 bytes, V offsets in the high 8
    sao_offset_8x16b = _mm_unpacklo_epi64(sao_offset_8x16b, const0_16x8b);
2076
6.30k
    const2_16x8b = _mm_set1_epi8(2);
2077
6.30k
    const0_16x8b = _mm_setzero_si128();
2078
2079
2080
6.30k
    {
2081
6.30k
        WORD32 ht_rem;
2082
2083
2084
2085
        //main pass: 16 bytes (8 U/V pairs) per column iteration, 2 rows per loop
18.8k
        for(col = wd; col >= 16; col -= 16)
2086
12.5k
        {
2087
12.5k
            pu1_src_cpy = pu1_src;
2088
12.5k
            src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
2089
            //row = 0
2090
12.5k
            src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
2091
            //separating +ve and -ve values.
2092
12.5k
            cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
2093
12.5k
            cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
2094
            //creating mask 00 for +ve and -ve values and FF for zero.
2095
12.5k
            cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2096
12.5k
            cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2097
            //combining the appropriate sign change
2098
12.5k
            signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
2099
2100
110k
            for(row = ht; row >= 2; row -= 2)
2101
97.9k
            {
2102
2103
                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
2104
97.9k
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
2105
                // row = 2
2106
97.9k
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
2107
2108
2109
                //row 0 -row1
2110
                //separating +ve and -ve values.
2111
97.9k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_temp1_16x8b);
2112
97.9k
                cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_temp0_16x8b);
2113
                //creating mask 00 for +ve and -ve values and FF for zero.
2114
97.9k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2115
97.9k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2116
                //combining the appropriate sign change
2117
97.9k
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
2118
                //row1-row0
2119
97.9k
                edge1_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);
2120
2121
                //row1 -bottom
2122
97.9k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
2123
97.9k
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
2124
                //creating mask 00 for +ve and -ve values and FF for zero.
2125
97.9k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2126
97.9k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2127
                //combining the appropriate sign change
2128
97.9k
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
2129
2130
                //combining sign-left and sign_right
2131
97.9k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
2132
97.9k
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, signdwn1_16x8b);
2133
2134
                //for the next iteration signup0_16x8b = -signdwn1_16x8b
2135
97.9k
                signup0_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);
2136
                //adding constant 2
2137
97.9k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
2138
97.9k
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
2139
                //copying the next top
2140
97.9k
                src_top_16x8b = src_temp1_16x8b;
2141
2142
2143
                //shuffle to get sao index
2144
97.9k
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
2145
97.9k
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
2146
                //adding chroma offset to access U and V
2147
97.9k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
2148
97.9k
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);
2149
2150
                //shuffle to get sao offset
2151
97.9k
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
2152
97.9k
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
2153
                //convert to 16 bit then add and then saturated pack
2154
97.9k
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
2155
97.9k
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
2156
97.9k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
2157
97.9k
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
2158
97.9k
                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
2159
97.9k
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
2160
97.9k
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
2161
97.9k
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
2162
2163
97.9k
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
2164
97.9k
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
2165
97.9k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
2166
97.9k
                src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
2167
97.9k
                edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
2168
97.9k
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
2169
97.9k
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge1_16x8b);
2170
97.9k
                src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
2171
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
2172
97.9k
                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
2173
                // row = 1
2174
97.9k
                _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);
2175
2176
97.9k
                src_temp0_16x8b = src_bottom_16x8b;
2177
97.9k
                pu1_src_cpy += (src_strd << 1);
2178
97.9k
            }
2179
12.5k
            ht_rem = ht & 0x1;
2180
2181
12.5k
            if(ht_rem)
2182
2.21k
            {
2183
2.21k
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
2184
                //current row -next row
2185
                //separating +ve and -ve values.
2186
2.21k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
2187
2.21k
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
2188
                //creating mask 00 for +ve and -ve values and FF for zero.
2189
2.21k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2190
2.21k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2191
                //combining the appropriate sign change
2192
2.21k
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
2193
                //adding top and bottom and constant 2
2194
2.21k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
2195
2.21k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
2196
                //copying the next top
2197
2.21k
                src_top_16x8b = src_temp0_16x8b;
2198
2199
2.21k
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
2200
                //adding chroma offset to access U and V
2201
2.21k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
2202
2.21k
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
2203
2204
                //convert to 16 bit then add and then saturated pack
2205
2.21k
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
2206
2.21k
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
2207
2.21k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
2208
2.21k
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
2209
2.21k
                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
2210
2.21k
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
2211
2.21k
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
2212
2.21k
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
2213
2214
2.21k
                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
2215
2.21k
            }
2216
12.5k
            if(0 == pu1_avail[3])
2217
846
            {
2218
846
                src_top_16x8b = src_bottom_16x8b;
2219
846
            }
2220
            //updating top flag
2221
12.5k
            _mm_storeu_si128((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
2222
12.5k
            pu1_src += 16;
2223
12.5k
        }
2224
2225
        //residual 8-byte-wide column (4 U/V pairs): two 8-byte rows are packed
        //into one register and 4 rows are processed per loop iteration
6.30k
        wd_rem = wd & 0xF;
2226
6.30k
        if(wd_rem)
2227
117
        {
2228
117
            pu1_src_cpy = pu1_src;
2229
117
            src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_top_cpy + wd - col));
2230
            //row = 0
2231
117
            src_temp0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
2232
            //separating +ve and -ve values.
2233
117
            cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
2234
117
            cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
2235
            //creating mask 00 for +ve and -ve values and FF for zero.
2236
117
            cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2237
117
            cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2238
            //combining the appropriate sign change
2239
117
            signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
2240
117
            signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
2241
390
            for(row = ht; row >= 4; row -= 4)
2242
273
            {
2243
                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
2244
273
                src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
2245
                // row = 2
2246
273
                src_bottom_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));
2247
2248
                //row 0 -row1
2249
                //separating +ve and -ve values.
2250
273
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_temp1_16x8b);
2251
273
                cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_temp0_16x8b);
2252
                //creating mask 00 for +ve and -ve values and FF for zero.
2253
273
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2254
273
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2255
                //combining the appropriate sign change
2256
273
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
2257
2258
                //row1-row0
2259
273
                edge1_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);
2260
273
                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
2261
273
                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
2262
                //row1 -row2
2263
273
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
2264
273
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
2265
                //creating mask 00 for +ve and -ve values and FF for zero.
2266
273
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2267
273
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2268
                //combining the appropriate sign change
2269
273
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
2270
273
                edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
2271
                //packing row 0 n row 1
2272
273
                src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, src_temp1_16x8b);
2273
                //row = 3
2274
273
                src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd));
2275
                // row = 4
2276
273
                src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 4 * src_strd));
2277
2278
273
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
2279
273
                signdwn1_16x8b = _mm_slli_si128(signdwn1_16x8b, 8); //align left (1-2)
2280
                //separating +ve and -ve values.(2,3)
2281
273
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_top_16x8b);
2282
273
                cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_bottom_16x8b);
2283
                //creating mask 00 for +ve and -ve values and FF for zero.
2284
273
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2285
273
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2286
                //combining the appropriate sign change
2287
273
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-3)
2288
2289
273
                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signdwn1_16x8b, 8); //(2-3) ,(1-2) (subtract with down)
2290
273
                edge1_16x8b = _mm_slli_si128(edge1_16x8b, 8);
2291
                //separating +ve and -ve values.(3,4)
2292
273
                cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp1_16x8b);
2293
273
                cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_top_16x8b);
2294
                //creating mask 00 for +ve and -ve values and FF for zero.
2295
273
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2296
273
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2297
273
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-4)
2298
                //combining sign-left and sign_right
2299
273
                edge1_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge1_16x8b, 8); //(3-4),(2-3)
2300
2301
                //signup0 holds down-differences, so subtract to add the up-signs
273
                edge1_16x8b = _mm_sub_epi8(edge1_16x8b, signup0_16x8b); //(3,2)
2302
2303
                //packing row 2 n row 3
2304
273
                src_bottom_16x8b = _mm_unpacklo_epi64(src_bottom_16x8b, src_top_16x8b);
2305
                //for the next iteration signup0_16x8b = -signdwn1_16x8b
2306
273
                signup0_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b); //(4-3)
2307
                //adding constant 2
2308
273
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
2309
273
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
2310
                //shuffle to get sao index
2311
273
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
2312
273
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
2313
                //adding chroma offset to access U and V
2314
273
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
2315
273
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);
2316
2317
                //shuffle to get sao offset
2318
273
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
2319
273
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
2320
                //the next top already in  src_top_16x8b
2321
                //convert to 16 bit then add and then saturated pack
2322
273
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
2323
273
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
2324
273
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
2325
273
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
2326
273
                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
2327
273
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
2328
273
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
2329
273
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
2330
2331
273
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
2332
273
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_bottom_16x8b, const0_16x8b);
2333
273
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
2334
273
                src_bottom_16x8b = _mm_unpackhi_epi8(src_bottom_16x8b, const0_16x8b);
2335
273
                edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
2336
273
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
2337
273
                src_bottom_16x8b = _mm_add_epi16(src_bottom_16x8b, edge1_16x8b);
2338
273
                src_bottom_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_bottom_16x8b);
2339
2340
273
                cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
2341
273
                cmp_lt0_16x8b = _mm_srli_si128(src_bottom_16x8b, 8);
2342
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
2343
273
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
2344
                // row = 1
2345
273
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
2346
                //row = 2
2347
273
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_bottom_16x8b);
2348
                // row = 3
2349
273
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b);
2350
2351
273
                src_temp0_16x8b = src_temp1_16x8b;
2352
273
                signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
2353
273
                pu1_src_cpy += (src_strd << 2);
2354
2355
273
            }
2356
117
            ht_rem = ht & 0x2;
2357
117
            if(ht_rem)
2358
13
            {
2359
2360
                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
2361
13
                src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
2362
                // row = 2
2363
13
                src_bottom_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));
2364
2365
                //row 0 -row1
2366
                //separating +ve and -ve values.
2367
13
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_temp1_16x8b);
2368
13
                cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_temp0_16x8b);
2369
                //creating mask 00 for +ve and -ve values and FF for zero.
2370
13
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2371
13
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2372
                //combining the appropriate sign change
2373
13
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
2374
                //row1-row0
2375
13
                edge1_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);
2376
13
                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
2377
13
                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
2378
                //row1 -row2
2379
13
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
2380
13
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
2381
                //creating mask 00 for +ve and -ve values and FF for zero.
2382
13
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2383
13
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2384
                //combining the appropriate sign change
2385
13
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
2386
13
                edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
2387
                //adding top and down subtraction
2388
13
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
2389
                //for the next iteration signup0_16x8b = -signdwn1_16x8b
2390
13
                signup0_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b); //(2-1) for next
2391
13
                src_top_16x8b = src_temp1_16x8b;
2392
2393
                //adding constant 2
2394
13
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
2395
2396
                //shuffle to get sao index
2397
13
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
2398
2399
                //adding chroma offset to access U and V
2400
13
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
2401
                //shuffle to get sao offset
2402
13
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
2403
                //the next top already in  src_top_16x8b
2404
                //convert to 16 bit then add and then saturated pack
2405
13
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
2406
13
                src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
2407
13
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
2408
13
                src_temp1_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
2409
13
                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
2410
13
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
2411
13
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge0_16x8b);
2412
13
                src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, src_temp1_16x8b);
2413
2414
13
                cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
2415
2416
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
2417
13
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
2418
                // row = 1
2419
13
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
2420
13
                src_temp0_16x8b = src_bottom_16x8b;
2421
13
                pu1_src_cpy += (src_strd << 1);
2422
2423
13
            }
2424
117
            ht_rem = ht & 0x1;
2425
117
            if(ht_rem)
2426
13
            {
2427
2428
                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
2429
13
                src_bottom_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
2430
2431
                //row 0 -row1
2432
                //separating +ve and -ve values.
2433
13
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
2434
13
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
2435
                //creating mask 00 for +ve and -ve values and FF for zero.
2436
13
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2437
13
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2438
                //combining the appropriate sign change
2439
13
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
2440
                //adding top and down subtraction
2441
13
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
2442
                //adding constant 2
2443
13
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
2444
13
                src_top_16x8b = src_temp0_16x8b;
2445
2446
                //clear the upper 8 bytes so only the valid row is shuffled
13
                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8);
2447
13
                edge0_16x8b = _mm_srli_si128(edge0_16x8b, 8);
2448
                //shuffle to get sao index
2449
13
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
2450
                //adding chroma offset to access U and V
2451
13
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
2452
                //shuffle to get sao offset
2453
13
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
2454
2455
                //convert to 16 bit then add and then saturated pack
2456
13
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
2457
13
                src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
2458
13
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
2459
13
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
2460
13
                src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, const0_16x8b);
2461
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
2462
13
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
2463
13
                pu1_src_cpy += (src_strd);
2464
2465
13
            }
2466
117
            if(0 == pu1_avail[3])
2467
3
            {
2468
3
                src_top_16x8b = src_bottom_16x8b;
2469
3
            }
2470
117
            _mm_storel_epi64((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
2471
117
            pu1_src += 8;
2472
117
        }
2473
6.30k
    }
2474
6.30k
}
2475
2476
/* 135 degree filtering */
2477
void ihevc_sao_edge_offset_class2_ssse3(UWORD8 *pu1_src,
2478
                                        WORD32 src_strd,
2479
                                        UWORD8 *pu1_src_left,
2480
                                        UWORD8 *pu1_src_top,
2481
                                        UWORD8 *pu1_src_top_left,
2482
                                        UWORD8 *pu1_src_top_right,
2483
                                        UWORD8 *pu1_src_bot_left,
2484
                                        UWORD8 *pu1_avail,
2485
                                        WORD8 *pi1_sao_offset,
2486
                                        WORD32 wd,
2487
                                        WORD32 ht)
2488
17.5k
{
2489
17.5k
    WORD32 row, col;
2490
17.5k
    UWORD8 *pu1_src_top_cpy, *pu1_src_left_cpy, *pu1_src_left_cpy2;
2491
17.5k
    UWORD8 *pu1_left_tmp, *pu1_src_left_str, *pu1_src_left_str2;
2492
17.5k
    UWORD8 *pu1_firstleft;
2493
17.5k
    UWORD8 *pu1_src_cpy, *pu1_src_org;
2494
17.5k
    UWORD8 au1_mask[MAX_CTB_SIZE], *au1_mask_cpy;
2495
17.5k
    UWORD8 au1_src_left_tmp[MAX_CTB_SIZE + 8];
2496
17.5k
    UWORD8 au1_src_left_tmp1[MAX_CTB_SIZE + 8];
2497
17.5k
    WORD32 wd_rem;
2498
17.5k
    UWORD8 u1_pos_0_0_tmp, u1_pos_wd_ht_tmp;
2499
17.5k
    WORD32 ht_tmp, ht_0;
2500
2501
17.5k
    WORD32 bit_depth;
2502
17.5k
    UWORD8 u1_avail0, u1_avail1;
2503
2504
17.5k
    __m128i src_top_16x8b, src_bottom_16x8b;
2505
17.5k
    __m128i src_temp0_16x8b, src_temp1_16x8b;
2506
17.5k
    __m128i signup0_16x8b, signdwn1_16x8b;
2507
17.5k
    __m128i cmp_gt0_16x8b, cmp_lt0_16x8b;
2508
17.5k
    __m128i edge0_16x8b, edge1_16x8b;
2509
17.5k
    __m128i au1_mask8x16b;
2510
17.5k
    __m128i edge_idx_8x16b, sao_offset_8x16b;
2511
17.5k
    __m128i const2_16x8b, const0_16x8b;
2512
17.5k
    __m128i left_store_16x8b;
2513
17.5k
    UNUSED(pu1_src_top_right);
2514
17.5k
    UNUSED(pu1_src_bot_left);
2515
2516
17.5k
    ht_0 = ht; ht_tmp = ht;
2517
17.5k
    au1_mask8x16b = _mm_set1_epi8(0xff);
2518
2519
    //setting availability mask to ff size MAX_CTB_SIZE
2520
87.9k
    for(col = 0; col < MAX_CTB_SIZE; col += 16)
2521
70.3k
        _mm_storeu_si128((__m128i *)(au1_mask + col), au1_mask8x16b);
2522
494k
    for(row = 0; row < ht; row++)
2523
477k
    {
2524
477k
        au1_src_left_tmp[row] = pu1_src_left[row];
2525
477k
    }
2526
17.5k
    bit_depth = BIT_DEPTH_LUMA;
2527
17.5k
    pu1_src_org = pu1_src;
2528
17.5k
    pu1_src_top_cpy = pu1_src_top;
2529
17.5k
    pu1_src_left_cpy2 = au1_src_left_tmp;
2530
17.5k
    pu1_src_left_cpy = au1_src_left_tmp;
2531
17.5k
    pu1_src_left_str2 = au1_src_left_tmp1;
2532
17.5k
    pu1_src_left_str = au1_src_left_tmp1;
2533
17.5k
    edge_idx_8x16b   = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
2534
17.5k
    sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset);
2535
2536
2537
    /* If top-left is available, process separately */
2538
17.5k
    if(0 != pu1_avail[4])
2539
13.6k
    {
2540
13.6k
        WORD8 edge_idx;
2541
2542
13.6k
        edge_idx = 2 + SIGN(pu1_src[0] - pu1_src_top_left[0]) +
2543
13.6k
                        SIGN(pu1_src[0] - pu1_src[1 + src_strd]);
2544
2545
13.6k
        edge_idx = gi1_table_edge_idx[edge_idx];
2546
2547
13.6k
        if(0 != edge_idx)
2548
4.10k
        {
2549
4.10k
            u1_pos_0_0_tmp = CLIP3(pu1_src[0] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
2550
4.10k
        }
2551
9.56k
        else
2552
9.56k
        {
2553
9.56k
            u1_pos_0_0_tmp = pu1_src[0];
2554
9.56k
        }
2555
13.6k
    }
2556
3.91k
    else
2557
3.91k
    {
2558
3.91k
        u1_pos_0_0_tmp = pu1_src[0];
2559
3.91k
    }
2560
2561
    /* If bottom-right is available, process separately */
2562
17.5k
    if(0 != pu1_avail[7])
2563
15.1k
    {
2564
15.1k
        WORD8 edge_idx;
2565
2566
15.1k
        edge_idx = 2 + SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd - 1 - src_strd]) +
2567
15.1k
                        SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 1 + src_strd]);
2568
2569
15.1k
        edge_idx = gi1_table_edge_idx[edge_idx];
2570
2571
15.1k
        if(0 != edge_idx)
2572
5.40k
        {
2573
5.40k
            u1_pos_wd_ht_tmp = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
2574
5.40k
        }
2575
9.77k
        else
2576
9.77k
        {
2577
9.77k
            u1_pos_wd_ht_tmp = pu1_src[wd - 1 + (ht - 1) * src_strd];
2578
9.77k
        }
2579
15.1k
    }
2580
2.40k
    else
2581
2.40k
    {
2582
2.40k
        u1_pos_wd_ht_tmp = pu1_src[wd - 1 + (ht - 1) * src_strd];
2583
2.40k
    }
2584
17.5k
    pu1_firstleft = pu1_src_top_left;
2585
2586
    /* Update height and source pointers based on the availability flags */
2587
17.5k
    if(0 == pu1_avail[2])
2588
2.41k
    {
2589
2.41k
        pu1_firstleft = pu1_src_left_cpy2;
2590
2.41k
        pu1_src_left_cpy2++;
2591
2.41k
        pu1_src_left_str2++;
2592
2.41k
        pu1_src_top_cpy = pu1_src;
2593
2.41k
        pu1_src += src_strd;
2594
2.41k
        ht--;
2595
2.41k
    }
2596
17.5k
    if(0 == pu1_avail[3])
2597
1.66k
    {
2598
1.66k
        ht--;
2599
1.66k
        ht_0--;
2600
1.66k
    }
2601
    //storing top left in a mmx register
2602
17.5k
    left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_firstleft);
2603
17.5k
    const2_16x8b = _mm_set1_epi8(2);
2604
17.5k
    const0_16x8b = _mm_setzero_si128();
2605
17.5k
    left_store_16x8b = _mm_slli_si128(left_store_16x8b, 15);
2606
    //update top -left
2607
17.5k
    *pu1_src_top_left = pu1_src_top[wd - 1];
2608
    //availability mask creation
2609
17.5k
    u1_avail0 = pu1_avail[0];
2610
17.5k
    u1_avail1 = pu1_avail[1];
2611
17.5k
    au1_mask[0] = u1_avail0;
2612
17.5k
    au1_mask[wd - 1] = u1_avail1;
2613
17.5k
    {
2614
17.5k
        WORD32 ht_rem;
2615
2616
2617
17.5k
        pu1_src_left_cpy = pu1_src_left_cpy2;
2618
17.5k
        pu1_src_left_str = pu1_src_left_str2;
2619
17.5k
        au1_mask_cpy = au1_mask;
2620
39.0k
        for(col = wd; col >= 16; col -= 16)
2621
21.4k
        {
2622
21.4k
            pu1_src_cpy = pu1_src;
2623
21.4k
            src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
2624
            //row = 0
2625
21.4k
            src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
2626
21.4k
            src_top_16x8b = _mm_alignr_epi8(src_top_16x8b, left_store_16x8b, 15);
2627
            //loading the mask
2628
21.4k
            au1_mask8x16b = _mm_loadu_si128((__m128i *)au1_mask_cpy);
2629
            //separating +ve and and -ve values.
2630
21.4k
            cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
2631
21.4k
            cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
2632
            //creating mask 00 for +ve and -ve values and FF for zero.
2633
21.4k
            cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2634
21.4k
            cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2635
            //combining the appropriate sign change
2636
21.4k
            signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
2637
2638
2639
348k
            for(row = ht; row >= 2; row -= 2)
2640
326k
            {
2641
326k
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
2642
                //row = 1
2643
326k
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
2644
                // row = 1 right
2645
326k
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 1));
2646
                //to insert left in row 0
2647
326k
                signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 15);
2648
                //row 0 -row1
2649
                //separating +ve and and -ve values.
2650
326k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
2651
326k
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
2652
2653
                //creating mask 00 for +ve and -ve values and FF for zero.
2654
326k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2655
326k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2656
                //manipulation for row 1 - row 0
2657
326k
                signdwn1_16x8b = _mm_alignr_epi8(src_temp0_16x8b, signdwn1_16x8b, 15);
2658
                //combining the appropriate sign change
2659
326k
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(0-1)
2660
                //row1-row0
2661
                //separating +ve and and -ve values.
2662
326k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
2663
326k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
2664
                //creating mask 00 for +ve and -ve values and FF for zero.
2665
326k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2666
326k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2667
                // row = 2 right
2668
326k
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd + 1));
2669
326k
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-0)
2670
2671
2672
                //row1 -bottom
2673
326k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
2674
326k
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
2675
                //creating mask 00 for +ve and -ve values and FF for zero.
2676
326k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2677
326k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2678
                //combining the appropriate sign change
2679
326k
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
2680
                // row = 2
2681
326k
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
2682
2683
                //combining sign-left and sign_right
2684
326k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
2685
2686
                //storing the row 1 left for next row.
2687
326k
                signup0_16x8b = _mm_slli_si128(left_store_16x8b, 14);
2688
2689
                //combining sign-left and sign_right
2690
326k
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, signdwn1_16x8b);
2691
                //manipulation for bottom - row 1
2692
326k
                signup0_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signup0_16x8b, 15);
2693
                //eliminating old left for row 0 and row 1
2694
326k
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
2695
                //bottom - row1
2696
326k
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signup0_16x8b);
2697
326k
                cmp_lt0_16x8b = _mm_subs_epu8(signup0_16x8b, src_bottom_16x8b);
2698
                //creating mask 00 for +ve and -ve values and FF for zero.
2699
326k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2700
326k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2701
                //for the next iteration bottom -row1
2702
326k
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
2703
                //row1  getting it right for left of next block
2704
326k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp1_16x8b, 15);
2705
                //adding constant 2
2706
326k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
2707
326k
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
2708
                //shuffle to get sao index
2709
326k
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
2710
326k
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
2711
                //using availability mask
2712
326k
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
2713
326k
                edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
2714
                //shuffle to get sao offset
2715
326k
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
2716
326k
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
2717
                //row0  getting it right for left of next block
2718
326k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
2719
                //copying the next top
2720
326k
                src_top_16x8b = src_temp1_16x8b;
2721
                //convert to 16 bit then add and then saturated pack
2722
326k
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
2723
326k
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
2724
326k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
2725
326k
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
2726
326k
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
2727
326k
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
2728
326k
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
2729
326k
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
2730
2731
326k
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
2732
326k
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
2733
326k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
2734
326k
                src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
2735
326k
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
2736
326k
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
2737
326k
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
2738
326k
                src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
2739
2740
                //store left boundary
2741
326k
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
2742
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
2743
326k
                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
2744
                // row = 1
2745
326k
                _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);
2746
2747
326k
                src_temp0_16x8b = src_bottom_16x8b;
2748
326k
                pu1_src_cpy += (src_strd << 1);
2749
326k
                pu1_src_left_cpy += 2;
2750
326k
                pu1_src_left_str += 2;
2751
326k
            }
2752
21.4k
            ht_rem = ht & 0x1;
2753
2754
21.4k
            if(ht_rem)
2755
4.45k
            {
2756
4.45k
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
2757
4.45k
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 1));
2758
                //current row -next row
2759
                //separating +ve and and -ve values.
2760
4.45k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
2761
4.45k
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
2762
                //creating mask 00 for +ve and -ve values and FF for zero.
2763
4.45k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2764
4.45k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2765
                //combining the appropriate sign change
2766
4.45k
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
2767
                //adding top and bottom and constant 2
2768
4.45k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
2769
4.45k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
2770
                //eliminating old left for row 0 and row 1
2771
4.45k
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 1);
2772
2773
4.45k
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
2774
                //using availability mask
2775
4.45k
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
2776
2777
4.45k
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
2778
2779
                //row0  getting it right for left of next block
2780
4.45k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
2781
                //copying the next top
2782
4.45k
                src_top_16x8b = src_temp0_16x8b;
2783
                //convert to 16 bit then add and then saturated pack
2784
4.45k
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
2785
4.45k
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
2786
4.45k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
2787
4.45k
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
2788
4.45k
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
2789
4.45k
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
2790
4.45k
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
2791
4.45k
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
2792
                //store left boundary
2793
4.45k
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
2794
2795
4.45k
                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
2796
4.45k
                pu1_src_cpy += (src_strd);
2797
4.45k
                pu1_src_left_cpy += 1;
2798
4.45k
                pu1_src_left_str += 1;
2799
4.45k
            }
2800
21.4k
            if(0 == pu1_avail[3])
2801
1.68k
            {
2802
1.68k
                src_top_16x8b = src_bottom_16x8b;
2803
1.68k
                pu1_src_left_str[0] = pu1_src_cpy[15];
2804
1.68k
            }
2805
21.4k
            if(0 == pu1_avail[2])
2806
2.80k
            {
2807
2.80k
                pu1_src_left_str[-ht_0] = pu1_src[15 - src_strd];
2808
2.80k
            }
2809
2810
            //for the top left of next part of the block
2811
21.4k
            left_store_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
2812
            //updating top flag
2813
21.4k
            _mm_storeu_si128((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
2814
21.4k
            pu1_src += 16;
2815
21.4k
            au1_mask_cpy += 16;
2816
2817
2818
21.4k
            pu1_left_tmp = pu1_src_left_cpy2;
2819
21.4k
            pu1_src_left_cpy2 = pu1_src_left_str2;
2820
21.4k
            pu1_src_left_str2 = pu1_left_tmp;
2821
2822
21.4k
            pu1_src_left_cpy = pu1_src_left_cpy2;
2823
21.4k
            pu1_src_left_str = pu1_src_left_str2;
2824
21.4k
        }
2825
2826
17.5k
        wd_rem = wd & 0xF;
2827
17.5k
        if(wd_rem)
2828
17.1k
        {
2829
17.1k
            pu1_src_left_cpy = pu1_src_left_cpy2;
2830
17.1k
            pu1_src_left_str = pu1_src_left_str2;
2831
17.1k
            pu1_src_cpy = pu1_src;
2832
17.1k
            src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_top_cpy + wd - col));
2833
            //row = 0
2834
17.1k
            src_temp0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
2835
17.1k
            src_top_16x8b = _mm_alignr_epi8(src_top_16x8b, left_store_16x8b, 15);
2836
17.1k
            au1_mask8x16b = _mm_loadl_epi64((__m128i *)au1_mask_cpy); //????
2837
            //separating +ve and and -ve values.
2838
17.1k
            cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
2839
17.1k
            cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
2840
            //creating mask 00 for +ve and -ve values and FF for zero.
2841
17.1k
            cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2842
17.1k
            cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2843
            //preparing au1_mask
2844
17.1k
            au1_mask8x16b = _mm_unpacklo_epi64(au1_mask8x16b, au1_mask8x16b);
2845
            //combining the appropriate sign change
2846
17.1k
            signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
2847
17.1k
            signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
2848
2849
130k
            for(row = ht; row >= 4; row -= 4)
2850
112k
            {
2851
112k
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
2852
                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
2853
112k
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
2854
                // row = 2
2855
112k
                src_bottom_16x8b =  _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
2856
                //right row1
2857
112k
                signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 1);
2858
                //row 0 -row1
2859
                //separating +ve and and -ve values.
2860
112k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
2861
112k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
2862
                //manipulation for row 1 -row 0
2863
112k
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 15);
2864
                //creating mask 00 for +ve and -ve values and FF for zero.
2865
112k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2866
112k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2867
                //row 0 left
2868
112k
                signdwn1_16x8b = _mm_alignr_epi8(src_temp0_16x8b, signdwn1_16x8b, 15);
2869
                //combining the appropriate sign change
2870
112k
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
2871
                //row 1 -row0
2872
                //separating +ve and and -ve values.
2873
112k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
2874
112k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
2875
2876
                //creating mask 00 for +ve and -ve values and FF for zero.
2877
112k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2878
112k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2879
                //row1-row0
2880
112k
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
2881
2882
112k
                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
2883
2884
112k
                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
2885
                //right row2
2886
112k
                signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 1);
2887
                //packing row 0 n row 1
2888
112k
                src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, src_temp1_16x8b);
2889
                //row1 -row2
2890
112k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
2891
112k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
2892
                //creating mask 00 for +ve and -ve values and FF for zero.
2893
112k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2894
112k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2895
                //combining the appropriate sign change
2896
112k
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
2897
112k
                edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
2898
                //manipulation for row 2 -row 1
2899
112k
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 14);
2900
                //row 1 left
2901
112k
                signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 15);
2902
                //row = 3
2903
112k
                src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 3 * src_strd));
2904
2905
                // row = 4
2906
112k
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 4 * src_strd));
2907
2908
112k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
2909
2910
                //separating +ve and and -ve values.(2,1)
2911
112k
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
2912
112k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
2913
                //manipulation for row 3 -row 2
2914
112k
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 13);
2915
                //creating mask 00 for +ve and -ve values and FF for zero.
2916
112k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2917
112k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2918
                //row 2 left
2919
112k
                signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 15);
2920
                //combining the appropriate sign change
2921
112k
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1)
2922
2923
                //separating +ve and and -ve values.(3,2)
2924
112k
                cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
2925
112k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
2926
112k
                signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8); //aligned left (2-1)
2927
                //creating mask 00 for +ve and -ve values and FF for zero.
2928
112k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2929
112k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2930
                //right row3
2931
112k
                signdwn1_16x8b = _mm_srli_si128(src_top_16x8b, 1);
2932
                //combining the appropriate sign change
2933
112k
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-2)
2934
2935
112k
                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(3-2) ,(2-1)
2936
2937
                //separating +ve and and -ve values.(2,3)
2938
112k
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
2939
112k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
2940
                //right row 4
2941
112k
                signdwn1_16x8b =  _mm_srli_si128(src_temp1_16x8b, 1);
2942
                //creating mask 00 for +ve and -ve values and FF for zero.
2943
112k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2944
112k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2945
                //combining the appropriate sign change
2946
112k
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-3)
2947
2948
                //separating +ve and and -ve values.(3,bottom)
2949
112k
                cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
2950
112k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
2951
2952
                //creating mask 00 for +ve and -ve values and FF for zero.
2953
112k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2954
112k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2955
112k
                edge1_16x8b = _mm_slli_si128(edge1_16x8b, 8); //aligned left (2-3)
2956
                //combining the appropriate sign change
2957
112k
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-bottom)
2958
112k
                edge1_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge1_16x8b, 8); //(3-bottom),(2-3)
2959
2960
                //manipulation for bottom -row 3
2961
112k
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 12);
2962
                //eliminating old left for row 0,1,2,3
2963
112k
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 4);
2964
                //packing row 2 n row 3
2965
112k
                src_bottom_16x8b = _mm_unpacklo_epi64(src_bottom_16x8b, src_top_16x8b);
2966
                //row 3 left
2967
112k
                signdwn1_16x8b = _mm_alignr_epi8(src_top_16x8b, signdwn1_16x8b, 15);
2968
                //loading row 3 right into left
2969
112k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_bottom_16x8b, 15);
2970
                //adding bottom and top values of row 2 and row 3
2971
112k
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, signup0_16x8b); //(3,2)
2972
                //separating +ve and -ve values. (bottom,3)
2973
112k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
2974
112k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
2975
                //to store right of row 2
2976
112k
                signdwn1_16x8b = _mm_slli_si128(src_bottom_16x8b, 8);
2977
                //creating mask 00 for +ve and -ve values and FF for zero.
2978
112k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2979
112k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2980
112k
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(bottom -3) for next iteration
2981
2982
                //storing right of row 2 into left
2983
112k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
2984
                //to store right of row 0
2985
112k
                signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
2986
                //storing right of row 1 into left
2987
112k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
2988
2989
                //adding constant 2
2990
112k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
2991
112k
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
2992
                //shuffle to get sao index
2993
112k
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
2994
112k
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
2995
                //using availability mask
2996
112k
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
2997
112k
                edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
2998
                //shuffle to get sao offset
2999
112k
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
3000
112k
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
3001
3002
                //storing right of row 0 into left
3003
112k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
3004
                //convert to 16 bit then add and then saturated pack
3005
112k
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
3006
112k
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
3007
112k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
3008
112k
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
3009
112k
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
3010
112k
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
3011
112k
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
3012
112k
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
3013
3014
112k
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
3015
112k
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_bottom_16x8b, const0_16x8b);
3016
112k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
3017
112k
                src_bottom_16x8b = _mm_unpackhi_epi8(src_bottom_16x8b, const0_16x8b);
3018
112k
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
3019
112k
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
3020
112k
                src_bottom_16x8b = _mm_add_epi16(src_bottom_16x8b, cmp_lt0_16x8b);
3021
112k
                src_bottom_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_bottom_16x8b);
3022
3023
112k
                cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
3024
112k
                cmp_lt0_16x8b = _mm_srli_si128(src_bottom_16x8b, 8);
3025
3026
112k
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
3027
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
3028
112k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
3029
                // row = 1
3030
112k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
3031
                //row = 2
3032
112k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_bottom_16x8b);
3033
                // row = 3
3034
112k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b);
3035
3036
112k
                src_temp0_16x8b = src_temp1_16x8b;
3037
112k
                signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
3038
112k
                pu1_src_cpy += (src_strd << 2);
3039
112k
                pu1_src_left_cpy += 4;
3040
112k
                pu1_src_left_str += 4;
3041
112k
            }
3042
17.1k
            ht_rem = ht & 0x2;
3043
17.1k
            if(ht_rem)
3044
4.04k
            {
3045
4.04k
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
3046
                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
3047
4.04k
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
3048
                // row = 2
3049
4.04k
                src_bottom_16x8b =  _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
3050
3051
                //row 0 -row 1
3052
4.04k
                signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 1);
3053
                //separating +ve and and -ve values.
3054
4.04k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
3055
4.04k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
3056
                //manipulation for row 1 -row 0
3057
4.04k
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 15);
3058
                //creating mask 00 for +ve and -ve values and FF for zero.
3059
4.04k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3060
4.04k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3061
                //manipulation for row 1 - row 0
3062
4.04k
                signdwn1_16x8b = _mm_alignr_epi8(src_temp0_16x8b, signdwn1_16x8b, 15);
3063
                //combining the appropriate sign change
3064
4.04k
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
3065
3066
                //row1-row0
3067
                //separating +ve and and -ve values.
3068
4.04k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
3069
4.04k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
3070
3071
                //creating mask 00 for +ve and -ve values and FF for zero.
3072
4.04k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3073
4.04k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3074
                //combining the appropriate sign chang
3075
4.04k
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
3076
                //row 1 -bottom
3077
4.04k
                signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 1);
3078
3079
4.04k
                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
3080
4.04k
                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
3081
                //row1 -bottom
3082
4.04k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
3083
4.04k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
3084
3085
                //creating mask 00 for +ve and -ve values and FF for zero.
3086
4.04k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3087
4.04k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3088
                //combining the appropriate sign change
3089
4.04k
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
3090
4.04k
                edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
3091
                //manipulation for bottom -row1
3092
4.04k
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 14);
3093
                //manipulation for bottom- row 1
3094
4.04k
                signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 15);
3095
                //adding top and down substraction
3096
4.04k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
3097
                //bottom - row 1
3098
4.04k
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
3099
4.04k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
3100
3101
                //eliminating old left for row 0,1
3102
4.04k
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
3103
4.04k
                signdwn1_16x8b = _mm_slli_si128(src_temp1_16x8b, 8);
3104
                //creating mask 00 for +ve and -ve values and FF for zero.
3105
4.04k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3106
4.04k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3107
                //for the next iteration signup0_16x8b
3108
4.04k
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1) for next
3109
3110
                //storing right of row 1 into left
3111
4.04k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
3112
                //for storing right of row 1
3113
4.04k
                signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
3114
3115
4.04k
                src_top_16x8b = src_temp1_16x8b;
3116
                //storing right of row 0 into left
3117
4.04k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
3118
3119
                //adding constant 2
3120
4.04k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
3121
3122
                //shuffle to get sao index
3123
4.04k
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
3124
                //using availability mask
3125
4.04k
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
3126
                //shuffle to get sao offset
3127
4.04k
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
3128
3129
                //the next top already in  src_top_16x8b
3130
                //cnvert to 16 bit then add and then saturated pack
3131
4.04k
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
3132
4.04k
                src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
3133
4.04k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
3134
4.04k
                src_temp1_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
3135
4.04k
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
3136
4.04k
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
3137
4.04k
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
3138
4.04k
                src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, src_temp1_16x8b);
3139
3140
4.04k
                cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
3141
3142
4.04k
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
3143
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
3144
4.04k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
3145
                // row = 1
3146
4.04k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
3147
4.04k
                src_temp0_16x8b = src_bottom_16x8b;
3148
4.04k
                pu1_src_cpy += (src_strd << 1);
3149
4.04k
                pu1_src_left_cpy += 2;
3150
4.04k
                pu1_src_left_str += 2;
3151
4.04k
            }
3152
17.1k
            ht_rem = ht & 0x1;
3153
17.1k
            if(ht_rem)
3154
4.03k
            {
3155
4.03k
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
3156
                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
3157
4.03k
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
3158
                //left store manipulation 1
3159
4.03k
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 1);
3160
                //row 0 -row1
3161
4.03k
                signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 1);
3162
                //separating +ve and and -ve values.
3163
4.03k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
3164
4.03k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
3165
                //creating mask 00 for +ve and -ve values and FF for zero.
3166
4.03k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3167
4.03k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3168
                //combining the appropriate sign change
3169
4.03k
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
3170
                //adding top and down substraction
3171
4.03k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
3172
                //for row 0 right to put into left store
3173
4.03k
                signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
3174
                //adding constant 2
3175
4.03k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
3176
4.03k
                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8);
3177
4.03k
                edge0_16x8b = _mm_srli_si128(edge0_16x8b, 8);
3178
                //filling the left boundary value
3179
4.03k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
3180
3181
                //shuffle to get sao index
3182
4.03k
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
3183
                //using availability mask
3184
4.03k
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
3185
                //shuffle to get sao offset
3186
4.03k
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
3187
4.03k
                src_top_16x8b = src_temp0_16x8b;
3188
                //cnvert to 16 bit then add and then saturated pack
3189
4.03k
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
3190
4.03k
                src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
3191
4.03k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
3192
4.03k
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
3193
4.03k
                src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, const0_16x8b);
3194
3195
4.03k
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
3196
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
3197
4.03k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
3198
4.03k
                pu1_src_cpy += (src_strd);
3199
4.03k
                pu1_src_left_cpy += 1;
3200
4.03k
                pu1_src_left_str += 1;
3201
4.03k
            }
3202
17.1k
            if(0 == pu1_avail[3])
3203
1.65k
            {
3204
1.65k
                src_top_16x8b = src_bottom_16x8b;
3205
1.65k
                pu1_src_left_str[0] = pu1_src_cpy[7];
3206
1.65k
            }
3207
3208
17.1k
            if(0 == pu1_avail[2])
3209
2.38k
            {
3210
2.38k
                pu1_src_left_str[-ht_0] = pu1_src[7 - src_strd];
3211
2.38k
            }
3212
3213
17.1k
            _mm_storel_epi64((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
3214
17.1k
            pu1_src += 8;
3215
17.1k
            au1_mask_cpy += 16;
3216
3217
17.1k
            pu1_left_tmp = pu1_src_left_cpy2;
3218
17.1k
            pu1_src_left_cpy2 = pu1_src_left_str2;
3219
17.1k
            pu1_src_left_str2 = pu1_left_tmp;
3220
3221
17.1k
            pu1_src_left_cpy = pu1_src_left_cpy2;
3222
17.1k
            pu1_src_left_str = pu1_src_left_str2;
3223
17.1k
        }
3224
17.5k
        pu1_src_org[0] = u1_pos_0_0_tmp;
3225
17.5k
        pu1_src_org[wd - 1 + (ht_tmp - 1) * src_strd] = u1_pos_wd_ht_tmp;
3226
17.5k
        pu1_src_left_cpy = (0 == pu1_avail[2]) ? (pu1_src_left_cpy - 1) : pu1_src_left_cpy;
3227
494k
        for(row = 0; row < ht_tmp; row++)
3228
476k
        {
3229
476k
            pu1_src_left[row] = pu1_src_left_cpy[row];
3230
476k
        }
3231
17.5k
    }
3232
3233
17.5k
}
3234
3235
/* 135 degree filtering */
3236
void ihevc_sao_edge_offset_class2_chroma_ssse3(UWORD8 *pu1_src,
3237
                                               WORD32 src_strd,
3238
                                               UWORD8 *pu1_src_left,
3239
                                               UWORD8 *pu1_src_top,
3240
                                               UWORD8 *pu1_src_top_left,
3241
                                               UWORD8 *pu1_src_top_right,
3242
                                               UWORD8 *pu1_src_bot_left,
3243
                                               UWORD8 *pu1_avail,
3244
                                               WORD8 *pi1_sao_offset_u,
3245
                                               WORD8 *pi1_sao_offset_v,
3246
                                               WORD32 wd,
3247
                                               WORD32 ht)
3248
9.48k
{
3249
9.48k
    WORD32 row, col;
3250
9.48k
    UWORD8 *pu1_src_top_cpy, *pu1_src_left_cpy, *pu1_src_left_cpy2;
3251
9.48k
    UWORD8 *pu1_left_tmp, *pu1_src_left_str, *pu1_src_left_str2;
3252
9.48k
    UWORD8 *pu1_firstleft;
3253
9.48k
    UWORD8 *pu1_src_cpy, *pu1_src_org;
3254
9.48k
    UWORD8 au1_mask[MAX_CTB_SIZE], *au1_mask_cpy;
3255
9.48k
    UWORD8 au1_src_left_tmp[2 * (MAX_CTB_SIZE + 8)];
3256
9.48k
    UWORD8 au1_src_left_tmp1[2 * (MAX_CTB_SIZE + 8)];
3257
9.48k
    WORD32 wd_rem;
3258
9.48k
    UWORD8 u1_pos_0_0_tmp_u, u1_pos_0_0_tmp_v, u1_pos_wd_ht_tmp_u, u1_pos_wd_ht_tmp_v;
3259
9.48k
    WORD32 ht_tmp;
3260
9.48k
    WORD32 ht_0;
3261
3262
9.48k
    WORD32 bit_depth;
3263
9.48k
    UWORD8 u1_avail0, u1_avail1;
3264
3265
9.48k
    __m128i src_temp0_16x8b, src_temp1_16x8b;
3266
9.48k
    __m128i signup0_16x8b, signdwn1_16x8b;
3267
9.48k
    __m128i cmp_gt0_16x8b, cmp_lt0_16x8b;
3268
9.48k
    __m128i edge0_16x8b, edge1_16x8b;
3269
9.48k
    __m128i src_top_16x8b, src_bottom_16x8b;
3270
9.48k
    __m128i au1_mask8x16b;
3271
9.48k
    __m128i edge_idx_8x16b, sao_offset_8x16b;
3272
9.48k
    __m128i const2_16x8b, const0_16x8b;
3273
9.48k
    __m128i left_store_16x8b;
3274
9.48k
    __m128i chroma_offset_8x16b;
3275
3276
9.48k
    UNUSED(pu1_src_top_right);
3277
9.48k
    UNUSED(pu1_src_bot_left);
3278
3279
9.48k
    ht_0 = ht; ht_tmp = ht;
3280
9.48k
    au1_mask8x16b = _mm_set1_epi8(0xff);
3281
    /* Updating left and top-left  */
3282
309k
    for(row = 0; row < 2 * ht; row++)
3283
300k
    {
3284
300k
        au1_src_left_tmp[row] = pu1_src_left[row];
3285
300k
    }
3286
    //setting availability mask to ff size MAX_CTB_SIZE
3287
47.4k
    for(col = 0; col < MAX_CTB_SIZE; col += 16)
3288
37.9k
        _mm_storeu_si128((__m128i *)(au1_mask + col), au1_mask8x16b);
3289
9.48k
    bit_depth = BIT_DEPTH_LUMA;
3290
9.48k
    pu1_src_org = pu1_src;
3291
9.48k
    pu1_src_top_cpy = pu1_src_top;
3292
9.48k
    pu1_src_left_cpy2 = au1_src_left_tmp;
3293
9.48k
    pu1_src_left_cpy = au1_src_left_tmp;
3294
9.48k
    pu1_src_left_str2 = au1_src_left_tmp1;
3295
9.48k
    pu1_src_left_str = au1_src_left_tmp1;
3296
9.48k
    edge_idx_8x16b   = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
3297
9.48k
    sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_u);
3298
9.48k
    const0_16x8b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_v);
3299
9.48k
    chroma_offset_8x16b = _mm_set1_epi16(0x0800);
3300
3301
    /* If top-left is available, process separately */
3302
9.48k
    if(0 != pu1_avail[4])
3303
7.60k
    {
3304
7.60k
        WORD32 edge_idx;
3305
3306
        /* U */
3307
7.60k
        edge_idx = 2 + SIGN(pu1_src[0] - pu1_src_top_left[0]) +
3308
7.60k
                        SIGN(pu1_src[0] - pu1_src[2 + src_strd]);
3309
3310
7.60k
        edge_idx = gi1_table_edge_idx[edge_idx];
3311
3312
7.60k
        if(0 != edge_idx)
3313
2.20k
        {
3314
2.20k
            u1_pos_0_0_tmp_u = CLIP3(pu1_src[0] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1);
3315
2.20k
        }
3316
5.40k
        else
3317
5.40k
        {
3318
5.40k
            u1_pos_0_0_tmp_u = pu1_src[0];
3319
5.40k
        }
3320
3321
        /* V */
3322
7.60k
        edge_idx = 2 + SIGN(pu1_src[1] - pu1_src_top_left[1]) +
3323
7.60k
                        SIGN(pu1_src[1] - pu1_src[1 + 2 + src_strd]);
3324
3325
7.60k
        edge_idx = gi1_table_edge_idx[edge_idx];
3326
3327
7.60k
        if(0 != edge_idx)
3328
2.34k
        {
3329
2.34k
            u1_pos_0_0_tmp_v = CLIP3(pu1_src[1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1);
3330
2.34k
        }
3331
5.26k
        else
3332
5.26k
        {
3333
5.26k
            u1_pos_0_0_tmp_v = pu1_src[1];
3334
5.26k
        }
3335
7.60k
    }
3336
1.87k
    else
3337
1.87k
    {
3338
1.87k
        u1_pos_0_0_tmp_u = pu1_src[0];
3339
1.87k
        u1_pos_0_0_tmp_v = pu1_src[1];
3340
1.87k
    }
3341
3342
    /* If bottom-right is available, process separately */
3343
9.48k
    if(0 != pu1_avail[7])
3344
8.42k
    {
3345
8.42k
        WORD32 edge_idx;
3346
3347
        /* U */
3348
8.42k
        edge_idx = 2 + SIGN(pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 2 + (ht - 1) * src_strd - 2 - src_strd]) +
3349
8.42k
                        SIGN(pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 2 + (ht - 1) * src_strd + 2 + src_strd]);
3350
3351
8.42k
        edge_idx = gi1_table_edge_idx[edge_idx];
3352
3353
8.42k
        if(0 != edge_idx)
3354
2.76k
        {
3355
2.76k
            u1_pos_wd_ht_tmp_u = CLIP3(pu1_src[wd - 2 + (ht - 1) * src_strd] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1);
3356
2.76k
        }
3357
5.66k
        else
3358
5.66k
        {
3359
5.66k
            u1_pos_wd_ht_tmp_u = pu1_src[wd - 2 + (ht - 1) * src_strd];
3360
5.66k
        }
3361
3362
        /* V */
3363
8.42k
        edge_idx = 2 + SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd - 2 - src_strd]) +
3364
8.42k
                        SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 2 + src_strd]);
3365
3366
8.42k
        edge_idx = gi1_table_edge_idx[edge_idx];
3367
3368
8.42k
        if(0 != edge_idx)
3369
2.80k
        {
3370
2.80k
            u1_pos_wd_ht_tmp_v = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1);
3371
2.80k
        }
3372
5.61k
        else
3373
5.61k
        {
3374
5.61k
            u1_pos_wd_ht_tmp_v = pu1_src[wd - 1 + (ht - 1) * src_strd];
3375
5.61k
        }
3376
8.42k
    }
3377
1.06k
    else
3378
1.06k
    {
3379
1.06k
        u1_pos_wd_ht_tmp_u = pu1_src[wd - 2 + (ht - 1) * src_strd];
3380
1.06k
        u1_pos_wd_ht_tmp_v = pu1_src[wd - 1 + (ht - 1) * src_strd];
3381
1.06k
    }
3382
9.48k
    pu1_firstleft = pu1_src_top_left;
3383
3384
    /* Update height and source pointers based on the availability flags */
3385
9.48k
    if(0 == pu1_avail[2])
3386
1.08k
    {
3387
1.08k
        pu1_firstleft = pu1_src_left_cpy2;
3388
1.08k
        pu1_src_left_cpy2 += 2;
3389
1.08k
        pu1_src_left_str2 += 2;
3390
1.08k
        pu1_src_top_cpy = pu1_src;
3391
1.08k
        pu1_src += src_strd;
3392
1.08k
        ht--;
3393
1.08k
    }
3394
9.48k
    if(0 == pu1_avail[3])
3395
461
    {
3396
461
        ht--;
3397
461
        ht_0--;
3398
461
    }
3399
    //storing top left in a mmx register
3400
9.48k
    left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_firstleft);
3401
9.48k
    sao_offset_8x16b = _mm_unpacklo_epi64(sao_offset_8x16b, const0_16x8b);
3402
9.48k
    const2_16x8b = _mm_set1_epi8(2);
3403
9.48k
    const0_16x8b = _mm_setzero_si128();
3404
9.48k
    left_store_16x8b = _mm_slli_si128(left_store_16x8b, 14);
3405
3406
    //availability mask creation
3407
9.48k
    u1_avail0 = pu1_avail[0];
3408
9.48k
    u1_avail1 = pu1_avail[1];
3409
9.48k
    au1_mask[0] = u1_avail0;
3410
9.48k
    au1_mask[1] = u1_avail0;
3411
9.48k
    au1_mask[wd - 1] = u1_avail1;
3412
9.48k
    au1_mask[wd - 2] = u1_avail1;
3413
3414
    /* top-left arrays */
3415
9.48k
    pu1_src_top_left[0] = pu1_src_top[wd - 2];
3416
9.48k
    pu1_src_top_left[1] = pu1_src_top[wd - 1];
3417
9.48k
    {
3418
9.48k
        WORD32 ht_rem;
3419
9.48k
        au1_mask_cpy = au1_mask;
3420
3421
9.48k
        pu1_src_left_cpy = pu1_src_left_cpy2;
3422
9.48k
        pu1_src_left_str = pu1_src_left_str2;
3423
28.7k
        for(col = wd; col >= 16; col -= 16)
3424
19.2k
        {
3425
19.2k
            pu1_src_cpy = pu1_src;
3426
19.2k
            src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
3427
            //row = 0
3428
19.2k
            src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
3429
19.2k
            src_top_16x8b = _mm_alignr_epi8(src_top_16x8b, left_store_16x8b, 14);
3430
            //loading the mask
3431
19.2k
            au1_mask8x16b = _mm_loadu_si128((__m128i *)au1_mask_cpy);
3432
            //separating +ve and and -ve values.
3433
19.2k
            cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
3434
19.2k
            cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
3435
            //creating mask 00 for +ve and -ve values and FF for zero.
3436
19.2k
            cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3437
19.2k
            cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3438
            //combining the appropriate sign change
3439
19.2k
            signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
3440
3441
3442
169k
            for(row = ht; row >= 2; row -= 2)
3443
150k
            {
3444
150k
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
3445
                //row = 1
3446
150k
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
3447
                // row = 1 right
3448
150k
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 2));
3449
                //to insert left in row 0
3450
150k
                signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 14);
3451
                //row 0 -row1
3452
                //separating +ve and and -ve values.
3453
150k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
3454
150k
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
3455
3456
                //creating mask 00 for +ve and -ve values and FF for zero.
3457
150k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3458
150k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3459
                //manipulation for row 1 - row 0
3460
150k
                signdwn1_16x8b = _mm_alignr_epi8(src_temp0_16x8b, signdwn1_16x8b, 14);
3461
                //combining the appropriate sign change
3462
150k
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(0-1)
3463
                //row1-row0
3464
                //separating +ve and and -ve values.
3465
150k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
3466
150k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
3467
                //creating mask 00 for +ve and -ve values and FF for zero.
3468
150k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3469
150k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3470
                 // row = 2 right
3471
150k
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd + 2));
3472
150k
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-0)
3473
3474
3475
                //row1 -bottom
3476
150k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
3477
150k
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
3478
                //creating mask 00 for +ve and -ve values and FF for zero.
3479
150k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3480
150k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3481
                //combining the appropriate sign change
3482
150k
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
3483
                // row = 2
3484
150k
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
3485
3486
                //combining sign-left and sign_right
3487
150k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
3488
3489
                //storing the row 1 left for next row.
3490
150k
                signup0_16x8b = _mm_slli_si128(left_store_16x8b, 12);
3491
3492
                //combining sign-left and sign_right
3493
150k
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, signdwn1_16x8b);
3494
                //manipulation for bottom - row 1
3495
150k
                signup0_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signup0_16x8b, 14);
3496
                //eliminating old left for row 0 and row 1
3497
150k
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 4);
3498
                //bottom - row1
3499
150k
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signup0_16x8b);
3500
150k
                cmp_lt0_16x8b = _mm_subs_epu8(signup0_16x8b, src_bottom_16x8b);
3501
                //creating mask 00 for +ve and -ve values and FF for zero.
3502
150k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3503
150k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3504
                //for the next iteration bottom -row1
3505
150k
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
3506
                //row1  getting it right for left of next iteration
3507
150k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp1_16x8b, 14);
3508
                //copying the next top
3509
150k
                src_top_16x8b = src_temp1_16x8b;
3510
                //row0  getting its right for left of next iteration.
3511
150k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
3512
3513
3514
                //adding constant 2
3515
150k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
3516
150k
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
3517
                //shuffle to get sao index
3518
150k
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
3519
150k
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
3520
                //using availability mask
3521
150k
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
3522
150k
                edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
3523
                //adding chroma offset to access U and V
3524
150k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
3525
150k
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);
3526
3527
3528
                //shuffle to get sao offset
3529
150k
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
3530
150k
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
3531
                //cnvert to 16 bit then add and then saturated pack
3532
150k
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
3533
150k
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
3534
150k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
3535
150k
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
3536
150k
                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
3537
150k
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
3538
150k
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
3539
150k
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
3540
3541
150k
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
3542
150k
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
3543
150k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
3544
150k
                src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
3545
150k
                edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
3546
150k
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
3547
150k
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge1_16x8b);
3548
150k
                src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
3549
3550
                //store left boundary
3551
150k
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
3552
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
3553
150k
                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
3554
                // row = 1
3555
150k
                _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);
3556
3557
150k
                src_temp0_16x8b = src_bottom_16x8b;
3558
150k
                pu1_src_cpy += (src_strd << 1);
3559
150k
                pu1_src_left_cpy += 4;
3560
150k
                pu1_src_left_str += 4;
3561
150k
            }
3562
19.2k
            ht_rem = ht & 0x1;
3563
3564
19.2k
            if(ht_rem)
3565
2.93k
            {
3566
2.93k
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
3567
2.93k
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 2));
3568
                //current row -next row
3569
                //separating +ve and -ve values.
3570
2.93k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
3571
2.93k
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
3572
                //creating mask 00 for +ve and -ve values and FF for zero.
3573
2.93k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3574
2.93k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3575
                //combining the appropriate sign change
3576
2.93k
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
3577
                //adding top and bottom and constant 2
3578
2.93k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
3579
2.93k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
3580
3581
                //eliminating old left for row 0 and row 1
3582
2.93k
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
3583
                //copying the next top
3584
2.93k
                src_top_16x8b = src_temp0_16x8b;
3585
                //row0  getting it right for left of next block
3586
2.93k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
3587
3588
2.93k
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
3589
                //using availability mask
3590
2.93k
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
3591
                //adding chroma offset to access U and V
3592
2.93k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
3593
3594
2.93k
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
3595
3596
                //convert to 16 bit then add and then saturated pack
3597
2.93k
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
3598
2.93k
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
3599
2.93k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
3600
2.93k
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
3601
2.93k
                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
3602
2.93k
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
3603
2.93k
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
3604
2.93k
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
3605
3606
2.93k
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
3607
3608
2.93k
                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
3609
2.93k
                pu1_src_cpy += (src_strd);
3610
2.93k
                pu1_src_left_cpy += 2;
3611
2.93k
                pu1_src_left_str += 2;
3612
2.93k
            }
3613
19.2k
            if(0 == pu1_avail[3])
3614
863
            {
3615
863
                src_top_16x8b = src_bottom_16x8b;
3616
863
                pu1_src_left_str[1] = pu1_src_cpy[15];
3617
863
                pu1_src_left_str[0] = pu1_src_cpy[14];
3618
863
            }
3619
19.2k
            if(0 == pu1_avail[2])
3620
2.11k
            {
3621
2.11k
                pu1_src_left_str[-2 * ht_0] = pu1_src[14 - src_strd];
3622
2.11k
                pu1_src_left_str[-2 * ht_0 + 1] = pu1_src[15 - src_strd];
3623
2.11k
            }
3624
3625
            //for the top left of next part of the block
3626
19.2k
            left_store_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
3627
            //updating top flag
3628
19.2k
            _mm_storeu_si128((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
3629
19.2k
            pu1_src += 16;
3630
19.2k
            au1_mask_cpy += 16;
3631
3632
19.2k
            pu1_left_tmp = pu1_src_left_cpy2;
3633
19.2k
            pu1_src_left_cpy2 = pu1_src_left_str2;
3634
19.2k
            pu1_src_left_str2 = pu1_left_tmp;
3635
3636
19.2k
            pu1_src_left_cpy = pu1_src_left_cpy2;
3637
19.2k
            pu1_src_left_str = pu1_src_left_str2;
3638
19.2k
        }
3639
9.48k
        wd_rem = wd & 0xF;
3640
9.48k
        if(wd_rem)
3641
96
        {
3642
96
            pu1_src_left_cpy = pu1_src_left_cpy2;
3643
96
            pu1_src_left_str = pu1_src_left_str2;
3644
96
            pu1_src_cpy = pu1_src;
3645
96
            src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_top_cpy + wd - col));
3646
            //row = 0
3647
96
            src_temp0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
3648
96
            src_top_16x8b = _mm_alignr_epi8(src_top_16x8b, left_store_16x8b, 14);
3649
96
            au1_mask8x16b = _mm_loadl_epi64((__m128i *)au1_mask_cpy); //????
3650
            //separating +ve and and -ve values.
3651
96
            cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
3652
96
            cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
3653
            //creating mask 00 for +ve and -ve values and FF for zero.
3654
96
            cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3655
96
            cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3656
            //preparing au1_mask
3657
96
            au1_mask8x16b = _mm_unpacklo_epi64(au1_mask8x16b, au1_mask8x16b);
3658
            //combining the appropriate sign change
3659
96
            signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
3660
96
            signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
3661
3662
369
            for(row = ht; row >= 4; row -= 4)
3663
273
            {
3664
273
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
3665
                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
3666
273
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
3667
                // row = 2
3668
273
                src_bottom_16x8b =  _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
3669
                //right row1
3670
273
                signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);
3671
                //row 0 -row1
3672
                //separating +ve and and -ve values.
3673
273
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
3674
273
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
3675
                //manipulation for row 1 -row 0
3676
273
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 14);
3677
                //creating mask 00 for +ve and -ve values and FF for zero.
3678
273
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3679
273
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3680
                //row 0 left
3681
273
                signdwn1_16x8b = _mm_alignr_epi8(src_temp0_16x8b, signdwn1_16x8b, 14);
3682
                //combining the appropriate sign change
3683
273
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
3684
                //row 1 -row0
3685
                //separating +ve and and -ve values.
3686
273
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
3687
273
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
3688
3689
                //creating mask 00 for +ve and -ve values and FF for zero.
3690
273
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3691
273
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3692
                //row1-row0
3693
273
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
3694
3695
273
                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
3696
3697
273
                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
3698
                //right row2
3699
273
                signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 2);
3700
                //packing row 0 n row 1
3701
273
                src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, src_temp1_16x8b);
3702
                //row1 -row2
3703
273
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
3704
273
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
3705
                //creating mask 00 for +ve and -ve values and FF for zero.
3706
273
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3707
273
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3708
                //combining the appropriate sign change
3709
273
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
3710
273
                edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
3711
                //manipulation for row 2 -row 1
3712
273
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 12);
3713
                //row 1 left
3714
273
                signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 14);
3715
                //row = 3
3716
273
                src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 3 * src_strd));
3717
3718
                // row = 4
3719
273
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 4 * src_strd));
3720
3721
273
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
3722
3723
                //separating +ve and and -ve values.(2,1)
3724
273
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
3725
273
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
3726
                //manipulation for row 3 -row 2
3727
273
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 10);
3728
                //creating mask 00 for +ve and -ve values and FF for zero.
3729
273
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3730
273
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3731
                //row 2 left
3732
273
                signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 14);
3733
                //combining the appropriate sign change
3734
273
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1)
3735
3736
                //separating +ve and and -ve values.(3,2)
3737
273
                cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
3738
273
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
3739
273
                signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8); //aligned left (2-1)
3740
                //creating mask 00 for +ve and -ve values and FF for zero.
3741
273
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3742
273
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3743
                //right row3
3744
273
                signdwn1_16x8b = _mm_srli_si128(src_top_16x8b, 2);
3745
                //combining the appropriate sign change
3746
273
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-2)
3747
3748
273
                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(3-2) ,(2-1)
3749
3750
                //separating +ve and and -ve values.(2,3)
3751
273
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
3752
273
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
3753
                //right row 4
3754
273
                signdwn1_16x8b =  _mm_srli_si128(src_temp1_16x8b, 2);
3755
                //creating mask 00 for +ve and -ve values and FF for zero.
3756
273
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3757
273
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3758
                //combining the appropriate sign change
3759
273
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-3)
3760
3761
                //separating +ve and and -ve values.(3,bottom)
3762
273
                cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
3763
273
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
3764
3765
                //creating mask 00 for +ve and -ve values and FF for zero.
3766
273
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3767
273
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3768
273
                edge1_16x8b = _mm_slli_si128(edge1_16x8b, 8); //aligned left (2-3)
3769
                //combining the appropriate sign change
3770
273
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-bottom)
3771
273
                edge1_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge1_16x8b, 8); //(3-bottom),(2-3)
3772
3773
                //manipulation for bottom -row 3
3774
273
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 8);
3775
                //eliminating old left for row 0,1,2,3
3776
273
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 8);
3777
                //packing row 2 n row 3
3778
273
                src_bottom_16x8b = _mm_unpacklo_epi64(src_bottom_16x8b, src_top_16x8b);
3779
                //row 3 left
3780
273
                signdwn1_16x8b = _mm_alignr_epi8(src_top_16x8b, signdwn1_16x8b, 14);
3781
3782
                //adding bottom and top values of row 2 and row 3
3783
273
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, signup0_16x8b); //(3,2)
3784
                //separating +ve and -ve values.(bottom,3)
3785
273
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
3786
273
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
3787
3788
                //creating mask 00 for +ve and -ve values and FF for zero.
3789
273
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3790
273
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3791
273
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(bottom -3) for next iteration
3792
3793
                //to store right of row 2
3794
273
                signdwn1_16x8b = _mm_slli_si128(src_bottom_16x8b, 8);
3795
                //loading row 3 right into left
3796
273
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_bottom_16x8b, 14);
3797
                //storing right of row 2 into left
3798
273
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
3799
                //to store right of row 0
3800
273
                signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
3801
                //storing right of row 1 into left
3802
273
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
3803
                //storing right of row 0 into left
3804
273
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
3805
3806
                //adding constant 2
3807
273
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
3808
273
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
3809
                //shuffle to get sao index
3810
273
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
3811
273
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
3812
                //using availability mask
3813
273
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
3814
273
                edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
3815
3816
                //adding chroma offset to access U and V
3817
273
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
3818
273
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);
3819
3820
                //shuffle to get sao offset
3821
273
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
3822
273
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
3823
                //cnvert to 16 bit then add and then saturated pack
3824
273
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
3825
273
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
3826
273
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
3827
273
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
3828
273
                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
3829
273
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
3830
273
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
3831
273
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
3832
3833
273
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
3834
273
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_bottom_16x8b, const0_16x8b);
3835
273
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
3836
273
                src_bottom_16x8b = _mm_unpackhi_epi8(src_bottom_16x8b, const0_16x8b);
3837
273
                edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
3838
273
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
3839
273
                src_bottom_16x8b = _mm_add_epi16(src_bottom_16x8b, edge1_16x8b);
3840
273
                src_bottom_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_bottom_16x8b);
3841
3842
273
                cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
3843
273
                cmp_lt0_16x8b = _mm_srli_si128(src_bottom_16x8b, 8);
3844
3845
3846
273
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
3847
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
3848
273
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
3849
                // row = 1
3850
273
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
3851
                //row = 2
3852
273
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_bottom_16x8b);
3853
                // row = 3
3854
273
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b);
3855
3856
273
                src_temp0_16x8b = src_temp1_16x8b;
3857
273
                signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
3858
273
                pu1_src_cpy += (src_strd << 2);
3859
273
                pu1_src_left_cpy += 8;
3860
273
                pu1_src_left_str += 8;
3861
273
            }
3862
96
            ht_rem = ht & 0x2;
3863
96
            if(ht_rem)
3864
22
            {
3865
22
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
3866
                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
3867
22
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
3868
                // row = 2
3869
22
                src_bottom_16x8b =  _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
3870
3871
                //row 0 -row 1
3872
22
                signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);
3873
                //separating +ve and and -ve values.
3874
22
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
3875
22
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
3876
                //manipulation for row 1 -row 0
3877
22
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 14);
3878
                //creating mask 00 for +ve and -ve values and FF for zero.
3879
22
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3880
22
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3881
                //manipulation for row 1 - row 0
3882
22
                signdwn1_16x8b = _mm_alignr_epi8(src_temp0_16x8b, signdwn1_16x8b, 14);
3883
                //combining the appropriate sign change
3884
22
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
3885
3886
                //row1-row0
3887
                //separating +ve and and -ve values.
3888
22
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
3889
22
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
3890
3891
                //creating mask 00 for +ve and -ve values and FF for zero.
3892
22
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3893
22
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3894
                //combining the appropriate sign change
3895
22
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
3896
                //row 1 -bottom
3897
22
                signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 2);
3898
3899
22
                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
3900
22
                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
3901
                //row1 -bottom
3902
22
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
3903
22
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
3904
3905
                //creating mask 00 for +ve and -ve values and FF for zero.
3906
22
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3907
22
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3908
                //combining the appropriate sign change
3909
22
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
3910
22
                edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
3911
                //manipulation for bottom -row1
3912
22
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 12);
3913
                //eliminating old left for row 0,1
3914
22
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 4);
3915
                //manipulation for bottom- row 1
3916
22
                signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 14);
3917
                //adding top and down subtraction
3918
22
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
3919
                //bottom - row 1
3920
22
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
3921
22
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
3922
3923
                //shifting row 1
3924
22
                signdwn1_16x8b = _mm_slli_si128(src_temp1_16x8b, 8);
3925
                //creating mask 00 for +ve and -ve values and FF for zero.
3926
22
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3927
22
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3928
                //for the next iteration signup0_16x8b
3929
22
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1) for next
3930
                //storing right of row 1 into left
3931
22
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14); //for storing right of row 0
3932
22
                signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
3933
                //the next top  in  src_top_16x8b
3934
22
                src_top_16x8b = src_temp1_16x8b;
3935
                //storing right of row 0 into left
3936
22
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
3937
3938
3939
                //adding constant 2
3940
22
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
3941
3942
                //shuffle to get sao index
3943
22
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
3944
                //using availability mask
3945
22
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
3946
3947
                //adding chroma offset to access U and V
3948
22
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
3949
3950
                //shuffle to get sao offset
3951
22
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
3952
                //the next top already in  src_top_16x8b
3953
                //convert to 16 bit then add and then saturated pack
3954
22
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
3955
22
                src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
3956
22
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
3957
22
                src_temp1_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
3958
22
                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
3959
22
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
3960
22
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge0_16x8b);
3961
22
                src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, src_temp1_16x8b);
3962
3963
22
                cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
3964
3965
22
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
3966
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
3967
22
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
3968
                // row = 1
3969
22
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
3970
22
                src_temp0_16x8b = src_bottom_16x8b;
3971
22
                pu1_src_cpy += (src_strd << 1);
3972
22
                pu1_src_left_cpy += 4;
3973
22
                pu1_src_left_str += 4;
3974
22
            }
3975
96
            ht_rem = ht & 0x1;
3976
96
            if(ht_rem)
3977
17
            {
3978
17
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
3979
                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
3980
17
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
3981
3982
                //row 0 -row1
3983
17
                signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 2);
3984
                //separating +ve and and -ve values.
3985
17
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
3986
17
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
3987
                //creating mask 00 for +ve and -ve values and FF for zero.
3988
17
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3989
17
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3990
                //combining the appropriate sign change
3991
17
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
3992
                //adding top and down subtraction
3993
17
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
3994
3995
                //for row 0 right to put into left store
3996
17
                signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
3997
                //left store manipulation 1
3998
17
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
3999
17
                src_top_16x8b = src_temp0_16x8b;
4000
                //filling the left boundary value
4001
17
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
4002
4003
                //adding constant 2
4004
17
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
4005
17
                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8);
4006
17
                edge0_16x8b = _mm_srli_si128(edge0_16x8b, 8);
4007
4008
4009
                //shuffle to get sao index
4010
17
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
4011
                //using availability mask
4012
17
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
4013
                //adding chroma offset to access U and V
4014
17
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
4015
4016
                //shuffle to get sao offset
4017
17
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
4018
4019
                //cnvert to 16 bit then add and then saturated pack
4020
17
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
4021
17
                src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
4022
17
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
4023
17
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
4024
17
                src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, const0_16x8b);
4025
4026
17
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
4027
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
4028
17
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
4029
17
                pu1_src_cpy += (src_strd);
4030
17
                pu1_src_left_cpy += 2;
4031
17
                pu1_src_left_str += 2;
4032
17
            }
4033
96
            if(0 == pu1_avail[3])
4034
11
            {
4035
11
                src_top_16x8b = src_bottom_16x8b;
4036
11
                pu1_src_left_str[1] = pu1_src_cpy[7];
4037
11
                pu1_src_left_str[0] = pu1_src_cpy[6];
4038
11
            }
4039
4040
96
            if(0 == pu1_avail[2])
4041
16
            {
4042
16
                pu1_src_left_str[-2 * ht_0] = pu1_src[6 - src_strd];
4043
16
                pu1_src_left_str[-2 * ht_0 + 1] = pu1_src[7 - src_strd];
4044
16
            }
4045
4046
96
            _mm_storel_epi64((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
4047
96
            pu1_src += 8;
4048
4049
96
            pu1_left_tmp = pu1_src_left_cpy2;
4050
96
            pu1_src_left_cpy2 = pu1_src_left_str2;
4051
96
            pu1_src_left_str2 = pu1_left_tmp;
4052
4053
96
            pu1_src_left_cpy = pu1_src_left_cpy2;
4054
96
            pu1_src_left_str = pu1_src_left_str2;
4055
96
        }
4056
9.48k
        pu1_src_org[0] = u1_pos_0_0_tmp_u;
4057
9.48k
        pu1_src_org[1] = u1_pos_0_0_tmp_v;
4058
9.48k
        pu1_src_org[wd - 2 + (ht_tmp - 1) * src_strd] = u1_pos_wd_ht_tmp_u;
4059
9.48k
        pu1_src_org[wd - 1 + (ht_tmp - 1) * src_strd] = u1_pos_wd_ht_tmp_v;
4060
9.48k
        pu1_src_left_cpy = (0 == pu1_avail[2]) ? (pu1_src_left_cpy - 2) : pu1_src_left_cpy;
4061
309k
        for(row = 0; row < 2 * ht_tmp; row++)
4062
300k
        {
4063
300k
            pu1_src_left[row] = pu1_src_left_cpy[row];
4064
300k
        }
4065
9.48k
    }
4066
4067
9.48k
}
4068
4069
void ihevc_sao_edge_offset_class3_ssse3(UWORD8 *pu1_src,
4070
                                        WORD32 src_strd,
4071
                                        UWORD8 *pu1_src_left,
4072
                                        UWORD8 *pu1_src_top,
4073
                                        UWORD8 *pu1_src_top_left,
4074
                                        UWORD8 *pu1_src_top_right,
4075
                                        UWORD8 *pu1_src_bot_left,
4076
                                        UWORD8 *pu1_avail,
4077
                                        WORD8 *pi1_sao_offset,
4078
                                        WORD32 wd,
4079
                                        WORD32 ht)
4080
12.8k
{
4081
12.8k
    WORD32 row, col;
4082
12.8k
    UWORD8 *pu1_src_top_cpy, *pu1_src_left_cpy, *pu1_src_left_cpy2;
4083
12.8k
    UWORD8 *pu1_left_tmp, *pu1_src_left_str, *pu1_src_left_str2;
4084
12.8k
    UWORD8 *pu1_src_cpy, *pu1_src_org;
4085
12.8k
    UWORD8 au1_src_left_tmp[MAX_CTB_SIZE + 8];
4086
12.8k
    UWORD8 au1_src_left_tmp1[MAX_CTB_SIZE + 8];
4087
12.8k
    UWORD8 au1_mask[MAX_CTB_SIZE], *au1_mask_cpy;
4088
12.8k
    WORD32 wd_rem;
4089
12.8k
    UWORD8 u1_pos_wd_0_tmp, u1_pos_0_ht_tmp;
4090
12.8k
    WORD32 ht_tmp;
4091
12.8k
    WORD32 bit_depth;
4092
12.8k
    UWORD8 u1_avail0, u1_avail1;
4093
4094
12.8k
    __m128i src_top_16x8b, src_bottom_16x8b;
4095
12.8k
    __m128i src_temp0_16x8b, src_temp1_16x8b;
4096
12.8k
    __m128i signup0_16x8b, signdwn1_16x8b;
4097
12.8k
    __m128i cmp_gt0_16x8b, cmp_lt0_16x8b;
4098
12.8k
    __m128i edge0_16x8b, edge1_16x8b;
4099
12.8k
    __m128i au1_mask8x16b;
4100
12.8k
    __m128i edge_idx_8x16b, sao_offset_8x16b;
4101
12.8k
    __m128i const2_16x8b, const0_16x8b;
4102
12.8k
    __m128i left_store_16x8b;
4103
4104
12.8k
    ht_tmp = ht;
4105
12.8k
    au1_mask8x16b = _mm_set1_epi8(0xff);
4106
4107
12.8k
    au1_src_left_tmp[0] = pu1_src[(wd - 1)];
4108
    //manipulation for bottom left
4109
348k
    for(row = 1; row < ht; row++)
4110
335k
    {
4111
335k
        au1_src_left_tmp[row] = pu1_src_left[row];
4112
335k
    }
4113
12.8k
    au1_src_left_tmp[ht] = pu1_src_bot_left[0];
4114
4115
12.8k
    *pu1_src_top_left = pu1_src_top[wd - 1];
4116
    //set the availability mask to 0xFF for all MAX_CTB_SIZE entries
4117
64.4k
    for(col = 0; col < MAX_CTB_SIZE; col += 16)
4118
51.5k
        _mm_storeu_si128((__m128i *)(au1_mask + col), au1_mask8x16b);
4119
12.8k
    bit_depth = BIT_DEPTH_LUMA;
4120
12.8k
    pu1_src_org = pu1_src;
4121
12.8k
    pu1_src_top_cpy = pu1_src_top;
4122
12.8k
    pu1_src_left_cpy2 = au1_src_left_tmp;
4123
12.8k
    pu1_src_left_cpy = au1_src_left_tmp;
4124
12.8k
    pu1_src_left_str2 = au1_src_left_tmp1;
4125
12.8k
    pu1_src_left_str = au1_src_left_tmp1;
4126
12.8k
    edge_idx_8x16b   = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
4127
12.8k
    sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset);
4128
4129
    /* If top-right is available, process separately */
4130
12.8k
    if(0 != pu1_avail[5])
4131
10.0k
    {
4132
10.0k
        WORD32 edge_idx;
4133
4134
10.0k
        edge_idx = 2 + SIGN(pu1_src[wd - 1] - pu1_src_top_right[0]) +
4135
10.0k
                        SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 1 + src_strd]);
4136
4137
10.0k
        edge_idx = gi1_table_edge_idx[edge_idx];
4138
4139
10.0k
        if(0 != edge_idx)
4140
2.85k
        {
4141
2.85k
            u1_pos_wd_0_tmp = CLIP3(pu1_src[wd - 1] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
4142
2.85k
        }
4143
7.18k
        else
4144
7.18k
        {
4145
7.18k
            u1_pos_wd_0_tmp = pu1_src[wd - 1];
4146
7.18k
        }
4147
10.0k
    }
4148
2.84k
    else
4149
2.84k
    {
4150
2.84k
        u1_pos_wd_0_tmp = pu1_src[wd - 1];
4151
2.84k
    }
4152
4153
    /* If bottom-left is available, process separately */
4154
12.8k
    if(0 != pu1_avail[6])
4155
10.5k
    {
4156
10.5k
        WORD32 edge_idx;
4157
4158
10.5k
        edge_idx = 2 + SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd + 1 - src_strd]) +
4159
10.5k
                        SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0]);
4160
4161
10.5k
        edge_idx = gi1_table_edge_idx[edge_idx];
4162
4163
10.5k
        if(0 != edge_idx)
4164
3.08k
        {
4165
3.08k
            u1_pos_0_ht_tmp = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
4166
3.08k
        }
4167
7.44k
        else
4168
7.44k
        {
4169
7.44k
            u1_pos_0_ht_tmp = pu1_src[(ht - 1) * src_strd];
4170
7.44k
        }
4171
10.5k
    }
4172
2.35k
    else
4173
2.35k
    {
4174
2.35k
        u1_pos_0_ht_tmp = pu1_src[(ht - 1) * src_strd];
4175
2.35k
    }
4176
4177
4178
4179
    /* Update height and source pointers based on the availability flags */
4180
12.8k
    if(0 == pu1_avail[2])
4181
2.05k
    {
4182
2.05k
        pu1_src_left_cpy2++;
4183
2.05k
        pu1_src_left_str2++;
4184
2.05k
        pu1_src_top_cpy = pu1_src;
4185
2.05k
        pu1_src += src_strd;
4186
2.05k
        ht--;
4187
2.05k
    }
4188
12.8k
    if(0 == pu1_avail[3])
4189
1.36k
    {
4190
1.36k
        ht--;
4191
1.36k
    }
4192
4193
4194
12.8k
    const2_16x8b = _mm_set1_epi8(2);
4195
12.8k
    const0_16x8b = _mm_setzero_si128();
4196
4197
4198
    //availability mask creation
4199
12.8k
    u1_avail0 = pu1_avail[0];
4200
12.8k
    u1_avail1 = pu1_avail[1];
4201
12.8k
    au1_mask[0] = u1_avail0;
4202
12.8k
    au1_mask[wd - 1] = u1_avail1;
4203
12.8k
    {
4204
12.8k
        WORD32 ht_rem;
4205
4206
12.8k
        pu1_src_left_cpy = pu1_src_left_cpy2;
4207
12.8k
        pu1_src_left_str = pu1_src_left_str2;
4208
12.8k
        au1_mask_cpy = au1_mask;
4209
29.0k
        for(col = wd; col >= 16; col -= 16)
4210
16.1k
        {
4211
16.1k
            pu1_src_cpy = pu1_src;
4212
16.1k
            src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col + 1));
4213
            //row = 0
4214
16.1k
            src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
4215
4216
            //loading the mask
4217
16.1k
            au1_mask8x16b = _mm_loadu_si128((__m128i *)au1_mask_cpy);
4218
            //separating +ve and -ve values.
4219
16.1k
            cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
4220
16.1k
            cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
4221
            //creating mask 00 for +ve and -ve values and FF for zero.
4222
16.1k
            cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4223
16.1k
            cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4224
            //combining the appropriate sign change
4225
16.1k
            signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
4226
4227
267k
            for(row = ht; row >= 2; row -= 2)
4228
250k
            {
4229
250k
                left_store_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_left_cpy));
4230
                //row = 1
4231
250k
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
4232
                //to insert left in row 1
4233
250k
                signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 14);
4234
                // row = 0 right
4235
250k
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 1));
4236
4237
                //manipulation for row 1 - row 0
4238
250k
                signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 15);
4239
                //row 0 -row1
4240
                //separating +ve and -ve values.
4241
250k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
4242
250k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
4243
4244
                //creating mask 00 for +ve and -ve values and FF for zero.
4245
250k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4246
250k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4247
4248
                //combining the appropriate sign change
4249
250k
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(0-1)
4250
                //combining sign-left and sign_right
4251
250k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
4252
4253
                //row1-row0
4254
                //separating +ve and -ve values.
4255
250k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
4256
250k
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
4257
                //creating mask 00 for +ve and -ve values and FF for zero.
4258
250k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4259
250k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4260
4261
                // row = 2
4262
250k
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
4263
                // row = 1 right
4264
250k
                signdwn1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 1));
4265
250k
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-0)
4266
4267
                //bottom - row1
4268
250k
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
4269
250k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
4270
                //creating mask 00 for +ve and -ve values and FF for zero.
4271
250k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4272
250k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4273
                //for the next iteration bottom -row1
4274
250k
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
4275
4276
                //to insert left in row 1
4277
250k
                signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 13);
4278
                //manipulation for row 1 - bottom
4279
250k
                signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 15);
4280
4281
                //row1 -bottom
4282
250k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
4283
250k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
4284
                //creating mask 00 for +ve and -ve values and FF for zero.
4285
250k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4286
250k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4287
                //combining the appropriate sign change
4288
250k
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
4289
4290
                //combining sign-left and sign_right
4291
250k
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, signdwn1_16x8b);
4292
4293
                //eliminating old left for row 0 and row 1
4294
250k
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
4295
4296
                //row1  getting it right for left of next block
4297
250k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp1_16x8b, 15);
4298
                //adding constant 2
4299
250k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
4300
250k
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
4301
                //shuffle to get sao index
4302
250k
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
4303
250k
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
4304
                //using availability mask
4305
250k
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
4306
250k
                edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
4307
                //shuffle to get sao offset
4308
250k
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
4309
250k
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
4310
                //row0  getting it right for left of next block
4311
250k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
4312
                //copying the next top
4313
250k
                src_top_16x8b = src_temp1_16x8b;
4314
                //convert to 16 bit, then add, then saturated pack
4315
250k
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
4316
250k
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
4317
250k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
4318
250k
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
4319
250k
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
4320
250k
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
4321
250k
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
4322
250k
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
4323
4324
250k
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
4325
250k
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
4326
250k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
4327
250k
                src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
4328
250k
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
4329
250k
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
4330
250k
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
4331
250k
                src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
4332
                //store left boundary
4333
250k
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
4334
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
4335
250k
                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
4336
                // row = 1
4337
250k
                _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);
4338
4339
250k
                src_temp0_16x8b = src_bottom_16x8b;
4340
250k
                pu1_src_cpy += (src_strd << 1);
4341
250k
                pu1_src_left_cpy += 2;
4342
250k
                pu1_src_left_str += 2;
4343
250k
            }
4344
16.1k
            ht_rem = ht & 0x1;
4345
4346
16.1k
            if(ht_rem)
4347
5.10k
            {
4348
5.10k
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
4349
5.10k
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
4350
                //to insert left in row 1
4351
5.10k
                signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 14);
4352
                //manipulation for row 1 - row 0
4353
5.10k
                signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 15);
4354
4355
                //current row -next row
4356
                //separating +ve and -ve values.
4357
5.10k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
4358
5.10k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
4359
                //creating mask 00 for +ve and -ve values and FF for zero.
4360
5.10k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4361
5.10k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4362
                //combining the appropriate sign change
4363
5.10k
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
4364
                //adding top and bottom and constant 2
4365
5.10k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
4366
5.10k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
4367
                //eliminating old left for row 0 and row 1
4368
5.10k
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 1);
4369
4370
5.10k
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
4371
                //using availability mask
4372
5.10k
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
4373
4374
5.10k
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
4375
4376
                //row0  getting it right for left of next block
4377
5.10k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
4378
                //copying the next top
4379
5.10k
                src_top_16x8b = src_temp0_16x8b;
4380
                //convert to 16 bit, then add, then saturated pack
4381
5.10k
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
4382
5.10k
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
4383
5.10k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
4384
5.10k
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
4385
5.10k
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
4386
5.10k
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
4387
5.10k
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
4388
5.10k
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
4389
                //store left boundary
4390
5.10k
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
4391
4392
5.10k
                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
4393
5.10k
                pu1_src_cpy += (src_strd);
4394
5.10k
                src_temp0_16x8b = src_bottom_16x8b;
4395
5.10k
                pu1_src_left_cpy++;
4396
5.10k
                pu1_src_left_str++;
4397
5.10k
            }
4398
16.1k
            {   //for bottom right
4399
16.1k
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
4400
16.1k
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 1);
4401
16.1k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
4402
16.1k
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
4403
16.1k
            }
4404
16.1k
            if(0 == pu1_avail[3])
4405
1.99k
            {
4406
1.99k
                src_top_16x8b = src_bottom_16x8b;
4407
1.99k
            }
4408
            //for the top left of next part of the block
4409
16.1k
            left_store_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
4410
            //updating top flag
4411
16.1k
            _mm_storeu_si128((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
4412
16.1k
            pu1_src += 16;
4413
16.1k
            au1_mask_cpy += 16;
4414
4415
16.1k
            pu1_left_tmp = pu1_src_left_cpy2;
4416
16.1k
            pu1_src_left_cpy2 = pu1_src_left_str2;
4417
16.1k
            pu1_src_left_str2 = pu1_left_tmp;
4418
4419
16.1k
            pu1_src_left_cpy = pu1_src_left_cpy2;
4420
16.1k
            pu1_src_left_str = pu1_src_left_str2;
4421
16.1k
        }
4422
4423
12.8k
        wd_rem = wd & 0xF;
4424
12.8k
        if(wd_rem)
4425
12.2k
        {
4426
12.2k
            pu1_src_cpy = pu1_src;
4427
12.2k
            pu1_src_left_cpy = pu1_src_left_cpy2;
4428
12.2k
            pu1_src_left_str = pu1_src_left_str2;
4429
12.2k
            src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_top_cpy + wd - col + 1));
4430
            //row = 0
4431
12.2k
            src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
4432
12.2k
            au1_mask8x16b = _mm_loadl_epi64((__m128i *)au1_mask_cpy); //????
4433
            //separating +ve and -ve values.
4434
12.2k
            cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
4435
12.2k
            cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
4436
            //creating mask 00 for +ve and -ve values and FF for zero.
4437
12.2k
            cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4438
12.2k
            cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4439
            //preparing au1_mask
4440
12.2k
            au1_mask8x16b = _mm_unpacklo_epi64(au1_mask8x16b, au1_mask8x16b);
4441
            //combining the appropriate sign change
4442
12.2k
            signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
4443
12.2k
            signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
4444
4445
92.2k
            for(row = ht; row >= 4; row -= 4)
4446
79.9k
            {
4447
79.9k
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
4448
                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
4449
79.9k
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
4450
                // row = 2
4451
79.9k
                src_bottom_16x8b =  _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
4452
                //manipulation for row 0 -row 1
4453
79.9k
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 14);
4454
                //row 1 left
4455
79.9k
                signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 15);
4456
                //row 0 -row1
4457
                //separating +ve and -ve values.
4458
79.9k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
4459
79.9k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
4460
4461
                //creating mask 00 for +ve and -ve values and FF for zero.
4462
79.9k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4463
79.9k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4464
                //manipulating for row 1 - row 0
4465
79.9k
                signdwn1_16x8b = _mm_srli_si128(src_temp0_16x8b, 1);
4466
                //combining the appropriate sign change
4467
79.9k
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
4468
                //row 1 -row0
4469
                //separating +ve and -ve values.
4470
79.9k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
4471
79.9k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
4472
4473
                //creating mask 00 for +ve and -ve values and FF for zero.
4474
79.9k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4475
79.9k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4476
                //row1-row0
4477
79.9k
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
4478
4479
79.9k
                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
4480
4481
79.9k
                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
4482
                //manipulation for row 1 -row 2
4483
79.9k
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 13);
4484
                //row 2 left
4485
79.9k
                signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 15);
4486
                //packing row 0 n row 1
4487
79.9k
                src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, src_temp1_16x8b);
4488
                //row1 -row2
4489
79.9k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
4490
79.9k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
4491
                //creating mask 00 for +ve and -ve values and FF for zero.
4492
79.9k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4493
79.9k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4494
                //combining the appropriate sign change
4495
79.9k
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
4496
79.9k
                edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
4497
4498
                //row 1 right
4499
79.9k
                signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 1);
4500
                //row = 3
4501
79.9k
                src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 3 * src_strd));
4502
4503
                // row = 4
4504
79.9k
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 4 * src_strd));
4505
4506
79.9k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
4507
4508
                //separating +ve and -ve values. (2,1)
4509
79.9k
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
4510
79.9k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
4511
4512
                //creating mask 00 for +ve and -ve values and FF for zero.
4513
79.9k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4514
79.9k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4515
                //row 2 right
4516
79.9k
                signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 1);
4517
                //combining the appropriate sign change
4518
79.9k
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1)
4519
4520
                //separating +ve and -ve values. (3,2)
4521
79.9k
                cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
4522
79.9k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
4523
79.9k
                signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8); //aligned left (2-1)
4524
                //creating mask 00 for +ve and -ve values and FF for zero.
4525
79.9k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4526
79.9k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4527
                //manipulation for row 2 -row 3
4528
79.9k
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 12);
4529
                //row 3 left
4530
79.9k
                signdwn1_16x8b = _mm_alignr_epi8(src_top_16x8b, signdwn1_16x8b, 15);
4531
                //combining the appropriate sign change
4532
79.9k
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-2)
4533
4534
79.9k
                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(3-2) ,(2-1)
4535
4536
                //separating +ve and -ve values. (2,3)
4537
79.9k
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
4538
79.9k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
4539
4540
                //manipulation for row 3 -bottom
4541
79.9k
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 11);
4542
                //bottom left
4543
79.9k
                signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 15);
4544
4545
                //creating mask 00 for +ve and -ve values and FF for zero.
4546
79.9k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4547
79.9k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4548
                //combining the appropriate sign change
4549
79.9k
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-3)
4550
4551
                //separating +ve and -ve values. (3,bottom)
4552
79.9k
                cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
4553
79.9k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
4554
4555
                //creating mask 00 for +ve and -ve values and FF for zero.
4556
79.9k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4557
79.9k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4558
79.9k
                edge1_16x8b = _mm_slli_si128(edge1_16x8b, 8); //aligned left (2-3)
4559
                //combining the appropriate sign change
4560
79.9k
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-bottom)
4561
79.9k
                edge1_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge1_16x8b, 8); //(3-bottom),(2-3)
4562
4563
4564
                //eliminating old left for row 0,1,2,3
4565
79.9k
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 4);
4566
                //packing row 2 n row 3
4567
79.9k
                src_bottom_16x8b = _mm_unpacklo_epi64(src_bottom_16x8b, src_top_16x8b);
4568
                //row 3 right
4569
79.9k
                signdwn1_16x8b = _mm_srli_si128(src_top_16x8b, 1);
4570
                //loading row 3 right into left
4571
79.9k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_bottom_16x8b, 15);
4572
                //adding bottom and top values of row 2 and row 3
4573
79.9k
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, signup0_16x8b); //(3,2)
4574
                //separating +ve and -ve values. (bottom,3)
4575
79.9k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
4576
79.9k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
4577
                //to store right of row 2
4578
79.9k
                signdwn1_16x8b = _mm_slli_si128(src_bottom_16x8b, 8);
4579
                //creating mask 00 for +ve and -ve values and FF for zero.
4580
79.9k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4581
79.9k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4582
79.9k
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(bottom -3) for next iteration
4583
4584
                //storing right of row 2 into left
4585
79.9k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
4586
                //to store right of row 0
4587
79.9k
                signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
4588
                //storing right of row 1 into left
4589
79.9k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
4590
4591
                //adding constant 2
4592
79.9k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
4593
79.9k
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
4594
                //shuffle to get sao index
4595
79.9k
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
4596
79.9k
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
4597
                //using availability mask
4598
79.9k
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
4599
79.9k
                edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
4600
                //shuffle to get sao offset
4601
79.9k
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
4602
79.9k
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
4603
4604
                //storing right of row 0 into left
4605
79.9k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
4606
                //convert to 16 bit, then add, then saturated pack
4607
79.9k
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
4608
79.9k
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
4609
79.9k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
4610
79.9k
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
4611
79.9k
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
4612
79.9k
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
4613
79.9k
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
4614
79.9k
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
4615
4616
79.9k
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
4617
79.9k
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_bottom_16x8b, const0_16x8b);
4618
79.9k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
4619
79.9k
                src_bottom_16x8b = _mm_unpackhi_epi8(src_bottom_16x8b, const0_16x8b);
4620
79.9k
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
4621
79.9k
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
4622
79.9k
                src_bottom_16x8b = _mm_add_epi16(src_bottom_16x8b, cmp_lt0_16x8b);
4623
79.9k
                src_bottom_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_bottom_16x8b);
4624
4625
79.9k
                cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
4626
79.9k
                cmp_lt0_16x8b = _mm_srli_si128(src_bottom_16x8b, 8);
4627
4628
79.9k
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
4629
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
4630
79.9k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
4631
                // row = 1
4632
79.9k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
4633
                //row = 2
4634
79.9k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_bottom_16x8b);
4635
                // row = 3
4636
79.9k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b);
4637
4638
79.9k
                src_temp0_16x8b = src_temp1_16x8b;
4639
79.9k
                signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
4640
79.9k
                pu1_src_cpy += (src_strd << 2);
4641
79.9k
                pu1_src_left_cpy += 4;
4642
79.9k
                pu1_src_left_str += 4;
4643
79.9k
            }
4644
12.2k
            ht_rem = ht & 0x2;
4645
12.2k
            if(ht_rem)
4646
3.33k
            {
4647
3.33k
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
4648
                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
4649
3.33k
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
4650
                // row = 2
4651
3.33k
                src_bottom_16x8b =  _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
4652
4653
                //manipulation for row 0 -row 1
4654
3.33k
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 14);
4655
                //bottom left
4656
3.33k
                signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 15);
4657
                //separating +ve and -ve values.
4658
3.33k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
4659
3.33k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
4660
4661
                //creating mask 00 for +ve and -ve values and FF for zero.
4662
3.33k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4663
3.33k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4664
                //manipulation for row 1 - row 0
4665
3.33k
                signdwn1_16x8b = _mm_srli_si128(src_temp0_16x8b, 1);
4666
                //combining the appropriate sign change
4667
3.33k
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
4668
4669
                //row1-row0
4670
                //separating +ve and -ve values.
4671
3.33k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
4672
3.33k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
4673
4674
                //creating mask 00 for +ve and -ve values and FF for zero.
4675
3.33k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4676
3.33k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4677
                //combining the appropriate sign change
4678
3.33k
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
4679
4680
                //manipulation for row 1 -bottom
4681
3.33k
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 13);
4682
                //bottom left
4683
3.33k
                signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 15);
4684
4685
3.33k
                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
4686
3.33k
                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
4687
                //row1 -bottom
4688
3.33k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
4689
3.33k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
4690
4691
                //creating mask 00 for +ve and -ve values and FF for zero.
4692
3.33k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4693
3.33k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4694
                //combining the appropriate sign change
4695
3.33k
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
4696
3.33k
                edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
4697
                //manipulation for bottom- row 1 (row 1 right)
4698
3.33k
                signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 1);
4699
                //adding top and down subtraction
4700
3.33k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
4701
                //bottom - row 1
4702
3.33k
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
4703
3.33k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
4704
4705
                //eliminating old left for row 0,1
4706
3.33k
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
4707
3.33k
                signdwn1_16x8b = _mm_slli_si128(src_temp1_16x8b, 8);
4708
                //creating mask 00 for +ve and -ve values and FF for zero.
4709
3.33k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4710
3.33k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4711
                //for the next iteration signup0_16x8b
4712
3.33k
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1) for next
4713
4714
                //storing right of row 1 into left
4715
3.33k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
4716
                //for storing right of row 1
4717
3.33k
                signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
4718
4719
3.33k
                src_top_16x8b = src_temp1_16x8b;
4720
                //storing right of row 0 into left
4721
3.33k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
4722
4723
                //adding constant 2
4724
3.33k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
4725
4726
                //shuffle to get sao index
4727
3.33k
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
4728
                //using availability mask
4729
3.33k
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
4730
                //shuffle to get sao offset
4731
3.33k
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
4732
4733
                //the next top already in  src_top_16x8b
4734
                //convert to 16 bit then add and then saturated pack
4735
3.33k
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
4736
3.33k
                src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
4737
3.33k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
4738
3.33k
                src_temp1_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
4739
3.33k
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
4740
3.33k
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
4741
3.33k
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
4742
3.33k
                src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, src_temp1_16x8b);
4743
4744
3.33k
                cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
4745
4746
3.33k
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
4747
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
4748
3.33k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
4749
                // row = 1
4750
3.33k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
4751
3.33k
                src_temp0_16x8b = src_bottom_16x8b;
4752
3.33k
                pu1_src_cpy += (src_strd << 1);
4753
3.33k
                pu1_src_left_cpy += 2;
4754
3.33k
                pu1_src_left_str += 2;
4755
3.33k
            }
4756
12.2k
            ht_rem = ht & 0x1;
4757
12.2k
            if(ht_rem)
4758
3.33k
            {
4759
3.33k
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
4760
                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
4761
3.33k
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
4762
4763
4764
                //manipulation for row 0 -bottom
4765
3.33k
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 14);
4766
                //bottom left
4767
3.33k
                signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 15);
4768
                //separating +ve and -ve values.
4769
3.33k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
4770
3.33k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
4771
                //creating mask 00 for +ve and -ve values and FF for zero.
4772
3.33k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4773
3.33k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4774
                //combining the appropriate sign change
4775
3.33k
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
4776
                //adding top and down subtraction
4777
3.33k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
4778
                //for row 0 right to put into left store
4779
3.33k
                signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
4780
                //adding constant 2
4781
3.33k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
4782
3.33k
                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8);
4783
3.33k
                edge0_16x8b = _mm_srli_si128(edge0_16x8b, 8);
4784
                //left store manipulation 1
4785
3.33k
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 1);
4786
                //filling the left boundary value
4787
3.33k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
4788
4789
                //shuffle to get sao index
4790
3.33k
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
4791
                //using availability mask
4792
3.33k
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
4793
                //shuffle to get sao offset
4794
3.33k
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
4795
3.33k
                src_top_16x8b = src_temp0_16x8b;
4796
                //convert to 16 bit then add and then saturated pack
4797
3.33k
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
4798
3.33k
                src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
4799
3.33k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
4800
3.33k
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
4801
3.33k
                src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, const0_16x8b);
4802
4803
3.33k
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
4804
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
4805
3.33k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
4806
3.33k
                pu1_src_cpy += (src_strd);
4807
3.33k
                src_temp0_16x8b = src_bottom_16x8b;
4808
3.33k
                pu1_src_left_cpy++;
4809
3.33k
                pu1_src_left_str++;
4810
3.33k
            }
4811
12.2k
            {   //for bottom right
4812
12.2k
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
4813
12.2k
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 1);
4814
12.2k
                src_temp0_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
4815
12.2k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
4816
12.2k
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
4817
12.2k
            }
4818
12.2k
            if(0 == pu1_avail[3])
4819
1.34k
            {
4820
1.34k
                src_top_16x8b = src_bottom_16x8b;
4821
1.34k
            }
4822
12.2k
            _mm_storel_epi64((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
4823
12.2k
            pu1_src += 8;
4824
4825
12.2k
            pu1_left_tmp = pu1_src_left_cpy2;
4826
12.2k
            pu1_src_left_cpy2 = pu1_src_left_str2;
4827
12.2k
            pu1_src_left_str2 = pu1_left_tmp;
4828
4829
12.2k
            pu1_src_left_cpy = pu1_src_left_cpy2;
4830
12.2k
            pu1_src_left_str = pu1_src_left_str2;
4831
4832
12.2k
        }
4833
12.8k
        pu1_src_org[wd - 1] = u1_pos_wd_0_tmp;
4834
12.8k
        pu1_src_org[(ht_tmp - 1) * src_strd] = u1_pos_0_ht_tmp;
4835
12.8k
        pu1_src_left_cpy = (0 == pu1_avail[2]) ? (pu1_src_left_cpy - 1) : pu1_src_left_cpy;
4836
12.8k
        pu1_src_left[0] = au1_src_left_tmp[0];
4837
348k
        for(row = 1; row < ht_tmp; row++)
4838
335k
        {
4839
335k
            pu1_src_left[row] = pu1_src_left_cpy[row];
4840
335k
        }
4841
12.8k
    }
4842
4843
12.8k
}
4844
4845
void ihevc_sao_edge_offset_class3_chroma_ssse3(UWORD8 *pu1_src,
4846
                                               WORD32 src_strd,
4847
                                               UWORD8 *pu1_src_left,
4848
                                               UWORD8 *pu1_src_top,
4849
                                               UWORD8 *pu1_src_top_left,
4850
                                               UWORD8 *pu1_src_top_right,
4851
                                               UWORD8 *pu1_src_bot_left,
4852
                                               UWORD8 *pu1_avail,
4853
                                               WORD8 *pi1_sao_offset_u,
4854
                                               WORD8 *pi1_sao_offset_v,
4855
                                               WORD32 wd,
4856
                                               WORD32 ht)
4857
12.3k
{
4858
12.3k
    WORD32 row, col;
4859
12.3k
    UWORD8 *pu1_src_top_cpy, *pu1_src_left_cpy, *pu1_src_left_cpy2;
4860
12.3k
    UWORD8 *pu1_src_cpy, *pu1_src_org;
4861
12.3k
    UWORD8 au1_src_left_tmp[2 * (MAX_CTB_SIZE + 8)];
4862
12.3k
    UWORD8 au1_mask[MAX_CTB_SIZE], *au1_mask_cpy;
4863
12.3k
    WORD32 wd_rem;
4864
12.3k
    UWORD8 u1_pos_wd_0_tmp_u, u1_pos_wd_0_tmp_v, u1_pos_0_ht_tmp_u, u1_pos_0_ht_tmp_v;
4865
12.3k
    WORD32 ht_tmp;
4866
12.3k
    WORD32 bit_depth;
4867
12.3k
    UWORD8 u1_avail0, u1_avail1;
4868
4869
12.3k
    __m128i src_top_16x8b, src_bottom_16x8b;
4870
12.3k
    __m128i src_temp0_16x8b, src_temp1_16x8b;
4871
12.3k
    __m128i signup0_16x8b, signdwn1_16x8b;
4872
12.3k
    __m128i cmp_gt0_16x8b, cmp_lt0_16x8b;
4873
12.3k
    __m128i edge0_16x8b, edge1_16x8b;
4874
12.3k
    __m128i au1_mask8x16b;
4875
12.3k
    __m128i edge_idx_8x16b, sao_offset_8x16b;
4876
12.3k
    __m128i left_store_16x8b;
4877
12.3k
    __m128i const0_16x8b, const2_16x8b;
4878
12.3k
    __m128i chroma_offset_8x16b;
4879
4880
12.3k
    ht_tmp = ht;
4881
12.3k
    au1_mask8x16b = _mm_set1_epi8(0xff);
4882
4883
4884
12.3k
    au1_src_left_tmp[0] = pu1_src[(wd - 2)];
4885
12.3k
    au1_src_left_tmp[1] = pu1_src[(wd - 1)];
4886
    //manipulation for bottom left
4887
307k
    for(row = 2; row < 2 * ht; row++)
4888
295k
    {
4889
295k
        au1_src_left_tmp[row] = pu1_src_left[row];
4890
295k
    }
4891
12.3k
    au1_src_left_tmp[2 * ht] = pu1_src_bot_left[0];
4892
12.3k
    au1_src_left_tmp[2 * ht + 1] = pu1_src_bot_left[1];
4893
4894
12.3k
    pu1_src_top_left[0] = pu1_src_top[wd - 2];
4895
12.3k
    pu1_src_top_left[1] = pu1_src_top[wd - 1];
4896
    //setting availability mask to 0xFF, size MAX_CTB_SIZE
4897
61.6k
    for(col = 0; col < MAX_CTB_SIZE; col += 16)
4898
49.3k
        _mm_storeu_si128((__m128i *)(au1_mask + col), au1_mask8x16b);
4899
12.3k
    bit_depth = BIT_DEPTH_LUMA;
4900
12.3k
    pu1_src_org = pu1_src;
4901
12.3k
    pu1_src_top_cpy = pu1_src_top;
4902
12.3k
    pu1_src_left_cpy2 = au1_src_left_tmp;
4903
12.3k
    pu1_src_left_cpy = au1_src_left_tmp;
4904
12.3k
    edge_idx_8x16b   = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
4905
12.3k
    sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_u);
4906
12.3k
    const0_16x8b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_v);
4907
12.3k
    chroma_offset_8x16b = _mm_set1_epi16(0x0800);
4908
    /* If top-right is available, process separately */
4909
12.3k
    if(0 != pu1_avail[5])
4910
9.75k
    {
4911
9.75k
        WORD32 edge_idx;
4912
4913
        /* U */
4914
9.75k
        edge_idx = 2 + SIGN(pu1_src[wd - 2] - pu1_src_top_right[0]) +
4915
9.75k
                        SIGN(pu1_src[wd - 2] - pu1_src[wd - 2 - 2 + src_strd]);
4916
4917
9.75k
        edge_idx = gi1_table_edge_idx[edge_idx];
4918
4919
9.75k
        if(0 != edge_idx)
4920
2.08k
        {
4921
2.08k
            u1_pos_wd_0_tmp_u = CLIP3(pu1_src[wd - 2] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1);
4922
2.08k
        }
4923
7.66k
        else
4924
7.66k
        {
4925
7.66k
            u1_pos_wd_0_tmp_u = pu1_src[wd - 2];
4926
7.66k
        }
4927
4928
        /* V */
4929
9.75k
        edge_idx = 2 + SIGN(pu1_src[wd - 1] - pu1_src_top_right[1]) +
4930
9.75k
                        SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 2 + src_strd]);
4931
4932
9.75k
        edge_idx = gi1_table_edge_idx[edge_idx];
4933
4934
9.75k
        if(0 != edge_idx)
4935
2.08k
        {
4936
2.08k
            u1_pos_wd_0_tmp_v = CLIP3(pu1_src[wd - 1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1);
4937
2.08k
        }
4938
7.66k
        else
4939
7.66k
        {
4940
7.66k
            u1_pos_wd_0_tmp_v = pu1_src[wd - 1];
4941
7.66k
        }
4942
9.75k
    }
4943
2.58k
    else
4944
2.58k
    {
4945
2.58k
        u1_pos_wd_0_tmp_u = pu1_src[wd - 2];
4946
2.58k
        u1_pos_wd_0_tmp_v = pu1_src[wd - 1];
4947
2.58k
    }
4948
4949
    /* If bottom-left is available, process separately */
4950
12.3k
    if(0 != pu1_avail[6])
4951
9.91k
    {
4952
9.91k
        WORD32 edge_idx;
4953
4954
        /* U */
4955
9.91k
        edge_idx = 2 + SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd + 2 - src_strd]) +
4956
9.91k
                        SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0]);
4957
4958
9.91k
        edge_idx = gi1_table_edge_idx[edge_idx];
4959
4960
9.91k
        if(0 != edge_idx)
4961
2.32k
        {
4962
2.32k
            u1_pos_0_ht_tmp_u = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1);
4963
2.32k
        }
4964
7.59k
        else
4965
7.59k
        {
4966
7.59k
            u1_pos_0_ht_tmp_u = pu1_src[(ht - 1) * src_strd];
4967
7.59k
        }
4968
4969
        /* V */
4970
9.91k
        edge_idx = 2 + SIGN(pu1_src[(ht - 1) * src_strd + 1] - pu1_src[(ht - 1) * src_strd + 1 + 2 - src_strd]) +
4971
9.91k
                        SIGN(pu1_src[(ht - 1) * src_strd + 1] - pu1_src_bot_left[1]);
4972
4973
9.91k
        edge_idx = gi1_table_edge_idx[edge_idx];
4974
4975
9.91k
        if(0 != edge_idx)
4976
2.25k
        {
4977
2.25k
            u1_pos_0_ht_tmp_v = CLIP3(pu1_src[(ht - 1) * src_strd + 1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1);
4978
2.25k
        }
4979
7.66k
        else
4980
7.66k
        {
4981
7.66k
            u1_pos_0_ht_tmp_v = pu1_src[(ht - 1) * src_strd + 1];
4982
7.66k
        }
4983
9.91k
    }
4984
2.42k
    else
4985
2.42k
    {
4986
2.42k
        u1_pos_0_ht_tmp_u = pu1_src[(ht - 1) * src_strd];
4987
2.42k
        u1_pos_0_ht_tmp_v = pu1_src[(ht - 1) * src_strd + 1];
4988
2.42k
    }
4989
4990
4991
4992
    /* Update height and source pointers based on the availability flags */
4993
12.3k
    if(0 == pu1_avail[2])
4994
1.61k
    {
4995
1.61k
        pu1_src_left_cpy2 += 2;
4996
1.61k
        pu1_src_top_cpy = pu1_src;
4997
1.61k
        pu1_src += src_strd;
4998
1.61k
        ht--;
4999
1.61k
    }
5000
12.3k
    if(0 == pu1_avail[3])
5001
1.18k
    {
5002
1.18k
        ht--;
5003
1.18k
    }
5004
5005
12.3k
    sao_offset_8x16b = _mm_unpacklo_epi64(sao_offset_8x16b, const0_16x8b);
5006
12.3k
    const2_16x8b = _mm_set1_epi8(2);
5007
12.3k
    const0_16x8b = _mm_setzero_si128();
5008
5009
5010
    //availability mask creation
5011
12.3k
    u1_avail0 = pu1_avail[0];
5012
12.3k
    u1_avail1 = pu1_avail[1];
5013
12.3k
    au1_mask[0] = u1_avail0;
5014
12.3k
    au1_mask[1] = u1_avail0;
5015
12.3k
    au1_mask[wd - 1] = u1_avail1;
5016
12.3k
    au1_mask[wd - 2] = u1_avail1;
5017
12.3k
    {
5018
12.3k
        WORD32 ht_rem;
5019
12.3k
        au1_mask_cpy = au1_mask;
5020
32.2k
        for(col = wd; col >= 16; col -= 16)
5021
19.8k
        {
5022
19.8k
            pu1_src_cpy = pu1_src;
5023
19.8k
            src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col + 2));
5024
            //row = 0
5025
19.8k
            src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
5026
5027
            //loading the mask
5028
19.8k
            au1_mask8x16b = _mm_loadu_si128((__m128i *)au1_mask_cpy);
5029
            //separating +ve and -ve values.
5030
19.8k
            cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
5031
19.8k
            cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
5032
            //creating mask 00 for +ve and -ve values and FF for zero.
5033
19.8k
            cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5034
19.8k
            cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5035
            //combining the appropriate sign change
5036
19.8k
            signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
5037
19.8k
            pu1_src_left_cpy = pu1_src_left_cpy2;
5038
5039
159k
            for(row = ht; row >= 2; row -= 2)
5040
139k
            {
5041
139k
                left_store_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_left_cpy));
5042
                //row = 1
5043
139k
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
5044
                //to insert left in row 1
5045
139k
                signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 12);
5046
                // row = 0 right
5047
139k
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2));
5048
5049
                //manipulation for row 1 - row 0
5050
139k
                signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 14);
5051
                //row 0 -row1
5052
                //separating +ve and -ve values.
5053
139k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
5054
139k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
5055
5056
                //creating mask 00 for +ve and -ve values and FF for zero.
5057
139k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5058
139k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5059
5060
                //combining the appropriate sign change
5061
139k
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(0-1)
5062
                //combining sign-left and sign_right
5063
139k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
5064
5065
                //row1-row0
5066
                //separating +ve and and -ve values.
5067
139k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
5068
139k
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
5069
                //creating mask 00 for +ve and -ve values and FF for zero.
5070
139k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5071
139k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5072
5073
                // row = 2
5074
139k
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
5075
                // row = 1 right
5076
139k
                signdwn1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 2));
5077
139k
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-0)
5078
5079
                //bottom - row1
5080
139k
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
5081
139k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
5082
                //creating mask 00 for +ve and -ve values and FF for zero.
5083
139k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5084
139k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5085
                //for the next iteration bottom -row1
5086
139k
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
5087
5088
                //to insert left in row 1
5089
139k
                signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 10);
5090
                //manipulation for row 1 - bottom
5091
139k
                signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 14);
5092
5093
                //row1 -bottom
5094
139k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
5095
139k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
5096
                //creating mask 00 for +ve and -ve values and FF for zero.
5097
139k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5098
139k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5099
                //combining the appropriate sign change
5100
139k
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
5101
5102
                //combining sign-left and sign_right
5103
139k
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, signdwn1_16x8b);
5104
5105
                //eliminating old left for row 0 and row 1
5106
139k
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 4);
5107
                //row1  getting it right for left of next block
5108
139k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp1_16x8b, 14);
5109
                //row0  getting it right for left of next block
5110
139k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
5111
                //copying the next top
5112
139k
                src_top_16x8b = src_temp1_16x8b;
5113
5114
5115
                //adding constant 2
5116
139k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
5117
139k
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
5118
                //shuffle to get sao index
5119
139k
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
5120
139k
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
5121
                //using availability mask
5122
139k
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
5123
139k
                edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
5124
5125
                //adding chroma offset to access U and V
5126
139k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
5127
139k
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);
5128
5129
                //shuffle to get sao offset
5130
139k
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
5131
139k
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
5132
                //convert to 16 bit then add and then saturated pack
5133
139k
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
5134
139k
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
5135
139k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
5136
139k
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
5137
139k
                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
5138
139k
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
5139
139k
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
5140
139k
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
5141
5142
139k
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
5143
139k
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
5144
139k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
5145
139k
                src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
5146
139k
                edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
5147
139k
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
5148
139k
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge1_16x8b);
5149
139k
                src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
5150
                //store left boundary
5151
139k
                _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);
5152
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
5153
139k
                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
5154
                // row = 1
5155
139k
                _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);
5156
5157
139k
                src_temp0_16x8b = src_bottom_16x8b;
5158
139k
                pu1_src_cpy += (src_strd << 1);
5159
139k
                pu1_src_left_cpy += 4;
5160
139k
            }
5161
19.8k
            ht_rem = ht & 0x1;
5162
5163
19.8k
            if(ht_rem)
5164
3.63k
            {
5165
3.63k
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
5166
3.63k
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
5167
                //to insert left in row 1
5168
3.63k
                signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 12);
5169
                //manipulation for row 1 - row 0
5170
3.63k
                signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 14);
5171
5172
                //current row -next row
5173
                //separating +ve and and -ve values.
5174
3.63k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
5175
3.63k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
5176
                //creating mask 00 for +ve and -ve values and FF for zero.
5177
3.63k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5178
3.63k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5179
                //combining the appropriate sign change
5180
3.63k
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
5181
                //adding top and bottom and constant 2
5182
3.63k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
5183
3.63k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
5184
                //eliminating old left for row 0 and row 1
5185
3.63k
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
5186
                //row0  getting it right for left of next block
5187
3.63k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
5188
                //copying the next top
5189
3.63k
                src_top_16x8b = src_temp0_16x8b;
5190
5191
3.63k
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
5192
                //using availability mask
5193
3.63k
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
5194
5195
                //adding chroma offset to access U and V
5196
3.63k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
5197
5198
5199
3.63k
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
5200
5201
                //convert to 16 bit, then add, and then saturated-pack
5202
3.63k
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
5203
3.63k
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
5204
3.63k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
5205
3.63k
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
5206
3.63k
                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
5207
3.63k
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
5208
3.63k
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
5209
3.63k
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
5210
5211
                //store left boundary
5212
3.63k
                _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);
5213
5214
3.63k
                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
5215
3.63k
                pu1_src_cpy += (src_strd);
5216
3.63k
                src_temp0_16x8b = src_bottom_16x8b;
5217
3.63k
                pu1_src_left_cpy += 2;
5218
3.63k
            }
5219
19.8k
            {   //for bottom right
5220
19.8k
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
5221
19.8k
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
5222
19.8k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
5223
19.8k
                _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);
5224
19.8k
            }
5225
19.8k
            if(0 == pu1_avail[3])
5226
1.48k
            {
5227
1.48k
                src_top_16x8b = src_bottom_16x8b;
5228
1.48k
            }
5229
            //for the top left of next part of the block
5230
19.8k
            left_store_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
5231
            //updating top flag
5232
19.8k
            _mm_storeu_si128((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
5233
19.8k
            pu1_src += 16;
5234
19.8k
            au1_mask_cpy += 16;
5235
19.8k
        }
5236
12.3k
        pu1_src_left_cpy = pu1_src_left_cpy2;
5237
12.3k
        wd_rem = wd & 0xF;
5238
12.3k
        if(wd_rem)
5239
654
        {
5240
654
            pu1_src_cpy = pu1_src;
5241
654
            src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_top_cpy + wd - col + 2));
5242
            //row = 0
5243
654
            src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
5244
654
            au1_mask8x16b = _mm_loadl_epi64((__m128i *)au1_mask_cpy); //????
5245
            //separating +ve and and -ve values.
5246
654
            cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
5247
654
            cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
5248
            //creating mask 00 for +ve and -ve values and FF for zero.
5249
654
            cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5250
654
            cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5251
            //preparing au1_mask
5252
654
            au1_mask8x16b = _mm_unpacklo_epi64(au1_mask8x16b, au1_mask8x16b);
5253
            //combining the appropriate sign change
5254
654
            signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
5255
654
            signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
5256
654
            pu1_src_left_cpy = pu1_src_left_cpy2;
5257
1.70k
            for(row = ht; row >= 4; row -= 4)
5258
1.05k
            {
5259
1.05k
                left_store_16x8b = _mm_loadu_si128((__m128i *)pu1_src_left_cpy);
5260
                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
5261
1.05k
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
5262
                // row = 2
5263
1.05k
                src_bottom_16x8b =  _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
5264
                //manipulation for row 0 -row 1
5265
1.05k
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 12);
5266
                //row 1 left
5267
1.05k
                signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 14);
5268
                //row 0 -row1
5269
                //separating +ve and and -ve values.
5270
1.05k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
5271
1.05k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
5272
5273
                //creating mask 00 for +ve and -ve values and FF for zero.
5274
1.05k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5275
1.05k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5276
                //manipulating for row 1 - row 0
5277
1.05k
                signdwn1_16x8b = _mm_srli_si128(src_temp0_16x8b, 2);
5278
                //combining the appropriate sign change
5279
1.05k
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
5280
                //row 1 -row0
5281
                //separating +ve and and -ve values.
5282
1.05k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
5283
1.05k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
5284
5285
                //creating mask 00 for +ve and -ve values and FF for zero.
5286
1.05k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5287
1.05k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5288
                //row1-row0
5289
1.05k
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
5290
5291
1.05k
                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
5292
5293
1.05k
                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
5294
                //manipulation for row 1 -row 2
5295
1.05k
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 10);
5296
                //row 2 left
5297
1.05k
                signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 14);
5298
                //packing row 0 n row 1
5299
1.05k
                src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, src_temp1_16x8b);
5300
                //row1 -row2
5301
1.05k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
5302
1.05k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
5303
                //creating mask 00 for +ve and -ve values and FF for zero.
5304
1.05k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5305
1.05k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5306
                //combining the appropriate sign change
5307
1.05k
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
5308
1.05k
                edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
5309
5310
                //row 1 right
5311
1.05k
                signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);
5312
                //row = 3
5313
1.05k
                src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 3 * src_strd));
5314
5315
                // row = 4
5316
1.05k
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 4 * src_strd));
5317
5318
1.05k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
5319
5320
                //separating +ve and and -ve values.(2,1)
5321
1.05k
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
5322
1.05k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
5323
5324
                //creating mask 00 for +ve and -ve values and FF for zero.
5325
1.05k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5326
1.05k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5327
                //row 2 right
5328
1.05k
                signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 2);
5329
                //combining the appropriate sign change
5330
1.05k
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1)
5331
5332
                //separating +ve and and -ve values.(3,2)
5333
1.05k
                cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
5334
1.05k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
5335
1.05k
                signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8); //aligned left (2-1)
5336
                //creating mask 00 for +ve and -ve values and FF for zero.
5337
1.05k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5338
1.05k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5339
                //manipulation for row 2 -row 3
5340
1.05k
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 8);
5341
                //row 3 left
5342
1.05k
                signdwn1_16x8b = _mm_alignr_epi8(src_top_16x8b, signdwn1_16x8b, 14);
5343
                //combining the appropriate sign change
5344
1.05k
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-2)
5345
5346
1.05k
                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(3-2) ,(2-1)
5347
5348
                //separating +ve and and -ve values.(2,3)
5349
1.05k
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
5350
1.05k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
5351
5352
                //manipulation for row 3 -bottom
5353
1.05k
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 6);
5354
                //bottom left
5355
1.05k
                signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 14);
5356
5357
                //creating mask 00 for +ve and -ve values and FF for zero.
5358
1.05k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5359
1.05k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5360
                //combining the appropriate sign change
5361
1.05k
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-3)
5362
5363
                //separating +ve and and -ve values.(3,bottom)
5364
1.05k
                cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
5365
1.05k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
5366
5367
                //creating mask 00 for +ve and -ve values and FF for zero.
5368
1.05k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5369
1.05k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5370
1.05k
                edge1_16x8b = _mm_slli_si128(edge1_16x8b, 8); //aligned left (2-3)
5371
                //combining the appropriate sign change
5372
1.05k
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-bottom)
5373
1.05k
                edge1_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge1_16x8b, 8); //(3-bottom),(2-3)
5374
5375
5376
                //eliminating old left for row 0,1,2,3
5377
1.05k
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 8);
5378
                //packing row 2 n row 3
5379
1.05k
                src_bottom_16x8b = _mm_unpacklo_epi64(src_bottom_16x8b, src_top_16x8b);
5380
                //row 3 right
5381
1.05k
                signdwn1_16x8b = _mm_srli_si128(src_top_16x8b, 2);
5382
                //loading row 3 right into left
5383
1.05k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_bottom_16x8b, 14);
5384
                //adding bottom and top values of row 2 and row 3
5385
1.05k
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, signup0_16x8b); //(3,2)
5386
                //separating +ve and -ve values. (bottom, 3)
5387
1.05k
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
5388
1.05k
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
5389
                //to store right of row 2
5390
1.05k
                signdwn1_16x8b = _mm_slli_si128(src_bottom_16x8b, 8);
5391
                //creating mask 00 for +ve and -ve values and FF for zero.
5392
1.05k
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5393
1.05k
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5394
1.05k
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(bottom -3) for next iteration
5395
5396
                //storing right of row 2into left
5397
1.05k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
5398
                //to store right of row 0
5399
1.05k
                signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
5400
                //storing right of row 1 into left
5401
1.05k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
5402
                //storing right of row 0 into left
5403
1.05k
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
5404
5405
5406
                //adding constant 2
5407
1.05k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
5408
1.05k
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
5409
                //shuffle to get sao index
5410
1.05k
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
5411
1.05k
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
5412
                //using availability mask
5413
1.05k
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
5414
1.05k
                edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
5415
                //adding chroma offset to access U and V
5416
1.05k
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
5417
1.05k
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);
5418
                //shuffle to get sao offset
5419
1.05k
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
5420
1.05k
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
5421
5422
                //convert to 16 bit, then add, and then saturated-pack
5423
1.05k
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
5424
1.05k
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
5425
1.05k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
5426
1.05k
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
5427
1.05k
                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
5428
1.05k
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
5429
1.05k
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
5430
1.05k
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
5431
5432
1.05k
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
5433
1.05k
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_bottom_16x8b, const0_16x8b);
5434
1.05k
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
5435
1.05k
                src_bottom_16x8b = _mm_unpackhi_epi8(src_bottom_16x8b, const0_16x8b);
5436
1.05k
                edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
5437
1.05k
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
5438
1.05k
                src_bottom_16x8b = _mm_add_epi16(src_bottom_16x8b, edge1_16x8b);
5439
1.05k
                src_bottom_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_bottom_16x8b);
5440
5441
1.05k
                cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
5442
1.05k
                cmp_lt0_16x8b = _mm_srli_si128(src_bottom_16x8b, 8);
5443
1.05k
                _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);
5444
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
5445
1.05k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
5446
                // row = 1
5447
1.05k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
5448
                //row = 2
5449
1.05k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_bottom_16x8b);
5450
                // row = 3
5451
1.05k
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b);
5452
5453
1.05k
                src_temp0_16x8b = src_temp1_16x8b;
5454
1.05k
                signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
5455
1.05k
                pu1_src_cpy += (src_strd << 2);
5456
1.05k
                pu1_src_left_cpy += 8;
5457
1.05k
            }
5458
654
            ht_rem = ht & 0x2;
5459
654
            if(ht_rem)
5460
292
            {
5461
292
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
5462
                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
5463
292
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
5464
                // row = 2
5465
292
                src_bottom_16x8b =  _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
5466
5467
                //manipulation for row 0 -row 1
5468
292
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 12);
5469
                //bottom left
5470
292
                signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 14);
5471
                //separating +ve and and -ve values.
5472
292
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
5473
292
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
5474
5475
                //creating mask 00 for +ve and -ve values and FF for zero.
5476
292
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5477
292
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5478
                //manipulation for row 1 - row 0
5479
292
                signdwn1_16x8b = _mm_srli_si128(src_temp0_16x8b, 2);
5480
                //combining the appropriate sign change
5481
292
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
5482
5483
                //row1-row0
5484
                //separating +ve and and -ve values.
5485
292
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
5486
292
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
5487
5488
                //creating mask 00 for +ve and -ve values and FF for zero.
5489
292
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5490
292
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5491
                //combining the appropriate sign change
5492
292
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
5493
5494
                //manipulation for row 1 -bottom
5495
292
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 10);
5496
                //bottom left
5497
292
                signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 14);
5498
5499
292
                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
5500
292
                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
5501
                //row1 -bottom
5502
292
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
5503
292
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
5504
5505
                //creating mask 00 for +ve and -ve values and FF for zero.
5506
292
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5507
292
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5508
                //combining the appropriate sign change
5509
292
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
5510
292
                edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
5511
5512
                //manipulation for bottom- row 1 (row 1 right)
5513
292
                signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);
5514
                //adding top and down subtraction
5515
292
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
5516
                //bottom - row 1
5517
292
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
5518
292
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
5519
5520
                //eliminating old left for row 0,1
5521
292
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 4);
5522
292
                signdwn1_16x8b = _mm_slli_si128(src_temp1_16x8b, 8);
5523
                //creating mask 00 for +ve and -ve values and FF for zero.
5524
292
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5525
292
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5526
                //for the next iteration signup0_16x8b
5527
292
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1) for next
5528
5529
                //storing right of row 1 into left
5530
292
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
5531
                //for storing right of row 1
5532
292
                signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
5533
5534
292
                src_top_16x8b = src_temp1_16x8b;
5535
                //storing right of row 0 into left
5536
292
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
5537
5538
                //adding constant 2
5539
292
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
5540
5541
                //shuffle to get sao index
5542
292
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
5543
                //using availability mask
5544
292
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
5545
                //adding chroma offset to access U and V
5546
292
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
5547
                //shuffle to get sao offset
5548
292
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
5549
                //the next top already in  src_top_16x8b
5550
                //convert to 16 bit, then add, and then saturated-pack
5551
292
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
5552
292
                src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
5553
292
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
5554
292
                src_temp1_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
5555
292
                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
5556
292
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
5557
292
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge0_16x8b);
5558
292
                src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, src_temp1_16x8b);
5559
5560
292
                cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
5561
5562
292
                _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);
5563
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
5564
292
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
5565
                // row = 1
5566
292
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
5567
292
                src_temp0_16x8b = src_bottom_16x8b;
5568
292
                pu1_src_cpy += (src_strd << 1);
5569
292
                pu1_src_left_cpy += 4;
5570
292
            }
5571
654
            ht_rem = ht & 0x1;
5572
654
            if(ht_rem)
5573
289
            {
5574
289
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
5575
                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
5576
289
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
5577
5578
5579
                //manipulation for row 0 -bottom
5580
289
                signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 12);
5581
                //bottom left
5582
289
                signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 14);
5583
                //separating +ve and and -ve values.
5584
289
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
5585
289
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
5586
                //creating mask 00 for +ve and -ve values and FF for zero.
5587
289
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5588
289
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5589
                //combining the appropriate sign change
5590
289
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
5591
                //adding top and down subtraction
5592
289
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
5593
                //for row 0 right to put into left store
5594
289
                signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
5595
                //adding constant 2
5596
289
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
5597
289
                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8);
5598
289
                edge0_16x8b = _mm_srli_si128(edge0_16x8b, 8);
5599
                //left store manipulation 1
5600
289
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
5601
                //filling the left boundary value
5602
289
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
5603
289
                src_top_16x8b = src_temp0_16x8b;
5604
5605
                //shuffle to get sao index
5606
289
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
5607
                //using availability mask
5608
289
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
5609
                //adding chroma offset to access U and V
5610
289
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
5611
                //shuffle to get sao offset
5612
289
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
5613
5614
                //convert to 16 bit, then add, and then saturated-pack
5615
289
                signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
5616
289
                src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
5617
289
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
5618
289
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
5619
289
                src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, const0_16x8b);
5620
5621
289
                _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);
5622
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
5623
289
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
5624
289
                pu1_src_cpy += (src_strd);
5625
289
                src_temp0_16x8b = src_bottom_16x8b;
5626
289
                pu1_src_left_cpy += 2;
5627
289
            }
5628
654
            {   //for bottom right
5629
654
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
5630
654
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
5631
654
                src_temp0_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
5632
654
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
5633
654
                _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);
5634
654
            }
5635
654
            if(0 == pu1_avail[3])
5636
145
            {
5637
145
                src_top_16x8b = src_bottom_16x8b;
5638
145
            }
5639
5640
654
            _mm_storel_epi64((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
5641
654
            pu1_src += 8;
5642
654
        }
5643
12.3k
        pu1_src_org[wd - 2] = u1_pos_wd_0_tmp_u;
5644
12.3k
        pu1_src_org[wd - 1] = u1_pos_wd_0_tmp_v;
5645
12.3k
        pu1_src_org[(ht_tmp - 1) * src_strd] = u1_pos_0_ht_tmp_u;
5646
12.3k
        pu1_src_org[(ht_tmp - 1) * src_strd + 1] = u1_pos_0_ht_tmp_v;
5647
332k
        for(row = 0; row < 2 * ht_tmp; row++)
5648
319k
        {
5649
319k
            pu1_src_left[row] = au1_src_left_tmp[row];
5650
319k
        }
5651
12.3k
    }
5652
5653
12.3k
}