/src/libhevc/common/x86/ihevc_sao_ssse3_intr.c
Line | Count | Source |
1 | | /****************************************************************************** |
2 | | * |
3 | | * Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore |
4 | | * |
5 | | * Licensed under the Apache License, Version 2.0 (the "License"); |
6 | | * you may not use this file except in compliance with the License. |
7 | | * You may obtain a copy of the License at: |
8 | | * |
9 | | * http://www.apache.org/licenses/LICENSE-2.0 |
10 | | * |
11 | | * Unless required by applicable law or agreed to in writing, software |
12 | | * distributed under the License is distributed on an "AS IS" BASIS, |
13 | | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
14 | | * See the License for the specific language governing permissions and |
15 | | * limitations under the License. |
16 | | * |
17 | | ******************************************************************************/ |
18 | | /** |
19 | | ******************************************************************************* |
20 | | * @file |
21 | | * ihevc_sao_ssse3_intr.c |
22 | | * |
23 | | * @brief |
24 | | * Contains function definitions for sample adaptive offset (SAO) in-loop |
25 | | * filtering |
26 | | * |
27 | | * @author |
28 | | * 100592 |
29 | | * |
30 | | * @par List of Functions: |
31 | | * - ihevc_sao_band_offset_luma_ssse3() |
32 | | * - ihevc_sao_band_offset_chroma_ssse3() |
33 | | * - ihevc_sao_edge_offset_class0_ssse3() |
34 | | * - ihevc_sao_edge_offset_class0_chroma_ssse3() |
35 | | * - ihevc_sao_edge_offset_class1_ssse3() |
36 | | * - ihevc_sao_edge_offset_class1_chroma_ssse3() |
37 | | * - ihevc_sao_edge_offset_class2_ssse3() |
38 | | * - ihevc_sao_edge_offset_class2_chroma_ssse3() |
39 | | * - ihevc_sao_edge_offset_class3_ssse3() |
40 | | * - ihevc_sao_edge_offset_class3_chroma_ssse3() |
41 | | * |
42 | | * @remarks |
43 | | * None |
44 | | * |
45 | | ******************************************************************************* |
46 | | */ |
47 | | /*****************************************************************************/ |
48 | | /* File Includes */ |
49 | | /*****************************************************************************/ |
50 | | #include <stdio.h> |
51 | | |
52 | | #include "ihevc_typedefs.h" |
53 | | #include "ihevc_platform_macros.h" |
54 | | #include "ihevc_macros.h" |
55 | | #include "ihevc_func_selector.h" |
56 | | #include "ihevc_defs.h" |
57 | | #include "ihevc_tables_x86_intr.h" |
58 | | #include "ihevc_common_tables.h" |
59 | | #include "ihevc_sao.h" |
60 | | |
61 | | #include <immintrin.h> |
62 | | |
63 | | #define NUM_BAND_TABLE 32 |
64 | | /** |
65 | | ******************************************************************************* |
66 | | * |
67 | | * @brief |
68 | | * Has two sets of functions: band offset and edge offset, both for luma and chroma. |
69 | | * Edge offset has horizontal, vertical, 135 degree and 45 degree classes. |
70 | | * |
71 | | * @par Description: |
72 | | * |
73 | | * |
74 | | * @param[in-out] pu1_src |
75 | | * Pointer to the source |
76 | | * |
77 | | * @param[in] src_strd |
78 | | * Source stride |
79 | | * |
80 | | * @param[in-out] pu1_src_left |
81 | | * Source left boundary |
82 | | * |
83 | | * @param[in-out] pu1_src_top |
84 | | * Source top boundary |
85 | | * |
86 | | * @param[in-out] pu1_src_top_left |
87 | | * Source top left boundary |
88 | | * |
89 | | * @param[in] pu1_src_top_right |
90 | | * Source top right boundary |
91 | | * |
92 | | * @param[in] pu1_src_bot_left |
93 | | * Source bottom left boundary |
94 | | * |
95 | | * @param[in] pu1_avail |
96 | | * Boundary availability flags |
97 | | * |
98 | | * @param[in] pi1_sao_offset_u |
99 | | * Chroma U sao offset values |
100 | | * |
101 | | * @param[in] pi1_sao_offset_v |
102 | | * Chroma V sao offset values |
103 | | * |
104 | | * @param[in] pi1_sao_offset |
105 | | * Luma sao offset values |
106 | | * |
107 | | * @param[in] wd |
108 | | * Width of the source |
109 | | * |
110 | | * @param[in] ht |
111 | | * Height of the source |
112 | | * |
113 | | * @returns |
114 | | * @remarks |
115 | | * None |
116 | | * |
117 | | ******************************************************************************* |
118 | | */ |
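For orientation, here is a minimal scalar sketch of the band-offset rule that the SIMD kernel below vectorizes. It assumes 8-bit samples, the 32-band HEVC classification, and offsets stored at indices 1..4 of the offset array (index 0 unused), as the shuffle masks in the code suggest; the helper name is illustrative.

/* Scalar band-offset reference for one pixel (illustrative sketch). */
static UWORD8 sao_band_pixel_ref(UWORD8 u1_px, WORD32 band_pos,
                                 const WORD8 *pi1_offset)
{
    WORD32 band = u1_px >> 3;             /* 32 bands, each 8 levels wide    */
    WORD32 rel  = (band - band_pos) & 31; /* distance from band_pos, mod 32  */
    if(rel < 4)                           /* only 4 consecutive bands offset */
    {
        WORD32 val = u1_px + pi1_offset[rel + 1]; /* offsets assumed at [1..4] */
        u1_px = (UWORD8)(val < 0 ? 0 : (val > 255 ? 255 : val));
    }
    return u1_px;
}

The kernel below reaches the same result 16 pixels at a time by folding band_pos and the offsets into small pshufb lookup tables built from gu2_table_band_idx.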
119 | | |
120 | | |
121 | | void ihevc_sao_band_offset_luma_ssse3(UWORD8 *pu1_src, |
122 | | WORD32 src_strd, |
123 | | UWORD8 *pu1_src_left, |
124 | | UWORD8 *pu1_src_top, |
125 | | UWORD8 *pu1_src_top_left, |
126 | | WORD32 sao_band_pos, |
127 | | WORD8 *pi1_sao_offset, |
128 | | WORD32 wd, |
129 | | WORD32 ht) |
130 | 295k | { |
131 | 295k | WORD32 row, col; |
132 | 295k | UWORD8 *pu1_src_cpy; |
133 | 295k | WORD32 wd_rem; |
134 | 295k | WORD8 offset = 0; |
135 | | |
136 | 295k | __m128i src_temp0_8x16b, src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b; |
137 | 295k | __m128i band_table0_8x16b, band_table1_8x16b, band_table2_8x16b, band_table3_8x16b; |
138 | 295k | __m128i tmp_set_128i_1, tmp_set_128i_2, tmp_set_128i_3, tmp_set_128i_4; |
139 | 295k | __m128i band_pos_16x8b; |
140 | 295k | __m128i sao_offset; |
141 | 295k | __m128i cmp_mask, cmp_store; |
142 | | |
143 | | /* Updating left and top-left and top */ |
144 | 9.54M | for(row = 0; row < ht; row++) |
145 | 9.25M | { |
146 | 9.25M | pu1_src_left[row] = pu1_src[row * src_strd + (wd - 1)]; |
147 | 9.25M | } |
148 | 295k | pu1_src_top_left[0] = pu1_src_top[wd - 1]; |
149 | 1.47M | for(col = 0; col < wd; col += 8) |
150 | 1.17M | { |
151 | 1.17M | tmp_set_128i_1 = _mm_loadl_epi64((__m128i *)(pu1_src + (ht - 1) * src_strd + offset)); |
152 | 1.17M | _mm_storel_epi64((__m128i *)(pu1_src_top + offset), tmp_set_128i_1); |
153 | 1.17M | offset += 8; |
154 | 1.17M | } |
155 | | |
156 | | //replicating sao_band_pos << 3 as a 16 bit value 8 times |
157 | | |
158 | | |
159 | 295k | band_pos_16x8b = _mm_set1_epi16((WORD16)(sao_band_pos << 3)); |
160 | | //value set for sao_offset extraction |
161 | 295k | tmp_set_128i_1 = _mm_set_epi8(-128, 1, -128, 1, -128, 1, -128, 1, -128, 1, -128, 1, -128, 1, -128, 1); |
162 | 295k | tmp_set_128i_2 = _mm_set_epi8(-128, 2, -128, 2, -128, 2, -128, 2, -128, 2, -128, 2, -128, 2, -128, 2); |
163 | 295k | tmp_set_128i_3 = _mm_set_epi8(-128, 3, -128, 3, -128, 3, -128, 3, -128, 3, -128, 3, -128, 3, -128, 3); |
164 | 295k | tmp_set_128i_4 = _mm_set_epi8(-128, 4, -128, 4, -128, 4, -128, 4, -128, 4, -128, 4, -128, 4, -128, 4); |
165 | | |
166 | | //loaded sao offset values |
167 | 295k | sao_offset = _mm_loadl_epi64((__m128i *)pi1_sao_offset); |
168 | | |
169 | | //loading 16bit 32values of gu2_table_band_idx consecutively in 4 registers |
170 | 295k | band_table0_8x16b = _mm_load_si128((__m128i *)(gu2_table_band_idx)); |
171 | 295k | band_table1_8x16b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 8)); |
172 | 295k | band_table2_8x16b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 16)); |
173 | 295k | band_table3_8x16b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 24)); |
174 | | |
175 | | //band_position addition |
176 | 295k | band_table0_8x16b = _mm_add_epi16(band_table0_8x16b, band_pos_16x8b); |
177 | 295k | band_table1_8x16b = _mm_add_epi16(band_table1_8x16b, band_pos_16x8b); |
178 | 295k | band_table2_8x16b = _mm_add_epi16(band_table2_8x16b, band_pos_16x8b); |
179 | 295k | band_table3_8x16b = _mm_add_epi16(band_table3_8x16b, band_pos_16x8b); |
180 | | //sao_offset duplication |
181 | 295k | tmp_set_128i_1 = _mm_shuffle_epi8(sao_offset, tmp_set_128i_1); |
182 | 295k | tmp_set_128i_2 = _mm_shuffle_epi8(sao_offset, tmp_set_128i_2); |
183 | 295k | tmp_set_128i_3 = _mm_shuffle_epi8(sao_offset, tmp_set_128i_3); |
184 | 295k | tmp_set_128i_4 = _mm_shuffle_epi8(sao_offset, tmp_set_128i_4); |
185 | | //setting for comparison |
186 | 295k | cmp_mask = _mm_set1_epi16(16); |
187 | 295k | cmp_store = _mm_set1_epi16(0x00ff); |
188 | | |
189 | | //sao_offset addition |
190 | 295k | band_table0_8x16b = _mm_add_epi16(band_table0_8x16b, tmp_set_128i_1); |
191 | 295k | band_table1_8x16b = _mm_add_epi16(band_table1_8x16b, tmp_set_128i_2); |
192 | 295k | band_table2_8x16b = _mm_add_epi16(band_table2_8x16b, tmp_set_128i_3); |
193 | 295k | band_table3_8x16b = _mm_add_epi16(band_table3_8x16b, tmp_set_128i_4); |
194 | | //masking upper 8bit values of each 16 bit band table value |
195 | 295k | band_table0_8x16b = _mm_and_si128(band_table0_8x16b, cmp_store); |
196 | 295k | band_table1_8x16b = _mm_and_si128(band_table1_8x16b, cmp_store); |
197 | 295k | band_table2_8x16b = _mm_and_si128(band_table2_8x16b, cmp_store); |
198 | 295k | band_table3_8x16b = _mm_and_si128(band_table3_8x16b, cmp_store); |
199 | | |
200 | 295k | switch(sao_band_pos) |
201 | 295k | { |
202 | 21.9k | case 0: |
203 | 21.9k | tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table0_8x16b); |
204 | 21.9k | band_table0_8x16b = _mm_and_si128(band_table0_8x16b, tmp_set_128i_2); |
205 | 21.9k | break; |
206 | 6.37k | case 28: |
207 | 6.37k | tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table3_8x16b); |
208 | 6.37k | band_table3_8x16b = _mm_or_si128(band_table3_8x16b, tmp_set_128i_2); |
209 | 6.37k | break; |
210 | 6.33k | case 29: |
211 | 6.33k | tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table2_8x16b); |
212 | 6.33k | band_table2_8x16b = _mm_or_si128(band_table2_8x16b, tmp_set_128i_2); |
213 | 6.33k | tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table3_8x16b); |
214 | 6.33k | band_table3_8x16b = _mm_and_si128(band_table3_8x16b, tmp_set_128i_2); |
215 | 6.33k | break; |
216 | 5.15k | case 30: |
217 | 5.15k | tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table1_8x16b); |
218 | 5.15k | band_table1_8x16b = _mm_or_si128(band_table1_8x16b, tmp_set_128i_2); |
219 | 5.15k | tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table2_8x16b); |
220 | 5.15k | band_table2_8x16b = _mm_and_si128(band_table2_8x16b, tmp_set_128i_2); |
221 | 5.15k | break; |
222 | 9.08k | case 31: |
223 | 9.08k | tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table0_8x16b); |
224 | 9.08k | band_table0_8x16b = _mm_or_si128(band_table0_8x16b, tmp_set_128i_2); |
225 | 9.08k | tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table1_8x16b); |
226 | 9.08k | band_table1_8x16b = _mm_and_si128(band_table1_8x16b, tmp_set_128i_2); |
227 | 9.08k | break; |
228 | 246k | default: |
229 | 246k | break; |
230 | 295k | } |
231 | | //sao_offset is reused for zero cmp mask. |
232 | 294k | sao_offset = _mm_setzero_si128(); |
233 | 294k | tmp_set_128i_1 = _mm_set1_epi8(1); |
234 | | //tmp_set_128i_2 = _mm_set_epi8 (128,7,128,6,128,5,128,4,128,3,128,2,128,1,128,0); |
235 | 294k | cmp_mask = _mm_packus_epi16(cmp_mask, cmp_mask); //cmp_msk=dup16(16); |
236 | | |
237 | | //masking upper 8bit values of each 16 bit band table value |
238 | 294k | band_table0_8x16b = _mm_and_si128(band_table0_8x16b, cmp_store); |
239 | 294k | band_table1_8x16b = _mm_and_si128(band_table1_8x16b, cmp_store); |
240 | 294k | band_table2_8x16b = _mm_and_si128(band_table2_8x16b, cmp_store); |
241 | 294k | band_table3_8x16b = _mm_and_si128(band_table3_8x16b, cmp_store); |
242 | | |
243 | | //four 8x16 band table registers are packed into two 16x8 registers: band_table0_8x16b and band_table2_8x16b |
244 | 294k | band_table0_8x16b = _mm_packus_epi16(band_table0_8x16b, band_table1_8x16b); |
245 | 294k | band_table2_8x16b = _mm_packus_epi16(band_table2_8x16b, band_table3_8x16b); |
246 | | |
247 | 294k | band_table3_8x16b = _mm_slli_epi16(cmp_mask, 1); // to compare if value is greater than 31 |
248 | 294k | band_pos_16x8b = _mm_packus_epi16(band_pos_16x8b, band_pos_16x8b); //band_pos is now 8 bit aligned |
249 | 294k | band_table3_8x16b = _mm_sub_epi8(band_table3_8x16b, tmp_set_128i_1); // to compare if value is greater than 31 |
250 | | |
251 | 294k | cmp_mask = _mm_sub_epi8(cmp_mask, tmp_set_128i_1); |
252 | | // band_pos_16x8b = _mm_or_si128(band_pos_16x8b,cmp_store); |
253 | | |
254 | 739k | for(col = wd; col >= 16; col -= 16) |
255 | 444k | { |
256 | 444k | pu1_src_cpy = pu1_src; |
257 | 7.53M | for(row = ht; row > 0; row -= 2) |
258 | 7.09M | { |
259 | | |
260 | | |
261 | | //row = 0 load 8 pixel values from 7:0 pos. relative to cur. pos. |
262 | 7.09M | src_temp0_8x16b = _mm_loadu_si128((__m128i *)(pu1_src_cpy)); |
263 | | // row = 1 |
264 | 7.09M | src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd)); |
265 | | |
266 | | |
267 | | |
268 | | //8 bit subtract |
269 | 7.09M | tmp_set_128i_1 = _mm_sub_epi8(src_temp0_8x16b, band_pos_16x8b); |
270 | 7.09M | tmp_set_128i_3 = _mm_sub_epi8(src_temp2_8x16b, band_pos_16x8b); |
271 | | //if the values less than 0 put ff |
272 | 7.09M | tmp_set_128i_2 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_1); |
273 | 7.09M | tmp_set_128i_4 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_3); |
274 | 7.09M | tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2); |
275 | 7.09M | tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4); |
276 | | //if the values greater than 31 put ff |
277 | 7.09M | tmp_set_128i_2 = _mm_cmpgt_epi8(tmp_set_128i_1, band_table3_8x16b); |
278 | 7.09M | tmp_set_128i_4 = _mm_cmpgt_epi8(tmp_set_128i_3, band_table3_8x16b); |
279 | 7.09M | tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2); |
280 | 7.09M | tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4); |
281 | | |
282 | | |
283 | | //row 0 and row1 |
284 | | //if the values >= 16 then put ff, cmp_mask = dup16(15) |
285 | 7.09M | cmp_store = _mm_cmpgt_epi8(tmp_set_128i_1, cmp_mask); |
286 | | //values 16 to 31 for row 0 & 1 but values <16 ==0 |
287 | 7.09M | tmp_set_128i_2 = _mm_and_si128(tmp_set_128i_1, cmp_store); |
288 | | // values 0 to 15 for row 0 & 1 |
289 | 7.09M | tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, cmp_store); |
290 | | //values 16 to 31 for row 0 & 1 but values <16 masked to ff |
291 | 7.09M | cmp_store = _mm_cmpeq_epi8(tmp_set_128i_2, sao_offset); |
292 | 7.09M | tmp_set_128i_2 = _mm_or_si128(tmp_set_128i_2, cmp_store); |
293 | | //row 2 and row 3 |
294 | | //if the values >= 16 then put ff, cmp_mask = dup16(15) |
295 | 7.09M | cmp_store = _mm_cmpgt_epi8(tmp_set_128i_3, cmp_mask); |
296 | | //values 16 to 31 for row 2 & 3 but values <16 ==0 |
297 | 7.09M | tmp_set_128i_4 = _mm_and_si128(tmp_set_128i_3, cmp_store); |
298 | | // values 0 to 15 for row 2 & 3 |
299 | 7.09M | tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, cmp_store); |
300 | | //values 16 to 31 for row 2 & 3 but values <16 masked to ff |
301 | 7.09M | cmp_store = _mm_cmpeq_epi8(tmp_set_128i_4, sao_offset); |
302 | 7.09M | tmp_set_128i_4 = _mm_or_si128(tmp_set_128i_4, cmp_store); |
303 | | |
304 | | //row 0 and row 1 |
305 | | //to preserve pixel values in which no offset needs to be added. |
306 | 7.09M | cmp_store = _mm_cmpeq_epi8(tmp_set_128i_1, tmp_set_128i_2); |
307 | 7.09M | src_temp0_8x16b = _mm_and_si128(src_temp0_8x16b, cmp_store); |
308 | | |
309 | | //row 2 and row 3 |
310 | | //to preserve pixel values in which no offset needs to be added. |
311 | 7.09M | cmp_store = _mm_cmpeq_epi8(tmp_set_128i_3, tmp_set_128i_4); |
312 | 7.09M | src_temp2_8x16b = _mm_and_si128(src_temp2_8x16b, cmp_store); |
313 | | |
314 | | //indexing 0 - 15 bandtable indexes |
315 | 7.09M | tmp_set_128i_1 = _mm_shuffle_epi8(band_table0_8x16b, tmp_set_128i_1); |
316 | 7.09M | tmp_set_128i_3 = _mm_shuffle_epi8(band_table0_8x16b, tmp_set_128i_3); |
317 | 7.09M | tmp_set_128i_2 = _mm_shuffle_epi8(band_table2_8x16b, tmp_set_128i_2); |
318 | 7.09M | tmp_set_128i_4 = _mm_shuffle_epi8(band_table2_8x16b, tmp_set_128i_4); |
319 | | // combining all offsets results |
320 | 7.09M | tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2); |
321 | 7.09M | tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4); |
322 | | // combining results with the pixel values |
323 | 7.09M | src_temp0_8x16b = _mm_or_si128(src_temp0_8x16b, tmp_set_128i_1); |
324 | 7.09M | src_temp2_8x16b = _mm_or_si128(src_temp2_8x16b, tmp_set_128i_3); |
325 | | |
326 | | |
327 | | //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos. |
328 | 7.09M | _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_8x16b); |
329 | | // row = 1 |
330 | 7.09M | _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp2_8x16b); |
331 | | |
332 | 7.09M | pu1_src_cpy += (src_strd << 1); |
333 | 7.09M | } |
334 | 444k | pu1_src += 16; |
335 | 444k | } |
336 | 294k | wd_rem = wd & 0xF; |
337 | 294k | if(wd_rem) |
338 | 289k | { pu1_src_cpy = pu1_src; |
339 | 2.55M | for(row = ht; row > 0; row -= 4) |
340 | 2.26M | { |
341 | | |
342 | | |
343 | | //row = 0 load 8 pixel values from 7:0 pos. relative to cur. pos. |
344 | 2.26M | src_temp0_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy)); |
345 | | // row = 1 |
346 | 2.26M | src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd)); |
347 | | // row = 2 |
348 | 2.26M | src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd)); |
349 | | // row = 3 |
350 | 2.26M | src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd)); |
351 | | //row0 and row1 packed and row2 and row3 packed |
352 | | |
353 | 2.26M | src_temp0_8x16b = _mm_unpacklo_epi64(src_temp0_8x16b, src_temp1_8x16b); |
354 | 2.26M | src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp3_8x16b); |
355 | | |
356 | | //8 bit subtract |
357 | 2.26M | tmp_set_128i_1 = _mm_sub_epi8(src_temp0_8x16b, band_pos_16x8b); |
358 | 2.26M | tmp_set_128i_3 = _mm_sub_epi8(src_temp2_8x16b, band_pos_16x8b); |
359 | | //if the values less than 0 put ff |
360 | 2.26M | tmp_set_128i_2 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_1); |
361 | 2.26M | tmp_set_128i_4 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_3); |
362 | 2.26M | tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2); |
363 | 2.26M | tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4); |
364 | | //if the values greater than 31 put ff |
365 | 2.26M | tmp_set_128i_2 = _mm_cmpgt_epi8(tmp_set_128i_1, band_table3_8x16b); |
366 | 2.26M | tmp_set_128i_4 = _mm_cmpgt_epi8(tmp_set_128i_3, band_table3_8x16b); |
367 | 2.26M | tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2); |
368 | 2.26M | tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4); |
369 | | |
370 | | |
371 | | |
372 | | //row 0 and row1 |
373 | | //if the values >= 16 then put ff, cmp_mask = dup16(15) |
374 | 2.26M | cmp_store = _mm_cmpgt_epi8(tmp_set_128i_1, cmp_mask); |
375 | | //values 16 to 31 for row 0 & 1 but values <16 ==0 |
376 | 2.26M | tmp_set_128i_2 = _mm_and_si128(tmp_set_128i_1, cmp_store); |
377 | | // values 0 to 15 for row 0 & 1 |
378 | 2.26M | tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, cmp_store); |
379 | | //values 16 to 31 for row 0 & 1 but values <16 masked to ff |
380 | 2.26M | cmp_store = _mm_cmpeq_epi8(tmp_set_128i_2, sao_offset); |
381 | 2.26M | tmp_set_128i_2 = _mm_or_si128(tmp_set_128i_2, cmp_store); |
382 | | //row 2 and row 3 |
383 | | //if the values >= 16 then put ff, cmp_mask = dup16(15) |
384 | 2.26M | cmp_store = _mm_cmpgt_epi8(tmp_set_128i_3, cmp_mask); |
385 | | //values 16 to 31 for row 2 & 3 but values <16 ==0 |
386 | 2.26M | tmp_set_128i_4 = _mm_and_si128(tmp_set_128i_3, cmp_store); |
387 | | // values 0 to 15 for row 2 & 3 |
388 | 2.26M | tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, cmp_store); |
389 | | //values 16 to 31 for row 2 & 3 but values <16 masked to ff |
390 | 2.26M | cmp_store = _mm_cmpeq_epi8(tmp_set_128i_4, sao_offset); |
391 | 2.26M | tmp_set_128i_4 = _mm_or_si128(tmp_set_128i_4, cmp_store); |
392 | | |
393 | | //row 0 and row 1 |
394 | | //to preserve pixel values in which no offset needs to be added. |
395 | 2.26M | cmp_store = _mm_cmpeq_epi8(tmp_set_128i_1, tmp_set_128i_2); |
396 | 2.26M | src_temp0_8x16b = _mm_and_si128(src_temp0_8x16b, cmp_store); |
397 | | |
398 | | //row 2 and row 3 |
399 | | //to preserve pixel values in which no offset needs to be added. |
400 | 2.26M | cmp_store = _mm_cmpeq_epi8(tmp_set_128i_3, tmp_set_128i_4); |
401 | 2.26M | src_temp2_8x16b = _mm_and_si128(src_temp2_8x16b, cmp_store); |
402 | | |
403 | | //indexing 0 - 15 bandtable indexes |
404 | 2.26M | tmp_set_128i_1 = _mm_shuffle_epi8(band_table0_8x16b, tmp_set_128i_1); |
405 | 2.26M | tmp_set_128i_3 = _mm_shuffle_epi8(band_table0_8x16b, tmp_set_128i_3); |
406 | 2.26M | tmp_set_128i_2 = _mm_shuffle_epi8(band_table2_8x16b, tmp_set_128i_2); |
407 | 2.26M | tmp_set_128i_4 = _mm_shuffle_epi8(band_table2_8x16b, tmp_set_128i_4); |
408 | | // combining all offsets results |
409 | 2.26M | tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2); |
410 | 2.26M | tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4); |
411 | | // combining results with the pixel values |
412 | 2.26M | src_temp0_8x16b = _mm_or_si128(src_temp0_8x16b, tmp_set_128i_1); |
413 | 2.26M | src_temp2_8x16b = _mm_or_si128(src_temp2_8x16b, tmp_set_128i_3); |
414 | | |
415 | | //Getting row1 separately |
416 | 2.26M | src_temp1_8x16b = _mm_srli_si128(src_temp0_8x16b, 8); |
417 | | //Getting row3 separately |
418 | 2.26M | src_temp3_8x16b = _mm_srli_si128(src_temp2_8x16b, 8); |
419 | | |
420 | | //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos. |
421 | 2.26M | _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_8x16b); |
422 | | // row = 1 |
423 | 2.26M | _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), src_temp1_8x16b); |
424 | | // row = 2 |
425 | 2.26M | _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_temp2_8x16b); |
426 | | // row = 3 |
427 | 2.26M | _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), src_temp3_8x16b); |
428 | | |
429 | 2.26M | pu1_src_cpy += (src_strd << 2); |
430 | | |
431 | 2.26M | } |
432 | 289k | pu1_src += 8; |
433 | 289k | } |
434 | | |
435 | | |
436 | 294k | } |
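A detail worth noting in the kernel above: pshufb zeroes any lane whose control byte has the most-significant bit set. The code exploits this to perform a 32-entry byte lookup with two 16-byte tables, forcing out-of-range indices to 0xFF so that each lookup blanks the lanes owned by the other table before the two results are OR-ed. A standalone sketch of that two-table trick (illustrative; not the exact register flow above):

/* 32-entry byte table lookup from two 16-byte halves (illustrative). */
static __m128i lut32_lookup(__m128i idx, __m128i lut_lo, __m128i lut_hi)
{
    /* idx lanes must hold 0..31; lanes above 15 belong to lut_hi.       */
    __m128i gt15 = _mm_cmpgt_epi8(idx, _mm_set1_epi8(15));
    /* OR-ing 0xFF sets the MSB, so pshufb zeroes the lut_hi lanes here. */
    __m128i lo = _mm_shuffle_epi8(lut_lo, _mm_or_si128(idx, gt15));
    /* idx - 16 wraps to 0xF0..0xFF for lanes <= 15: zeroed again.       */
    __m128i hi = _mm_shuffle_epi8(lut_hi, _mm_sub_epi8(idx, _mm_set1_epi8(16)));
    return _mm_or_si128(lo, hi);
}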
437 | | |
438 | | void ihevc_sao_band_offset_chroma_ssse3(UWORD8 *pu1_src, |
439 | | WORD32 src_strd, |
440 | | UWORD8 *pu1_src_left, |
441 | | UWORD8 *pu1_src_top, |
442 | | UWORD8 *pu1_src_top_left, |
443 | | WORD32 sao_band_pos_u, |
444 | | WORD32 sao_band_pos_v, |
445 | | WORD8 *pi1_sao_offset_u, |
446 | | WORD8 *pi1_sao_offset_v, |
447 | | WORD32 wd, |
448 | | WORD32 ht) |
449 | 301k | { |
450 | 301k | WORD32 row, col; |
451 | 301k | WORD8 offset = 0; |
452 | | |
453 | | |
454 | 301k | __m128i src_temp0_8x16b, src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b; |
455 | 301k | __m128i cmp_msk2; |
456 | 301k | __m128i band_table0_16x8b, band_table1_16x8b, band_table2_16x8b, band_table3_16x8b; |
457 | 301k | __m128i tmp_set_128i_1, tmp_set_128i_2, tmp_set_128i_3, tmp_set_128i_4; |
458 | 301k | __m128i band_pos_u_16x8b, band_pos_v_16x8b; |
459 | 301k | __m128i sao_offset; |
460 | 301k | __m128i cmp_mask; |
461 | | |
462 | | |
463 | | /* Updating left and top and top-left */ |
464 | 5.09M | for(row = 0; row < ht; row++) |
465 | 4.78M | { |
466 | 4.78M | pu1_src_left[2 * row] = pu1_src[row * src_strd + (wd - 2)]; |
467 | 4.78M | pu1_src_left[2 * row + 1] = pu1_src[row * src_strd + (wd - 1)]; |
468 | 4.78M | } |
469 | 301k | pu1_src_top_left[0] = pu1_src_top[wd - 2]; |
470 | 301k | pu1_src_top_left[1] = pu1_src_top[wd - 1]; |
471 | 1.52M | for(col = 0; col < wd; col += 8) |
472 | 1.22M | { |
473 | 1.22M | tmp_set_128i_1 = _mm_loadl_epi64((__m128i *)(pu1_src + (ht - 1) * src_strd + offset)); |
474 | 1.22M | _mm_storel_epi64((__m128i *)(pu1_src_top + offset), tmp_set_128i_1); |
475 | 1.22M | offset += 8; |
476 | 1.22M | } |
477 | | |
478 | 301k | { // band table creation |
479 | 301k | __m128i temp0_8x16b, temp1_8x16b, temp2_8x16b, temp3_8x16b; |
480 | | // Band table for U component : band_table0_16x8b and band_table2_16x8b |
481 | | //replicating sao_band_pos_u << 3 as a 16 bit value 8 times |
482 | 301k | band_pos_u_16x8b = _mm_set1_epi16((WORD16)(sao_band_pos_u << 3)); |
483 | | //value set for sao_offset extraction |
484 | 301k | tmp_set_128i_1 = _mm_set_epi8(-128, 1, -128, 1, -128, 1, -128, 1, -128, 1, -128, 1, -128, 1, -128, 1); |
485 | 301k | tmp_set_128i_2 = _mm_set_epi8(-128, 2, -128, 2, -128, 2, -128, 2, -128, 2, -128, 2, -128, 2, -128, 2); |
486 | 301k | tmp_set_128i_3 = _mm_set_epi8(-128, 3, -128, 3, -128, 3, -128, 3, -128, 3, -128, 3, -128, 3, -128, 3); |
487 | 301k | tmp_set_128i_4 = _mm_set_epi8(-128, 4, -128, 4, -128, 4, -128, 4, -128, 4, -128, 4, -128, 4, -128, 4); |
488 | | |
489 | | //loaded sao offset values |
490 | 301k | sao_offset = _mm_loadl_epi64((__m128i *)pi1_sao_offset_u); |
491 | | |
492 | | //loading 16bit 32values of gu2_table_band_idx consecutively in 4 registers |
493 | 301k | band_table0_16x8b = _mm_load_si128((__m128i *)(gu2_table_band_idx)); |
494 | 301k | band_table1_16x8b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 8)); |
495 | 301k | band_table2_16x8b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 16)); |
496 | 301k | band_table3_16x8b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 24)); |
497 | | |
498 | | //band_position addition |
499 | 301k | band_table0_16x8b = _mm_add_epi16(band_table0_16x8b, band_pos_u_16x8b); |
500 | 301k | band_table1_16x8b = _mm_add_epi16(band_table1_16x8b, band_pos_u_16x8b); |
501 | 301k | band_table2_16x8b = _mm_add_epi16(band_table2_16x8b, band_pos_u_16x8b); |
502 | 301k | band_table3_16x8b = _mm_add_epi16(band_table3_16x8b, band_pos_u_16x8b); |
503 | | //sao_offset duplication |
504 | 301k | temp0_8x16b = _mm_shuffle_epi8(sao_offset, tmp_set_128i_1); |
505 | 301k | temp1_8x16b = _mm_shuffle_epi8(sao_offset, tmp_set_128i_2); |
506 | 301k | temp2_8x16b = _mm_shuffle_epi8(sao_offset, tmp_set_128i_3); |
507 | 301k | temp3_8x16b = _mm_shuffle_epi8(sao_offset, tmp_set_128i_4); |
508 | | |
509 | | //sao_offset addition |
510 | 301k | band_table0_16x8b = _mm_add_epi16(band_table0_16x8b, temp0_8x16b); |
511 | 301k | band_table1_16x8b = _mm_add_epi16(band_table1_16x8b, temp1_8x16b); |
512 | 301k | band_table2_16x8b = _mm_add_epi16(band_table2_16x8b, temp2_8x16b); |
513 | 301k | band_table3_16x8b = _mm_add_epi16(band_table3_16x8b, temp3_8x16b); |
514 | | //reuse for clipping |
515 | 301k | temp1_8x16b = _mm_set1_epi16(0x00ff); |
516 | | //setting for comparison |
517 | 301k | cmp_mask = _mm_set1_epi16(16); |
518 | | |
519 | | //masking upper 8bit values of each 16 bit band table value |
520 | 301k | band_table0_16x8b = _mm_and_si128(band_table0_16x8b, temp1_8x16b); |
521 | 301k | band_table1_16x8b = _mm_and_si128(band_table1_16x8b, temp1_8x16b); |
522 | 301k | band_table2_16x8b = _mm_and_si128(band_table2_16x8b, temp1_8x16b); |
523 | 301k | band_table3_16x8b = _mm_and_si128(band_table3_16x8b, temp1_8x16b); |
524 | | |
525 | | //temp1_8x16b reuse for compare storage |
526 | 301k | switch(sao_band_pos_u) |
527 | 301k | { |
528 | 8.67k | case 0: |
529 | 8.67k | temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table0_16x8b); |
530 | 8.67k | band_table0_16x8b = _mm_and_si128(band_table0_16x8b, temp3_8x16b); |
531 | 8.67k | break; |
532 | 6.40k | case 28: |
533 | 6.40k | temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table3_16x8b); |
534 | 6.40k | band_table3_16x8b = _mm_or_si128(band_table3_16x8b, temp3_8x16b); |
535 | 6.40k | break; |
536 | 2.03k | case 29: |
537 | 2.03k | temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table2_16x8b); |
538 | 2.03k | band_table2_16x8b = _mm_or_si128(band_table2_16x8b, temp3_8x16b); |
539 | 2.03k | temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table3_16x8b); |
540 | 2.03k | band_table3_16x8b = _mm_and_si128(band_table3_16x8b, temp3_8x16b); |
541 | 2.03k | break; |
542 | 4.64k | case 30: |
543 | 4.64k | temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table1_16x8b); |
544 | 4.64k | band_table1_16x8b = _mm_or_si128(band_table1_16x8b, temp3_8x16b); |
545 | 4.64k | temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table2_16x8b); |
546 | 4.64k | band_table2_16x8b = _mm_and_si128(band_table2_16x8b, temp3_8x16b); |
547 | 4.64k | break; |
548 | 5.38k | case 31: |
549 | 5.38k | temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table0_16x8b); |
550 | 5.38k | band_table0_16x8b = _mm_or_si128(band_table0_16x8b, temp3_8x16b); |
551 | 5.38k | temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table1_16x8b); |
552 | 5.38k | band_table1_16x8b = _mm_and_si128(band_table1_16x8b, temp3_8x16b); |
553 | 5.38k | break; |
554 | 274k | default: |
555 | 274k | break; |
556 | 301k | } |
557 | | //masking upper 8bit values of each 16 bit band table value |
558 | 301k | band_table0_16x8b = _mm_and_si128(band_table0_16x8b, temp1_8x16b); |
559 | 301k | band_table1_16x8b = _mm_and_si128(band_table1_16x8b, temp1_8x16b); |
560 | 301k | band_table2_16x8b = _mm_and_si128(band_table2_16x8b, temp1_8x16b); |
561 | 301k | band_table3_16x8b = _mm_and_si128(band_table3_16x8b, temp1_8x16b); |
562 | | //four 8x16 band table registers are packed into two 16x8 registers: band_table0_16x8b and band_table2_16x8b |
563 | 301k | band_table0_16x8b = _mm_packus_epi16(band_table0_16x8b, band_table1_16x8b); |
564 | 301k | band_table2_16x8b = _mm_packus_epi16(band_table2_16x8b, band_table3_16x8b); |
565 | | // Band table for U component over |
566 | | |
567 | | // Band table for V component : band_table1_16x8b and band_table3_16x8b |
568 | | // replicating sao_band_pos_v << 3 as a 16 bit value 8 times |
569 | 301k | band_pos_v_16x8b = _mm_set1_epi16((WORD16)(sao_band_pos_v << 3)); |
570 | | |
571 | | //loaded sao offset values |
572 | 301k | sao_offset = _mm_loadl_epi64((__m128i *)pi1_sao_offset_v); |
573 | | |
574 | | //loading 16bit 32values of gu2_table_band_idx consecutively in 4 registers |
575 | 301k | temp0_8x16b = _mm_load_si128((__m128i *)(gu2_table_band_idx)); |
576 | 301k | band_table1_16x8b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 8)); |
577 | 301k | temp2_8x16b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 16)); |
578 | 301k | band_table3_16x8b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 24)); |
579 | | |
580 | | //band_position addition |
581 | 301k | temp0_8x16b = _mm_add_epi16(temp0_8x16b, band_pos_v_16x8b); |
582 | 301k | band_table1_16x8b = _mm_add_epi16(band_table1_16x8b, band_pos_v_16x8b); |
583 | 301k | temp2_8x16b = _mm_add_epi16(temp2_8x16b, band_pos_v_16x8b); |
584 | 301k | band_table3_16x8b = _mm_add_epi16(band_table3_16x8b, band_pos_v_16x8b); |
585 | | //sao_offset duplication |
586 | 301k | tmp_set_128i_1 = _mm_shuffle_epi8(sao_offset, tmp_set_128i_1); |
587 | 301k | tmp_set_128i_2 = _mm_shuffle_epi8(sao_offset, tmp_set_128i_2); |
588 | 301k | tmp_set_128i_3 = _mm_shuffle_epi8(sao_offset, tmp_set_128i_3); |
589 | 301k | tmp_set_128i_4 = _mm_shuffle_epi8(sao_offset, tmp_set_128i_4); |
590 | | |
591 | | //sao_offset addition |
592 | 301k | temp0_8x16b = _mm_add_epi16(temp0_8x16b, tmp_set_128i_1); |
593 | 301k | band_table1_16x8b = _mm_add_epi16(band_table1_16x8b, tmp_set_128i_2); |
594 | 301k | temp2_8x16b = _mm_add_epi16(temp2_8x16b, tmp_set_128i_3); |
595 | 301k | band_table3_16x8b = _mm_add_epi16(band_table3_16x8b, tmp_set_128i_4); |
596 | | |
597 | | //masking upper 8bit values of 16 bit band table value |
598 | 301k | temp0_8x16b = _mm_and_si128(temp0_8x16b, temp1_8x16b); |
599 | 301k | band_table1_16x8b = _mm_and_si128(band_table1_16x8b, temp1_8x16b); |
600 | 301k | temp2_8x16b = _mm_and_si128(temp2_8x16b, temp1_8x16b); |
601 | 301k | band_table3_16x8b = _mm_and_si128(band_table3_16x8b, temp1_8x16b); |
602 | | //temp1_8x16b reuse for compare storage |
603 | | |
604 | 301k | switch(sao_band_pos_v) |
605 | 301k | { |
606 | 12.3k | case 0: |
607 | 12.3k | temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, temp0_8x16b); |
608 | 12.3k | temp0_8x16b = _mm_and_si128(temp0_8x16b, temp3_8x16b); |
609 | 12.3k | break; |
610 | 25.7k | case 28: |
611 | 25.7k | temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table3_16x8b); |
612 | 25.7k | band_table3_16x8b = _mm_or_si128(band_table3_16x8b, temp3_8x16b); |
613 | 25.7k | break; |
614 | 2.81k | case 29: |
615 | 2.81k | temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, temp2_8x16b); |
616 | 2.81k | temp2_8x16b = _mm_or_si128(temp2_8x16b, temp3_8x16b); |
617 | 2.81k | temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table3_16x8b); |
618 | 2.81k | band_table3_16x8b = _mm_and_si128(band_table3_16x8b, temp3_8x16b); |
619 | 2.81k | break; |
620 | 2.85k | case 30: |
621 | 2.85k | temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table1_16x8b); |
622 | 2.85k | band_table1_16x8b = _mm_or_si128(band_table1_16x8b, temp3_8x16b); |
623 | 2.85k | temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, temp2_8x16b); |
624 | 2.85k | temp2_8x16b = _mm_and_si128(temp2_8x16b, temp3_8x16b); |
625 | 2.85k | break; |
626 | 18.3k | case 31: |
627 | 18.3k | temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, temp0_8x16b); |
628 | 18.3k | temp0_8x16b = _mm_or_si128(temp0_8x16b, temp3_8x16b); |
629 | 18.3k | temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table1_16x8b); |
630 | 18.3k | band_table1_16x8b = _mm_and_si128(band_table1_16x8b, temp3_8x16b); |
631 | 18.3k | break; |
632 | 239k | default: |
633 | 239k | break; |
634 | 301k | } |
635 | | //masking upper 8bit values of each 16 bit band table value |
636 | 301k | temp0_8x16b = _mm_and_si128(temp0_8x16b, temp1_8x16b); |
637 | 301k | band_table1_16x8b = _mm_and_si128(band_table1_16x8b, temp1_8x16b); |
638 | 301k | temp2_8x16b = _mm_and_si128(temp2_8x16b, temp1_8x16b); |
639 | 301k | band_table3_16x8b = _mm_and_si128(band_table3_16x8b, temp1_8x16b); |
640 | | //four 8x16 band table registers are packed into two 16x8 registers: band_table1_16x8b and band_table3_16x8b |
641 | 301k | band_table1_16x8b = _mm_packus_epi16(temp0_8x16b, band_table1_16x8b); |
642 | 301k | band_table3_16x8b = _mm_packus_epi16(temp2_8x16b, band_table3_16x8b); |
643 | | //band table for u and v created |
644 | 301k | } |
645 | 0 | { |
646 | 301k | UWORD8 *pu1_src_cpy; |
647 | 301k | WORD32 wd_rem; |
648 | | |
649 | | |
650 | | //sao_offset is reused for zero cmp mask. |
651 | 301k | sao_offset = _mm_setzero_si128(); |
652 | 301k | tmp_set_128i_1 = _mm_set1_epi8(1); |
653 | | //tmp_set_128i_2 = _mm_set_epi8 (128,7,128,6,128,5,128,4,128,3,128,2,128,1,128,0); |
654 | 301k | cmp_mask = _mm_packus_epi16(cmp_mask, cmp_mask); //cmp_msk=dup16(16); |
655 | | //to avoid ffff being saturated to 0; it should saturate to ff |
656 | | |
657 | 301k | cmp_msk2 = _mm_slli_epi16(cmp_mask, 1); // to compare if value is greater than 31 |
658 | 301k | band_pos_u_16x8b = _mm_packus_epi16(band_pos_u_16x8b, band_pos_u_16x8b); //band_pos_u is now 8 bit aligned |
659 | 301k | band_pos_v_16x8b = _mm_packus_epi16(band_pos_v_16x8b, band_pos_v_16x8b); //band_pos_v is now 8 bit aligned |
660 | 301k | cmp_msk2 = _mm_sub_epi8(cmp_msk2, tmp_set_128i_1); // to compare if value is greater than 31 |
661 | | |
662 | 301k | cmp_mask = _mm_sub_epi8(cmp_mask, tmp_set_128i_1); |
663 | | |
664 | 914k | for(col = wd; col >= 16; col -= 16) |
665 | 613k | { |
666 | 613k | pu1_src_cpy = pu1_src; |
667 | 5.50M | for(row = ht; row > 0; row -= 2) |
668 | 4.88M | { |
669 | | //row = 0 load 8 pixel values from 7:0 pos. relative to cur. pos. |
670 | 4.88M | src_temp0_8x16b = _mm_loadu_si128((__m128i *)(pu1_src_cpy)); |
671 | | // row = 1 |
672 | 4.88M | src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd)); |
673 | | |
674 | | |
675 | | //odd values |
676 | 4.88M | src_temp1_8x16b = _mm_srli_epi16(src_temp0_8x16b, 8); |
677 | 4.88M | src_temp2_8x16b = _mm_srli_epi16(src_temp3_8x16b, 8); |
678 | | //even values |
679 | 4.88M | src_temp0_8x16b = _mm_slli_epi16(src_temp0_8x16b, 8); |
680 | 4.88M | src_temp3_8x16b = _mm_slli_epi16(src_temp3_8x16b, 8); |
681 | 4.88M | src_temp0_8x16b = _mm_srli_epi16(src_temp0_8x16b, 8); |
682 | 4.88M | src_temp3_8x16b = _mm_srli_epi16(src_temp3_8x16b, 8); |
683 | | //combining odd values |
684 | 4.88M | src_temp2_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp2_8x16b); |
685 | | //combining even values |
686 | 4.88M | src_temp0_8x16b = _mm_packus_epi16(src_temp0_8x16b, src_temp3_8x16b); |
687 | | |
688 | | //8 bit subtract |
689 | 4.88M | tmp_set_128i_1 = _mm_sub_epi8(src_temp0_8x16b, band_pos_u_16x8b); |
690 | 4.88M | tmp_set_128i_3 = _mm_sub_epi8(src_temp2_8x16b, band_pos_v_16x8b); |
691 | | //if the values less than 0 put ff |
692 | 4.88M | tmp_set_128i_2 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_1); |
693 | 4.88M | tmp_set_128i_4 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_3); |
694 | 4.88M | tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2); |
695 | 4.88M | tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4); |
696 | | //if the values greater than 31 put ff |
697 | 4.88M | tmp_set_128i_2 = _mm_cmpgt_epi8(tmp_set_128i_1, cmp_msk2); |
698 | 4.88M | tmp_set_128i_4 = _mm_cmpgt_epi8(tmp_set_128i_3, cmp_msk2); |
699 | 4.88M | tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2); |
700 | 4.88M | tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4); |
701 | | // registers reused to increase performance |
702 | | //if the values >= 16 then put ff, cmp_mask = dup16(15) row 0 and row1 |
703 | 4.88M | src_temp1_8x16b = _mm_cmpgt_epi8(tmp_set_128i_1, cmp_mask); |
704 | | //if the values >= 16 then put ff, cmp_mask = dup16(15) row 2 and row 3 |
705 | 4.88M | src_temp3_8x16b = _mm_cmpgt_epi8(tmp_set_128i_3, cmp_mask); |
706 | | |
707 | | //values 16 to 31 for row 0 & 1 but values <16 ==0 |
708 | 4.88M | tmp_set_128i_2 = _mm_and_si128(tmp_set_128i_1, src_temp1_8x16b); |
709 | | // values 0 to 15 for row 0 & 1 |
710 | 4.88M | tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, src_temp1_8x16b); |
711 | | //values 16 to 31 for row 2 & 3 but values <16 ==0 |
712 | 4.88M | tmp_set_128i_4 = _mm_and_si128(tmp_set_128i_3, src_temp3_8x16b); |
713 | | // values 0 to 15 for row 2 & 3 |
714 | 4.88M | tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, src_temp3_8x16b); |
715 | | |
716 | | //values 16 to 31 for row 0 & 1 but values <16 masked to ff row 0 and row1 |
717 | 4.88M | src_temp1_8x16b = _mm_cmpeq_epi8(tmp_set_128i_2, sao_offset); |
718 | | //values 16 to 31 for row 0 & 1 but values <16 masked to ff row 2 and row 3 |
719 | 4.88M | src_temp3_8x16b = _mm_cmpeq_epi8(tmp_set_128i_4, sao_offset); |
720 | 4.88M | tmp_set_128i_2 = _mm_or_si128(tmp_set_128i_2, src_temp1_8x16b); |
721 | 4.88M | tmp_set_128i_4 = _mm_or_si128(tmp_set_128i_4, src_temp3_8x16b); |
722 | | |
723 | | |
724 | | //to choose which pixel values to preserve in row 0 and row 1 |
725 | 4.88M | src_temp1_8x16b = _mm_cmpeq_epi8(tmp_set_128i_1, tmp_set_128i_2); |
726 | | //to choose which pixel values to preserve in row 2 and row 3 |
727 | 4.88M | src_temp3_8x16b = _mm_cmpeq_epi8(tmp_set_128i_3, tmp_set_128i_4); |
728 | | //values of all rows to which no offset needs to be added preserved. |
729 | 4.88M | src_temp0_8x16b = _mm_and_si128(src_temp0_8x16b, src_temp1_8x16b); |
730 | 4.88M | src_temp2_8x16b = _mm_and_si128(src_temp2_8x16b, src_temp3_8x16b); |
731 | | |
732 | | //indexing 0 - 15 bandtable indexes |
733 | 4.88M | tmp_set_128i_1 = _mm_shuffle_epi8(band_table0_16x8b, tmp_set_128i_1); //U low |
734 | 4.88M | tmp_set_128i_3 = _mm_shuffle_epi8(band_table1_16x8b, tmp_set_128i_3); //V low |
735 | | //indexing 16 -31 bandtable indexes |
736 | 4.88M | tmp_set_128i_2 = _mm_shuffle_epi8(band_table2_16x8b, tmp_set_128i_2); //U high |
737 | 4.88M | tmp_set_128i_4 = _mm_shuffle_epi8(band_table3_16x8b, tmp_set_128i_4); //V high |
738 | | // combining all offsets results |
739 | 4.88M | tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2); //U |
740 | 4.88M | tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4); //V |
741 | | // combining results with the pixel values |
742 | 4.88M | src_temp0_8x16b = _mm_or_si128(src_temp0_8x16b, tmp_set_128i_1); |
743 | 4.88M | src_temp2_8x16b = _mm_or_si128(src_temp2_8x16b, tmp_set_128i_3); |
744 | | //reorganising even and odd values |
745 | 4.88M | src_temp1_8x16b = _mm_unpacklo_epi8(src_temp0_8x16b, src_temp2_8x16b); |
746 | 4.88M | src_temp3_8x16b = _mm_unpackhi_epi8(src_temp0_8x16b, src_temp2_8x16b); |
747 | | |
748 | | |
749 | | //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos. |
750 | 4.88M | _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp1_8x16b); |
751 | | // row = 1 |
752 | 4.88M | _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp3_8x16b); |
753 | | |
754 | | |
755 | 4.88M | pu1_src_cpy += (src_strd << 1); |
756 | | |
757 | 4.88M | } |
758 | 613k | pu1_src += 16; |
759 | 613k | } |
760 | | |
761 | 301k | wd_rem = wd & 0xF; |
762 | 301k | if(wd_rem) |
763 | 541 | { |
764 | 541 | pu1_src_cpy = pu1_src; |
765 | 1.15k | for(row = ht; row > 0; row -= 4) |
766 | 610 | { |
767 | | //row = 0 load 8 pixel values from 7:0 pos. relative to cur. pos. |
768 | 610 | src_temp0_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy)); |
769 | | // row = 1 |
770 | 610 | src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd)); |
771 | | // row = 2 |
772 | 610 | src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd)); |
773 | | // row = 3 |
774 | 610 | src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd)); |
775 | | //row0 and row1 packed and row2 and row3 packed |
776 | | |
777 | 610 | src_temp0_8x16b = _mm_unpacklo_epi64(src_temp0_8x16b, src_temp1_8x16b); |
778 | 610 | src_temp3_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp3_8x16b); |
779 | | //odd values |
780 | 610 | src_temp1_8x16b = _mm_srli_epi16(src_temp0_8x16b, 8); |
781 | 610 | src_temp2_8x16b = _mm_srli_epi16(src_temp3_8x16b, 8); |
782 | | //even values |
783 | 610 | src_temp0_8x16b = _mm_slli_epi16(src_temp0_8x16b, 8); |
784 | 610 | src_temp3_8x16b = _mm_slli_epi16(src_temp3_8x16b, 8); |
785 | 610 | src_temp0_8x16b = _mm_srli_epi16(src_temp0_8x16b, 8); |
786 | 610 | src_temp3_8x16b = _mm_srli_epi16(src_temp3_8x16b, 8); |
787 | | //combining odd values |
788 | 610 | src_temp2_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp2_8x16b); |
789 | | //combining even values |
790 | 610 | src_temp0_8x16b = _mm_packus_epi16(src_temp0_8x16b, src_temp3_8x16b); |
791 | | |
792 | | //8 bit subtract |
793 | 610 | tmp_set_128i_1 = _mm_sub_epi8(src_temp0_8x16b, band_pos_u_16x8b); |
794 | 610 | tmp_set_128i_3 = _mm_sub_epi8(src_temp2_8x16b, band_pos_v_16x8b); |
795 | | //if the values less than 0 put ff |
796 | 610 | tmp_set_128i_2 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_1); |
797 | 610 | tmp_set_128i_4 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_3); |
798 | 610 | tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2); |
799 | 610 | tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4); |
800 | | //if the values greater than 31 put ff |
801 | 610 | tmp_set_128i_2 = _mm_cmpgt_epi8(tmp_set_128i_1, cmp_msk2); |
802 | 610 | tmp_set_128i_4 = _mm_cmpgt_epi8(tmp_set_128i_3, cmp_msk2); |
803 | 610 | tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2); |
804 | 610 | tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4); |
805 | | // registers reused to increase performance |
806 | | //if the values >= 16 then put ff, cmp_mask = dup16(15) row 0 and row1 |
807 | 610 | src_temp1_8x16b = _mm_cmpgt_epi8(tmp_set_128i_1, cmp_mask); |
808 | | //if the values >= 16 then put ff, cmp_mask = dup16(15) row 2 and row 3 |
809 | 610 | src_temp3_8x16b = _mm_cmpgt_epi8(tmp_set_128i_3, cmp_mask); |
810 | | |
811 | | //values 16 to 31 for row 0 & 1 but values <16 ==0 |
812 | 610 | tmp_set_128i_2 = _mm_and_si128(tmp_set_128i_1, src_temp1_8x16b); |
813 | | // values 0 to 15 for row 0 & 1 |
814 | 610 | tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, src_temp1_8x16b); |
815 | | //values 16 to 31 for row 2 & 3 but values <16 ==0 |
816 | 610 | tmp_set_128i_4 = _mm_and_si128(tmp_set_128i_3, src_temp3_8x16b); |
817 | | // values 0 to 15 for row 2 & 3 |
818 | 610 | tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, src_temp3_8x16b); |
819 | | |
820 | | //values 16 to 31 for row 0 & 1 but values <16 masked to ff row 0 and row1 |
821 | 610 | src_temp1_8x16b = _mm_cmpeq_epi8(tmp_set_128i_2, sao_offset); |
822 | | //values 16 to 31 for row 0 & 1 but values <16 masked to ff row 2 and row 3 |
823 | 610 | src_temp3_8x16b = _mm_cmpeq_epi8(tmp_set_128i_4, sao_offset); |
824 | 610 | tmp_set_128i_2 = _mm_or_si128(tmp_set_128i_2, src_temp1_8x16b); |
825 | 610 | tmp_set_128i_4 = _mm_or_si128(tmp_set_128i_4, src_temp3_8x16b); |
826 | | |
827 | | |
828 | | //to choose which pixel values to preserve in row 0 and row 1 |
829 | 610 | src_temp1_8x16b = _mm_cmpeq_epi8(tmp_set_128i_1, tmp_set_128i_2); |
830 | | //to choose which pixel values to preserve in row 2 and row 3 |
831 | 610 | src_temp3_8x16b = _mm_cmpeq_epi8(tmp_set_128i_3, tmp_set_128i_4); |
832 | | //values of all rows to which no offset needs to be added preserved. |
833 | 610 | src_temp0_8x16b = _mm_and_si128(src_temp0_8x16b, src_temp1_8x16b); |
834 | 610 | src_temp2_8x16b = _mm_and_si128(src_temp2_8x16b, src_temp3_8x16b); |
835 | | |
836 | | //indexing 0 - 15 bandtable indexes |
837 | 610 | tmp_set_128i_1 = _mm_shuffle_epi8(band_table0_16x8b, tmp_set_128i_1); //U low |
838 | 610 | tmp_set_128i_3 = _mm_shuffle_epi8(band_table1_16x8b, tmp_set_128i_3); //V low |
839 | | //indexing 16 -31 bandtable indexes |
840 | 610 | tmp_set_128i_2 = _mm_shuffle_epi8(band_table2_16x8b, tmp_set_128i_2); //U high |
841 | 610 | tmp_set_128i_4 = _mm_shuffle_epi8(band_table3_16x8b, tmp_set_128i_4); //V high |
842 | | // combining all offsets results |
843 | 610 | tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2); //U |
844 | 610 | tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4); //V |
845 | | // combining results with the pixel values |
846 | 610 | src_temp0_8x16b = _mm_or_si128(src_temp0_8x16b, tmp_set_128i_1); |
847 | 610 | src_temp2_8x16b = _mm_or_si128(src_temp2_8x16b, tmp_set_128i_3); |
848 | | //reorganising even and odd values |
849 | 610 | src_temp1_8x16b = _mm_unpacklo_epi8(src_temp0_8x16b, src_temp2_8x16b); |
850 | 610 | src_temp3_8x16b = _mm_unpackhi_epi8(src_temp0_8x16b, src_temp2_8x16b); |
851 | | //Getting row1 separately |
852 | 610 | src_temp0_8x16b = _mm_srli_si128(src_temp1_8x16b, 8); |
853 | | //Getting row3 separately |
854 | 610 | src_temp2_8x16b = _mm_srli_si128(src_temp3_8x16b, 8); |
855 | | |
856 | | //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos. |
857 | 610 | _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp1_8x16b); |
858 | | // row = 1 |
859 | 610 | _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), src_temp0_8x16b); |
860 | | // row = 2 |
861 | 610 | _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_temp3_8x16b); |
862 | | // row = 3 |
863 | 610 | _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), src_temp2_8x16b); |
864 | | |
865 | 610 | pu1_src_cpy += (src_strd << 2); |
866 | | |
867 | 610 | } |
868 | 541 | pu1_src += 16; |
869 | 541 | } |
870 | | |
871 | | |
872 | 301k | } |
873 | 301k | } |
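The chroma kernel above operates on interleaved UV samples: it separates the two planes with 16-bit shifts plus an unsigned pack, applies the per-plane band tables, and re-interleaves with unpacklo/unpackhi. A minimal sketch of the deinterleave step (illustrative; the loop above additionally packs two rows into each register):

/* Split interleaved UV bytes into U (even lanes) and V (odd lanes). */
static void split_uv(__m128i uv, __m128i *u_out, __m128i *v_out)
{
    __m128i v16 = _mm_srli_epi16(uv, 8);                    /* odd bytes (V)  */
    __m128i u16 = _mm_srli_epi16(_mm_slli_epi16(uv, 8), 8); /* even bytes (U) */
    *v_out = _mm_packus_epi16(v16, v16); /* V samples in the low 8 bytes */
    *u_out = _mm_packus_epi16(u16, u16); /* U samples in the low 8 bytes */
}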
874 | | |
875 | | |
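The edge-offset kernels that follow classify each pixel by the sign of its difference with two neighbours (left and right for class 0). SSE has no unsigned byte compare, so sign(a - b) is assembled from two saturating subtracts: _mm_subs_epu8(a, b) is all-zero exactly when a <= b. A sketch of that primitive as the loops below use it (illustrative helper name):

/* Per-lane sign(a - b) for unsigned bytes: -1, 0 or +1 (illustrative). */
static __m128i sign_u8(__m128i a, __m128i b)
{
    __m128i zero   = _mm_setzero_si128();
    __m128i a_le_b = _mm_cmpeq_epi8(_mm_subs_epu8(a, b), zero); /* FF iff a <= b */
    __m128i b_le_a = _mm_cmpeq_epi8(_mm_subs_epu8(b, a), zero); /* FF iff b <= a */
    return _mm_sub_epi8(a_le_b, b_le_a); /* -1 if a < b, 0 if equal, +1 if a > b */
}

The edge index is then 2 + sign_u8(centre, left) + sign_u8(centre, right), remapped through gi1_table_edge_idx and the offset table with pshufb, which is the add/shuffle sequence visible in the loop bodies.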
876 | | |
877 | | void ihevc_sao_edge_offset_class0_ssse3(UWORD8 *pu1_src, |
878 | | WORD32 src_strd, |
879 | | UWORD8 *pu1_src_left, |
880 | | UWORD8 *pu1_src_top, |
881 | | UWORD8 *pu1_src_top_left, |
882 | | UWORD8 *pu1_src_top_right, |
883 | | UWORD8 *pu1_src_bot_left, |
884 | | UWORD8 *pu1_avail, |
885 | | WORD8 *pi1_sao_offset, |
886 | | WORD32 wd, |
887 | | WORD32 ht) |
888 | 106k | { |
889 | 106k | WORD32 row, col; |
890 | 106k | UWORD8 *pu1_src_cpy, *pu1_src_left_cpy, *pu1_src_left_str, *pu1_left_tmp; |
891 | 106k | UWORD8 au1_mask[MAX_CTB_SIZE], *au1_mask_cpy; |
892 | 106k | UWORD8 au1_src_left_tmp[MAX_CTB_SIZE + 8]; |
893 | 106k | UWORD8 au1_src_left_tmp1[MAX_CTB_SIZE + 8]; |
894 | 106k | UWORD8 u1_avail0, u1_avail1; |
895 | 106k | WORD32 wd_rem; |
896 | 106k | WORD32 offset = 0; |
897 | 106k | __m128i src_temp0_16x8b, src_temp1_16x8b; |
898 | 106k | __m128i left0_16x8b, left1_16x8b; |
899 | 106k | __m128i cmp_gt0_16x8b, cmp_lt0_16x8b, cmp_gt1_16x8b, cmp_lt1_16x8b; |
900 | 106k | __m128i edge0_16x8b, edge1_16x8b; |
901 | 106k | __m128i au1_mask8x16b; |
902 | 106k | __m128i edge_idx_8x16b, sao_offset_8x16b; |
903 | 106k | __m128i const2_16x8b, const0_16x8b; |
904 | 106k | __m128i left_store_16x8b; |
905 | 106k | UNUSED(pu1_src_top_right); |
906 | 106k | UNUSED(pu1_src_bot_left); |
907 | | |
908 | 106k | au1_mask8x16b = _mm_set1_epi8(0xff); |
909 | | |
910 | | /* Update top and top-left arrays */ |
911 | | |
912 | 106k | *pu1_src_top_left = pu1_src_top[wd - 1]; |
913 | | |
914 | 282k | for(col = wd; col >= 16; col -= 16) |
915 | 176k | { |
916 | 176k | const0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + offset + (ht - 1) * src_strd)); |
917 | 176k | _mm_storeu_si128((__m128i *)(pu1_src_top + offset), const0_16x8b); |
918 | 176k | offset += 16; |
919 | 176k | } |
920 | | |
921 | | //setting availability mask to ff for MAX_CTB_SIZE bytes |
922 | 530k | for(col = 0; col < MAX_CTB_SIZE; col += 16) |
923 | 424k | _mm_storeu_si128((__m128i *)(au1_mask + col), au1_mask8x16b); |
924 | 3.49M | for(row = 0; row < ht; row++) |
925 | 3.39M | { |
926 | 3.39M | au1_src_left_tmp[row] = pu1_src_left[row]; |
927 | 3.39M | } |
928 | 106k | edge_idx_8x16b = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx); |
929 | 106k | sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset); |
930 | | |
931 | | //availability mask creation |
932 | 106k | u1_avail0 = pu1_avail[0]; |
933 | 106k | u1_avail1 = pu1_avail[1]; |
934 | 106k | au1_mask[0] = u1_avail0; |
935 | 106k | au1_mask[wd - 1] = u1_avail1; |
936 | | |
937 | 106k | const2_16x8b = _mm_set1_epi8(2); |
938 | 106k | const0_16x8b = _mm_setzero_si128(); |
939 | 106k | pu1_src_left_cpy = au1_src_left_tmp; |
940 | 106k | pu1_src_left_str = au1_src_left_tmp1; |
941 | 106k | { |
942 | 106k | au1_mask_cpy = au1_mask; |
943 | 282k | for(col = wd; col >= 16; col -= 16) |
944 | 176k | { |
945 | 176k | pu1_src_cpy = pu1_src; |
946 | 176k | au1_mask8x16b = _mm_loadu_si128((__m128i *)au1_mask_cpy); |
947 | | //pu1_src_left_cpy =au1_src_left_tmp; |
948 | 2.98M | for(row = ht; row > 0; row -= 2) |
949 | 2.81M | { |
950 | | |
951 | 2.81M | left_store_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_left_cpy)); |
952 | | //row = 0 load 8 pixel values from 7:0 pos. relative to cur. pos. |
953 | 2.81M | src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy)); |
954 | | // row = 1 |
955 | 2.81M | src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd)); |
956 | | |
957 | 2.81M | left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, left_store_16x8b, 2); |
958 | | //row 1 left |
959 | 2.81M | left1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, left_store_16x8b, 15); |
960 | 2.81M | left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp1_16x8b, 15); |
961 | | //row 0 left |
962 | 2.81M | left0_16x8b = _mm_alignr_epi8(src_temp0_16x8b, left_store_16x8b, 15); |
963 | 2.81M | left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15); |
964 | | |
965 | | |
966 | | //separating +ve and -ve values. |
967 | 2.81M | cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, left0_16x8b); |
968 | 2.81M | cmp_lt0_16x8b = _mm_subs_epu8(left0_16x8b, src_temp0_16x8b); |
969 | 2.81M | cmp_gt1_16x8b = _mm_subs_epu8(src_temp1_16x8b, left1_16x8b); |
970 | 2.81M | cmp_lt1_16x8b = _mm_subs_epu8(left1_16x8b, src_temp1_16x8b); |
971 | | //creating mask 00 for +ve and -ve values and FF for zero. |
972 | 2.81M | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
973 | 2.81M | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
974 | 2.81M | cmp_gt1_16x8b = _mm_cmpeq_epi8(cmp_gt1_16x8b, const0_16x8b); |
975 | 2.81M | cmp_lt1_16x8b = _mm_cmpeq_epi8(cmp_lt1_16x8b, const0_16x8b); |
976 | | //combining the appropriate sign change |
977 | 2.81M | left0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); |
978 | 2.81M | left1_16x8b = _mm_sub_epi8(cmp_gt1_16x8b, cmp_lt1_16x8b); |
979 | | |
980 | | //row = 0 right |
981 | 2.81M | edge0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 1)); |
982 | | // row = 1 right |
983 | 2.81M | edge1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 1)); |
984 | | //separating +ve and -ve values. |
985 | 2.81M | cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, edge0_16x8b); |
986 | 2.81M | cmp_lt0_16x8b = _mm_subs_epu8(edge0_16x8b, src_temp0_16x8b); |
987 | 2.81M | cmp_gt1_16x8b = _mm_subs_epu8(src_temp1_16x8b, edge1_16x8b); |
988 | 2.81M | cmp_lt1_16x8b = _mm_subs_epu8(edge1_16x8b, src_temp1_16x8b); |
989 | | //creating mask 00 for +ve and -ve values and FF for zero. |
990 | 2.81M | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
991 | 2.81M | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
992 | 2.81M | cmp_gt1_16x8b = _mm_cmpeq_epi8(cmp_gt1_16x8b, const0_16x8b); |
993 | 2.81M | cmp_lt1_16x8b = _mm_cmpeq_epi8(cmp_lt1_16x8b, const0_16x8b); |
994 | | //combining the appropriate sign change |
995 | 2.81M | edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); |
996 | 2.81M | edge1_16x8b = _mm_sub_epi8(cmp_gt1_16x8b, cmp_lt1_16x8b); |
997 | | |
998 | | //combining sign-left and sign-right |
999 | 2.81M | edge0_16x8b = _mm_add_epi8(edge0_16x8b, left0_16x8b); |
1000 | 2.81M | edge1_16x8b = _mm_add_epi8(edge1_16x8b, left1_16x8b); |
1001 | | //adding constant 2 |
1002 | 2.81M | edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b); |
1003 | 2.81M | edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b); |
1004 | | //shuffle to get sao index |
1005 | 2.81M | edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b); |
1006 | 2.81M | edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b); |
1007 | | //using availability mask |
1008 | 2.81M | edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b); |
1009 | 2.81M | edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b); |
1010 | | |
1011 | | //shuffle to get sao offset |
1012 | 2.81M | edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b); |
1013 | 2.81M | edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b); |
1014 | | //convert to 16 bit, add, then saturated pack |
1015 | 2.81M | left0_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b); |
1016 | 2.81M | cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b); |
1017 | 2.81M | cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, left0_16x8b); |
1018 | 2.81M | src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b); |
1019 | 2.81M | cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b); |
1020 | 2.81M | cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, left0_16x8b); |
1021 | 2.81M | src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b); |
1022 | 2.81M | src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b); |
1023 | | |
1024 | 2.81M | left0_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b); |
1025 | 2.81M | cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b); |
1026 | 2.81M | cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, left0_16x8b); |
1027 | 2.81M | src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b); |
1028 | 2.81M | cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b); |
1029 | 2.81M | cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, left0_16x8b); |
1030 | 2.81M | src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b); |
1031 | 2.81M | src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b); |
1032 | | |
1033 | | |
1034 | 2.81M | _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b); |
1035 | | //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos. |
1036 | 2.81M | _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b); |
1037 | | // row = 1 |
1038 | 2.81M | _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b); |
1039 | | |
1040 | 2.81M | pu1_src_cpy += (src_strd << 1); |
1041 | 2.81M | pu1_src_left_cpy += 2; |
1042 | 2.81M | pu1_src_left_str += 2; |
1043 | 2.81M | } |
1044 | 176k | au1_mask_cpy += 16; |
1045 | 176k | pu1_src += 16; |
1046 | 176k | pu1_src_left_cpy -= ht; |
1047 | 176k | pu1_src_left_str -= ht; |
1048 | | |
1049 | 176k | pu1_left_tmp = pu1_src_left_cpy; |
1050 | 176k | pu1_src_left_cpy = pu1_src_left_str; |
1051 | 176k | pu1_src_left_str = pu1_left_tmp; |
1052 | 176k | } |
1053 | | |
1054 | 106k | wd_rem = wd & 0xF; |
1055 | 106k | if(wd_rem) |
1056 | 99.1k | { |
1057 | | |
1058 | 99.1k | cmp_gt1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + (ht - 1) * src_strd)); |
1059 | 99.1k | _mm_storel_epi64((__m128i *)(pu1_src_top + offset), cmp_gt1_16x8b); |
1060 | | |
1061 | 99.1k | au1_mask8x16b = _mm_loadl_epi64((__m128i *)au1_mask_cpy); |
1062 | 99.1k | pu1_src_cpy = pu1_src; |
1063 | 99.1k | au1_mask8x16b = _mm_unpacklo_epi64(au1_mask8x16b, au1_mask8x16b); |
1064 | | //pu1_src_left_cpy =au1_src_left_tmp; |
1065 | 890k | for(row = ht; row > 0; row -= 4) |
1066 | 791k | { |
1067 | 791k | left_store_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_left_cpy)); |
1068 | | //row = 0 load 8 pixel values from 7:0 pos. relative to cur. pos. |
1069 | 791k | src_temp0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy)); |
1070 | | // row = 1 |
1071 | 791k | cmp_gt0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd)); |
1072 | | // row = 2 |
1073 | 791k | src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd)); |
1074 | | // row = 3 |
1075 | 791k | cmp_gt1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd)); |
1076 | | |
1077 | | |
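/* left_store_16x8b carries the running left-boundary column: each alignr
 * pair below extracts one row's left neighbours and then rotates that row's
 * last pixel into left_store_16x8b, so the store at the bottom of the loop
 * refreshes pu1_src_left_str for the next vertical pass. */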
1078 | 791k | left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, left_store_16x8b, 4); |
1079 | | //row 3 left |
1080 | 791k | edge0_16x8b = _mm_slli_si128(cmp_gt1_16x8b, 8); |
1081 | 791k | cmp_lt1_16x8b = _mm_alignr_epi8(cmp_gt1_16x8b, left_store_16x8b, 15); |
1082 | 791k | left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 15); |
1083 | | //row 2 left |
1084 | 791k | edge0_16x8b = _mm_slli_si128(src_temp1_16x8b, 8); |
1085 | 791k | left1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, left_store_16x8b, 15); |
1086 | 791k | left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 15); |
1087 | | //row 1 left |
1088 | 791k | edge0_16x8b = _mm_slli_si128(cmp_gt0_16x8b, 8); |
1089 | 791k | cmp_lt0_16x8b = _mm_alignr_epi8(cmp_gt0_16x8b, left_store_16x8b, 15); |
1090 | 791k | left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 15); |
1091 | | //row 0 left |
1092 | 791k | edge0_16x8b = _mm_slli_si128(src_temp0_16x8b, 8); |
1093 | 791k | left0_16x8b = _mm_alignr_epi8(src_temp0_16x8b, left_store_16x8b, 15); |
1094 | 791k | left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 15); |
1095 | | |
1096 | | // packing rows together for 16 SIMD operations |
1097 | 791k | src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, cmp_gt0_16x8b); |
1098 | 791k | src_temp1_16x8b = _mm_unpacklo_epi64(src_temp1_16x8b, cmp_gt1_16x8b); |
1099 | | // packing rows together for 16 SIMD operations |
1100 | 791k | left0_16x8b = _mm_unpacklo_epi64(left0_16x8b, cmp_lt0_16x8b); |
1101 | 791k | left1_16x8b = _mm_unpacklo_epi64(left1_16x8b, cmp_lt1_16x8b); |
1102 | | |
1103 | | //separating +ve and -ve values.
1104 | 791k | cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, left0_16x8b); |
1105 | 791k | cmp_lt0_16x8b = _mm_subs_epu8(left0_16x8b, src_temp0_16x8b); |
1106 | 791k | cmp_gt1_16x8b = _mm_subs_epu8(src_temp1_16x8b, left1_16x8b); |
1107 | 791k | cmp_lt1_16x8b = _mm_subs_epu8(left1_16x8b, src_temp1_16x8b); |
1108 | | //creating mask 00 for +ve and -ve values and FF for zero. |
1109 | 791k | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
1110 | 791k | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
1111 | 791k | cmp_gt1_16x8b = _mm_cmpeq_epi8(cmp_gt1_16x8b, const0_16x8b); |
1112 | 791k | cmp_lt1_16x8b = _mm_cmpeq_epi8(cmp_lt1_16x8b, const0_16x8b); |
1113 | | //combining the appropriate sign change |
1114 | 791k | left0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); |
1115 | 791k | left1_16x8b = _mm_sub_epi8(cmp_gt1_16x8b, cmp_lt1_16x8b); |
1116 | | |
1117 | | //row = 0 right |
1118 | 791k | edge0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 1)); |
1119 | | // row = 1 right |
1120 | 791k | cmp_gt0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd + 1)); |
1121 | | // row = 2 right |
1122 | 791k | edge1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd + 1)); |
1123 | | // row = 3 right |
1124 | 791k | cmp_gt1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd + 1)); |
1125 | | // packing rows together for 16 SIMD operations |
1126 | 791k | edge0_16x8b = _mm_unpacklo_epi64(edge0_16x8b, cmp_gt0_16x8b); |
1127 | 791k | edge1_16x8b = _mm_unpacklo_epi64(edge1_16x8b, cmp_gt1_16x8b); |
1128 | | |
1129 | | //separating +ve and -ve values.
1130 | 791k | cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, edge0_16x8b); |
1131 | 791k | cmp_lt0_16x8b = _mm_subs_epu8(edge0_16x8b, src_temp0_16x8b); |
1132 | 791k | cmp_gt1_16x8b = _mm_subs_epu8(src_temp1_16x8b, edge1_16x8b); |
1133 | 791k | cmp_lt1_16x8b = _mm_subs_epu8(edge1_16x8b, src_temp1_16x8b); |
1134 | | //creating mask 00 for +ve and -ve values and FF for zero. |
1135 | 791k | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
1136 | 791k | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
1137 | 791k | cmp_gt1_16x8b = _mm_cmpeq_epi8(cmp_gt1_16x8b, const0_16x8b); |
1138 | 791k | cmp_lt1_16x8b = _mm_cmpeq_epi8(cmp_lt1_16x8b, const0_16x8b); |
1139 | | //combining the appropriate sign change |
1140 | 791k | edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); |
1141 | 791k | edge1_16x8b = _mm_sub_epi8(cmp_gt1_16x8b, cmp_lt1_16x8b); |
1142 | | |
1143 | | //combining sign_left and sign_right
1144 | 791k | edge0_16x8b = _mm_add_epi8(edge0_16x8b, left0_16x8b); |
1145 | 791k | edge1_16x8b = _mm_add_epi8(edge1_16x8b, left1_16x8b); |
1146 | | //adding constant 2 |
1147 | 791k | edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b); |
1148 | 791k | edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b); |
1149 | | //shuffle to get sao index |
1150 | 791k | edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b); |
1151 | 791k | edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b); |
1152 | | //using availability mask before the offset shuffle:
1153 | | //masked lanes get index 0, which selects sao offset 0 (pixel unchanged)
1154 | 791k | edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b); |
1155 | 791k | edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b); |
1156 | | |
1157 | 791k | edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b); |
1158 | 791k | edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b); |
1159 | | //convert to 16 bit, add, then saturating pack
1160 | 791k | left0_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b); |
1161 | 791k | cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b); |
1162 | 791k | cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, left0_16x8b); |
1163 | 791k | src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b); |
1164 | 791k | cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b); |
1165 | 791k | cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, left0_16x8b); |
1166 | 791k | src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b); |
1167 | 791k | src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b); |
1168 | | |
1169 | 791k | left0_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b); |
1170 | 791k | cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b); |
1171 | 791k | cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, left0_16x8b); |
1172 | 791k | src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b); |
1173 | 791k | cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b); |
1174 | 791k | cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, left0_16x8b); |
1175 | 791k | src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b); |
1176 | 791k | src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b); |
1177 | | //separating row 1 and row 3
1178 | 791k | cmp_lt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8); |
1179 | 791k | cmp_lt1_16x8b = _mm_srli_si128(src_temp1_16x8b, 8); |
1180 | | |
1181 | 791k | _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b); |
1182 | | //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos. |
1183 | 791k | _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b); |
1184 | | // row = 1 |
1185 | 791k | _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_lt0_16x8b); |
1186 | | // row = 2 |
1187 | 791k | _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_temp1_16x8b); |
1188 | | // row = 3 |
1189 | 791k | _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt1_16x8b); |
1190 | | |
1191 | 791k | pu1_src_cpy += (src_strd << 2); |
1192 | 791k | pu1_src_left_cpy += 4; |
1193 | 791k | pu1_src_left_str += 4; |
1194 | 791k | } |
1195 | 99.1k | pu1_src += wd; |
1196 | 99.1k | pu1_src_left_cpy -= ht; |
1197 | 99.1k | pu1_src_left_str -= ht; |
1198 | | |
1199 | 99.1k | pu1_left_tmp = pu1_src_left_cpy; |
1200 | 99.1k | pu1_src_left_cpy = pu1_src_left_str; |
1201 | 99.1k | pu1_src_left_str = pu1_left_tmp; |
1202 | 99.1k | } |
1203 | 3.49M | for(row = 0; row < ht; row++) |
1204 | 3.39M | { |
1205 | 3.39M | pu1_src_left[row] = pu1_src_left_cpy[row]; |
1206 | 3.39M | } |
1207 | 106k | } |
1208 | 106k | } |
1209 | | |
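/* The subs_epu8/cmpeq_epi8/sub_epi8 triplets used throughout these kernels
 * implement one primitive.  A minimal self-contained sketch (the helper name
 * is illustrative, not part of this file; uses SSE2 intrinsics from
 * <emmintrin.h>) of per-byte sign(a - b) over unsigned inputs, yielding
 * -1, 0 or +1 in each lane: */
static inline __m128i sign_u8x16(__m128i a, __m128i b)
{
    __m128i zero   = _mm_setzero_si128();
    __m128i a_le_b = _mm_cmpeq_epi8(_mm_subs_epu8(a, b), zero); /* 0xFF iff a <= b */
    __m128i b_le_a = _mm_cmpeq_epi8(_mm_subs_epu8(b, a), zero); /* 0xFF iff a >= b */
    /* 0xFF - 0x00 = -1 (a < b), 0x00 - 0xFF = +1 (a > b), 0 when equal */
    return _mm_sub_epi8(a_le_b, b_le_a);
}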
1210 | | |
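/* Similarly, every "convert to 16 bit, add, then saturating pack" block
 * computes clip(pixel + offset, 0, 255).  A condensed sketch (illustrative
 * helper name, not part of this file): */
static inline __m128i add_offset_clip_u8(__m128i pix_u8, __m128i off_s8)
{
    __m128i zero = _mm_setzero_si128();
    __m128i sign = _mm_cmpgt_epi8(zero, off_s8);  /* 0xFF where offset < 0 */
    /* zero-extend the pixels, sign-extend the offsets, add as 16 bit */
    __m128i lo = _mm_add_epi16(_mm_unpacklo_epi8(pix_u8, zero),
                               _mm_unpacklo_epi8(off_s8, sign));
    __m128i hi = _mm_add_epi16(_mm_unpackhi_epi8(pix_u8, zero),
                               _mm_unpackhi_epi8(off_s8, sign));
    return _mm_packus_epi16(lo, hi);  /* unsigned saturation back to 8 bit */
}
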
1211 | | void ihevc_sao_edge_offset_class0_chroma_ssse3(UWORD8 *pu1_src, |
1212 | | WORD32 src_strd, |
1213 | | UWORD8 *pu1_src_left, |
1214 | | UWORD8 *pu1_src_top, |
1215 | | UWORD8 *pu1_src_top_left, |
1216 | | UWORD8 *pu1_src_top_right, |
1217 | | UWORD8 *pu1_src_bot_left, |
1218 | | UWORD8 *pu1_avail, |
1219 | | WORD8 *pi1_sao_offset_u, |
1220 | | WORD8 *pi1_sao_offset_v, |
1221 | | WORD32 wd, |
1222 | | WORD32 ht) |
1223 | 71.3k | { |
1224 | 71.3k | WORD32 row, col; |
1225 | 71.3k | UWORD8 *pu1_src_cpy, *pu1_src_left_cpy, *pu1_src_left_str, *pu1_left_tmp; |
1226 | 71.3k | UWORD8 au1_mask[MAX_CTB_SIZE], *au1_mask_cpy; |
1227 | 71.3k | UWORD8 au1_src_left_tmp[2 * (MAX_CTB_SIZE + 8)]; |
1228 | 71.3k | UWORD8 au1_src_left_tmp1[2 * (MAX_CTB_SIZE + 8)]; |
1229 | 71.3k | UWORD8 u1_avail0, u1_avail1; |
1230 | 71.3k | WORD32 wd_rem; |
1231 | 71.3k | WORD32 offset = 0; |
1232 | | |
1233 | 71.3k | __m128i src_temp0_16x8b, src_temp1_16x8b; |
1234 | 71.3k | __m128i left0_16x8b, left1_16x8b; |
1235 | 71.3k | __m128i cmp_gt0_16x8b, cmp_lt0_16x8b; |
1236 | 71.3k | __m128i edge0_16x8b, edge1_16x8b; |
1237 | 71.3k | __m128i au1_mask8x16b; |
1238 | 71.3k | __m128i edge_idx_8x16b, sao_offset_8x16b; |
1239 | 71.3k | __m128i const2_16x8b, const0_16x8b; |
1240 | 71.3k | __m128i left_store_16x8b; |
1241 | 71.3k | __m128i chroma_offset_8x16b; |
1242 | 71.3k | UNUSED(pu1_src_top_right); |
1243 | 71.3k | UNUSED(pu1_src_bot_left); |
1244 | | |
1245 | 71.3k | au1_mask8x16b = _mm_set1_epi8(0xff); |
1246 | | |
1247 | | /* Update top and top-left arrays */ |
1248 | 71.3k | pu1_src_top_left[0] = pu1_src_top[wd - 2]; |
1249 | 71.3k | pu1_src_top_left[1] = pu1_src_top[wd - 1];
1250 | | |
1251 | 222k | for(col = wd; col >= 16; col -= 16) |
1252 | 150k | { |
1253 | 150k | const0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + offset + (ht - 1) * src_strd)); |
1254 | 150k | _mm_storeu_si128((__m128i *)(pu1_src_top + offset), const0_16x8b); |
1255 | 150k | offset += 16; |
1256 | 150k | } |
1257 | 2.34M | for(row = 0; row < 2 * ht; row++) |
1258 | 2.27M | { |
1259 | 2.27M | au1_src_left_tmp[row] = pu1_src_left[row]; |
1260 | 2.27M | } |
1261 | | //setting availability mask to 0xFF over MAX_CTB_SIZE bytes
1262 | 356k | for(col = 0; col < MAX_CTB_SIZE; col += 16) |
1263 | 285k | _mm_storeu_si128((__m128i *)(au1_mask + col), au1_mask8x16b); |
1264 | | |
1265 | 71.3k | edge_idx_8x16b = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx); |
1266 | 71.3k | sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_u); |
1267 | 71.3k | const0_16x8b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_v); |
1268 | 71.3k | chroma_offset_8x16b = _mm_set1_epi16(0x0800); |
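/* 0x0800 per 16-bit lane = +0 on even (U) bytes, +8 on odd (V) bytes; added
 * to the edge indices it steers the interleaved V samples into the upper
 * half of the packed U|V offset table before the pshufb lookup */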
1269 | | //availability mask creation |
1270 | 71.3k | u1_avail0 = pu1_avail[0]; |
1271 | 71.3k | u1_avail1 = pu1_avail[1]; |
1272 | 71.3k | au1_mask[0] = u1_avail0; |
1273 | 71.3k | au1_mask[1] = u1_avail0; |
1274 | 71.3k | au1_mask[wd - 1] = u1_avail1; |
1275 | 71.3k | au1_mask[wd - 2] = u1_avail1; |
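/* two mask bytes per edge because U and V are interleaved; the caller
 * passes 0x00/0xFF flags, and 0x00 lanes force edge index 0 after the
 * AND below, i.e. sao offset 0 and an unchanged pixel */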
1276 | 71.3k | sao_offset_8x16b = _mm_unpacklo_epi64(sao_offset_8x16b, const0_16x8b); |
1277 | 71.3k | const2_16x8b = _mm_set1_epi8(2); |
1278 | 71.3k | const0_16x8b = _mm_setzero_si128(); |
1279 | | |
1280 | 71.3k | { |
1281 | 71.3k | pu1_src_left_cpy = au1_src_left_tmp; |
1282 | 71.3k | pu1_src_left_str = au1_src_left_tmp1; |
1283 | 71.3k | au1_mask_cpy = au1_mask; |
1284 | 222k | for(col = wd; col >= 16; col -= 16) |
1285 | 150k | { |
1286 | 150k | pu1_src_cpy = pu1_src; |
1287 | 150k | au1_mask8x16b = _mm_loadu_si128((__m128i *)au1_mask_cpy); |
1288 | | |
1289 | 1.35M | for(row = ht; row > 0; row -= 2) |
1290 | 1.20M | { |
1291 | | |
1292 | 1.20M | left_store_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_left_cpy)); |
1293 | | //row = 0 load 16 pixel values from 15:0 pos. relative to cur. pos.
1294 | 1.20M | src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy)); |
1295 | | // row = 1 |
1296 | 1.20M | src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd)); |
1297 | | |
1298 | 1.20M | left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, left_store_16x8b, 4); |
1299 | | //row 1 left |
1300 | 1.20M | left1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, left_store_16x8b, 14); |
1301 | 1.20M | left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp1_16x8b, 14); |
1302 | | //row 0 left |
1303 | 1.20M | left0_16x8b = _mm_alignr_epi8(src_temp0_16x8b, left_store_16x8b, 14); |
1304 | 1.20M | left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14); |
1305 | | |
1306 | | |
1307 | | //separating +ve and -ve values. row 0 left
1308 | 1.20M | cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, left0_16x8b); |
1309 | 1.20M | cmp_lt0_16x8b = _mm_subs_epu8(left0_16x8b, src_temp0_16x8b); |
1310 | | //creating mask 00 for +ve and -ve values and FF for zero. |
1311 | 1.20M | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
1312 | 1.20M | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
1313 | | //combining the appropriate sign change |
1314 | 1.20M | left0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); |
1315 | | |
1316 | | //separating +ve and -ve values. row 1 left
1317 | 1.20M | cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, left1_16x8b); |
1318 | 1.20M | cmp_lt0_16x8b = _mm_subs_epu8(left1_16x8b, src_temp1_16x8b); |
1319 | | //creating mask 00 for +ve and -ve values and FF for zero. |
1320 | 1.20M | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
1321 | 1.20M | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
1322 | | //combining the appropriate sign change |
1323 | 1.20M | left1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); |
1324 | | |
1325 | | |
1326 | | //row = 0 right |
1327 | 1.20M | edge0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2)); |
1328 | | // row = 1 right |
1329 | 1.20M | edge1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 2)); |
1330 | | //separating +ve and -ve values. row 0 right
1331 | 1.20M | cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, edge0_16x8b); |
1332 | 1.20M | cmp_lt0_16x8b = _mm_subs_epu8(edge0_16x8b, src_temp0_16x8b); |
1333 | | //creating mask 00 for +ve and -ve values and FF for zero. |
1334 | 1.20M | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
1335 | 1.20M | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
1336 | | //combining the appropriate sign change |
1337 | 1.20M | edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); |
1338 | | |
1339 | | //separating +ve and -ve values. row 1 right
1340 | 1.20M | cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, edge1_16x8b); |
1341 | 1.20M | cmp_lt0_16x8b = _mm_subs_epu8(edge1_16x8b, src_temp1_16x8b); |
1342 | | //creating mask 00 for +ve and -ve values and FF for zero. |
1343 | 1.20M | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
1344 | 1.20M | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
1345 | | //combining the appropriate sign change |
1346 | 1.20M | edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); |
1347 | | |
1348 | | //combining sign_left and sign_right
1349 | 1.20M | edge0_16x8b = _mm_add_epi8(edge0_16x8b, left0_16x8b); |
1350 | 1.20M | edge1_16x8b = _mm_add_epi8(edge1_16x8b, left1_16x8b); |
1351 | | //adding constant 2 |
1352 | 1.20M | edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b); |
1353 | 1.20M | edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b); |
1354 | | //shuffle to get sao index |
1355 | 1.20M | edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b); |
1356 | 1.20M | edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b); |
1357 | | //using availability mask |
1358 | 1.20M | edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b); |
1359 | 1.20M | edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b); |
1360 | | //adding chroma offset to access U and V |
1361 | 1.20M | edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b); |
1362 | 1.20M | edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b); |
1363 | | |
1364 | | //shuffle to get sao offset |
1365 | 1.20M | edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b); |
1366 | 1.20M | edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b); |
1367 | | //convert to 16 bit, add, then saturating pack
1368 | 1.20M | left0_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b); |
1369 | 1.20M | cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b); |
1370 | 1.20M | cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, left0_16x8b); |
1371 | 1.20M | src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b); |
1372 | 1.20M | edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, left0_16x8b); |
1373 | 1.20M | cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b); |
1374 | 1.20M | src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b); |
1375 | 1.20M | src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b); |
1376 | | |
1377 | 1.20M | left0_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b); |
1378 | 1.20M | cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b); |
1379 | 1.20M | cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, left0_16x8b); |
1380 | 1.20M | src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b); |
1381 | 1.20M | edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, left0_16x8b); |
1382 | 1.20M | cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b); |
1383 | 1.20M | src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge1_16x8b); |
1384 | 1.20M | src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b); |
1385 | | |
1386 | 1.20M | _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b); |
1387 | | //row = 0 store 16 pixel values from 15:0 pos. relative to cur. pos.
1388 | 1.20M | _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b); |
1389 | | // row = 1 |
1390 | 1.20M | _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b); |
1391 | | |
1392 | 1.20M | pu1_src_cpy += (src_strd << 1); |
1393 | 1.20M | pu1_src_left_cpy += 4; |
1394 | 1.20M | pu1_src_left_str += 4; |
1395 | 1.20M | } |
1396 | 150k | au1_mask_cpy += 16; |
1397 | 150k | pu1_src += 16; |
1398 | 150k | pu1_src_left_cpy -= 2 * ht; |
1399 | 150k | pu1_src_left_str -= 2 * ht; |
1400 | | |
1401 | 150k | pu1_left_tmp = pu1_src_left_cpy; |
1402 | 150k | pu1_src_left_cpy = pu1_src_left_str; |
1403 | 150k | pu1_src_left_str = pu1_left_tmp; |
1404 | 150k | } |
1405 | | |
1406 | 71.3k | wd_rem = wd & 0xF; |
1407 | 71.3k | if(wd_rem) |
1408 | 228 | { |
1409 | | |
1410 | 228 | cmp_gt0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + (ht - 1) * src_strd)); |
1411 | 228 | _mm_storel_epi64((__m128i *)(pu1_src_top + offset), cmp_gt0_16x8b); |
1412 | | |
1413 | 228 | au1_mask8x16b = _mm_loadl_epi64((__m128i *)au1_mask_cpy); |
1414 | 228 | pu1_src_cpy = pu1_src; |
1415 | 228 | au1_mask8x16b = _mm_unpacklo_epi64(au1_mask8x16b, au1_mask8x16b); |
1416 | | |
1417 | 700 | for(row = ht; row > 0; row -= 4) |
1418 | 472 | { |
1419 | 472 | left_store_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_left_cpy)); |
1420 | | //row = 0 load 8 pixel values from 7:0 pos. relative to cur. pos. |
1421 | 472 | src_temp0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy)); |
1422 | | // row = 1 |
1423 | 472 | cmp_gt0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd)); |
1424 | | // row = 2 |
1425 | 472 | src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd)); |
1426 | | // row = 3 |
1427 | 472 | cmp_lt0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd)); |
1428 | | |
1429 | | |
1430 | 472 | left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, left_store_16x8b, 8); |
1431 | | //row 3 left |
1432 | 472 | edge0_16x8b = _mm_slli_si128(cmp_lt0_16x8b, 8); |
1433 | 472 | left0_16x8b = _mm_alignr_epi8(cmp_lt0_16x8b, left_store_16x8b, 14); |
1434 | 472 | left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 14); |
1435 | | //row 2 left |
1436 | 472 | edge0_16x8b = _mm_slli_si128(src_temp1_16x8b, 8); |
1437 | 472 | left1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, left_store_16x8b, 14); |
1438 | 472 | left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 14); |
1439 | | |
1440 | | |
1441 | | // packing rows together for 16 SIMD operations |
1442 | 472 | src_temp1_16x8b = _mm_unpacklo_epi64(src_temp1_16x8b, cmp_lt0_16x8b); |
1443 | 472 | left1_16x8b = _mm_unpacklo_epi64(left1_16x8b, left0_16x8b); |
1444 | | |
1445 | | //row 1 left |
1446 | 472 | edge0_16x8b = _mm_slli_si128(cmp_gt0_16x8b, 8); |
1447 | 472 | edge1_16x8b = _mm_alignr_epi8(cmp_gt0_16x8b, left_store_16x8b, 14); |
1448 | 472 | left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 14); |
1449 | | //row 0 left |
1450 | 472 | edge0_16x8b = _mm_slli_si128(src_temp0_16x8b, 8); |
1451 | 472 | left0_16x8b = _mm_alignr_epi8(src_temp0_16x8b, left_store_16x8b, 14); |
1452 | 472 | left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 14); |
1453 | | // packing rows together for 16 SIMD operations |
1454 | 472 | src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, cmp_gt0_16x8b); |
1455 | 472 | left0_16x8b = _mm_unpacklo_epi64(left0_16x8b, edge1_16x8b); |
1456 | | |
1457 | | //separating +ve and -ve values for row 2 and row 3
1458 | 472 | cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, left1_16x8b); |
1459 | 472 | cmp_lt0_16x8b = _mm_subs_epu8(left1_16x8b, src_temp1_16x8b); |
1460 | | //creating mask 00 for +ve and -ve values and FF for zero. |
1461 | 472 | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
1462 | 472 | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
1463 | | //combining the appropriate sign change |
1464 | 472 | left1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); |
1465 | | |
1466 | | |
1467 | | |
1468 | | |
1469 | | |
1470 | | //separating +ve and -ve values.
1471 | 472 | cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, left0_16x8b); |
1472 | 472 | cmp_lt0_16x8b = _mm_subs_epu8(left0_16x8b, src_temp0_16x8b); |
1473 | | //creating mask 00 for +ve and -ve values and FF for zero. |
1474 | 472 | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
1475 | 472 | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
1476 | | //combining the appropriate sign change |
1477 | 472 | left0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); |
1478 | | |
1479 | | |
1480 | | //row = 0 right |
1481 | 472 | edge0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2)); |
1482 | | // row = 1 right |
1483 | 472 | cmp_gt0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd + 2)); |
1484 | | // row = 2 right |
1485 | 472 | edge1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd + 2)); |
1486 | | // row = 3 right |
1487 | 472 | cmp_lt0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd + 2)); |
1488 | | // packing rows together for 16 SIMD operations |
1489 | 472 | edge0_16x8b = _mm_unpacklo_epi64(edge0_16x8b, cmp_gt0_16x8b); |
1490 | 472 | edge1_16x8b = _mm_unpacklo_epi64(edge1_16x8b, cmp_lt0_16x8b); |
1491 | | |
1492 | | //separating +ve and -ve values.
1493 | 472 | cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, edge0_16x8b); |
1494 | 472 | cmp_lt0_16x8b = _mm_subs_epu8(edge0_16x8b, src_temp0_16x8b); |
1495 | | //creating mask 00 for +ve and -ve values and FF for zero. |
1496 | 472 | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
1497 | 472 | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
1498 | | //combining the appropriate sign change |
1499 | 472 | edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); |
1500 | | |
1501 | 472 | cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, edge1_16x8b); |
1502 | 472 | cmp_lt0_16x8b = _mm_subs_epu8(edge1_16x8b, src_temp1_16x8b); |
1503 | | //creating mask 00 for +ve and -ve values and FF for zero. |
1504 | 472 | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
1505 | 472 | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
1506 | | //combining the appropriate sign change |
1507 | 472 | edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); |
1508 | | |
1509 | | //combining sign_left and sign_right
1510 | 472 | edge0_16x8b = _mm_add_epi8(edge0_16x8b, left0_16x8b); |
1511 | 472 | edge1_16x8b = _mm_add_epi8(edge1_16x8b, left1_16x8b); |
1512 | | //adding constant 2 |
1513 | 472 | edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b); |
1514 | 472 | edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b); |
1515 | | //shuffle to get sao index |
1516 | 472 | edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b); |
1517 | 472 | edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b); |
1518 | | //using availability mask before the offset shuffle:
1519 | | //masked lanes get index 0, which selects sao offset 0 (pixel unchanged)
1520 | 472 | edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b); |
1521 | 472 | edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b); |
1522 | | //adding chroma offset to access U and V |
1523 | 472 | edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b); |
1524 | 472 | edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b); |
1525 | | |
1526 | 472 | edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b); |
1527 | 472 | edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b); |
1528 | | //convert to 16 bit, add, then saturating pack
1529 | 472 | left0_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b); |
1530 | 472 | cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b); |
1531 | 472 | cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, left0_16x8b); |
1532 | 472 | src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b); |
1533 | 472 | edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, left0_16x8b); |
1534 | 472 | cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b); |
1535 | 472 | src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b); |
1536 | 472 | src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b); |
1537 | | |
1538 | 472 | left0_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b); |
1539 | 472 | cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b); |
1540 | 472 | cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, left0_16x8b); |
1541 | 472 | src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b); |
1542 | 472 | edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, left0_16x8b); |
1543 | 472 | cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b); |
1544 | 472 | src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge1_16x8b); |
1545 | 472 | src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b); |
1546 | | |
1547 | | //separating row 1 and row 3
1548 | 472 | cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8); |
1549 | 472 | cmp_lt0_16x8b = _mm_srli_si128(src_temp1_16x8b, 8); |
1550 | | |
1551 | 472 | _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b); |
1552 | | //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos. |
1553 | 472 | _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b); |
1554 | | // row = 1 |
1555 | 472 | _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b); |
1556 | | // row = 2 |
1557 | 472 | _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_temp1_16x8b); |
1558 | | // row = 3 |
1559 | 472 | _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b); |
1560 | | |
1561 | 472 | pu1_src_cpy += (src_strd << 2); |
1562 | 472 | pu1_src_left_cpy += 8; |
1563 | 472 | pu1_src_left_str += 8; |
1564 | 472 | } |
1565 | 228 | pu1_src += wd; |
1566 | 228 | pu1_src_left_cpy -= 2 * ht; |
1567 | 228 | pu1_src_left_str -= 2 * ht; |
1568 | | |
1569 | 228 | pu1_left_tmp = pu1_src_left_cpy; |
1570 | 228 | pu1_src_left_cpy = pu1_src_left_str; |
1571 | 228 | pu1_src_left_str = pu1_left_tmp; |
1572 | 228 | } |
1573 | 2.34M | for(row = 0; row < 2 * ht; row++) |
1574 | 2.27M | { |
1575 | 2.27M | pu1_src_left[row] = pu1_src_left_cpy[row]; |
1576 | 2.27M | } |
1577 | 71.3k | } |
1578 | | |
1579 | 71.3k | } |
1580 | | |
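/* The chroma kernels above fold both planes into one lookup.  A condensed
 * sketch of the dual-table select (variable names illustrative; u_off and
 * v_off are assumed to hold the per-plane sao offsets in their low 8 bytes;
 * uses the SSSE3 pshufb from <tmmintrin.h>): */
static inline __m128i sao_offset_uv(__m128i idx, __m128i u_off, __m128i v_off)
{
    __m128i table  = _mm_unpacklo_epi64(u_off, v_off);          /* low: U, high: V */
    __m128i idx_uv = _mm_add_epi8(idx, _mm_set1_epi16(0x0800)); /* +8 on V lanes */
    return _mm_shuffle_epi8(table, idx_uv);                     /* pshufb */
}
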
1581 | | |
1582 | | void ihevc_sao_edge_offset_class1_ssse3(UWORD8 *pu1_src, |
1583 | | WORD32 src_strd, |
1584 | | UWORD8 *pu1_src_left, |
1585 | | UWORD8 *pu1_src_top, |
1586 | | UWORD8 *pu1_src_top_left, |
1587 | | UWORD8 *pu1_src_top_right, |
1588 | | UWORD8 *pu1_src_bot_left, |
1589 | | UWORD8 *pu1_avail, |
1590 | | WORD8 *pi1_sao_offset, |
1591 | | WORD32 wd, |
1592 | | WORD32 ht) |
1593 | 70.8k | { |
1594 | 70.8k | WORD32 row, col; |
1595 | 70.8k | UWORD8 *pu1_src_top_cpy; |
1596 | 70.8k | UWORD8 *pu1_src_cpy; |
1597 | 70.8k | WORD32 wd_rem; |
1598 | | |
1599 | | |
1600 | 70.8k | __m128i src_top_16x8b, src_bottom_16x8b; |
1601 | 70.8k | __m128i src_temp0_16x8b, src_temp1_16x8b; |
1602 | 70.8k | __m128i signup0_16x8b, signdwn1_16x8b; |
1603 | 70.8k | __m128i cmp_gt0_16x8b, cmp_lt0_16x8b; |
1604 | 70.8k | __m128i edge0_16x8b, edge1_16x8b; |
1605 | 70.8k | __m128i edge_idx_8x16b, sao_offset_8x16b; |
1606 | 70.8k | __m128i const2_16x8b, const0_16x8b; |
1607 | | |
1608 | 70.8k | UNUSED(pu1_src_top_right); |
1609 | 70.8k | UNUSED(pu1_src_bot_left); |
1610 | | |
1611 | | |
1612 | | /* Updating left and top-left */ |
1613 | 2.31M | for(row = 0; row < ht; row++) |
1614 | 2.24M | { |
1615 | 2.24M | pu1_src_left[row] = pu1_src[row * src_strd + (wd - 1)]; |
1616 | 2.24M | } |
1617 | 70.8k | *pu1_src_top_left = pu1_src_top[wd - 1]; |
1618 | | |
1619 | | |
1620 | | |
1621 | 70.8k | pu1_src_top_cpy = pu1_src_top; |
1622 | 70.8k | edge_idx_8x16b = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx); |
1623 | 70.8k | sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset); |
1624 | | |
1625 | | /* Update height and source pointers based on the availability flags */ |
1626 | 70.8k | if(0 == pu1_avail[2]) |
1627 | 2.56k | { |
1628 | 2.56k | pu1_src_top_cpy = pu1_src; |
1629 | 2.56k | pu1_src += src_strd; |
1630 | 2.56k | ht--; |
1631 | 2.56k | } |
1632 | 70.8k | if(0 == pu1_avail[3]) |
1633 | 1.39k | { |
1634 | 1.39k | ht--; |
1635 | 1.39k | } |
1636 | | |
1637 | 70.8k | const2_16x8b = _mm_set1_epi8(2); |
1638 | 70.8k | const0_16x8b = _mm_setzero_si128(); |
1639 | | |
1640 | 70.8k | { |
1641 | 70.8k | WORD32 ht_rem; |
1642 | 179k | for(col = wd; col >= 16; col -= 16) |
1643 | 109k | { |
1644 | 109k | pu1_src_cpy = pu1_src; |
1645 | 109k | src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col)); |
1646 | | //row = 0 |
1647 | 109k | src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy)); |
1648 | | //separating +ve and -ve values.
1649 | 109k | cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b); |
1650 | 109k | cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b); |
1651 | | //creating mask 00 for +ve and -ve values and FF for zero. |
1652 | 109k | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
1653 | 109k | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
1654 | | //combining the appropriate sign change |
1655 | 109k | signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); |
1656 | | |
1657 | 1.83M | for(row = ht; row >= 2; row -= 2) |
1658 | 1.73M | { |
1659 | | |
1660 | | //row = 1 load 16 pixel values from 15:0 pos. relative to cur. pos.
1661 | 1.73M | src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd)); |
1662 | | // row = 2 |
1663 | 1.73M | src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd)); |
1664 | | |
1665 | | |
1666 | | //row 0 -row1 |
1667 | | //separating +ve and -ve values.
1668 | 1.73M | cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_temp1_16x8b); |
1669 | 1.73M | cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_temp0_16x8b); |
1670 | | //creating mask 00 for +ve and -ve values and FF for zero. |
1671 | 1.73M | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
1672 | 1.73M | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
1673 | | //combining the appropriate sign change |
1674 | 1.73M | edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); |
1675 | | //row1-row0 |
1676 | 1.73M | edge1_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b); |
1677 | | |
1678 | | //row1 -bottom |
1679 | 1.73M | cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b); |
1680 | 1.73M | cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b); |
1681 | | //creating mask 00 for +ve and -ve values and FF for zero. |
1682 | 1.73M | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
1683 | 1.73M | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
1684 | | //combining the appropriate sign change |
1685 | 1.73M | signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); |
1686 | | |
1687 | | //combining sign_up and sign_down
1688 | 1.73M | edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); |
1689 | 1.73M | edge1_16x8b = _mm_add_epi8(edge1_16x8b, signdwn1_16x8b); |
1690 | | |
1691 | | //for the next iteration signup0_16x8b = -signdwn1_16x8b |
1692 | 1.73M | signup0_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b); |
1693 | | //adding constant 2 |
1694 | 1.73M | edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b); |
1695 | 1.73M | edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b); |
1696 | | //shuffle to get sao index |
1697 | 1.73M | edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b); |
1698 | 1.73M | edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b); |
1699 | | //shuffle to get sao offset |
1700 | 1.73M | edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b); |
1701 | 1.73M | edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b); |
1702 | | //copying the next top |
1703 | 1.73M | src_top_16x8b = src_temp1_16x8b; |
1704 | | //convert to 16 bit, add, then saturating pack
1705 | 1.73M | signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b); |
1706 | 1.73M | cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b); |
1707 | 1.73M | cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b); |
1708 | 1.73M | src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b); |
1709 | 1.73M | cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b); |
1710 | 1.73M | cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b); |
1711 | 1.73M | src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b); |
1712 | 1.73M | src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b); |
1713 | | |
1714 | 1.73M | signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b); |
1715 | 1.73M | cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b); |
1716 | 1.73M | cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b); |
1717 | 1.73M | src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b); |
1718 | 1.73M | cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b); |
1719 | 1.73M | cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b); |
1720 | 1.73M | src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b); |
1721 | 1.73M | src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b); |
1722 | | |
1723 | | //row = 0 store 16 pixel values from 15:0 pos. relative to cur. pos.
1724 | 1.73M | _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b); |
1725 | | // row = 1 |
1726 | 1.73M | _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b); |
1727 | | |
1728 | 1.73M | src_temp0_16x8b = src_bottom_16x8b; |
1729 | 1.73M | pu1_src_cpy += (src_strd << 1); |
1730 | 1.73M | } |
1731 | 109k | ht_rem = ht & 0x1; |
1732 | | |
1733 | 109k | if(ht_rem) |
1734 | 6.13k | { |
1735 | 6.13k | src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd)); |
1736 | | //current row -next row |
1737 | | //separating +ve and -ve values.
1738 | 6.13k | cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b); |
1739 | 6.13k | cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b); |
1740 | | //creating mask 00 for +ve and -ve values and FF for zero. |
1741 | 6.13k | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
1742 | 6.13k | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
1743 | | //combining the appropriate sign change |
1744 | 6.13k | edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); |
1745 | | //adding top and bottom sign changes and constant 2
1746 | 6.13k | edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); |
1747 | 6.13k | edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b); |
1748 | | |
1749 | 6.13k | edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b); |
1750 | 6.13k | edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b); |
1751 | | //copying the next top |
1752 | 6.13k | src_top_16x8b = src_temp0_16x8b; |
1753 | | //cnvert to 16 bit then add and then saturated pack |
1754 | 6.13k | signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b); |
1755 | 6.13k | cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b); |
1756 | 6.13k | cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b); |
1757 | 6.13k | src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b); |
1758 | 6.13k | cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b); |
1759 | 6.13k | cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b); |
1760 | 6.13k | src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b); |
1761 | 6.13k | src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b); |
1762 | | |
1763 | 6.13k | _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b); |
1764 | 6.13k | } |
1765 | 109k | if(0 == pu1_avail[3]) |
1766 | 2.11k | { |
1767 | 2.11k | src_top_16x8b = src_bottom_16x8b; |
1768 | 2.11k | } |
1769 | | //updating the top boundary row
1770 | 109k | _mm_storeu_si128((__m128i *)(pu1_src_top + wd - col), src_top_16x8b); |
1771 | 109k | pu1_src += 16; |
1772 | 109k | } |
1773 | | |
1774 | 70.8k | wd_rem = wd & 0xF; |
1775 | 70.8k | if(wd_rem) |
1776 | 69.5k | { |
1777 | 69.5k | pu1_src_cpy = pu1_src; |
1778 | 69.5k | src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_top_cpy + wd - col)); |
1779 | | //row = 0 |
1780 | 69.5k | src_temp0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy)); |
1781 | | //separating +ve and -ve values.
1782 | 69.5k | cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b); |
1783 | 69.5k | cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b); |
1784 | | //creating mask 00 for +ve and -ve values and FF for zero. |
1785 | 69.5k | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
1786 | 69.5k | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
1787 | | //combining the appropriate sign change |
1788 | 69.5k | signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); |
1789 | 69.5k | signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8); |
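/* In the "(a-b)" comments below, each tag names the row difference held in
 * one half of a register: two 8-pixel rows share every 128-bit value, and
 * the slli/alignr shuffles pair each row's sign-down with the matching
 * sign-up before the edge index is formed. */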
1790 | 616k | for(row = ht; row >= 4; row -= 4) |
1791 | 547k | { |
1792 | | //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos. |
1793 | 547k | src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd)); |
1794 | | // row = 2 |
1795 | 547k | src_bottom_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd)); |
1796 | | |
1797 | | //row 0 -row1 |
1798 | | //separating +ve and -ve values.
1799 | 547k | cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_temp1_16x8b); |
1800 | 547k | cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_temp0_16x8b); |
1801 | | //creating mask 00 for +ve and -ve values and FF for zero. |
1802 | 547k | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
1803 | 547k | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
1804 | | //combining the appropriate sign change |
1805 | 547k | edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); |
1806 | | |
1807 | | //row1-row0 |
1808 | 547k | edge1_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b); |
1809 | 547k | edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1) |
1810 | 547k | signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top) |
1811 | | //row1 -row2 |
1812 | 547k | cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b); |
1813 | 547k | cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b); |
1814 | | //creating mask 00 for +ve and -ve values and FF for zero. |
1815 | 547k | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
1816 | 547k | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
1817 | | //combining the appropriate sign change |
1818 | 547k | signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2) |
1819 | 547k | edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1) |
1820 | | //packing row 0 and row 1
1821 | 547k | src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, src_temp1_16x8b); |
1822 | | //row = 3 |
1823 | 547k | src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd)); |
1824 | | // row = 4 |
1825 | 547k | src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 4 * src_strd)); |
1826 | | |
1827 | 547k | edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty |
1828 | 547k | signdwn1_16x8b = _mm_slli_si128(signdwn1_16x8b, 8); //align left (1-2)
1829 | | //separating +ve and -ve values (2,3)
1830 | 547k | cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_top_16x8b); |
1831 | 547k | cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_bottom_16x8b); |
1832 | | //creating mask 00 for +ve and -ve values and FF for zero. |
1833 | 547k | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
1834 | 547k | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
1835 | | //combining the appropriate sign change |
1836 | 547k | edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-3) |
1837 | | |
1838 | 547k | signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signdwn1_16x8b, 8); //(2-3),(1-2) (subtract with down)
1839 | 547k | edge1_16x8b = _mm_slli_si128(edge1_16x8b, 8); |
1840 | | //separating +ve and -ve values (3,4)
1841 | 547k | cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp1_16x8b); |
1842 | 547k | cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_top_16x8b); |
1843 | | //creating mask 00 for +ve and -ve values and FF for zero. |
1844 | 547k | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
1845 | 547k | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
1846 | 547k | signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-4) |
1847 | | //combining sign_up and sign_down
1848 | 547k | edge1_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge1_16x8b, 8); //(3-4),(2-3) |
1849 | | |
1850 | 547k | edge1_16x8b = _mm_sub_epi8(edge1_16x8b, signup0_16x8b); //(3,2) |
1851 | | |
1852 | | //packing row 2 and row 3
1853 | 547k | src_bottom_16x8b = _mm_unpacklo_epi64(src_bottom_16x8b, src_top_16x8b); |
1854 | | //for the next iteration signup0_16x8b = -signdwn1_16x8b |
1855 | 547k | signup0_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b); //(4-3) |
1856 | | |
1857 | | //adding constant 2 |
1858 | 547k | edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b); |
1859 | 547k | edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b); |
1860 | | //shuffle to get sao index |
1861 | 547k | edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b); |
1862 | 547k | edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b); |
1863 | | //shuffle to get sao offset |
1864 | 547k | edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b); |
1865 | 547k | edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b); |
1866 | | //the next top already in src_top_16x8b |
1867 | | //src_top_16x8b = src_temp1_16x8b; |
1868 | | //convert to 16 bit, add, then saturating pack
1869 | 547k | signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b); |
1870 | 547k | cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b); |
1871 | 547k | cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b); |
1872 | 547k | src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b); |
1873 | 547k | cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b); |
1874 | 547k | cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b); |
1875 | 547k | src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b); |
1876 | 547k | src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b); |
1877 | | |
1878 | 547k | signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b); |
1879 | 547k | cmp_gt0_16x8b = _mm_unpacklo_epi8(src_bottom_16x8b, const0_16x8b); |
1880 | 547k | cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b); |
1881 | 547k | src_bottom_16x8b = _mm_unpackhi_epi8(src_bottom_16x8b, const0_16x8b); |
1882 | 547k | cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b); |
1883 | 547k | cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b); |
1884 | 547k | src_bottom_16x8b = _mm_add_epi16(src_bottom_16x8b, cmp_lt0_16x8b); |
1885 | 547k | src_bottom_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_bottom_16x8b); |
1886 | | |
1887 | 547k | cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8); |
1888 | 547k | cmp_lt0_16x8b = _mm_srli_si128(src_bottom_16x8b, 8); |
1889 | | //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos. |
1890 | 547k | _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b); |
1891 | | // row = 1 |
1892 | 547k | _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b); |
1893 | | //row = 2 |
1894 | 547k | _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_bottom_16x8b); |
1895 | | // row = 3 |
1896 | 547k | _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b); |
1897 | | |
1898 | 547k | src_temp0_16x8b = src_temp1_16x8b; |
1899 | 547k | signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8); |
1900 | 547k | pu1_src_cpy += (src_strd << 2); |
1901 | | |
1902 | 547k | } |
1903 | 69.5k | ht_rem = ht & 0x2; |
1904 | 69.5k | if(ht_rem) |
1905 | 3.80k | { |
1906 | | |
1907 | | //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos. |
1908 | 3.80k | src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd)); |
1909 | | // row = 2 |
1910 | 3.80k | src_bottom_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd)); |
1911 | | |
1912 | | //row 0 -row1 |
1913 | | //separating +ve and -ve values.
1914 | 3.80k | cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_temp1_16x8b); |
1915 | 3.80k | cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_temp0_16x8b); |
1916 | | //creating mask 00 for +ve and -ve values and FF for zero. |
1917 | 3.80k | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
1918 | 3.80k | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
1919 | | //combining the appropriate sign change |
1920 | 3.80k | edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); |
1921 | | //row1-row0 |
1922 | 3.80k | edge1_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b); |
1923 | 3.80k | edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1) |
1924 | 3.80k | signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top) |
1925 | | //row1 -row2 |
1926 | 3.80k | cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b); |
1927 | 3.80k | cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b); |
1928 | | //creating mask 00 for +ve and -ve values and FF for zero. |
1929 | 3.80k | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
1930 | 3.80k | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
1931 | | //combining the appropriate sign change |
1932 | | //adding top and down subtraction
1933 | 3.80k | edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1) |
1934 | | //adding top and down substraction |
1935 | 3.80k | edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty |
1936 | | //for the next iteration signup0_16x8b = -signdwn1_16x8b |
1937 | 3.80k | signup0_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b); //(2-1) for next |
1938 | 3.80k | src_top_16x8b = src_temp1_16x8b; |
1939 | | //adding constant 2 |
1940 | 3.80k | edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b); |
1941 | | |
1942 | | //shuffle to get sao index |
1943 | 3.80k | edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b); |
1944 | | |
1945 | | //shuffle to get sao offset |
1946 | 3.80k | edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b); |
1947 | | |
1948 | | //the next top already in src_top_16x8b |
1949 | | //convert to 16 bit, add, then saturating pack
1950 | 3.80k | signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b); |
1951 | 3.80k | src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b); |
1952 | 3.80k | cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b); |
1953 | 3.80k | src_temp1_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b); |
1954 | 3.80k | src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b); |
1955 | 3.80k | cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b); |
1956 | 3.80k | src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b); |
1957 | 3.80k | src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, src_temp1_16x8b); |
1958 | | |
1959 | 3.80k | cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8); |
1960 | | |
1961 | | //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos. |
1962 | 3.80k | _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b); |
1963 | | // row = 1 |
1964 | 3.80k | _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b); |
1965 | 3.80k | src_temp0_16x8b = src_bottom_16x8b; |
1966 | 3.80k | pu1_src_cpy += (src_strd << 1); |
1967 | | |
1968 | 3.80k | } |
1969 | 69.5k | ht_rem = ht & 0x1; |
1970 | 69.5k | if(ht_rem) |
1971 | 3.80k | { |
1972 | | |
1973 | | //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos. |
1974 | 3.80k | src_bottom_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd)); |
1975 | | |
1976 | | //row 0 -row1 |
1977 | | //separating +ve and -ve values.
1978 | 3.80k | cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b); |
1979 | 3.80k | cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b); |
1980 | | //creating mask 00 for +ve and -ve values and FF for zero. |
1981 | 3.80k | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
1982 | 3.80k | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
1983 | | //combining the appropriate sign change |
1984 | 3.80k | edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); |
1985 | | //adding top and down subtraction
1986 | 3.80k | edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); |
1987 | | //adding constant 2 |
1988 | 3.80k | edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b); |
1989 | 3.80k | edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); |
1990 | 3.80k | edge0_16x8b = _mm_srli_si128(edge0_16x8b, 8); |
1991 | | //shuffle to get sao index |
1992 | 3.80k | edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b); |
1993 | | //shuffle to get sao offset |
1994 | 3.80k | edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b); |
1995 | 3.80k | src_top_16x8b = src_temp0_16x8b; |
1996 | | //convert to 16 bit, add, then saturating pack
1997 | 3.80k | signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b); |
1998 | 3.80k | src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b); |
1999 | 3.80k | cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b); |
2000 | 3.80k | src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b); |
2001 | 3.80k | src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, const0_16x8b); |
2002 | | //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos. |
2003 | 3.80k | _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b); |
2004 | 3.80k | pu1_src_cpy += (src_strd); |
2005 | | |
2006 | 3.80k | } |
2007 | 69.5k | if(0 == pu1_avail[3]) |
2008 | 1.35k | { |
2009 | 1.35k | src_top_16x8b = src_bottom_16x8b; |
2010 | 1.35k | } |
2011 | 69.5k | _mm_storel_epi64((__m128i *)(pu1_src_top + wd - col), src_top_16x8b); |
2012 | 69.5k | pu1_src += 8; |
2013 | 69.5k | } |
2014 | 70.8k | } |
2015 | 70.8k | } |
2016 | | |
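/* Scalar model of the class 1 (vertical) kernels above, for one pixel
 * column; the helper name and clip are illustrative, the {1, 2, 0, 3, 4}
 * remap follows the HEVC edgeIdx convention, and the row below p[ht - 1]
 * is assumed readable, as in the SIMD code: */
static void sao_class1_col_sketch(UWORD8 *p, WORD32 stride, WORD32 ht,
                                  UWORD8 top, const WORD8 *offset)
{
    static const WORD32 remap[5] = { 1, 2, 0, 3, 4 }; /* idx 2 -> 0: no change */
    WORD32 r, sign_up = (p[0] > top) - (p[0] < top);
    for(r = 0; r < ht; r++)
    {
        UWORD8 below = p[(r + 1) * stride];
        WORD32 sign_down = (p[r * stride] > below) - (p[r * stride] < below);
        WORD32 v = p[r * stride] + offset[remap[2 + sign_up + sign_down]];
        p[r * stride] = (UWORD8)(v < 0 ? 0 : (v > 255 ? 255 : v));
        sign_up = -sign_down; /* only one fresh comparison per row */
    }
}
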
2017 | | void ihevc_sao_edge_offset_class1_chroma_ssse3(UWORD8 *pu1_src, |
2018 | | WORD32 src_strd, |
2019 | | UWORD8 *pu1_src_left, |
2020 | | UWORD8 *pu1_src_top, |
2021 | | UWORD8 *pu1_src_top_left, |
2022 | | UWORD8 *pu1_src_top_right, |
2023 | | UWORD8 *pu1_src_bot_left, |
2024 | | UWORD8 *pu1_avail, |
2025 | | WORD8 *pi1_sao_offset_u, |
2026 | | WORD8 *pi1_sao_offset_v, |
2027 | | WORD32 wd, |
2028 | | WORD32 ht) |
2029 | 72.2k | { |
2030 | 72.2k | WORD32 row, col; |
2031 | 72.2k | UWORD8 *pu1_src_top_cpy; |
2032 | 72.2k | UWORD8 *pu1_src_cpy; |
2033 | 72.2k | WORD32 wd_rem; |
2034 | | |
2035 | | |
2036 | 72.2k | __m128i src_top_16x8b, src_bottom_16x8b; |
2037 | 72.2k | __m128i src_temp0_16x8b, src_temp1_16x8b; |
2038 | 72.2k | __m128i signup0_16x8b, signdwn1_16x8b; |
2039 | 72.2k | __m128i cmp_gt0_16x8b, cmp_lt0_16x8b; |
2040 | 72.2k | __m128i edge0_16x8b, edge1_16x8b; |
2041 | 72.2k | __m128i edge_idx_8x16b, sao_offset_8x16b; |
2042 | 72.2k | __m128i const2_16x8b, const0_16x8b; |
2043 | 72.2k | __m128i chroma_offset_8x16b; |
2044 | | |
2045 | 72.2k | UNUSED(pu1_src_top_right); |
2046 | 72.2k | UNUSED(pu1_src_bot_left); |
2047 | | |
2048 | | /* Updating left and top-left */
2049 | 1.22M | for(row = 0; row < ht; row++) |
2050 | 1.15M | { |
2051 | 1.15M | pu1_src_left[2 * row] = pu1_src[row * src_strd + (wd - 2)]; |
2052 | 1.15M | pu1_src_left[2 * row + 1] = pu1_src[row * src_strd + (wd - 1)]; |
2053 | 1.15M | } |
2054 | 72.2k | pu1_src_top_left[0] = pu1_src_top[wd - 2]; |
2055 | 72.2k | pu1_src_top_left[1] = pu1_src_top[wd - 1]; |
2056 | | |
2057 | | |
2058 | | |
2059 | 72.2k | pu1_src_top_cpy = pu1_src_top; |
2060 | 72.2k | edge_idx_8x16b = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx); |
2061 | 72.2k | sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_u); |
2062 | 72.2k | const0_16x8b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_v); |
2063 | 72.2k | chroma_offset_8x16b = _mm_set1_epi16(0x0800); |
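/* same per-lane +0/+8 index steering as in the class 0 chroma kernel:
 * even (U) bytes read the low half of the packed table, odd (V) bytes
 * the high half */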
2064 | | /* Update height and source pointers based on the availability flags */ |
2065 | 72.2k | if(0 == pu1_avail[2]) |
2066 | 2.56k | { |
2067 | 2.56k | pu1_src_top_cpy = pu1_src; |
2068 | 2.56k | pu1_src += src_strd; |
2069 | 2.56k | ht--; |
2070 | 2.56k | } |
2071 | 72.2k | if(0 == pu1_avail[3]) |
2072 | 860 | { |
2073 | 860 | ht--; |
2074 | 860 | } |
2075 | 72.2k | sao_offset_8x16b = _mm_unpacklo_epi64(sao_offset_8x16b, const0_16x8b); |
2076 | 72.2k | const2_16x8b = _mm_set1_epi8(2); |
2077 | 72.2k | const0_16x8b = _mm_setzero_si128(); |
2078 | | |
2079 | | |
2080 | 72.2k | { |
2081 | 72.2k | WORD32 ht_rem; |
2082 | | |
2083 | | |
2084 | | |
2085 | 217k | for(col = wd; col >= 16; col -= 16) |
2086 | 145k | { |
2087 | 145k | pu1_src_cpy = pu1_src; |
2088 | 145k | src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col)); |
2089 | | //row = 0 |
2090 | 145k | src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy)); |
2091 | | //separating +ve and -ve values.
2092 | 145k | cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b); |
2093 | 145k | cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b); |
2094 | | //creating mask 00 for +ve and -ve values and FF for zero. |
2095 | 145k | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
2096 | 145k | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
2097 | | //combining the appropriate sign change |
2098 | 145k | signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); |
2099 | | |
2100 | 1.29M | for(row = ht; row >= 2; row -= 2) |
2101 | 1.14M | { |
2102 | | |
2103 | | //row = 1 load 16 pixel values from 15:0 pos. relative to cur. pos.
2104 | 1.14M | src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd)); |
2105 | | // row = 2 |
2106 | 1.14M | src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd)); |
2107 | | |
2108 | | |
2109 | | //row 0 -row1 |
2110 | | //separating +ve and -ve values.
2111 | 1.14M | cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_temp1_16x8b); |
2112 | 1.14M | cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_temp0_16x8b); |
2113 | | //creating mask 00 for +ve and -ve values and FF for zero. |
2114 | 1.14M | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
2115 | 1.14M | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
2116 | | //combining the appropriate sign change |
2117 | 1.14M | edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); |
2118 | | //row1-row0 |
2119 | 1.14M | edge1_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b); |
2120 | | |
2121 | | //row1 -bottom |
2122 | 1.14M | cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b); |
2123 | 1.14M | cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b); |
2124 | | //creating mask 00 for +ve and -ve values and FF for zero. |
2125 | 1.14M | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
2126 | 1.14M | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
2127 | | //combining the appropriate sign change |
2128 | 1.14M | signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); |
2129 | | |
2130 | | //combining sign_up and sign_down |
2131 | 1.14M | edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); |
2132 | 1.14M | edge1_16x8b = _mm_add_epi8(edge1_16x8b, signdwn1_16x8b); |
2133 | | |
2134 | | //for the next iteration signup0_16x8b = -signdwn1_16x8b |
2135 | 1.14M | signup0_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b); |
2136 | | //adding constant 2 |
2137 | 1.14M | edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b); |
2138 | 1.14M | edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b); |
2139 | | //copying the next top |
2140 | 1.14M | src_top_16x8b = src_temp1_16x8b; |
2141 | | |
2142 | | |
2143 | | //shuffle to get sao index |
2144 | 1.14M | edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b); |
2145 | 1.14M | edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b); |
2146 | | //adding chroma offset to access U and V |
2147 | 1.14M | edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b); |
2148 | 1.14M | edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b); |
2149 | | |
2150 | | //shuffle to get sao offset |
2151 | 1.14M | edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b); |
2152 | 1.14M | edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b); |
2153 | | //convert to 16 bit, then add, then saturated pack |
2154 | 1.14M | signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b); |
2155 | 1.14M | cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b); |
2156 | 1.14M | cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b); |
2157 | 1.14M | src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b); |
2158 | 1.14M | edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b); |
2159 | 1.14M | cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b); |
2160 | 1.14M | src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b); |
2161 | 1.14M | src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b); |
2162 | | |
2163 | 1.14M | signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b); |
2164 | 1.14M | cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b); |
2165 | 1.14M | cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b); |
2166 | 1.14M | src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b); |
2167 | 1.14M | edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b); |
2168 | 1.14M | cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b); |
2169 | 1.14M | src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge1_16x8b); |
2170 | 1.14M | src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b); |
2171 | | //row = 0: store 16 pixel values relative to cur. pos. |
2172 | 1.14M | _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b); |
2173 | | // row = 1 |
2174 | 1.14M | _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b); |
2175 | | |
2176 | 1.14M | src_temp0_16x8b = src_bottom_16x8b; |
2177 | 1.14M | pu1_src_cpy += (src_strd << 1); |
2178 | 1.14M | } |
2179 | 145k | ht_rem = ht & 0x1; |
2180 | | |
2181 | 145k | if(ht_rem) |
2182 | 6.89k | { |
2183 | 6.89k | src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd)); |
2184 | | //current row -next row |
2185 | | //separating +ve and -ve values. |
2186 | 6.89k | cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b); |
2187 | 6.89k | cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b); |
2188 | | //creating mask 00 for +ve and -ve values and FF for zero. |
2189 | 6.89k | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
2190 | 6.89k | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
2191 | | //combining the appropriate sign change |
2192 | 6.89k | edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); |
2193 | | //adding top and bottom and constant 2 |
2194 | 6.89k | edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); |
2195 | 6.89k | edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b); |
2196 | | //copying the next top |
2197 | 6.89k | src_top_16x8b = src_temp0_16x8b; |
2198 | | |
2199 | 6.89k | edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b); |
2200 | | //adding chroma offset to access U and V |
2201 | 6.89k | edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b); |
2202 | 6.89k | edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b); |
2203 | | |
2204 | | //convert to 16 bit, then add, then saturated pack |
2205 | 6.89k | signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b); |
2206 | 6.89k | cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b); |
2207 | 6.89k | cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b); |
2208 | 6.89k | src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b); |
2209 | 6.89k | edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b); |
2210 | 6.89k | cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b); |
2211 | 6.89k | src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b); |
2212 | 6.89k | src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b); |
2213 | | |
2214 | 6.89k | _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b); |
2215 | 6.89k | } |
2216 | 145k | if(0 == pu1_avail[3]) |
2217 | 1.73k | { |
2218 | 1.73k | src_top_16x8b = src_bottom_16x8b; |
2219 | 1.73k | } |
2220 | | //updating the top buffer |
2221 | 145k | _mm_storeu_si128((__m128i *)(pu1_src_top + wd - col), src_top_16x8b); |
2222 | 145k | pu1_src += 16; |
2223 | 145k | } |
2224 | | |
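/*
 * Residual-width path: a (wd & 0xF) tail is 8 pixels wide, so two rows share
 * one XMM register (row n in bytes 0-7, row n+1 in bytes 8-15) and four rows
 * are filtered per iteration. A sketch of the stitching used below
 * (hypothetical names):
 *
 *     rows01 = _mm_unpacklo_epi64(row0, row1);     // pack two 8-pixel rows
 *     lo     = _mm_slli_si128(sign_0_1, 8);        // park (0-1) in high half
 *     signs  = _mm_alignr_epi8(sign_1_2, lo, 8);   // (1-2) high | (0-1) low
 *
 * so every shuffle/add afterwards works on two rows at once.
 */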
2225 | 72.2k | wd_rem = wd & 0xF; |
2226 | 72.2k | if(wd_rem) |
2227 | 2 | { |
2228 | 2 | pu1_src_cpy = pu1_src; |
2229 | 2 | src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_top_cpy + wd - col)); |
2230 | | //row = 0 |
2231 | 2 | src_temp0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy)); |
2232 | | //separating +ve and -ve values. |
2233 | 2 | cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b); |
2234 | 2 | cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b); |
2235 | | //creating mask 00 for +ve and -ve values and FF for zero. |
2236 | 2 | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
2237 | 2 | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
2238 | | //combining the appropriate sign change |
2239 | 2 | signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); |
2240 | 2 | signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8); |
2241 | 5 | for(row = ht; row >= 4; row -= 4) |
2242 | 3 | { |
2243 | | //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos. |
2244 | 3 | src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd)); |
2245 | | // row = 2 |
2246 | 3 | src_bottom_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd)); |
2247 | | |
2248 | | //row 0 -row1 |
2249 | | //separating +ve and -ve values. |
2250 | 3 | cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_temp1_16x8b); |
2251 | 3 | cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_temp0_16x8b); |
2252 | | //creating mask 00 for +ve and -ve values and FF for zero. |
2253 | 3 | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
2254 | 3 | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
2255 | | //combining the appropriate sign change |
2256 | 3 | edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); |
2257 | | |
2258 | | //row1-row0 |
2259 | 3 | edge1_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b); |
2260 | 3 | edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1) |
2261 | 3 | signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top) |
2262 | | //row1 -row2 |
2263 | 3 | cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b); |
2264 | 3 | cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b); |
2265 | | //creating mask 00 for +ve and -ve values and FF for zero. |
2266 | 3 | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
2267 | 3 | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
2268 | | //combining the appropriate sign change |
2269 | 3 | signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2) |
2270 | 3 | edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1) |
2271 | | //packing row 0 and row 1 |
2272 | 3 | src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, src_temp1_16x8b); |
2273 | | //row = 3 |
2274 | 3 | src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd)); |
2275 | | // row = 4 |
2276 | 3 | src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 4 * src_strd)); |
2277 | | |
2278 | 3 | edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty |
2279 | 3 | signdwn1_16x8b = _mm_slli_si128(signdwn1_16x8b, 8); //align left (1-2) |
2280 | | //separating +ve and -ve values.(2,3) |
2281 | 3 | cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_top_16x8b); |
2282 | 3 | cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_bottom_16x8b); |
2283 | | //creating mask 00 for +ve and -ve values and FF for zero. |
2284 | 3 | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
2285 | 3 | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
2286 | | //combining the appropriate sign change |
2287 | 3 | edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-3) |
2288 | | |
2289 | 3 | signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signdwn1_16x8b, 8); //(2-3), (1-2) (subtract with down) |
2290 | 3 | edge1_16x8b = _mm_slli_si128(edge1_16x8b, 8); |
2291 | | //separating +ve and -ve values.(3,4) |
2292 | 3 | cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp1_16x8b); |
2293 | 3 | cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_top_16x8b); |
2294 | | //creating mask 00 for +ve and -ve values and FF for zero. |
2295 | 3 | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
2296 | 3 | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
2297 | 3 | signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-4) |
2298 | | //combining sign_up and sign_down |
2299 | 3 | edge1_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge1_16x8b, 8); //(3-4),(2-3) |
2300 | | |
2301 | 3 | edge1_16x8b = _mm_sub_epi8(edge1_16x8b, signup0_16x8b); //(3,2) |
2302 | | |
2303 | | //packing row 2 and row 3 |
2304 | 3 | src_bottom_16x8b = _mm_unpacklo_epi64(src_bottom_16x8b, src_top_16x8b); |
2305 | | //for the next iteration signup0_16x8b = -signdwn1_16x8b |
2306 | 3 | signup0_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b); //(4-3) |
2307 | | //adding constant 2 |
2308 | 3 | edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b); |
2309 | 3 | edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b); |
2310 | | //shuffle to get sao index |
2311 | 3 | edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b); |
2312 | 3 | edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b); |
2313 | | //adding chroma offset to access U and V |
2314 | 3 | edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b); |
2315 | 3 | edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b); |
2316 | | |
2317 | | //shuffle to get sao offset |
2318 | 3 | edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b); |
2319 | 3 | edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b); |
2320 | | //the next top already in src_top_16x8b |
2321 | | //convert to 16 bit, then add, then saturated pack |
2322 | 3 | signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b); |
2323 | 3 | cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b); |
2324 | 3 | cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b); |
2325 | 3 | src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b); |
2326 | 3 | edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b); |
2327 | 3 | cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b); |
2328 | 3 | src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b); |
2329 | 3 | src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b); |
2330 | | |
2331 | 3 | signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b); |
2332 | 3 | cmp_gt0_16x8b = _mm_unpacklo_epi8(src_bottom_16x8b, const0_16x8b); |
2333 | 3 | cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b); |
2334 | 3 | src_bottom_16x8b = _mm_unpackhi_epi8(src_bottom_16x8b, const0_16x8b); |
2335 | 3 | edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b); |
2336 | 3 | cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b); |
2337 | 3 | src_bottom_16x8b = _mm_add_epi16(src_bottom_16x8b, edge1_16x8b); |
2338 | 3 | src_bottom_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_bottom_16x8b); |
2339 | | |
2340 | 3 | cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8); |
2341 | 3 | cmp_lt0_16x8b = _mm_srli_si128(src_bottom_16x8b, 8); |
2342 | | //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos. |
2343 | 3 | _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b); |
2344 | | // row = 1 |
2345 | 3 | _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b); |
2346 | | //row = 2 |
2347 | 3 | _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_bottom_16x8b); |
2348 | | // row = 3 |
2349 | 3 | _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b); |
2350 | | |
2351 | 3 | src_temp0_16x8b = src_temp1_16x8b; |
2352 | 3 | signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8); |
2353 | 3 | pu1_src_cpy += (src_strd << 2); |
2354 | | |
2355 | 3 | } |
2356 | 2 | ht_rem = ht & 0x2; |
2357 | 2 | if(ht_rem) |
2358 | 1 | { |
2359 | | |
2360 | | //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos. |
2361 | 1 | src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd)); |
2362 | | // row = 2 |
2363 | 1 | src_bottom_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd)); |
2364 | | |
2365 | | //row 0 -row1 |
2366 | | //separating +ve and -ve values. |
2367 | 1 | cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_temp1_16x8b); |
2368 | 1 | cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_temp0_16x8b); |
2369 | | //creating mask 00 for +ve and -ve values and FF for zero. |
2370 | 1 | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
2371 | 1 | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
2372 | | //combining the appropriate sign change |
2373 | 1 | edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); |
2374 | | //row1-row0 |
2375 | 1 | edge1_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b); |
2376 | 1 | edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1) |
2377 | 1 | signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top) |
2378 | | //row1 -row2 |
2379 | 1 | cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b); |
2380 | 1 | cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b); |
2381 | | //creating mask 00 for +ve and -ve values and FF for zero. |
2382 | 1 | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
2383 | 1 | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
2384 | | //combining the appropriate sign change |
2385 | 1 | signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2) |
2386 | 1 | edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1) |
2387 | | //adding top and down subtraction |
2388 | 1 | edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty |
2389 | | //for the next iteration signup0_16x8b = -signdwn1_16x8b |
2390 | 1 | signup0_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b); //(2-1) for next |
2391 | 1 | src_top_16x8b = src_temp1_16x8b; |
2392 | | |
2393 | | //adding constant 2 |
2394 | 1 | edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b); |
2395 | | |
2396 | | //shuffle to get sao index |
2397 | 1 | edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b); |
2398 | | |
2399 | | //adding chroma offset to access U and V |
2400 | 1 | edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b); |
2401 | | //shuffle to get sao offset |
2402 | 1 | edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b); |
2403 | | //the next top already in src_top_16x8b |
2404 | | //convert to 16 bit, then add, then saturated pack |
2405 | 1 | signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b); |
2406 | 1 | src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b); |
2407 | 1 | cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b); |
2408 | 1 | src_temp1_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b); |
2409 | 1 | edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b); |
2410 | 1 | src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b); |
2411 | 1 | src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge0_16x8b); |
2412 | 1 | src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, src_temp1_16x8b); |
2413 | | |
2414 | 1 | cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8); |
2415 | | |
2416 | | //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos. |
2417 | 1 | _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b); |
2418 | | // row = 1 |
2419 | 1 | _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b); |
2420 | 1 | src_temp0_16x8b = src_bottom_16x8b; |
2421 | 1 | pu1_src_cpy += (src_strd << 1); |
2422 | | |
2423 | 1 | } |
2424 | 2 | ht_rem = ht & 0x1; |
2425 | 2 | if(ht_rem) |
2426 | 1 | { |
2427 | | |
2428 | | //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos. |
2429 | 1 | src_bottom_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd)); |
2430 | | |
2431 | | //row 0 -row1 |
2432 | | //separating +ve and -ve values. |
2433 | 1 | cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b); |
2434 | 1 | cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b); |
2435 | | //creating mask 00 for +ve and -ve values and FF for zero. |
2436 | 1 | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
2437 | 1 | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
2438 | | //combining the appropriate sign change |
2439 | 1 | edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); |
2440 | | //adding top and down subtraction |
2441 | 1 | edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); |
2442 | | //adding constant 2 |
2443 | 1 | edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b); |
2444 | 1 | src_top_16x8b = src_temp0_16x8b; |
2445 | | |
2446 | 1 | edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); |
2447 | 1 | edge0_16x8b = _mm_srli_si128(edge0_16x8b, 8); |
2448 | | //shuffle to get sao index |
2449 | 1 | edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b); |
2450 | | //adding chroma offset to access U and V |
2451 | 1 | edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b); |
2452 | | //shuffle to get sao offset |
2453 | 1 | edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b); |
2454 | | |
2455 | | //convert to 16 bit, then add, then saturated pack |
2456 | 1 | signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b); |
2457 | 1 | src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b); |
2458 | 1 | cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b); |
2459 | 1 | src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b); |
2460 | 1 | src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, const0_16x8b); |
2461 | | //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos. |
2462 | 1 | _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b); |
2463 | 1 | pu1_src_cpy += (src_strd); |
2464 | | |
2465 | 1 | } |
2466 | 2 | if(0 == pu1_avail[3]) |
2467 | 0 | { |
2468 | 0 | src_top_16x8b = src_bottom_16x8b; |
2469 | 0 | } |
2470 | 2 | _mm_storel_epi64((__m128i *)(pu1_src_top + wd - col), src_top_16x8b); |
2471 | 2 | pu1_src += 8; |
2472 | 2 | } |
2473 | 72.2k | } |
2474 | 72.2k | } |
2475 | | |
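/*
 * Reference for the 135 degree (class 2) kernel that follows: each pixel is
 * classified against its top-left and bottom-right neighbours. A scalar
 * sketch (hypothetical helper; SIGN/CLIP3 and gi1_table_edge_idx are the
 * codec's own):
 *
 *     static UWORD8 sao_class2_pixel(const UWORD8 *p, WORD32 strd,
 *                                    const WORD8 *pi1_sao_offset)
 *     {
 *         WORD32 edge_idx = 2 + SIGN(p[0] - p[-1 - strd])
 *                             + SIGN(p[0] - p[1 + strd]);
 *         edge_idx = gi1_table_edge_idx[edge_idx]; // flat/monotonic -> 0
 *         return (UWORD8)CLIP3(p[0] + pi1_sao_offset[edge_idx], 0, 255);
 *     }
 */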
2476 | | /* 135 degree filtering */ |
2477 | | void ihevc_sao_edge_offset_class2_ssse3(UWORD8 *pu1_src, |
2478 | | WORD32 src_strd, |
2479 | | UWORD8 *pu1_src_left, |
2480 | | UWORD8 *pu1_src_top, |
2481 | | UWORD8 *pu1_src_top_left, |
2482 | | UWORD8 *pu1_src_top_right, |
2483 | | UWORD8 *pu1_src_bot_left, |
2484 | | UWORD8 *pu1_avail, |
2485 | | WORD8 *pi1_sao_offset, |
2486 | | WORD32 wd, |
2487 | | WORD32 ht) |
2488 | 56.8k | { |
2489 | 56.8k | WORD32 row, col; |
2490 | 56.8k | UWORD8 *pu1_src_top_cpy, *pu1_src_left_cpy, *pu1_src_left_cpy2; |
2491 | 56.8k | UWORD8 *pu1_left_tmp, *pu1_src_left_str, *pu1_src_left_str2; |
2492 | 56.8k | UWORD8 *pu1_firstleft; |
2493 | 56.8k | UWORD8 *pu1_src_cpy, *pu1_src_org; |
2494 | 56.8k | UWORD8 au1_mask[MAX_CTB_SIZE], *au1_mask_cpy; |
2495 | 56.8k | UWORD8 au1_src_left_tmp[MAX_CTB_SIZE + 8]; |
2496 | 56.8k | UWORD8 au1_src_left_tmp1[MAX_CTB_SIZE + 8]; |
2497 | 56.8k | WORD32 wd_rem; |
2498 | 56.8k | UWORD8 u1_pos_0_0_tmp, u1_pos_wd_ht_tmp; |
2499 | 56.8k | WORD32 ht_tmp, ht_0; |
2500 | | |
2501 | 56.8k | WORD32 bit_depth; |
2502 | 56.8k | UWORD8 u1_avail0, u1_avail1; |
2503 | | |
2504 | 56.8k | __m128i src_top_16x8b, src_bottom_16x8b; |
2505 | 56.8k | __m128i src_temp0_16x8b, src_temp1_16x8b; |
2506 | 56.8k | __m128i signup0_16x8b, signdwn1_16x8b; |
2507 | 56.8k | __m128i cmp_gt0_16x8b, cmp_lt0_16x8b; |
2508 | 56.8k | __m128i edge0_16x8b, edge1_16x8b; |
2509 | 56.8k | __m128i au1_mask8x16b; |
2510 | 56.8k | __m128i edge_idx_8x16b, sao_offset_8x16b; |
2511 | 56.8k | __m128i const2_16x8b, const0_16x8b; |
2512 | 56.8k | __m128i left_store_16x8b; |
2513 | 56.8k | UNUSED(pu1_src_top_right); |
2514 | 56.8k | UNUSED(pu1_src_bot_left); |
2515 | | |
2516 | 56.8k | ht_0 = ht; ht_tmp = ht; |
2517 | 56.8k | au1_mask8x16b = _mm_set1_epi8(0xff); |
2518 | | |
2519 | | //setting availability mask to 0xFF for MAX_CTB_SIZE bytes |
2520 | 284k | for(col = 0; col < MAX_CTB_SIZE; col += 16) |
2521 | 227k | _mm_storeu_si128((__m128i *)(au1_mask + col), au1_mask8x16b); |
2522 | 1.85M | for(row = 0; row < ht; row++) |
2523 | 1.79M | { |
2524 | 1.79M | au1_src_left_tmp[row] = pu1_src_left[row]; |
2525 | 1.79M | } |
2526 | 56.8k | bit_depth = BIT_DEPTH_LUMA; |
2527 | 56.8k | pu1_src_org = pu1_src; |
2528 | 56.8k | pu1_src_top_cpy = pu1_src_top; |
2529 | 56.8k | pu1_src_left_cpy2 = au1_src_left_tmp; |
2530 | 56.8k | pu1_src_left_cpy = au1_src_left_tmp; |
2531 | 56.8k | pu1_src_left_str2 = au1_src_left_tmp1; |
2532 | 56.8k | pu1_src_left_str = au1_src_left_tmp1; |
2533 | 56.8k | edge_idx_8x16b = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx); |
2534 | 56.8k | sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset); |
2535 | | |
2536 | | |
2537 | | /* If top-left is available, process separately */ |
2538 | 56.8k | if(0 != pu1_avail[4]) |
2539 | 53.5k | { |
2540 | 53.5k | WORD8 edge_idx; |
2541 | | |
2542 | 53.5k | edge_idx = 2 + SIGN(pu1_src[0] - pu1_src_top_left[0]) + |
2543 | 53.5k | SIGN(pu1_src[0] - pu1_src[1 + src_strd]); |
2544 | | |
2545 | 53.5k | edge_idx = gi1_table_edge_idx[edge_idx]; |
2546 | | |
2547 | 53.5k | if(0 != edge_idx) |
2548 | 15.0k | { |
2549 | 15.0k | u1_pos_0_0_tmp = CLIP3(pu1_src[0] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1); |
2550 | 15.0k | } |
2551 | 38.5k | else |
2552 | 38.5k | { |
2553 | 38.5k | u1_pos_0_0_tmp = pu1_src[0]; |
2554 | 38.5k | } |
2555 | 53.5k | } |
2556 | 3.30k | else |
2557 | 3.30k | { |
2558 | 3.30k | u1_pos_0_0_tmp = pu1_src[0]; |
2559 | 3.30k | } |
2560 | | |
2561 | | /* If bottom-right is available, process separately */ |
2562 | 56.8k | if(0 != pu1_avail[7]) |
2563 | 54.6k | { |
2564 | 54.6k | WORD8 edge_idx; |
2565 | | |
2566 | 54.6k | edge_idx = 2 + SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd - 1 - src_strd]) + |
2567 | 54.6k | SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 1 + src_strd]); |
2568 | | |
2569 | 54.6k | edge_idx = gi1_table_edge_idx[edge_idx]; |
2570 | | |
2571 | 54.6k | if(0 != edge_idx) |
2572 | 16.2k | { |
2573 | 16.2k | u1_pos_wd_ht_tmp = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1); |
2574 | 16.2k | } |
2575 | 38.3k | else |
2576 | 38.3k | { |
2577 | 38.3k | u1_pos_wd_ht_tmp = pu1_src[wd - 1 + (ht - 1) * src_strd]; |
2578 | 38.3k | } |
2579 | 54.6k | } |
2580 | 2.21k | else |
2581 | 2.21k | { |
2582 | 2.21k | u1_pos_wd_ht_tmp = pu1_src[wd - 1 + (ht - 1) * src_strd]; |
2583 | 2.21k | } |
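/*
 * The two diagonal corners (0,0) and (wd-1,ht-1) are the only pixels whose
 * 135 degree neighbours depend on the corner availability flags
 * (pu1_avail[4]/pu1_avail[7]), so they are classified here in scalar code
 * and the results parked in u1_pos_0_0_tmp/u1_pos_wd_ht_tmp, to be written
 * back after the vector loops, which would otherwise clobber their
 * unfiltered inputs.
 */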
2584 | 56.8k | pu1_firstleft = pu1_src_top_left; |
2585 | | |
2586 | | /* Update height and source pointers based on the availability flags */ |
2587 | 56.8k | if(0 == pu1_avail[2]) |
2588 | 2.64k | { |
2589 | 2.64k | pu1_firstleft = pu1_src_left_cpy2; |
2590 | 2.64k | pu1_src_left_cpy2++; |
2591 | 2.64k | pu1_src_left_str2++; |
2592 | 2.64k | pu1_src_top_cpy = pu1_src; |
2593 | 2.64k | pu1_src += src_strd; |
2594 | 2.64k | ht--; |
2595 | 2.64k | } |
2596 | 56.8k | if(0 == pu1_avail[3]) |
2597 | 1.20k | { |
2598 | 1.20k | ht--; |
2599 | 1.20k | ht_0--; |
2600 | 1.20k | } |
2601 | | //storing top left in an xmm register |
2602 | 56.8k | left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_firstleft); |
2603 | 56.8k | const2_16x8b = _mm_set1_epi8(2); |
2604 | 56.8k | const0_16x8b = _mm_setzero_si128(); |
2605 | 56.8k | left_store_16x8b = _mm_slli_si128(left_store_16x8b, 15); |
2606 | | //update top-left |
2607 | 56.8k | *pu1_src_top_left = pu1_src_top[wd - 1]; |
2608 | | //availability mask creation |
2609 | 56.8k | u1_avail0 = pu1_avail[0]; |
2610 | 56.8k | u1_avail1 = pu1_avail[1]; |
2611 | 56.8k | au1_mask[0] = u1_avail0; |
2612 | 56.8k | au1_mask[wd - 1] = u1_avail1; |
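/*
 * Availability masking: au1_mask holds 0xFF for filterable columns and the
 * pu1_avail byte (0 when the neighbouring CTB is missing) at the two edges.
 * AND-ing the remapped edge index with this mask forces it to 0, and table
 * entry 0 carries a zero offset, so edge pixels pass through unchanged.
 * Sketch of the step used inside the loops below:
 *
 *     edge = _mm_and_si128(edge, au1_mask8x16b);        // 0 if unavailable
 *     edge = _mm_shuffle_epi8(sao_offset_8x16b, edge);  // -> offset 0
 */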
2613 | 56.8k | { |
2614 | 56.8k | WORD32 ht_rem; |
2615 | | |
2616 | | |
2617 | 56.8k | pu1_src_left_cpy = pu1_src_left_cpy2; |
2618 | 56.8k | pu1_src_left_str = pu1_src_left_str2; |
2619 | 56.8k | au1_mask_cpy = au1_mask; |
2620 | 143k | for(col = wd; col >= 16; col -= 16) |
2621 | 87.1k | { |
2622 | 87.1k | pu1_src_cpy = pu1_src; |
2623 | 87.1k | src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col)); |
2624 | | //row = 0 |
2625 | 87.1k | src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy)); |
2626 | 87.1k | src_top_16x8b = _mm_alignr_epi8(src_top_16x8b, left_store_16x8b, 15); |
2627 | | //loading the mask |
2628 | 87.1k | au1_mask8x16b = _mm_loadu_si128((__m128i *)au1_mask_cpy); |
2629 | | //separating +ve and -ve values. |
2630 | 87.1k | cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b); |
2631 | 87.1k | cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b); |
2632 | | //creating mask 00 for +ve and -ve values and FF for zero. |
2633 | 87.1k | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
2634 | 87.1k | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
2635 | | //combining the appropriate sign change |
2636 | 87.1k | signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); |
2637 | | |
2638 | | |
2639 | 1.46M | for(row = ht; row >= 2; row -= 2) |
2640 | 1.37M | { |
2641 | 1.37M | left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy); |
2642 | | //row = 1 |
2643 | 1.37M | src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd)); |
2644 | | // row = 1 right |
2645 | 1.37M | src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 1)); |
2646 | | //to insert left in row 0 |
2647 | 1.37M | signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 15); |
2648 | | //row 0 -row1 |
2649 | | //separating +ve and -ve values. |
2650 | 1.37M | cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b); |
2651 | 1.37M | cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b); |
2652 | | |
2653 | | //creating mask 00 for +ve and -ve values and FF for zero. |
2654 | 1.37M | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
2655 | 1.37M | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
2656 | | //manipulation for row 1 - row 0 |
2657 | 1.37M | signdwn1_16x8b = _mm_alignr_epi8(src_temp0_16x8b, signdwn1_16x8b, 15); |
2658 | | //combining the appropriate sign change |
2659 | 1.37M | edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(0-1) |
2660 | | //row1-row0 |
2661 | | //separating +ve and -ve values. |
2662 | 1.37M | cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b); |
2663 | 1.37M | cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b); |
2664 | | //creating mask 00 for +ve and -ve values and FF for zero. |
2665 | 1.37M | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
2666 | 1.37M | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
2667 | | // row = 2 right |
2668 | 1.37M | src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd + 1)); |
2669 | 1.37M | edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-0) |
2670 | | |
2671 | | |
2672 | | //row1 -bottom |
2673 | 1.37M | cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b); |
2674 | 1.37M | cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b); |
2675 | | //creating mask 00 for +ve and -ve values and FF for zero. |
2676 | 1.37M | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
2677 | 1.37M | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
2678 | | //combining the appropriate sign change |
2679 | 1.37M | signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); |
2680 | | // row = 2 |
2681 | 1.37M | src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd)); |
2682 | | |
2683 | | //combining sign_up and sign_down |
2684 | 1.37M | edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); |
2685 | | |
2686 | | //storing the row 1 left for next row. |
2687 | 1.37M | signup0_16x8b = _mm_slli_si128(left_store_16x8b, 14); |
2688 | | |
2689 | | //combining sign_up and sign_down |
2690 | 1.37M | edge1_16x8b = _mm_add_epi8(edge1_16x8b, signdwn1_16x8b); |
2691 | | //manipulation for bottom - row 1 |
2692 | 1.37M | signup0_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signup0_16x8b, 15); |
2693 | | //eliminating old left for row 0 and row 1 |
2694 | 1.37M | left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2); |
2695 | | //bottom - row1 |
2696 | 1.37M | cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signup0_16x8b); |
2697 | 1.37M | cmp_lt0_16x8b = _mm_subs_epu8(signup0_16x8b, src_bottom_16x8b); |
2698 | | //creating mask 00 for +ve and -ve values and FF for zero. |
2699 | 1.37M | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
2700 | 1.37M | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
2701 | | //for the next iteration bottom -row1 |
2702 | 1.37M | signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); |
2703 | | //saving row 1 right edge as left for the next block |
2704 | 1.37M | left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp1_16x8b, 15); |
2705 | | //adding constant 2 |
2706 | 1.37M | edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b); |
2707 | 1.37M | edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b); |
2708 | | //shuffle to get sao index |
2709 | 1.37M | edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b); |
2710 | 1.37M | edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b); |
2711 | | //using availability mask |
2712 | 1.37M | edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b); |
2713 | 1.37M | edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b); |
2714 | | //shuffle to get sao offset |
2715 | 1.37M | edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b); |
2716 | 1.37M | edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b); |
2717 | | //saving row 0 right edge as left for the next block |
2718 | 1.37M | left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15); |
2719 | | //copying the next top |
2720 | 1.37M | src_top_16x8b = src_temp1_16x8b; |
2721 | | //convert to 16 bit, then add, then saturated pack |
2722 | 1.37M | signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b); |
2723 | 1.37M | cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b); |
2724 | 1.37M | cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b); |
2725 | 1.37M | src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b); |
2726 | 1.37M | cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b); |
2727 | 1.37M | cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b); |
2728 | 1.37M | src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b); |
2729 | 1.37M | src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b); |
2730 | | |
2731 | 1.37M | signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b); |
2732 | 1.37M | cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b); |
2733 | 1.37M | cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b); |
2734 | 1.37M | src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b); |
2735 | 1.37M | cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b); |
2736 | 1.37M | cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b); |
2737 | 1.37M | src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b); |
2738 | 1.37M | src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b); |
2739 | | |
2740 | | //store left boundary |
2741 | 1.37M | _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b); |
2742 | | //row = 0: store 16 pixel values relative to cur. pos. |
2743 | 1.37M | _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b); |
2744 | | // row = 1 |
2745 | 1.37M | _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b); |
2746 | | |
2747 | 1.37M | src_temp0_16x8b = src_bottom_16x8b; |
2748 | 1.37M | pu1_src_cpy += (src_strd << 1); |
2749 | 1.37M | pu1_src_left_cpy += 2; |
2750 | 1.37M | pu1_src_left_str += 2; |
2751 | 1.37M | } |
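/*
 * Note on the loop above: the bottom-right neighbour comes straight from an
 * unaligned load at (pu1_src_cpy + src_strd + 1), while the top-left
 * neighbour is synthesised by splicing the saved left-column byte in front
 * of the row, e.g. _mm_alignr_epi8(row, left_store, 15) -- a one-byte right
 * shift across the 16-pixel strip without reading outside the block.
 */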
2752 | 87.1k | ht_rem = ht & 0x1; |
2753 | | |
2754 | 87.1k | if(ht_rem) |
2755 | 5.71k | { |
2756 | 5.71k | left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy); |
2757 | 5.71k | src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 1)); |
2758 | | //current row -next row |
2759 | | //separating +ve and -ve values. |
2760 | 5.71k | cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b); |
2761 | 5.71k | cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b); |
2762 | | //creating mask 00 for +ve and -ve values and FF for zero. |
2763 | 5.71k | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
2764 | 5.71k | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
2765 | | //combining the appropriate sign change |
2766 | 5.71k | edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); |
2767 | | //adding top and bottom and constant 2 |
2768 | 5.71k | edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); |
2769 | 5.71k | edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b); |
2770 | | //eliminating old left for row 0 |
2771 | 5.71k | left_store_16x8b = _mm_srli_si128(left_store_16x8b, 1); |
2772 | | |
2773 | 5.71k | edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b); |
2774 | | //using availability mask |
2775 | 5.71k | edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b); |
2776 | | |
2777 | 5.71k | edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b); |
2778 | | |
2779 | | //saving row 0 right edge as left for the next block |
2780 | 5.71k | left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15); |
2781 | | //copying the next top |
2782 | 5.71k | src_top_16x8b = src_temp0_16x8b; |
2783 | | //convert to 16 bit, then add, then saturated pack |
2784 | 5.71k | signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b); |
2785 | 5.71k | cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b); |
2786 | 5.71k | cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b); |
2787 | 5.71k | src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b); |
2788 | 5.71k | cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b); |
2789 | 5.71k | cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b); |
2790 | 5.71k | src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b); |
2791 | 5.71k | src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b); |
2792 | | //store left boundary |
2793 | 5.71k | _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b); |
2794 | | |
2795 | 5.71k | _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b); |
2796 | 5.71k | pu1_src_cpy += (src_strd); |
2797 | 5.71k | pu1_src_left_cpy += 1; |
2798 | 5.71k | pu1_src_left_str += 1; |
2799 | 5.71k | } |
2800 | 87.1k | if(0 == pu1_avail[3]) |
2801 | 1.79k | { |
2802 | 1.79k | src_top_16x8b = src_bottom_16x8b; |
2803 | 1.79k | pu1_src_left_str[0] = pu1_src_cpy[15]; |
2804 | 1.79k | } |
2805 | 87.1k | if(0 == pu1_avail[2]) |
2806 | 3.92k | { |
2807 | 3.92k | pu1_src_left_str[-ht_0] = pu1_src[15 - src_strd]; |
2808 | 3.92k | } |
2809 | | |
2810 | | //for the top left of next part of the block |
2811 | 87.1k | left_store_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col)); |
2812 | | //updating the top buffer |
2813 | 87.1k | _mm_storeu_si128((__m128i *)(pu1_src_top + wd - col), src_top_16x8b); |
2814 | 87.1k | pu1_src += 16; |
2815 | 87.1k | au1_mask_cpy += 16; |
2816 | | |
2817 | | |
2818 | 87.1k | pu1_left_tmp = pu1_src_left_cpy2; |
2819 | 87.1k | pu1_src_left_cpy2 = pu1_src_left_str2; |
2820 | 87.1k | pu1_src_left_str2 = pu1_left_tmp; |
2821 | | |
2822 | 87.1k | pu1_src_left_cpy = pu1_src_left_cpy2; |
2823 | 87.1k | pu1_src_left_str = pu1_src_left_str2; |
2824 | 87.1k | } |
2825 | | |
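/*
 * Left-column ping-pong: for the 135 degree pattern the next 16-column strip
 * needs the *unfiltered* right column of this strip as its left neighbours,
 * and those bytes are destroyed once the strip is stored. au1_src_left_tmp
 * and au1_src_left_tmp1 therefore alternate roles each pass: one is read
 * (pu1_src_left_cpy2) while the other collects the new column
 * (pu1_src_left_str2), and the pointers swap at the end of the column loop.
 */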
2826 | 56.8k | wd_rem = wd & 0xF; |
2827 | 56.8k | if(wd_rem) |
2828 | 55.8k | { |
2829 | 55.8k | pu1_src_left_cpy = pu1_src_left_cpy2; |
2830 | 55.8k | pu1_src_left_str = pu1_src_left_str2; |
2831 | 55.8k | pu1_src_cpy = pu1_src; |
2832 | 55.8k | src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_top_cpy + wd - col)); |
2833 | | //row = 0 |
2834 | 55.8k | src_temp0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy)); |
2835 | 55.8k | src_top_16x8b = _mm_alignr_epi8(src_top_16x8b, left_store_16x8b, 15); |
2836 | 55.8k | au1_mask8x16b = _mm_loadl_epi64((__m128i *)au1_mask_cpy); //load availability mask for the remaining 8 columns |
2837 | | //separating +ve and -ve values. |
2838 | 55.8k | cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b); |
2839 | 55.8k | cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b); |
2840 | | //creating mask 00 for +ve and -ve values and FF for zero. |
2841 | 55.8k | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
2842 | 55.8k | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
2843 | | //preparing au1_mask |
2844 | 55.8k | au1_mask8x16b = _mm_unpacklo_epi64(au1_mask8x16b, au1_mask8x16b); |
2845 | | //combining the appropriate sign change |
2846 | 55.8k | signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); |
2847 | 55.8k | signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8); |
2848 | | |
2849 | 493k | for(row = ht; row >= 4; row -= 4) |
2850 | 437k | { |
2851 | 437k | left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy); |
2852 | | //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos. |
2853 | 437k | src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd)); |
2854 | | // row = 2 |
2855 | 437k | src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd)); |
2856 | | //right row1 |
2857 | 437k | signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 1); |
2858 | | //row 0 -row1 |
2859 | | //separating +ve and -ve values. |
2860 | 437k | cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b); |
2861 | 437k | cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b); |
2862 | | //manipulation for row 1 -row 0 |
2863 | 437k | signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 15); |
2864 | | //creating mask 00 for +ve and -ve values and FF for zero. |
2865 | 437k | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
2866 | 437k | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
2867 | | //row 0 left |
2868 | 437k | signdwn1_16x8b = _mm_alignr_epi8(src_temp0_16x8b, signdwn1_16x8b, 15); |
2869 | | //combining the appropriate sign change |
2870 | 437k | edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); |
2871 | | //row 1 -row0 |
2872 | | //separating +ve and -ve values. |
2873 | 437k | cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b); |
2874 | 437k | cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b); |
2875 | | |
2876 | | //creating mask 00 for +ve and -ve values and FF for zero. |
2877 | 437k | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
2878 | 437k | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
2879 | | //row1-row0 |
2880 | 437k | edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); |
2881 | | |
2882 | 437k | edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1) |
2883 | | |
2884 | 437k | signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top) |
2885 | | //right row2 |
2886 | 437k | signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 1); |
2887 | | //packing row 0 and row 1 |
2888 | 437k | src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, src_temp1_16x8b); |
2889 | | //row1 -row2 |
2890 | 437k | cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b); |
2891 | 437k | cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b); |
2892 | | //creating mask 00 for +ve and -ve values and FF for zero. |
2893 | 437k | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
2894 | 437k | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
2895 | | //combining the appropriate sign change |
2896 | 437k | signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2) |
2897 | 437k | edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1) |
2898 | | //manipulation for row 2 -row 1 |
2899 | 437k | signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 14); |
2900 | | //row 1 left |
2901 | 437k | signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 15); |
2902 | | //row = 3 |
2903 | 437k | src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 3 * src_strd)); |
2904 | | |
2905 | | // row = 4 |
2906 | 437k | src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 4 * src_strd)); |
2907 | | |
2908 | 437k | edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty |
2909 | | |
2910 | | //separating +ve and -ve values.(2,1) |
2911 | 437k | cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b); |
2912 | 437k | cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b); |
2913 | | //manipulation for row 3 -row 2 |
2914 | 437k | signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 13); |
2915 | | //creating mask 00 for +ve and -ve values and FF for zero. |
2916 | 437k | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
2917 | 437k | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
2918 | | //row 2 left |
2919 | 437k | signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 15); |
2920 | | //combining the appropriate sign change |
2921 | 437k | signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1) |
2922 | | |
2923 | | //separating +ve and -ve values.(3,2) |
2924 | 437k | cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b); |
2925 | 437k | cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b); |
2926 | 437k | signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8); //aligned left (2-1) |
2927 | | //creating mask 00 for +ve and -ve values and FF for zero. |
2928 | 437k | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
2929 | 437k | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
2930 | | //right row3 |
2931 | 437k | signdwn1_16x8b = _mm_srli_si128(src_top_16x8b, 1); |
2932 | | //combining the appropriate sign change |
2933 | 437k | edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-2) |
2934 | | |
2935 | 437k | signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(3-2) ,(2-1) |
2936 | | |
2937 | | //separating +ve and -ve values.(2,3) |
2938 | 437k | cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b); |
2939 | 437k | cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b); |
2940 | | //right row 4 |
2941 | 437k | signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 1); |
2942 | | //creating mask 00 for +ve and -ve values and FF for zero. |
2943 | 437k | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
2944 | 437k | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
2945 | | //combining the appropriate sign change |
2946 | 437k | edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-3) |
2947 | | |
2948 | | //separating +ve and -ve values.(3,bottom) |
2949 | 437k | cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b); |
2950 | 437k | cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b); |
2951 | | |
2952 | | //creating mask 00 for +ve and -ve values and FF for zero. |
2953 | 437k | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
2954 | 437k | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
2955 | 437k | edge1_16x8b = _mm_slli_si128(edge1_16x8b, 8); //aligned left (2-3) |
2956 | | //combining the appropriate sign change |
2957 | 437k | signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-bottom) |
2958 | 437k | edge1_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge1_16x8b, 8); //(3-bottom),(2-3) |
2959 | | |
2960 | | //manipulation for bottom -row 3 |
2961 | 437k | signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 12); |
2962 | | //eliminating old left for row 0,1,2,3 |
2963 | 437k | left_store_16x8b = _mm_srli_si128(left_store_16x8b, 4); |
2964 | | //packing row 2 and row 3 |
2965 | 437k | src_bottom_16x8b = _mm_unpacklo_epi64(src_bottom_16x8b, src_top_16x8b); |
2966 | | //row 3 left |
2967 | 437k | signdwn1_16x8b = _mm_alignr_epi8(src_top_16x8b, signdwn1_16x8b, 15); |
2968 | | //loading row 3 right into left |
2969 | 437k | left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_bottom_16x8b, 15); |
2970 | | //adding bottom and top values of row 2 and row 3 |
2971 | 437k | edge1_16x8b = _mm_add_epi8(edge1_16x8b, signup0_16x8b); //(3,2) |
2972 | | //separating +ve and -ve values.(bottom,3) |
2973 | 437k | cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b); |
2974 | 437k | cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b); |
2975 | | //to store right of row 2 |
2976 | 437k | signdwn1_16x8b = _mm_slli_si128(src_bottom_16x8b, 8); |
2977 | | //creating mask 00 for +ve and -ve values and FF for zero. |
2978 | 437k | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
2979 | 437k | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
2980 | 437k | signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(bottom -3) for next iteration |
2981 | | |
2982 | | //storing right of row 2 into left |
2983 | 437k | left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15); |
2984 | | //to store right of row 0 |
2985 | 437k | signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8); |
2986 | | //storing right of row 1 into left |
2987 | 437k | left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15); |
2988 | | |
2989 | | //adding constant 2 |
2990 | 437k | edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b); |
2991 | 437k | edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b); |
2992 | | //shuffle to get sao index |
2993 | 437k | edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b); |
2994 | 437k | edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b); |
2995 | | //using availability mask |
2996 | 437k | edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b); |
2997 | 437k | edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b); |
2998 | | //shuffle to get sao offset |
2999 | 437k | edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b); |
3000 | 437k | edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b); |
3001 | | |
3002 | | //storing right of row 0 into left |
3003 | 437k | left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15); |
3004 | | //convert to 16 bit, then add, then saturated pack |
3005 | 437k | signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b); |
3006 | 437k | cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b); |
3007 | 437k | cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b); |
3008 | 437k | src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b); |
3009 | 437k | cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b); |
3010 | 437k | cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b); |
3011 | 437k | src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b); |
3012 | 437k | src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b); |
3013 | | |
3014 | 437k | signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b); |
3015 | 437k | cmp_gt0_16x8b = _mm_unpacklo_epi8(src_bottom_16x8b, const0_16x8b); |
3016 | 437k | cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b); |
3017 | 437k | src_bottom_16x8b = _mm_unpackhi_epi8(src_bottom_16x8b, const0_16x8b); |
3018 | 437k | cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b); |
3019 | 437k | cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b); |
3020 | 437k | src_bottom_16x8b = _mm_add_epi16(src_bottom_16x8b, cmp_lt0_16x8b); |
3021 | 437k | src_bottom_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_bottom_16x8b); |
3022 | | |
3023 | 437k | cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8); |
3024 | 437k | cmp_lt0_16x8b = _mm_srli_si128(src_bottom_16x8b, 8); |
3025 | | |
3026 | 437k | _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b); |
3027 | | //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos. |
3028 | 437k | _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b); |
3029 | | // row = 1 |
3030 | 437k | _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b); |
3031 | | //row = 2 |
3032 | 437k | _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_bottom_16x8b); |
3033 | | // row = 3 |
3034 | 437k | _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b); |
3035 | | |
3036 | 437k | src_temp0_16x8b = src_temp1_16x8b; |
3037 | 437k | signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8); |
3038 | 437k | pu1_src_cpy += (src_strd << 2); |
3039 | 437k | pu1_src_left_cpy += 4; |
3040 | 437k | pu1_src_left_str += 4; |
3041 | 437k | } |
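/*
 * The 4-row loop above keeps two rows' sign vectors in one register:
 * _mm_slli_si128(x, 8) parks a finished 8-byte sign vector in the high half
 * and _mm_alignr_epi8(hi, lo, 8) splices two of them together, so a single
 * _mm_add_epi8 accumulates sign_up + sign_down for both rows at once. The
 * 2-row and 1-row tails below unroll the same idea for the leftover rows.
 */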
3042 | 55.8k | ht_rem = ht & 0x2; |
3043 | 55.8k | if(ht_rem) |
3044 | 3.79k | { |
3045 | 3.79k | left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy); |
3046 | | //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos. |
3047 | 3.79k | src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd)); |
3048 | | // row = 2 |
3049 | 3.79k | src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd)); |
3050 | | |
3051 | | //row 0 -row 1 |
3052 | 3.79k | signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 1); |
3053 | | //separating +ve and -ve values. |
3054 | 3.79k | cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b); |
3055 | 3.79k | cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b); |
3056 | | //manipulation for row 1 -row 0 |
3057 | 3.79k | signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 15); |
3058 | | //creating mask 00 for +ve and -ve values and FF for zero. |
3059 | 3.79k | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
3060 | 3.79k | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
3061 | | //manipulation for row 1 - row 0 |
3062 | 3.79k | signdwn1_16x8b = _mm_alignr_epi8(src_temp0_16x8b, signdwn1_16x8b, 15); |
3063 | | //combining the appropriate sign change |
3064 | 3.79k | edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); |
3065 | | |
3066 | | //row1-row0 |
3067 | | //separating +ve and -ve values. |
3068 | 3.79k | cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b); |
3069 | 3.79k | cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b); |
3070 | | |
3071 | | //creating mask 00 for +ve and -ve values and FF for zero. |
3072 | 3.79k | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
3073 | 3.79k | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
3074 | | //combining the appropriate sign change |
3075 | 3.79k | edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); |
3076 | | //row 1 -bottom |
3077 | 3.79k | signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 1); |
3078 | | |
3079 | 3.79k | edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1) |
3080 | 3.79k | signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top) |
3081 | | //row1 -bottom |
3082 | 3.79k | cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b); |
3083 | 3.79k | cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b); |
3084 | | |
3085 | | //creating mask 00 for +ve and -ve values and FF for zero. |
3086 | 3.79k | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
3087 | 3.79k | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
3088 | | //combining the appropriate sign change |
3089 | 3.79k | signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2) |
3090 | 3.79k | edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1) |
3091 | | //manipulation for bottom -row1 |
3092 | 3.79k | signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 14); |
3093 | | //manipulation for bottom- row 1 |
3094 | 3.79k | signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 15); |
3095 | | //adding top and down subtraction |
3096 | 3.79k | edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty |
3097 | | //bottom - row 1 |
3098 | 3.79k | cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b); |
3099 | 3.79k | cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b); |
3100 | | |
3101 | | //eliminating old left for row 0,1 |
3102 | 3.79k | left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2); |
3103 | 3.79k | signdwn1_16x8b = _mm_slli_si128(src_temp1_16x8b, 8); |
3104 | | //creating mask 00 for +ve and -ve values and FF for zero. |
3105 | 3.79k | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
3106 | 3.79k | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
3107 | | //for the next iteration signup0_16x8b |
3108 | 3.79k | signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1) for next |
3109 | | |
3110 | | //storing right of row 1 into left |
3111 | 3.79k | left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15); |
3112 | | //for storing right of row 0 |
3113 | 3.79k | signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8); |
3114 | | |
3115 | 3.79k | src_top_16x8b = src_temp1_16x8b; |
3116 | | //storing right of row 0 into left |
3117 | 3.79k | left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15); |
3118 | | |
3119 | | //adding constant 2 |
3120 | 3.79k | edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b); |
3121 | | |
3122 | | //shuffle to get sao index |
3123 | 3.79k | edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b); |
3124 | | //using availability mask |
3125 | 3.79k | edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b); |
3126 | | //shuffle to get sao offset |
3127 | 3.79k | edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b); |
3128 | | |
3129 | | //the next top already in src_top_16x8b |
3130 | | //convert to 16 bit, then add, then saturated pack |
3131 | 3.79k | signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b); |
3132 | 3.79k | src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b); |
3133 | 3.79k | cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b); |
3134 | 3.79k | src_temp1_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b); |
3135 | 3.79k | src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b); |
3136 | 3.79k | cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b); |
3137 | 3.79k | src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b); |
3138 | 3.79k | src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, src_temp1_16x8b); |
3139 | | |
3140 | 3.79k | cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8); |
3141 | | |
3142 | 3.79k | _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b); |
3143 | | //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos. |
3144 | 3.79k | _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b); |
3145 | | // row = 1 |
3146 | 3.79k | _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b); |
3147 | 3.79k | src_temp0_16x8b = src_bottom_16x8b; |
3148 | 3.79k | pu1_src_cpy += (src_strd << 1); |
3149 | 3.79k | pu1_src_left_cpy += 2; |
3150 | 3.79k | pu1_src_left_str += 2; |
3151 | 3.79k | } |
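The "manipulation for bottom/left" steps above splice the saved left-column byte in front of a row without a second memory load. A minimal sketch of the idiom under SSSE3 (splice_left is a hypothetical helper name, not from the library):

#include <tmmintrin.h>

/* Build { left_pel, row[0..14] } from row[0..15] plus one carried-in byte:
 * park left_pel at byte 15 of a scratch register, then _mm_alignr_epi8 by 15
 * emits that byte followed by the low 15 bytes of the row. */
static __m128i splice_left(__m128i row, unsigned char left_pel)
{
    __m128i carry = _mm_slli_si128(_mm_cvtsi32_si128(left_pel), 15);
    return _mm_alignr_epi8(row, carry, 15);
}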
3152 | 55.8k | ht_rem = ht & 0x1; |
3153 | 55.8k | if(ht_rem) |
3154 | 3.79k | { |
3155 | 3.79k | left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy); |
3156 | | //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos. |
3157 | 3.79k | src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd)); |
3158 | | //left store manipulation 1 |
3159 | 3.79k | left_store_16x8b = _mm_srli_si128(left_store_16x8b, 1); |
3160 | | //row 0 - row 1
3161 | 3.79k | signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 1);
3162 | | //separating +ve and -ve values.
3163 | 3.79k | cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b); |
3164 | 3.79k | cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b); |
3165 | | //creating mask 00 for +ve and -ve values and FF for zero. |
3166 | 3.79k | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
3167 | 3.79k | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
3168 | | //combining the appropriate sign change |
3169 | 3.79k | edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); |
3170 | | //adding top and down subtraction
3171 | 3.79k | edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); |
3172 | | //for row 0 right to put into left store |
3173 | 3.79k | signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8); |
3174 | | //adding constant 2 |
3175 | 3.79k | edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b); |
3176 | 3.79k | edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); |
3177 | 3.79k | edge0_16x8b = _mm_srli_si128(edge0_16x8b, 8); |
3178 | | //filling the left boundary value |
3179 | 3.79k | left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15); |
3180 | | |
3181 | | //shuffle to get sao index |
3182 | 3.79k | edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b); |
3183 | | //using availability mask |
3184 | 3.79k | edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b); |
3185 | | //shuffle to get sao offset |
3186 | 3.79k | edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b); |
3187 | 3.79k | src_top_16x8b = src_temp0_16x8b; |
3188 | | //convert to 16 bit, then add, then saturating pack
3189 | 3.79k | signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b); |
3190 | 3.79k | src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b); |
3191 | 3.79k | cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b); |
3192 | 3.79k | src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b); |
3193 | 3.79k | src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, const0_16x8b); |
3194 | | |
3195 | 3.79k | _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b); |
3196 | | //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos. |
3197 | 3.79k | _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b); |
3198 | 3.79k | pu1_src_cpy += (src_strd); |
3199 | 3.79k | pu1_src_left_cpy += 1; |
3200 | 3.79k | pu1_src_left_str += 1; |
3201 | 3.79k | } |
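The back-to-back byte shifts at source lines 3176-3177 clear lanes 8..15 so that only the eight valid pixels contribute to the offset shuffle. The same idiom as a standalone sketch (illustrative helper name):

#include <emmintrin.h>

/* Zero the upper eight bytes of a vector: shift the low half up, then back
 * down. Used when only 8 of the 16 lanes hold valid edge indices. */
static __m128i keep_low8(__m128i v)
{
    return _mm_srli_si128(_mm_slli_si128(v, 8), 8);
}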
3202 | 55.8k | if(0 == pu1_avail[3]) |
3203 | 1.19k | { |
3204 | 1.19k | src_top_16x8b = src_bottom_16x8b; |
3205 | 1.19k | pu1_src_left_str[0] = pu1_src_cpy[7]; |
3206 | 1.19k | } |
3207 | | |
3208 | 55.8k | if(0 == pu1_avail[2]) |
3209 | 2.60k | { |
3210 | 2.60k | pu1_src_left_str[-ht_0] = pu1_src[7 - src_strd]; |
3211 | 2.60k | } |
3212 | | |
3213 | 55.8k | _mm_storel_epi64((__m128i *)(pu1_src_top + wd - col), src_top_16x8b); |
3214 | 55.8k | pu1_src += 8; |
3215 | 55.8k | au1_mask_cpy += 16; |
3216 | | |
3217 | 55.8k | pu1_left_tmp = pu1_src_left_cpy2; |
3218 | 55.8k | pu1_src_left_cpy2 = pu1_src_left_str2; |
3219 | 55.8k | pu1_src_left_str2 = pu1_left_tmp; |
3220 | | |
3221 | 55.8k | pu1_src_left_cpy = pu1_src_left_cpy2; |
3222 | 55.8k | pu1_src_left_str = pu1_src_left_str2; |
3223 | 55.8k | } |
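pu1_src_left_cpy2 and pu1_src_left_str2 above implement a two-buffer ping-pong for the left column: each vertical strip reads the previous strip's right edge from one scratch array, writes its own right edge into the other, and the pointers swap. A schematic sketch only, assuming a MAX_CTB_SIZE of 64 as in HEVC:

#include <string.h>

void filter_strips(int n_strips)
{
    unsigned char buf_a[64 + 8], buf_b[64 + 8];  /* like au1_src_left_tmp / _tmp1 */
    unsigned char *rd = buf_a, *wr = buf_b, *tmp;
    int s;

    memset(buf_a, 0, sizeof(buf_a));
    for(s = 0; s < n_strips; s++)
    {
        /* ... filter one strip: read left neighbours from rd[],
         *     store the strip's rightmost column into wr[] ... */
        tmp = rd; rd = wr; wr = tmp;  /* this strip's output feeds the next */
    }
}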
3224 | 56.8k | pu1_src_org[0] = u1_pos_0_0_tmp; |
3225 | 56.8k | pu1_src_org[wd - 1 + (ht_tmp - 1) * src_strd] = u1_pos_wd_ht_tmp; |
3226 | 56.8k | pu1_src_left_cpy = (0 == pu1_avail[2]) ? (pu1_src_left_cpy - 1) : pu1_src_left_cpy; |
3227 | 1.85M | for(row = 0; row < ht_tmp; row++) |
3228 | 1.79M | { |
3229 | 1.79M | pu1_src_left[row] = pu1_src_left_cpy[row]; |
3230 | 1.79M | } |
3231 | 56.8k | } |
3232 | | |
3233 | 56.8k | } |
3234 | | |
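All of these kernels derive per-pixel signs with the same three-instruction idiom: pixels are unsigned bytes, so SIGN(a - b) is built from two saturating subtractions instead of a signed compare (_mm_subs_epu8 saturates at zero, so the result is zero exactly when a <= b). A minimal standalone sketch with illustrative names, not from the library:

#include <stdio.h>
#include <tmmintrin.h>

int main(void)
{
    unsigned char a[16] = { 5, 9, 7 }, b[16] = { 9, 5, 7 };
    __m128i va   = _mm_loadu_si128((const __m128i *)a);
    __m128i vb   = _mm_loadu_si128((const __m128i *)b);
    __m128i zero = _mm_setzero_si128();
    __m128i gt   = _mm_cmpeq_epi8(_mm_subs_epu8(va, vb), zero); /* FF iff a <= b */
    __m128i lt   = _mm_cmpeq_epi8(_mm_subs_epu8(vb, va), zero); /* FF iff a >= b */
    __m128i sign = _mm_sub_epi8(gt, lt); /* per byte: +1 if a > b, -1 if a < b, 0 if equal */
    signed char out[16];
    _mm_storeu_si128((__m128i *)out, sign);
    printf("%d %d %d\n", out[0], out[1], out[2]); /* prints: -1 1 0 */
    return 0;
}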
3235 | | /* 135 degree filtering */ |
3236 | | void ihevc_sao_edge_offset_class2_chroma_ssse3(UWORD8 *pu1_src, |
3237 | | WORD32 src_strd, |
3238 | | UWORD8 *pu1_src_left, |
3239 | | UWORD8 *pu1_src_top, |
3240 | | UWORD8 *pu1_src_top_left, |
3241 | | UWORD8 *pu1_src_top_right, |
3242 | | UWORD8 *pu1_src_bot_left, |
3243 | | UWORD8 *pu1_avail, |
3244 | | WORD8 *pi1_sao_offset_u, |
3245 | | WORD8 *pi1_sao_offset_v, |
3246 | | WORD32 wd, |
3247 | | WORD32 ht) |
3248 | 63.1k | { |
3249 | 63.1k | WORD32 row, col; |
3250 | 63.1k | UWORD8 *pu1_src_top_cpy, *pu1_src_left_cpy, *pu1_src_left_cpy2; |
3251 | 63.1k | UWORD8 *pu1_left_tmp, *pu1_src_left_str, *pu1_src_left_str2; |
3252 | 63.1k | UWORD8 *pu1_firstleft; |
3253 | 63.1k | UWORD8 *pu1_src_cpy, *pu1_src_org; |
3254 | 63.1k | UWORD8 au1_mask[MAX_CTB_SIZE], *au1_mask_cpy; |
3255 | 63.1k | UWORD8 au1_src_left_tmp[2 * (MAX_CTB_SIZE + 8)]; |
3256 | 63.1k | UWORD8 au1_src_left_tmp1[2 * (MAX_CTB_SIZE + 8)]; |
3257 | 63.1k | WORD32 wd_rem; |
3258 | 63.1k | UWORD8 u1_pos_0_0_tmp_u, u1_pos_0_0_tmp_v, u1_pos_wd_ht_tmp_u, u1_pos_wd_ht_tmp_v; |
3259 | 63.1k | WORD32 ht_tmp; |
3260 | 63.1k | WORD32 ht_0; |
3261 | | |
3262 | 63.1k | WORD32 bit_depth; |
3263 | 63.1k | UWORD8 u1_avail0, u1_avail1; |
3264 | | |
3265 | 63.1k | __m128i src_temp0_16x8b, src_temp1_16x8b; |
3266 | 63.1k | __m128i signup0_16x8b, signdwn1_16x8b; |
3267 | 63.1k | __m128i cmp_gt0_16x8b, cmp_lt0_16x8b; |
3268 | 63.1k | __m128i edge0_16x8b, edge1_16x8b; |
3269 | 63.1k | __m128i src_top_16x8b, src_bottom_16x8b; |
3270 | 63.1k | __m128i au1_mask8x16b; |
3271 | 63.1k | __m128i edge_idx_8x16b, sao_offset_8x16b; |
3272 | 63.1k | __m128i const2_16x8b, const0_16x8b; |
3273 | 63.1k | __m128i left_store_16x8b; |
3274 | 63.1k | __m128i chroma_offset_8x16b; |
3275 | | |
3276 | 63.1k | UNUSED(pu1_src_top_right); |
3277 | 63.1k | UNUSED(pu1_src_bot_left); |
3278 | | |
3279 | 63.1k | ht_0 = ht; ht_tmp = ht; |
3280 | 63.1k | au1_mask8x16b = _mm_set1_epi8(0xff); |
3281 | | /* Updating left and top-left */ |
3282 | 2.06M | for(row = 0; row < 2 * ht; row++) |
3283 | 2.00M | { |
3284 | 2.00M | au1_src_left_tmp[row] = pu1_src_left[row]; |
3285 | 2.00M | } |
3286 | | //setting the availability mask to 0xFF for all MAX_CTB_SIZE bytes
3287 | 315k | for(col = 0; col < MAX_CTB_SIZE; col += 16) |
3288 | 252k | _mm_storeu_si128((__m128i *)(au1_mask + col), au1_mask8x16b); |
3289 | 63.1k | bit_depth = BIT_DEPTH_LUMA; |
3290 | 63.1k | pu1_src_org = pu1_src; |
3291 | 63.1k | pu1_src_top_cpy = pu1_src_top; |
3292 | 63.1k | pu1_src_left_cpy2 = au1_src_left_tmp; |
3293 | 63.1k | pu1_src_left_cpy = au1_src_left_tmp; |
3294 | 63.1k | pu1_src_left_str2 = au1_src_left_tmp1; |
3295 | 63.1k | pu1_src_left_str = au1_src_left_tmp1; |
3296 | 63.1k | edge_idx_8x16b = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx); |
3297 | 63.1k | sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_u);
3298 | 63.1k | const0_16x8b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_v); //V offsets parked here until the unpack below
3299 | 63.1k | chroma_offset_8x16b = _mm_set1_epi16(0x0800); //+8 bias on V bytes to index the V half of the offset table
3300 | | |
3301 | | /* If top-left is available, process separately */ |
3302 | 63.1k | if(0 != pu1_avail[4]) |
3303 | 59.5k | { |
3304 | 59.5k | WORD32 edge_idx; |
3305 | | |
3306 | | /* U */ |
3307 | 59.5k | edge_idx = 2 + SIGN(pu1_src[0] - pu1_src_top_left[0]) + |
3308 | 59.5k | SIGN(pu1_src[0] - pu1_src[2 + src_strd]); |
3309 | | |
3310 | 59.5k | edge_idx = gi1_table_edge_idx[edge_idx]; |
3311 | | |
3312 | 59.5k | if(0 != edge_idx) |
3313 | 14.5k | { |
3314 | 14.5k | u1_pos_0_0_tmp_u = CLIP3(pu1_src[0] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1); |
3315 | 14.5k | } |
3316 | 45.0k | else |
3317 | 45.0k | { |
3318 | 45.0k | u1_pos_0_0_tmp_u = pu1_src[0]; |
3319 | 45.0k | } |
3320 | | |
3321 | | /* V */ |
3322 | 59.5k | edge_idx = 2 + SIGN(pu1_src[1] - pu1_src_top_left[1]) + |
3323 | 59.5k | SIGN(pu1_src[1] - pu1_src[1 + 2 + src_strd]); |
3324 | | |
3325 | 59.5k | edge_idx = gi1_table_edge_idx[edge_idx]; |
3326 | | |
3327 | 59.5k | if(0 != edge_idx) |
3328 | 13.1k | { |
3329 | 13.1k | u1_pos_0_0_tmp_v = CLIP3(pu1_src[1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1); |
3330 | 13.1k | } |
3331 | 46.3k | else |
3332 | 46.3k | { |
3333 | 46.3k | u1_pos_0_0_tmp_v = pu1_src[1]; |
3334 | 46.3k | } |
3335 | 59.5k | } |
3336 | 3.64k | else |
3337 | 3.64k | { |
3338 | 3.64k | u1_pos_0_0_tmp_u = pu1_src[0]; |
3339 | 3.64k | u1_pos_0_0_tmp_v = pu1_src[1]; |
3340 | 3.64k | } |
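The corner handling above reduces to the scalar pattern below; sign3/clip3 stand in for the library's SIGN/CLIP3 macros, and the remap table is assumed to match gi1_table_edge_idx ({1, 2, 0, 3, 4} over the valid indices). Names are illustrative, not the library's:

static int sign3(int x)                 { return (x > 0) - (x < 0); }
static int clip3(int x, int lo, int hi) { return x < lo ? lo : (x > hi ? hi : x); }

/* Classify one pixel against its two diagonal neighbours and apply the
 * table offset. */
static unsigned char sao_corner_pixel(unsigned char cur,
                                      unsigned char nbr_a,       /* e.g. top-left */
                                      unsigned char nbr_b,       /* e.g. bottom-right */
                                      const signed char *offset, /* 5 entries */
                                      int bit_depth)
{
    static const signed char remap[5] = { 1, 2, 0, 3, 4 };       /* assumed table */
    int edge_idx = remap[2 + sign3(cur - nbr_a) + sign3(cur - nbr_b)];
    if(edge_idx == 0)
        return cur;                      /* category 0: pixel unchanged */
    return (unsigned char)clip3(cur + offset[edge_idx], 0, (1 << bit_depth) - 1);
}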
3341 | | |
3342 | | /* If bottom-right is available, process separately */ |
3343 | 63.1k | if(0 != pu1_avail[7]) |
3344 | 60.6k | { |
3345 | 60.6k | WORD32 edge_idx; |
3346 | | |
3347 | | /* U */ |
3348 | 60.6k | edge_idx = 2 + SIGN(pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 2 + (ht - 1) * src_strd - 2 - src_strd]) + |
3349 | 60.6k | SIGN(pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 2 + (ht - 1) * src_strd + 2 + src_strd]); |
3350 | | |
3351 | 60.6k | edge_idx = gi1_table_edge_idx[edge_idx]; |
3352 | | |
3353 | 60.6k | if(0 != edge_idx) |
3354 | 12.9k | { |
3355 | 12.9k | u1_pos_wd_ht_tmp_u = CLIP3(pu1_src[wd - 2 + (ht - 1) * src_strd] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1); |
3356 | 12.9k | } |
3357 | 47.7k | else |
3358 | 47.7k | { |
3359 | 47.7k | u1_pos_wd_ht_tmp_u = pu1_src[wd - 2 + (ht - 1) * src_strd]; |
3360 | 47.7k | } |
3361 | | |
3362 | | /* V */ |
3363 | 60.6k | edge_idx = 2 + SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd - 2 - src_strd]) + |
3364 | 60.6k | SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 2 + src_strd]); |
3365 | | |
3366 | 60.6k | edge_idx = gi1_table_edge_idx[edge_idx]; |
3367 | | |
3368 | 60.6k | if(0 != edge_idx) |
3369 | 15.6k | { |
3370 | 15.6k | u1_pos_wd_ht_tmp_v = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1); |
3371 | 15.6k | } |
3372 | 44.9k | else |
3373 | 44.9k | { |
3374 | 44.9k | u1_pos_wd_ht_tmp_v = pu1_src[wd - 1 + (ht - 1) * src_strd]; |
3375 | 44.9k | } |
3376 | 60.6k | } |
3377 | 2.50k | else |
3378 | 2.50k | { |
3379 | 2.50k | u1_pos_wd_ht_tmp_u = pu1_src[wd - 2 + (ht - 1) * src_strd]; |
3380 | 2.50k | u1_pos_wd_ht_tmp_v = pu1_src[wd - 1 + (ht - 1) * src_strd]; |
3381 | 2.50k | } |
3382 | 63.1k | pu1_firstleft = pu1_src_top_left; |
3383 | | |
3384 | | /* Update height and source pointers based on the availability flags */ |
3385 | 63.1k | if(0 == pu1_avail[2]) |
3386 | 1.69k | { |
3387 | 1.69k | pu1_firstleft = pu1_src_left_cpy2; |
3388 | 1.69k | pu1_src_left_cpy2 += 2; |
3389 | 1.69k | pu1_src_left_str2 += 2; |
3390 | 1.69k | pu1_src_top_cpy = pu1_src; |
3391 | 1.69k | pu1_src += src_strd; |
3392 | 1.69k | ht--; |
3393 | 1.69k | } |
3394 | 63.1k | if(0 == pu1_avail[3]) |
3395 | 1.52k | { |
3396 | 1.52k | ht--; |
3397 | 1.52k | ht_0--; |
3398 | 1.52k | } |
3399 | | //storing top left in a mmx register |
3400 | 63.1k | left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_firstleft); |
3401 | 63.1k | sao_offset_8x16b = _mm_unpacklo_epi64(sao_offset_8x16b, const0_16x8b); //U table in bytes 0..7, V table in bytes 8..15
3402 | 63.1k | const2_16x8b = _mm_set1_epi8(2); |
3403 | 63.1k | const0_16x8b = _mm_setzero_si128(); |
3404 | 63.1k | left_store_16x8b = _mm_slli_si128(left_store_16x8b, 14); |
3405 | | |
3406 | | //availability mask creation |
3407 | 63.1k | u1_avail0 = pu1_avail[0]; |
3408 | 63.1k | u1_avail1 = pu1_avail[1]; |
3409 | 63.1k | au1_mask[0] = u1_avail0; |
3410 | 63.1k | au1_mask[1] = u1_avail0; |
3411 | 63.1k | au1_mask[wd - 1] = u1_avail1; |
3412 | 63.1k | au1_mask[wd - 2] = u1_avail1; |
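The border masking via au1_mask needs no branch because entry 0 of the offset table is zero (offsets occupy entries 1..4, assuming the usual SAO table layout): ANDing an unavailable lane's edge index with 0x00 degrades to "add 0". The per-byte logic in scalar form, as a sketch:

/* What _mm_and_si128 with au1_mask achieves per byte. */
static int apply_masked_offset(int pel, int edge_idx,
                               const signed char offset[5], int avail)
{
    edge_idx &= avail ? 0xFF : 0x00;
    return pel + offset[edge_idx];   /* adds 0 when the lane is masked out */
}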
3413 | | |
3414 | | /* top-left arrays */ |
3415 | 63.1k | pu1_src_top_left[0] = pu1_src_top[wd - 2]; |
3416 | 63.1k | pu1_src_top_left[1] = pu1_src_top[wd - 1]; |
3417 | 63.1k | { |
3418 | 63.1k | WORD32 ht_rem; |
3419 | 63.1k | au1_mask_cpy = au1_mask; |
3420 | | |
3421 | 63.1k | pu1_src_left_cpy = pu1_src_left_cpy2; |
3422 | 63.1k | pu1_src_left_str = pu1_src_left_str2; |
3423 | 191k | for(col = wd; col >= 16; col -= 16) |
3424 | 128k | { |
3425 | 128k | pu1_src_cpy = pu1_src; |
3426 | 128k | src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col)); |
3427 | | //row = 0 |
3428 | 128k | src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy)); |
3429 | 128k | src_top_16x8b = _mm_alignr_epi8(src_top_16x8b, left_store_16x8b, 14); |
3430 | | //loading the mask |
3431 | 128k | au1_mask8x16b = _mm_loadu_si128((__m128i *)au1_mask_cpy); |
3432 | | //separating +ve and -ve values.
3433 | 128k | cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b); |
3434 | 128k | cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b); |
3435 | | //creating mask 00 for +ve and -ve values and FF for zero. |
3436 | 128k | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
3437 | 128k | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
3438 | | //combining the appropriate sign change |
3439 | 128k | signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); |
3440 | | |
3441 | | |
3442 | 1.13M | for(row = ht; row >= 2; row -= 2) |
3443 | 1.00M | { |
3444 | 1.00M | left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy); |
3445 | | //row = 1 |
3446 | 1.00M | src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd)); |
3447 | | // row = 1 right |
3448 | 1.00M | src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 2)); |
3449 | | //to insert left in row 0 |
3450 | 1.00M | signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 14); |
3451 | | //row 0 - row 1
3452 | | //separating +ve and -ve values.
3453 | 1.00M | cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b); |
3454 | 1.00M | cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b); |
3455 | | |
3456 | | //creating mask 00 for +ve and -ve values and FF for zero. |
3457 | 1.00M | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
3458 | 1.00M | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
3459 | | //manipulation for row 1 - row 0 |
3460 | 1.00M | signdwn1_16x8b = _mm_alignr_epi8(src_temp0_16x8b, signdwn1_16x8b, 14); |
3461 | | //combining the appropriate sign change |
3462 | 1.00M | edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(0-1) |
3463 | | //row 1 - row 0
3464 | | //separating +ve and -ve values.
3465 | 1.00M | cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b); |
3466 | 1.00M | cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b); |
3467 | | //creating mask 00 for +ve and -ve values and FF for zero. |
3468 | 1.00M | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
3469 | 1.00M | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
3470 | | // row = 2 right |
3471 | 1.00M | src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd + 2)); |
3472 | 1.00M | edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-0) |
3473 | | |
3474 | | |
3475 | | //row1 -bottom |
3476 | 1.00M | cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b); |
3477 | 1.00M | cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b); |
3478 | | //creating mask 00 for +ve and -ve values and FF for zero. |
3479 | 1.00M | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
3480 | 1.00M | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
3481 | | //combining the appropriate sign change |
3482 | 1.00M | signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); |
3483 | | // row = 2 |
3484 | 1.00M | src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd)); |
3485 | | |
3486 | | //combining sign-left and sign_right |
3487 | 1.00M | edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); |
3488 | | |
3489 | | //storing the row 1 left for next row. |
3490 | 1.00M | signup0_16x8b = _mm_slli_si128(left_store_16x8b, 12); |
3491 | | |
3492 | | //combining sign-left and sign_right |
3493 | 1.00M | edge1_16x8b = _mm_add_epi8(edge1_16x8b, signdwn1_16x8b); |
3494 | | //manipulation for bottom - row 1 |
3495 | 1.00M | signup0_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signup0_16x8b, 14); |
3496 | | //eliminating old left for row 0 and row 1 |
3497 | 1.00M | left_store_16x8b = _mm_srli_si128(left_store_16x8b, 4); |
3498 | | //bottom - row1 |
3499 | 1.00M | cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signup0_16x8b); |
3500 | 1.00M | cmp_lt0_16x8b = _mm_subs_epu8(signup0_16x8b, src_bottom_16x8b); |
3501 | | //creating mask 00 for +ve and -ve values and FF for zero. |
3502 | 1.00M | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
3503 | 1.00M | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
3504 | | //for the next iteration bottom -row1 |
3505 | 1.00M | signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); |
3506 | | //row1 getting it right for left of next iteration |
3507 | 1.00M | left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp1_16x8b, 14); |
3508 | | //copying the next top |
3509 | 1.00M | src_top_16x8b = src_temp1_16x8b; |
3510 | | //row0 getting its right for left of next iteration. |
3511 | 1.00M | left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14); |
3512 | | |
3513 | | |
3514 | | //adding constant 2 |
3515 | 1.00M | edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b); |
3516 | 1.00M | edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b); |
3517 | | //shuffle to get sao index |
3518 | 1.00M | edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b); |
3519 | 1.00M | edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b); |
3520 | | //using availability mask |
3521 | 1.00M | edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b); |
3522 | 1.00M | edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b); |
3523 | | //adding chroma offset to access U and V |
3524 | 1.00M | edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b); |
3525 | 1.00M | edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b); |
3526 | | |
3527 | | |
3528 | | //shuffle to get sao offset |
3529 | 1.00M | edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b); |
3530 | 1.00M | edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b); |
3531 | | //convert to 16 bit, then add, then saturating pack
3532 | 1.00M | signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b); |
3533 | 1.00M | cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b); |
3534 | 1.00M | cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b); |
3535 | 1.00M | src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b); |
3536 | 1.00M | edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b); |
3537 | 1.00M | cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b); |
3538 | 1.00M | src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b); |
3539 | 1.00M | src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b); |
3540 | | |
3541 | 1.00M | signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b); |
3542 | 1.00M | cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b); |
3543 | 1.00M | cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b); |
3544 | 1.00M | src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b); |
3545 | 1.00M | edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b); |
3546 | 1.00M | cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b); |
3547 | 1.00M | src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge1_16x8b); |
3548 | 1.00M | src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b); |
3549 | | |
3550 | | //store left boundary |
3551 | 1.00M | _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b); |
3552 | | //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos. |
3553 | 1.00M | _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b); |
3554 | | // row = 1 |
3555 | 1.00M | _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b); |
3556 | | |
3557 | 1.00M | src_temp0_16x8b = src_bottom_16x8b; |
3558 | 1.00M | pu1_src_cpy += (src_strd << 1); |
3559 | 1.00M | pu1_src_left_cpy += 4; |
3560 | 1.00M | pu1_src_left_str += 4; |
3561 | 1.00M | } |
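Every store path above ends with the same widen/add/pack tail, which performs the 0..255 clip for free. A standalone sketch of that tail (hypothetical helper name):

#include <emmintrin.h>

/* Add signed 8-bit offsets to unsigned 8-bit pixels with saturation:
 * zero-extend the pixels, sign-extend the offsets, add in 16 bit, then
 * repack with unsigned saturation. */
static __m128i add_offsets_sat(__m128i pix_u8, __m128i off_s8)
{
    __m128i zero     = _mm_setzero_si128();
    __m128i off_sign = _mm_cmpgt_epi8(zero, off_s8);    /* FF where offset < 0 */
    __m128i pix_lo   = _mm_unpacklo_epi8(pix_u8, zero);
    __m128i pix_hi   = _mm_unpackhi_epi8(pix_u8, zero);
    __m128i off_lo   = _mm_unpacklo_epi8(off_s8, off_sign);
    __m128i off_hi   = _mm_unpackhi_epi8(off_s8, off_sign);
    pix_lo = _mm_add_epi16(pix_lo, off_lo);
    pix_hi = _mm_add_epi16(pix_hi, off_hi);
    return _mm_packus_epi16(pix_lo, pix_hi);            /* clips to 0..255 */
}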
3562 | 128k | ht_rem = ht & 0x1; |
3563 | | |
3564 | 128k | if(ht_rem) |
3565 | 6.47k | { |
3566 | 6.47k | left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy); |
3567 | 6.47k | src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 2)); |
3568 | | //current row - next row
3569 | | //separating +ve and -ve values.
3570 | 6.47k | cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b); |
3571 | 6.47k | cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b); |
3572 | | //creating mask 00 for +ve and -ve values and FF for zero. |
3573 | 6.47k | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
3574 | 6.47k | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
3575 | | //combining the appropriate sign change |
3576 | 6.47k | edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); |
3577 | | //adding top and bottom and constant 2
3578 | 6.47k | edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); |
3579 | 6.47k | edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b); |
3580 | | |
3581 | | //eliminating old left for row 0 and row 1 |
3582 | 6.47k | left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2); |
3583 | | //copying the next top |
3584 | 6.47k | src_top_16x8b = src_temp0_16x8b; |
3585 | | //row0 getting it right for left of next block |
3586 | 6.47k | left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14); |
3587 | | |
3588 | 6.47k | edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b); |
3589 | | //using availability mask |
3590 | 6.47k | edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b); |
3591 | | //adding chroma offset to access U and V |
3592 | 6.47k | edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b); |
3593 | | |
3594 | 6.47k | edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b); |
3595 | | |
3596 | | //convert to 16 bit, then add, then saturating pack
3597 | 6.47k | signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b); |
3598 | 6.47k | cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b); |
3599 | 6.47k | cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b); |
3600 | 6.47k | src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b); |
3601 | 6.47k | edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b); |
3602 | 6.47k | cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b); |
3603 | 6.47k | src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b); |
3604 | 6.47k | src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b); |
3605 | | |
3606 | 6.47k | _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b); |
3607 | | |
3608 | 6.47k | _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b); |
3609 | 6.47k | pu1_src_cpy += (src_strd); |
3610 | 6.47k | pu1_src_left_cpy += 2; |
3611 | 6.47k | pu1_src_left_str += 2; |
3612 | 6.47k | } |
3613 | 128k | if(0 == pu1_avail[3]) |
3614 | 3.04k | { |
3615 | 3.04k | src_top_16x8b = src_bottom_16x8b; |
3616 | 3.04k | pu1_src_left_str[1] = pu1_src_cpy[15]; |
3617 | 3.04k | pu1_src_left_str[0] = pu1_src_cpy[14]; |
3618 | 3.04k | } |
3619 | 128k | if(0 == pu1_avail[2]) |
3620 | 3.43k | { |
3621 | 3.43k | pu1_src_left_str[-2 * ht_0] = pu1_src[14 - src_strd]; |
3622 | 3.43k | pu1_src_left_str[-2 * ht_0 + 1] = pu1_src[15 - src_strd]; |
3623 | 3.43k | } |
3624 | | |
3625 | | //for the top left of next part of the block |
3626 | 128k | left_store_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col)); |
3627 | | //updating top flag |
3628 | 128k | _mm_storeu_si128((__m128i *)(pu1_src_top + wd - col), src_top_16x8b); |
3629 | 128k | pu1_src += 16; |
3630 | 128k | au1_mask_cpy += 16; |
3631 | | |
3632 | 128k | pu1_left_tmp = pu1_src_left_cpy2; |
3633 | 128k | pu1_src_left_cpy2 = pu1_src_left_str2; |
3634 | 128k | pu1_src_left_str2 = pu1_left_tmp; |
3635 | | |
3636 | 128k | pu1_src_left_cpy = pu1_src_left_cpy2; |
3637 | 128k | pu1_src_left_str = pu1_src_left_str2; |
3638 | 128k | } |
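The interleaved-UV handling above hinges on one register holding both offset tables and the 0x0800 bias selecting between them per byte. A sketch with a hypothetical helper:

#include <tmmintrin.h>

/* The U table sits in bytes 0..7, the V table in bytes 8..15. Pixels are
 * interleaved U,V,U,V..., so biasing every odd (V) index byte by 8 makes a
 * single pshufb read from the V half for V samples. */
static __m128i chroma_offset_lookup(__m128i edge_idx_uv,  /* per-byte 0..4 */
                                    __m128i off_u,        /* table in low 8 bytes */
                                    __m128i off_v)        /* table in low 8 bytes */
{
    __m128i table  = _mm_unpacklo_epi64(off_u, off_v);
    __m128i v_bias = _mm_set1_epi16(0x0800);  /* bytes: 0x00, 0x08, repeated */
    return _mm_shuffle_epi8(table, _mm_add_epi8(edge_idx_uv, v_bias));
}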
3639 | 63.1k | wd_rem = wd & 0xF; |
3640 | 63.1k | if(wd_rem) |
3641 | 6 | { |
3642 | 6 | pu1_src_left_cpy = pu1_src_left_cpy2; |
3643 | 6 | pu1_src_left_str = pu1_src_left_str2; |
3644 | 6 | pu1_src_cpy = pu1_src; |
3645 | 6 | src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_top_cpy + wd - col)); |
3646 | | //row = 0 |
3647 | 6 | src_temp0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy)); |
3648 | 6 | src_top_16x8b = _mm_alignr_epi8(src_top_16x8b, left_store_16x8b, 14); |
3649 | 6 | au1_mask8x16b = _mm_loadl_epi64((__m128i *)au1_mask_cpy); //load lower 8 bytes of the availability mask
3650 | | //separating +ve and -ve values.
3651 | 6 | cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b); |
3652 | 6 | cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b); |
3653 | | //creating mask 00 for +ve and -ve values and FF for zero. |
3654 | 6 | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
3655 | 6 | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
3656 | | //preparing au1_mask |
3657 | 6 | au1_mask8x16b = _mm_unpacklo_epi64(au1_mask8x16b, au1_mask8x16b); |
3658 | | //combining the appropriate sign change |
3659 | 6 | signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); |
3660 | 6 | signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8); |
3661 | | |
3662 | 10 | for(row = ht; row >= 4; row -= 4) |
3663 | 4 | { |
3664 | 4 | left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy); |
3665 | | //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos. |
3666 | 4 | src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd)); |
3667 | | // row = 2 |
3668 | 4 | src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd)); |
3669 | | //right row1 |
3670 | 4 | signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2); |
3671 | | //row 0 - row 1
3672 | | //separating +ve and -ve values.
3673 | 4 | cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b); |
3674 | 4 | cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b); |
3675 | | //manipulation for row 1 -row 0 |
3676 | 4 | signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 14); |
3677 | | //creating mask 00 for +ve and -ve values and FF for zero. |
3678 | 4 | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
3679 | 4 | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
3680 | | //row 0 left |
3681 | 4 | signdwn1_16x8b = _mm_alignr_epi8(src_temp0_16x8b, signdwn1_16x8b, 14); |
3682 | | //combining the appropriate sign change |
3683 | 4 | edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); |
3684 | | //row 1 - row 0
3685 | | //separating +ve and -ve values.
3686 | 4 | cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b); |
3687 | 4 | cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b); |
3688 | | |
3689 | | //creating mask 00 for +ve and -ve values and FF for zero. |
3690 | 4 | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
3691 | 4 | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
3692 | | //row1-row0 |
3693 | 4 | edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); |
3694 | | |
3695 | 4 | edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1) |
3696 | | |
3697 | 4 | signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top) |
3698 | | //right row2 |
3699 | 4 | signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 2); |
3700 | | //packing row 0 n row 1 |
3701 | 4 | src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, src_temp1_16x8b); |
3702 | | //row1 -row2 |
3703 | 4 | cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b); |
3704 | 4 | cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b); |
3705 | | //creating mask 00 for +ve and -ve values and FF for zero. |
3706 | 4 | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
3707 | 4 | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
3708 | | //combining the appropriate sign change |
3709 | 4 | signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2) |
3710 | 4 | edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1) |
3711 | | //manipulation for row 2 -row 1 |
3712 | 4 | signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 12); |
3713 | | //row 1 left |
3714 | 4 | signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 14); |
3715 | | //row = 3 |
3716 | 4 | src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 3 * src_strd)); |
3717 | | |
3718 | | // row = 4 |
3719 | 4 | src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 4 * src_strd)); |
3720 | | |
3721 | 4 | edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty |
3722 | | |
3723 | | //separating +ve and -ve values. (2,1)
3724 | 4 | cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b); |
3725 | 4 | cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b); |
3726 | | //manipulation for row 3 -row 2 |
3727 | 4 | signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 10); |
3728 | | //creating mask 00 for +ve and -ve values and FF for zero. |
3729 | 4 | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
3730 | 4 | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
3731 | | //row 2 left |
3732 | 4 | signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 14); |
3733 | | //combining the appropriate sign change |
3734 | 4 | signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1) |
3735 | | |
3736 | | //separating +ve and -ve values. (3,2)
3737 | 4 | cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b); |
3738 | 4 | cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b); |
3739 | 4 | signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8); //aligned left (2-1) |
3740 | | //creating mask 00 for +ve and -ve values and FF for zero. |
3741 | 4 | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
3742 | 4 | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
3743 | | //right row3 |
3744 | 4 | signdwn1_16x8b = _mm_srli_si128(src_top_16x8b, 2); |
3745 | | //combining the appropriate sign change |
3746 | 4 | edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-2) |
3747 | | |
3748 | 4 | signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(3-2) ,(2-1) |
3749 | | |
3750 | | //separating +ve and -ve values. (2,3)
3751 | 4 | cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b); |
3752 | 4 | cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b); |
3753 | | //right row 4 |
3754 | 4 | signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2); |
3755 | | //creating mask 00 for +ve and -ve values and FF for zero. |
3756 | 4 | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
3757 | 4 | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
3758 | | //combining the appropriate sign change |
3759 | 4 | edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-3) |
3760 | | |
3761 | | //separating +ve and -ve values. (3,bottom)
3762 | 4 | cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b); |
3763 | 4 | cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b); |
3764 | | |
3765 | | //creating mask 00 for +ve and -ve values and FF for zero. |
3766 | 4 | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
3767 | 4 | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
3768 | 4 | edge1_16x8b = _mm_slli_si128(edge1_16x8b, 8); //aligned left (2-3) |
3769 | | //combining the appropriate sign change |
3770 | 4 | signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-bottom) |
3771 | 4 | edge1_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge1_16x8b, 8); //(3-bottom),(2-3) |
3772 | | |
3773 | | //manipulation for bottom -row 3 |
3774 | 4 | signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 8); |
3775 | | //eliminating old left for row 0,1,2,3 |
3776 | 4 | left_store_16x8b = _mm_srli_si128(left_store_16x8b, 8); |
3777 | | //packing row 2 n row 3 |
3778 | 4 | src_bottom_16x8b = _mm_unpacklo_epi64(src_bottom_16x8b, src_top_16x8b); |
3779 | | //row 3 left |
3780 | 4 | signdwn1_16x8b = _mm_alignr_epi8(src_top_16x8b, signdwn1_16x8b, 14); |
3781 | | |
3782 | | //adding bottom and top values of row 2 and row 3 |
3783 | 4 | edge1_16x8b = _mm_add_epi8(edge1_16x8b, signup0_16x8b); //(3,2) |
3784 | | //separating +ve and -ve values. (bottom,3)
3785 | 4 | cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b); |
3786 | 4 | cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b); |
3787 | | |
3788 | | //creating mask 00 for +ve and -ve values and FF for zero. |
3789 | 4 | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
3790 | 4 | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
3791 | 4 | signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(bottom -3) for next iteration |
3792 | | |
3793 | | //to store right of row 2 |
3794 | 4 | signdwn1_16x8b = _mm_slli_si128(src_bottom_16x8b, 8); |
3795 | | //loading row 3 right into left |
3796 | 4 | left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_bottom_16x8b, 14); |
3797 | | //storing right of row 2 into left
3798 | 4 | left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14); |
3799 | | //to store right of row 0 |
3800 | 4 | signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8); |
3801 | | //storing right of row 1 into left |
3802 | 4 | left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14); |
3803 | | //storing right of row 0 into left |
3804 | 4 | left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14); |
3805 | | |
3806 | | //adding constant 2 |
3807 | 4 | edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b); |
3808 | 4 | edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b); |
3809 | | //shuffle to get sao index |
3810 | 4 | edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b); |
3811 | 4 | edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b); |
3812 | | //using availability mask |
3813 | 4 | edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b); |
3814 | 4 | edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b); |
3815 | | |
3816 | | //adding chroma offset to access U and V |
3817 | 4 | edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b); |
3818 | 4 | edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b); |
3819 | | |
3820 | | //shuffle to get sao offset |
3821 | 4 | edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b); |
3822 | 4 | edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b); |
3823 | | //convert to 16 bit, then add, then saturating pack
3824 | 4 | signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b); |
3825 | 4 | cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b); |
3826 | 4 | cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b); |
3827 | 4 | src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b); |
3828 | 4 | edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b); |
3829 | 4 | cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b); |
3830 | 4 | src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b); |
3831 | 4 | src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b); |
3832 | | |
3833 | 4 | signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b); |
3834 | 4 | cmp_gt0_16x8b = _mm_unpacklo_epi8(src_bottom_16x8b, const0_16x8b); |
3835 | 4 | cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b); |
3836 | 4 | src_bottom_16x8b = _mm_unpackhi_epi8(src_bottom_16x8b, const0_16x8b); |
3837 | 4 | edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b); |
3838 | 4 | cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b); |
3839 | 4 | src_bottom_16x8b = _mm_add_epi16(src_bottom_16x8b, edge1_16x8b); |
3840 | 4 | src_bottom_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_bottom_16x8b); |
3841 | | |
3842 | 4 | cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8); |
3843 | 4 | cmp_lt0_16x8b = _mm_srli_si128(src_bottom_16x8b, 8); |
3844 | | |
3845 | | |
3846 | 4 | _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b); |
3847 | | //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos. |
3848 | 4 | _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b); |
3849 | | // row = 1 |
3850 | 4 | _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b); |
3851 | | //row = 2 |
3852 | 4 | _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_bottom_16x8b); |
3853 | | // row = 3 |
3854 | 4 | _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b); |
3855 | | |
3856 | 4 | src_temp0_16x8b = src_temp1_16x8b; |
3857 | 4 | signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8); |
3858 | 4 | pu1_src_cpy += (src_strd << 2); |
3859 | 4 | pu1_src_left_cpy += 8; |
3860 | 4 | pu1_src_left_str += 8; |
3861 | 4 | } |
3862 | 6 | ht_rem = ht & 0x2; |
3863 | 6 | if(ht_rem) |
3864 | 4 | { |
3865 | 4 | left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy); |
3866 | | //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos. |
3867 | 4 | src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd)); |
3868 | | // row = 2 |
3869 | 4 | src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd)); |
3870 | | |
3871 | | //row 0 - row 1
3872 | 4 | signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);
3873 | | //separating +ve and -ve values.
3874 | 4 | cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b); |
3875 | 4 | cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b); |
3876 | | //manipulation for row 1 -row 0 |
3877 | 4 | signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 14); |
3878 | | //creating mask 00 for +ve and -ve values and FF for zero. |
3879 | 4 | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
3880 | 4 | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
3881 | | //manipulation for row 1 - row 0 |
3882 | 4 | signdwn1_16x8b = _mm_alignr_epi8(src_temp0_16x8b, signdwn1_16x8b, 14); |
3883 | | //combining the appropriate sign change |
3884 | 4 | edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); |
3885 | | |
3886 | | //row 1 - row 0
3887 | | //separating +ve and -ve values.
3888 | 4 | cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b); |
3889 | 4 | cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b); |
3890 | | |
3891 | | //creating mask 00 for +ve and -ve values and FF for zero. |
3892 | 4 | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
3893 | 4 | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
3894 | | //combining the appropriate sign change
3895 | 4 | edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); |
3896 | | //row 1 -bottom |
3897 | 4 | signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 2); |
3898 | | |
3899 | 4 | edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1) |
3900 | 4 | signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top) |
3901 | | //row1 -bottom |
3902 | 4 | cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b); |
3903 | 4 | cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b); |
3904 | | |
3905 | | //creating mask 00 for +ve and -ve values and FF for zero. |
3906 | 4 | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
3907 | 4 | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
3908 | | //combining the appropriate sign change |
3909 | 4 | signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2) |
3910 | 4 | edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1) |
3911 | | //manipulation for bottom -row1 |
3912 | 4 | signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 12); |
3913 | | //eliminating old left for row 0,1 |
3914 | 4 | left_store_16x8b = _mm_srli_si128(left_store_16x8b, 4); |
3915 | | //manipulation for bottom- row 1 |
3916 | 4 | signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 14); |
3917 | | //adding top and down subtraction
3918 | 4 | edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty |
3919 | | //bottom - row 1 |
3920 | 4 | cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b); |
3921 | 4 | cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b); |
3922 | | |
3923 | | //shifting row 1 |
3924 | 4 | signdwn1_16x8b = _mm_slli_si128(src_temp1_16x8b, 8); |
3925 | | //creating mask 00 for +ve and -ve values and FF for zero. |
3926 | 4 | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
3927 | 4 | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
3928 | | //for the next iteration signup0_16x8b |
3929 | 4 | signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1) for next |
3930 | | //storing right of row 1 into left |
3931 | 4 | left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
3932 | 4 | signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8); //for storing right of row 0
3933 | | //the next top in src_top_16x8b |
3934 | 4 | src_top_16x8b = src_temp1_16x8b; |
3935 | | //storing right of row 0 into left |
3936 | 4 | left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14); |
3937 | | |
3938 | | |
3939 | | //adding constant 2 |
3940 | 4 | edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b); |
3941 | | |
3942 | | //shuffle to get sao index |
3943 | 4 | edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b); |
3944 | | //using availability mask |
3945 | 4 | edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b); |
3946 | | |
3947 | | //adding chroma offset to access U and V |
3948 | 4 | edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b); |
3949 | | |
3950 | | //shuffle to get sao offset |
3951 | 4 | edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b); |
3952 | | //the next top already in src_top_16x8b |
3953 | | //convert to 16 bit, then add, then saturating pack
3954 | 4 | signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b); |
3955 | 4 | src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b); |
3956 | 4 | cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b); |
3957 | 4 | src_temp1_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b); |
3958 | 4 | edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b); |
3959 | 4 | src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b); |
3960 | 4 | src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge0_16x8b); |
3961 | 4 | src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, src_temp1_16x8b); |
3962 | | |
3963 | 4 | cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8); |
3964 | | |
3965 | 4 | _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b); |
3966 | | //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos. |
3967 | 4 | _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b); |
3968 | | // row = 1 |
3969 | 4 | _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b); |
3970 | 4 | src_temp0_16x8b = src_bottom_16x8b; |
3971 | 4 | pu1_src_cpy += (src_strd << 1); |
3972 | 4 | pu1_src_left_cpy += 4; |
3973 | 4 | pu1_src_left_str += 4; |
3974 | 4 | } |
3975 | 6 | ht_rem = ht & 0x1; |
3976 | 6 | if(ht_rem) |
3977 | 0 | { |
3978 | 0 | left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy); |
3979 | | //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos. |
3980 | 0 | src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd)); |
3981 | | |
3982 | | //row 0 - row 1
3983 | 0 | signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 2);
3984 | | //separating +ve and -ve values.
3985 | 0 | cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b); |
3986 | 0 | cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b); |
3987 | | //creating mask 00 for +ve and -ve values and FF for zero. |
3988 | 0 | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
3989 | 0 | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
3990 | | //combining the appropriate sign change |
3991 | 0 | edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); |
3992 | | //adding top and down subtraction
3993 | 0 | edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); |
3994 | | |
3995 | | //for row 0 right to put into left store |
3996 | 0 | signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8); |
3997 | | //left store manipulation 1 |
3998 | 0 | left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2); |
3999 | 0 | src_top_16x8b = src_temp0_16x8b; |
4000 | | //filling the left boundary value |
4001 | 0 | left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14); |
4002 | | |
4003 | | //adding constant 2 |
4004 | 0 | edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b); |
4005 | 0 | edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); |
4006 | 0 | edge0_16x8b = _mm_srli_si128(edge0_16x8b, 8); |
4007 | | |
4008 | | |
4009 | | //shuffle to get sao index |
4010 | 0 | edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b); |
4011 | | //using availability mask |
4012 | 0 | edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b); |
4013 | | //adding chroma offset to access U and V |
4014 | 0 | edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b); |
4015 | | |
4016 | | //shuffle to get sao offset |
4017 | 0 | edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b); |
4018 | | |
4019 | | //convert to 16 bit, then add, then saturating pack
4020 | 0 | signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b); |
4021 | 0 | src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b); |
4022 | 0 | cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b); |
4023 | 0 | src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b); |
4024 | 0 | src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, const0_16x8b); |
4025 | |
4026 | 0 | _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b); |
4027 | | //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos. |
4028 | 0 | _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b); |
4029 | 0 | pu1_src_cpy += (src_strd); |
4030 | 0 | pu1_src_left_cpy += 2; |
4031 | 0 | pu1_src_left_str += 2; |
4032 | 0 | } |
4033 | 6 | if(0 == pu1_avail[3]) |
4034 | 4 | { |
4035 | 4 | src_top_16x8b = src_bottom_16x8b; |
4036 | 4 | pu1_src_left_str[1] = pu1_src_cpy[7]; |
4037 | 4 | pu1_src_left_str[0] = pu1_src_cpy[6]; |
4038 | 4 | } |
4039 | | |
4040 | 6 | if(0 == pu1_avail[2]) |
4041 | 4 | { |
4042 | 4 | pu1_src_left_str[-2 * ht_0] = pu1_src[6 - src_strd]; |
4043 | 4 | pu1_src_left_str[-2 * ht_0 + 1] = pu1_src[7 - src_strd]; |
4044 | 4 | } |
4045 | | |
4046 | 6 | _mm_storel_epi64((__m128i *)(pu1_src_top + wd - col), src_top_16x8b); |
4047 | 6 | pu1_src += 8; |
4048 | | |
4049 | 6 | pu1_left_tmp = pu1_src_left_cpy2; |
4050 | 6 | pu1_src_left_cpy2 = pu1_src_left_str2; |
4051 | 6 | pu1_src_left_str2 = pu1_left_tmp; |
4052 | | |
4053 | 6 | pu1_src_left_cpy = pu1_src_left_cpy2; |
4054 | 6 | pu1_src_left_str = pu1_src_left_str2; |
4055 | 6 | } |
4056 | 63.1k | pu1_src_org[0] = u1_pos_0_0_tmp_u; |
4057 | 63.1k | pu1_src_org[1] = u1_pos_0_0_tmp_v; |
4058 | 63.1k | pu1_src_org[wd - 2 + (ht_tmp - 1) * src_strd] = u1_pos_wd_ht_tmp_u; |
4059 | 63.1k | pu1_src_org[wd - 1 + (ht_tmp - 1) * src_strd] = u1_pos_wd_ht_tmp_v; |
4060 | 63.1k | pu1_src_left_cpy = (0 == pu1_avail[2]) ? (pu1_src_left_cpy - 2) : pu1_src_left_cpy; |
4061 | 2.06M | for(row = 0; row < 2 * ht_tmp; row++) |
4062 | 2.00M | { |
4063 | 2.00M | pu1_src_left[row] = pu1_src_left_cpy[row]; |
4064 | 2.00M | } |
4065 | 63.1k | } |
4066 | | |
4067 | 63.1k | } |
4068 | | |
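The class-3 functions below are the 45-degree counterpart of the 135-degree class-2 kernels above: only the diagonal flips, from (row-1, col-1)/(row+1, col+1) to (row-1, col+1)/(row+1, col-1). A scalar reference for the class-3 neighbourhood, illustrative only:

static int sign3(int x) { return (x > 0) - (x < 0); }

/* Edge index for one interior pixel under the 45-degree (class 3) pattern. */
static int sao_class3_edge_idx(const unsigned char *src, int stride,
                               int row, int col)
{
    int cur       = src[row * stride + col];
    int up_right  = src[(row - 1) * stride + (col + 1)];
    int down_left = src[(row + 1) * stride + (col - 1)];
    return 2 + sign3(cur - up_right) + sign3(cur - down_left);
}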
4069 | | void ihevc_sao_edge_offset_class3_ssse3(UWORD8 *pu1_src, |
4070 | | WORD32 src_strd, |
4071 | | UWORD8 *pu1_src_left, |
4072 | | UWORD8 *pu1_src_top, |
4073 | | UWORD8 *pu1_src_top_left, |
4074 | | UWORD8 *pu1_src_top_right, |
4075 | | UWORD8 *pu1_src_bot_left, |
4076 | | UWORD8 *pu1_avail, |
4077 | | WORD8 *pi1_sao_offset, |
4078 | | WORD32 wd, |
4079 | | WORD32 ht) |
4080 | 56.9k | { |
4081 | 56.9k | WORD32 row, col; |
4082 | 56.9k | UWORD8 *pu1_src_top_cpy, *pu1_src_left_cpy, *pu1_src_left_cpy2; |
4083 | 56.9k | UWORD8 *pu1_left_tmp, *pu1_src_left_str, *pu1_src_left_str2; |
4084 | 56.9k | UWORD8 *pu1_src_cpy, *pu1_src_org; |
4085 | 56.9k | UWORD8 au1_src_left_tmp[MAX_CTB_SIZE + 8]; |
4086 | 56.9k | UWORD8 au1_src_left_tmp1[MAX_CTB_SIZE + 8]; |
4087 | 56.9k | UWORD8 au1_mask[MAX_CTB_SIZE], *au1_mask_cpy; |
4088 | 56.9k | WORD32 wd_rem; |
4089 | 56.9k | UWORD8 u1_pos_wd_0_tmp, u1_pos_0_ht_tmp; |
4090 | 56.9k | WORD32 ht_tmp; |
4091 | 56.9k | WORD32 bit_depth; |
4092 | 56.9k | UWORD8 u1_avail0, u1_avail1; |
4093 | | |
4094 | 56.9k | __m128i src_top_16x8b, src_bottom_16x8b; |
4095 | 56.9k | __m128i src_temp0_16x8b, src_temp1_16x8b; |
4096 | 56.9k | __m128i signup0_16x8b, signdwn1_16x8b; |
4097 | 56.9k | __m128i cmp_gt0_16x8b, cmp_lt0_16x8b; |
4098 | 56.9k | __m128i edge0_16x8b, edge1_16x8b; |
4099 | 56.9k | __m128i au1_mask8x16b; |
4100 | 56.9k | __m128i edge_idx_8x16b, sao_offset_8x16b; |
4101 | 56.9k | __m128i const2_16x8b, const0_16x8b; |
4102 | 56.9k | __m128i left_store_16x8b; |
4103 | | |
4104 | 56.9k | ht_tmp = ht; |
4105 | 56.9k | au1_mask8x16b = _mm_set1_epi8(0xff); |
4106 | | |
4107 | 56.9k | au1_src_left_tmp[0] = pu1_src[(wd - 1)]; |
4108 | | //manipulation for bottom left |
4109 | 1.79M | for(row = 1; row < ht; row++) |
4110 | 1.74M | { |
4111 | 1.74M | au1_src_left_tmp[row] = pu1_src_left[row]; |
4112 | 1.74M | } |
4113 | 56.9k | au1_src_left_tmp[ht] = pu1_src_bot_left[0]; |
4114 | | |
4115 | 56.9k | *pu1_src_top_left = pu1_src_top[wd - 1]; |
4116 | | //setting the availability mask to 0xFF for all MAX_CTB_SIZE bytes
4117 | 284k | for(col = 0; col < MAX_CTB_SIZE; col += 16) |
4118 | 227k | _mm_storeu_si128((__m128i *)(au1_mask + col), au1_mask8x16b); |
4119 | 56.9k | bit_depth = BIT_DEPTH_LUMA; |
4120 | 56.9k | pu1_src_org = pu1_src; |
4121 | 56.9k | pu1_src_top_cpy = pu1_src_top; |
4122 | 56.9k | pu1_src_left_cpy2 = au1_src_left_tmp; |
4123 | 56.9k | pu1_src_left_cpy = au1_src_left_tmp; |
4124 | 56.9k | pu1_src_left_str2 = au1_src_left_tmp1; |
4125 | 56.9k | pu1_src_left_str = au1_src_left_tmp1; |
4126 | 56.9k | edge_idx_8x16b = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx); |
4127 | 56.9k | sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset); |
4128 | | |
4129 | | /* If top-right is available, process separately */ |
4130 | 56.9k | if(0 != pu1_avail[5]) |
4131 | 54.5k | { |
4132 | 54.5k | WORD32 edge_idx; |
4133 | | |
4134 | 54.5k | edge_idx = 2 + SIGN(pu1_src[wd - 1] - pu1_src_top_right[0]) + |
4135 | 54.5k | SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 1 + src_strd]); |
4136 | | |
4137 | 54.5k | edge_idx = gi1_table_edge_idx[edge_idx]; |
4138 | | |
4139 | 54.5k | if(0 != edge_idx) |
4140 | 15.9k | { |
4141 | 15.9k | u1_pos_wd_0_tmp = CLIP3(pu1_src[wd - 1] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1); |
4142 | 15.9k | } |
4143 | 38.6k | else |
4144 | 38.6k | { |
4145 | 38.6k | u1_pos_wd_0_tmp = pu1_src[wd - 1]; |
4146 | 38.6k | } |
4147 | 54.5k | } |
4148 | 2.40k | else |
4149 | 2.40k | { |
4150 | 2.40k | u1_pos_wd_0_tmp = pu1_src[wd - 1]; |
4151 | 2.40k | } |
4152 | | |
4153 | | /* If bottom-left is available, process separately */ |
4154 | 56.9k | if(0 != pu1_avail[6]) |
4155 | 52.7k | { |
4156 | 52.7k | WORD32 edge_idx; |
4157 | | |
4158 | 52.7k | edge_idx = 2 + SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd + 1 - src_strd]) + |
4159 | 52.7k | SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0]); |
4160 | | |
4161 | 52.7k | edge_idx = gi1_table_edge_idx[edge_idx]; |
4162 | | |
4163 | 52.7k | if(0 != edge_idx) |
4164 | 18.7k | { |
4165 | 18.7k | u1_pos_0_ht_tmp = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1); |
4166 | 18.7k | } |
4167 | 33.9k | else |
4168 | 33.9k | { |
4169 | 33.9k | u1_pos_0_ht_tmp = pu1_src[(ht - 1) * src_strd]; |
4170 | 33.9k | } |
4171 | 52.7k | } |
4172 | 4.22k | else |
4173 | 4.22k | { |
4174 | 4.22k | u1_pos_0_ht_tmp = pu1_src[(ht - 1) * src_strd]; |
4175 | 4.22k | } |
4176 | | |
4177 | | |
4178 | | |
4179 | | /* Update height and source pointers based on the availability flags */ |
4180 | 56.9k | if(0 == pu1_avail[2]) |
4181 | 1.34k | { |
4182 | 1.34k | pu1_src_left_cpy2++; |
4183 | 1.34k | pu1_src_left_str2++; |
4184 | 1.34k | pu1_src_top_cpy = pu1_src; |
4185 | 1.34k | pu1_src += src_strd; |
4186 | 1.34k | ht--; |
4187 | 1.34k | } |
4188 | 56.9k | if(0 == pu1_avail[3]) |
4189 | 1.99k | { |
4190 | 1.99k | ht--; |
4191 | 1.99k | } |
4192 | | |
4193 | | |
4194 | 56.9k | const2_16x8b = _mm_set1_epi8(2); |
4195 | 56.9k | const0_16x8b = _mm_setzero_si128(); |
4196 | | |
4197 | | |
4198 | | //availability mask creation |
4199 | 56.9k | u1_avail0 = pu1_avail[0]; |
4200 | 56.9k | u1_avail1 = pu1_avail[1]; |
4201 | 56.9k | au1_mask[0] = u1_avail0; |
4202 | 56.9k | au1_mask[wd - 1] = u1_avail1; |
4203 | 56.9k | { |
4204 | 56.9k | WORD32 ht_rem; |
4205 | | |
4206 | 56.9k | pu1_src_left_cpy = pu1_src_left_cpy2; |
4207 | 56.9k | pu1_src_left_str = pu1_src_left_str2; |
4208 | 56.9k | au1_mask_cpy = au1_mask; |
4209 | 144k | for(col = wd; col >= 16; col -= 16) |
4210 | 87.8k | { |
4211 | 87.8k | pu1_src_cpy = pu1_src; |
4212 | 87.8k | src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col + 1)); |
4213 | | //row = 0 |
4214 | 87.8k | src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy)); |
4215 | | |
4216 | | //loading the mask |
4217 | 87.8k | au1_mask8x16b = _mm_loadu_si128((__m128i *)au1_mask_cpy); |
4218 | | //separating +ve and -ve values.
4219 | 87.8k | cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b); |
4220 | 87.8k | cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b); |
4221 | | //creating mask 00 for +ve and -ve values and FF for zero. |
4222 | 87.8k | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
4223 | 87.8k | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
4224 | | //combining the appropriate sign change |
4225 | 87.8k | signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); |
4226 | | |
4227 | 1.46M | for(row = ht; row >= 2; row -= 2) |
4228 | 1.37M | { |
4229 | 1.37M | left_store_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_left_cpy)); |
4230 | | //row = 1 |
4231 | 1.37M | src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd)); |
4232 | | //to insert left in row 1 |
4233 | 1.37M | signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 14); |
4234 | | // row = 0 right |
4235 | 1.37M | src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 1)); |
4236 | | |
4237 | | //manipulation for row 1 - row 0 |
4238 | 1.37M | signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 15); |
4239 | | //row 0 - row 1
4240 | | //separating +ve and -ve values.
4241 | 1.37M | cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b); |
4242 | 1.37M | cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b); |
4243 | | |
4244 | | //creating mask 00 for +ve and -ve values and FF for zero. |
4245 | 1.37M | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
4246 | 1.37M | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
4247 | | |
4248 | | //combining the appropriate sign change |
4249 | 1.37M | edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(0-1) |
4250 | | //combining sign-left and sign_right |
4251 | 1.37M | edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); |
4252 | | |
4253 | | //row 1 - row 0
4254 | | //separating +ve and -ve values.
4255 | 1.37M | cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b); |
4256 | 1.37M | cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b); |
4257 | | //creating mask 00 for +ve and -ve values and FF for zero. |
4258 | 1.37M | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
4259 | 1.37M | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
4260 | | |
4261 | | // row = 2 |
4262 | 1.37M | src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd)); |
4263 | | // row = 1 right |
4264 | 1.37M | signdwn1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 1)); |
4265 | 1.37M | edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-0) |
4266 | | |
4267 | | //bottom - row1 |
4268 | 1.37M | cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b); |
4269 | 1.37M | cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b); |
4270 | | //creating mask 00 for +ve and -ve values and FF for zero. |
4271 | 1.37M | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
4272 | 1.37M | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
4273 | | //for the next iteration bottom -row1 |
4274 | 1.37M | signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); |
4275 | | |
4276 |  | //to insert left in bottom row (row 2) |
4277 | 1.37M | signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 13); |
4278 | | //manipulation for row 1 - bottom |
4279 | 1.37M | signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 15); |
4280 | | |
4281 | | //row1 -bottom |
4282 | 1.37M | cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b); |
4283 | 1.37M | cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b); |
4284 | | //creating mask 00 for +ve and -ve values and FF for zero. |
4285 | 1.37M | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
4286 | 1.37M | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
4287 | | //combining the appropriate sign change |
4288 | 1.37M | signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); |
4289 | | |
4290 |  | //combining sign_up and sign_down |
4291 | 1.37M | edge1_16x8b = _mm_add_epi8(edge1_16x8b, signdwn1_16x8b); |
4292 | | |
4293 | | //eliminating old left for row 0 and row 1 |
4294 | 1.37M | left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2); |
4295 | | |
4296 | | //row1 getting it right for left of next block |
4297 | 1.37M | left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp1_16x8b, 15); |
4298 | | //adding constant 2 |
4299 | 1.37M | edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b); |
4300 | 1.37M | edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b); |
4301 | | //shuffle to get sao index |
4302 | 1.37M | edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b); |
4303 | 1.37M | edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b); |
4304 | | //using availability mask |
4305 | 1.37M | edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b); |
4306 | 1.37M | edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b); |
4307 | | //shuffle to get sao offset |
4308 | 1.37M | edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b); |
4309 | 1.37M | edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b); |
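 |  | /* The block above is a two-level pshufb table lookup: the diagonal sign |
 |  |  * sum lies in [-2, 2], so adding 2 biases it into [0, 4] to index |
 |  |  * gi1_table_edge_idx (the SAO edge category); AND-ing with the |
 |  |  * availability mask forces masked lanes to category 0, whose offset is |
 |  |  * zero, and the second shuffle fetches the per-category offset. Scalar |
 |  |  * sketch: offset = pi1_sao_offset[gi1_table_edge_idx[2 + sign_up + sign_dn]] */ |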
4310 | | //row0 getting it right for left of next block |
4311 | 1.37M | left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15); |
4312 | | //copying the next top |
4313 | 1.37M | src_top_16x8b = src_temp1_16x8b; |
4314 |  | //convert to 16 bit, then add, then saturating pack |
4315 | 1.37M | signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b); |
4316 | 1.37M | cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b); |
4317 | 1.37M | cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b); |
4318 | 1.37M | src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b); |
4319 | 1.37M | cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b); |
4320 | 1.37M | cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b); |
4321 | 1.37M | src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b); |
4322 | 1.37M | src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b); |
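 |  | /* The widen/add/pack idiom above implements CLIP3(pix + offset, 0, 255): |
 |  |  * cmpgt(0, edge) builds a sign mask so unpacking the signed 8-bit |
 |  |  * offsets against it sign-extends them to 16 bit, the pixels are |
 |  |  * zero-extended against const0, and packus_epi16 saturates the 16-bit |
 |  |  * sums back to the 8-bit range. */ |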
4323 | | |
4324 | 1.37M | signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b); |
4325 | 1.37M | cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b); |
4326 | 1.37M | cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b); |
4327 | 1.37M | src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b); |
4328 | 1.37M | cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b); |
4329 | 1.37M | cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b); |
4330 | 1.37M | src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b); |
4331 | 1.37M | src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b); |
4332 | | //store left boundary |
4333 | 1.37M | _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b); |
4334 |  | //row = 0 store 16 pixel values from 15:0 pos. relative to cur. pos. |
4335 | 1.37M | _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b); |
4336 | | // row = 1 |
4337 | 1.37M | _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b); |
4338 | | |
4339 | 1.37M | src_temp0_16x8b = src_bottom_16x8b; |
4340 | 1.37M | pu1_src_cpy += (src_strd << 1); |
4341 | 1.37M | pu1_src_left_cpy += 2; |
4342 | 1.37M | pu1_src_left_str += 2; |
4343 | 1.37M | } |
4344 | 87.8k | ht_rem = ht & 0x1; |
4345 | | |
4346 | 87.8k | if(ht_rem) |
4347 | 5.12k | { |
4348 | 5.12k | left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy); |
4349 | 5.12k | src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd)); |
4350 | | //to insert left in row 1 |
4351 | 5.12k | signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 14); |
4352 | | //manipulation for row 1 - row 0 |
4353 | 5.12k | signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 15); |
4354 | | |
4355 | | //current row -next row |
4356 |  | //separating +ve and -ve values. |
4357 | 5.12k | cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b); |
4358 | 5.12k | cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b); |
4359 | | //creating mask 00 for +ve and -ve values and FF for zero. |
4360 | 5.12k | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
4361 | 5.12k | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
4362 | | //combining the appropriate sign change |
4363 | 5.12k | edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); |
4364 | | //adding top and bottom and constant 2 |
4365 | 5.12k | edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); |
4366 | 5.12k | edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b); |
4367 | | //eliminating old left for row 0 and row 1 |
4368 | 5.12k | left_store_16x8b = _mm_srli_si128(left_store_16x8b, 1); |
4369 | | |
4370 | 5.12k | edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b); |
4371 | | //using availability mask |
4372 | 5.12k | edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b); |
4373 | | |
4374 | 5.12k | edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b); |
4375 | | |
4376 | | //row0 getting it right for left of next block |
4377 | 5.12k | left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15); |
4378 | | //copying the next top |
4379 | 5.12k | src_top_16x8b = src_temp0_16x8b; |
4380 |  | //convert to 16 bit, then add, then saturating pack |
4381 | 5.12k | signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b); |
4382 | 5.12k | cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b); |
4383 | 5.12k | cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b); |
4384 | 5.12k | src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b); |
4385 | 5.12k | cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b); |
4386 | 5.12k | cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b); |
4387 | 5.12k | src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b); |
4388 | 5.12k | src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b); |
4389 | | //store left boundary |
4390 | 5.12k | _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b); |
4391 | | |
4392 | 5.12k | _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b); |
4393 | 5.12k | pu1_src_cpy += (src_strd); |
4394 | 5.12k | src_temp0_16x8b = src_bottom_16x8b; |
4395 | 5.12k | pu1_src_left_cpy++; |
4396 | 5.12k | pu1_src_left_str++; |
4397 | 5.12k | } |
4398 | 87.8k | { //for bottom right |
4399 | 87.8k | left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy); |
4400 | 87.8k | left_store_16x8b = _mm_srli_si128(left_store_16x8b, 1); |
4401 | 87.8k | left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15); |
4402 | 87.8k | _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b); |
4403 | 87.8k | } |
4404 | 87.8k | if(0 == pu1_avail[3]) |
4405 | 3.03k | { |
4406 | 3.03k | src_top_16x8b = src_bottom_16x8b; |
4407 | 3.03k | } |
4408 | | //for the top left of next part of the block |
4409 | 87.8k | left_store_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col)); |
4410 | | //updating top flag |
4411 | 87.8k | _mm_storeu_si128((__m128i *)(pu1_src_top + wd - col), src_top_16x8b); |
4412 | 87.8k | pu1_src += 16; |
4413 | 87.8k | au1_mask_cpy += 16; |
4414 | | |
4415 | 87.8k | pu1_left_tmp = pu1_src_left_cpy2; |
4416 | 87.8k | pu1_src_left_cpy2 = pu1_src_left_str2; |
4417 | 87.8k | pu1_src_left_str2 = pu1_left_tmp; |
4418 | | |
4419 | 87.8k | pu1_src_left_cpy = pu1_src_left_cpy2; |
4420 | 87.8k | pu1_src_left_str = pu1_src_left_str2; |
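 |  | /* Left-column double buffering: filtering this strip overwrites pixels |
 |  |  * that the next 16-wide strip needs as its unfiltered left neighbours, |
 |  |  * so left values are read from pu1_src_left_cpy2 while the next |
 |  |  * strip's left column is written through pu1_src_left_str2, and the |
 |  |  * two buffer pointers are swapped after each strip. */ |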
4421 | 87.8k | } |
4422 | | |
4423 | 56.9k | wd_rem = wd & 0xF; |
4424 | 56.9k | if(wd_rem) |
4425 | 55.8k | { |
4426 | 55.8k | pu1_src_cpy = pu1_src; |
4427 | 55.8k | pu1_src_left_cpy = pu1_src_left_cpy2; |
4428 | 55.8k | pu1_src_left_str = pu1_src_left_str2; |
4429 | 55.8k | src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_top_cpy + wd - col + 1)); |
4430 | | //row = 0 |
4431 | 55.8k | src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy)); |
4432 | 55.8k | au1_mask8x16b = _mm_loadl_epi64((__m128i *)au1_mask_cpy); //load lower 8 bytes of the availability mask |
4433 |  | //separating +ve and -ve values. |
4434 | 55.8k | cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b); |
4435 | 55.8k | cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b); |
4436 | | //creating mask 00 for +ve and -ve values and FF for zero. |
4437 | 55.8k | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
4438 | 55.8k | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
4439 | | //preparing au1_mask |
4440 | 55.8k | au1_mask8x16b = _mm_unpacklo_epi64(au1_mask8x16b, au1_mask8x16b); |
4441 | | //combining the appropriate sign change |
4442 | 55.8k | signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); |
4443 | 55.8k | signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8); |
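 |  | /* Residual column (wd % 16 != 0): two 8-pixel rows share one XMM |
 |  |  * register, row k in the low 8 bytes and row k+1 in the high 8 bytes, |
 |  |  * which is why the 8-byte mask is duplicated into both halves and the |
 |  |  * (row0 - top) sign is parked in the upper half for the alignr below. */ |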
4444 | | |
4445 | 492k | for(row = ht; row >= 4; row -= 4) |
4446 | 436k | { |
4447 | 436k | left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy); |
4448 | | //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos. |
4449 | 436k | src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd)); |
4450 | | // row = 2 |
4451 | 436k | src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd)); |
4452 | | //manipulation for row 0 -row 1 |
4453 | 436k | signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 14); |
4454 | | //row 1 left |
4455 | 436k | signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 15); |
4456 | | //row 0 -row1 |
4457 |  | //separating +ve and -ve values. |
4458 | 436k | cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b); |
4459 | 436k | cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b); |
4460 | | |
4461 | | //creating mask 00 for +ve and -ve values and FF for zero. |
4462 | 436k | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
4463 | 436k | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
4464 |  | //manipulation for row 1 - row 0 |
4465 | 436k | signdwn1_16x8b = _mm_srli_si128(src_temp0_16x8b, 1); |
4466 | | //combining the appropriate sign change |
4467 | 436k | edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); |
4468 | | //row 1 -row0 |
4469 |  | //separating +ve and -ve values. |
4470 | 436k | cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b); |
4471 | 436k | cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b); |
4472 | | |
4473 | | //creating mask 00 for +ve and -ve values and FF for zero. |
4474 | 436k | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
4475 | 436k | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
4476 | | //row1-row0 |
4477 | 436k | edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); |
4478 | | |
4479 | 436k | edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1) |
4480 | | |
4481 | 436k | signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top) |
4482 | | //manipulation for row 1 -row 2 |
4483 | 436k | signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 13); |
4484 | | //row 2 left |
4485 | 436k | signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 15); |
4486 |  | //packing row 0 and row 1 |
4487 | 436k | src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, src_temp1_16x8b); |
4488 | | //row1 -row2 |
4489 | 436k | cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b); |
4490 | 436k | cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b); |
4491 | | //creating mask 00 for +ve and -ve values and FF for zero. |
4492 | 436k | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
4493 | 436k | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
4494 | | //combining the appropriate sign change |
4495 | 436k | signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2) |
4496 | 436k | edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1) |
4497 | | |
4498 | | //row 1 right |
4499 | 436k | signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 1); |
4500 | | //row = 3 |
4501 | 436k | src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 3 * src_strd)); |
4502 | | |
4503 | | // row = 4 |
4504 | 436k | src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 4 * src_strd)); |
4505 | | |
4506 | 436k | edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty |
4507 | | |
4508 |  | //separating +ve and -ve values.(2,1) |
4509 | 436k | cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b); |
4510 | 436k | cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b); |
4511 | | |
4512 | | //creating mask 00 for +ve and -ve values and FF for zero. |
4513 | 436k | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
4514 | 436k | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
4515 | | //row 2 right |
4516 | 436k | signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 1); |
4517 | | //combining the appropriate sign change |
4518 | 436k | signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1) |
4519 | | |
4520 |  | //separating +ve and -ve values.(3,2) |
4521 | 436k | cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b); |
4522 | 436k | cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b); |
4523 | 436k | signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8); //aligned left (2-1) |
4524 | | //creating mask 00 for +ve and -ve values and FF for zero. |
4525 | 436k | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
4526 | 436k | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
4527 | | //manipulation for row 2 -row 3 |
4528 | 436k | signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 12); |
4529 | | //row 3 left |
4530 | 436k | signdwn1_16x8b = _mm_alignr_epi8(src_top_16x8b, signdwn1_16x8b, 15); |
4531 | | //combining the appropriate sign change |
4532 | 436k | edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-2) |
4533 | | |
4534 | 436k | signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(3-2) ,(2-1) |
4535 | | |
4536 |  | //separating +ve and -ve values.(2,3) |
4537 | 436k | cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b); |
4538 | 436k | cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b); |
4539 | | |
4540 | | //manipulation for row 3 -bottom |
4541 | 436k | signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 11); |
4542 | | //bottom left |
4543 | 436k | signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 15); |
4544 | | |
4545 | | //creating mask 00 for +ve and -ve values and FF for zero. |
4546 | 436k | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
4547 | 436k | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
4548 | | //combining the appropriate sign change |
4549 | 436k | edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-3) |
4550 | | |
4551 |  | //separating +ve and -ve values.(3,bottom) |
4552 | 436k | cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b); |
4553 | 436k | cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b); |
4554 | | |
4555 | | //creating mask 00 for +ve and -ve values and FF for zero. |
4556 | 436k | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
4557 | 436k | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
4558 | 436k | edge1_16x8b = _mm_slli_si128(edge1_16x8b, 8); //aligned left (2-3) |
4559 | | //combining the appropriate sign change |
4560 | 436k | signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-bottom) |
4561 | 436k | edge1_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge1_16x8b, 8); //(3-bottom),(2-3) |
4562 | | |
4563 | | |
4564 | | //eliminating old left for row 0,1,2,3 |
4565 | 436k | left_store_16x8b = _mm_srli_si128(left_store_16x8b, 4); |
4566 |  | //packing row 2 and row 3 |
4567 | 436k | src_bottom_16x8b = _mm_unpacklo_epi64(src_bottom_16x8b, src_top_16x8b); |
4568 | | //row 3 right |
4569 | 436k | signdwn1_16x8b = _mm_srli_si128(src_top_16x8b, 1); |
4570 | | //loading row 3 right into left |
4571 | 436k | left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_bottom_16x8b, 15); |
4572 | | //adding bottom and top values of row 2 and row 3 |
4573 | 436k | edge1_16x8b = _mm_add_epi8(edge1_16x8b, signup0_16x8b); //(3,2) |
4574 |  | //separating +ve and -ve values.(bottom,3) |
4575 | 436k | cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b); |
4576 | 436k | cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b); |
4577 | | //to store right of row 2 |
4578 | 436k | signdwn1_16x8b = _mm_slli_si128(src_bottom_16x8b, 8); |
4579 | | //creating mask 00 for +ve and -ve values and FF for zero. |
4580 | 436k | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
4581 | 436k | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
4582 | 436k | signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(bottom -3) for next iteration |
4583 | | |
4584 |  | //storing right of row 2 into left |
4585 | 436k | left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15); |
4586 | | //to store right of row 0 |
4587 | 436k | signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8); |
4588 | | //storing right of row 1 into left |
4589 | 436k | left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15); |
4590 | | |
4591 | | //adding constant 2 |
4592 | 436k | edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b); |
4593 | 436k | edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b); |
4594 | | //shuffle to get sao index |
4595 | 436k | edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b); |
4596 | 436k | edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b); |
4597 | | //using availability mask |
4598 | 436k | edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b); |
4599 | 436k | edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b); |
4600 | | //shuffle to get sao offset |
4601 | 436k | edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b); |
4602 | 436k | edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b); |
4603 | | |
4604 | | //storing right of row 0 into left |
4605 | 436k | left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15); |
4606 |  | //convert to 16 bit, then add, then saturating pack |
4607 | 436k | signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b); |
4608 | 436k | cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b); |
4609 | 436k | cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b); |
4610 | 436k | src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b); |
4611 | 436k | cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b); |
4612 | 436k | cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b); |
4613 | 436k | src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b); |
4614 | 436k | src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b); |
4615 | | |
4616 | 436k | signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b); |
4617 | 436k | cmp_gt0_16x8b = _mm_unpacklo_epi8(src_bottom_16x8b, const0_16x8b); |
4618 | 436k | cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b); |
4619 | 436k | src_bottom_16x8b = _mm_unpackhi_epi8(src_bottom_16x8b, const0_16x8b); |
4620 | 436k | cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b); |
4621 | 436k | cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b); |
4622 | 436k | src_bottom_16x8b = _mm_add_epi16(src_bottom_16x8b, cmp_lt0_16x8b); |
4623 | 436k | src_bottom_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_bottom_16x8b); |
4624 | | |
4625 | 436k | cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8); |
4626 | 436k | cmp_lt0_16x8b = _mm_srli_si128(src_bottom_16x8b, 8); |
4627 | | |
4628 | 436k | _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b); |
4629 | | //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos. |
4630 | 436k | _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b); |
4631 | | // row = 1 |
4632 | 436k | _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b); |
4633 | | //row = 2 |
4634 | 436k | _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_bottom_16x8b); |
4635 | | // row = 3 |
4636 | 436k | _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b); |
4637 | | |
4638 | 436k | src_temp0_16x8b = src_temp1_16x8b; |
4639 | 436k | signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8); |
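 |  | /* Carry the (bottom - row3) sign into the high half so the next |
 |  |  * iteration's alignr finds it in the same slot the preamble used for |
 |  |  * the (row0 - top) sign. */ |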
4640 | 436k | pu1_src_cpy += (src_strd << 2); |
4641 | 436k | pu1_src_left_cpy += 4; |
4642 | 436k | pu1_src_left_str += 4; |
4643 | 436k | } |
4644 | 55.8k | ht_rem = ht & 0x2; |
4645 | 55.8k | if(ht_rem) |
4646 | 3.24k | { |
4647 | 3.24k | left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy); |
4648 | | //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos. |
4649 | 3.24k | src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd)); |
4650 | | // row = 2 |
4651 | 3.24k | src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd)); |
4652 | | |
4653 | | //manipulation for row 0 -row 1 |
4654 | 3.24k | signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 14); |
4655 | | //bottom left |
4656 | 3.24k | signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 15); |
4657 |  | //separating +ve and -ve values. |
4658 | 3.24k | cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b); |
4659 | 3.24k | cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b); |
4660 | | |
4661 | | //creating mask 00 for +ve and -ve values and FF for zero. |
4662 | 3.24k | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
4663 | 3.24k | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
4664 | | //manipulation for row 1 - row 0 |
4665 | 3.24k | signdwn1_16x8b = _mm_srli_si128(src_temp0_16x8b, 1); |
4666 | | //combining the appropriate sign change |
4667 | 3.24k | edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); |
4668 | | |
4669 | | //row1-row0 |
4670 |  | //separating +ve and -ve values. |
4671 | 3.24k | cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b); |
4672 | 3.24k | cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b); |
4673 | | |
4674 | | //creating mask 00 for +ve and -ve values and FF for zero. |
4675 | 3.24k | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
4676 | 3.24k | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
4677 |  | //combining the appropriate sign change |
4678 | 3.24k | edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); |
4679 | | |
4680 | | //manipulation for row 1 -bottom |
4681 | 3.24k | signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 13); |
4682 | | //bottom left |
4683 | 3.24k | signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 15); |
4684 | | |
4685 | 3.24k | edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1) |
4686 | 3.24k | signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top) |
4687 | | //row1 -bottom |
4688 | 3.24k | cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b); |
4689 | 3.24k | cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b); |
4690 | | |
4691 | | //creating mask 00 for +ve and -ve values and FF for zero. |
4692 | 3.24k | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
4693 | 3.24k | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
4694 | | //combining the appropriate sign change |
4695 | 3.24k | signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2) |
4696 | 3.24k | edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1) |
4697 |  | //manipulation for bottom - row 1 (row 1 right) |
4698 | 3.24k | signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 1); |
4699 |  | //adding top and bottom sign differences |
4700 | 3.24k | edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty |
4701 | | //bottom - row 1 |
4702 | 3.24k | cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b); |
4703 | 3.24k | cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b); |
4704 | | |
4705 | | //eliminating old left for row 0,1 |
4706 | 3.24k | left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2); |
4707 | 3.24k | signdwn1_16x8b = _mm_slli_si128(src_temp1_16x8b, 8); |
4708 | | //creating mask 00 for +ve and -ve values and FF for zero. |
4709 | 3.24k | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
4710 | 3.24k | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
4711 | | //for the next iteration signup0_16x8b |
4712 | 3.24k | signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1) for next |
4713 | | |
4714 | | //storing right of row 1 into left |
4715 | 3.24k | left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15); |
4716 |  | //for storing right of row 0 |
4717 | 3.24k | signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8); |
4718 | | |
4719 | 3.24k | src_top_16x8b = src_temp1_16x8b; |
4720 | | //storing right of row 0 into left |
4721 | 3.24k | left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15); |
4722 | | |
4723 | | //adding constant 2 |
4724 | 3.24k | edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b); |
4725 | | |
4726 | | //shuffle to get sao index |
4727 | 3.24k | edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b); |
4728 | | //using availability mask |
4729 | 3.24k | edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b); |
4730 | | //shuffle to get sao offset |
4731 | 3.24k | edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b); |
4732 | | |
4733 | | //the next top already in src_top_16x8b |
4734 |  | //convert to 16 bit, then add, then saturating pack |
4735 | 3.24k | signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b); |
4736 | 3.24k | src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b); |
4737 | 3.24k | cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b); |
4738 | 3.24k | src_temp1_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b); |
4739 | 3.24k | src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b); |
4740 | 3.24k | cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b); |
4741 | 3.24k | src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b); |
4742 | 3.24k | src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, src_temp1_16x8b); |
4743 | | |
4744 | 3.24k | cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8); |
4745 | | |
4746 | 3.24k | _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b); |
4747 | | //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos. |
4748 | 3.24k | _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b); |
4749 | | // row = 1 |
4750 | 3.24k | _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b); |
4751 | 3.24k | src_temp0_16x8b = src_bottom_16x8b; |
4752 | 3.24k | pu1_src_cpy += (src_strd << 1); |
4753 | 3.24k | pu1_src_left_cpy += 2; |
4754 | 3.24k | pu1_src_left_str += 2; |
4755 | 3.24k | } |
4756 | 55.8k | ht_rem = ht & 0x1; |
4757 | 55.8k | if(ht_rem) |
4758 | 3.24k | { |
4759 | 3.24k | left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy); |
4760 | | //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos. |
4761 | 3.24k | src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd)); |
4762 | | |
4763 | | |
4764 | | //manipulation for row 0 -bottom |
4765 | 3.24k | signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 14); |
4766 | | //bottom left |
4767 | 3.24k | signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 15); |
4768 |  | //separating +ve and -ve values. |
4769 | 3.24k | cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b); |
4770 | 3.24k | cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b); |
4771 | | //creating mask 00 for +ve and -ve values and FF for zero. |
4772 | 3.24k | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
4773 | 3.24k | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
4774 | | //combining the appropriate sign change |
4775 | 3.24k | edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); |
4776 |  | //adding top and bottom sign differences |
4777 | 3.24k | edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); |
4778 | | //for row 0 right to put into left store |
4779 | 3.24k | signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8); |
4780 | | //adding constant 2 |
4781 | 3.24k | edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b); |
4782 | 3.24k | edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); |
4783 | 3.24k | edge0_16x8b = _mm_srli_si128(edge0_16x8b, 8); |
4784 | | //left store manipulation 1 |
4785 | 3.24k | left_store_16x8b = _mm_srli_si128(left_store_16x8b, 1); |
4786 | | //filling the left boundary value |
4787 | 3.24k | left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15); |
4788 | | |
4789 | | //shuffle to get sao index |
4790 | 3.24k | edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b); |
4791 | | //using availability mask |
4792 | 3.24k | edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b); |
4793 | | //shuffle to get sao offset |
4794 | 3.24k | edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b); |
4795 | 3.24k | src_top_16x8b = src_temp0_16x8b; |
4796 |  | //convert to 16 bit, then add, then saturating pack |
4797 | 3.24k | signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b); |
4798 | 3.24k | src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b); |
4799 | 3.24k | cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b); |
4800 | 3.24k | src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b); |
4801 | 3.24k | src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, const0_16x8b); |
4802 | | |
4803 | 3.24k | _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b); |
4804 | | //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos. |
4805 | 3.24k | _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b); |
4806 | 3.24k | pu1_src_cpy += (src_strd); |
4807 | 3.24k | src_temp0_16x8b = src_bottom_16x8b; |
4808 | 3.24k | pu1_src_left_cpy++; |
4809 | 3.24k | pu1_src_left_str++; |
4810 | 3.24k | } |
4811 | 55.8k | { //for bottom right |
4812 | 55.8k | left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy); |
4813 | 55.8k | left_store_16x8b = _mm_srli_si128(left_store_16x8b, 1); |
4814 | 55.8k | src_temp0_16x8b = _mm_slli_si128(src_temp0_16x8b, 8); |
4815 | 55.8k | left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15); |
4816 | 55.8k | _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b); |
4817 | 55.8k | } |
4818 | 55.8k | if(0 == pu1_avail[3]) |
4819 | 1.96k | { |
4820 | 1.96k | src_top_16x8b = src_bottom_16x8b; |
4821 | 1.96k | } |
4822 | 55.8k | _mm_storel_epi64((__m128i *)(pu1_src_top + wd - col), src_top_16x8b); |
4823 | 55.8k | pu1_src += 8; |
4824 | | |
4825 | 55.8k | pu1_left_tmp = pu1_src_left_cpy2; |
4826 | 55.8k | pu1_src_left_cpy2 = pu1_src_left_str2; |
4827 | 55.8k | pu1_src_left_str2 = pu1_left_tmp; |
4828 | | |
4829 | 55.8k | pu1_src_left_cpy = pu1_src_left_cpy2; |
4830 | 55.8k | pu1_src_left_str = pu1_src_left_str2; |
4831 | | |
4832 | 55.8k | } |
4833 | 56.9k | pu1_src_org[wd - 1] = u1_pos_wd_0_tmp; |
4834 | 56.9k | pu1_src_org[(ht_tmp - 1) * src_strd] = u1_pos_0_ht_tmp; |
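 |  | /* The two writes above patch back the corner pixels computed scalar |
 |  |  * before the vector pass: in this 45-degree class, pixel (wd-1, 0) |
 |  |  * needs the top-right neighbour and pixel (0, ht-1) the bottom-left |
 |  |  * one, neither of which the vectorised loops touch. */ |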
4835 | 56.9k | pu1_src_left_cpy = (0 == pu1_avail[2]) ? (pu1_src_left_cpy - 1) : pu1_src_left_cpy; |
4836 | 56.9k | pu1_src_left[0] = au1_src_left_tmp[0]; |
4837 | 1.79M | for(row = 1; row < ht_tmp; row++) |
4838 | 1.74M | { |
4839 | 1.74M | pu1_src_left[row] = pu1_src_left_cpy[row]; |
4840 | 1.74M | } |
4841 | 56.9k | } |
4842 | | |
4843 | 56.9k | } |
4844 | | |
4845 | | void ihevc_sao_edge_offset_class3_chroma_ssse3(UWORD8 *pu1_src, |
4846 | | WORD32 src_strd, |
4847 | | UWORD8 *pu1_src_left, |
4848 | | UWORD8 *pu1_src_top, |
4849 | | UWORD8 *pu1_src_top_left, |
4850 | | UWORD8 *pu1_src_top_right, |
4851 | | UWORD8 *pu1_src_bot_left, |
4852 | | UWORD8 *pu1_avail, |
4853 | | WORD8 *pi1_sao_offset_u, |
4854 | | WORD8 *pi1_sao_offset_v, |
4855 | | WORD32 wd, |
4856 | | WORD32 ht) |
4857 | 62.9k | { |
4858 | 62.9k | WORD32 row, col; |
4859 | 62.9k | UWORD8 *pu1_src_top_cpy, *pu1_src_left_cpy, *pu1_src_left_cpy2; |
4860 | 62.9k | UWORD8 *pu1_src_cpy, *pu1_src_org; |
4861 | 62.9k | UWORD8 au1_src_left_tmp[2 * (MAX_CTB_SIZE + 8)]; |
4862 | 62.9k | UWORD8 au1_mask[MAX_CTB_SIZE], *au1_mask_cpy; |
4863 | 62.9k | WORD32 wd_rem; |
4864 | 62.9k | UWORD8 u1_pos_wd_0_tmp_u, u1_pos_wd_0_tmp_v, u1_pos_0_ht_tmp_u, u1_pos_0_ht_tmp_v; |
4865 | 62.9k | WORD32 ht_tmp; |
4866 | 62.9k | WORD32 bit_depth; |
4867 | 62.9k | UWORD8 u1_avail0, u1_avail1; |
4868 | | |
4869 | 62.9k | __m128i src_top_16x8b, src_bottom_16x8b; |
4870 | 62.9k | __m128i src_temp0_16x8b, src_temp1_16x8b; |
4871 | 62.9k | __m128i signup0_16x8b, signdwn1_16x8b; |
4872 | 62.9k | __m128i cmp_gt0_16x8b, cmp_lt0_16x8b; |
4873 | 62.9k | __m128i edge0_16x8b, edge1_16x8b; |
4874 | 62.9k | __m128i au1_mask8x16b; |
4875 | 62.9k | __m128i edge_idx_8x16b, sao_offset_8x16b; |
4876 | 62.9k | __m128i left_store_16x8b; |
4877 | 62.9k | __m128i const0_16x8b, const2_16x8b; |
4878 | 62.9k | __m128i chroma_offset_8x16b; |
4879 | | |
4880 | 62.9k | ht_tmp = ht; |
4881 | 62.9k | au1_mask8x16b = _mm_set1_epi8(0xff); |
4882 | | |
4883 | | |
4884 | 62.9k | au1_src_left_tmp[0] = pu1_src[(wd - 2)]; |
4885 | 62.9k | au1_src_left_tmp[1] = pu1_src[(wd - 1)]; |
4886 | | //manipulation for bottom left |
4887 | 1.92M | for(row = 2; row < 2 * ht; row++) |
4888 | 1.86M | { |
4889 | 1.86M | au1_src_left_tmp[row] = pu1_src_left[row]; |
4890 | 1.86M | } |
4891 | 62.9k | au1_src_left_tmp[2 * ht] = pu1_src_bot_left[0]; |
4892 | 62.9k | au1_src_left_tmp[2 * ht + 1] = pu1_src_bot_left[1]; |
4893 | | |
4894 | 62.9k | pu1_src_top_left[0] = pu1_src_top[wd - 2]; |
4895 | 62.9k | pu1_src_top_left[1] = pu1_src_top[wd - 1]; |
4896 |  | //setting availability mask to 0xFF for MAX_CTB_SIZE bytes |
4897 | 314k | for(col = 0; col < MAX_CTB_SIZE; col += 16) |
4898 | 251k | _mm_storeu_si128((__m128i *)(au1_mask + col), au1_mask8x16b); |
4899 | 62.9k | bit_depth = BIT_DEPTH_LUMA; |
4900 | 62.9k | pu1_src_org = pu1_src; |
4901 | 62.9k | pu1_src_top_cpy = pu1_src_top; |
4902 | 62.9k | pu1_src_left_cpy2 = au1_src_left_tmp; |
4903 | 62.9k | pu1_src_left_cpy = au1_src_left_tmp; |
4904 | 62.9k | edge_idx_8x16b = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx); |
4905 | 62.9k | sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_u); |
4906 | 62.9k | const0_16x8b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_v); |
4907 | 62.9k | chroma_offset_8x16b = _mm_set1_epi16(0x0800); |
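 |  | /* Chroma layout trick: the U offsets occupy the low 8 bytes of the |
 |  |  * shuffle table and the V offsets (parked in const0_16x8b for now) |
 |  |  * are merged into the high 8 bytes further down. set1_epi16(0x0800) |
 |  |  * adds 0 to even (U) index bytes and 8 to odd (V) index bytes, |
 |  |  * steering each interleaved sample's pshufb lookup into the correct |
 |  |  * half of the table. */ |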
4908 | | /* If top-right is available, process separately */ |
4909 | 62.9k | if(0 != pu1_avail[5]) |
4910 | 59.3k | { |
4911 | 59.3k | WORD32 edge_idx; |
4912 | | |
4913 | | /* U */ |
4914 | 59.3k | edge_idx = 2 + SIGN(pu1_src[wd - 2] - pu1_src_top_right[0]) + |
4915 | 59.3k | SIGN(pu1_src[wd - 2] - pu1_src[wd - 2 - 2 + src_strd]); |
4916 | | |
4917 | 59.3k | edge_idx = gi1_table_edge_idx[edge_idx]; |
4918 | | |
4919 | 59.3k | if(0 != edge_idx) |
4920 | 20.0k | { |
4921 | 20.0k | u1_pos_wd_0_tmp_u = CLIP3(pu1_src[wd - 2] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1); |
4922 | 20.0k | } |
4923 | 39.3k | else |
4924 | 39.3k | { |
4925 | 39.3k | u1_pos_wd_0_tmp_u = pu1_src[wd - 2]; |
4926 | 39.3k | } |
4927 | | |
4928 | | /* V */ |
4929 | 59.3k | edge_idx = 2 + SIGN(pu1_src[wd - 1] - pu1_src_top_right[1]) + |
4930 | 59.3k | SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 2 + src_strd]); |
4931 | | |
4932 | 59.3k | edge_idx = gi1_table_edge_idx[edge_idx]; |
4933 | | |
4934 | 59.3k | if(0 != edge_idx) |
4935 | 16.0k | { |
4936 | 16.0k | u1_pos_wd_0_tmp_v = CLIP3(pu1_src[wd - 1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1); |
4937 | 16.0k | } |
4938 | 43.3k | else |
4939 | 43.3k | { |
4940 | 43.3k | u1_pos_wd_0_tmp_v = pu1_src[wd - 1]; |
4941 | 43.3k | } |
4942 | 59.3k | } |
4943 | 3.57k | else |
4944 | 3.57k | { |
4945 | 3.57k | u1_pos_wd_0_tmp_u = pu1_src[wd - 2]; |
4946 | 3.57k | u1_pos_wd_0_tmp_v = pu1_src[wd - 1]; |
4947 | 3.57k | } |
4948 | | |
4949 | | /* If bottom-left is available, process separately */ |
4950 | 62.9k | if(0 != pu1_avail[6]) |
4951 | 60.4k | { |
4952 | 60.4k | WORD32 edge_idx; |
4953 | | |
4954 | | /* U */ |
4955 | 60.4k | edge_idx = 2 + SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd + 2 - src_strd]) + |
4956 | 60.4k | SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0]); |
4957 | | |
4958 | 60.4k | edge_idx = gi1_table_edge_idx[edge_idx]; |
4959 | | |
4960 | 60.4k | if(0 != edge_idx) |
4961 | 14.3k | { |
4962 | 14.3k | u1_pos_0_ht_tmp_u = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1); |
4963 | 14.3k | } |
4964 | 46.0k | else |
4965 | 46.0k | { |
4966 | 46.0k | u1_pos_0_ht_tmp_u = pu1_src[(ht - 1) * src_strd]; |
4967 | 46.0k | } |
4968 | | |
4969 | | /* V */ |
4970 | 60.4k | edge_idx = 2 + SIGN(pu1_src[(ht - 1) * src_strd + 1] - pu1_src[(ht - 1) * src_strd + 1 + 2 - src_strd]) + |
4971 | 60.4k | SIGN(pu1_src[(ht - 1) * src_strd + 1] - pu1_src_bot_left[1]); |
4972 | | |
4973 | 60.4k | edge_idx = gi1_table_edge_idx[edge_idx]; |
4974 | | |
4975 | 60.4k | if(0 != edge_idx) |
4976 | 16.1k | { |
4977 | 16.1k | u1_pos_0_ht_tmp_v = CLIP3(pu1_src[(ht - 1) * src_strd + 1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1); |
4978 | 16.1k | } |
4979 | 44.2k | else |
4980 | 44.2k | { |
4981 | 44.2k | u1_pos_0_ht_tmp_v = pu1_src[(ht - 1) * src_strd + 1]; |
4982 | 44.2k | } |
4983 | 60.4k | } |
4984 | 2.56k | else |
4985 | 2.56k | { |
4986 | 2.56k | u1_pos_0_ht_tmp_u = pu1_src[(ht - 1) * src_strd]; |
4987 | 2.56k | u1_pos_0_ht_tmp_v = pu1_src[(ht - 1) * src_strd + 1]; |
4988 | 2.56k | } |
4989 | | |
4990 | | |
4991 | | |
4992 | | /* Update height and source pointers based on the availability flags */ |
4993 | 62.9k | if(0 == pu1_avail[2]) |
4994 | 2.02k | { |
4995 | 2.02k | pu1_src_left_cpy2 += 2; |
4996 | 2.02k | pu1_src_top_cpy = pu1_src; |
4997 | 2.02k | pu1_src += src_strd; |
4998 | 2.02k | ht--; |
4999 | 2.02k | } |
5000 | 62.9k | if(0 == pu1_avail[3]) |
5001 | 1.74k | { |
5002 | 1.74k | ht--; |
5003 | 1.74k | } |
5004 | | |
5005 | 62.9k | sao_offset_8x16b = _mm_unpacklo_epi64(sao_offset_8x16b, const0_16x8b); |
5006 | 62.9k | const2_16x8b = _mm_set1_epi8(2); |
5007 | 62.9k | const0_16x8b = _mm_setzero_si128(); |
5008 | | |
5009 | | |
5010 | | //availability mask creation |
5011 | 62.9k | u1_avail0 = pu1_avail[0]; |
5012 | 62.9k | u1_avail1 = pu1_avail[1]; |
5013 | 62.9k | au1_mask[0] = u1_avail0; |
5014 | 62.9k | au1_mask[1] = u1_avail0; |
5015 | 62.9k | au1_mask[wd - 1] = u1_avail1; |
5016 | 62.9k | au1_mask[wd - 2] = u1_avail1; |
5017 | 62.9k | { |
5018 | 62.9k | WORD32 ht_rem; |
5019 | 62.9k | au1_mask_cpy = au1_mask; |
5020 | 191k | for(col = wd; col >= 16; col -= 16) |
5021 | 128k | { |
5022 | 128k | pu1_src_cpy = pu1_src; |
5023 | 128k | src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col + 2)); |
5024 | | //row = 0 |
5025 | 128k | src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy)); |
5026 | | |
5027 | | //loading the mask |
5028 | 128k | au1_mask8x16b = _mm_loadu_si128((__m128i *)au1_mask_cpy); |
5029 |  | //separating +ve and -ve values. |
5030 | 128k | cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b); |
5031 | 128k | cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b); |
5032 | | //creating mask 00 for +ve and -ve values and FF for zero. |
5033 | 128k | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
5034 | 128k | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
5035 | | //combining the appropriate sign change |
5036 | 128k | signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); |
5037 | 128k | pu1_src_left_cpy = pu1_src_left_cpy2; |
5038 | | |
5039 | 1.14M | for(row = ht; row >= 2; row -= 2) |
5040 | 1.01M | { |
5041 | 1.01M | left_store_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_left_cpy)); |
5042 | | //row = 1 |
5043 | 1.01M | src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd)); |
5044 | | //to insert left in row 1 |
5045 | 1.01M | signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 12); |
5046 | | // row = 0 right |
5047 | 1.01M | src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2)); |
5048 | | |
5049 | | //manipulation for row 1 - row 0 |
5050 | 1.01M | signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 14); |
5051 | | //row 0 -row1 |
5052 |  | //separating +ve and -ve values. |
5053 | 1.01M | cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b); |
5054 | 1.01M | cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b); |
5055 | | |
5056 | | //creating mask 00 for +ve and -ve values and FF for zero. |
5057 | 1.01M | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
5058 | 1.01M | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
5059 | | |
5060 | | //combining the appropriate sign change |
5061 | 1.01M | edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(0-1) |
5062 |  | //combining sign_up and sign_down |
5063 | 1.01M | edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); |
5064 | | |
5065 | | //row1-row0 |
5066 |  | //separating +ve and -ve values. |
5067 | 1.01M | cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b); |
5068 | 1.01M | cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b); |
5069 | | //creating mask 00 for +ve and -ve values and FF for zero. |
5070 | 1.01M | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
5071 | 1.01M | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
5072 | | |
5073 | | // row = 2 |
5074 | 1.01M | src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd)); |
5075 | | // row = 1 right |
5076 | 1.01M | signdwn1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 2)); |
5077 | 1.01M | edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-0) |
5078 | | |
5079 | | //bottom - row1 |
5080 | 1.01M | cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b); |
5081 | 1.01M | cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b); |
5082 | | //creating mask 00 for +ve and -ve values and FF for zero. |
5083 | 1.01M | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
5084 | 1.01M | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
5085 | | //for the next iteration bottom -row1 |
5086 | 1.01M | signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); |
5087 | | |
5088 |  | //to insert left in bottom row (row 2) |
5089 | 1.01M | signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 10); |
5090 | | //manipulation for row 1 - bottom |
5091 | 1.01M | signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 14); |
5092 | | |
5093 | | //row1 -bottom |
5094 | 1.01M | cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b); |
5095 | 1.01M | cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b); |
5096 | | //creating mask 00 for +ve and -ve values and FF for zero. |
5097 | 1.01M | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
5098 | 1.01M | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
5099 | | //combining the appropriate sign change |
5100 | 1.01M | signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); |
5101 | | |
5102 |  | //combining sign_up and sign_down |
5103 | 1.01M | edge1_16x8b = _mm_add_epi8(edge1_16x8b, signdwn1_16x8b); |
5104 | | |
5105 | | //eliminating old left for row 0 and row 1 |
5106 | 1.01M | left_store_16x8b = _mm_srli_si128(left_store_16x8b, 4); |
5107 | | //row1 getting it right for left of next block |
5108 | 1.01M | left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp1_16x8b, 14); |
5109 | | //row0 getting it right for left of next block |
5110 | 1.01M | left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14); |
5111 | | //copying the next top |
5112 | 1.01M | src_top_16x8b = src_temp1_16x8b; |
5113 | | |
5114 | | |
5115 | | //adding constant 2 |
5116 | 1.01M | edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b); |
5117 | 1.01M | edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b); |
5118 | | //shuffle to get sao index |
5119 | 1.01M | edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b); |
5120 | 1.01M | edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b); |
5121 | | //using availability mask |
5122 | 1.01M | edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b); |
5123 | 1.01M | edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b); |
5124 | | |
5125 | | //adding chroma offset to access U and V |
5126 | 1.01M | edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b); |
5127 | 1.01M | edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b); |
5128 | | |
5129 | | //shuffle to get sao offset |
5130 | 1.01M | edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b); |
5131 | 1.01M | edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b); |
5132 |  | //convert to 16 bit, then add, then saturating pack |
5133 | 1.01M | signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b); |
5134 | 1.01M | cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b); |
5135 | 1.01M | cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b); |
5136 | 1.01M | src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b); |
5137 | 1.01M | edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b); |
5138 | 1.01M | cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b); |
5139 | 1.01M | src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b); |
5140 | 1.01M | src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b); |
5141 | | |
5142 | 1.01M | signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b); |
5143 | 1.01M | cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b); |
5144 | 1.01M | cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b); |
5145 | 1.01M | src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b); |
5146 | 1.01M | edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b); |
5147 | 1.01M | cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b); |
5148 | 1.01M | src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge1_16x8b); |
5149 | 1.01M | src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b); |
5150 | | //store left boundary |
5151 | 1.01M | _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b); |
5152 |  | //row = 0 store 16 bytes (8 interleaved U/V pairs) from 15:0 pos. relative to cur. pos. |
5153 | 1.01M | _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b); |
5154 | | // row = 1 |
5155 | 1.01M | _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b); |
5156 | | |
5157 | 1.01M | src_temp0_16x8b = src_bottom_16x8b; |
5158 | 1.01M | pu1_src_cpy += (src_strd << 1); |
5159 | 1.01M | pu1_src_left_cpy += 4; |
5160 | 1.01M | } |
5161 | 128k | ht_rem = ht & 0x1; |
5162 | | |
5163 | 128k | if(ht_rem) |
5164 | 7.49k | { |
5165 | 7.49k | left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy); |
5166 | 7.49k | src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd)); |
5167 | | //to insert left in row 1 |
5168 | 7.49k | signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 12); |
5169 | | //manipulation for row 1 - row 0 |
5170 | 7.49k | signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 14); |
5171 | | |
5172 | | //current row -next row |
5173 |  | //separating +ve and -ve values. |
5174 | 7.49k | cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b); |
5175 | 7.49k | cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b); |
5176 | | //creating mask 00 for +ve and -ve values and FF for zero. |
5177 | 7.49k | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
5178 | 7.49k | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
5179 | | //combining the appropriate sign change |
5180 | 7.49k | edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); |
5181 | | //adding top and bottom and constant 2 |
5182 | 7.49k | edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); |
5183 | 7.49k | edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b); |
5184 | | //eliminating old left for row 0 and row 1 |
5185 | 7.49k | left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2); |
5186 | | //row0 getting it right for left of next block |
5187 | 7.49k | left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14); |
5188 | | //copying the next top |
5189 | 7.49k | src_top_16x8b = src_temp0_16x8b; |
5190 | | |
5191 | 7.49k | edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b); |
5192 | | //using availability mask |
5193 | 7.49k | edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b); |
5194 | | |
5195 | | //adding chroma offset to access U and V |
5196 | 7.49k | edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b); |
5197 | | |
5198 | | |
5199 | 7.49k | edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b); |
5200 | | |
5201 |  | //convert to 16 bit, then add, then saturating pack |
5202 | 7.49k | signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b); |
5203 | 7.49k | cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b); |
5204 | 7.49k | cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b); |
5205 | 7.49k | src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b); |
5206 | 7.49k | edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b); |
5207 | 7.49k | cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b); |
5208 | 7.49k | src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b); |
5209 | 7.49k | src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b); |
5210 | | |
5211 | | //store left boundary |
5212 | 7.49k | _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b); |
5213 | | |
5214 | 7.49k | _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b); |
5215 | 7.49k | pu1_src_cpy += (src_strd); |
5216 | 7.49k | src_temp0_16x8b = src_bottom_16x8b; |
5217 | 7.49k | pu1_src_left_cpy += 2; |
5218 | 7.49k | } |
5219 | 128k | { //for bottom right |
5220 | 128k | left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy); |
5221 | 128k | left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2); |
5222 | 128k | left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14); |
5223 | 128k | _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b); |
5224 | 128k | } |
5225 | 128k | if(0 == pu1_avail[3]) |
5226 | 3.49k | { |
5227 | 3.49k | src_top_16x8b = src_bottom_16x8b; |
5228 | 3.49k | } |
5229 | | //for the top left of next part of the block |
5230 | 128k | left_store_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col)); |
5231 | | //updating top flag |
5232 | 128k | _mm_storeu_si128((__m128i *)(pu1_src_top + wd - col), src_top_16x8b); |
5233 | 128k | pu1_src += 16; |
5234 | 128k | au1_mask_cpy += 16; |
5235 | 128k | } |
5236 | 62.9k | pu1_src_left_cpy = pu1_src_left_cpy2; |
5237 | 62.9k | wd_rem = wd & 0xF; |
5238 | 62.9k | if(wd_rem) |
5239 | 16 | { |
5240 | 16 | pu1_src_cpy = pu1_src; |
5241 | 16 | src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_top_cpy + wd - col + 2)); |
5242 | | //row = 0 |
5243 | 16 | src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy)); |
5244 | 16 | au1_mask8x16b = _mm_loadl_epi64((__m128i *)au1_mask_cpy); //load lower 8 bytes of the availability mask |
5245 |  | //separating +ve and -ve values. |
5246 | 16 | cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b); |
5247 | 16 | cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b); |
5248 | | //creating mask 00 for +ve and -ve values and FF for zero. |
5249 | 16 | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
5250 | 16 | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
5251 | | //preparing au1_mask |
5252 | 16 | au1_mask8x16b = _mm_unpacklo_epi64(au1_mask8x16b, au1_mask8x16b); |
5253 | | //combining the appropriate sign change |
5254 | 16 | signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); |
5255 | 16 | signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8); |
5256 | 16 | pu1_src_left_cpy = pu1_src_left_cpy2; |
5257 | 18 | for(row = ht; row >= 4; row -= 4) |
5258 | 2 | { |
5259 | 2 | left_store_16x8b = _mm_loadu_si128((__m128i *)pu1_src_left_cpy); |
5260 | | //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos. |
5261 | 2 | src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd)); |
5262 | | // row = 2 |
5263 | 2 | src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd)); |
5264 | | //manipulation for row 0 -row 1 |
5265 | 2 | signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 12); |
5266 | | //row 1 left |
5267 | 2 | signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 14); |
5268 | | //row 0 -row1 |
5269 |  | //separating +ve and -ve values. |
5270 | 2 | cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b); |
5271 | 2 | cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b); |
5272 | | |
5273 | | //creating mask 00 for +ve and -ve values and FF for zero. |
5274 | 2 | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
5275 | 2 | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
5276 |  | //manipulation for row 1 - row 0 |
5277 | 2 | signdwn1_16x8b = _mm_srli_si128(src_temp0_16x8b, 2); |
5278 | | //combining the appropriate sign change |
5279 | 2 | edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); |
5280 | | //row 1 -row0 |
5281 |  | //separating +ve and -ve values. |
5282 | 2 | cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b); |
5283 | 2 | cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b); |
5284 | | |
5285 | | //creating mask 00 for +ve and -ve values and FF for zero. |
5286 | 2 | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
5287 | 2 | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
5288 | | //row1-row0 |
5289 | 2 | edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); |
5290 | | |
5291 | 2 | edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1) |
5292 | | |
5293 | 2 | signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top) |
5294 | | //manipulation for row 1 -row 2 |
5295 | 2 | signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 10); |
5296 | | //row 2 left |
5297 | 2 | signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 14); |
5298 |  | //packing row 0 and row 1 |
5299 | 2 | src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, src_temp1_16x8b); |
5300 | | //row1 -row2 |
5301 | 2 | cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b); |
5302 | 2 | cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b); |
5303 | | //creating mask 00 for +ve and -ve values and FF for zero. |
5304 | 2 | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
5305 | 2 | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
5306 | | //combining the appropriate sign change |
5307 | 2 | signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2) |
5308 | 2 | edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1) |
5309 | | |
5310 | | //row 1 right |
5311 | 2 | signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2); |
5312 | | //row = 3 |
5313 | 2 | src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 3 * src_strd)); |
5314 | | |
5315 | | // row = 4 |
5316 | 2 | src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 4 * src_strd)); |
5317 | | |
5318 | 2 | edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty |
5319 | | |
5320 | | //separating +ve and -ve values.(2,1)
5321 | 2 | cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b); |
5322 | 2 | cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b); |
5323 | | |
5324 | | //creating mask 00 for +ve and -ve values and FF for zero. |
5325 | 2 | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
5326 | 2 | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
5327 | | //row 2 right |
5328 | 2 | signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 2); |
5329 | | //combining the appropriate sign change |
5330 | 2 | signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1) |
5331 | | |
5332 | | //separating +ve and -ve values.(3,2)
5333 | 2 | cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b); |
5334 | 2 | cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b); |
5335 | 2 | signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8); //aligned left (2-1) |
5336 | | //creating mask 00 for +ve and -ve values and FF for zero. |
5337 | 2 | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
5338 | 2 | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
5339 | | //manipulation for row 2 -row 3 |
5340 | 2 | signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 8); |
5341 | | //row 3 left |
5342 | 2 | signdwn1_16x8b = _mm_alignr_epi8(src_top_16x8b, signdwn1_16x8b, 14); |
5343 | | //combining the appropriate sign change |
5344 | 2 | edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-2) |
5345 | | |
5346 | 2 | signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(3-2) ,(2-1) |
5347 | | |
5348 | | //separating +ve and -ve values.(2,3)
5349 | 2 | cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b); |
5350 | 2 | cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b); |
5351 | | |
5352 | | //manipulation for row 3 -bottom |
5353 | 2 | signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 6); |
5354 | | //bottom left |
5355 | 2 | signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 14); |
5356 | | |
5357 | | //creating mask 00 for +ve and -ve values and FF for zero. |
5358 | 2 | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
5359 | 2 | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
5360 | | //combining the appropriate sign change |
5361 | 2 | edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-3) |
5362 | | |
5363 | | //separating +ve and -ve values.(3,bottom)
5364 | 2 | cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b); |
5365 | 2 | cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b); |
5366 | | |
5367 | | //creating mask 00 for +ve and -ve values and FF for zero. |
5368 | 2 | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
5369 | 2 | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
5370 | 2 | edge1_16x8b = _mm_slli_si128(edge1_16x8b, 8); //aligned left (2-3) |
5371 | | //combining the appropriate sign change |
5372 | 2 | signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-bottom) |
5373 | 2 | edge1_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge1_16x8b, 8); //(3-bottom),(2-3) |
5374 | | |
5375 | | |
5376 | | //eliminating old left for row 0,1,2,3 |
5377 | 2 | left_store_16x8b = _mm_srli_si128(left_store_16x8b, 8); |
5378 | | //packing row 2 and row 3
5379 | 2 | src_bottom_16x8b = _mm_unpacklo_epi64(src_bottom_16x8b, src_top_16x8b); |
5380 | | //row 3 right |
5381 | 2 | signdwn1_16x8b = _mm_srli_si128(src_top_16x8b, 2); |
5382 | | //loading row 3 right into left |
5383 | 2 | left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_bottom_16x8b, 14); |
5384 | | //adding bottom and top values of row 2 and row 3 |
5385 | 2 | edge1_16x8b = _mm_add_epi8(edge1_16x8b, signup0_16x8b); //(3,2) |
5386 | | //separating +ve and -ve values.(bottom,3)
5387 | 2 | cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b); |
5388 | 2 | cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b); |
5389 | | //to store right of row 2 |
5390 | 2 | signdwn1_16x8b = _mm_slli_si128(src_bottom_16x8b, 8); |
5391 | | //creating mask 00 for +ve and -ve values and FF for zero. |
5392 | 2 | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
5393 | 2 | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
5394 | 2 | signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(bottom -3) for next iteration |
5395 | | |
5396 | | //storing right of row 2 into left
5397 | 2 | left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14); |
5398 | | //to store right of row 0 |
5399 | 2 | signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8); |
5400 | | //storing right of row 1 into left |
5401 | 2 | left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14); |
5402 | | //storing right of row 0 into left |
5403 | 2 | left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14); |
5404 | | |
5405 | | |
5406 | | //adding constant 2 |
5407 | 2 | edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b); |
5408 | 2 | edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b); |
5409 | | //shuffle to get sao index |
5410 | 2 | edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b); |
5411 | 2 | edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b); |
5412 | | //using availability mask |
5413 | 2 | edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b); |
5414 | 2 | edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b); |
5415 | | //adding chroma offset to access U and V |
5416 | 2 | edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b); |
5417 | 2 | edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b); |
5418 | | //shuffle to get sao offset |
5419 | 2 | edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b); |
5420 | 2 | edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b); |
5421 | | |
5422 | | //convert to 16 bit, then add, then pack with saturation
5423 | 2 | signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b); |
5424 | 2 | cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b); |
5425 | 2 | cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b); |
5426 | 2 | src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b); |
5427 | 2 | edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b); |
5428 | 2 | cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b); |
5429 | 2 | src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b); |
5430 | 2 | src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b); |
5431 | | |
5432 | 2 | signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b); |
5433 | 2 | cmp_gt0_16x8b = _mm_unpacklo_epi8(src_bottom_16x8b, const0_16x8b); |
5434 | 2 | cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b); |
5435 | 2 | src_bottom_16x8b = _mm_unpackhi_epi8(src_bottom_16x8b, const0_16x8b); |
5436 | 2 | edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b); |
5437 | 2 | cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b); |
5438 | 2 | src_bottom_16x8b = _mm_add_epi16(src_bottom_16x8b, edge1_16x8b); |
5439 | 2 | src_bottom_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_bottom_16x8b); |
5440 | | |
5441 | 2 | cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8); |
5442 | 2 | cmp_lt0_16x8b = _mm_srli_si128(src_bottom_16x8b, 8); |
5443 | 2 | _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b); |
5444 | | //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos. |
5445 | 2 | _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b); |
5446 | | // row = 1 |
5447 | 2 | _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b); |
5448 | | //row = 2 |
5449 | 2 | _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_bottom_16x8b); |
5450 | | // row = 3 |
5451 | 2 | _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b); |
5452 | | |
5453 | 2 | src_temp0_16x8b = src_temp1_16x8b; |
5454 | 2 | signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8); |
5455 | 2 | pu1_src_cpy += (src_strd << 2); |
5456 | 2 | pu1_src_left_cpy += 8; |
5457 | 2 | } |
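/* [Editor's sketch] Per pixel, one trip of the loop above reduces to the
 * following scalar steps (hypothetical names; diag_up/diag_down are the
 * 45-degree neighbours -- top-right and bottom-left samples of the same
 * chroma plane -- built by the alignr/shift manipulations):
 *
 *     edge_idx = 2 + SIGN(cur - diag_up) + SIGN(cur - diag_down);
 *     edge_idx = edge_idx_tbl[edge_idx];   // _mm_shuffle_epi8(edge_idx_8x16b, ...)
 *     edge_idx &= avail;                   // au1_mask8x16b zeroes unavailable lanes
 *     off = sao_offset_tbl[edge_idx + uv]; // chroma_offset_8x16b selects the U or
 *                                          // V half of the table (assumed layout)
 *     out = CLIP3(cur + off, 0, 255);
 *
 * Four rows of eight bytes (four interleaved U/V pairs) are processed per trip. */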
5458 | 16 | ht_rem = ht & 0x2; |
5459 | 16 | if(ht_rem) |
5460 | 16 | { |
5461 | 16 | left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy); |
5462 | | //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos. |
5463 | 16 | src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd)); |
5464 | | // row = 2 |
5465 | 16 | src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd)); |
5466 | | |
5467 | | //manipulation for row 0 -row 1 |
5468 | 16 | signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 12); |
5469 | | //row 1 left
5470 | 16 | signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 14); |
5471 | | //separating +ve and -ve values.
5472 | 16 | cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b); |
5473 | 16 | cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b); |
5474 | | |
5475 | | //creating mask 00 for +ve and -ve values and FF for zero. |
5476 | 16 | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
5477 | 16 | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
5478 | | //manipulation for row 1 - row 0 |
5479 | 16 | signdwn1_16x8b = _mm_srli_si128(src_temp0_16x8b, 2); |
5480 | | //combining the appropriate sign change |
5481 | 16 | edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); |
5482 | | |
5483 | | //row1-row0 |
5484 | | //separating +ve and -ve values.
5485 | 16 | cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b); |
5486 | 16 | cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b); |
5487 | | |
5488 | | //creating mask 00 for +ve and -ve values and FF for zero. |
5489 | 16 | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
5490 | 16 | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
5491 | | //combining the appropriate sign change
5492 | 16 | edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); |
5493 | | |
5494 | | //manipulation for row 1 -bottom |
5495 | 16 | signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 10); |
5496 | | //bottom left |
5497 | 16 | signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 14); |
5498 | | |
5499 | 16 | edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1) |
5500 | 16 | signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top) |
5501 | | //row1 -bottom |
5502 | 16 | cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b); |
5503 | 16 | cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b); |
5504 | | |
5505 | | //creating mask 00 for +ve and -ve values and FF for zero. |
5506 | 16 | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
5507 | 16 | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
5508 | | //combining the appropriate sign change |
5509 | 16 | signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2) |
5510 | 16 | edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1) |
5511 | | |
5512 | | //manipulation for bottom - row 1 (row 1 right)
5513 | 16 | signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2); |
5514 | | //adding top and down subtractions
5515 | 16 | edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty |
5516 | | //bottom - row 1 |
5517 | 16 | cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b); |
5518 | 16 | cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b); |
5519 | | |
5520 | | //eliminating old left for row 0,1 |
5521 | 16 | left_store_16x8b = _mm_srli_si128(left_store_16x8b, 4); |
5522 | 16 | signdwn1_16x8b = _mm_slli_si128(src_temp1_16x8b, 8); |
5523 | | //creating mask 00 for +ve and -ve values and FF for zero. |
5524 | 16 | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
5525 | 16 | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
5526 | | //signup0_16x8b for the next iteration
5527 | 16 | signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1) for next |
5528 | | |
5529 | | //storing right of row 1 into left |
5530 | 16 | left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14); |
5531 | | //for storing right of row 0
5532 | 16 | signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8); |
5533 | | |
5534 | 16 | src_top_16x8b = src_temp1_16x8b; |
5535 | | //storing right of row 0 into left |
5536 | 16 | left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14); |
5537 | | |
5538 | | //adding constant 2 |
5539 | 16 | edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b); |
5540 | | |
5541 | | //shuffle to get sao index |
5542 | 16 | edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b); |
5543 | | //using availability mask |
5544 | 16 | edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b); |
5545 | | //adding chroma offset to access U and V |
5546 | 16 | edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b); |
5547 | | //shuffle to get sao offset |
5548 | 16 | edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b); |
5549 | | //the next top row is already in src_top_16x8b
5550 | | //convert to 16 bit, then add, then pack with saturation
5551 | 16 | signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b); |
5552 | 16 | src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b); |
5553 | 16 | cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b); |
5554 | 16 | src_temp1_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b); |
5555 | 16 | edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b); |
5556 | 16 | src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b); |
5557 | 16 | src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge0_16x8b); |
5558 | 16 | src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, src_temp1_16x8b); |
5559 | | |
5560 | 16 | cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8); |
5561 | | |
5562 | 16 | _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b); |
5563 | | //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos. |
5564 | 16 | _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b); |
5565 | | // row = 1 |
5566 | 16 | _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b); |
5567 | 16 | src_temp0_16x8b = src_bottom_16x8b; |
5568 | 16 | pu1_src_cpy += (src_strd << 1); |
5569 | 16 | pu1_src_left_cpy += 4; |
5570 | 16 | } |
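/* [Editor's note] ht is decomposed as 4*n rows in the main loop, the
 * (ht & 0x2) two-row block above, and the (ht & 0x1) single-row block
 * below. In every variant the unpack/add/packus tail widens to 16 bit so
 * the signed offset byte cannot wrap the 8-bit sum; per byte (sketch):
 *
 *     sum = (int)cur + (int)(signed char)off;    // unpacklo_epi8 with zero /
 *                                                // cmpgt-derived sign mask
 *     out = sum < 0 ? 0 : sum > 255 ? 255 : sum; // _mm_packus_epi16
 */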
5571 | 16 | ht_rem = ht & 0x1; |
5572 | 16 | if(ht_rem) |
5573 | 4 | { |
5574 | 4 | left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy); |
5575 | | //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos. |
5576 | 4 | src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd)); |
5577 | | |
5578 | | |
5579 | | //manipulation for row 0 -bottom |
5580 | 4 | signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 12); |
5581 | | //bottom left |
5582 | 4 | signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 14); |
5583 | | //separating +ve and -ve values.
5584 | 4 | cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b); |
5585 | 4 | cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b); |
5586 | | //creating mask 00 for +ve and -ve values and FF for zero. |
5587 | 4 | cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); |
5588 | 4 | cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); |
5589 | | //combining the appropriate sign change |
5590 | 4 | edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); |
5591 | | //adding top and down subtractions
5592 | 4 | edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); |
5593 | | //for row 0 right to put into left store |
5594 | 4 | signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8); |
5595 | | //adding constant 2 |
5596 | 4 | edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b); |
5597 | 4 | edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); |
5598 | 4 | edge0_16x8b = _mm_srli_si128(edge0_16x8b, 8); |
5599 | | //left store manipulation 1 |
5600 | 4 | left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2); |
5601 | | //filling the left boundary value |
5602 | 4 | left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14); |
5603 | 4 | src_top_16x8b = src_temp0_16x8b; |
5604 | | |
5605 | | //shuffle to get sao index |
5606 | 4 | edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b); |
5607 | | //using availability mask |
5608 | 4 | edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b); |
5609 | | //adding chroma offset to access U and V |
5610 | 4 | edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b); |
5611 | | //shuffle to get sao offset |
5612 | 4 | edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b); |
5613 | | |
5614 | | //convert to 16 bit, then add, then pack with saturation
5615 | 4 | signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b); |
5616 | 4 | src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b); |
5617 | 4 | cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b); |
5618 | 4 | src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b); |
5619 | 4 | src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, const0_16x8b); |
5620 | | |
5621 | 4 | _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b); |
5622 | | //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos. |
5623 | 4 | _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b); |
5624 | 4 | pu1_src_cpy += (src_strd); |
5625 | 4 | src_temp0_16x8b = src_bottom_16x8b; |
5626 | 4 | pu1_src_left_cpy += 2; |
5627 | 4 | } |
5628 | 16 | { //for bottom right |
5629 | 16 | left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy); |
5630 | 16 | left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2); |
5631 | 16 | src_temp0_16x8b = _mm_slli_si128(src_temp0_16x8b, 8); |
5632 | 16 | left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14); |
5633 | 16 | _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b); |
5634 | 16 | } |
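/* [Editor's note] This block appears to push the right-edge bytes of the
 * boundary row into the running left-neighbour buffer, so the next
 * 8-byte-wide column (pu1_src += 8 below) and the final copy into
 * pu1_src_left see the correct values. */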
5635 | 16 | if(0 == pu1_avail[3]) |
5636 | 14 | { |
5637 | 14 | src_top_16x8b = src_bottom_16x8b; |
5638 | 14 | } |
5639 | | |
5640 | 16 | _mm_storel_epi64((__m128i *)(pu1_src_top + wd - col), src_top_16x8b); |
5641 | 16 | pu1_src += 8; |
5642 | 16 | } |
5643 | 62.9k | pu1_src_org[wd - 2] = u1_pos_wd_0_tmp_u; |
5644 | 62.9k | pu1_src_org[wd - 1] = u1_pos_wd_0_tmp_v; |
5645 | 62.9k | pu1_src_org[(ht_tmp - 1) * src_strd] = u1_pos_0_ht_tmp_u; |
5646 | 62.9k | pu1_src_org[(ht_tmp - 1) * src_strd + 1] = u1_pos_0_ht_tmp_v; |
5647 | 2.05M | for(row = 0; row < 2 * ht_tmp; row++) |
5648 | 1.99M | { |
5649 | 1.99M | pu1_src_left[row] = au1_src_left_tmp[row]; |
5650 | 1.99M | } |
5651 | 62.9k | } |
5652 | | |
5653 | 62.9k | } |
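For reference, a compact scalar model of the 45-degree chroma edge-offset pass the vector code above implements. This is an editor's sketch: the function name, the dst/src split, the interior-only bounds, and the {1, 2, 0, 3, 4} remap table are illustrative assumptions consistent with the shuffles above, not the library's API.

static int sign3(int d) { return (d > 0) - (d < 0); }

static unsigned char clip_u8(int v)
{
    return (unsigned char)(v < 0 ? 0 : v > 255 ? 255 : v);
}

/* src/dst hold interleaved UVUV... bytes; wd is in bytes (two per chroma
 * pixel). Interior samples only: the real kernel also patches two corner
 * samples and maintains the top/left boundary arrays. off_u[0] and
 * off_v[0] are assumed to be 0 (category 0 carries no offset). */
static void sao_edge_class3_chroma_ref(unsigned char *dst,
                                       const unsigned char *src,
                                       int stride, int wd, int ht,
                                       const signed char off_u[5],
                                       const signed char off_v[5])
{
    static const signed char edge_idx_tbl[5] = { 1, 2, 0, 3, 4 };
    int row, col;
    for(row = 1; row < ht - 1; row++)
    {
        for(col = 2; col < wd - 2; col++)
        {
            int cur  = src[row * stride + col];
            int up   = src[(row - 1) * stride + col + 2]; /* top-right, same plane */
            int down = src[(row + 1) * stride + col - 2]; /* bottom-left, same plane */
            int idx  = edge_idx_tbl[2 + sign3(cur - up) + sign3(cur - down)];
            const signed char *off = (col & 1) ? off_v : off_u;
            dst[row * stride + col] = clip_u8(cur + off[idx]);
        }
    }
}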