/src/libhevc/common/x86/ihevc_deblk_ssse3_intr.c
Line | Count | Source (jump to first uncovered line) |
1 | | /****************************************************************************** |
2 | | * |
3 | | * Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore |
4 | | * |
5 | | * Licensed under the Apache License, Version 2.0 (the "License"); |
6 | | * you may not use this file except in compliance with the License. |
7 | | * You may obtain a copy of the License at: |
8 | | * |
9 | | * http://www.apache.org/licenses/LICENSE-2.0 |
10 | | * |
11 | | * Unless required by applicable law or agreed to in writing, software |
12 | | * distributed under the License is distributed on an "AS IS" BASIS, |
13 | | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
14 | | * See the License for the specific language governing permissions and |
15 | | * limitations under the License. |
16 | | * |
17 | | ******************************************************************************/ |
18 | | /** |
19 | | ******************************************************************************* |
20 | | * @file |
 21 |       |  *  ihevc_deblk_ssse3_intr.c |
22 | | * |
23 | | * @brief |
24 | | * Contains function definitions for deblocking filters |
25 | | * |
26 | | * @author |
27 | | * Rishab |
28 | | * |
29 | | * @par List of Functions: |
30 | | * - ihevc_deblk_luma_vert_ssse3() |
31 | | * - ihevc_deblk_luma_horz_ssse3() |
32 | | * - ihevc_deblk_chroma_vert_ssse3() |
33 | | * - ihevc_deblk_chroma_horz_ssse3() |
34 | | * |
35 | | * @remarks |
36 | | * None |
37 | | * |
38 | | ******************************************************************************* |
39 | | */ |
40 | | #include <stdlib.h> |
41 | | #include <stdio.h> |
42 | | #include <assert.h> |
43 | | #include "ihevc_typedefs.h" |
44 | | #include "ihevc_platform_macros.h" |
45 | | #include "ihevc_macros.h" |
46 | | #include "ihevc_deblk.h" |
47 | | #include "ihevc_deblk_tables.h" |
48 | | #include "ihevc_debug.h" |
49 | | |
50 | | #include "ihevc_tables_x86_intr.h" |
51 | | |
52 | | #include <immintrin.h> |
53 | | /** |
54 | | ******************************************************************************* |
55 | | * |
56 | | * @brief |
57 | | * Decision process and filtering for the luma block vertical edge. |
58 | | * |
59 | | * @par Description: |
60 | | * The decision process for the luma block vertical edge is carried out and |
61 | | * an appropriate filter is applied. The boundary filter strength, bs should |
62 | | * be greater than 0. The pcm flags and the transquant bypass flags should |
63 | | * be taken care of by the calling function. |
64 | | * |
65 | | * @param[in] pu1_src |
66 | | * Pointer to the src sample q(0,0) |
67 | | * |
68 | | * @param[in] src_strd |
69 | | * Source stride |
70 | | * |
71 | | * @param[in] bs |
72 | | * Boundary filter strength of q(0,0) |
73 | | * |
74 | | * @param[in] quant_param_p |
75 | | * quantization parameter of p block |
76 | | * |
77 | | * @param[in] quant_param_q |
 78 |       |  *  quantization parameter of q block |
79 | | * |
 80 |       |  * @param[in] beta_offset_div2 |
 81 |       |  *  coded beta offset (divided by 2); doubled and added to the average QP |
 82 |       |  *  when deriving the beta threshold index |
 83 |       |  * @param[in] tc_offset_div2 |
 84 |       |  *  coded tc offset (divided by 2); doubled and added when deriving the |
 85 |       |  *  tc threshold index |
86 | | * @param[in] filter_flag_p |
87 | | * flag whether to filter the p block |
88 | | * |
89 | | * @param[in] filter_flag_q |
90 | | * flag whether to filter the q block |
91 | | * |
92 | | * @returns |
93 | | * |
94 | | * @remarks |
95 | | * None |
96 | | * |
97 | | ******************************************************************************* |
98 | | */ |
99 | | |
void ihevc_deblk_luma_vert_ssse3(UWORD8 *pu1_src,
                                 WORD32 src_strd,
                                 WORD32 bs,
                                 WORD32 quant_param_p,
                                 WORD32 quant_param_q,
                                 WORD32 beta_offset_div2,
                                 WORD32 tc_offset_div2,
                                 WORD32 filter_flag_p,
                                 WORD32 filter_flag_q)
{
    WORD32 qp_luma, beta_indx, tc_indx;
    WORD32 beta, tc;
    WORD32 d, dp, dq, d_sam0, d_sam3;

    WORD32 d3, d0, de_0, de_1, de_2, de_3;
    WORD32 de, dep, deq;
    __m128i src_row0_8x16b, src_row1_8x16b, src_row2_8x16b, src_row3_8x16b;


    /* Decision stage: compute the gradient measures d0/d3/dp/dq and the      */
    /* strong/weak filter selections (de, dep, deq) for this 4-row vertical   */
    /* edge, using rows 0 and 3 only, as per the HEVC decision process.       */
    {
        __m128i src_tmp_8x16b, coef_8x16b, mask_d_result_4x32b, mask_de_result_8x16b;
        __m128i mask_16x8b, temp_coef0_8x16b, temp_coef1_8x16b;



        ASSERT((bs > 0) && (bs <= 3));
        ASSERT(filter_flag_p || filter_flag_q);

        /* Average QP of the two sides selects the beta threshold. */
        qp_luma = (quant_param_p + quant_param_q + 1) >> 1;
        beta_indx = CLIP3(qp_luma + (beta_offset_div2 << 1), 0, 51);

        /* BS based on implementation can take value 3 if it is intra/inter edge */
        /* based on BS, tc index is calculated by adding 2 * ( bs - 1) to QP and tc_offset */
        /* for BS = 1 adding factor is (0*2), BS = 2 or 3 adding factor is (1*2) */
        /* the above desired functionality is achieved by doing (2*(bs>>1)) */

        tc_indx = CLIP3(qp_luma + (2 * (bs >> 1)) + (tc_offset_div2 << 1), 0, 53);

        beta = gai4_ihevc_beta_table[beta_indx];
        tc = gai4_ihevc_tc_table[tc_indx];
        /* tc == 0 means no filtering at all for this edge. */
        if(0 == tc)
        {
            return;
        }
        /* Load rows 0 and 3: 4 p-pixels followed by 4 q-pixels each. */
        src_row0_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src - 4));
        src_row3_8x16b = _mm_loadl_epi64((__m128i *)((pu1_src - 4) + 3 * src_strd));

        /* coef_d / shuffle_d are lane-layout tables declared in               */
        /* ihevc_tables_x86_intr.h; they arrange the madd/shuffle operands for */
        /* the d (second-derivative) and de (edge-activity) computations.      */
        coef_8x16b = _mm_load_si128((__m128i *)(coef_d));
        mask_16x8b = _mm_load_si128((__m128i *)(shuffle_d));

        src_tmp_8x16b = _mm_unpacklo_epi64(src_row0_8x16b, src_row3_8x16b);
        mask_de_result_8x16b = _mm_shuffle_epi8(src_tmp_8x16b, mask_16x8b);

        mask_d_result_4x32b = _mm_maddubs_epi16(src_tmp_8x16b, coef_8x16b);


        //to get all 1's of 8 bit in (1)
        temp_coef0_8x16b = _mm_cmpeq_epi16(src_tmp_8x16b, src_tmp_8x16b);
        temp_coef1_8x16b = _mm_srli_epi16(temp_coef0_8x16b, 15);
        //accumulating values for dp3 dq3 , dp0 dq0 values
        mask_d_result_4x32b = _mm_madd_epi16(mask_d_result_4x32b, temp_coef1_8x16b);

        temp_coef1_8x16b = _mm_packus_epi16(temp_coef1_8x16b, temp_coef1_8x16b);
        // to get all 1,-1 sets of 16 bits in (0)
        temp_coef0_8x16b = _mm_unpacklo_epi8(temp_coef0_8x16b, temp_coef1_8x16b);
        //q33-q30,p33-p30,q03-q00,p03-p00,0,q30-p30,0,q00-p00
        mask_de_result_8x16b = _mm_maddubs_epi16(mask_de_result_8x16b, temp_coef0_8x16b);
        //to get 16 bit 1's
        temp_coef0_8x16b = _mm_srli_epi16(temp_coef1_8x16b, 8);


        // dq3 dp3 dq0 dp0
        mask_d_result_4x32b = _mm_abs_epi32(mask_d_result_4x32b);
        mask_16x8b = _mm_shuffle_epi32(mask_d_result_4x32b, 0xec);
        mask_d_result_4x32b = _mm_shuffle_epi32(mask_d_result_4x32b, 0x49);
        // dq dp d3 d0
        mask_d_result_4x32b = _mm_add_epi32(mask_d_result_4x32b, mask_16x8b);
        //|q33-q30|,|p33-p30|,|q03-q00|,|p03-p00|,0,|q30-p30|,0,|q00-p00|
        mask_de_result_8x16b = _mm_abs_epi16(mask_de_result_8x16b);
        //|q33-q30|+|p33-p30|,|q03-q00|+|p03-p00|,0+|q30-p30|,0+|q00-p00|
        mask_de_result_8x16b = _mm_madd_epi16(mask_de_result_8x16b, temp_coef0_8x16b);

        ///extract the four 32-bit lanes back to scalars
        temp_coef0_8x16b = _mm_srli_si128(mask_d_result_4x32b, 4);
        temp_coef1_8x16b = _mm_srli_si128(mask_d_result_4x32b, 8);
        mask_16x8b = _mm_srli_si128(mask_d_result_4x32b, 12);

        d0 = _mm_cvtsi128_si32(mask_d_result_4x32b);
        d3 = _mm_cvtsi128_si32(temp_coef0_8x16b);
        dp = _mm_cvtsi128_si32(temp_coef1_8x16b);
        dq = _mm_cvtsi128_si32(mask_16x8b);
        //getting d: overall activity across rows 0 and 3
        d = d0 + d3;

        ///extract the four strong-filter decision terms to scalars
        temp_coef0_8x16b = _mm_srli_si128(mask_de_result_8x16b, 4);
        temp_coef1_8x16b = _mm_srli_si128(mask_de_result_8x16b, 8);
        mask_16x8b = _mm_srli_si128(mask_de_result_8x16b, 12);

        de_0 = _mm_cvtsi128_si32(mask_de_result_8x16b);
        de_1 = _mm_cvtsi128_si32(temp_coef0_8x16b);
        de_2 = _mm_cvtsi128_si32(temp_coef1_8x16b);
        de_3 = _mm_cvtsi128_si32(mask_16x8b);

        /* de: 0 = no filtering, 1 = weak filter, 2 = strong filter.          */
        /* dep/deq: whether the weak filter also touches p1/q1.               */
        de = 0;
        dep = 0;
        deq = 0;
        if(d < beta)
        {
            /* Strong-filter test for row 0 (flatness, step size and delta    */
            /* magnitude against beta/tc thresholds).                         */
            d_sam0 = 0;
            if((2 * d0 < (beta >> 2))
                            && (de_2 < (beta >> 3))
                            && (de_0 < ((5 * tc + 1) >> 1)))
            {
                d_sam0 = 1;
            }

            /* Same strong-filter test for row 3. */
            d_sam3 = 0;
            if((2 * d3 < (beta >> 2))
                            && (de_3 < (beta >> 3))
                            && de_1 < ((5 * tc + 1) >> 1))
            {
                d_sam3 = 1;
            }

            /* Strong filter only when both rows qualify. */
            de = (d_sam0 & d_sam3) + 1;
            dep = (dp < (beta + (beta >> 1)) >> 3) ? 1 : 0;
            deq = (dq < (beta + (beta >> 1)) >> 3) ? 1 : 0;
            /* p1/q1 are never modified by the weak filter when tc <= 1. */
            if(tc <= 1)
            {
                dep = 0;
                deq = 0;
            }
        }

    }

    /* Filtering stage: applied to all four rows of the edge. */
    if(de != 0)
    {


        src_row1_8x16b = _mm_loadl_epi64((__m128i *)((pu1_src - 4) + src_strd));
        src_row2_8x16b = _mm_loadl_epi64((__m128i *)((pu1_src - 4) + 2 * src_strd));

        if(de == 2)
        {
            /* ---------------- strong filter ---------------- */
            /* Modifies p2..p0 and q0..q2 on every row; each output is        */
            /* clamped to [orig - 2*tc, orig + 2*tc], and the per-side        */
            /* filter_flag masks select which side is actually written.       */
            __m128i temp_pq_str0_16x8b;
            __m128i temp_pq1_str0_16x8b, temp_pq1_str1_16x8b;
            __m128i temp_pq2_str0_16x8b;
            __m128i temp_pq_str1_16x8b;
            __m128i temp_str0_16x8b, temp_str1_16x8b, temp_str2_16x8b, temp_str3_16x8b;
            __m128i temp_max0_16x8b, temp_max1_16x8b, temp_min0_16x8b, temp_min1_16x8b;
            __m128i const2_8x16b, const2tc_8x16b;
            LWORD64 mask, tc2;
            /* Strong-filter clipping range is +/- 2*tc. */
            tc = tc << 1;
            /* Sign bits at byte offsets select the q side (bit 63) and the   */
            /* p side (bit 31) for the clipping mask built below.             */
            mask = (((LWORD64)filter_flag_q) << 63) | (((LWORD64)filter_flag_p) << 31);
            tc2 = ((LWORD64)tc);

            const2_8x16b = _mm_cmpeq_epi16(src_row0_8x16b, src_row0_8x16b);
            //q'0-q'1-2 ,p'0-p'1-2
            src_row0_8x16b = _mm_unpacklo_epi64(src_row0_8x16b, src_row2_8x16b);
            src_row1_8x16b = _mm_unpacklo_epi64(src_row1_8x16b, src_row3_8x16b);

            const2_8x16b = _mm_srli_epi16(const2_8x16b, 15);
            temp_pq_str0_16x8b = _mm_srli_epi64(src_row0_8x16b, 16);
            temp_pq_str1_16x8b = _mm_srli_epi64(src_row1_8x16b, 16);
            //arranged x x x x x x x x q31 q30 q11 q10 p30 p31 p10 p11 , x x x x x x x x q21 q20 q01 q00 p20 p21 p00 p01
            temp_str0_16x8b = _mm_unpacklo_epi16(temp_pq_str0_16x8b, temp_pq_str1_16x8b);
            temp_str1_16x8b = _mm_unpackhi_epi16(temp_pq_str0_16x8b, temp_pq_str1_16x8b);

            const2_8x16b = _mm_packus_epi16(const2_8x16b, const2_8x16b);
            //arranged q31 q30 q21 q20 q11 q10 q01 q00 p30 p31 p20 p21 p10 p11 p00 p01
            temp_pq_str0_16x8b = _mm_unpacklo_epi32(temp_str0_16x8b, temp_str1_16x8b);

            temp_pq_str0_16x8b = _mm_maddubs_epi16(temp_pq_str0_16x8b, const2_8x16b);

            //q'1-2, p'1-2
            temp_pq1_str0_16x8b = _mm_srli_epi64(src_row0_8x16b, 8);
            temp_pq1_str1_16x8b = _mm_srli_epi64(src_row1_8x16b, 8);

            temp_str2_16x8b = _mm_unpacklo_epi16(temp_pq1_str0_16x8b, temp_pq1_str1_16x8b);
            temp_str3_16x8b = _mm_unpackhi_epi16(temp_pq1_str0_16x8b, temp_pq1_str1_16x8b);

            temp_str2_16x8b = _mm_shuffle_epi32(temp_str2_16x8b, 0x58);
            temp_str3_16x8b = _mm_shuffle_epi32(temp_str3_16x8b, 0x58);
            // q30 p30 q20 p20 q10 p10 q01 q00 p30 q20 p20 q10 p10 q01 q00 p00
            temp_pq1_str0_16x8b = _mm_unpackhi_epi32(temp_str2_16x8b, temp_str3_16x8b);
            // q32 q31 q22 q21 q12 q11 q02 q01 p32 p31 p22 p21 p12 p11 p02 p01
            temp_pq1_str1_16x8b = _mm_unpacklo_epi32(temp_str2_16x8b, temp_str3_16x8b);

            temp_pq1_str0_16x8b = _mm_maddubs_epi16(temp_pq1_str0_16x8b, const2_8x16b);
            temp_pq1_str1_16x8b = _mm_maddubs_epi16(temp_pq1_str1_16x8b, const2_8x16b);

            //clipping mask design: broadcast 2*tc to all bytes, then keep it
            //only on the side(s) whose filter flag is set
            temp_str1_16x8b = _mm_setzero_si128();
            temp_str0_16x8b = _mm_loadl_epi64((__m128i *)(&mask));
            const2tc_8x16b = _mm_loadl_epi64((__m128i *)(&tc2));
            temp_str0_16x8b = _mm_shuffle_epi32(temp_str0_16x8b, 0x44);
            const2tc_8x16b = _mm_shuffle_epi8(const2tc_8x16b, temp_str1_16x8b);

            //clipping mask design
            temp_str0_16x8b = _mm_srai_epi32(temp_str0_16x8b, 31);
            const2tc_8x16b = _mm_and_si128(const2tc_8x16b, temp_str0_16x8b);
            //calculating Clipping MAX for all pixel values.
            temp_max0_16x8b = _mm_adds_epu8(src_row0_8x16b, const2tc_8x16b);
            temp_max1_16x8b = _mm_adds_epu8(src_row1_8x16b, const2tc_8x16b);


            //q'2-q'0-2,p'2-p'0-2
            temp_pq2_str0_16x8b = _mm_unpacklo_epi16(src_row0_8x16b, src_row2_8x16b);
            temp_str3_16x8b = _mm_unpacklo_epi16(src_row1_8x16b, src_row3_8x16b);

            temp_pq2_str0_16x8b = _mm_shuffle_epi32(temp_pq2_str0_16x8b, 0x5c);
            temp_str3_16x8b = _mm_shuffle_epi32(temp_str3_16x8b, 0x5c);

            const2_8x16b = _mm_slli_epi16(const2_8x16b, 1);
            //arranged q33 q32 q23 q22 q13 q12 q03 q02 p33 p32 p23 p22 p13 p12 p03 p02
            temp_str3_16x8b = _mm_unpacklo_epi16(temp_pq2_str0_16x8b, temp_str3_16x8b);

            temp_pq2_str0_16x8b = _mm_maddubs_epi16(temp_str3_16x8b, const2_8x16b);

            //calculating Clipping MIN for all pixel values.
            temp_min0_16x8b = _mm_subs_epu8(src_row0_8x16b, const2tc_8x16b);
            temp_min1_16x8b = _mm_subs_epu8(src_row1_8x16b, const2tc_8x16b);
            //q'0-q'1-2 ,p'0-p'1-2
            temp_pq_str1_16x8b = _mm_shuffle_epi32(temp_pq_str0_16x8b, 0x4e);
            temp_pq_str0_16x8b = _mm_add_epi16(temp_pq_str0_16x8b, temp_pq_str1_16x8b);
            //q'1-2 p'1-2
            temp_pq1_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, temp_pq1_str1_16x8b);
            //to get 2 in 16 bit
            const2_8x16b = _mm_srli_epi16(const2_8x16b, 8);
            //to get q33 q23 q13 q03, p33 p23 p13 p03
            temp_pq1_str1_16x8b = _mm_slli_epi16(temp_str3_16x8b, 8);
            temp_pq_str1_16x8b = _mm_srli_epi16(temp_str3_16x8b, 8);
            temp_pq1_str1_16x8b = _mm_srli_epi16(temp_pq1_str1_16x8b, 8);

            //q'1, p'1 (adding 2 for rounding)
            temp_pq1_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, const2_8x16b);
            //q'0-q'1,p'0-p'1
            temp_pq_str0_16x8b = _mm_add_epi16(temp_pq_str0_16x8b, const2_8x16b);
            //q'2-q'1,p'2-p'1
            temp_pq2_str0_16x8b = _mm_add_epi16(temp_pq2_str0_16x8b, const2_8x16b);
            //q'0 = (q'0-q'1)+q'1 ,p'0 = (p'0-p'1)+p'1;
            temp_pq_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, temp_pq_str0_16x8b);
            //q'2 = (q'2-q'1)+q'1 ,p'2 = (p'2-p'1)+p'1;
            temp_pq2_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, temp_pq2_str0_16x8b);

            //normalisation of all modified pixels
            temp_pq_str0_16x8b = _mm_srai_epi16(temp_pq_str0_16x8b, 3);
            temp_pq1_str0_16x8b = _mm_srai_epi16(temp_pq1_str0_16x8b, 2);
            temp_pq2_str0_16x8b = _mm_srai_epi16(temp_pq2_str0_16x8b, 3);

            //getting p0 p1 together and p2 p3 together
            temp_str0_16x8b = _mm_unpacklo_epi16(temp_pq1_str0_16x8b, temp_pq_str0_16x8b);
            temp_str2_16x8b = _mm_unpacklo_epi16(temp_pq1_str1_16x8b, temp_pq2_str0_16x8b);
            //getting q1 q0 together and q3 q2 together
            temp_pq_str0_16x8b = _mm_unpackhi_epi16(temp_pq_str0_16x8b, temp_pq1_str0_16x8b);
            temp_pq2_str0_16x8b = _mm_unpackhi_epi16(temp_pq2_str0_16x8b, temp_pq_str1_16x8b);
            //getting p's of row0 row1 together and of row2 row3 together
            temp_pq_str1_16x8b = _mm_unpacklo_epi32(temp_str2_16x8b, temp_str0_16x8b);
            temp_str2_16x8b = _mm_unpackhi_epi32(temp_str2_16x8b, temp_str0_16x8b);
            //getting q's of row0 row1 together and of row2 row3 together
            temp_str0_16x8b = _mm_unpacklo_epi32(temp_pq_str0_16x8b, temp_pq2_str0_16x8b);
            temp_pq_str0_16x8b = _mm_unpackhi_epi32(temp_pq_str0_16x8b, temp_pq2_str0_16x8b);
            //getting values for respective rows in 16 bit
            src_row0_8x16b = _mm_unpacklo_epi64(temp_pq_str1_16x8b, temp_str0_16x8b);
            src_row1_8x16b = _mm_unpackhi_epi64(temp_pq_str1_16x8b, temp_str0_16x8b);
            src_row2_8x16b = _mm_unpacklo_epi64(temp_str2_16x8b, temp_pq_str0_16x8b);
            src_row3_8x16b = _mm_unpackhi_epi64(temp_str2_16x8b, temp_pq_str0_16x8b);
            //packing values to 8 bit
            src_row0_8x16b = _mm_packus_epi16(src_row0_8x16b, src_row2_8x16b);
            src_row1_8x16b = _mm_packus_epi16(src_row1_8x16b, src_row3_8x16b);
            //Clipping MAX: also restores unfiltered pixels on a disabled side,
            //since there max == min == original value
            src_row0_8x16b = _mm_min_epu8(src_row0_8x16b, temp_max0_16x8b);
            src_row1_8x16b = _mm_min_epu8(src_row1_8x16b, temp_max1_16x8b);
            //Clipping MIN
            src_row0_8x16b = _mm_max_epu8(src_row0_8x16b, temp_min0_16x8b);
            src_row1_8x16b = _mm_max_epu8(src_row1_8x16b, temp_min1_16x8b);
            //separating row 2 and row 3
            src_row2_8x16b = _mm_srli_si128(src_row0_8x16b, 8);
            src_row3_8x16b = _mm_srli_si128(src_row1_8x16b, 8);

        }

        else
        {
            /* ---------------- weak filter ---------------- */
            /* Computes delta for p0/q0 (clipped to +/- tc) and, when dep/deq */
            /* allow it, a secondary delta for p1/q1 (clipped to +/- tc/2).   */
            /* Deltas are suppressed where |raw delta| >= 10*tc or where the  */
            /* per-side filter flags / dep / deq masks are clear.             */
            __m128i tmp_delta0_8x16b, tmp_delta1_8x16b, tmp_delta2_8x16b, tmp_delta3_8x16b;
            __m128i tmp0_const_8x16b, tmp1_const_8x16b, tmp2_const_8x16b, tmp3_const_8x16b;
            __m128i coefdelta_0_8x16b, mask_pq_8x16b;
            __m128i const2_8x16b, consttc_8x16b;

            LWORD64 mask1;
            /* Sign bits at 16-bit lane boundaries: q1 (deq), q0, p0, p1 (dep). */
            mask1 = (((LWORD64)(filter_flag_q & deq)) << 63) | (((LWORD64)filter_flag_q) << 47) | (((LWORD64)filter_flag_p) << 31) | (((LWORD64)(filter_flag_p & dep)) << 15);

            consttc_8x16b = _mm_set1_epi32(tc);


            src_row0_8x16b = _mm_unpacklo_epi64(src_row0_8x16b, src_row1_8x16b);
            src_row2_8x16b = _mm_unpacklo_epi64(src_row2_8x16b, src_row3_8x16b);

            tmp_delta2_8x16b = _mm_srli_epi64(src_row0_8x16b, 16);
            tmp_delta3_8x16b = _mm_srli_epi64(src_row2_8x16b, 16);

            tmp_delta2_8x16b = _mm_shuffle_epi32(tmp_delta2_8x16b, 0x08);
            tmp_delta3_8x16b = _mm_shuffle_epi32(tmp_delta3_8x16b, 0x08);
            //arranged q31 q30 p30 p31 q21 q20 p20 p21 q11 q10 p10 p11 q01 q00 p00 p01
            tmp_delta2_8x16b = _mm_unpacklo_epi64(tmp_delta2_8x16b, tmp_delta3_8x16b);

            coefdelta_0_8x16b = _mm_load_si128((__m128i *)coef_de1);
            // (-3q1+9q0),(-9p0+3p1)
            tmp_delta3_8x16b = _mm_maddubs_epi16(tmp_delta2_8x16b, coefdelta_0_8x16b);
            //converting to 16 bit
            consttc_8x16b = _mm_packs_epi32(consttc_8x16b, consttc_8x16b);
            //all-ones register, used below for sign flips and constants
            tmp1_const_8x16b = _mm_cmpeq_epi32(consttc_8x16b, consttc_8x16b);
            //calc 10 *tc = 2*tc +8*tc ; 2*tc
            tmp2_const_8x16b = _mm_slli_epi16(consttc_8x16b, 1);
            //calc 10 *tc = 2*tc +8*tc ; 8*tc
            tmp0_const_8x16b = _mm_slli_epi16(consttc_8x16b, 3);
            //getting -tc store
            tmp3_const_8x16b = _mm_sign_epi16(consttc_8x16b, tmp1_const_8x16b);
            //calc 10 *tc
            tmp2_const_8x16b = _mm_add_epi16(tmp2_const_8x16b, tmp0_const_8x16b);
            //const 1
            const2_8x16b = _mm_srli_epi16(tmp1_const_8x16b, 15);
            tmp_delta0_8x16b = _mm_madd_epi16(tmp_delta3_8x16b, const2_8x16b);
            const2_8x16b = _mm_srli_epi32(tmp1_const_8x16b, 31);
            //getting the mask values
            mask_pq_8x16b = _mm_loadl_epi64((__m128i *)(&mask1));
            //loaded coef for delta1 calculation
            coefdelta_0_8x16b = _mm_load_si128((__m128i *)coef_dep1);
            //(-2q1+q0),(p0-2p1)
            tmp_delta3_8x16b = _mm_maddubs_epi16(tmp_delta2_8x16b, coefdelta_0_8x16b);
            //const 8 (rounding term before >> 4)
            const2_8x16b = _mm_slli_epi32(const2_8x16b, 3);
            //rearranging the mask values
            mask_pq_8x16b = _mm_unpacklo_epi64(mask_pq_8x16b, mask_pq_8x16b);
            //normalisation of the filter
            tmp_delta0_8x16b = _mm_add_epi32(tmp_delta0_8x16b, const2_8x16b);
            tmp_delta0_8x16b = _mm_srai_epi32(tmp_delta0_8x16b, 4);

            //getting deltaq0 (negated delta for the q side)
            tmp_delta2_8x16b = _mm_sign_epi32(tmp_delta0_8x16b, tmp1_const_8x16b);
            //packing d3q d2q d1q d0q d3p d2p d1p d0p
            tmp_delta0_8x16b = _mm_packs_epi32(tmp_delta0_8x16b, tmp_delta2_8x16b);
            //absolute delta
            tmp_delta2_8x16b = _mm_abs_epi16(tmp_delta0_8x16b);
            //Clipping of delta0 to +tc
            tmp_delta0_8x16b = _mm_min_epi16(tmp_delta0_8x16b, consttc_8x16b);
            //mask for |delta| < 10*tc
            tmp0_const_8x16b = _mm_cmpgt_epi16(tmp2_const_8x16b, tmp_delta2_8x16b);
            //Clipping of delta0 to -tc
            tmp_delta0_8x16b = _mm_max_epi16(tmp_delta0_8x16b, tmp3_const_8x16b);


            //delta 1 calc starts

            //getting q32 q22 q12 q02 p32 p12 p22 p02
            tmp2_const_8x16b = _mm_loadl_epi64((__m128i *)(shuffle0));
            tmp_delta2_8x16b = _mm_shuffle_epi8(src_row0_8x16b, tmp2_const_8x16b);
            tmp_delta1_8x16b = _mm_shuffle_epi8(src_row2_8x16b, tmp2_const_8x16b);
            tmp_delta1_8x16b = _mm_unpacklo_epi32(tmp_delta2_8x16b, tmp_delta1_8x16b);
            //constant 1 (rounding term before >> 2)
            const2_8x16b = _mm_srli_epi16(tmp1_const_8x16b, 15);
            //tc>>1 16 bit (clip range for the p1/q1 delta)
            consttc_8x16b = _mm_srai_epi16(consttc_8x16b, 1);

            //getting -tc>>1 store 16 bit
            tmp1_const_8x16b = _mm_sign_epi16(consttc_8x16b, tmp1_const_8x16b);
            //2*delta0
            tmp2_const_8x16b = _mm_add_epi16(tmp_delta0_8x16b, tmp_delta0_8x16b);

            //getting all respective q's and p's together
            tmp3_const_8x16b = _mm_load_si128((__m128i *)(shuffle1));
            tmp_delta3_8x16b = _mm_shuffle_epi8(tmp_delta3_8x16b, tmp3_const_8x16b);
            //final adds for deltap1 and deltaq1
            tmp_delta3_8x16b = _mm_add_epi16(tmp_delta3_8x16b, const2_8x16b);
            tmp_delta1_8x16b = _mm_add_epi16(tmp_delta1_8x16b, tmp2_const_8x16b);
            tmp_delta1_8x16b = _mm_add_epi16(tmp_delta1_8x16b, tmp_delta3_8x16b);
            tmp2_const_8x16b = _mm_setzero_si128();
            tmp_delta1_8x16b = _mm_srai_epi16(tmp_delta1_8x16b, 2);

            // clipping delta1 to +tc/2
            tmp_delta1_8x16b = _mm_min_epi16(tmp_delta1_8x16b, consttc_8x16b);
            // clipping delta1 to -tc/2
            tmp_delta1_8x16b = _mm_max_epi16(tmp_delta1_8x16b, tmp1_const_8x16b);

            //getting the mask ready (sign bits spread to full 16-bit lanes)
            mask_pq_8x16b = _mm_srai_epi16(mask_pq_8x16b, 15);
            //masking of the delta values |delta|<10*tc
            tmp_delta1_8x16b = _mm_and_si128(tmp_delta1_8x16b, tmp0_const_8x16b);
            tmp_delta0_8x16b = _mm_and_si128(tmp_delta0_8x16b, tmp0_const_8x16b);
            //packing dq1 dq0 dp0 dp1
            tmp1_const_8x16b = _mm_unpacklo_epi16(tmp_delta1_8x16b, tmp_delta0_8x16b);
            tmp_delta0_8x16b = _mm_unpackhi_epi16(tmp_delta0_8x16b, tmp_delta1_8x16b);
            tmp_delta1_8x16b = _mm_unpackhi_epi32(tmp1_const_8x16b, tmp_delta0_8x16b);
            tmp_delta0_8x16b = _mm_unpacklo_epi32(tmp1_const_8x16b, tmp_delta0_8x16b);

            //masking of the delta values dep, deq , filter_p ,filter_q
            tmp_delta0_8x16b = _mm_and_si128(tmp_delta0_8x16b, mask_pq_8x16b);
            tmp_delta1_8x16b = _mm_and_si128(tmp_delta1_8x16b, mask_pq_8x16b);
            //converting 8bit to 16 bit (zero-extend via unpack with zero)
            src_row0_8x16b = _mm_unpacklo_epi8(src_row0_8x16b, tmp2_const_8x16b);
            src_row1_8x16b = _mm_unpacklo_epi8(src_row1_8x16b, tmp2_const_8x16b);
            src_row2_8x16b = _mm_unpacklo_epi8(src_row2_8x16b, tmp2_const_8x16b);
            src_row3_8x16b = _mm_unpacklo_epi8(src_row3_8x16b, tmp2_const_8x16b);
            //shuffle values loaded
            tmp0_const_8x16b = _mm_load_si128((__m128i *)shuffle2);
            tmp1_const_8x16b = _mm_load_si128((__m128i *)shuffle3);
            //arranging each row delta in different registers
            tmp_delta3_8x16b = _mm_shuffle_epi8(tmp_delta1_8x16b, tmp1_const_8x16b);
            tmp_delta2_8x16b = _mm_shuffle_epi8(tmp_delta1_8x16b, tmp0_const_8x16b);
            tmp_delta1_8x16b = _mm_shuffle_epi8(tmp_delta0_8x16b, tmp1_const_8x16b);
            tmp_delta0_8x16b = _mm_shuffle_epi8(tmp_delta0_8x16b, tmp0_const_8x16b);

            //adding the respective delta
            src_row3_8x16b = _mm_add_epi16(tmp_delta3_8x16b, src_row3_8x16b);
            src_row2_8x16b = _mm_add_epi16(tmp_delta2_8x16b, src_row2_8x16b);
            src_row1_8x16b = _mm_add_epi16(tmp_delta1_8x16b, src_row1_8x16b);
            src_row0_8x16b = _mm_add_epi16(tmp_delta0_8x16b, src_row0_8x16b);
            //saturating to 8 bit
            src_row2_8x16b = _mm_packus_epi16(src_row2_8x16b, src_row3_8x16b);
            src_row0_8x16b = _mm_packus_epi16(src_row0_8x16b, src_row1_8x16b);
            //separating different rows
            src_row1_8x16b = _mm_srli_si128(src_row0_8x16b, 8);
            src_row3_8x16b = _mm_srli_si128(src_row2_8x16b, 8);
        }

        /* Write back all four filtered rows (p3..p0 q0..q3 per row). */
        _mm_storel_epi64((__m128i *)(pu1_src - 4), src_row0_8x16b);
        _mm_storel_epi64((__m128i *)((pu1_src - 4) + src_strd), src_row1_8x16b);
        _mm_storel_epi64((__m128i *)((pu1_src - 4) + 2 * src_strd), src_row2_8x16b);
        _mm_storel_epi64((__m128i *)((pu1_src - 4) + 3 * src_strd), src_row3_8x16b);
    }
}
535 | | |
536 | | void ihevc_deblk_luma_horz_ssse3(UWORD8 *pu1_src, |
537 | | WORD32 src_strd, |
538 | | WORD32 bs, |
539 | | WORD32 quant_param_p, |
540 | | WORD32 quant_param_q, |
541 | | WORD32 beta_offset_div2, |
542 | | WORD32 tc_offset_div2, |
543 | | WORD32 filter_flag_p, |
544 | | WORD32 filter_flag_q) |
545 | 13.7M | { |
546 | 13.7M | WORD32 qp_luma, beta_indx, tc_indx; |
547 | 13.7M | WORD32 beta, tc; |
548 | | |
549 | 13.7M | WORD32 d0, d3, dp, dq, d; |
550 | 13.7M | WORD32 de_0, de_1, de_2, de_3; |
551 | 13.7M | WORD32 d_sam0, d_sam3; |
552 | 13.7M | WORD32 de, dep, deq; |
553 | | |
554 | 13.7M | __m128i src_q0_8x16b, src_q1_8x16b, src_p0_8x16b, src_p1_8x16b, src_q2_8x16b; |
555 | 13.7M | __m128i tmp_pq_str1_8x16b, src_p2_8x16b, tmp_pq_str0_8x16b; |
556 | | |
557 | | |
558 | | |
559 | | |
560 | 13.7M | { |
561 | 13.7M | __m128i src_tmp_p_0_8x16b, src_tmp_p_1_8x16b, src_tmp_q_0_8x16b, src_tmp_q_1_8x16b; |
562 | 13.7M | __m128i coef_8x16b, mask_d_result_4x32b, mask_de_result_8x16b; |
563 | 13.7M | __m128i mask_16x8b, temp_coef0_8x16b, temp_coef1_8x16b; |
564 | | |
565 | 13.7M | ASSERT((bs > 0)); |
566 | 13.8M | ASSERT(filter_flag_p || filter_flag_q); |
567 | | |
568 | 13.8M | qp_luma = (quant_param_p + quant_param_q + 1) >> 1; |
569 | 13.8M | beta_indx = CLIP3(qp_luma + (beta_offset_div2 << 1), 0, 51); |
570 | | |
571 | | /* BS based on implementation can take value 3 if it is intra/inter egde */ |
572 | | /* based on BS, tc index is calcuated by adding 2 * ( bs - 1) to QP and tc_offset */ |
573 | | /* for BS = 1 adding factor is (0*2), BS = 2 or 3 adding factor is (1*2) */ |
574 | | /* the above desired functionallity is achieved by doing (2*(bs>>1)) */ |
575 | | |
576 | 13.8M | tc_indx = CLIP3(qp_luma + 2 * (bs >> 1) + (tc_offset_div2 << 1), 0, 53); |
577 | | |
578 | 13.8M | beta = gai4_ihevc_beta_table[beta_indx]; |
579 | 13.8M | tc = gai4_ihevc_tc_table[tc_indx]; |
580 | 13.8M | if(0 == tc) |
581 | 585k | { |
582 | 585k | return; |
583 | 585k | } |
584 | 13.3M | src_q0_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src)); |
585 | 13.3M | src_q1_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd)); |
586 | 13.3M | src_p0_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src - src_strd)); |
587 | 13.3M | src_p1_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src - 2 * src_strd)); |
588 | 13.3M | src_q2_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + 2 * src_strd)); |
589 | 13.3M | tmp_pq_str1_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + 3 * src_strd)); |
590 | 13.3M | src_p2_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src - 3 * src_strd)); |
591 | 13.3M | tmp_pq_str0_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src - 4 * src_strd)); |
592 | | |
593 | | |
594 | 13.3M | src_tmp_p_0_8x16b = _mm_unpacklo_epi8(src_p1_8x16b, src_p0_8x16b); |
595 | 13.3M | src_tmp_p_1_8x16b = _mm_unpacklo_epi8(tmp_pq_str0_8x16b, src_p2_8x16b); |
596 | | |
597 | 13.3M | src_tmp_q_0_8x16b = _mm_unpacklo_epi8(src_q0_8x16b, src_q1_8x16b); |
598 | 13.3M | src_tmp_q_1_8x16b = _mm_unpacklo_epi8(src_q2_8x16b, tmp_pq_str1_8x16b); |
599 | | |
600 | 13.3M | src_tmp_p_0_8x16b = _mm_unpacklo_epi16(src_tmp_p_1_8x16b, src_tmp_p_0_8x16b); |
601 | 13.3M | src_tmp_q_0_8x16b = _mm_unpacklo_epi16(src_tmp_q_0_8x16b, src_tmp_q_1_8x16b); |
602 | | |
603 | 13.3M | src_tmp_p_0_8x16b = _mm_shuffle_epi32(src_tmp_p_0_8x16b, 0x6c); |
604 | 13.3M | src_tmp_q_0_8x16b = _mm_shuffle_epi32(src_tmp_q_0_8x16b, 0x6c); |
605 | | |
606 | 13.3M | coef_8x16b = _mm_load_si128((__m128i *)(coef_d)); |
607 | 13.3M | mask_16x8b = _mm_load_si128((__m128i *)(shuffle_d)); |
608 | | |
609 | 13.3M | src_tmp_p_0_8x16b = _mm_unpacklo_epi32(src_tmp_p_0_8x16b, src_tmp_q_0_8x16b); |
610 | | //WORD32 shuffle_d[4]={0x80800403,0x80800c0b,0x03000704,0x0b080f0c}; |
611 | 13.3M | mask_de_result_8x16b = _mm_shuffle_epi8(src_tmp_p_0_8x16b, mask_16x8b); |
612 | | |
613 | 13.3M | mask_d_result_4x32b = _mm_maddubs_epi16(src_tmp_p_0_8x16b, coef_8x16b); |
614 | | |
615 | | |
616 | | //to get all 1's of 8 bit in (1) |
617 | 13.3M | temp_coef0_8x16b = _mm_cmpeq_epi16(src_tmp_p_0_8x16b, src_tmp_p_0_8x16b); |
618 | 13.3M | temp_coef1_8x16b = _mm_srli_epi16(temp_coef0_8x16b, 15); |
619 | | //accumulating values foe dp3 dq3 , dp0 dq0 values |
620 | 13.3M | mask_d_result_4x32b = _mm_madd_epi16(mask_d_result_4x32b, temp_coef1_8x16b); |
621 | | |
622 | 13.3M | temp_coef1_8x16b = _mm_packus_epi16(temp_coef1_8x16b, temp_coef1_8x16b); |
623 | | // to get all 1,-1 sets of 16 bits in (0) |
624 | 13.3M | temp_coef0_8x16b = _mm_unpacklo_epi8(temp_coef0_8x16b, temp_coef1_8x16b); |
625 | | //q33-q30,p33-p30,q03-q00,p03-p00,0,q30-p30,0,q00-p00 |
626 | 13.3M | mask_de_result_8x16b = _mm_maddubs_epi16(mask_de_result_8x16b, temp_coef0_8x16b); |
627 | | //to get 16 bit 1's |
628 | 13.3M | temp_coef0_8x16b = _mm_srli_epi16(temp_coef1_8x16b, 8); |
629 | | |
630 | | |
631 | | // dq3 dp3 dq0 dp0 |
632 | 13.3M | mask_d_result_4x32b = _mm_abs_epi32(mask_d_result_4x32b); |
633 | 13.3M | mask_16x8b = _mm_shuffle_epi32(mask_d_result_4x32b, 0xec); |
634 | 13.3M | mask_d_result_4x32b = _mm_shuffle_epi32(mask_d_result_4x32b, 0x49); |
635 | | // dq dp d3 d0 |
636 | 13.3M | mask_d_result_4x32b = _mm_add_epi32(mask_d_result_4x32b, mask_16x8b); |
637 | | //|q33-q30|,|p33-p30|,|q03-q00|,|p03-p00|,0,|q30-p30|,0,|q00-p00| |
638 | 13.3M | mask_de_result_8x16b = _mm_abs_epi16(mask_de_result_8x16b); |
639 | | //|q33-q30|+|p33-p30|,|q03-q00|+|p03-p00|,0+|q30-p30|,0+|q00-p00| |
640 | 13.3M | mask_de_result_8x16b = _mm_madd_epi16(mask_de_result_8x16b, temp_coef0_8x16b); |
641 | | |
642 | | ///store back in a single variable |
643 | 13.3M | temp_coef0_8x16b = _mm_srli_si128(mask_d_result_4x32b, 4); |
644 | 13.3M | temp_coef1_8x16b = _mm_srli_si128(mask_d_result_4x32b, 8); |
645 | 13.3M | mask_16x8b = _mm_srli_si128(mask_d_result_4x32b, 12); |
646 | | |
647 | 13.3M | d0 = _mm_cvtsi128_si32(mask_d_result_4x32b); |
648 | 13.3M | d3 = _mm_cvtsi128_si32(temp_coef0_8x16b); |
649 | 13.3M | dp = _mm_cvtsi128_si32(temp_coef1_8x16b); |
650 | 13.3M | dq = _mm_cvtsi128_si32(mask_16x8b); |
651 | | //getting d |
652 | 13.3M | d = d0 + d3; |
653 | | |
654 | | ///store back in a single variable |
655 | 13.3M | temp_coef0_8x16b = _mm_srli_si128(mask_de_result_8x16b, 4); |
656 | 13.3M | temp_coef1_8x16b = _mm_srli_si128(mask_de_result_8x16b, 8); |
657 | 13.3M | mask_16x8b = _mm_srli_si128(mask_de_result_8x16b, 12); |
658 | | |
659 | 13.3M | de_0 = _mm_cvtsi128_si32(mask_de_result_8x16b); |
660 | 13.3M | de_1 = _mm_cvtsi128_si32(temp_coef0_8x16b); |
661 | 13.3M | de_2 = _mm_cvtsi128_si32(temp_coef1_8x16b); |
662 | 13.3M | de_3 = _mm_cvtsi128_si32(mask_16x8b); |
663 | | |
664 | 13.3M | de = 0; |
665 | 13.3M | dep = 0; |
666 | 13.3M | deq = 0; |
667 | 13.3M | if(d < beta) |
668 | 12.8M | { |
669 | 12.8M | d_sam0 = 0; |
670 | 12.8M | if((2 * d0 < (beta >> 2)) |
671 | 12.8M | && (de_2 < (beta >> 3)) |
672 | 12.8M | && (de_0 < ((5 * tc + 1) >> 1))) |
673 | 8.51M | { |
674 | 8.51M | d_sam0 = 1; |
675 | 8.51M | } |
676 | | |
677 | 12.8M | d_sam3 = 0; |
678 | 12.8M | if((2 * d3 < (beta >> 2)) |
679 | 12.8M | && (de_3 < (beta >> 3)) |
680 | 12.8M | && de_1 < ((5 * tc + 1) >> 1)) |
681 | 8.56M | { |
682 | 8.56M | d_sam3 = 1; |
683 | 8.56M | } |
684 | | |
685 | 12.8M | de = (d_sam0 & d_sam3) + 1; |
686 | 12.8M | dep = (dp < (beta + (beta >> 1)) >> 3) ? 1 : 0; |
687 | 12.8M | deq = (dq < (beta + (beta >> 1)) >> 3) ? 1 : 0; |
688 | 12.8M | if(tc <= 1) |
689 | 3.46M | { |
690 | 3.46M | dep = 0; |
691 | 3.46M | deq = 0; |
692 | 3.46M | } |
693 | 12.8M | } |
694 | | |
695 | 13.3M | } |
696 | | |
697 | 13.3M | if(de != 0) |
698 | 12.9M | { |
699 | | |
700 | 12.9M | if(2 == de) |
701 | 8.14M | { |
702 | | |
703 | 8.14M | __m128i temp_pq0_str0_16x8b; |
704 | 8.14M | __m128i temp_pq1_str0_16x8b, temp_pq1_str1_16x8b; |
705 | 8.14M | __m128i temp_pq2_str0_16x8b; |
706 | 8.14M | __m128i temp_str0_16x8b, temp_str1_16x8b; |
707 | 8.14M | __m128i const2_8x16b, const2tc_8x16b; |
708 | | |
709 | 8.14M | LWORD64 mask, tc2; |
710 | 8.14M | tc = tc << 1; |
711 | 8.14M | mask = (((LWORD64)filter_flag_q) << 63) | (((LWORD64)filter_flag_p) << 31); |
712 | 8.14M | tc2 = ((LWORD64)tc); |
713 | | |
714 | 8.14M | const2_8x16b = _mm_cmpeq_epi16(src_p1_8x16b, src_p1_8x16b); |
715 | | //q'0-q'1-2 ,p'0-p'1-2 |
716 | 8.14M | temp_pq0_str0_16x8b = _mm_unpacklo_epi8(src_p1_8x16b, src_p0_8x16b); |
717 | 8.14M | temp_str0_16x8b = _mm_unpacklo_epi8(src_q0_8x16b, src_q1_8x16b); |
718 | 8.14M | const2_8x16b = _mm_srli_epi16(const2_8x16b, 15); |
719 | | //arranged q31 q30 q21 q20 q1 q10 q01 q00 p30 p31 p20 p21 p10 p11 p00 p01 |
720 | 8.14M | temp_pq0_str0_16x8b = _mm_unpacklo_epi64(temp_pq0_str0_16x8b, temp_str0_16x8b); |
721 | | |
722 | 8.14M | const2_8x16b = _mm_packus_epi16(const2_8x16b, const2_8x16b); |
723 | 8.14M | temp_pq0_str0_16x8b = _mm_maddubs_epi16(temp_pq0_str0_16x8b, const2_8x16b); |
724 | | |
725 | | //q'1-2, p'1-2 |
726 | 8.14M | temp_pq1_str0_16x8b = _mm_unpacklo_epi8(src_p0_8x16b, src_q0_8x16b); |
727 | 8.14M | temp_pq1_str1_16x8b = _mm_unpacklo_epi8(src_q1_8x16b, src_q2_8x16b); |
728 | 8.14M | temp_str1_16x8b = _mm_unpacklo_epi8(src_p1_8x16b, src_p2_8x16b); |
729 | | // q30 p30 q20 p20 q10 p10 q01 q00 p30 q20 p20 q10 p10 q01 q00 p00 |
730 | 8.14M | temp_pq1_str0_16x8b = _mm_unpacklo_epi64(temp_pq1_str0_16x8b, temp_pq1_str0_16x8b); |
731 | | // q32 q31 q22 q21 q12 q11 q02 q01 p32 p31 p22 p21 p12 p11 p02 p01 |
732 | 8.14M | temp_pq1_str1_16x8b = _mm_unpacklo_epi64(temp_str1_16x8b, temp_pq1_str1_16x8b); |
733 | | |
734 | 8.14M | temp_pq1_str0_16x8b = _mm_maddubs_epi16(temp_pq1_str0_16x8b, const2_8x16b); |
735 | 8.14M | temp_pq1_str1_16x8b = _mm_maddubs_epi16(temp_pq1_str1_16x8b, const2_8x16b); |
736 | | |
737 | | //clipping mask design |
738 | 8.14M | temp_str1_16x8b = _mm_setzero_si128(); |
739 | 8.14M | temp_str0_16x8b = _mm_loadl_epi64((__m128i *)(&mask)); |
740 | 8.14M | const2tc_8x16b = _mm_loadl_epi64((__m128i *)(&tc2)); |
741 | 8.14M | temp_str0_16x8b = _mm_shuffle_epi32(temp_str0_16x8b, 0x44); |
742 | 8.14M | const2tc_8x16b = _mm_shuffle_epi8(const2tc_8x16b, temp_str1_16x8b); |
743 | | |
744 | | //clipping mask design |
745 | 8.14M | temp_str0_16x8b = _mm_srai_epi32(temp_str0_16x8b, 31); |
746 | 8.14M | const2tc_8x16b = _mm_and_si128(const2tc_8x16b, temp_str0_16x8b); |
747 | | //calculating Clipping MAX for all pixel values. |
748 | 8.14M | src_p0_8x16b = _mm_unpacklo_epi32(src_p0_8x16b, src_q0_8x16b); |
749 | 8.14M | src_q0_8x16b = _mm_unpacklo_epi32(src_p1_8x16b, src_q1_8x16b); |
750 | | //for clipping calc |
751 | 8.14M | src_p1_8x16b = _mm_unpacklo_epi64(src_p0_8x16b, src_q0_8x16b); |
752 | | //saving the unmodified data of q1 p1 q0 p0 |
753 | 8.14M | src_q1_8x16b = _mm_unpackhi_epi64(src_p0_8x16b, src_q0_8x16b); |
754 | | //CLIpping MAX and MIN for q1 p1 q0 p0 |
755 | 8.14M | src_p0_8x16b = _mm_adds_epu8(src_p1_8x16b, const2tc_8x16b); |
756 | 8.14M | src_p1_8x16b = _mm_subs_epu8(src_p1_8x16b, const2tc_8x16b); |
757 | | |
758 | | |
759 | | //q'2-q'0-2,p'2-p'0-2 |
760 | 8.14M | tmp_pq_str0_8x16b = _mm_unpacklo_epi8(src_p2_8x16b, tmp_pq_str0_8x16b); |
761 | 8.14M | temp_pq2_str0_16x8b = _mm_unpacklo_epi8(src_q2_8x16b, tmp_pq_str1_8x16b); |
762 | 8.14M | const2_8x16b = _mm_slli_epi16(const2_8x16b, 1); |
763 | | //arranged q33 q32 q23 q22 q13 q12 q03 q02 p32 p33 p22 p23 p12 p13 p02 p03 |
764 | 8.14M | temp_pq2_str0_16x8b = _mm_unpacklo_epi64(tmp_pq_str0_8x16b, temp_pq2_str0_16x8b); |
765 | 8.14M | src_p2_8x16b = _mm_unpacklo_epi32(src_p2_8x16b, src_q2_8x16b); |
766 | 8.14M | temp_pq2_str0_16x8b = _mm_maddubs_epi16(temp_pq2_str0_16x8b, const2_8x16b); |
767 | | |
768 | | //calculating Clipping MAX and MIN for p2 and q2 . |
769 | 8.14M | tmp_pq_str0_8x16b = _mm_adds_epu8(src_p2_8x16b, const2tc_8x16b); |
770 | 8.14M | tmp_pq_str1_8x16b = _mm_subs_epu8(src_p2_8x16b, const2tc_8x16b); |
771 | | //q'0-q'1-2 ,p'0-p'1-2 |
772 | 8.14M | temp_str0_16x8b = _mm_shuffle_epi32(temp_pq0_str0_16x8b, 0x4e); |
773 | 8.14M | temp_pq0_str0_16x8b = _mm_add_epi16(temp_pq0_str0_16x8b, temp_str0_16x8b); |
774 | | //q'1-2 p'1-2 |
775 | 8.14M | temp_pq1_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, temp_pq1_str1_16x8b); |
776 | | //to get 2 in 16 bit |
777 | 8.14M | const2_8x16b = _mm_srli_epi16(const2_8x16b, 8); |
778 | | |
779 | | |
780 | | //q'1, p'1 (adding 2) |
781 | 8.14M | temp_pq1_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, const2_8x16b); |
782 | | //q'0-q'1,p'0-p'1 |
783 | 8.14M | temp_pq0_str0_16x8b = _mm_add_epi16(temp_pq0_str0_16x8b, const2_8x16b); |
784 | | //q'2-q'1,p'2-p'1 |
785 | 8.14M | temp_pq2_str0_16x8b = _mm_add_epi16(temp_pq2_str0_16x8b, const2_8x16b); |
786 | | //q'0 = (q'0-q'1)+q'1 ,p'0 = (p'0-p'1)+p'1; |
787 | 8.14M | temp_pq0_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, temp_pq0_str0_16x8b); |
788 | | //q'2 = (q'2-q'1)+q'1 ,p'2 = (p'2-p'1)+p'1; |
789 | 8.14M | temp_pq2_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, temp_pq2_str0_16x8b); |
790 | | |
791 | | //normalisation of all modified pixels |
792 | 8.14M | temp_pq0_str0_16x8b = _mm_srai_epi16(temp_pq0_str0_16x8b, 3); |
793 | 8.14M | temp_pq1_str0_16x8b = _mm_srai_epi16(temp_pq1_str0_16x8b, 2); |
794 | 8.14M | temp_pq2_str0_16x8b = _mm_srai_epi16(temp_pq2_str0_16x8b, 3); |
795 | | //q'1 p'1 q'0 p'0 |
796 | 8.14M | temp_pq0_str0_16x8b = _mm_packus_epi16(temp_pq0_str0_16x8b, temp_pq1_str0_16x8b); |
797 | 8.14M | temp_pq2_str0_16x8b = _mm_packus_epi16(temp_pq2_str0_16x8b, temp_pq2_str0_16x8b); |
798 | | //pack with the unmodified data of q2 and p2 |
799 | 8.14M | src_p2_8x16b = _mm_unpackhi_epi64(temp_pq2_str0_16x8b, src_p2_8x16b); |
800 | | //Clipping MAX and MIN for q'1 p'1 q'0 p'0 and q'2 p'2 |
801 | 8.14M | temp_pq0_str0_16x8b = _mm_min_epu8(temp_pq0_str0_16x8b, src_p0_8x16b); |
802 | 8.14M | src_p2_8x16b = _mm_min_epu8(src_p2_8x16b, tmp_pq_str0_8x16b); |
803 | 8.14M | temp_pq0_str0_16x8b = _mm_max_epu8(temp_pq0_str0_16x8b, src_p1_8x16b); |
804 | 8.14M | src_p2_8x16b = _mm_max_epu8(src_p2_8x16b, tmp_pq_str1_8x16b); |
805 | | //Reshuffling q'1 p'1 q'0 p'0 along with unmodified data |
806 | 8.14M | src_p0_8x16b = _mm_unpacklo_epi32(temp_pq0_str0_16x8b, src_q1_8x16b); |
807 | 8.14M | src_p1_8x16b = _mm_unpackhi_epi32(temp_pq0_str0_16x8b, src_q1_8x16b); |
808 | 8.14M | src_p2_8x16b = _mm_shuffle_epi32(src_p2_8x16b, 0xd8); |
809 | 8.14M | src_q0_8x16b = _mm_srli_si128(src_p0_8x16b, 8); |
810 | 8.14M | src_q1_8x16b = _mm_srli_si128(src_p1_8x16b, 8); |
811 | 8.14M | src_q2_8x16b = _mm_srli_si128(src_p2_8x16b, 8); |
812 | | |
813 | 8.14M | _mm_storel_epi64((__m128i *)(pu1_src - 3 * src_strd), src_p2_8x16b); |
814 | 8.14M | _mm_storel_epi64((__m128i *)(pu1_src - 2 * src_strd), src_p1_8x16b); |
815 | 8.14M | _mm_storel_epi64((__m128i *)(pu1_src - src_strd), src_p0_8x16b); |
816 | 8.14M | _mm_storel_epi64((__m128i *)(pu1_src), src_q0_8x16b); |
817 | 8.14M | _mm_storel_epi64((__m128i *)(pu1_src + src_strd), src_q1_8x16b); |
818 | 8.14M | _mm_storel_epi64((__m128i *)(pu1_src + 2 * src_strd), src_q2_8x16b); |
819 | | |
820 | | |
821 | 8.14M | } |
822 | | |
823 | 4.76M | else |
824 | 4.76M | { |
825 | | |
826 | 4.76M | __m128i tmp_delta0_8x16b, tmp_delta1_8x16b; |
827 | 4.76M | __m128i tmp0_const_8x16b, tmp1_const_8x16b, tmp2_const_8x16b; |
828 | 4.76M | __m128i coefdelta_0_8x16b; |
829 | 4.76M | __m128i const2_8x16b, consttc_8x16b; |
830 | | |
831 | 4.76M | LWORD64 maskp0, maskp1, maskq0, maskq1; |
832 | 4.76M | maskp0 = (LWORD64)filter_flag_p; |
833 | 4.76M | maskq0 = (LWORD64)filter_flag_q; |
834 | 4.76M | maskp1 = (LWORD64)dep; |
835 | 4.76M | maskq1 = (LWORD64)deq; |
836 | 4.76M | consttc_8x16b = _mm_set1_epi32(tc); |
837 | | |
838 | 4.76M | tmp_delta0_8x16b = _mm_unpacklo_epi8(src_p1_8x16b, src_p0_8x16b); |
839 | 4.76M | tmp_delta1_8x16b = _mm_unpacklo_epi8(src_q0_8x16b, src_q1_8x16b); |
840 | | //arranged q31 q30 p30 p31 q21 q20 p20 p21 q1 q10 p10 p11 q01 q00 p00 p01 |
841 | 4.76M | tmp_delta1_8x16b = _mm_unpacklo_epi16(tmp_delta0_8x16b, tmp_delta1_8x16b); |
842 | | |
843 | 4.76M | coefdelta_0_8x16b = _mm_load_si128((__m128i *)coef_de1); |
844 | | // (-3q1+9q0),(-9p0+3p1) |
845 | 4.76M | tmp_delta0_8x16b = _mm_maddubs_epi16(tmp_delta1_8x16b, coefdelta_0_8x16b); |
846 | | |
847 | | //getting -tc store |
848 | 4.76M | tmp2_const_8x16b = _mm_cmpeq_epi32(consttc_8x16b, consttc_8x16b); |
849 | | |
850 | | //getting tc in 16 bit |
851 | 4.76M | consttc_8x16b = _mm_packs_epi32(consttc_8x16b, consttc_8x16b); |
852 | | //calc 10 *tc = 2*tc +8*tc ; 2*tc |
853 | 4.76M | tmp_pq_str0_8x16b = _mm_slli_epi16(consttc_8x16b, 1); |
854 | | //calc 10 *tc = 2*tc +8*tc ; 8*tc |
855 | 4.76M | tmp_pq_str1_8x16b = _mm_slli_epi16(consttc_8x16b, 3); |
856 | | |
857 | | //const 1 |
858 | 4.76M | const2_8x16b = _mm_srli_epi16(tmp2_const_8x16b, 15); |
859 | | //calc 10 *tc |
860 | 4.76M | tmp_pq_str0_8x16b = _mm_add_epi16(tmp_pq_str0_8x16b, tmp_pq_str1_8x16b); |
861 | | //delta0 without normalisation and clipping |
862 | 4.76M | tmp_delta0_8x16b = _mm_madd_epi16(tmp_delta0_8x16b, const2_8x16b); |
863 | | |
864 | 4.76M | const2_8x16b = _mm_srli_epi32(tmp2_const_8x16b, 31); |
865 | | |
866 | | //loaded coef for delta1 calculation |
867 | 4.76M | coefdelta_0_8x16b = _mm_load_si128((__m128i *)coef_dep1); |
868 | | //(-2q1+q0),(p0-2p1) |
869 | 4.76M | tmp_delta1_8x16b = _mm_maddubs_epi16(tmp_delta1_8x16b, coefdelta_0_8x16b); |
870 | | //const 8 |
871 | 4.76M | const2_8x16b = _mm_slli_epi32(const2_8x16b, 3); |
872 | | |
873 | | //normalisation of the filter |
874 | 4.76M | tmp_delta0_8x16b = _mm_add_epi32(tmp_delta0_8x16b, const2_8x16b); |
875 | 4.76M | tmp_delta0_8x16b = _mm_srai_epi32(tmp_delta0_8x16b, 4); |
876 | | |
877 | | //getting deltaq0 |
878 | 4.76M | tmp_pq_str1_8x16b = _mm_sign_epi32(tmp_delta0_8x16b, tmp2_const_8x16b); |
879 | | //getting -tc |
880 | 4.76M | tmp1_const_8x16b = _mm_sign_epi16(consttc_8x16b, tmp2_const_8x16b); |
881 | | //packing d03q d02q d01q d0q d03p d02p d01p d00p |
882 | 4.76M | tmp_delta0_8x16b = _mm_packs_epi32(tmp_delta0_8x16b, tmp_pq_str1_8x16b); |
883 | | //absolute delta |
884 | 4.76M | tmp_pq_str1_8x16b = _mm_abs_epi16(tmp_delta0_8x16b); |
885 | | |
886 | | //Clipping of delta0 |
887 | 4.76M | tmp_delta0_8x16b = _mm_min_epi16(tmp_delta0_8x16b, consttc_8x16b); |
888 | | //tc>>1 16 bit |
889 | 4.76M | consttc_8x16b = _mm_srai_epi16(consttc_8x16b, 1); |
890 | | //Clipping of delta0 |
891 | 4.76M | tmp_delta0_8x16b = _mm_max_epi16(tmp_delta0_8x16b, tmp1_const_8x16b); |
892 | | |
893 | | //(-tc)>>1 16 bit |
894 | 4.76M | tmp1_const_8x16b = _mm_sign_epi16(consttc_8x16b, tmp2_const_8x16b); |
895 | | //mask for |delta| < 10*tc |
896 | 4.76M | tmp_pq_str0_8x16b = _mm_cmpgt_epi16(tmp_pq_str0_8x16b, tmp_pq_str1_8x16b); |
897 | | //delta 1 calc starts |
898 | | |
899 | | //getting q32 q22 q12 q02 p32 p12 p22 p02 |
900 | 4.76M | tmp0_const_8x16b = _mm_setzero_si128(); |
901 | 4.76M | src_q2_8x16b = _mm_unpacklo_epi8(src_q2_8x16b, tmp0_const_8x16b); |
902 | 4.76M | src_p2_8x16b = _mm_unpacklo_epi8(src_p2_8x16b, tmp0_const_8x16b); |
903 | 4.76M | src_p2_8x16b = _mm_unpacklo_epi64(src_p2_8x16b, src_q2_8x16b); |
904 | | //constant 1 |
905 | 4.76M | const2_8x16b = _mm_srli_epi16(tmp2_const_8x16b, 15); |
906 | | //2*delta0 |
907 | 4.76M | tmp2_const_8x16b = _mm_add_epi16(tmp_delta0_8x16b, tmp_delta0_8x16b); |
908 | | //getting all respective q's and p's together |
909 | 4.76M | coefdelta_0_8x16b = _mm_load_si128((__m128i *)(shuffle1)); |
910 | 4.76M | tmp_delta1_8x16b = _mm_shuffle_epi8(tmp_delta1_8x16b, coefdelta_0_8x16b); |
911 | | //final adds for deltap1 and deltaq1 |
912 | 4.76M | tmp_delta1_8x16b = _mm_add_epi16(tmp_delta1_8x16b, const2_8x16b); |
913 | 4.76M | src_p2_8x16b = _mm_add_epi16(src_p2_8x16b, tmp2_const_8x16b); |
914 | 4.76M | tmp_delta1_8x16b = _mm_add_epi16(tmp_delta1_8x16b, src_p2_8x16b); |
915 | 4.76M | tmp_delta1_8x16b = _mm_srai_epi16(tmp_delta1_8x16b, 2); |
916 | | |
917 | | //mask0= (((LWORD64)filter_flag_q)<<63)| (((LWORD64)filter_flag_p)<<31); |
918 | 4.76M | tmp_pq_str1_8x16b = _mm_loadl_epi64((__m128i *)(&(maskq0))); |
919 | 4.76M | src_p2_8x16b = _mm_loadl_epi64((__m128i *)(&(maskp0))); |
920 | | |
921 | | // src_p2_8x16b = _mm_set_epi32(filter_flag_q,filter_flag_p,filter_flag_q,filter_flag_p); |
922 | | //mask1= (((LWORD64)(filter_flag_q&deq))<<63)|(((LWORD64)(filter_flag_p & dep))<<31); |
923 | 4.76M | src_q2_8x16b = _mm_loadl_epi64((__m128i *)(&(maskq1))); |
924 | 4.76M | coefdelta_0_8x16b = _mm_loadl_epi64((__m128i *)(&(maskp1))); |
925 | | |
926 | 4.76M | src_p2_8x16b = _mm_unpacklo_epi32(src_p2_8x16b, tmp_pq_str1_8x16b); |
927 | 4.76M | src_q2_8x16b = _mm_unpacklo_epi32(coefdelta_0_8x16b, src_q2_8x16b); |
928 | | //src_q2_8x16b = _mm_set_epi32(deq,dep,deq,dep); |
929 | 4.76M | src_q2_8x16b = _mm_and_si128(src_q2_8x16b, src_p2_8x16b); |
930 | | |
931 | | //rearranging the mask values |
932 | 4.76M | src_q2_8x16b = _mm_shuffle_epi32(src_q2_8x16b, 0x50); |
933 | 4.76M | src_p2_8x16b = _mm_shuffle_epi32(src_p2_8x16b, 0x50); |
934 | | |
935 | 4.76M | src_q2_8x16b = _mm_slli_epi32(src_q2_8x16b, 31); |
936 | 4.76M | src_p2_8x16b = _mm_slli_epi32(src_p2_8x16b, 31); |
937 | 4.76M | src_q2_8x16b = _mm_srai_epi32(src_q2_8x16b, 31); |
938 | 4.76M | src_p2_8x16b = _mm_srai_epi32(src_p2_8x16b, 31); |
939 | | |
940 | | //combining mask delta1 |
941 | 4.76M | tmp_pq_str1_8x16b = _mm_and_si128(tmp_pq_str0_8x16b, src_q2_8x16b); |
942 | | // clipping delta1 |
943 | 4.76M | tmp_delta1_8x16b = _mm_min_epi16(tmp_delta1_8x16b, consttc_8x16b); |
944 | | //combining mask delat0 |
945 | 4.76M | tmp_pq_str0_8x16b = _mm_and_si128(tmp_pq_str0_8x16b, src_p2_8x16b); |
946 | | // clipping delta1 |
947 | 4.76M | tmp_delta1_8x16b = _mm_max_epi16(tmp_delta1_8x16b, tmp1_const_8x16b); |
948 | | |
949 | | |
950 | | //masking of the delta values |delta|<10*tc |
951 | 4.76M | tmp_delta1_8x16b = _mm_and_si128(tmp_delta1_8x16b, tmp_pq_str1_8x16b); |
952 | 4.76M | tmp_delta0_8x16b = _mm_and_si128(tmp_delta0_8x16b, tmp_pq_str0_8x16b); |
953 | | //separating p and q delta 0 and addinq p0 and q0 |
954 | 4.76M | tmp_pq_str0_8x16b = _mm_unpacklo_epi64(tmp_delta0_8x16b, tmp0_const_8x16b); |
955 | 4.76M | tmp_pq_str1_8x16b = _mm_unpackhi_epi64(tmp_delta0_8x16b, tmp0_const_8x16b); |
956 | 4.76M | src_p0_8x16b = _mm_unpacklo_epi8(src_p0_8x16b, tmp0_const_8x16b); |
957 | 4.76M | src_q0_8x16b = _mm_unpacklo_epi8(src_q0_8x16b, tmp0_const_8x16b); |
958 | 4.76M | src_p0_8x16b = _mm_add_epi16(src_p0_8x16b, tmp_pq_str0_8x16b); |
959 | 4.76M | src_q0_8x16b = _mm_add_epi16(src_q0_8x16b, tmp_pq_str1_8x16b); |
960 | | //separating p and q delta 0 and addinq p0 and q0 |
961 | 4.76M | tmp_pq_str0_8x16b = _mm_unpacklo_epi64(tmp_delta1_8x16b, tmp0_const_8x16b); |
962 | 4.76M | tmp_pq_str1_8x16b = _mm_unpackhi_epi64(tmp_delta1_8x16b, tmp0_const_8x16b); |
963 | 4.76M | src_p1_8x16b = _mm_unpacklo_epi8(src_p1_8x16b, tmp0_const_8x16b); |
964 | 4.76M | src_q1_8x16b = _mm_unpacklo_epi8(src_q1_8x16b, tmp0_const_8x16b); |
965 | 4.76M | src_p1_8x16b = _mm_add_epi16(src_p1_8x16b, tmp_pq_str0_8x16b); |
966 | 4.76M | src_q1_8x16b = _mm_add_epi16(src_q1_8x16b, tmp_pq_str1_8x16b); |
967 | | //packing p1 q1 and p0 q0 to 8 bit |
968 | 4.76M | src_p1_8x16b = _mm_packus_epi16(src_p1_8x16b, src_q1_8x16b); |
969 | 4.76M | src_p0_8x16b = _mm_packus_epi16(src_p0_8x16b, src_q0_8x16b); |
970 | | |
971 | 4.76M | src_q1_8x16b = _mm_srli_si128(src_p1_8x16b, 8); |
972 | 4.76M | src_q0_8x16b = _mm_srli_si128(src_p0_8x16b, 8); |
973 | | |
974 | 4.76M | _mm_storel_epi64((__m128i *)(pu1_src - 2 * src_strd), src_p1_8x16b); |
975 | 4.76M | _mm_storel_epi64((__m128i *)(pu1_src - src_strd), src_p0_8x16b); |
976 | 4.76M | _mm_storel_epi64((__m128i *)(pu1_src), src_q0_8x16b); |
977 | 4.76M | _mm_storel_epi64((__m128i *)(pu1_src + src_strd), src_q1_8x16b); |
978 | | |
979 | | |
980 | 4.76M | } |
981 | | |
982 | | |
983 | | |
984 | 12.9M | } |
985 | | |
986 | 13.3M | } |
987 | | |
/**
 *******************************************************************************
 *
 * @brief
 *  Deblocks a vertical chroma edge: 4 rows of interleaved UV pixels, with
 *  two UV pixel pairs on each side of the edge located at pu1_src.
 *
 * @par Description:
 *  Chroma QP for each plane is derived from the average of the two blocks'
 *  luma QPs plus the per-plane offset, mapped through gai4_ihevc_qp_table
 *  (values above 57 are handled as qp - 6), and used to index
 *  gai4_ihevc_tc_table for the clipping threshold tc. If tc is zero for
 *  both planes nothing is filtered. Otherwise p0 and q0 of all four rows
 *  are modified by a clipped delta (the (1, -4, 4, -1) filter normalized
 *  by >> 3, clipped to [-tc, tc]), gated per side by filter_flag_p /
 *  filter_flag_q. Chroma deblocking applies only when boundary strength
 *  is 2; this function assumes it is called only in that case.
 *
 * @param[in] pu1_src
 *  Pointer to the first pixel of the q (right) side of the edge, row 0.
 *  The code reads/writes 8 bytes per row starting at pu1_src - 4.
 *
 * @param[in] src_strd
 *  Source stride in bytes
 *
 * @param[in] quant_param_p
 *  Luma QP of the block on the p (left) side
 *
 * @param[in] quant_param_q
 *  Luma QP of the block on the q (right) side
 *
 * @param[in] qp_offset_u
 *  QP offset added for the U plane
 *
 * @param[in] qp_offset_v
 *  QP offset added for the V plane
 *
 * @param[in] tc_offset_div2
 *  tc offset (doubled before being added to the tc table index)
 *
 * @param[in] filter_flag_p
 *  1 if the p-side pixels may be modified, 0 otherwise
 *
 * @param[in] filter_flag_q
 *  1 if the q-side pixels may be modified, 0 otherwise
 *
 * @returns None
 *
 *******************************************************************************
 */
void ihevc_deblk_chroma_vert_ssse3(UWORD8 *pu1_src,
                                   WORD32 src_strd,
                                   WORD32 quant_param_p,
                                   WORD32 quant_param_q,
                                   WORD32 qp_offset_u,
                                   WORD32 qp_offset_v,
                                   WORD32 tc_offset_div2,
                                   WORD32 filter_flag_p,
                                   WORD32 filter_flag_q)
{
    WORD32 qp_indx_u, qp_chroma_u;
    WORD32 qp_indx_v, qp_chroma_v;
    WORD32 tc_indx_u, tc_u;
    WORD32 tc_indx_v, tc_v;

    __m128i src_row_0_16x8b, tmp_pxl_0_16x8b, src_row_2_16x8b, tmp_pxl_1_16x8b;
    ASSERT(filter_flag_p || filter_flag_q);

    /* chroma processing is done only if BS is 2 */
    /* this function is assumed to be called only if BS is 2 */

    // average luma QP of the two blocks, plus the per-plane chroma offset
    qp_indx_u = qp_offset_u + ((quant_param_p + quant_param_q + 1) >> 1);
    // luma-to-chroma QP mapping; out-of-table values handled explicitly
    qp_chroma_u = qp_indx_u < 0 ? qp_indx_u : (qp_indx_u > 57 ? qp_indx_u - 6 : gai4_ihevc_qp_table[qp_indx_u]);

    qp_indx_v = qp_offset_v + ((quant_param_p + quant_param_q + 1) >> 1);
    qp_chroma_v = qp_indx_v < 0 ? qp_indx_v : (qp_indx_v > 57 ? qp_indx_v - 6 : gai4_ihevc_qp_table[qp_indx_v]);

    // tc index uses +2 because chroma is filtered only for BS 2
    tc_indx_u = CLIP3(qp_chroma_u + 2 + (tc_offset_div2 << 1), 0, 53);
    tc_u = gai4_ihevc_tc_table[tc_indx_u];

    tc_indx_v = CLIP3(qp_chroma_v + 2 + (tc_offset_div2 << 1), 0, 53);
    tc_v = gai4_ihevc_tc_table[tc_indx_v];

    // clipping range zero for both planes: nothing can change, early exit
    if(0 == tc_u && 0 == tc_v)
    {
        return;
    }
    // each load: 8 bytes per row = p1 p0 | q0 q1 (interleaved UV pairs,
    // 4 bytes on each side of the vertical edge)
    src_row_0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src - 4));
    tmp_pxl_0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd - 4));
    src_row_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 2 * src_strd - 4));
    tmp_pxl_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 3 * src_strd - 4));

    {
        LWORD64 mask_tc, mask_flag, mask;
        __m128i delta_vu0_16x8b, delta_vu1_16x8b;
        __m128i mask_tc_16x8, mask_16x8b, mask_flag_p_16x8b, mask_flag_q_16x8b;
        __m128i min_0_16x8b;
        __m128i const_16x8b;
        // flags placed in the sign bits of two 32-bit lanes so that an
        // arithmetic shift right by 31 turns each into an all-ones mask
        mask_flag = (((LWORD64)filter_flag_p) << 31) | (((LWORD64)filter_flag_q) << 63);
        // tc for U and V packed into adjacent 16-bit lanes
        mask_tc = (((LWORD64)tc_v) << 16) | ((LWORD64)tc_u);
        // per-row byte mask: keeps p1 and q1, clears the p0/q0 bytes that
        // will be replaced by the filtered values
        mask = 0xffff00000000ffffLL;

        // rows 0..1 in one register, rows 2..3 in the other
        src_row_0_16x8b = _mm_unpacklo_epi64(src_row_0_16x8b, tmp_pxl_0_16x8b);
        src_row_2_16x8b = _mm_unpacklo_epi64(src_row_2_16x8b, tmp_pxl_1_16x8b);

        mask_16x8b = _mm_load_si128((__m128i *)(shuffle_uv));
        // qv11 qu11 qv10 qu10 qv01 qu01 qv00 qu00 pv10 pu10 pv11 pu11 pv00 pu00 pv01 pu01
        // qv31 qu31 qv30 qu30 qv21 qu21 qv20 qu20 pv30 pu30 pv31 pu31 pv20 pu20 pv21 pu21
        delta_vu0_16x8b = _mm_shuffle_epi8(src_row_0_16x8b, mask_16x8b);
        delta_vu1_16x8b = _mm_shuffle_epi8(src_row_2_16x8b, mask_16x8b);

        tmp_pxl_0_16x8b = _mm_unpacklo_epi64(delta_vu0_16x8b, delta_vu1_16x8b);
        tmp_pxl_1_16x8b = _mm_unpackhi_epi64(delta_vu0_16x8b, delta_vu1_16x8b);
        // pv30 pv31 pu30 pu31 pv20 pv21 pu20 pu21 pv10 pv11 pu10 pu11 pv00 pv01 pu00 pu01
        // qv31 qv30 qu31 qu30 qv21 qv20 qu21 qu20 qv11 qv10 qu11 qu10 qv01 qv00 qu01 qu00
        delta_vu0_16x8b = _mm_load_si128((__m128i *)delta0);
        delta_vu1_16x8b = _mm_load_si128((__m128i *)delta1);

        // per-plane partial sums of the delta filter taps
        delta_vu0_16x8b = _mm_maddubs_epi16(tmp_pxl_0_16x8b, delta_vu0_16x8b);
        delta_vu1_16x8b = _mm_maddubs_epi16(tmp_pxl_1_16x8b, delta_vu1_16x8b);

        // all-ones register; reused to derive -tc, const 1 and const 4 below
        const_16x8b = _mm_cmpeq_epi16(tmp_pxl_0_16x8b, tmp_pxl_0_16x8b);
        // filter flag mask and tc mask
        mask_tc_16x8 = _mm_loadl_epi64((__m128i *)(&mask_tc));
        mask_flag_q_16x8b = _mm_loadl_epi64((__m128i *)(&mask_flag));

        // broadcast (tc_v, tc_u) to all 16-bit lane pairs
        mask_tc_16x8 = _mm_shuffle_epi32(mask_tc_16x8, 0x00);
        // sign bits -> all-ones / all-zeros per-flag masks
        mask_flag_q_16x8b = _mm_srai_epi32(mask_flag_q_16x8b, 31);
        //-tc
        min_0_16x8b = _mm_sign_epi16(mask_tc_16x8, const_16x8b);
        //converting const 1
        const_16x8b = _mm_srli_epi16(const_16x8b, 15);

        //filterp and filterq flag
        mask_flag_p_16x8b = _mm_shuffle_epi32(mask_flag_q_16x8b, 0x00);
        mask_flag_q_16x8b = _mm_shuffle_epi32(mask_flag_q_16x8b, 0x55);

        //modified delta with a filter (1 -4 4 -1) available in 16 bit
        delta_vu0_16x8b = _mm_add_epi16(delta_vu0_16x8b, delta_vu1_16x8b);
        //converting const 4 (rounding offset before >> 3)
        const_16x8b = _mm_slli_epi16(const_16x8b, 2);

        mask_16x8b = _mm_loadl_epi64((__m128i *)(&mask));
        //offset addition
        delta_vu0_16x8b = _mm_add_epi16(delta_vu0_16x8b, const_16x8b);
        //eliminating q1
        tmp_pxl_1_16x8b = _mm_slli_epi16(tmp_pxl_1_16x8b, 8);

        const_16x8b = _mm_setzero_si128();
        //filter after normalisation
        delta_vu0_16x8b = _mm_srai_epi16(delta_vu0_16x8b, 3);
        // replicate the p1/q1-preserving byte mask to both rows of the register
        mask_16x8b = _mm_shuffle_epi32(mask_16x8b, 0x44);

        //clipping MAX (delta <= tc)
        delta_vu0_16x8b = _mm_min_epi16(delta_vu0_16x8b, mask_tc_16x8);
        //getting p0 and eliminating p1
        tmp_pxl_0_16x8b = _mm_srli_epi16(tmp_pxl_0_16x8b, 8);
        //clipping MIN (delta >= -tc)
        delta_vu0_16x8b = _mm_max_epi16(delta_vu0_16x8b, min_0_16x8b);
        //getting q0
        tmp_pxl_1_16x8b = _mm_srli_epi16(tmp_pxl_1_16x8b, 8);
        //masking filter flag: delta forced to 0 on a side that must not change
        delta_vu1_16x8b = _mm_and_si128(delta_vu0_16x8b, mask_flag_q_16x8b);
        delta_vu0_16x8b = _mm_and_si128(delta_vu0_16x8b, mask_flag_p_16x8b);

        // q0' = q0 - delta, p0' = p0 + delta
        tmp_pxl_1_16x8b = _mm_sub_epi16(tmp_pxl_1_16x8b, delta_vu1_16x8b);
        tmp_pxl_0_16x8b = _mm_add_epi16(tmp_pxl_0_16x8b, delta_vu0_16x8b);
        //merging q0 and p0 of respective rows
        delta_vu1_16x8b = _mm_unpackhi_epi32(tmp_pxl_0_16x8b, tmp_pxl_1_16x8b);
        delta_vu0_16x8b = _mm_unpacklo_epi32(tmp_pxl_0_16x8b, tmp_pxl_1_16x8b);
        // row 0 and row 1 packed , row2 and row3 packed (back to 8 bit, saturated)
        delta_vu0_16x8b = _mm_packus_epi16(delta_vu0_16x8b, const_16x8b);
        delta_vu1_16x8b = _mm_packus_epi16(delta_vu1_16x8b, const_16x8b);
        //removing older p0/q0 pixel values (p1/q1 kept)
        src_row_0_16x8b = _mm_and_si128(src_row_0_16x8b, mask_16x8b);
        src_row_2_16x8b = _mm_and_si128(src_row_2_16x8b, mask_16x8b);
        //arranging modified pixels into the cleared byte positions
        delta_vu0_16x8b = _mm_shuffle_epi32(delta_vu0_16x8b, 0xd8);
        delta_vu1_16x8b = _mm_shuffle_epi32(delta_vu1_16x8b, 0xd8);
        delta_vu0_16x8b = _mm_slli_epi64(delta_vu0_16x8b, 16);
        delta_vu1_16x8b = _mm_slli_epi64(delta_vu1_16x8b, 16);
        //plugging the modified values
        src_row_0_16x8b = _mm_or_si128(src_row_0_16x8b, delta_vu0_16x8b);
        src_row_2_16x8b = _mm_or_si128(src_row_2_16x8b, delta_vu1_16x8b);


        //getting values for row 1 and row 3 into the low 8 bytes for storing
        tmp_pxl_0_16x8b = _mm_srli_si128(src_row_0_16x8b, 8);
        tmp_pxl_1_16x8b = _mm_srli_si128(src_row_2_16x8b, 8);

        _mm_storel_epi64((__m128i *)(pu1_src - 4), src_row_0_16x8b);
        _mm_storel_epi64((__m128i *)((pu1_src - 4) + src_strd), tmp_pxl_0_16x8b);
        _mm_storel_epi64((__m128i *)((pu1_src - 4) + 2 * src_strd), src_row_2_16x8b);
        _mm_storel_epi64((__m128i *)((pu1_src - 4) + 3 * src_strd), tmp_pxl_1_16x8b);
    }



}
1138 | | |
/**
 *******************************************************************************
 *
 * @brief
 *  Deblocks a horizontal chroma edge: one row of p0 and one row of q0
 *  (8 bytes = 4 interleaved UV pixel pairs) on either side of the edge at
 *  pu1_src.
 *
 * @par Description:
 *  Chroma QP for each plane is derived from the average of the two blocks'
 *  luma QPs plus the per-plane offset, mapped through gai4_ihevc_qp_table
 *  (values above 57 are handled as qp - 6), and used to index
 *  gai4_ihevc_tc_table for the clipping threshold tc. If tc is zero for
 *  both planes nothing is filtered. Otherwise p0 (row above the edge) and
 *  q0 (row at pu1_src) are modified by a clipped delta (the (1, -4, 4, -1)
 *  filter normalized by >> 3, clipped to [-tc, tc]), gated per side by
 *  filter_flag_p / filter_flag_q. Chroma deblocking applies only when
 *  boundary strength is 2; this function assumes it is called only in that
 *  case.
 *
 * @param[in] pu1_src
 *  Pointer to the first pixel of the q (lower) side of the edge.
 *  Rows at -2*src_strd (p1), -src_strd (p0), 0 (q0) and +src_strd (q1)
 *  are read; only the p0 and q0 rows are written.
 *
 * @param[in] src_strd
 *  Source stride in bytes
 *
 * @param[in] quant_param_p
 *  Luma QP of the block on the p (upper) side
 *
 * @param[in] quant_param_q
 *  Luma QP of the block on the q (lower) side
 *
 * @param[in] qp_offset_u
 *  QP offset added for the U plane
 *
 * @param[in] qp_offset_v
 *  QP offset added for the V plane
 *
 * @param[in] tc_offset_div2
 *  tc offset (doubled before being added to the tc table index)
 *
 * @param[in] filter_flag_p
 *  1 if the p-side pixels may be modified, 0 otherwise
 *
 * @param[in] filter_flag_q
 *  1 if the q-side pixels may be modified, 0 otherwise
 *
 * @returns None
 *
 *******************************************************************************
 */
void ihevc_deblk_chroma_horz_ssse3(UWORD8 *pu1_src,
                                   WORD32 src_strd,
                                   WORD32 quant_param_p,
                                   WORD32 quant_param_q,
                                   WORD32 qp_offset_u,
                                   WORD32 qp_offset_v,
                                   WORD32 tc_offset_div2,
                                   WORD32 filter_flag_p,
                                   WORD32 filter_flag_q)
{
    WORD32 qp_indx_u, qp_chroma_u;
    WORD32 qp_indx_v, qp_chroma_v;
    WORD32 tc_indx_u, tc_u;
    WORD32 tc_indx_v, tc_v;


    __m128i tmp_p0_16x8b, src_p0_16x8b, src_q0_16x8b, tmp_q0_16x8b;

    ASSERT(filter_flag_p || filter_flag_q);

    /* chroma processing is done only if BS is 2 */
    /* this function is assumed to be called only if BS is 2 */

    // average luma QP of the two blocks, plus the per-plane chroma offset
    qp_indx_u = qp_offset_u + ((quant_param_p + quant_param_q + 1) >> 1);
    // luma-to-chroma QP mapping; out-of-table values handled explicitly
    qp_chroma_u = qp_indx_u < 0 ? qp_indx_u : (qp_indx_u > 57 ? qp_indx_u - 6 : gai4_ihevc_qp_table[qp_indx_u]);

    qp_indx_v = qp_offset_v + ((quant_param_p + quant_param_q + 1) >> 1);
    qp_chroma_v = qp_indx_v < 0 ? qp_indx_v : (qp_indx_v > 57 ? qp_indx_v - 6 : gai4_ihevc_qp_table[qp_indx_v]);

    // tc index uses +2 because chroma is filtered only for BS 2
    tc_indx_u = CLIP3(qp_chroma_u + 2 + (tc_offset_div2 << 1), 0, 53);
    tc_u = gai4_ihevc_tc_table[tc_indx_u];

    tc_indx_v = CLIP3(qp_chroma_v + 2 + (tc_offset_div2 << 1), 0, 53);
    tc_v = gai4_ihevc_tc_table[tc_indx_v];

    // clipping range zero for both planes: nothing can change, early exit
    if(0 == tc_u && 0 == tc_v)
    {
        return;
    }
    // 8 bytes (4 UV pairs) per row: p1, p0 above the edge; q0, q1 below
    tmp_p0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src - 2 * src_strd));
    src_p0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src - src_strd));
    src_q0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));
    tmp_q0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));

    {
        LWORD64 mask_tc, mask_flag;
        __m128i delta_vu0_16x8b, delta_vu1_16x8b;
        __m128i mask_tc_16x8, mask_16x8b, mask_flag_p_16x8b, mask_flag_q_16x8b;
        __m128i min_0_16x8b;
        __m128i const_16x8b;
        // flags placed in the sign bits of two 32-bit lanes so that an
        // arithmetic shift right by 31 turns each into an all-ones mask
        mask_flag = (((LWORD64)filter_flag_p) << 31) | (((LWORD64)filter_flag_q) << 63);
        // tc for U and V packed into adjacent 16-bit lanes
        mask_tc = (((LWORD64)tc_v) << 16) | ((LWORD64)tc_u);

        // interleave p1 with p0 and q0 with q1 so maddubs can form the
        // per-plane filter tap sums
        tmp_p0_16x8b = _mm_unpacklo_epi8(tmp_p0_16x8b, src_p0_16x8b);
        tmp_q0_16x8b = _mm_unpacklo_epi8(src_q0_16x8b, tmp_q0_16x8b);

        // pv30 pv31 pu30 pu31 pv20 pv21 pu20 pu21 pv10 pv11 pu10 pu11 pv00 pv01 pu00 pu01
        // qv31 qv30 qu31 qu30 qv21 qv20 qu21 qu20 qv11 qv10 qu11 qu10 qv01 qv00 qu01 qu00
        delta_vu0_16x8b = _mm_load_si128((__m128i *)delta0);
        delta_vu1_16x8b = _mm_load_si128((__m128i *)delta1);

        // per-plane partial sums of the delta filter taps
        delta_vu0_16x8b = _mm_maddubs_epi16(tmp_p0_16x8b, delta_vu0_16x8b);
        delta_vu1_16x8b = _mm_maddubs_epi16(tmp_q0_16x8b, delta_vu1_16x8b);


        // filter flag mask and tc mask
        mask_tc_16x8 = _mm_loadl_epi64((__m128i *)(&mask_tc));
        mask_flag_q_16x8b = _mm_loadl_epi64((__m128i *)(&mask_flag));

        // all-ones register; reused to derive -tc, const 1 and const 4 below
        const_16x8b = _mm_cmpeq_epi16(tmp_p0_16x8b, tmp_p0_16x8b);
        // broadcast (tc_v, tc_u) to all 16-bit lane pairs
        mask_tc_16x8 = _mm_shuffle_epi32(mask_tc_16x8, 0x00);
        // sign bits -> all-ones / all-zeros per-flag masks
        mask_flag_q_16x8b = _mm_srai_epi32(mask_flag_q_16x8b, 31);
        //-tc
        min_0_16x8b = _mm_sign_epi16(mask_tc_16x8, const_16x8b);
        //converting const 1
        const_16x8b = _mm_srli_epi16(const_16x8b, 15);

        //filterp
        mask_flag_p_16x8b = _mm_shuffle_epi32(mask_flag_q_16x8b, 0x00);


        //converting const 4 (rounding offset before >> 3)
        const_16x8b = _mm_slli_epi16(const_16x8b, 2);
        //modified delta with a filter (1 -4 4 -1) available in 16 bit
        delta_vu0_16x8b = _mm_add_epi16(delta_vu0_16x8b, delta_vu1_16x8b);

        //filterq flag
        mask_flag_q_16x8b = _mm_shuffle_epi32(mask_flag_q_16x8b, 0x55);
        //offset addition
        delta_vu0_16x8b = _mm_add_epi16(delta_vu0_16x8b, const_16x8b);
        mask_16x8b = _mm_setzero_si128();
        //filter after normalisation
        delta_vu0_16x8b = _mm_srai_epi16(delta_vu0_16x8b, 3);

        //converting p0 to 16bit
        src_p0_16x8b = _mm_unpacklo_epi8(src_p0_16x8b, mask_16x8b);
        //clipping MAX (delta <= tc)
        delta_vu0_16x8b = _mm_min_epi16(delta_vu0_16x8b, mask_tc_16x8);
        //converting q0 to 16bit
        src_q0_16x8b = _mm_unpacklo_epi8(src_q0_16x8b, mask_16x8b);
        //clipping MIN (delta >= -tc)
        delta_vu0_16x8b = _mm_max_epi16(delta_vu0_16x8b, min_0_16x8b);

        //masking filter flag: delta forced to 0 on a side that must not change
        delta_vu1_16x8b = _mm_and_si128(delta_vu0_16x8b, mask_flag_q_16x8b);
        delta_vu0_16x8b = _mm_and_si128(delta_vu0_16x8b, mask_flag_p_16x8b);

        // q0' = q0 - delta, p0' = p0 + delta
        src_q0_16x8b = _mm_sub_epi16(src_q0_16x8b, delta_vu1_16x8b);
        src_p0_16x8b = _mm_add_epi16(src_p0_16x8b, delta_vu0_16x8b);

        // p0 and q0 packed back to 8 bit (saturated)
        src_q0_16x8b = _mm_packus_epi16(src_q0_16x8b, mask_16x8b);
        src_p0_16x8b = _mm_packus_epi16(src_p0_16x8b, mask_16x8b);



        _mm_storel_epi64((__m128i *)(pu1_src - src_strd), src_p0_16x8b);
        _mm_storel_epi64((__m128i *)(pu1_src), src_q0_16x8b);

    }


}