/src/libavc/common/x86/ih264_deblk_chroma_avx2.c
Line | Count | Source |
1 | | /****************************************************************************** |
2 | | * |
3 | | * Copyright (C) 2015 The Android Open Source Project |
4 | | * |
5 | | * Licensed under the Apache License, Version 2.0 (the "License"); |
6 | | * you may not use this file except in compliance with the License. |
7 | | * You may obtain a copy of the License at: |
8 | | * |
9 | | * http://www.apache.org/licenses/LICENSE-2.0 |
10 | | * |
11 | | * Unless required by applicable law or agreed to in writing, software |
12 | | * distributed under the License is distributed on an "AS IS" BASIS, |
13 | | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
14 | | * See the License for the specific language governing permissions and |
15 | | * limitations under the License. |
16 | | * |
17 | | ***************************************************************************** |
18 | | * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore |
19 | | */ |
20 | | /*****************************************************************************/ |
21 | | |
22 | | /*****************************************************************************/ |
23 | | /* File Includes */ |
24 | | /*****************************************************************************/ |
25 | | |
26 | | /* System include files */ |
27 | | #include <stdio.h> |
28 | | |
29 | | #ifdef __ANDROID__ |
30 | | #include "log/log.h" |
31 | | #include <cutils/log.h> |
32 | | #endif |
33 | | |
34 | | /* User include files */ |
35 | | #include "ih264_typedefs.h" |
36 | | #include "ih264_platform_macros.h" |
37 | | #include "ih264_deblk_edge_filters.h" |
38 | | #include "ih264_macros.h" |
39 | | |
40 | | #include <stdint.h> |
41 | | #include <string.h> |
42 | | #include <immintrin.h> |
43 | | |
44 | | |
45 | | |
46 | | /*****************************************************************************/ |
47 | | /* */ |
48 | | /* Function Name : ih264_deblk_chroma_vert_bslt4_avx2() */ |
49 | | /* */ |
50 | | /* Description : This function performs filtering of a chroma block */ |
51 | | /* vertical edge when the boundary strength is less than 4 */ |
52 | | /* in high profile. */ |
53 | | /* */ |
54 | | /* Inputs : pu1_src - pointer to the src sample q0 of U */ |
55 | | /* src_strd - source stride */ |
56 | | /* alpha_cb - alpha value for the boundary in U */ |
57 | | /* beta_cb - beta value for the boundary in U */ |
58 | | /* alpha_cr - alpha value for the boundary in V */ |
59 | | /* beta_cr - beta value for the boundary in V */ |
60 | | /* u4_bs - packed Boundary strength array */ |
61 | | /* pu1_cliptab_cb - tc0_table for U */ |
62 | | /* pu1_cliptab_cr - tc0_table for V */ |
63 | | /* */ |
64 | | /* Globals : None */ |
65 | | /* */ |
66 | | /* Processing : This operation is described in Sec. 8.7.2.3 under the */ |
67 | | /* title "Filtering process for edges for bS less than 4" */ |
68 | | /* in ITU T Rec H.264 with alpha and beta values different */ |
69 | | /* in U and V. */ |
70 | | /* */ |
71 | | /* Outputs : None */ |
72 | | /* */ |
73 | | /* Returns : None */ |
74 | | /* */ |
75 | | /* Issues : None */ |
76 | | /* */ |
77 | | /* Revision History: */ |
78 | | /* */ |
79 | | /* DD MM YYYY Author(s) Changes (Describe the changes made) */ |
80 | | /* 12 02 2015 Naveen Kumar P Initial version */ |
81 | | /* 15 09 2020 Priyanka Bose AVX2 Intel Intrinsics Support */ |
82 | | /*****************************************************************************/ |
83 | | |
84 | | void ih264_deblk_chroma_vert_bslt4_avx2(UWORD8 *pu1_src, |
85 | | WORD32 src_strd, |
86 | | WORD32 alpha_cb, |
87 | | WORD32 beta_cb, |
88 | | WORD32 alpha_cr, |
89 | | WORD32 beta_cr, |
90 | | UWORD32 u4_bs, |
91 | | const UWORD8 *pu1_cliptab_cb, |
92 | | const UWORD8 *pu1_cliptab_cr) |
93 | 77.0k | { |
94 | 77.0k | UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/ |
95 | 77.0k | UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3; |
96 | 77.0k | WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb; |
97 | 77.0k | WORD32 beta_cbcr = (beta_cr << 16) + beta_cb; |
98 | 77.0k | __m128i linea, lineb, linec, lined, linee, linef, lineg, lineh; |
99 | 77.0k | __m256i lineab, linecd, lineef, linegh, lineae, linebf, linecg, linedh; |
100 | 77.0k | __m256i temp1, temp2, temp3, temp4; |
101 | 77.0k | __m256i t1,t3, t2,t4,pq0_uv_32x8,pq1_uv_32x8,tmp1,tmp2,p0_uv_8x32,q0_uv_8x32; |
102 | | |
103 | 77.0k | __m256i pq0_uv_8x32, pq1_uv_8x32, p1_uv_8x32,pq0_uv_8x32_1,pq0_uv_8x32_2; |
104 | 77.0k | __m256i flag_bs, flag1, flag2; |
105 | 77.0k | __m256i diff, diff1, alpha_cbcr_32x8, beta_cbcr_32x8, in_macro; |
106 | 77.0k | __m256i zero = _mm256_setzero_si256(); |
107 | 77.0k | __m256i C0_uv_8x32; |
108 | 77.0k | __m256i p0_uv_8x32_1, p0_uv_8x32_2, q0_uv_8x32_1, q0_uv_8x32_2,p0_uv_32x8_1,q0_uv_32x8_1; |
109 | | |
110 | 77.0k | u1_Bs0 = (u4_bs >> 24) & 0xff; |
111 | 77.0k | u1_Bs1 = (u4_bs >> 16) & 0xff; |
112 | 77.0k | u1_Bs2 = (u4_bs >> 8) & 0xff; |
113 | 77.0k | u1_Bs3 = (u4_bs >> 0) & 0xff; |
114 | | |
115 | 77.0k | flag_bs = _mm256_set_epi8(u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs3,u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs3, |
116 | 77.0k | u1_Bs2, u1_Bs2, u1_Bs2, u1_Bs2, u1_Bs2, u1_Bs2,u1_Bs2, u1_Bs2, |
117 | 77.0k | u1_Bs1, u1_Bs1, u1_Bs1, u1_Bs1,u1_Bs1, u1_Bs1, u1_Bs1, u1_Bs1, |
118 | 77.0k | u1_Bs0, u1_Bs0, u1_Bs0, u1_Bs0,u1_Bs0, u1_Bs0, u1_Bs0, u1_Bs0); |
119 | 77.0k | flag_bs = _mm256_cmpeq_epi8(flag_bs, zero); //Set flag to 1s and 0s |
120 | 77.0k | flag_bs = _mm256_xor_si256(flag_bs, _mm256_set1_epi8(0xFF)); //Invert for required mask |
121 | | |
122 | | /* Load and transpose the pixel values */ |
123 | 77.0k | lineab = _mm256_loadu2_m128i((__m128i *)(pu1_src_uv - 4 + src_strd), (__m128i *)(pu1_src_uv - 4)); |
124 | 77.0k | linecd = _mm256_loadu2_m128i((__m128i *)(pu1_src_uv - 4 + 3 * src_strd), (__m128i *)(pu1_src_uv - 4 + 2 * src_strd)); |
125 | 77.0k | lineef = _mm256_loadu2_m128i((__m128i *)(pu1_src_uv - 4 + 5 * src_strd), (__m128i *)(pu1_src_uv - 4 + 4 * src_strd)); |
126 | 77.0k | linegh = _mm256_loadu2_m128i((__m128i *)(pu1_src_uv - 4 + 7 * src_strd), (__m128i *)(pu1_src_uv - 4 + 6 * src_strd)); |
127 | | |
128 | 77.0k | temp1 = _mm256_unpacklo_epi64(lineab, zero); //a0 -- a7 000.. b0..b7 000 |
129 | 77.0k | temp2 = _mm256_unpacklo_epi64(linecd, zero); |
130 | 77.0k | temp3 = _mm256_unpacklo_epi64(lineef, zero); //e0 -- e7 000.. f0..f7 000 |
131 | 77.0k | temp4 = _mm256_unpacklo_epi64(linegh, zero); |
132 | | |
133 | 77.0k | temp1 = _mm256_unpacklo_epi16(temp1, temp2); //a0 a1 c0 c1 -- a6 a7 c6 c7 b0 b1 d0 d1.. b6 b7 d6 d7 |
134 | 77.0k | temp2 = _mm256_unpacklo_epi16(temp3, temp4); //e0 e1 g0 g1 f0 f1 h0 h1 |
135 | | |
136 | 77.0k | t2 = _mm256_permute2f128_si256(temp1, temp2, 0x20); |
137 | 77.0k | t3 = _mm256_permute2f128_si256(temp1, temp2, 0x31); |
138 | | |
139 | 77.0k | tmp1 = _mm256_unpacklo_epi16(t2, t3); //a0 a1 b0 b1 c0 c1 d0 d1 -a2 a3 b2 b3 .... e0 e1 f0 f1 g0 g1 h0 h1 -e2 e3.. |
140 | 77.0k | tmp2 = _mm256_unpackhi_epi16(t2, t3); //a4 a5 b4 b5 -a6 a7 b6 b7 |
141 | | |
142 | | |
143 | 77.0k | temp1 = _mm256_unpacklo_epi8(tmp1,zero); // a0 0 a1 0 b0 0 b1 0 c0 0 c1 0 d0 0 d1 0 - e0 0 e1 0 .. => p1 |
144 | 77.0k | temp2 = _mm256_unpackhi_epi8(tmp1,zero); // a2 0 a3 0 => p0 |
145 | 77.0k | temp3 = _mm256_unpacklo_epi8(tmp2,zero); //a4 0 a5 0 => q0 |
146 | 77.0k | temp4 = _mm256_unpackhi_epi8(tmp2,zero); //a6 0 a7 0 => q1 |
147 | | |
148 | 77.0k | pq1_uv_32x8 = _mm256_packus_epi16(temp1,temp4); // 0213 |
149 | 77.0k | pq0_uv_32x8 = _mm256_packus_epi16(temp2,temp3); //0213 |
150 | | |
151 | 77.0k | diff = _mm256_subs_epi16(temp2, temp3); //Condn 1 (p0 -q0) - set (3), set(3) |
152 | 77.0k | diff = _mm256_abs_epi16(diff); |
153 | 77.0k | alpha_cbcr_32x8 = _mm256_set1_epi32(alpha_cbcr); |
154 | 77.0k | flag1 = _mm256_cmpgt_epi16(alpha_cbcr_32x8, diff); |
155 | | |
156 | 77.0k | diff = _mm256_subs_epi16(temp4, temp3); //Condtn 2 (q1 -q0) |
157 | 77.0k | diff = _mm256_abs_epi16(diff); |
158 | 77.0k | beta_cbcr_32x8 = _mm256_set1_epi32(beta_cbcr); |
159 | 77.0k | flag1 = _mm256_and_si256(flag1, _mm256_cmpgt_epi16(beta_cbcr_32x8, diff)); |
160 | | |
161 | | |
162 | 77.0k | diff = _mm256_subs_epi16(temp1, temp2); //Condtn 3 (p1 -p0) |
163 | 77.0k | diff = _mm256_abs_epi16(diff); |
164 | 77.0k | flag1 = _mm256_and_si256(flag1, _mm256_cmpgt_epi16(beta_cbcr_32x8, diff)); |
165 | | |
166 | 77.0k | diff = _mm256_subs_epi16(temp3, temp2); //(q0 -p0) |
167 | 77.0k | diff = _mm256_slli_epi16(diff, 2); |
168 | | |
169 | 77.0k | diff1 = _mm256_subs_epi16(temp1, temp4); //(p1 -q1) |
170 | 77.0k | diff = _mm256_add_epi16(diff, diff1); |
171 | | |
172 | 77.0k | diff = _mm256_add_epi16(diff, _mm256_set1_epi16(4)); |
173 | 77.0k | in_macro = _mm256_srai_epi16(diff, 3); |
174 | | |
175 | | |
176 | 77.0k | C0_uv_8x32 = _mm256_set_epi16(pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1], |
177 | 77.0k | pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1], |
178 | 77.0k | pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0], |
179 | 77.0k | pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0], |
180 | 77.0k | pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3], |
181 | 77.0k | pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3], |
182 | 77.0k | pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2], |
183 | 77.0k | pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2]); |
184 | | |
185 | 77.0k | C0_uv_8x32 = _mm256_add_epi16(C0_uv_8x32, _mm256_set1_epi16(1)); |
186 | | |
187 | 77.0k | in_macro = _mm256_min_epi16(C0_uv_8x32, in_macro); //CLIP3 |
188 | 77.0k | C0_uv_8x32 = _mm256_subs_epi16(zero, C0_uv_8x32); |
189 | 77.0k | in_macro = _mm256_max_epi16(C0_uv_8x32, in_macro); |
190 | | |
191 | 77.0k | p0_uv_8x32_1 = _mm256_add_epi16(temp2, in_macro); |
192 | 77.0k | q0_uv_8x32_1 = _mm256_sub_epi16(temp3, in_macro); |
193 | | |
194 | | |
195 | 77.0k | flag1 = _mm256_and_si256(flag1, flag_bs); |
196 | 77.0k | flag1 = _mm256_packs_epi16(flag1, flag1); // 0213 |
197 | | |
198 | 77.0k | pq0_uv_8x32 = _mm256_packus_epi16(p0_uv_8x32_1,q0_uv_8x32_1); //0213 |
199 | | |
200 | 77.0k | pq0_uv_8x32_1 = _mm256_and_si256(pq0_uv_32x8, |
201 | 77.0k | _mm256_xor_si256(flag1, _mm256_set1_epi8(0xFF))); |
202 | 77.0k | pq0_uv_8x32_2 = _mm256_and_si256(pq0_uv_8x32, flag1); |
203 | 77.0k | pq0_uv_32x8 = _mm256_add_epi8(pq0_uv_8x32_1, pq0_uv_8x32_2); |
204 | | |
205 | | |
206 | 77.0k | t1 = _mm256_unpacklo_epi16(pq1_uv_32x8, pq0_uv_32x8); // temp1 temp3 |
207 | 77.0k | t2 = _mm256_unpackhi_epi16(pq1_uv_32x8, pq0_uv_32x8); // temp2 temp4 |
208 | | |
209 | 77.0k | t4 = _mm256_shufflelo_epi16(t2, _MM_SHUFFLE(2, 3, 0, 1)); // pshuflw |
210 | 77.0k | t4 = _mm256_shufflehi_epi16(t4, _MM_SHUFFLE(2, 3, 0, 1)); |
211 | | |
212 | 77.0k | lineae = _mm256_unpacklo_epi32(t1, t4); // temp1 temp3 |
213 | 77.0k | linecg = _mm256_unpackhi_epi32(t1, t4); // temp2 temp4 |
214 | | |
215 | 77.0k | linea = _mm256_castsi256_si128(lineae); |
216 | 77.0k | lineb = _mm256_castsi256_si128(_mm256_srli_si256(lineae, 8)); |
217 | 77.0k | lineae = _mm256_permute2f128_si256(lineae, lineae, 0x1); |
218 | 77.0k | linee = _mm256_castsi256_si128(lineae); |
219 | 77.0k | linef = _mm256_castsi256_si128(_mm256_srli_si256(lineae, 8)); |
220 | | |
221 | | |
222 | 77.0k | linec = _mm256_castsi256_si128(linecg); |
223 | 77.0k | lined = _mm256_castsi256_si128(_mm256_srli_si256(linecg, 8)); |
224 | 77.0k | linecg = _mm256_permute2f128_si256(linecg, linecg, 0x1); |
225 | 77.0k | lineg = _mm256_castsi256_si128(linecg); |
226 | 77.0k | lineh = _mm256_castsi256_si128(_mm256_srli_si256(linecg, 8)); |
227 | | |
228 | 77.0k | _mm_storel_epi64((__m128i *)(pu1_src_uv - 4), linea); |
229 | 77.0k | _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + src_strd), lineb); |
230 | 77.0k | _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd), linec); |
231 | 77.0k | _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd), lined); |
232 | 77.0k | _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 4 * src_strd), linee); |
233 | 77.0k | _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 5 * src_strd), linef); |
234 | 77.0k | _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 6 * src_strd), lineg); |
235 | 77.0k | _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 7 * src_strd), lineh); |
236 | | |
237 | 77.0k | } |
238 | | |
239 | | /*****************************************************************************/ |
240 | | /* */ |
241 | | /* Function Name : ih264_deblk_chroma_horz_bslt4_avx2() */ |
242 | | /* */ |
243 | | /* Description : This function performs filtering of a chroma block */ |
244 | | /* horizontal edge when the boundary strength is less than */ |
245 | | /* 4 in high profile. */ |
246 | | /* */ |
247 | | /* Inputs : pu1_src - pointer to the src sample q0 of U */ |
248 | | /* src_strd - source stride */ |
249 | | /* alpha_cb - alpha value for the boundary in U */ |
250 | | /* beta_cb - beta value for the boundary in U */ |
251 | | /* alpha_cr - alpha value for the boundary in V */ |
252 | | /* beta_cr - beta value for the boundary in V */ |
253 | | /* u4_bs - packed Boundary strength array */ |
254 | | /* pu1_cliptab_cb - tc0_table for U */ |
255 | | /* pu1_cliptab_cr - tc0_table for V */ |
256 | | /* */ |
257 | | /* Globals : None */ |
258 | | /* */ |
259 | | /* Processing : This operation is described in Sec. 8.7.2.3 under the */ |
260 | | /* title "Filtering process for edges for bS less than 4" */ |
261 | | /* in ITU T Rec H.264 with alpha and beta values different */ |
262 | | /* in U and V. */ |
263 | | /* */ |
264 | | /* Outputs : None */ |
265 | | /* */ |
266 | | /* Returns : None */ |
267 | | /* */ |
268 | | /* Issues : None */ |
269 | | /* */ |
270 | | /* Revision History: */ |
271 | | /* */ |
272 | | /* DD MM YYYY Author(s) Changes (Describe the changes made) */ |
273 | | /* 12 02 2015 Naveen Kumar P Initial version */ |
274 | | /* 12 10 2020 Priyanka Bose AVX2 Intel Intrinsics Support */ |
275 | | /*****************************************************************************/ |
276 | | void ih264_deblk_chroma_horz_bslt4_avx2 (UWORD8 *pu1_src, |
277 | | WORD32 src_strd, |
278 | | WORD32 alpha_cb, |
279 | | WORD32 beta_cb, |
280 | | WORD32 alpha_cr, |
281 | | WORD32 beta_cr, |
282 | | UWORD32 u4_bs, |
283 | | const UWORD8 *pu1_cliptab_cb, |
284 | | const UWORD8 *pu1_cliptab_cr) |
285 | 86.1k | { |
286 | 86.1k | UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/ |
287 | 86.1k | WORD16 i16_posP1, i16_posP0, i16_posQ1; |
288 | 86.1k | UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3; |
289 | | |
290 | 86.1k | UWORD8 *pu1_HorzPixelUV; /*! < Pointer to the first pixel of the boundary */ |
291 | 86.1k | WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb; |
292 | 86.1k | WORD32 beta_cbcr = (beta_cr << 16) + beta_cb; |
293 | 86.1k | __m256i p0q0_uv_32x8,p1q1_uv_32x8; |
294 | 86.1k | __m256i temp1,temp2,temp3,temp4; |
295 | 86.1k | __m256i flag_bs, flag1, flag2; |
296 | 86.1k | __m256i diff, diff1, alpha_cbcr_32x8, beta_cbcr_32x8, in_macro; |
297 | 86.1k | __m256i zero = _mm256_setzero_si256(); |
298 | 86.1k | __m256i C0_uv_8x32; |
299 | 86.1k | __m256i p0q0_uv_8x32_1, p0q0_uv_8x32_2,res1,res2,p0_uv_8x32_1,q0_uv_8x32_1; |
300 | | |
301 | 86.1k | pu1_HorzPixelUV = pu1_src_uv - (src_strd << 1); |
302 | | |
303 | 86.1k | i16_posQ1 = src_strd; |
304 | 86.1k | i16_posP0 = src_strd; |
305 | 86.1k | i16_posP1 = 0; |
306 | | |
307 | 86.1k | u1_Bs0 = (u4_bs >> 24) & 0xff; |
308 | 86.1k | u1_Bs1 = (u4_bs >> 16) & 0xff; |
309 | 86.1k | u1_Bs2 = (u4_bs >> 8) & 0xff; |
310 | 86.1k | u1_Bs3 = (u4_bs >> 0) & 0xff; |
311 | | |
312 | 86.1k | flag_bs = _mm256_set_epi8(u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs3, |
313 | 86.1k | u1_Bs2, u1_Bs2, u1_Bs2, u1_Bs2, |
314 | 86.1k | u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs3, |
315 | 86.1k | u1_Bs2, u1_Bs2,u1_Bs2, u1_Bs2, |
316 | 86.1k | u1_Bs1, u1_Bs1, u1_Bs1, u1_Bs1, |
317 | 86.1k | u1_Bs0, u1_Bs0, u1_Bs0, u1_Bs0, |
318 | 86.1k | u1_Bs1, u1_Bs1, u1_Bs1, u1_Bs1, |
319 | 86.1k | u1_Bs0, u1_Bs0, u1_Bs0, u1_Bs0); |
320 | 86.1k | flag_bs = _mm256_cmpeq_epi8(flag_bs, zero); //Set flag to 1s and 0s |
321 | 86.1k | flag_bs = _mm256_xor_si256(flag_bs, _mm256_set1_epi8(0xFF)); //Invert for required mask |
322 | | |
323 | 86.1k | p0q0_uv_32x8 = _mm256_loadu2_m128i((__m128i *)(pu1_src_uv), (__m128i *)(pu1_HorzPixelUV + i16_posP0)); |
324 | 86.1k | p1q1_uv_32x8 = _mm256_loadu2_m128i((__m128i *)(pu1_src_uv + i16_posQ1), (__m128i *)(pu1_HorzPixelUV + i16_posP1)); |
325 | | |
326 | 86.1k | res1 = _mm256_permute4x64_epi64(p0q0_uv_32x8,0xD8); |
327 | 86.1k | res2 = _mm256_permute4x64_epi64(p1q1_uv_32x8,0xD8); |
328 | | |
329 | 86.1k | temp3 = _mm256_unpacklo_epi8(res1, zero); //p0 l 0 h 0 |
330 | 86.1k | temp4 = _mm256_unpackhi_epi8(res1, zero); //q0 |
331 | 86.1k | temp1 = _mm256_unpacklo_epi8(res2, zero); //p1 |
332 | 86.1k | temp2 = _mm256_unpackhi_epi8(res2, zero); //q1 |
333 | | |
334 | 86.1k | diff = _mm256_subs_epi16(temp3, temp4); //Condn 1 //p0 l h - q0 l h |
335 | 86.1k | diff = _mm256_abs_epi16(diff); |
336 | 86.1k | alpha_cbcr_32x8 = _mm256_set1_epi32(alpha_cbcr); |
337 | 86.1k | flag1 = _mm256_cmpgt_epi16(alpha_cbcr_32x8, diff); |
338 | | |
339 | 86.1k | diff = _mm256_subs_epi16(temp2, temp4); //Condtn 2 |
340 | 86.1k | diff = _mm256_abs_epi16(diff); |
341 | 86.1k | beta_cbcr_32x8 = _mm256_set1_epi32(beta_cbcr); |
342 | 86.1k | flag1 = _mm256_and_si256(flag1, _mm256_cmpgt_epi16(beta_cbcr_32x8, diff)); |
343 | | |
344 | 86.1k | diff = _mm256_subs_epi16(temp1, temp3); //Condtn 3 |
345 | 86.1k | diff = _mm256_abs_epi16(diff); |
346 | 86.1k | flag1 = _mm256_and_si256(flag1, _mm256_cmpgt_epi16(beta_cbcr_32x8, diff)); |
347 | | |
348 | 86.1k | diff = _mm256_subs_epi16(temp4, temp3); |
349 | 86.1k | diff = _mm256_slli_epi16(diff, 2); |
350 | 86.1k | diff1 = _mm256_subs_epi16(temp1, temp2); |
351 | 86.1k | diff = _mm256_add_epi16(diff, diff1); |
352 | 86.1k | diff = _mm256_add_epi16(diff, _mm256_set1_epi16(4)); |
353 | 86.1k | in_macro = _mm256_srai_epi16(diff, 3); |
354 | | |
355 | 86.1k | C0_uv_8x32 = _mm256_set_epi16( |
356 | 86.1k | pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3], |
357 | 86.1k | pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3], |
358 | 86.1k | pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2], |
359 | 86.1k | pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2], |
360 | 86.1k | pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1], |
361 | 86.1k | pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1], |
362 | 86.1k | pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0], |
363 | 86.1k | pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0]); |
364 | | |
365 | 86.1k | C0_uv_8x32 = _mm256_add_epi16(C0_uv_8x32, _mm256_set1_epi16(1)); |
366 | | |
367 | 86.1k | in_macro = _mm256_min_epi16(C0_uv_8x32, in_macro); //CLIP3 |
368 | 86.1k | C0_uv_8x32 = _mm256_subs_epi16(zero, C0_uv_8x32); |
369 | 86.1k | in_macro = _mm256_max_epi16(C0_uv_8x32, in_macro); |
370 | | |
371 | 86.1k | p0_uv_8x32_1 = _mm256_add_epi16(temp3, in_macro); |
372 | 86.1k | q0_uv_8x32_1 = _mm256_sub_epi16(temp4, in_macro); |
373 | | |
374 | 86.1k | p0q0_uv_8x32_2 = _mm256_packus_epi16(p0_uv_8x32_1,q0_uv_8x32_1); |
375 | 86.1k | flag1 = _mm256_packs_epi16(flag1, flag1); |
376 | 86.1k | flag1 = _mm256_and_si256(flag1, flag_bs); //Final flag (BS condition + other 3 conditions) |
377 | | |
378 | 86.1k | p0q0_uv_8x32_1 = _mm256_and_si256(res1, |
379 | 86.1k | _mm256_xor_si256(flag1, _mm256_set1_epi8(0xFF))); |
380 | 86.1k | p0q0_uv_8x32_2 = _mm256_and_si256(p0q0_uv_8x32_2, flag1); |
381 | 86.1k | p0q0_uv_8x32_1 = _mm256_add_epi8(p0q0_uv_8x32_1, p0q0_uv_8x32_2); |
382 | 86.1k | p0q0_uv_8x32_1 = _mm256_permute4x64_epi64(p0q0_uv_8x32_1,0xD8); |
383 | | |
384 | 86.1k | _mm256_storeu2_m128i((__m128i *)(pu1_src_uv),(__m128i *)(pu1_HorzPixelUV + i16_posP0), p0q0_uv_8x32_1); |
385 | | |
386 | 86.1k | } |