/src/libavc/common/x86/ih264_deblk_luma_avx2.c
Line | Count | Source |
1 | | /****************************************************************************** |
2 | | * |
3 | | * Copyright (C) 2015 The Android Open Source Project |
4 | | * |
5 | | * Licensed under the Apache License, Version 2.0 (the "License"); |
6 | | * you may not use this file except in compliance with the License. |
7 | | * You may obtain a copy of the License at: |
8 | | * |
9 | | * http://www.apache.org/licenses/LICENSE-2.0 |
10 | | * |
11 | | * Unless required by applicable law or agreed to in writing, software |
12 | | * distributed under the License is distributed on an "AS IS" BASIS, |
13 | | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
14 | | * See the License for the specific language governing permissions and |
15 | | * limitations under the License. |
16 | | * |
17 | | ***************************************************************************** |
18 | | * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore |
19 | | */ |
20 | | /*****************************************************************************/ |
21 | | /* */ |
22 | | /* File Name : ih264_deblk_luma_avx2.c */ |
23 | | /* */ |
24 | | /* Description : Contains function definitions for deblocking */ |
25 | | /* */ |
26 | | /* List of Functions : ih264_deblk_luma_horz_bslt4_avx2() */ |
27 | | /* ih264_deblk_luma_vert_bslt4_avx2() */ |
28 | | /* */ |
29 | | /* Issues / Problems : None */ |
30 | | /* */ |
31 | | /* Revision History : */ |
32 | | /* */ |
33 | | /* DD MM YYYY Author(s) Changes (Describe the changes made) */ |
34 | | /* 12 02 2015 Naveen Kumar P Added luma deblocking ssse3 */ |
35 | | /* intrinsics */ |
36 | | /* 15 09 2020 Priyanka Bose AVX2 Intel Intrinsics Support */ |
37 | | /*****************************************************************************/ |
38 | | |
39 | | /*****************************************************************************/ |
40 | | /* File Includes */ |
41 | | /*****************************************************************************/ |
42 | | |
43 | | /* System include files */ |
44 | | #include <stdio.h> |
45 | | #ifdef __ANDROID__ |
46 | | #include "log/log.h" |
47 | | #include <cutils/log.h> |
48 | | #endif |
49 | | |
50 | | /* User include files */ |
51 | | #include "ih264_typedefs.h" |
52 | | #include "ih264_platform_macros.h" |
53 | | #include "ih264_deblk_edge_filters.h" |
54 | | #include "ih264_macros.h" |
55 | | |
56 | | |
57 | | /*****************************************************************************/ |
58 | | /* */ |
59 | | /* Function Name : ih264_deblk_luma_horz_bslt4_avx2() */ |
60 | | /* */ |
61 | | /* Description : This function performs filtering of a luma block */ |
62 | | /* horizontal edge when boundary strength is less than 4. */ |
63 | | /* */ |
64 | | /* Inputs : pu1_src - pointer to the src sample q0 */ |
65 | | /* src_strd - source stride */ |
66 | | /* alpha - alpha value for the boundary */ |
67 | | /* beta - beta value for the boundary */ |
68 | | /* u4_bs - packed Boundary strength array */ |
69 | | /* pu1_cliptab - tc0_table */ |
70 | | /* */ |
71 | | /* Globals : None */ |
72 | | /* */ |
73 | | /* Processing : This operation is described in Sec. 8.7.2.3 under the */ |
74 | | /* title "Filtering process for edges for bS less than 4" */ |
75 | | /* in ITU T Rec H.264. */ |
76 | | /* */ |
77 | | /* Outputs : None */ |
78 | | /* */ |
79 | | /* Returns : None */ |
80 | | /* */ |
81 | | /* Issues : None */ |
82 | | /* */ |
83 | | /* Revision History: */ |
84 | | /* */ |
85 | | /* DD MM YYYY Author(s) Changes (Describe the changes made) */ |
86 | | /* 12 02 2015 Naveen Kumar P Initial version */ |
87 | | /* 15 09 2020 Priyanka Bose AVX2 Intel Intrinsics Support */ |
88 | | /*****************************************************************************/ |
89 | | void ih264_deblk_luma_horz_bslt4_avx2(UWORD8 *pu1_src, |
90 | | WORD32 src_strd, |
91 | | WORD32 alpha, |
92 | | WORD32 beta, |
93 | | UWORD32 u4_bs, |
94 | | const UWORD8 *pu1_cliptab) |
95 | 35.1k | { |
96 | | |
97 | 35.1k | WORD16 i16_posP2, i16_posP1, i16_posP0, i16_posQ1, i16_posQ2; |
98 | 35.1k | UWORD8 *pu1_HorzPixel; |
99 | 35.1k | __m256i zero = _mm256_setzero_si256(); |
100 | 35.1k | __m128i zero_128 = _mm_setzero_si128(); |
101 | 35.1k | __m128i Alpha_8x16,bs_flag_16x8b, C0_16x8, C0_8x16, C0_hi_8x16; |
102 | 35.1k | __m256i Beta_8x32,in_macro_32x8,in_macro_1,in_macro_2,flag1_32x8,flag2_32x8; |
103 | 35.1k | __m256i C_8x32,C0_8x32_res,temp1,temp2,temp3,temp4,res1,res2,q0p1_32x8,p0q1_32x8; |
104 | 35.1k | __m128i p0_16x8,q0_16x8,temp1_128,temp2_128,flag1_16x8_128; |
105 | 35.1k | __m256i const_val4_8x32,p0q0_32x8,p1q1_32x8,p2q2_32x8,q0p0_32x8; |
106 | 35.1k | UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3; |
107 | 35.1k | UWORD8 clip0, clip1, clip2, clip3; |
108 | | |
109 | 35.1k | pu1_HorzPixel = pu1_src - (src_strd << 2); |
110 | | |
111 | 35.1k | i16_posQ1 = src_strd; |
112 | 35.1k | i16_posQ2 = X2(src_strd); |
113 | 35.1k | i16_posP0 = X3(src_strd); |
114 | 35.1k | i16_posP1 = X2(src_strd); |
115 | 35.1k | i16_posP2 = src_strd; |
116 | | |
117 | 35.1k | p0q0_32x8 = _mm256_loadu2_m128i((__m128i *)(pu1_src), (__m128i *)(pu1_HorzPixel + i16_posP0)); //lower -p0 higher-q0 |
118 | 35.1k | p1q1_32x8 = _mm256_loadu2_m128i((__m128i *)(pu1_src + i16_posQ1), (__m128i *)(pu1_HorzPixel + i16_posP1)); //l= p1, h=q1 |
119 | 35.1k | p2q2_32x8 = _mm256_loadu2_m128i((__m128i *)(pu1_src + i16_posQ2), (__m128i *)(pu1_HorzPixel + i16_posP2)); |
120 | | |
121 | 35.1k | u1_Bs0 = (u4_bs >> 24) & 0xff; |
122 | 35.1k | u1_Bs1 = (u4_bs >> 16) & 0xff; |
123 | 35.1k | u1_Bs2 = (u4_bs >> 8) & 0xff; |
124 | 35.1k | u1_Bs3 = (u4_bs >> 0) & 0xff; |
125 | 35.1k | clip0 = pu1_cliptab[u1_Bs0]; |
126 | 35.1k | clip1 = pu1_cliptab[u1_Bs1]; |
127 | 35.1k | clip2 = pu1_cliptab[u1_Bs2]; |
128 | 35.1k | clip3 = pu1_cliptab[u1_Bs3]; |
129 | | |
130 | 35.1k | Alpha_8x16 = _mm_set1_epi16(alpha); |
131 | 35.1k | Beta_8x32 = _mm256_set1_epi16(beta); |
132 | | |
133 | 35.1k | bs_flag_16x8b = _mm_set_epi8(u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs3, |
134 | 35.1k | u1_Bs2, u1_Bs2, u1_Bs2, u1_Bs2, |
135 | 35.1k | u1_Bs1, u1_Bs1, u1_Bs1, u1_Bs1, |
136 | 35.1k | u1_Bs0, u1_Bs0, u1_Bs0, u1_Bs0); |
137 | | |
138 | 35.1k | C0_16x8 = _mm_set_epi8(clip3, clip3, clip3, clip3, clip2, clip2, clip2, |
139 | 35.1k | clip2, clip1, clip1, clip1, clip1, clip0, clip0, |
140 | 35.1k | clip0, clip0); |
141 | | |
142 | 35.1k | bs_flag_16x8b = _mm_cmpeq_epi8(bs_flag_16x8b, zero_128); |
143 | 35.1k | bs_flag_16x8b = _mm_xor_si128(bs_flag_16x8b, _mm_set1_epi8(0xFF)); //Invert for required mask |
144 | 35.1k | C0_8x16 = _mm_unpacklo_epi8(C0_16x8, zero_128); |
145 | 35.1k | C0_hi_8x16 = _mm_unpackhi_epi8(C0_16x8, zero_128); |
146 | 35.1k | C0_8x32_res = _mm256_set_m128i(C0_hi_8x16,C0_8x16); |
147 | | |
148 | | //Cond1 (ABS(p0 - q0) < alpha) |
149 | 35.1k | p0_16x8 = _mm256_castsi256_si128(p0q0_32x8); |
150 | 35.1k | q0p0_32x8 = _mm256_permute2x128_si256(p0q0_32x8, p0q0_32x8, 0x1); |
151 | 35.1k | q0_16x8 = _mm256_castsi256_si128(p0q0_32x8); |
152 | 35.1k | temp1_128 = _mm_subs_epu8(q0_16x8, p0_16x8); |
153 | 35.1k | temp2_128 = _mm_subs_epu8(p0_16x8, q0_16x8); |
154 | 35.1k | temp1_128 = _mm_add_epi8(temp1_128, temp2_128); |
155 | | |
156 | 35.1k | temp2_128 = _mm_unpacklo_epi8(temp1_128, zero_128); |
157 | 35.1k | temp1_128 = _mm_unpackhi_epi8(temp1_128, zero_128); |
158 | | |
159 | 35.1k | temp2_128 = _mm_cmpgt_epi16(Alpha_8x16, temp2_128); |
160 | 35.1k | temp1_128 = _mm_cmpgt_epi16(Alpha_8x16, temp1_128); |
161 | 35.1k | flag1_16x8_128 = _mm_packs_epi16(temp2_128, temp1_128); |
162 | 35.1k | flag1_16x8_128 = _mm_and_si128(flag1_16x8_128, bs_flag_16x8b); |
163 | | |
164 | 35.1k | flag1_32x8 = _mm256_set_m128i(flag1_16x8_128,flag1_16x8_128); |
165 | | |
166 | | //Cond2 (ABS(q1 - q0) < beta) & Cond3 (ABS(p1 - p0) < beta) |
167 | 35.1k | temp1 = _mm256_subs_epu8(p0q0_32x8, p1q1_32x8); |
168 | 35.1k | temp2 = _mm256_subs_epu8(p1q1_32x8, p0q0_32x8); |
169 | 35.1k | temp1 = _mm256_add_epi8(temp1, temp2); |
170 | | |
171 | 35.1k | temp2 = _mm256_unpacklo_epi8(temp1, zero); |
172 | 35.1k | temp1 = _mm256_unpackhi_epi8(temp1, zero); |
173 | | |
174 | 35.1k | temp2 = _mm256_cmpgt_epi16(Beta_8x32, temp2); |
175 | 35.1k | temp1 = _mm256_cmpgt_epi16(Beta_8x32, temp1); |
176 | | |
177 | 35.1k | flag2_32x8 = _mm256_packs_epi16(temp2, temp1); |
178 | | |
179 | | //!((ABS(p0 - q0) < alpha) || (ABS(q1 - q0) < beta) || (ABS(p1 - p0) < beta)) |
180 | 35.1k | flag1_32x8 = _mm256_and_si256(flag1_32x8, flag2_32x8); |
181 | | |
182 | | //(ABS(p2 - p0) < beta) & (ABS(q2 - q0) < beta) |
183 | 35.1k | temp1 = _mm256_subs_epu8(p0q0_32x8, p2q2_32x8); |
184 | 35.1k | temp2 = _mm256_subs_epu8(p2q2_32x8, p0q0_32x8); |
185 | 35.1k | temp1 = _mm256_add_epi8(temp1, temp2); |
186 | | |
187 | 35.1k | temp2 = _mm256_unpacklo_epi8(temp1, zero); |
188 | 35.1k | temp1 = _mm256_unpackhi_epi8(temp1, zero); |
189 | 35.1k | temp2 = _mm256_cmpgt_epi16(Beta_8x32, temp2); |
190 | 35.1k | temp1 = _mm256_cmpgt_epi16(Beta_8x32, temp1); |
191 | | |
192 | 35.1k | flag2_32x8 = _mm256_packs_epi16(temp2, temp1); |
193 | 35.1k | flag2_32x8 = _mm256_and_si256(flag1_32x8, flag2_32x8); |
194 | | |
195 | 35.1k | temp2 = _mm256_subs_epi16(zero, temp2); |
196 | 35.1k | temp1 = _mm256_subs_epi16(zero, temp1); |
197 | | |
198 | 35.1k | temp3 = _mm256_permute2x128_si256(temp2,temp1,0x20); // low adding |
199 | 35.1k | temp4 = _mm256_permute2x128_si256(temp2,temp1,0x31); //high adding |
200 | 35.1k | temp2 = _mm256_add_epi16(temp3,temp4); |
201 | 35.1k | C_8x32 = _mm256_add_epi16(C0_8x32_res, temp2); // |
202 | 35.1k | const_val4_8x32 = _mm256_set1_epi16(4); |
203 | | |
204 | 35.1k | res1 = _mm256_permute4x64_epi64(q0p0_32x8, 0xD8); |
205 | 35.1k | res2 = _mm256_permute4x64_epi64(p1q1_32x8, 0xD8); |
206 | | |
207 | 35.1k | temp3 = _mm256_subs_epi16(_mm256_unpacklo_epi8(res1, zero), |
208 | 35.1k | _mm256_unpackhi_epi8(res1, zero)); |
209 | 35.1k | temp4 = _mm256_subs_epi16(_mm256_unpacklo_epi8(res2, zero), |
210 | 35.1k | _mm256_unpackhi_epi8(res2, zero)); |
211 | | |
212 | 35.1k | temp1 = _mm256_slli_epi16(temp3, 2); |
213 | 35.1k | temp1 = _mm256_add_epi16(temp1, temp4); |
214 | 35.1k | temp1 = _mm256_add_epi16(temp1, const_val4_8x32); |
215 | 35.1k | in_macro_32x8 = _mm256_srai_epi16(temp1, 3); |
216 | | |
217 | 35.1k | in_macro_32x8 = _mm256_min_epi16(C_8x32, in_macro_32x8); //CLIP3 |
218 | 35.1k | C_8x32 = _mm256_subs_epi16(zero, C_8x32); |
219 | 35.1k | in_macro_32x8 = _mm256_max_epi16(C_8x32, in_macro_32x8); //CLIP3 |
220 | | |
221 | 35.1k | temp3 = _mm256_unpacklo_epi8(res1, zero); //q0 |
222 | 35.1k | temp4 = _mm256_unpackhi_epi8(res1, zero); //p0 |
223 | | |
224 | 35.1k | temp1 = _mm256_add_epi16(temp4, in_macro_32x8); |
225 | 35.1k | temp2 = _mm256_sub_epi16(temp3, in_macro_32x8); |
226 | | |
227 | 35.1k | temp1 = _mm256_packus_epi16(temp2, temp1); // Suffle needed |
228 | | |
229 | 35.1k | temp1 = _mm256_and_si256(temp1, flag1_32x8); //q0 p0 |
230 | | |
231 | 35.1k | temp2 = _mm256_and_si256(res1, |
232 | 35.1k | _mm256_xor_si256(flag1_32x8, _mm256_set1_epi16(0xFFFF))); |
233 | | |
234 | 35.1k | temp1 = _mm256_add_epi8(temp1, temp2); |
235 | 35.1k | temp1 = _mm256_permute4x64_epi64(temp1, 0xD8); |
236 | 35.1k | _mm256_storeu2_m128i((__m128i *)(pu1_HorzPixel + i16_posP0),(__m128i *)(pu1_src),temp1); |
237 | | |
238 | | //if(Ap < Beta) if(Aq < Beta) |
239 | 35.1k | temp1 = _mm256_avg_epu16(_mm256_unpacklo_epi8(res1, zero), |
240 | 35.1k | _mm256_unpackhi_epi8(res1, zero)); |
241 | | |
242 | 35.1k | temp2 = _mm256_slli_epi16(_mm256_unpacklo_epi8(p1q1_32x8, zero), 1); |
243 | 35.1k | temp3 = _mm256_subs_epi16(_mm256_unpacklo_epi8(p2q2_32x8, zero), temp2); |
244 | | |
245 | 35.1k | temp2 = _mm256_slli_epi16(_mm256_unpackhi_epi8(p1q1_32x8, zero), 1); |
246 | 35.1k | temp2 = _mm256_subs_epi16(_mm256_unpackhi_epi8(p2q2_32x8, zero), temp2); |
247 | | |
248 | 35.1k | temp4 = _mm256_permute2x128_si256(temp3, temp2, 0x20); //p0 q0 |
249 | 35.1k | temp3 = _mm256_permute2x128_si256(temp3, temp2, 0x31); |
250 | 35.1k | temp4 = _mm256_add_epi16(temp1, temp4); //p |
251 | 35.1k | in_macro_1 = _mm256_srai_epi16(temp4, 1); |
252 | 35.1k | temp3 = _mm256_add_epi16(temp1, temp3); //q |
253 | 35.1k | in_macro_2 = _mm256_srai_epi16(temp3, 1); |
254 | | |
255 | 35.1k | in_macro_1 = _mm256_min_epi16(C0_8x32_res, in_macro_1); //CLIP3 |
256 | 35.1k | C0_8x32_res = _mm256_subs_epi16(zero, C0_8x32_res); |
257 | 35.1k | in_macro_1 = _mm256_max_epi16(C0_8x32_res, in_macro_1); //CLIP3 |
258 | | |
259 | 35.1k | in_macro_2 = _mm256_max_epi16(C0_8x32_res, in_macro_2); //CLIP3 |
260 | 35.1k | C0_8x32_res = _mm256_subs_epi16(zero, C0_8x32_res); |
261 | 35.1k | in_macro_2 = _mm256_min_epi16(C0_8x32_res, in_macro_2); //CLIP3 |
262 | | |
263 | 35.1k | temp1 = _mm256_unpacklo_epi8(res2, zero); |
264 | 35.1k | temp2 = _mm256_unpackhi_epi8(res2, zero); |
265 | | |
266 | 35.1k | temp1 = _mm256_add_epi16(temp1, in_macro_1); |
267 | 35.1k | temp2 = _mm256_add_epi16(temp2, in_macro_2); |
268 | 35.1k | temp1 = _mm256_packus_epi16(temp1, temp2); // pl ph ql qh |
269 | 35.1k | temp1 = _mm256_and_si256(temp1, flag2_32x8); |
270 | 35.1k | temp2 = _mm256_and_si256(res2,_mm256_xor_si256(flag2_32x8, _mm256_set1_epi16(0xFFFF))); |
271 | 35.1k | temp1 = _mm256_add_epi8(temp1, temp2); |
272 | 35.1k | temp1 = _mm256_permute4x64_epi64(temp1, 0xD8); |
273 | 35.1k | _mm256_storeu2_m128i((__m128i *)(pu1_src + i16_posQ1),(__m128i *)(pu1_HorzPixel + i16_posP1),temp1); |
274 | | |
275 | 35.1k | } |