/src/libavc/common/x86/ih264_deblk_chroma_ssse3.c
Line | Count | Source |
1 | | /****************************************************************************** |
2 | | * |
3 | | * Copyright (C) 2015 The Android Open Source Project |
4 | | * |
5 | | * Licensed under the Apache License, Version 2.0 (the "License"); |
6 | | * you may not use this file except in compliance with the License. |
7 | | * You may obtain a copy of the License at: |
8 | | * |
9 | | * http://www.apache.org/licenses/LICENSE-2.0 |
10 | | * |
11 | | * Unless required by applicable law or agreed to in writing, software |
12 | | * distributed under the License is distributed on an "AS IS" BASIS, |
13 | | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
14 | | * See the License for the specific language governing permissions and |
15 | | * limitations under the License. |
16 | | * |
17 | | ***************************************************************************** |
18 | | * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore |
19 | | */ |
20 | | /*****************************************************************************/ |
21 | | /* */ |
22 | | /* File Name : ih264_deblk_chroma_ssse3.c */ |
23 | | /* */ |
24 | | /* Description : Contains function definitions for deblocking */ |
25 | | /* */ |
26 | | /* List of Functions : ih264_deblk_chroma_vert_bs4_ssse3() */ |
27 | | /* ih264_deblk_chroma_horz_bs4_ssse3() */ |
28 | | /* ih264_deblk_chroma_vert_bslt4_ssse3() */ |
29 | | /* ih264_deblk_chroma_horz_bslt4_ssse3() */ |
30 | | /* ih264_deblk_chroma_vert_bs4_mbaff_ssse3() */ |
31 | | /* ih264_deblk_chroma_vert_bslt4_mbaff_ssse3() */ |
32 | | /* */ |
33 | | /* Issues / Problems : None */ |
34 | | /* */ |
35 | | /* Revision History : */ |
36 | | /* */ |
37 | | /* DD MM YYYY Author(s) Changes (Describe the changes made) */ |
38 | | /* 12 02 2015 Naveen Kumar P Added chroma deblocking ssse3 */ |
39 | | /* intrinsics */ |
40 | | /* */ |
41 | | /*****************************************************************************/ |
42 | | |
43 | | /*****************************************************************************/ |
44 | | /* File Includes */ |
45 | | /*****************************************************************************/ |
46 | | |
47 | | /* System include files */ |
48 | | #include <stdio.h> |
49 | | |
50 | | /* User include files */ |
51 | | #include "ih264_typedefs.h" |
52 | | #include "ih264_platform_macros.h" |
53 | | #include "ih264_deblk_edge_filters.h" |
54 | | #include "ih264_macros.h" |
55 | | |
56 | | /*****************************************************************************/ |
57 | | /* Function Definitions */ |
58 | | /*****************************************************************************/ |
59 | | |
60 | | /*****************************************************************************/ |
61 | | /* */ |
62 | | /* Function Name : ih264_deblk_chroma_vert_bs4_ssse3() */ |
63 | | /* */ |
64 | | /* Description : This function performs filtering of a chroma block */ |
65 | | /* vertical edge when the boundary strength is set to 4 in */ |
66 | | /* high profile. */ |
67 | | /* */ |
68 | | /* Inputs : pu1_src - pointer to the src sample q0 of U */ |
69 | | /* src_strd - source stride */ |
70 | | /* alpha_cb - alpha value for the boundary in U */ |
71 | | /* beta_cb - beta value for the boundary in U */ |
72 | | /* alpha_cr - alpha value for the boundary in V */ |
73 | | /* beta_cr - beta value for the boundary in V */ |
74 | | /* */ |
75 | | /* Globals : None */ |
76 | | /* */ |
77 | | /* Processing : This operation is described in Sec. 8.7.2.4 under the */ |
78 | | /* title "Filtering process for edges for bS equal to 4" in */ |
79 | | /* ITU T Rec H.264 with alpha and beta values different in */ |
80 | | /* U and V. */ |
81 | | /* */ |
82 | | /* Outputs : None */ |
83 | | /* */ |
84 | | /* Returns : None */ |
85 | | /* */ |
86 | | /* Issues : None */ |
87 | | /* */ |
88 | | /* Revision History: */ |
89 | | /* */ |
90 | | /* DD MM YYYY Author(s) Changes (Describe the changes made) */ |
91 | | /* 12 02 2015 Naveen Kumar P Initial version */ |
92 | | /* */ |
93 | | /*****************************************************************************/ |
94 | | void ih264_deblk_chroma_vert_bs4_ssse3(UWORD8 *pu1_src, |
95 | | WORD32 src_strd, |
96 | | WORD32 alpha_cb, |
97 | | WORD32 beta_cb, |
98 | | WORD32 alpha_cr, |
99 | | WORD32 beta_cr) |
100 | 8.45M | { |
101 | 8.45M | UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/ |
102 | 8.45M | WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb; |
103 | 8.45M | WORD32 beta_cbcr = (beta_cr << 16) + beta_cb; |
104 | 8.45M | __m128i linea, lineb, linec, lined, linee, linef, lineg, lineh; |
105 | 8.45M | __m128i temp1, temp2, temp3, temp4; |
106 | | |
107 | 8.45M | __m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8; |
108 | 8.45M | __m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16; |
109 | 8.45M | __m128i flag1, flag2; |
110 | 8.45M | __m128i diff, alpha_cbcr_16x8, beta_cbcr_16x8; |
111 | 8.45M | __m128i zero = _mm_setzero_si128(); |
112 | 8.45M | __m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2; |
113 | | |
114 | | /* Load and transpose the pixel values */ |
115 | 8.45M | linea = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4)); |
116 | 8.45M | lineb = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + src_strd)); |
117 | 8.45M | linec = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd)); |
118 | 8.45M | lined = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd)); |
119 | 8.45M | linee = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 4 * src_strd)); |
120 | 8.45M | linef = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 5 * src_strd)); |
121 | 8.45M | lineg = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 6 * src_strd)); |
122 | 8.45M | lineh = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 7 * src_strd)); |
123 | | |
124 | 8.45M | temp1 = _mm_unpacklo_epi16(linea, lineb); |
125 | 8.45M | temp2 = _mm_unpacklo_epi16(linec, lined); |
126 | 8.45M | temp3 = _mm_unpacklo_epi16(linee, linef); |
127 | 8.45M | temp4 = _mm_unpacklo_epi16(lineg, lineh); |
128 | | |
129 | 8.45M | p1_uv_8x16 = _mm_unpacklo_epi32(temp1, temp2); |
130 | 8.45M | p0_uv_8x16 = _mm_unpacklo_epi32(temp3, temp4); |
131 | 8.45M | q0_uv_8x16 = _mm_unpackhi_epi32(temp1, temp2); |
132 | 8.45M | q1_uv_8x16 = _mm_unpackhi_epi32(temp3, temp4); |
133 | | |
134 | 8.45M | p1_uv_16x8 = _mm_unpacklo_epi64(p1_uv_8x16, p0_uv_8x16); |
135 | 8.45M | p0_uv_16x8 = _mm_unpackhi_epi64(p1_uv_8x16, p0_uv_8x16); |
136 | 8.45M | q0_uv_16x8 = _mm_unpacklo_epi64(q0_uv_8x16, q1_uv_8x16); |
137 | 8.45M | q1_uv_16x8 = _mm_unpackhi_epi64(q0_uv_8x16, q1_uv_8x16); |
138 | | /* End of transpose */ |
139 | | |
140 | 8.45M | q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero); |
141 | 8.45M | q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero); |
142 | 8.45M | p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero); |
143 | 8.45M | p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero); |
144 | | |
145 | 8.45M | diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1 |
146 | 8.45M | diff = _mm_abs_epi16(diff); |
147 | 8.45M | alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr); |
148 | 8.45M | flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff); |
149 | | |
150 | 8.45M | diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2 |
151 | 8.45M | diff = _mm_abs_epi16(diff); |
152 | 8.45M | beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr); |
153 | 8.45M | flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff)); |
154 | | |
155 | 8.45M | diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3 |
156 | 8.45M | diff = _mm_abs_epi16(diff); |
157 | 8.45M | flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff)); |
158 | | |
159 | 8.45M | temp1 = _mm_slli_epi16(p1_uv_8x16, 1); |
160 | 8.45M | temp2 = _mm_add_epi16(p0_uv_8x16, q1_uv_8x16); |
161 | 8.45M | temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2)); |
162 | 8.45M | temp1 = _mm_add_epi16(temp1, temp2); |
163 | 8.45M | p0_uv_8x16_1 = _mm_srai_epi16(temp1, 2); |
164 | | |
165 | 8.45M | temp1 = _mm_slli_epi16(q1_uv_8x16, 1); |
166 | 8.45M | temp2 = _mm_add_epi16(p1_uv_8x16, q0_uv_8x16); |
167 | 8.45M | temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2)); |
168 | 8.45M | temp1 = _mm_add_epi16(temp1, temp2); |
169 | 8.45M | q0_uv_8x16_1 = _mm_srai_epi16(temp1, 2); |
170 | | |
171 | 8.45M | q0_uv_8x16 = _mm_unpackhi_epi8(q0_uv_16x8, zero); |
172 | 8.45M | q1_uv_8x16 = _mm_unpackhi_epi8(q1_uv_16x8, zero); |
173 | 8.45M | p1_uv_8x16 = _mm_unpackhi_epi8(p1_uv_16x8, zero); |
174 | 8.45M | p0_uv_8x16 = _mm_unpackhi_epi8(p0_uv_16x8, zero); |
175 | | |
176 | 8.45M | diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1 |
177 | 8.45M | diff = _mm_abs_epi16(diff); |
178 | 8.45M | alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr); |
179 | 8.45M | flag2 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff); |
180 | | |
181 | 8.45M | diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2 |
182 | 8.45M | diff = _mm_abs_epi16(diff); |
183 | 8.45M | beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr); |
184 | 8.45M | flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff)); |
185 | | |
186 | 8.45M | diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3 |
187 | 8.45M | diff = _mm_abs_epi16(diff); |
188 | 8.45M | flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff)); |
189 | | |
190 | 8.45M | temp1 = _mm_slli_epi16(p1_uv_8x16, 1); |
191 | 8.45M | temp2 = _mm_add_epi16(p0_uv_8x16, q1_uv_8x16); |
192 | 8.45M | temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2)); |
193 | 8.45M | temp1 = _mm_add_epi16(temp1, temp2); |
194 | 8.45M | p0_uv_8x16_2 = _mm_srai_epi16(temp1, 2); |
195 | | |
196 | 8.45M | temp1 = _mm_slli_epi16(q1_uv_8x16, 1); |
197 | 8.45M | temp2 = _mm_add_epi16(p1_uv_8x16, q0_uv_8x16); |
198 | 8.45M | temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2)); |
199 | 8.45M | temp1 = _mm_add_epi16(temp1, temp2); |
200 | 8.45M | q0_uv_8x16_2 = _mm_srai_epi16(temp1, 2); |
201 | | |
202 | 8.45M | p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_2); |
203 | 8.45M | q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_2); |
204 | | |
205 | 8.45M | flag1 = _mm_packs_epi16(flag1, flag2); |
206 | | |
207 | 8.45M | p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8, |
208 | 8.45M | _mm_xor_si128(flag1, _mm_set1_epi8(0xFF))); |
209 | 8.45M | p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1); |
210 | 8.45M | p0_uv_16x8 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2); |
211 | | |
212 | 8.45M | q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8, |
213 | 8.45M | _mm_xor_si128(flag1, _mm_set1_epi8(0xFF))); |
214 | 8.45M | q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1); |
215 | 8.45M | q0_uv_16x8 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2); |
216 | | |
217 | | /* Inverse-transpose and store back */ |
218 | 8.45M | temp1 = _mm_unpacklo_epi16(p1_uv_16x8, p0_uv_16x8); |
219 | 8.45M | temp2 = _mm_unpackhi_epi16(p1_uv_16x8, p0_uv_16x8); |
220 | 8.45M | temp3 = _mm_unpacklo_epi16(q0_uv_16x8, q1_uv_16x8); |
221 | 8.45M | temp4 = _mm_unpackhi_epi16(q0_uv_16x8, q1_uv_16x8); |
222 | | |
223 | 8.45M | linea = _mm_unpacklo_epi32(temp1, temp3); |
224 | 8.45M | lineb = _mm_srli_si128(linea, 8); |
225 | 8.45M | linec = _mm_unpackhi_epi32(temp1, temp3); |
226 | 8.45M | lined = _mm_srli_si128(linec, 8); |
227 | 8.45M | linee = _mm_unpacklo_epi32(temp2, temp4); |
228 | 8.45M | linef = _mm_srli_si128(linee, 8); |
229 | 8.45M | lineg = _mm_unpackhi_epi32(temp2, temp4); |
230 | 8.45M | lineh = _mm_srli_si128(lineg, 8); |
231 | | |
232 | 8.45M | _mm_storel_epi64((__m128i *)(pu1_src_uv - 4), linea); |
233 | 8.45M | _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + src_strd), lineb); |
234 | 8.45M | _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd), linec); |
235 | 8.45M | _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd), lined); |
236 | 8.45M | _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 4 * src_strd), linee); |
237 | 8.45M | _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 5 * src_strd), linef); |
238 | 8.45M | _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 6 * src_strd), lineg); |
239 | 8.45M | _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 7 * src_strd), lineh); |
240 | | |
241 | 8.45M | } |
242 | | |
243 | | /*****************************************************************************/ |
244 | | /* */ |
245 | | /* Function Name : ih264_deblk_chroma_horz_bs4_ssse3() */ |
246 | | /* */ |
247 | | /* Description : This function performs filtering of a chroma block */ |
248 | | /* horizontal edge when the boundary strength is set to 4 */ |
249 | | /* in high profile. */ |
250 | | /* */ |
251 | | /* Inputs : pu1_src - pointer to the src sample q0 of U */ |
252 | | /* src_strd - source stride */ |
253 | | /* alpha_cb - alpha value for the boundary in U */ |
254 | | /* beta_cb - beta value for the boundary in U */ |
255 | | /* alpha_cr - alpha value for the boundary in V */ |
256 | | /* beta_cr - beta value for the boundary in V */ |
257 | | /* */ |
258 | | /* Globals : None */ |
259 | | /* */ |
260 | | /* Processing : This operation is described in Sec. 8.7.2.4 under the */ |
261 | | /* title "Filtering process for edges for bS equal to 4" in */ |
262 | | /* ITU T Rec H.264 with alpha and beta values different in */ |
263 | | /* U and V. */ |
264 | | /* */ |
265 | | /* Outputs : None */ |
266 | | /* */ |
267 | | /* Returns : None */ |
268 | | /* */ |
269 | | /* Issues : None */ |
270 | | /* */ |
271 | | /* Revision History: */ |
272 | | /* */ |
273 | | /* DD MM YYYY Author(s) Changes (Describe the changes made) */ |
274 | | /* 12 02 2015 Naveen Kumar P Initial version */ |
275 | | /* */ |
276 | | /*****************************************************************************/ |
277 | | void ih264_deblk_chroma_horz_bs4_ssse3(UWORD8 *pu1_src, |
278 | | WORD32 src_strd, |
279 | | WORD32 alpha_cb, |
280 | | WORD32 beta_cb, |
281 | | WORD32 alpha_cr, |
282 | | WORD32 beta_cr) |
283 | 8.37M | { |
284 | 8.37M | UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/ |
285 | 8.37M | WORD16 i16_posP1, i16_posP0, i16_posQ1; |
286 | | |
287 | 8.37M | UWORD8 *pu1_HorzPixelUV; /*! < Pointer to the first pixel of the boundary */ |
288 | 8.37M | WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb; |
289 | 8.37M | WORD32 beta_cbcr = (beta_cr << 16) + beta_cb; |
290 | 8.37M | __m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8; |
291 | 8.37M | __m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16; |
292 | 8.37M | __m128i flag1, flag2; |
293 | 8.37M | __m128i diff, alpha_cbcr_16x8, beta_cbcr_16x8; |
294 | 8.37M | __m128i zero = _mm_setzero_si128(); |
295 | 8.37M | __m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2; |
296 | 8.37M | __m128i temp1, temp2; |
297 | | |
298 | 8.37M | pu1_HorzPixelUV = pu1_src_uv - (src_strd << 1); |
299 | | |
300 | 8.37M | i16_posQ1 = src_strd; |
301 | 8.37M | i16_posP0 = src_strd; |
302 | 8.37M | i16_posP1 = 0; |
303 | | |
304 | 8.37M | q0_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_src_uv)); |
305 | 8.37M | q1_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_src_uv + i16_posQ1)); |
306 | 8.37M | p1_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP1)); |
307 | 8.37M | p0_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP0)); |
308 | | |
309 | 8.37M | q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero); |
310 | 8.37M | q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero); |
311 | 8.37M | p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero); |
312 | 8.37M | p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero); |
313 | | |
314 | 8.37M | diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1 |
315 | 8.37M | diff = _mm_abs_epi16(diff); |
316 | 8.37M | alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr); |
317 | 8.37M | flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff); |
318 | | |
319 | 8.37M | diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2 |
320 | 8.37M | diff = _mm_abs_epi16(diff); |
321 | 8.37M | beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr); |
322 | 8.37M | flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff)); |
323 | | |
324 | 8.37M | diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3 |
325 | 8.37M | diff = _mm_abs_epi16(diff); |
326 | 8.37M | flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff)); |
327 | | |
328 | 8.37M | temp1 = _mm_slli_epi16(p1_uv_8x16, 1); |
329 | 8.37M | temp2 = _mm_add_epi16(p0_uv_8x16, q1_uv_8x16); |
330 | 8.37M | temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2)); |
331 | 8.37M | temp1 = _mm_add_epi16(temp1, temp2); |
332 | 8.37M | p0_uv_8x16_1 = _mm_srai_epi16(temp1, 2); |
333 | | |
334 | 8.37M | temp1 = _mm_slli_epi16(q1_uv_8x16, 1); |
335 | 8.37M | temp2 = _mm_add_epi16(p1_uv_8x16, q0_uv_8x16); |
336 | 8.37M | temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2)); |
337 | 8.37M | temp1 = _mm_add_epi16(temp1, temp2); |
338 | 8.37M | q0_uv_8x16_1 = _mm_srai_epi16(temp1, 2); |
339 | | |
340 | 8.37M | q0_uv_8x16 = _mm_unpackhi_epi8(q0_uv_16x8, zero); |
341 | 8.37M | q1_uv_8x16 = _mm_unpackhi_epi8(q1_uv_16x8, zero); |
342 | 8.37M | p1_uv_8x16 = _mm_unpackhi_epi8(p1_uv_16x8, zero); |
343 | 8.37M | p0_uv_8x16 = _mm_unpackhi_epi8(p0_uv_16x8, zero); |
344 | | |
345 | 8.37M | diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1 |
346 | 8.37M | diff = _mm_abs_epi16(diff); |
347 | 8.37M | alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr); |
348 | 8.37M | flag2 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff); |
349 | | |
350 | 8.37M | diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2 |
351 | 8.37M | diff = _mm_abs_epi16(diff); |
352 | 8.37M | beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr); |
353 | 8.37M | flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff)); |
354 | | |
355 | 8.37M | diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3 |
356 | 8.37M | diff = _mm_abs_epi16(diff); |
357 | 8.37M | flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff)); |
358 | | |
359 | 8.37M | temp1 = _mm_slli_epi16(p1_uv_8x16, 1); |
360 | 8.37M | temp2 = _mm_add_epi16(p0_uv_8x16, q1_uv_8x16); |
361 | 8.37M | temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2)); |
362 | 8.37M | temp1 = _mm_add_epi16(temp1, temp2); |
363 | 8.37M | p0_uv_8x16_2 = _mm_srai_epi16(temp1, 2); |
364 | | |
365 | 8.37M | temp1 = _mm_slli_epi16(q1_uv_8x16, 1); |
366 | 8.37M | temp2 = _mm_add_epi16(p1_uv_8x16, q0_uv_8x16); |
367 | 8.37M | temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2)); |
368 | 8.37M | temp1 = _mm_add_epi16(temp1, temp2); |
369 | 8.37M | q0_uv_8x16_2 = _mm_srai_epi16(temp1, 2); |
370 | | |
371 | 8.37M | p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_2); |
372 | 8.37M | q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_2); |
373 | | |
374 | 8.37M | flag1 = _mm_packs_epi16(flag1, flag2); |
375 | | |
376 | 8.37M | p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8, |
377 | 8.37M | _mm_xor_si128(flag1, _mm_set1_epi8(0xFF))); |
378 | 8.37M | p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1); |
379 | 8.37M | p0_uv_8x16_1 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2); |
380 | 8.37M | _mm_storeu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP0), p0_uv_8x16_1); |
381 | | |
382 | 8.37M | q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8, |
383 | 8.37M | _mm_xor_si128(flag1, _mm_set1_epi8(0xFF))); |
384 | 8.37M | q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1); |
385 | 8.37M | q0_uv_8x16_1 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2); |
386 | 8.37M | _mm_storeu_si128((__m128i *)(pu1_src_uv), q0_uv_8x16_1); |
387 | | |
388 | 8.37M | } |
389 | | |
390 | | /*****************************************************************************/ |
391 | | /* */ |
392 | | /* Function Name : ih264_deblk_chroma_vert_bslt4_ssse3() */ |
393 | | /* */ |
394 | | /* Description : This function performs filtering of a chroma block */ |
395 | | /* vertical edge when the boundary strength is less than 4 */ |
396 | | /* in high profile. */ |
397 | | /* */ |
398 | | /* Inputs : pu1_src - pointer to the src sample q0 of U */ |
399 | | /* src_strd - source stride */ |
400 | | /* alpha_cb - alpha value for the boundary in U */ |
401 | | /* beta_cb - beta value for the boundary in U */ |
402 | | /* alpha_cr - alpha value for the boundary in V */ |
403 | | /* beta_cr - beta value for the boundary in V */ |
404 | | /* u4_bs - packed Boundary strength array */ |
405 | | /* pu1_cliptab_cb - tc0_table for U */ |
406 | | /* pu1_cliptab_cr - tc0_table for V */ |
407 | | /* */ |
408 | | /* Globals : None */ |
409 | | /* */ |
410 | | /* Processing : This operation is described in Sec. 8.7.2.3 under the */ |
411 | | /* title "Filtering process for edges for bS less than 4" */ |
412 | | /* in ITU T Rec H.264 with alpha and beta values different */ |
413 | | /* in U and V. */ |
414 | | /* */ |
415 | | /* Outputs : None */ |
416 | | /* */ |
417 | | /* Returns : None */ |
418 | | /* */ |
419 | | /* Issues : None */ |
420 | | /* */ |
421 | | /* Revision History: */ |
422 | | /* */ |
423 | | /* DD MM YYYY Author(s) Changes (Describe the changes made) */ |
424 | | /* 12 02 2015 Naveen Kumar P Initial version */ |
425 | | /* */ |
426 | | /*****************************************************************************/ |
427 | | void ih264_deblk_chroma_vert_bslt4_ssse3(UWORD8 *pu1_src, |
428 | | WORD32 src_strd, |
429 | | WORD32 alpha_cb, |
430 | | WORD32 beta_cb, |
431 | | WORD32 alpha_cr, |
432 | | WORD32 beta_cr, |
433 | | UWORD32 u4_bs, |
434 | | const UWORD8 *pu1_cliptab_cb, |
435 | | const UWORD8 *pu1_cliptab_cr) |
436 | 10.6M | { |
437 | 10.6M | UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/ |
438 | 10.6M | UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3; |
439 | 10.6M | WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb; |
440 | 10.6M | WORD32 beta_cbcr = (beta_cr << 16) + beta_cb; |
441 | 10.6M | __m128i linea, lineb, linec, lined, linee, linef, lineg, lineh; |
442 | 10.6M | __m128i temp1, temp2, temp3, temp4; |
443 | | |
444 | 10.6M | __m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8; |
445 | 10.6M | __m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16; |
446 | 10.6M | __m128i flag_bs, flag1, flag2; |
447 | 10.6M | __m128i diff, diff1, alpha_cbcr_16x8, beta_cbcr_16x8, in_macro; |
448 | 10.6M | __m128i zero = _mm_setzero_si128(); |
449 | 10.6M | __m128i C0_uv_8x16; |
450 | 10.6M | __m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2; |
451 | | |
452 | 10.6M | u1_Bs0 = (u4_bs >> 24) & 0xff; |
453 | 10.6M | u1_Bs1 = (u4_bs >> 16) & 0xff; |
454 | 10.6M | u1_Bs2 = (u4_bs >> 8) & 0xff; |
455 | 10.6M | u1_Bs3 = (u4_bs >> 0) & 0xff; |
456 | | |
457 | 10.6M | flag_bs = _mm_set_epi8(u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs2, u1_Bs2, |
458 | 10.6M | u1_Bs2, u1_Bs2, u1_Bs1, u1_Bs1, u1_Bs1, u1_Bs1, |
459 | 10.6M | u1_Bs0, u1_Bs0, u1_Bs0, u1_Bs0); |
460 | 10.6M | flag_bs = _mm_cmpeq_epi8(flag_bs, zero); //Set flag to 1s and 0s |
461 | 10.6M | flag_bs = _mm_xor_si128(flag_bs, _mm_set1_epi8(0xFF)); //Invert for required mask |
462 | | |
463 | | /* Load and transpose the pixel values */ |
464 | 10.6M | linea = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4)); |
465 | 10.6M | lineb = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + src_strd)); |
466 | 10.6M | linec = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd)); |
467 | 10.6M | lined = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd)); |
468 | 10.6M | linee = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 4 * src_strd)); |
469 | 10.6M | linef = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 5 * src_strd)); |
470 | 10.6M | lineg = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 6 * src_strd)); |
471 | 10.6M | lineh = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 7 * src_strd)); |
472 | | |
473 | 10.6M | temp1 = _mm_unpacklo_epi16(linea, lineb); |
474 | 10.6M | temp2 = _mm_unpacklo_epi16(linec, lined); |
475 | 10.6M | temp3 = _mm_unpacklo_epi16(linee, linef); |
476 | 10.6M | temp4 = _mm_unpacklo_epi16(lineg, lineh); |
477 | | |
478 | 10.6M | p1_uv_8x16 = _mm_unpacklo_epi32(temp1, temp2); |
479 | 10.6M | p0_uv_8x16 = _mm_unpacklo_epi32(temp3, temp4); |
480 | 10.6M | q0_uv_8x16 = _mm_unpackhi_epi32(temp1, temp2); |
481 | 10.6M | q1_uv_8x16 = _mm_unpackhi_epi32(temp3, temp4); |
482 | | |
483 | 10.6M | p1_uv_16x8 = _mm_unpacklo_epi64(p1_uv_8x16, p0_uv_8x16); |
484 | 10.6M | p0_uv_16x8 = _mm_unpackhi_epi64(p1_uv_8x16, p0_uv_8x16); |
485 | 10.6M | q0_uv_16x8 = _mm_unpacklo_epi64(q0_uv_8x16, q1_uv_8x16); |
486 | 10.6M | q1_uv_16x8 = _mm_unpackhi_epi64(q0_uv_8x16, q1_uv_8x16); |
487 | | /* End of transpose */ |
488 | | |
489 | 10.6M | q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero); |
490 | 10.6M | q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero); |
491 | 10.6M | p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero); |
492 | 10.6M | p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero); |
493 | | |
494 | 10.6M | diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1 |
495 | 10.6M | diff = _mm_abs_epi16(diff); |
496 | 10.6M | alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr); |
497 | 10.6M | flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff); |
498 | | |
499 | 10.6M | diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2 |
500 | 10.6M | diff = _mm_abs_epi16(diff); |
501 | 10.6M | beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr); |
502 | 10.6M | flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff)); |
503 | | |
504 | 10.6M | diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3 |
505 | 10.6M | diff = _mm_abs_epi16(diff); |
506 | 10.6M | flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff)); |
507 | | |
508 | 10.6M | diff = _mm_subs_epi16(q0_uv_8x16, p0_uv_8x16); |
509 | 10.6M | diff = _mm_slli_epi16(diff, 2); |
510 | 10.6M | diff1 = _mm_subs_epi16(p1_uv_8x16, q1_uv_8x16); |
511 | 10.6M | diff = _mm_add_epi16(diff, diff1); |
512 | 10.6M | diff = _mm_add_epi16(diff, _mm_set1_epi16(4)); |
513 | 10.6M | in_macro = _mm_srai_epi16(diff, 3); |
514 | | |
515 | 10.6M | C0_uv_8x16 = _mm_set_epi16(pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1], |
516 | 10.6M | pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1], |
517 | 10.6M | pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0], |
518 | 10.6M | pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0]); |
519 | | |
520 | 10.6M | C0_uv_8x16 = _mm_add_epi16(C0_uv_8x16, _mm_set1_epi16(1)); |
521 | | |
522 | 10.6M | in_macro = _mm_min_epi16(C0_uv_8x16, in_macro); //CLIP3 |
523 | 10.6M | C0_uv_8x16 = _mm_subs_epi16(zero, C0_uv_8x16); |
524 | 10.6M | in_macro = _mm_max_epi16(C0_uv_8x16, in_macro); |
525 | | |
526 | 10.6M | p0_uv_8x16_1 = _mm_add_epi16(p0_uv_8x16, in_macro); |
527 | 10.6M | q0_uv_8x16_1 = _mm_sub_epi16(q0_uv_8x16, in_macro); |
528 | | |
529 | 10.6M | q0_uv_8x16 = _mm_unpackhi_epi8(q0_uv_16x8, zero); |
530 | 10.6M | q1_uv_8x16 = _mm_unpackhi_epi8(q1_uv_16x8, zero); |
531 | 10.6M | p1_uv_8x16 = _mm_unpackhi_epi8(p1_uv_16x8, zero); |
532 | 10.6M | p0_uv_8x16 = _mm_unpackhi_epi8(p0_uv_16x8, zero); |
533 | | |
534 | 10.6M | diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1 |
535 | 10.6M | diff = _mm_abs_epi16(diff); |
536 | 10.6M | alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr); |
537 | 10.6M | flag2 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff); |
538 | | |
539 | 10.6M | diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2 |
540 | 10.6M | diff = _mm_abs_epi16(diff); |
541 | 10.6M | beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr); |
542 | 10.6M | flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff)); |
543 | | |
544 | 10.6M | diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3 |
545 | 10.6M | diff = _mm_abs_epi16(diff); |
546 | 10.6M | flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff)); |
547 | | |
548 | 10.6M | diff = _mm_subs_epi16(q0_uv_8x16, p0_uv_8x16); |
549 | 10.6M | diff = _mm_slli_epi16(diff, 2); |
550 | 10.6M | diff1 = _mm_subs_epi16(p1_uv_8x16, q1_uv_8x16); |
551 | 10.6M | diff = _mm_add_epi16(diff, diff1); |
552 | 10.6M | diff = _mm_add_epi16(diff, _mm_set1_epi16(4)); |
553 | 10.6M | in_macro = _mm_srai_epi16(diff, 3); |
554 | | |
555 | 10.6M | C0_uv_8x16 = _mm_set_epi16(pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3], |
556 | 10.6M | pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3], |
557 | 10.6M | pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2], |
558 | 10.6M | pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2]); |
559 | | |
560 | 10.6M | C0_uv_8x16 = _mm_add_epi16(C0_uv_8x16, _mm_set1_epi16(1)); |
561 | | |
562 | 10.6M | in_macro = _mm_min_epi16(C0_uv_8x16, in_macro); //CLIP3 |
563 | 10.6M | C0_uv_8x16 = _mm_subs_epi16(zero, C0_uv_8x16); |
564 | 10.6M | in_macro = _mm_max_epi16(C0_uv_8x16, in_macro); |
565 | | |
566 | 10.6M | p0_uv_8x16_2 = _mm_add_epi16(p0_uv_8x16, in_macro); |
567 | 10.6M | q0_uv_8x16_2 = _mm_sub_epi16(q0_uv_8x16, in_macro); |
568 | | |
569 | 10.6M | p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_2); |
570 | 10.6M | q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_2); |
571 | | |
572 | 10.6M | flag1 = _mm_packs_epi16(flag1, flag2); |
573 | 10.6M | flag1 = _mm_and_si128(flag1, flag_bs); //Final flag (BS condition + other 3 conditions) |
574 | | |
575 | 10.6M | p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8, |
576 | 10.6M | _mm_xor_si128(flag1, _mm_set1_epi8(0xFF))); |
577 | 10.6M | p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1); |
578 | 10.6M | p0_uv_16x8 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2); |
579 | | |
580 | 10.6M | q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8, |
581 | 10.6M | _mm_xor_si128(flag1, _mm_set1_epi8(0xFF))); |
582 | 10.6M | q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1); |
583 | 10.6M | q0_uv_16x8 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2); |
584 | | |
585 | | /* Inverse-transpose and store back */ |
586 | 10.6M | temp1 = _mm_unpacklo_epi16(p1_uv_16x8, p0_uv_16x8); |
587 | 10.6M | temp2 = _mm_unpackhi_epi16(p1_uv_16x8, p0_uv_16x8); |
588 | 10.6M | temp3 = _mm_unpacklo_epi16(q0_uv_16x8, q1_uv_16x8); |
589 | 10.6M | temp4 = _mm_unpackhi_epi16(q0_uv_16x8, q1_uv_16x8); |
590 | | |
591 | 10.6M | linea = _mm_unpacklo_epi32(temp1, temp3); |
592 | 10.6M | lineb = _mm_srli_si128(linea, 8); |
593 | 10.6M | linec = _mm_unpackhi_epi32(temp1, temp3); |
594 | 10.6M | lined = _mm_srli_si128(linec, 8); |
595 | 10.6M | linee = _mm_unpacklo_epi32(temp2, temp4); |
596 | 10.6M | linef = _mm_srli_si128(linee, 8); |
597 | 10.6M | lineg = _mm_unpackhi_epi32(temp2, temp4); |
598 | 10.6M | lineh = _mm_srli_si128(lineg, 8); |
599 | | |
600 | 10.6M | _mm_storel_epi64((__m128i *)(pu1_src_uv - 4), linea); |
601 | 10.6M | _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + src_strd), lineb); |
602 | 10.6M | _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd), linec); |
603 | 10.6M | _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd), lined); |
604 | 10.6M | _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 4 * src_strd), linee); |
605 | 10.6M | _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 5 * src_strd), linef); |
606 | 10.6M | _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 6 * src_strd), lineg); |
607 | 10.6M | _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 7 * src_strd), lineh); |
608 | | |
609 | 10.6M | } |
610 | | |
611 | | /*****************************************************************************/ |
612 | | /* */ |
613 | | /* Function Name : ih264_deblk_chroma_horz_bslt4_ssse3() */ |
614 | | /* */ |
615 | | /* Description : This function performs filtering of a chroma block */ |
616 | | /* horizontal edge when the boundary strength is less than */ |
617 | | /* 4 in high profile. */ |
618 | | /* */ |
619 | | /* Inputs : pu1_src - pointer to the src sample q0 of U */ |
620 | | /* src_strd - source stride */ |
621 | | /* alpha_cb - alpha value for the boundary in U */ |
622 | | /* beta_cb - beta value for the boundary in U */ |
623 | | /* alpha_cr - alpha value for the boundary in V */ |
624 | | /* beta_cr - beta value for the boundary in V */ |
625 | | /* u4_bs - packed Boundary strength array */ |
626 | | /* pu1_cliptab_cb - tc0_table for U */ |
627 | | /* pu1_cliptab_cr - tc0_table for V */ |
628 | | /* */ |
629 | | /* Globals : None */ |
630 | | /* */ |
631 | | /* Processing : This operation is described in Sec. 8.7.2.3 under the */ |
632 | | /* title "Filtering process for edges for bS less than 4" */ |
633 | | /* in ITU T Rec H.264 with alpha and beta values different */ |
634 | | /* in U and V. */ |
635 | | /* */ |
636 | | /* Outputs : None */ |
637 | | /* */ |
638 | | /* Returns : None */ |
639 | | /* */ |
640 | | /* Issues : None */ |
641 | | /* */ |
642 | | /* Revision History: */ |
643 | | /* */ |
644 | | /* DD MM YYYY Author(s) Changes (Describe the changes made) */ |
645 | | /* 12 02 2015 Naveen Kumar P Initial version */ |
646 | | /* */ |
647 | | /*****************************************************************************/ |
648 | | void ih264_deblk_chroma_horz_bslt4_ssse3(UWORD8 *pu1_src, |
649 | | WORD32 src_strd, |
650 | | WORD32 alpha_cb, |
651 | | WORD32 beta_cb, |
652 | | WORD32 alpha_cr, |
653 | | WORD32 beta_cr, |
654 | | UWORD32 u4_bs, |
655 | | const UWORD8 *pu1_cliptab_cb, |
656 | | const UWORD8 *pu1_cliptab_cr) |
657 | 10.7M | { |
658 | 10.7M | UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/ |
659 | 10.7M | WORD16 i16_posP1, i16_posP0, i16_posQ1; |
660 | 10.7M | UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3; |
661 | | |
662 | 10.7M | UWORD8 *pu1_HorzPixelUV; /*! < Pointer to the first pixel of the boundary */ |
663 | 10.7M | WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb; |
664 | 10.7M | WORD32 beta_cbcr = (beta_cr << 16) + beta_cb; |
665 | 10.7M | __m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8; |
666 | 10.7M | __m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16; |
667 | 10.7M | __m128i flag_bs, flag1, flag2; |
668 | 10.7M | __m128i diff, diff1, alpha_cbcr_16x8, beta_cbcr_16x8, in_macro; |
669 | 10.7M | __m128i zero = _mm_setzero_si128(); |
670 | 10.7M | __m128i C0_uv_8x16; |
671 | 10.7M | __m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2; |
672 | | |
673 | 10.7M | pu1_HorzPixelUV = pu1_src_uv - (src_strd << 1); |
674 | | |
675 | 10.7M | i16_posQ1 = src_strd; |
676 | 10.7M | i16_posP0 = src_strd; |
677 | 10.7M | i16_posP1 = 0; |
678 | | |
679 | 10.7M | u1_Bs0 = (u4_bs >> 24) & 0xff; |
680 | 10.7M | u1_Bs1 = (u4_bs >> 16) & 0xff; |
681 | 10.7M | u1_Bs2 = (u4_bs >> 8) & 0xff; |
682 | 10.7M | u1_Bs3 = (u4_bs >> 0) & 0xff; |
683 | | |
684 | 10.7M | flag_bs = _mm_set_epi8(u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs2, u1_Bs2, |
685 | 10.7M | u1_Bs2, u1_Bs2, u1_Bs1, u1_Bs1, u1_Bs1, u1_Bs1, |
686 | 10.7M | u1_Bs0, u1_Bs0, u1_Bs0, u1_Bs0); |
687 | 10.7M | flag_bs = _mm_cmpeq_epi8(flag_bs, zero); //Set flag to 1s and 0s |
688 | 10.7M | flag_bs = _mm_xor_si128(flag_bs, _mm_set1_epi8(0xFF)); //Invert for required mask |
689 | | |
690 | 10.7M | q0_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_src_uv)); |
691 | 10.7M | q1_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_src_uv + i16_posQ1)); |
692 | 10.7M | p1_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP1)); |
693 | 10.7M | p0_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP0)); |
694 | | |
695 | 10.7M | q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero); |
696 | 10.7M | q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero); |
697 | 10.7M | p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero); |
698 | 10.7M | p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero); |
699 | | |
700 | 10.7M | diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1 |
701 | 10.7M | diff = _mm_abs_epi16(diff); |
702 | 10.7M | alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr); |
703 | 10.7M | flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff); |
704 | | |
705 | 10.7M | diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2 |
706 | 10.7M | diff = _mm_abs_epi16(diff); |
707 | 10.7M | beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr); |
708 | 10.7M | flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff)); |
709 | | |
710 | 10.7M | diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3 |
711 | 10.7M | diff = _mm_abs_epi16(diff); |
712 | 10.7M | flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff)); |
713 | | |
714 | 10.7M | diff = _mm_subs_epi16(q0_uv_8x16, p0_uv_8x16); |
715 | 10.7M | diff = _mm_slli_epi16(diff, 2); |
716 | 10.7M | diff1 = _mm_subs_epi16(p1_uv_8x16, q1_uv_8x16); |
717 | 10.7M | diff = _mm_add_epi16(diff, diff1); |
718 | 10.7M | diff = _mm_add_epi16(diff, _mm_set1_epi16(4)); |
719 | 10.7M | in_macro = _mm_srai_epi16(diff, 3); |
720 | | |
721 | 10.7M | C0_uv_8x16 = _mm_set_epi16(pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1], |
722 | 10.7M | pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1], |
723 | 10.7M | pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0], |
724 | 10.7M | pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0]); |
725 | | |
726 | 10.7M | C0_uv_8x16 = _mm_add_epi16(C0_uv_8x16, _mm_set1_epi16(1)); |
727 | | |
728 | 10.7M | in_macro = _mm_min_epi16(C0_uv_8x16, in_macro); //CLIP3 |
729 | 10.7M | C0_uv_8x16 = _mm_subs_epi16(zero, C0_uv_8x16); |
730 | 10.7M | in_macro = _mm_max_epi16(C0_uv_8x16, in_macro); |
731 | | |
732 | 10.7M | p0_uv_8x16_1 = _mm_add_epi16(p0_uv_8x16, in_macro); |
733 | 10.7M | q0_uv_8x16_1 = _mm_sub_epi16(q0_uv_8x16, in_macro); |
734 | | |
735 | 10.7M | q0_uv_8x16 = _mm_unpackhi_epi8(q0_uv_16x8, zero); |
736 | 10.7M | q1_uv_8x16 = _mm_unpackhi_epi8(q1_uv_16x8, zero); |
737 | 10.7M | p1_uv_8x16 = _mm_unpackhi_epi8(p1_uv_16x8, zero); |
738 | 10.7M | p0_uv_8x16 = _mm_unpackhi_epi8(p0_uv_16x8, zero); |
739 | | |
740 | 10.7M | diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1 |
741 | 10.7M | diff = _mm_abs_epi16(diff); |
742 | 10.7M | alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr); |
743 | 10.7M | flag2 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff); |
744 | | |
745 | 10.7M | diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2 |
746 | 10.7M | diff = _mm_abs_epi16(diff); |
747 | 10.7M | beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr); |
748 | 10.7M | flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff)); |
749 | | |
750 | 10.7M | diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3 |
751 | 10.7M | diff = _mm_abs_epi16(diff); |
752 | 10.7M | flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff)); |
753 | | |
754 | 10.7M | diff = _mm_subs_epi16(q0_uv_8x16, p0_uv_8x16); |
755 | 10.7M | diff = _mm_slli_epi16(diff, 2); |
756 | 10.7M | diff1 = _mm_subs_epi16(p1_uv_8x16, q1_uv_8x16); |
757 | 10.7M | diff = _mm_add_epi16(diff, diff1); |
758 | 10.7M | diff = _mm_add_epi16(diff, _mm_set1_epi16(4)); |
759 | 10.7M | in_macro = _mm_srai_epi16(diff, 3); |
760 | | |
761 | 10.7M | C0_uv_8x16 = _mm_set_epi16(pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3], |
762 | 10.7M | pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3], |
763 | 10.7M | pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2], |
764 | 10.7M | pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2]); |
765 | | |
766 | 10.7M | C0_uv_8x16 = _mm_add_epi16(C0_uv_8x16, _mm_set1_epi16(1)); |
767 | | |
768 | 10.7M | in_macro = _mm_min_epi16(C0_uv_8x16, in_macro); //CLIP3 |
769 | 10.7M | C0_uv_8x16 = _mm_subs_epi16(zero, C0_uv_8x16); |
770 | 10.7M | in_macro = _mm_max_epi16(C0_uv_8x16, in_macro); |
771 | | |
772 | 10.7M | p0_uv_8x16_2 = _mm_add_epi16(p0_uv_8x16, in_macro); |
773 | 10.7M | q0_uv_8x16_2 = _mm_sub_epi16(q0_uv_8x16, in_macro); |
774 | | |
775 | 10.7M | p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_2); |
776 | 10.7M | q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_2); |
777 | | |
778 | 10.7M | flag1 = _mm_packs_epi16(flag1, flag2); |
779 | 10.7M | flag1 = _mm_and_si128(flag1, flag_bs); //Final flag (BS condition + other 3 conditions) |
780 | | |
781 | 10.7M | p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8, |
782 | 10.7M | _mm_xor_si128(flag1, _mm_set1_epi8(0xFF))); |
783 | 10.7M | p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1); |
784 | 10.7M | p0_uv_8x16_1 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2); |
785 | 10.7M | _mm_storeu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP0), p0_uv_8x16_1); |
786 | | |
787 | 10.7M | q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8, |
788 | 10.7M | _mm_xor_si128(flag1, _mm_set1_epi8(0xFF))); |
789 | 10.7M | q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1); |
790 | 10.7M | q0_uv_8x16_1 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2); |
791 | 10.7M | _mm_storeu_si128((__m128i *)(pu1_src_uv), q0_uv_8x16_1); |
792 | | |
793 | 10.7M | } |
794 | | |
795 | | /*****************************************************************************/ |
796 | | /* */ |
797 | | /* Function Name : ih264_deblk_chroma_vert_bs4_mbaff_ssse3() */ |
798 | | /* */ |
799 | | /* Description : This function performs filtering of a chroma block */ |
800 | | /* vertical edge when boundary strength is set to 4 in high */ |
801 | | /* profile. */ |
802 | | /* */ |
803 | | /* Inputs : pu1_src - pointer to the src sample q0 of U */ |
804 | | /* src_strd - source stride */ |
805 | | /* alpha_cb - alpha value for the boundary in U */ |
806 | | /* beta_cb - beta value for the boundary in U */ |
807 | | /* alpha_cr - alpha value for the boundary in V */ |
808 | | /* beta_cr - beta value for the boundary in V */ |
809 | | /* u4_bs - packed Boundary strength array */ |
810 | | /* pu1_cliptab_cb - tc0_table for U */ |
811 | | /* pu1_cliptab_cr - tc0_table for V */ |
812 | | /* */ |
813 | | /* Globals : None */ |
814 | | /* */ |
815 | | /* Processing : When the function is called twice, this operation is as */ |
816 | | /* described in Sec. 8.7.2.4 under the title "Filtering */ |
817 | | /* process for edges for bS equal to 4" in ITU T Rec H.264 */ |
818 | | /* with alpha and beta values different in U and V. */ |
819 | | /* */ |
820 | | /* Outputs : None */ |
821 | | /* */ |
822 | | /* Returns : None */ |
823 | | /* */ |
824 | | /* Issues : None */ |
825 | | /* */ |
826 | | /* Revision History: */ |
827 | | /* */ |
828 | | /* DD MM YYYY Author(s) Changes (Describe the changes made) */ |
829 | | /* 12 02 2015 Naveen Kumar P Initial version */ |
830 | | /* */ |
831 | | /*****************************************************************************/ |
832 | | void ih264_deblk_chroma_vert_bs4_mbaff_ssse3(UWORD8 *pu1_src, |
833 | | WORD32 src_strd, |
834 | | WORD32 alpha_cb, |
835 | | WORD32 beta_cb, |
836 | | WORD32 alpha_cr, |
837 | | WORD32 beta_cr) |
838 | 20.6k | { |
839 | 20.6k | UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/ |
840 | 20.6k | WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb; |
841 | 20.6k | WORD32 beta_cbcr = (beta_cr << 16) + beta_cb; |
842 | 20.6k | __m128i linea, lineb, linec, lined; |
843 | 20.6k | __m128i temp1, temp2; |
844 | | |
845 | 20.6k | __m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8; |
846 | 20.6k | __m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16; |
847 | 20.6k | __m128i flag1; |
848 | 20.6k | __m128i diff, alpha_cbcr_16x8, beta_cbcr_16x8; |
849 | 20.6k | __m128i zero = _mm_setzero_si128(); |
850 | 20.6k | __m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2; |
851 | | |
852 | | /* Load and transpose the pixel values */ |
853 | 20.6k | linea = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4)); |
854 | 20.6k | lineb = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + src_strd)); |
855 | 20.6k | linec = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd)); |
856 | 20.6k | lined = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd)); |
857 | | |
858 | 20.6k | temp1 = _mm_unpacklo_epi16(linea, lineb); |
859 | 20.6k | temp2 = _mm_unpacklo_epi16(linec, lined); |
860 | | |
861 | 20.6k | p1_uv_16x8 = _mm_unpacklo_epi32(temp1, temp2); |
862 | 20.6k | p0_uv_16x8 = _mm_srli_si128(p1_uv_16x8, 8); |
863 | 20.6k | q0_uv_16x8 = _mm_unpackhi_epi32(temp1, temp2); |
864 | 20.6k | q1_uv_16x8 = _mm_srli_si128(q0_uv_16x8, 8); |
865 | | /* End of transpose */ |
866 | | |
867 | 20.6k | q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero); |
868 | 20.6k | q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero); |
869 | 20.6k | p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero); |
870 | 20.6k | p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero); |
871 | | |
872 | 20.6k | diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1 |
873 | 20.6k | diff = _mm_abs_epi16(diff); |
874 | 20.6k | alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr); |
875 | 20.6k | flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff); |
876 | | |
877 | 20.6k | diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2 |
878 | 20.6k | diff = _mm_abs_epi16(diff); |
879 | 20.6k | beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr); |
880 | 20.6k | flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff)); |
881 | | |
882 | 20.6k | diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3 |
883 | 20.6k | diff = _mm_abs_epi16(diff); |
884 | 20.6k | flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff)); |
885 | | |
886 | 20.6k | temp1 = _mm_slli_epi16(p1_uv_8x16, 1); |
887 | 20.6k | temp2 = _mm_add_epi16(p0_uv_8x16, q1_uv_8x16); |
888 | 20.6k | temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2)); |
889 | 20.6k | temp1 = _mm_add_epi16(temp1, temp2); |
890 | 20.6k | p0_uv_8x16_1 = _mm_srai_epi16(temp1, 2); |
891 | | |
892 | 20.6k | temp1 = _mm_slli_epi16(q1_uv_8x16, 1); |
893 | 20.6k | temp2 = _mm_add_epi16(p1_uv_8x16, q0_uv_8x16); |
894 | 20.6k | temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2)); |
895 | 20.6k | temp1 = _mm_add_epi16(temp1, temp2); |
896 | 20.6k | q0_uv_8x16_1 = _mm_srai_epi16(temp1, 2); |
897 | | |
898 | 20.6k | p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_1); |
899 | 20.6k | q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_1); |
900 | | |
901 | 20.6k | flag1 = _mm_packs_epi16(flag1, flag1); |
902 | | |
903 | 20.6k | p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8, |
904 | 20.6k | _mm_xor_si128(flag1, _mm_set1_epi8(0xFF))); |
905 | 20.6k | p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1); |
906 | 20.6k | p0_uv_16x8 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2); |
907 | | |
908 | 20.6k | q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8, |
909 | 20.6k | _mm_xor_si128(flag1, _mm_set1_epi8(0xFF))); |
910 | 20.6k | q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1); |
911 | 20.6k | q0_uv_16x8 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2); |
912 | | |
913 | | /* Inverse-transpose and store back */ |
914 | 20.6k | temp1 = _mm_unpacklo_epi16(p1_uv_16x8, p0_uv_16x8); |
915 | 20.6k | temp2 = _mm_unpacklo_epi16(q0_uv_16x8, q1_uv_16x8); |
916 | | |
917 | 20.6k | linea = _mm_unpacklo_epi32(temp1, temp2); |
918 | 20.6k | lineb = _mm_srli_si128(linea, 8); |
919 | 20.6k | linec = _mm_unpackhi_epi32(temp1, temp2); |
920 | 20.6k | lined = _mm_srli_si128(linec, 8); |
921 | | |
922 | 20.6k | _mm_storel_epi64((__m128i *)(pu1_src_uv - 4), linea); |
923 | 20.6k | _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + src_strd), lineb); |
924 | 20.6k | _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd), linec); |
925 | 20.6k | _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd), lined); |
926 | | |
927 | 20.6k | } |
928 | | |
929 | | /*****************************************************************************/ |
930 | | /* */ |
931 | | /* Function Name : ih264_deblk_chroma_vert_bslt4_mbaff_ssse3() */ |
932 | | /* */ |
933 | | /* Description : This function performs filtering of a chroma block */ |
934 | | /* vertical edge when boundary strength is less than 4 in */ |
935 | | /* high profile. */ |
936 | | /* */ |
937 | | /* Inputs : pu1_src - pointer to the src sample q0 of U */ |
938 | | /* src_strd - source stride */ |
939 | | /* alpha_cb - alpha value for the boundary in U */ |
940 | | /* beta_cb - beta value for the boundary in U */ |
941 | | /* alpha_cr - alpha value for the boundary in V */ |
942 | | /* beta_cr - beta value for the boundary in V */ |
943 | | /* u4_bs - packed Boundary strength array */ |
944 | | /* pu1_cliptab_cb - tc0_table for U */ |
945 | | /* pu1_cliptab_cr - tc0_table for V */ |
946 | | /* */ |
947 | | /* Globals : None */ |
948 | | /* */ |
949 | | /* Processing : When the function is called twice, this operation is as */ |
950 | | /* described in Sec. 8.7.2.3 under the title "Filtering */ |
951 | | /* process for edges for bS less than 4" in ITU T Rec H.264 */ |
952 | | /* with alpha and beta values different in U and V. */ |
953 | | /* */ |
954 | | /* Outputs : None */ |
955 | | /* */ |
956 | | /* Returns : None */ |
957 | | /* */ |
958 | | /* Issues : None */ |
959 | | /* */ |
960 | | /* Revision History: */ |
961 | | /* */ |
962 | | /* DD MM YYYY Author(s) Changes (Describe the changes made) */ |
963 | | /* 12 02 2015 Naveen Kumar P Initial version */ |
964 | | /* */ |
965 | | /*****************************************************************************/ |
966 | | void ih264_deblk_chroma_vert_bslt4_mbaff_ssse3(UWORD8 *pu1_src, |
967 | | WORD32 src_strd, |
968 | | WORD32 alpha_cb, |
969 | | WORD32 beta_cb, |
970 | | WORD32 alpha_cr, |
971 | | WORD32 beta_cr, |
972 | | UWORD32 u4_bs, |
973 | | const UWORD8 *pu1_cliptab_cb, |
974 | | const UWORD8 *pu1_cliptab_cr) |
975 | 12.0k | { |
976 | 12.0k | UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/ |
977 | 12.0k | UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3; |
978 | 12.0k | WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb; |
979 | 12.0k | WORD32 beta_cbcr = (beta_cr << 16) + beta_cb; |
980 | 12.0k | __m128i linea, lineb, linec, lined; |
981 | 12.0k | __m128i temp1, temp2; |
982 | | |
983 | 12.0k | __m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8; |
984 | 12.0k | __m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16; |
985 | 12.0k | __m128i flag_bs, flag1; |
986 | 12.0k | __m128i diff, diff1, alpha_cbcr_16x8, beta_cbcr_16x8, in_macro; |
987 | 12.0k | __m128i zero = _mm_setzero_si128(); |
988 | 12.0k | __m128i C0_uv_8x16; |
989 | 12.0k | __m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2; |
990 | | |
991 | 12.0k | u1_Bs0 = (u4_bs >> 24) & 0xff; |
992 | 12.0k | u1_Bs1 = (u4_bs >> 16) & 0xff; |
993 | 12.0k | u1_Bs2 = (u4_bs >> 8) & 0xff; |
994 | 12.0k | u1_Bs3 = (u4_bs >> 0) & 0xff; |
995 | | |
996 | 12.0k | flag_bs = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, u1_Bs3, u1_Bs3, u1_Bs2, |
997 | 12.0k | u1_Bs2, u1_Bs1, u1_Bs1, u1_Bs0, u1_Bs0); |
998 | 12.0k | flag_bs = _mm_cmpeq_epi8(flag_bs, zero); //Set flag to 1s and 0s |
999 | 12.0k | flag_bs = _mm_xor_si128(flag_bs, _mm_set1_epi8(0xFF)); //Invert for required mask |
1000 | | |
1001 | | /* Load and transpose the pixel values */ |
1002 | 12.0k | linea = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4)); |
1003 | 12.0k | lineb = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + src_strd)); |
1004 | 12.0k | linec = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd)); |
1005 | 12.0k | lined = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd)); |
1006 | | |
1007 | 12.0k | temp1 = _mm_unpacklo_epi16(linea, lineb); |
1008 | 12.0k | temp2 = _mm_unpacklo_epi16(linec, lined); |
1009 | | |
1010 | 12.0k | p1_uv_16x8 = _mm_unpacklo_epi32(temp1, temp2); |
1011 | 12.0k | p0_uv_16x8 = _mm_srli_si128(p1_uv_16x8, 8); |
1012 | 12.0k | q0_uv_16x8 = _mm_unpackhi_epi32(temp1, temp2); |
1013 | 12.0k | q1_uv_16x8 = _mm_srli_si128(q0_uv_16x8, 8); |
1014 | | /* End of transpose */ |
1015 | | |
1016 | 12.0k | q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero); |
1017 | 12.0k | q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero); |
1018 | 12.0k | p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero); |
1019 | 12.0k | p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero); |
1020 | | |
1021 | 12.0k | diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1 |
1022 | 12.0k | diff = _mm_abs_epi16(diff); |
1023 | 12.0k | alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr); |
1024 | 12.0k | flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff); |
1025 | | |
1026 | 12.0k | diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2 |
1027 | 12.0k | diff = _mm_abs_epi16(diff); |
1028 | 12.0k | beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr); |
1029 | 12.0k | flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff)); |
1030 | | |
1031 | 12.0k | diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3 |
1032 | 12.0k | diff = _mm_abs_epi16(diff); |
1033 | 12.0k | flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff)); |
1034 | | |
1035 | 12.0k | diff = _mm_subs_epi16(q0_uv_8x16, p0_uv_8x16); |
1036 | 12.0k | diff = _mm_slli_epi16(diff, 2); |
1037 | 12.0k | diff1 = _mm_subs_epi16(p1_uv_8x16, q1_uv_8x16); |
1038 | 12.0k | diff = _mm_add_epi16(diff, diff1); |
1039 | 12.0k | diff = _mm_add_epi16(diff, _mm_set1_epi16(4)); |
1040 | 12.0k | in_macro = _mm_srai_epi16(diff, 3); |
1041 | | |
1042 | 12.0k | C0_uv_8x16 = _mm_set_epi16(pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3], |
1043 | 12.0k | pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2], |
1044 | 12.0k | pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1], |
1045 | 12.0k | pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0]); |
1046 | | |
1047 | 12.0k | C0_uv_8x16 = _mm_add_epi16(C0_uv_8x16, _mm_set1_epi16(1)); |
1048 | | |
1049 | 12.0k | in_macro = _mm_min_epi16(C0_uv_8x16, in_macro); //CLIP3 |
1050 | 12.0k | C0_uv_8x16 = _mm_subs_epi16(zero, C0_uv_8x16); |
1051 | 12.0k | in_macro = _mm_max_epi16(C0_uv_8x16, in_macro); |
1052 | | |
1053 | 12.0k | p0_uv_8x16_1 = _mm_add_epi16(p0_uv_8x16, in_macro); |
1054 | 12.0k | q0_uv_8x16_1 = _mm_sub_epi16(q0_uv_8x16, in_macro); |
1055 | | |
1056 | 12.0k | p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_1); |
1057 | 12.0k | q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_1); |
1058 | | |
1059 | 12.0k | flag1 = _mm_packs_epi16(flag1, flag1); |
1060 | 12.0k | flag1 = _mm_and_si128(flag1, flag_bs); //Final flag (BS condition + other 3 conditions) |
1061 | | |
1062 | 12.0k | p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8, |
1063 | 12.0k | _mm_xor_si128(flag1, _mm_set1_epi8(0xFF))); |
1064 | 12.0k | p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1); |
1065 | 12.0k | p0_uv_16x8 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2); |
1066 | | |
1067 | 12.0k | q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8, |
1068 | 12.0k | _mm_xor_si128(flag1, _mm_set1_epi8(0xFF))); |
1069 | 12.0k | q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1); |
1070 | 12.0k | q0_uv_16x8 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2); |
1071 | | |
1072 | | /* Inverse-transpose and store back */ |
1073 | 12.0k | temp1 = _mm_unpacklo_epi16(p1_uv_16x8, p0_uv_16x8); |
1074 | 12.0k | temp2 = _mm_unpacklo_epi16(q0_uv_16x8, q1_uv_16x8); |
1075 | | |
1076 | 12.0k | linea = _mm_unpacklo_epi32(temp1, temp2); |
1077 | 12.0k | lineb = _mm_srli_si128(linea, 8); |
1078 | 12.0k | linec = _mm_unpackhi_epi32(temp1, temp2); |
1079 | 12.0k | lined = _mm_srli_si128(linec, 8); |
1080 | | |
1081 | 12.0k | _mm_storel_epi64((__m128i *)(pu1_src_uv - 4), linea); |
1082 | 12.0k | _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + src_strd), lineb); |
1083 | 12.0k | _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd), linec); |
1084 | 12.0k | _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd), lined); |
1085 | | |
1086 | 12.0k | } |
1087 | | |