/src/libavc/encoder/x86/ih264e_half_pel_ssse3.c
Line | Count | Source |
1 | | /****************************************************************************** |
2 | | * |
3 | | * Copyright (C) 2015 The Android Open Source Project |
4 | | * |
5 | | * Licensed under the Apache License, Version 2.0 (the "License"); |
6 | | * you may not use this file except in compliance with the License. |
7 | | * You may obtain a copy of the License at: |
8 | | * |
9 | | * http://www.apache.org/licenses/LICENSE-2.0 |
10 | | * |
11 | | * Unless required by applicable law or agreed to in writing, software |
12 | | * distributed under the License is distributed on an "AS IS" BASIS, |
13 | | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
14 | | * See the License for the specific language governing permissions and |
15 | | * limitations under the License. |
16 | | * |
17 | | ***************************************************************************** |
18 | | * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore |
19 | | */ |
20 | | /** |
21 | | ******************************************************************************* |
22 | | * @file |
23 | | * ih264e_half_pel_ssse3.c |
24 | | * |
25 | | * @brief |
26 | | * Contains the x86 intrinsic function definitions for 6-tap horizontal filter |
27 | | * and cascaded 2D filter used in motion estimation in H264 encoder. |
28 | | * |
29 | | * @author |
30 | | * ittiam |
31 | | * |
32 | | * @par List of Functions: |
33 | | * ih264e_sixtapfilter_horz_ssse3 |
34 | | * ih264e_sixtap_filter_2dvh_vert_ssse3 |
35 | | * |
36 | | * @remarks |
37 | | * none |
38 | | * |
39 | | ******************************************************************************* |
40 | | */ |
41 | | |
42 | | /*****************************************************************************/ |
43 | | /* File Includes */ |
44 | | /*****************************************************************************/ |
45 | | |
46 | | /* System include files */ |
47 | | #include <stdio.h> |
48 | | #include <assert.h> |
49 | | #include <limits.h> |
50 | | |
51 | | /* User include files */ |
52 | | #include "ih264_typedefs.h" |
53 | | #include "ithread.h" |
54 | | #include "ih264_platform_macros.h" |
55 | | #include "ih264_defs.h" |
56 | | #include "ih264e_half_pel.h" |
57 | | #include "ih264_macros.h" |
58 | | #include "ih264_inter_pred_filters.h" |
59 | | #include "ih264_mem_fns.h" |
60 | | #include "ih264_padding.h" |
61 | | #include "ih264_intra_pred_filters.h" |
62 | | #include "ih264_deblk_edge_filters.h" |
63 | | |
64 | | |
65 | | /*****************************************************************************/ |
66 | | /* Function Definitions */ |
67 | | /*****************************************************************************/ |
68 | | /* |
69 | | ******************************************************************************* |
70 | | * |
71 | | * @brief |
72 | | * Interprediction luma filter for horizontal input(Filter run for width = 17 |
73 | | * and height =16) |
74 | | * |
75 | | * @par Description: |
76 | | * Applies a 6 tap horizontal filter .The output is clipped to 8 bits sec. |
77 | | * 8.4.2.2.1 titled "Luma sample interpolation process" |
78 | | * |
79 | | * @param[in] pu1_src |
80 | | * UWORD8 pointer to the source |
81 | | * |
82 | | * @param[out] pu1_dst |
83 | | * UWORD8 pointer to the destination |
84 | | * |
85 | | * @param[in] src_strd |
86 | | * integer source stride |
87 | | * |
88 | | * @param[in] dst_strd |
89 | | * integer destination stride |
90 | | * |
91 | | * @returns |
92 | | * none |
93 | | * |
94 | | * @remarks |
95 | | * none |
96 | | * |
97 | | ******************************************************************************* |
98 | | */ |
99 | | void ih264e_sixtapfilter_horz_ssse3(UWORD8 *pu1_src, |
100 | | UWORD8 *pu1_dst, |
101 | | WORD32 src_strd, |
102 | | WORD32 dst_strd) |
103 | 403k | { |
104 | 403k | WORD32 ht; |
105 | 403k | WORD32 tmp; |
106 | | |
107 | 403k | __m128i src_r0_16x8b, src_r1_16x8b, src_r0_sht_16x8b, src_r1_sht_16x8b; |
108 | 403k | __m128i src_r0_t1_16x8b, src_r1_t1_16x8b; |
109 | | |
110 | 403k | __m128i res_r0_t1_8x16b, res_r0_t2_8x16b, res_r0_t3_8x16b; |
111 | 403k | __m128i res_r1_t1_8x16b, res_r1_t2_8x16b, res_r1_t3_8x16b; |
112 | | |
113 | 403k | __m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b; |
114 | 403k | __m128i const_val16_8x16b; |
115 | | |
116 | 403k | ht = 16; |
117 | 403k | pu1_src -= 2; // the filter input starts from x[-2] (till x[3]) |
118 | | |
119 | 403k | coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01); //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 |
120 | 403k | coeff2_3_16x8b = _mm_set1_epi32(0x14141414); //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 |
121 | 403k | coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB); //c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 |
122 | | //c0 = c5 = 1, c1 = c4 = -5, c2 = c3 = 20 |
123 | 403k | const_val16_8x16b = _mm_set1_epi16(16); |
124 | | |
125 | | //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9..... |
126 | | //Row0 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9..... |
127 | | //b0 is same a8. Similarly other bn pixels are same as a(n+8) pixels. |
128 | | |
129 | 403k | do |
130 | 6.43M | { |
131 | 6.43M | src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src); //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 |
132 | 6.43M | src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 8)); //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 |
133 | | |
134 | 6.43M | src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1); //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0 |
135 | 6.43M | src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1); //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0 |
136 | | |
137 | 6.43M | src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 |
138 | 6.43M | src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 |
139 | | |
140 | 6.43M | res_r0_t1_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff0_1_16x8b); //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1 |
141 | | //a4*c0+a5*c1 a5*c0+a6*c1 a6*c0+a7*c1 a7*c0+a8*c1 |
142 | 6.43M | res_r1_t1_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff0_1_16x8b); //b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1 |
143 | | //b4*c0+b5*c1 b5*c0+b6*c1 b6*c0+b7*c1 b7*c0+b8*c1 |
144 | | |
145 | 6.43M | src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2); //a2 a3 a4 a5 a6 a7 a8 a9....a15 0 0 |
146 | 6.43M | src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2); //b2 b3 b4 b5 b6 b7 b8 b9....b15 0 0 |
147 | | |
148 | 6.43M | src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2); //a3 a4 a5 a6 a7 a8 a9....a15 0 0 0 |
149 | 6.43M | src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2); //b3 b4 b5 b6 b7 b8 b9....b15 0 0 0 |
150 | | |
151 | 6.43M | src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 |
152 | 6.43M | src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 a8 a9 a9 a10 |
153 | | |
154 | 6.43M | res_r0_t2_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff2_3_16x8b); //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3 |
155 | | //a6*c2+a7*c3 a7*c2+a8*c3 a8*c2+a9*c3 a9*c2+a10*c3 |
156 | 6.43M | res_r1_t2_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff2_3_16x8b); //b2*c2+b3*c3 b3*c2+b4*c3 b2*c4+b5*c3 b5*c2+b6*c3 |
157 | | //b6*c2+b7*c3 b7*c2+b8*c3 b8*c2+b9*c3 b9*c2+b10*c3 |
158 | | |
159 | 6.43M | src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2); //a4 a5 a6 a7 a8 a9....a15 0 0 0 0 |
160 | 6.43M | src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2); //b4 b5 b6 b7 b8 b9....b15 0 0 0 0 |
161 | | |
162 | 6.43M | src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2); //a5 a6 a7 a8 a9....a15 0 0 0 0 0 |
163 | 6.43M | src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2); //b5 b6 b7 b8 b9....b15 0 0 0 0 0 |
164 | | |
165 | 6.43M | src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 a10 a11 a11 a12 |
166 | 6.43M | src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10 b10 b11 b11 b12 |
167 | | |
168 | 6.43M | res_r0_t3_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff4_5_16x8b); //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5 |
169 | | //a8*c4+a9*c5 a9*c4+a10*c5 a10*c4+a11*c5 a11*c4+a12*c5 |
170 | 6.43M | res_r1_t3_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff4_5_16x8b); //b4*c4+b5*c5 b5*c4+b6*c5 b6*c4+b7*c5 b7*c4+b8*c5 |
171 | | //b8*c4+b9*c5 b9*c4+b10*c5 b10*c4+b11*c5 b11*c4+b12*c5 |
172 | 6.43M | res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t2_8x16b); |
173 | 6.43M | res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t2_8x16b); |
174 | 6.43M | res_r0_t3_8x16b = _mm_add_epi16(res_r0_t3_8x16b, const_val16_8x16b); |
175 | 6.43M | res_r1_t3_8x16b = _mm_add_epi16(res_r1_t3_8x16b, const_val16_8x16b); |
176 | 6.43M | res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t3_8x16b); |
177 | 6.43M | res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t3_8x16b); |
178 | | |
179 | 6.43M | tmp = ((pu1_src[18] + pu1_src[19]) << 2) - pu1_src[17] - pu1_src[20]; |
180 | 6.43M | tmp = pu1_src[16] + pu1_src[21] + (tmp << 2) + tmp; |
181 | | |
182 | 6.43M | res_r0_t1_8x16b = _mm_srai_epi16(res_r0_t1_8x16b, 5); //shifting right by 5 bits. |
183 | 6.43M | res_r1_t1_8x16b = _mm_srai_epi16(res_r1_t1_8x16b, 5); |
184 | 6.43M | tmp = (tmp + 16) >> 5; |
185 | | |
186 | 6.43M | src_r0_16x8b = _mm_packus_epi16(res_r0_t1_8x16b, res_r1_t1_8x16b); |
187 | 6.43M | pu1_dst[16] = CLIP_U8(tmp); |
188 | | |
189 | 6.43M | _mm_storeu_si128((__m128i *)pu1_dst, src_r0_16x8b); |
190 | | |
191 | 6.43M | ht--; |
192 | 6.43M | pu1_src += src_strd; |
193 | 6.43M | pu1_dst += dst_strd; |
194 | 6.43M | } |
195 | 6.43M | while(ht > 0); |
196 | 403k | } |
197 | | |
198 | | /* |
199 | | ******************************************************************************* |
200 | | * |
201 | | * @brief |
202 | | * This function implements a two stage cascaded six tap filter. It |
203 | | * applies the six tap filter in the vertical direction on the |
204 | | * predictor values, followed by applying the same filter in the |
205 | | * horizontal direction on the output of the first stage. The six tap |
206 | | * filtering operation is described in sec 8.4.2.2.1 titled "Luma sample |
207 | | * interpolation process" (Filter run for width = 17 and height =17) |
208 | | * |
209 | | * @par Description: |
210 | | * The function interpolates the predictors first in the vertical direction |
211 | | * and then in the horizontal direction to output the (1/2,1/2). The output |
212 | | * of the first stage of the filter is stored in the buffer pointed to by |
213 | | * pi16_pred1(only in C) in 16 bit precision. |
214 | | * |
215 | | * @param[in] pu1_src |
216 | | * UWORD8 pointer to the source |
217 | | * |
218 | | * @param[out] pu1_dst1 |
219 | | * UWORD8 pointer to the destination(Vertical filtered output) |
220 | | * |
221 | | * @param[out] pu1_dst2 |
222 | | * UWORD8 pointer to the destination(out put after applying horizontal filter |
223 | | * to the intermediate vertical output) |
224 | | * |
225 | | * @param[in] src_strd |
226 | | * integer source stride |
227 | | |
228 | | * @param[in] dst_strd |
229 | | * integer destination stride of pu1_dst |
230 | | * |
231 | | * @param[in]pi16_pred1 |
232 | | * Pointer to 16bit intermediate buffer(used only in c) |
233 | | * |
234 | | * @param[in] pi16_pred1_strd |
235 | | * integer destination stride of pi16_pred1 |
236 | | * |
237 | | * @returns |
238 | | * none |
239 | | * |
240 | | * @remarks |
241 | | * none |
242 | | * |
243 | | ******************************************************************************* |
244 | | */ |
245 | | void ih264e_sixtap_filter_2dvh_vert_ssse3(UWORD8 *pu1_src, |
246 | | UWORD8 *pu1_dst1, |
247 | | UWORD8 *pu1_dst2, |
248 | | WORD32 src_strd, |
249 | | WORD32 dst_strd, |
250 | | WORD32 *pi4_pred1, |
251 | | WORD32 pred1_strd) |
252 | 403k | { |
253 | 403k | WORD32 ht; |
254 | 403k | WORD16 *pi2_pred1; |
255 | | |
256 | 403k | ht = 17; |
257 | 403k | pi2_pred1 = (WORD16 *)pi4_pred1; |
258 | 403k | pred1_strd = pred1_strd << 1; |
259 | | |
260 | | // Vertical 6-tap filter |
261 | 403k | { |
262 | 403k | __m128i src1_r0_16x8b, src1_r1_16x8b, src1_r2_16x8b; |
263 | 403k | __m128i src1_r3_16x8b, src1_r4_16x8b, src1_r5_16x8b; |
264 | 403k | __m128i src2_r0_16x8b, src2_r1_16x8b, src2_r2_16x8b; |
265 | 403k | __m128i src2_r3_16x8b, src2_r4_16x8b, src2_r5_16x8b; |
266 | | |
267 | 403k | __m128i src_r0r1_16x8b, src_r2r3_16x8b, src_r4r5_16x8b; |
268 | | |
269 | 403k | __m128i res_t1_8x16b, res_t2_8x16b, res_t3_8x16b; |
270 | 403k | __m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b; |
271 | | |
272 | 403k | coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01); //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 |
273 | 403k | coeff2_3_16x8b = _mm_set1_epi32(0x14141414); //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 |
274 | 403k | coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB); //c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 |
275 | | //c0 = c5 = 1, c1 = c4 = -5, c2 = c3 = 20 |
276 | | |
277 | 403k | pu1_src -= 2; |
278 | 403k | pu1_src -= src_strd << 1; // the filter input starts from x[-2] (till x[3]) |
279 | | |
280 | | // Loading first five rows to start first row processing. |
281 | | // 22 values loaded in each row. |
282 | 403k | src1_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src); |
283 | 403k | src2_r0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 14)); |
284 | 403k | pu1_src += src_strd; |
285 | | |
286 | 403k | src1_r1_16x8b = _mm_loadu_si128((__m128i *)pu1_src); |
287 | 403k | src2_r1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 14)); |
288 | 403k | pu1_src += src_strd; |
289 | | |
290 | 403k | src1_r2_16x8b = _mm_loadu_si128((__m128i *)pu1_src); |
291 | 403k | src2_r2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 14)); |
292 | 403k | pu1_src += src_strd; |
293 | | |
294 | 403k | src1_r3_16x8b = _mm_loadu_si128((__m128i *)pu1_src); |
295 | 403k | src2_r3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 14)); |
296 | 403k | pu1_src += src_strd; |
297 | | |
298 | 403k | src1_r4_16x8b = _mm_loadu_si128((__m128i *)pu1_src); |
299 | 403k | src2_r4_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 14)); |
300 | 403k | pu1_src += src_strd; |
301 | | |
302 | 403k | do |
303 | 6.76M | { |
304 | 6.76M | src1_r5_16x8b = _mm_loadu_si128((__m128i *)pu1_src); |
305 | 6.76M | src2_r5_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 14)); |
306 | | |
307 | 6.76M | src_r0r1_16x8b = _mm_unpacklo_epi8(src1_r0_16x8b, src1_r1_16x8b); |
308 | 6.76M | src_r2r3_16x8b = _mm_unpacklo_epi8(src1_r2_16x8b, src1_r3_16x8b); |
309 | 6.76M | src_r4r5_16x8b = _mm_unpacklo_epi8(src1_r4_16x8b, src1_r5_16x8b); |
310 | | |
311 | 6.76M | res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b); |
312 | 6.76M | res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b); |
313 | 6.76M | res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b); |
314 | | |
315 | 6.76M | res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b); |
316 | 6.76M | res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b); |
317 | | |
318 | 6.76M | _mm_storeu_si128((__m128i *)pi2_pred1, res_t1_8x16b); |
319 | | |
320 | 6.76M | src_r0r1_16x8b = _mm_unpackhi_epi8(src1_r0_16x8b, src1_r1_16x8b); |
321 | 6.76M | src_r2r3_16x8b = _mm_unpackhi_epi8(src1_r2_16x8b, src1_r3_16x8b); |
322 | 6.76M | src_r4r5_16x8b = _mm_unpackhi_epi8(src1_r4_16x8b, src1_r5_16x8b); |
323 | | |
324 | 6.76M | res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b); |
325 | 6.76M | res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b); |
326 | 6.76M | res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b); |
327 | | |
328 | 6.76M | res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b); |
329 | 6.76M | res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b); |
330 | | |
331 | 6.76M | _mm_storeu_si128((__m128i *)(pi2_pred1 + 8), res_t1_8x16b); |
332 | | |
333 | 6.76M | src_r0r1_16x8b = _mm_unpacklo_epi8(src2_r0_16x8b, src2_r1_16x8b); |
334 | 6.76M | src_r2r3_16x8b = _mm_unpacklo_epi8(src2_r2_16x8b, src2_r3_16x8b); |
335 | 6.76M | src_r4r5_16x8b = _mm_unpacklo_epi8(src2_r4_16x8b, src2_r5_16x8b); |
336 | | |
337 | 6.76M | res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b); |
338 | 6.76M | res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b); |
339 | 6.76M | res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b); |
340 | | |
341 | 6.76M | res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b); |
342 | 6.76M | res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b); |
343 | | |
344 | 6.76M | _mm_storeu_si128((__m128i *)(pi2_pred1 + 14), res_t1_8x16b); |
345 | | |
346 | 6.76M | src1_r0_16x8b = src1_r1_16x8b; |
347 | 6.76M | src1_r1_16x8b = src1_r2_16x8b; |
348 | 6.76M | src1_r2_16x8b = src1_r3_16x8b; |
349 | 6.76M | src1_r3_16x8b = src1_r4_16x8b; |
350 | 6.76M | src1_r4_16x8b = src1_r5_16x8b; |
351 | | |
352 | 6.76M | src2_r0_16x8b = src2_r1_16x8b; |
353 | 6.76M | src2_r1_16x8b = src2_r2_16x8b; |
354 | 6.76M | src2_r2_16x8b = src2_r3_16x8b; |
355 | 6.76M | src2_r3_16x8b = src2_r4_16x8b; |
356 | 6.76M | src2_r4_16x8b = src2_r5_16x8b; |
357 | | |
358 | 6.76M | ht--; |
359 | 6.76M | pu1_src += src_strd; |
360 | 6.76M | pi2_pred1 += pred1_strd; |
361 | 6.76M | } |
362 | 6.76M | while(ht > 0); |
363 | 403k | } |
364 | | |
365 | 403k | ht = 17; |
366 | 403k | pi2_pred1 = (WORD16 *)pi4_pred1; |
367 | | |
368 | | // Horizontal 6-tap filter |
369 | 403k | { |
370 | 403k | WORD32 temp; |
371 | | |
372 | 403k | __m128i src_r0_8x16b, src_r1_8x16b, src_r2_8x16b, src_r3_8x16b; |
373 | 403k | __m128i src_r4_8x16b, src_r5_8x16b; |
374 | 403k | __m128i src_r0r1_8x16b, src_r2r3_8x16b, src_r4r5_8x16b; |
375 | 403k | __m128i res_vert1_8x16b, res_vert2_8x16b, res_16x8b; |
376 | | |
377 | 403k | __m128i res_t0_4x32b, res_t1_4x32b, res_t2_4x32b, res_t3_4x32b; |
378 | 403k | __m128i res_c0_8x16b, res_c1_8x16b; |
379 | | |
380 | 403k | __m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b; |
381 | 403k | __m128i const_val512_4x32b, const_val16_8x16b; |
382 | | |
383 | 403k | coeff0_1_8x16b = _mm_set1_epi32(0xFFFB0001); //c0 c1 c0 c1 c0 c1 c0 c1 |
384 | 403k | coeff2_3_8x16b = _mm_set1_epi32(0x00140014); //c2 c3 c2 c3 c2 c3 c2 c3 |
385 | 403k | coeff4_5_8x16b = _mm_set1_epi32(0x0001FFFB); //c4 c5 c4 c5 c4 c5 c4 c5 |
386 | | //c0 = c5 = 1, c1 = c4 = -5, c2 = c3 = 20 |
387 | 403k | const_val512_4x32b = _mm_set1_epi32(512); |
388 | 403k | const_val16_8x16b = _mm_set1_epi16(16); |
389 | | |
390 | 403k | do |
391 | 6.82M | { |
392 | 6.82M | src_r0_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1)); |
393 | 6.82M | src_r1_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1 + 1)); |
394 | 6.82M | src_r2_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1 + 2)); |
395 | 6.82M | src_r3_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1 + 3)); |
396 | 6.82M | src_r4_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1 + 4)); |
397 | 6.82M | src_r5_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1 + 5)); |
398 | | |
399 | 6.82M | res_vert1_8x16b = _mm_add_epi16(src_r2_8x16b, const_val16_8x16b); |
400 | 6.82M | res_vert1_8x16b = _mm_srai_epi16(res_vert1_8x16b, 5); //shifting right by 5 bits. |
401 | | |
402 | 6.82M | src_r0r1_8x16b = _mm_unpacklo_epi16(src_r0_8x16b, src_r1_8x16b); |
403 | 6.82M | src_r2r3_8x16b = _mm_unpacklo_epi16(src_r2_8x16b, src_r3_8x16b); |
404 | 6.82M | src_r4r5_8x16b = _mm_unpacklo_epi16(src_r4_8x16b, src_r5_8x16b); |
405 | | |
406 | 6.82M | res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b); |
407 | 6.82M | res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b); |
408 | 6.82M | res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b); |
409 | | |
410 | 6.82M | res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b); |
411 | 6.82M | res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b); |
412 | 6.82M | res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b); |
413 | 6.82M | res_t0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10); |
414 | | |
415 | 6.82M | src_r0r1_8x16b = _mm_unpackhi_epi16(src_r0_8x16b, src_r1_8x16b); |
416 | 6.82M | src_r2r3_8x16b = _mm_unpackhi_epi16(src_r2_8x16b, src_r3_8x16b); |
417 | 6.82M | src_r4r5_8x16b = _mm_unpackhi_epi16(src_r4_8x16b, src_r5_8x16b); |
418 | | |
419 | 6.82M | res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b); |
420 | 6.82M | res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b); |
421 | 6.82M | res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b); |
422 | | |
423 | 6.82M | res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b); |
424 | 6.82M | res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b); |
425 | 6.82M | res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b); |
426 | 6.82M | res_t1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10); |
427 | | |
428 | 6.82M | res_c0_8x16b = _mm_packs_epi32(res_t0_4x32b, res_t1_4x32b); |
429 | | |
430 | 6.82M | src_r0_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1 + 8)); |
431 | 6.82M | src_r1_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1 + 8 + 1)); |
432 | 6.82M | src_r2_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1 + 8 + 2)); |
433 | 6.82M | src_r3_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1 + 8 + 3)); |
434 | 6.82M | src_r4_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1 + 8 + 4)); |
435 | 6.82M | src_r5_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1 + 8 + 5)); |
436 | | |
437 | 6.82M | res_vert2_8x16b = _mm_add_epi16(src_r2_8x16b, const_val16_8x16b); |
438 | 6.82M | res_vert2_8x16b = _mm_srai_epi16(res_vert2_8x16b, 5); //shifting right by 5 bits. |
439 | | |
440 | 6.82M | src_r0r1_8x16b = _mm_unpacklo_epi16(src_r0_8x16b, src_r1_8x16b); |
441 | 6.82M | src_r2r3_8x16b = _mm_unpacklo_epi16(src_r2_8x16b, src_r3_8x16b); |
442 | 6.82M | src_r4r5_8x16b = _mm_unpacklo_epi16(src_r4_8x16b, src_r5_8x16b); |
443 | | |
444 | 6.82M | res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b); |
445 | 6.82M | res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b); |
446 | 6.82M | res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b); |
447 | | |
448 | 6.82M | res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b); |
449 | 6.82M | res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b); |
450 | 6.82M | res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b); |
451 | 6.82M | res_t0_4x32b = _mm_srai_epi32(res_t1_4x32b ,10); |
452 | | |
453 | 6.82M | src_r0r1_8x16b = _mm_unpackhi_epi16(src_r0_8x16b, src_r1_8x16b); |
454 | 6.82M | src_r2r3_8x16b = _mm_unpackhi_epi16(src_r2_8x16b, src_r3_8x16b); |
455 | 6.82M | src_r4r5_8x16b = _mm_unpackhi_epi16(src_r4_8x16b, src_r5_8x16b); |
456 | | |
457 | 6.82M | res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b); |
458 | 6.82M | res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b); |
459 | 6.82M | res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b); |
460 | | |
461 | 6.82M | res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b); |
462 | 6.82M | res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b); |
463 | 6.82M | res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b); |
464 | 6.82M | res_t1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10); |
465 | | |
466 | 6.82M | res_c1_8x16b = _mm_packs_epi32(res_t0_4x32b, res_t1_4x32b); |
467 | | |
468 | 6.82M | res_16x8b = _mm_packus_epi16(res_vert1_8x16b, res_vert2_8x16b); |
469 | 6.82M | _mm_storeu_si128((__m128i *)pu1_dst1, res_16x8b); |
470 | 6.82M | pu1_dst1[16] = CLIP_U8((pi2_pred1[18] + 16) >> 5); |
471 | | |
472 | 6.82M | res_16x8b = _mm_packus_epi16(res_c0_8x16b, res_c1_8x16b); |
473 | 6.82M | _mm_storeu_si128((__m128i *)pu1_dst2, res_16x8b); |
474 | 6.82M | temp = ((pi2_pred1[18] + pi2_pred1[19]) << 2) - pi2_pred1[17] - pi2_pred1[20]; |
475 | 6.82M | temp = pi2_pred1[16] + pi2_pred1[21] + (temp << 2) + temp; |
476 | 6.82M | pu1_dst2[16] = CLIP_U8((temp + 512) >> 10); |
477 | | |
478 | 6.82M | ht--; |
479 | 6.82M | pi2_pred1 += pred1_strd; |
480 | 6.82M | pu1_dst1 += dst_strd; |
481 | 6.82M | pu1_dst2 += dst_strd; |
482 | 6.82M | } |
483 | 6.82M | while(ht > 0); |
484 | 403k | } |
485 | 403k | } |