/src/libavc/common/x86/ih264_weighted_pred_sse42.c
Line | Count | Source |
1 | | /****************************************************************************** |
2 | | * |
3 | | * Copyright (C) 2015 The Android Open Source Project |
4 | | * |
5 | | * Licensed under the Apache License, Version 2.0 (the "License"); |
6 | | * you may not use this file except in compliance with the License. |
7 | | * You may obtain a copy of the License at: |
8 | | * |
9 | | * http://www.apache.org/licenses/LICENSE-2.0 |
10 | | * |
11 | | * Unless required by applicable law or agreed to in writing, software |
12 | | * distributed under the License is distributed on an "AS IS" BASIS, |
13 | | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
14 | | * See the License for the specific language governing permissions and |
15 | | * limitations under the License. |
16 | | * |
17 | | ***************************************************************************** |
18 | | * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore |
19 | | */ |
20 | | /*****************************************************************************/ |
21 | | /* */ |
22 | | /* File Name : ih264_weighted_pred_intr_sse42.c */ |
23 | | /* */ |
24 | | /* Description : Contains function definitions for weighted */ |
25 | | /* prediction functions in x86 sse4 intrinsics */ |
26 | | /* */ |
27 | | /* List of Functions : ih264_default_weighted_pred_luma_sse42() */ |
28 | | /* ih264_default_weighted_pred_chroma_sse42() */ |
29 | | /* ih264_weighted_pred_luma_sse42() */ |
30 | | /* ih264_weighted_pred_chroma_sse42() */ |
31 | | /* ih264_weighted_bipred_luma_sse42() */ |
32 | | /* ih264_weighted_bipred_chroma_sse42() */ |
33 | | /* */ |
34 | | /* Issues / Problems : None */ |
35 | | /* */ |
36 | | /* Revision History : */ |
37 | | /* */ |
38 | | /* DD MM YYYY Author(s) Changes */ |
39 | | /* 30 01 2015 Kaushik Initial version */ |
40 | | /* Senthoor */ |
41 | | /* */ |
42 | | /*****************************************************************************/ |
43 | | /*****************************************************************************/ |
44 | | /* File Includes */ |
45 | | /*****************************************************************************/ |
46 | | |
47 | | #include <immintrin.h> |
48 | | #include "ih264_typedefs.h" |
49 | | #include "ih264_macros.h" |
50 | | #include "ih264_platform_macros.h" |
51 | | #include "ih264_weighted_pred.h" |
52 | | |
53 | | /*****************************************************************************/ |
54 | | /* Function definitions . */ |
55 | | /*****************************************************************************/ |
56 | | /*****************************************************************************/ |
57 | | /* */ |
58 | | /* Function Name : ih264_default_weighted_pred_luma_sse42 */ |
59 | | /* */ |
60 | | /* Description : This function performs the default weighted prediction */ |
61 | | /* as described in sec 8.4.2.3.1 titled "Default weighted */ |
62 | | /* sample prediction process" for luma. The function gets */ |
63 | | /* two ht x wd blocks, calculates their rounded-average and */ |
64 | | /* stores it in the destination block. (ht,wd) can be */ |
65 | | /* (4,4), (8,4), (4,8), (8,8), (16,8), (8,16) or (16,16). */ |
66 | | /* */ |
67 | | /* Inputs : pu1_src1 - Pointer to source 1 */ |
68 | | /* pu1_src2 - Pointer to source 2 */ |
69 | | /* pu1_dst - Pointer to destination */ |
70 | | /* src_strd1 - stride for source 1 */ |
  71 |              |  /*                  src_strd2 - stride for source 2                     */ |
72 | | /* dst_strd - stride for destination */ |
73 | | /* ht - height of the block */ |
74 | | /* wd - width of the block */ |
75 | | /* */ |
76 | | /* Issues : None */ |
77 | | /* */ |
78 | | /* Revision History: */ |
79 | | /* */ |
80 | | /* DD MM YYYY Author(s) Changes */ |
81 | | /* 04 02 2015 Kaushik Initial Version */ |
82 | | /* Senthoor */ |
83 | | /* */ |
84 | | /*****************************************************************************/ |
85 | | void ih264_default_weighted_pred_luma_sse42(UWORD8 *pu1_src1, |
86 | | UWORD8 *pu1_src2, |
87 | | UWORD8 *pu1_dst, |
88 | | WORD32 src_strd1, |
89 | | WORD32 src_strd2, |
90 | | WORD32 dst_strd, |
91 | | WORD32 ht, |
92 | | WORD32 wd) |
93 | 374k | { |
94 | 374k | __m128i y0_0_16x8b, y0_1_16x8b, y0_2_16x8b, y0_3_16x8b; |
95 | 374k | __m128i y1_0_16x8b, y1_1_16x8b, y1_2_16x8b, y1_3_16x8b; |
96 | | |
97 | 374k | if(wd == 4) |
98 | 4.59k | { |
99 | 4.59k | do |
100 | 6.12k | { |
101 | 6.12k | y0_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1); |
102 | 6.12k | y0_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1)); |
103 | 6.12k | y0_2_16x8b = _mm_loadl_epi64( |
104 | 6.12k | (__m128i *)(pu1_src1 + (src_strd1 << 1))); |
105 | 6.12k | y0_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1 * 3)); |
106 | | |
107 | 6.12k | y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2); |
108 | 6.12k | y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2)); |
109 | 6.12k | y1_2_16x8b = _mm_loadl_epi64( |
110 | 6.12k | (__m128i *)(pu1_src2 + (src_strd2 << 1))); |
111 | 6.12k | y1_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2 * 3)); |
112 | | |
113 | 6.12k | y0_0_16x8b = _mm_avg_epu8(y0_0_16x8b, y1_0_16x8b); |
114 | 6.12k | y0_1_16x8b = _mm_avg_epu8(y0_1_16x8b, y1_1_16x8b); |
115 | 6.12k | y0_2_16x8b = _mm_avg_epu8(y0_2_16x8b, y1_2_16x8b); |
116 | 6.12k | y0_3_16x8b = _mm_avg_epu8(y0_3_16x8b, y1_3_16x8b); |
117 | | |
118 | 6.12k | *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(y0_0_16x8b); |
119 | 6.12k | *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(y0_1_16x8b); |
120 | 6.12k | *((WORD32 *)(pu1_dst + (dst_strd << 1))) = _mm_cvtsi128_si32(y0_2_16x8b); |
121 | 6.12k | *((WORD32 *)(pu1_dst + dst_strd * 3)) = _mm_cvtsi128_si32(y0_3_16x8b); |
122 | | |
123 | 6.12k | ht -= 4; |
124 | 6.12k | pu1_src1 += src_strd1 << 2; |
125 | 6.12k | pu1_src2 += src_strd2 << 2; |
126 | 6.12k | pu1_dst += dst_strd << 2; |
127 | 6.12k | } |
128 | 6.12k | while(ht > 0); |
129 | 4.59k | } |
130 | 370k | else if(wd == 8) |
131 | 71.8k | { |
132 | 71.8k | do |
133 | 169k | { |
134 | 169k | y0_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1); |
135 | 169k | y0_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1)); |
136 | 169k | y0_2_16x8b = _mm_loadl_epi64( |
137 | 169k | (__m128i *)(pu1_src1 + (src_strd1 << 1))); |
138 | 169k | y0_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1 * 3)); |
139 | | |
140 | 169k | y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2); |
141 | 169k | y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2)); |
142 | 169k | y1_2_16x8b = _mm_loadl_epi64( |
143 | 169k | (__m128i *)(pu1_src2 + (src_strd2 << 1))); |
144 | 169k | y1_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2 * 3)); |
145 | | |
146 | 169k | y0_0_16x8b = _mm_avg_epu8(y0_0_16x8b, y1_0_16x8b); |
147 | 169k | y0_1_16x8b = _mm_avg_epu8(y0_1_16x8b, y1_1_16x8b); |
148 | 169k | y0_2_16x8b = _mm_avg_epu8(y0_2_16x8b, y1_2_16x8b); |
149 | 169k | y0_3_16x8b = _mm_avg_epu8(y0_3_16x8b, y1_3_16x8b); |
150 | | |
151 | 169k | _mm_storel_epi64((__m128i *)pu1_dst, y0_0_16x8b); |
152 | 169k | _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), y0_1_16x8b); |
153 | 169k | _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd << 1)), y0_2_16x8b); |
154 | 169k | _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd * 3), y0_3_16x8b); |
155 | | |
156 | 169k | ht -= 4; |
157 | 169k | pu1_src1 += src_strd1 << 2; |
158 | 169k | pu1_src2 += src_strd2 << 2; |
159 | 169k | pu1_dst += dst_strd << 2; |
160 | 169k | } |
161 | 169k | while(ht > 0); |
162 | 71.8k | } |
163 | 298k | else // wd == 16 |
164 | 298k | { |
165 | 298k | __m128i y0_4_16x8b, y0_5_16x8b, y0_6_16x8b, y0_7_16x8b; |
166 | 298k | __m128i y1_4_16x8b, y1_5_16x8b, y1_6_16x8b, y1_7_16x8b; |
167 | | |
168 | 298k | do |
169 | 575k | { |
170 | 575k | y0_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src1); |
171 | 575k | y0_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1)); |
172 | 575k | y0_2_16x8b = _mm_loadu_si128( |
173 | 575k | (__m128i *)(pu1_src1 + (src_strd1 << 1))); |
174 | 575k | y0_3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1 * 3)); |
175 | 575k | y0_4_16x8b = _mm_loadu_si128( |
176 | 575k | (__m128i *)(pu1_src1 + (src_strd1 << 2))); |
177 | 575k | y0_5_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1 * 5)); |
178 | 575k | y0_6_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1 * 6)); |
179 | 575k | y0_7_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1 * 7)); |
180 | | |
181 | 575k | y1_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src2); |
182 | 575k | y1_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2)); |
183 | 575k | y1_2_16x8b = _mm_loadu_si128( |
184 | 575k | (__m128i *)(pu1_src2 + (src_strd2 << 1))); |
185 | 575k | y1_3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2 * 3)); |
186 | 575k | y1_4_16x8b = _mm_loadu_si128( |
187 | 575k | (__m128i *)(pu1_src2 + (src_strd2 << 2))); |
188 | 575k | y1_5_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2 * 5)); |
189 | 575k | y1_6_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2 * 6)); |
190 | 575k | y1_7_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2 * 7)); |
191 | | |
192 | 575k | y0_0_16x8b = _mm_avg_epu8(y0_0_16x8b, y1_0_16x8b); |
193 | 575k | y0_1_16x8b = _mm_avg_epu8(y0_1_16x8b, y1_1_16x8b); |
194 | 575k | y0_2_16x8b = _mm_avg_epu8(y0_2_16x8b, y1_2_16x8b); |
195 | 575k | y0_3_16x8b = _mm_avg_epu8(y0_3_16x8b, y1_3_16x8b); |
196 | 575k | y0_4_16x8b = _mm_avg_epu8(y0_4_16x8b, y1_4_16x8b); |
197 | 575k | y0_5_16x8b = _mm_avg_epu8(y0_5_16x8b, y1_5_16x8b); |
198 | 575k | y0_6_16x8b = _mm_avg_epu8(y0_6_16x8b, y1_6_16x8b); |
199 | 575k | y0_7_16x8b = _mm_avg_epu8(y0_7_16x8b, y1_7_16x8b); |
200 | | |
201 | 575k | _mm_storeu_si128((__m128i *)pu1_dst, y0_0_16x8b); |
202 | 575k | _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), y0_1_16x8b); |
203 | 575k | _mm_storeu_si128((__m128i *)(pu1_dst + (dst_strd << 1)), y0_2_16x8b); |
204 | 575k | _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 3), y0_3_16x8b); |
205 | 575k | _mm_storeu_si128((__m128i *)(pu1_dst + (dst_strd << 2)), y0_4_16x8b); |
206 | 575k | _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 5), y0_5_16x8b); |
207 | 575k | _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 6), y0_6_16x8b); |
208 | 575k | _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 7), y0_7_16x8b); |
209 | | |
210 | 575k | ht -= 8; |
211 | 575k | pu1_src1 += src_strd1 << 3; |
212 | 575k | pu1_src2 += src_strd2 << 3; |
213 | 575k | pu1_dst += dst_strd << 3; |
214 | 575k | } |
215 | 575k | while(ht > 0); |
216 | 298k | } |
217 | 374k | } |
218 | | |
219 | | /*****************************************************************************/ |
220 | | /* */ |
221 | | /* Function Name : ih264_default_weighted_pred_chroma_sse42 */ |
222 | | /* */ |
223 | | /* Description : This function performs the default weighted prediction */ |
224 | | /* as described in sec 8.4.2.3.1 titled "Default weighted */ |
225 | | /* sample prediction process" for chroma. The function gets */ |
226 | | /* two ht x wd blocks, calculates their rounded-average and */ |
227 | | /* stores it in the destination block. (ht,wd) can be */ |
228 | | /* (2,2), (4,2) , (2,4), (4,4), (8,4), (4,8) or (8,8). */ |
229 | | /* */ |
230 | | /* Inputs : pu1_src1 - Pointer to source 1 */ |
231 | | /* pu1_src2 - Pointer to source 2 */ |
232 | | /* pu1_dst - Pointer to destination */ |
233 | | /* src_strd1 - stride for source 1 */ |
 234 |              |  /*                  src_strd2 - stride for source 2                     */ |
235 | | /* dst_strd - stride for destination */ |
236 | | /* ht - height of the block */ |
237 | | /* wd - width of the block */ |
238 | | /* */ |
239 | | /* Issues : None */ |
240 | | /* */ |
241 | | /* Revision History: */ |
242 | | /* */ |
243 | | /* DD MM YYYY Author(s) Changes */ |
244 | | /* 04 02 2015 Kaushik Initial Version */ |
245 | | /* Senthoor */ |
246 | | /* */ |
247 | | /*****************************************************************************/ |
248 | | void ih264_default_weighted_pred_chroma_sse42(UWORD8 *pu1_src1, |
249 | | UWORD8 *pu1_src2, |
250 | | UWORD8 *pu1_dst, |
251 | | WORD32 src_strd1, |
252 | | WORD32 src_strd2, |
253 | | WORD32 dst_strd, |
254 | | WORD32 ht, |
255 | | WORD32 wd) |
256 | 374k | { |
257 | 374k | __m128i uv0_0_16x8b, uv0_1_16x8b; |
258 | 374k | __m128i uv1_0_16x8b, uv1_1_16x8b; |
259 | | |
260 | 374k | if(wd == 2) |
261 | 4.59k | { |
262 | 4.59k | do |
263 | 6.12k | { |
264 | 6.12k | uv0_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1); |
265 | 6.12k | uv0_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1)); |
266 | | |
267 | 6.12k | uv1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2); |
268 | 6.12k | uv1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2)); |
269 | | |
270 | 6.12k | uv0_0_16x8b = _mm_avg_epu8(uv0_0_16x8b, uv1_0_16x8b); |
271 | 6.12k | uv0_1_16x8b = _mm_avg_epu8(uv0_1_16x8b, uv1_1_16x8b); |
272 | | |
273 | 6.12k | *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(uv0_0_16x8b); |
274 | 6.12k | *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(uv0_1_16x8b); |
275 | | |
276 | 6.12k | ht -= 2; |
277 | 6.12k | pu1_src1 += src_strd1 << 1; |
278 | 6.12k | pu1_src2 += src_strd2 << 1; |
279 | 6.12k | pu1_dst += dst_strd << 1; |
280 | 6.12k | } |
281 | 6.12k | while(ht > 0); |
282 | 4.59k | } |
283 | 370k | else if(wd == 4) |
284 | 71.8k | { |
285 | 71.8k | do |
286 | 169k | { |
287 | 169k | uv0_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1); |
288 | 169k | uv0_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1)); |
289 | | |
290 | 169k | uv1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2); |
291 | 169k | uv1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2)); |
292 | | |
293 | 169k | uv0_0_16x8b = _mm_avg_epu8(uv0_0_16x8b, uv1_0_16x8b); |
294 | 169k | uv0_1_16x8b = _mm_avg_epu8(uv0_1_16x8b, uv1_1_16x8b); |
295 | | |
296 | 169k | _mm_storel_epi64((__m128i *)pu1_dst, uv0_0_16x8b); |
297 | 169k | _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), uv0_1_16x8b); |
298 | | |
299 | 169k | ht -= 2; |
300 | 169k | pu1_src1 += src_strd1 << 1; |
301 | 169k | pu1_src2 += src_strd2 << 1; |
302 | 169k | pu1_dst += dst_strd << 1; |
303 | 169k | } |
304 | 169k | while(ht > 0); |
305 | 71.8k | } |
306 | 298k | else // wd == 8 |
307 | 298k | { |
308 | 298k | __m128i uv0_2_16x8b, uv0_3_16x8b; |
309 | 298k | __m128i uv1_2_16x8b, uv1_3_16x8b; |
310 | | |
311 | 298k | do |
312 | 575k | { |
313 | 575k | uv0_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src1); |
314 | 575k | uv0_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1)); |
315 | 575k | uv0_2_16x8b = _mm_loadu_si128( |
316 | 575k | (__m128i *)(pu1_src1 + (src_strd1 << 1))); |
317 | 575k | uv0_3_16x8b = _mm_loadu_si128( |
318 | 575k | (__m128i *)(pu1_src1 + src_strd1 * 3)); |
319 | | |
320 | 575k | uv1_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src2); |
321 | 575k | uv1_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2)); |
322 | 575k | uv1_2_16x8b = _mm_loadu_si128( |
323 | 575k | (__m128i *)(pu1_src2 + (src_strd2 << 1))); |
324 | 575k | uv1_3_16x8b = _mm_loadu_si128( |
325 | 575k | (__m128i *)(pu1_src2 + src_strd2 * 3)); |
326 | | |
327 | 575k | uv0_0_16x8b = _mm_avg_epu8(uv0_0_16x8b, uv1_0_16x8b); |
328 | 575k | uv0_1_16x8b = _mm_avg_epu8(uv0_1_16x8b, uv1_1_16x8b); |
329 | 575k | uv0_2_16x8b = _mm_avg_epu8(uv0_2_16x8b, uv1_2_16x8b); |
330 | 575k | uv0_3_16x8b = _mm_avg_epu8(uv0_3_16x8b, uv1_3_16x8b); |
331 | | |
332 | 575k | _mm_storeu_si128((__m128i *)pu1_dst, uv0_0_16x8b); |
333 | 575k | _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), uv0_1_16x8b); |
334 | 575k | _mm_storeu_si128( |
335 | 575k | (__m128i *)(pu1_dst + (dst_strd << 1)), uv0_2_16x8b); |
336 | 575k | _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 3), uv0_3_16x8b); |
337 | | |
338 | 575k | ht -= 4; |
339 | 575k | pu1_src1 += src_strd1 << 2; |
340 | 575k | pu1_src2 += src_strd2 << 2; |
341 | 575k | pu1_dst += dst_strd << 2; |
342 | 575k | } |
343 | 575k | while(ht > 0); |
344 | 298k | } |
345 | 374k | } |
346 | | |
347 | | /*****************************************************************************/ |
348 | | /* */ |
349 | | /* Function Name : ih264_weighted_pred_luma_sse42 */ |
350 | | /* */ |
351 | | /* Description : This function performs the weighted prediction as */ |
352 | | /* described in sec 8.4.2.3.2 titled "Weighted sample */ |
353 | | /* prediction process" for luma. The function gets one */ |
354 | | /* ht x wd block, weights it, rounds it off, offsets it, */ |
355 | | /* saturates it to unsigned 8-bit and stores it in the */ |
356 | | /* destination block. (ht,wd) can be (4,4), (8,4), (4,8), */ |
357 | | /* (8,8), (16,8), (8,16) or (16,16). */ |
358 | | /* */ |
359 | | /* Inputs : pu1_src - Pointer to source */ |
360 | | /* pu1_dst - Pointer to destination */ |
361 | | /* src_strd - stride for source */ |
362 | | /* dst_strd - stride for destination */ |
363 | | /* log_wd - number of bits to be rounded off */ |
364 | | /* wt - weight value */ |
365 | | /* ofst - offset value */ |
366 | | /* ht - height of the block */ |
367 | | /* wd - width of the block */ |
368 | | /* */ |
369 | | /* Issues : None */ |
370 | | /* */ |
371 | | /* Revision History: */ |
372 | | /* */ |
373 | | /* DD MM YYYY Author(s) Changes */ |
374 | | /* 04 02 2015 Kaushik Initial Version */ |
375 | | /* Senthoor */ |
376 | | /* */ |
377 | | /*****************************************************************************/ |
378 | | void ih264_weighted_pred_luma_sse42(UWORD8 *pu1_src, |
379 | | UWORD8 *pu1_dst, |
380 | | WORD32 src_strd, |
381 | | WORD32 dst_strd, |
382 | | WORD32 log_wd, |
383 | | WORD32 wt, |
384 | | WORD32 ofst, |
385 | | WORD32 ht, |
386 | | WORD32 wd) |
387 | 6.08M | { |
388 | 6.08M | __m128i y_0_16x8b, y_1_16x8b, y_2_16x8b, y_3_16x8b; |
389 | | |
390 | 6.08M | __m128i wt_8x16b, round_8x16b, ofst_8x16b; |
391 | | |
392 | 6.08M | WORD32 round_val; |
393 | | |
394 | 6.08M | wt = (WORD16)(wt & 0xffff); |
395 | 6.08M | round_val = 1 << (log_wd - 1); |
396 | 6.08M | ofst = (WORD8)(ofst & 0xff); |
397 | | |
398 | 6.08M | wt_8x16b = _mm_set1_epi16(wt); |
399 | 6.08M | round_8x16b = _mm_set1_epi16(round_val); |
400 | 6.08M | ofst_8x16b = _mm_set1_epi16(ofst); |
401 | | |
402 | 6.08M | if(wd == 4) |
403 | 72.8k | { |
404 | 72.8k | __m128i y_0_8x16b, y_2_8x16b; |
405 | | |
406 | 72.8k | do |
407 | 132k | { |
408 | 132k | y_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src); |
409 | 132k | y_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd)); |
410 | 132k | y_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (src_strd << 1))); |
411 | 132k | y_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd * 3)); |
412 | | |
413 | 132k | y_0_16x8b = _mm_unpacklo_epi32(y_0_16x8b, y_1_16x8b); |
414 | 132k | y_2_16x8b = _mm_unpacklo_epi32(y_2_16x8b, y_3_16x8b); |
415 | | |
416 | 132k | y_0_8x16b = _mm_cvtepu8_epi16(y_0_16x8b); |
417 | 132k | y_2_8x16b = _mm_cvtepu8_epi16(y_2_16x8b); |
418 | | |
419 | 132k | y_0_8x16b = _mm_mullo_epi16(y_0_8x16b, wt_8x16b); |
420 | 132k | y_2_8x16b = _mm_mullo_epi16(y_2_8x16b, wt_8x16b); |
421 | | |
422 | 132k | y_0_8x16b = _mm_adds_epi16(round_8x16b, y_0_8x16b); |
423 | 132k | y_2_8x16b = _mm_adds_epi16(round_8x16b, y_2_8x16b); |
424 | | |
425 | 132k | y_0_8x16b = _mm_srai_epi16(y_0_8x16b, log_wd); |
426 | 132k | y_2_8x16b = _mm_srai_epi16(y_2_8x16b, log_wd); |
427 | | |
428 | 132k | y_0_8x16b = _mm_adds_epi16(ofst_8x16b, y_0_8x16b); |
429 | 132k | y_2_8x16b = _mm_adds_epi16(ofst_8x16b, y_2_8x16b); |
430 | | |
431 | 132k | y_0_16x8b = _mm_packus_epi16(y_0_8x16b, y_2_8x16b); |
432 | 132k | y_1_16x8b = _mm_srli_si128(y_0_16x8b, 4); |
433 | 132k | y_2_16x8b = _mm_srli_si128(y_0_16x8b, 8); |
434 | 132k | y_3_16x8b = _mm_srli_si128(y_0_16x8b, 12); |
435 | | |
436 | 132k | *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(y_0_16x8b); |
437 | 132k | *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(y_1_16x8b); |
438 | 132k | *((WORD32 *)(pu1_dst + (dst_strd << 1))) = _mm_cvtsi128_si32(y_2_16x8b); |
439 | 132k | *((WORD32 *)(pu1_dst + dst_strd * 3)) = _mm_cvtsi128_si32(y_3_16x8b); |
440 | | |
441 | 132k | ht -= 4; |
442 | 132k | pu1_src += src_strd << 2; |
443 | 132k | pu1_dst += dst_strd << 2; |
444 | 132k | } |
445 | 132k | while(ht > 0); |
446 | 72.8k | } |
447 | 6.01M | else if(wd == 8) |
448 | 76.0k | { |
449 | 76.0k | __m128i y_0_8x16b, y_1_8x16b, y_2_8x16b, y_3_8x16b; |
450 | | |
451 | 76.0k | do |
452 | 186k | { |
453 | 186k | y_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src); |
454 | 186k | y_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd)); |
455 | 186k | y_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (src_strd << 1))); |
456 | 186k | y_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd * 3)); |
457 | | |
458 | 186k | y_0_8x16b = _mm_cvtepu8_epi16(y_0_16x8b); |
459 | 186k | y_1_8x16b = _mm_cvtepu8_epi16(y_1_16x8b); |
460 | 186k | y_2_8x16b = _mm_cvtepu8_epi16(y_2_16x8b); |
461 | 186k | y_3_8x16b = _mm_cvtepu8_epi16(y_3_16x8b); |
462 | | |
463 | 186k | y_0_8x16b = _mm_mullo_epi16(y_0_8x16b, wt_8x16b); |
464 | 186k | y_1_8x16b = _mm_mullo_epi16(y_1_8x16b, wt_8x16b); |
465 | 186k | y_2_8x16b = _mm_mullo_epi16(y_2_8x16b, wt_8x16b); |
466 | 186k | y_3_8x16b = _mm_mullo_epi16(y_3_8x16b, wt_8x16b); |
467 | | |
468 | 186k | y_0_8x16b = _mm_adds_epi16(round_8x16b, y_0_8x16b); |
469 | 186k | y_1_8x16b = _mm_adds_epi16(round_8x16b, y_1_8x16b); |
470 | 186k | y_2_8x16b = _mm_adds_epi16(round_8x16b, y_2_8x16b); |
471 | 186k | y_3_8x16b = _mm_adds_epi16(round_8x16b, y_3_8x16b); |
472 | | |
473 | 186k | y_0_8x16b = _mm_srai_epi16(y_0_8x16b, log_wd); |
474 | 186k | y_1_8x16b = _mm_srai_epi16(y_1_8x16b, log_wd); |
475 | 186k | y_2_8x16b = _mm_srai_epi16(y_2_8x16b, log_wd); |
476 | 186k | y_3_8x16b = _mm_srai_epi16(y_3_8x16b, log_wd); |
477 | | |
478 | 186k | y_0_8x16b = _mm_adds_epi16(ofst_8x16b, y_0_8x16b); |
479 | 186k | y_1_8x16b = _mm_adds_epi16(ofst_8x16b, y_1_8x16b); |
480 | 186k | y_2_8x16b = _mm_adds_epi16(ofst_8x16b, y_2_8x16b); |
481 | 186k | y_3_8x16b = _mm_adds_epi16(ofst_8x16b, y_3_8x16b); |
482 | | |
483 | 186k | y_0_16x8b = _mm_packus_epi16(y_0_8x16b, y_1_8x16b); |
484 | 186k | y_2_16x8b = _mm_packus_epi16(y_2_8x16b, y_3_8x16b); |
485 | 186k | y_1_16x8b = _mm_srli_si128(y_0_16x8b, 8); |
486 | 186k | y_3_16x8b = _mm_srli_si128(y_2_16x8b, 8); |
487 | | |
488 | 186k | _mm_storel_epi64((__m128i *)pu1_dst, y_0_16x8b); |
489 | 186k | _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), y_1_16x8b); |
490 | 186k | _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd << 1)), y_2_16x8b); |
491 | 186k | _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd * 3), y_3_16x8b); |
492 | | |
493 | 186k | ht -= 4; |
494 | 186k | pu1_src += src_strd << 2; |
495 | 186k | pu1_dst += dst_strd << 2; |
496 | 186k | } |
497 | 186k | while(ht > 0); |
498 | 76.0k | } |
499 | 5.93M | else // wd == 16 |
500 | 5.93M | { |
501 | 5.93M | __m128i y_0L_8x16b, y_1L_8x16b, y_2L_8x16b, y_3L_8x16b; |
502 | 5.93M | __m128i y_0H_8x16b, y_1H_8x16b, y_2H_8x16b, y_3H_8x16b; |
503 | | |
504 | 5.93M | __m128i zero_16x8b; |
505 | 5.93M | zero_16x8b = _mm_set1_epi8(0); |
506 | | |
507 | 5.93M | do |
508 | 23.6M | { |
509 | 23.6M | y_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src); |
510 | 23.6M | y_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd)); |
511 | 23.6M | y_2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + (src_strd << 1))); |
512 | 23.6M | y_3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd * 3)); |
513 | | |
514 | 23.6M | y_0L_8x16b = _mm_cvtepu8_epi16(y_0_16x8b); |
515 | 23.6M | y_0H_8x16b = _mm_unpackhi_epi8(y_0_16x8b, zero_16x8b); |
516 | 23.6M | y_1L_8x16b = _mm_cvtepu8_epi16(y_1_16x8b); |
517 | 23.6M | y_1H_8x16b = _mm_unpackhi_epi8(y_1_16x8b, zero_16x8b); |
518 | 23.6M | y_2L_8x16b = _mm_cvtepu8_epi16(y_2_16x8b); |
519 | 23.6M | y_2H_8x16b = _mm_unpackhi_epi8(y_2_16x8b, zero_16x8b); |
520 | 23.6M | y_3L_8x16b = _mm_cvtepu8_epi16(y_3_16x8b); |
521 | 23.6M | y_3H_8x16b = _mm_unpackhi_epi8(y_3_16x8b, zero_16x8b); |
522 | | |
523 | 23.6M | y_0L_8x16b = _mm_mullo_epi16(y_0L_8x16b, wt_8x16b); |
524 | 23.6M | y_0H_8x16b = _mm_mullo_epi16(y_0H_8x16b, wt_8x16b); |
525 | 23.6M | y_1L_8x16b = _mm_mullo_epi16(y_1L_8x16b, wt_8x16b); |
526 | 23.6M | y_1H_8x16b = _mm_mullo_epi16(y_1H_8x16b, wt_8x16b); |
527 | 23.6M | y_2L_8x16b = _mm_mullo_epi16(y_2L_8x16b, wt_8x16b); |
528 | 23.6M | y_2H_8x16b = _mm_mullo_epi16(y_2H_8x16b, wt_8x16b); |
529 | 23.6M | y_3L_8x16b = _mm_mullo_epi16(y_3L_8x16b, wt_8x16b); |
530 | 23.6M | y_3H_8x16b = _mm_mullo_epi16(y_3H_8x16b, wt_8x16b); |
531 | | |
532 | 23.6M | y_0L_8x16b = _mm_adds_epi16(round_8x16b, y_0L_8x16b); |
533 | 23.6M | y_0H_8x16b = _mm_adds_epi16(round_8x16b, y_0H_8x16b); |
534 | 23.6M | y_1L_8x16b = _mm_adds_epi16(round_8x16b, y_1L_8x16b); |
535 | 23.6M | y_1H_8x16b = _mm_adds_epi16(round_8x16b, y_1H_8x16b); |
536 | 23.6M | y_2L_8x16b = _mm_adds_epi16(round_8x16b, y_2L_8x16b); |
537 | 23.6M | y_2H_8x16b = _mm_adds_epi16(round_8x16b, y_2H_8x16b); |
538 | 23.6M | y_3L_8x16b = _mm_adds_epi16(round_8x16b, y_3L_8x16b); |
539 | 23.6M | y_3H_8x16b = _mm_adds_epi16(round_8x16b, y_3H_8x16b); |
540 | | |
541 | 23.6M | y_0L_8x16b = _mm_srai_epi16(y_0L_8x16b, log_wd); |
542 | 23.6M | y_0H_8x16b = _mm_srai_epi16(y_0H_8x16b, log_wd); |
543 | 23.6M | y_1L_8x16b = _mm_srai_epi16(y_1L_8x16b, log_wd); |
544 | 23.6M | y_1H_8x16b = _mm_srai_epi16(y_1H_8x16b, log_wd); |
545 | 23.6M | y_2L_8x16b = _mm_srai_epi16(y_2L_8x16b, log_wd); |
546 | 23.6M | y_2H_8x16b = _mm_srai_epi16(y_2H_8x16b, log_wd); |
547 | 23.6M | y_3L_8x16b = _mm_srai_epi16(y_3L_8x16b, log_wd); |
548 | 23.6M | y_3H_8x16b = _mm_srai_epi16(y_3H_8x16b, log_wd); |
549 | | |
550 | 23.6M | y_0L_8x16b = _mm_adds_epi16(ofst_8x16b, y_0L_8x16b); |
551 | 23.6M | y_0H_8x16b = _mm_adds_epi16(ofst_8x16b, y_0H_8x16b); |
552 | 23.6M | y_1L_8x16b = _mm_adds_epi16(ofst_8x16b, y_1L_8x16b); |
553 | 23.6M | y_1H_8x16b = _mm_adds_epi16(ofst_8x16b, y_1H_8x16b); |
554 | 23.6M | y_2L_8x16b = _mm_adds_epi16(ofst_8x16b, y_2L_8x16b); |
555 | 23.6M | y_2H_8x16b = _mm_adds_epi16(ofst_8x16b, y_2H_8x16b); |
556 | 23.6M | y_3L_8x16b = _mm_adds_epi16(ofst_8x16b, y_3L_8x16b); |
557 | 23.6M | y_3H_8x16b = _mm_adds_epi16(ofst_8x16b, y_3H_8x16b); |
558 | | |
559 | 23.6M | y_0_16x8b = _mm_packus_epi16(y_0L_8x16b, y_0H_8x16b); |
560 | 23.6M | y_1_16x8b = _mm_packus_epi16(y_1L_8x16b, y_1H_8x16b); |
561 | 23.6M | y_2_16x8b = _mm_packus_epi16(y_2L_8x16b, y_2H_8x16b); |
562 | 23.6M | y_3_16x8b = _mm_packus_epi16(y_3L_8x16b, y_3H_8x16b); |
563 | | |
564 | 23.6M | _mm_storeu_si128((__m128i *)pu1_dst, y_0_16x8b); |
565 | 23.6M | _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), y_1_16x8b); |
566 | 23.6M | _mm_storeu_si128((__m128i *)(pu1_dst + (dst_strd << 1)), y_2_16x8b); |
567 | 23.6M | _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 3), y_3_16x8b); |
568 | | |
569 | 23.6M | ht -= 4; |
570 | 23.6M | pu1_src += src_strd << 2; |
571 | 23.6M | pu1_dst += dst_strd << 2; |
572 | 23.6M | } |
573 | 23.6M | while(ht > 0); |
574 | 5.93M | } |
575 | 6.08M | } |
576 | | |
577 | | /*****************************************************************************/ |
578 | | /* */ |
579 | | /* Function Name : ih264_weighted_pred_chroma_sse42 */ |
580 | | /* */ |
581 | | /* Description : This function performs the weighted prediction as */ |
582 | | /* described in sec 8.4.2.3.2 titled "Weighted sample */ |
583 | | /* prediction process" for chroma. The function gets one */ |
584 | | /* ht x wd block, weights it, rounds it off, offsets it, */ |
585 | | /* saturates it to unsigned 8-bit and stores it in the */ |
586 | | /* destination block. (ht,wd) can be (2,2), (4,2), (2,4), */ |
587 | | /* (4,4), (8,4), (4,8) or (8,8). */ |
588 | | /* */ |
589 | | /* Inputs : pu1_src - Pointer to source */ |
590 | | /* pu1_dst - Pointer to destination */ |
591 | | /* src_strd - stride for source */ |
592 | | /* dst_strd - stride for destination */ |
593 | | /* log_wd - number of bits to be rounded off */ |
594 | | /* wt - weight values for u and v */ |
595 | | /* ofst - offset values for u and v */ |
596 | | /* ht - height of the block */ |
597 | | /* wd - width of the block */ |
598 | | /* */ |
599 | | /* Issues : None */ |
600 | | /* */ |
601 | | /* Revision History: */ |
602 | | /* */ |
603 | | /* DD MM YYYY Author(s) Changes */ |
604 | | /* 04 02 2015 Kaushik Initial Version */ |
605 | | /* Senthoor */ |
606 | | /* */ |
607 | | /*****************************************************************************/ |
608 | | void ih264_weighted_pred_chroma_sse42(UWORD8 *pu1_src, |
609 | | UWORD8 *pu1_dst, |
610 | | WORD32 src_strd, |
611 | | WORD32 dst_strd, |
612 | | WORD32 log_wd, |
613 | | WORD32 wt, |
614 | | WORD32 ofst, |
615 | | WORD32 ht, |
616 | | WORD32 wd) |
617 | 6.08M | { |
618 | 6.08M | __m128i y_0_16x8b, y_1_16x8b; |
619 | | |
620 | 6.08M | __m128i wt_8x16b, round_8x16b, ofst_8x16b; |
621 | | |
622 | 6.08M | WORD32 ofst_u, ofst_v; |
623 | 6.08M | WORD32 round_val; |
624 | | |
625 | 6.08M | ofst_u = (WORD8)(ofst & 0xff); |
626 | 6.08M | ofst_v = (WORD8)(ofst >> 8); |
627 | 6.08M | round_val = 1 << (log_wd - 1); |
628 | 6.08M | ofst = (ofst_u & 0xffff) | (ofst_v << 16); |
629 | | |
630 | 6.08M | wt_8x16b = _mm_set1_epi32(wt); |
631 | 6.08M | round_8x16b = _mm_set1_epi16(round_val); |
632 | 6.08M | ofst_8x16b = _mm_set1_epi32(ofst); |
633 | | |
634 | 6.08M | if(wd == 2) |
635 | 72.8k | { |
636 | 72.8k | __m128i y_0_8x16b; |
637 | | |
638 | 72.8k | do |
639 | 132k | { |
640 | 132k | y_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src); |
641 | 132k | y_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd)); |
642 | | |
643 | 132k | y_0_16x8b = _mm_unpacklo_epi32(y_0_16x8b, y_1_16x8b); |
644 | | |
645 | 132k | y_0_8x16b = _mm_cvtepu8_epi16(y_0_16x8b); |
646 | | |
647 | 132k | y_0_8x16b = _mm_mullo_epi16(y_0_8x16b, wt_8x16b); |
648 | | |
649 | 132k | y_0_8x16b = _mm_adds_epi16(round_8x16b, y_0_8x16b); |
650 | | |
651 | 132k | y_0_8x16b = _mm_srai_epi16(y_0_8x16b, log_wd); |
652 | | |
653 | 132k | y_0_8x16b = _mm_adds_epi16(ofst_8x16b, y_0_8x16b); |
654 | | |
655 | 132k | y_0_16x8b = _mm_packus_epi16(y_0_8x16b, y_0_8x16b); |
656 | 132k | y_1_16x8b = _mm_srli_si128(y_0_16x8b, 4); |
657 | | |
658 | 132k | *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(y_0_16x8b); |
659 | 132k | *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(y_1_16x8b); |
660 | | |
661 | 132k | ht -= 2; |
662 | 132k | pu1_src += src_strd << 1; |
663 | 132k | pu1_dst += dst_strd << 1; |
664 | 132k | } |
665 | 132k | while(ht > 0); |
666 | 72.8k | } |
667 | 6.01M | else if(wd == 4) |
668 | 76.0k | { |
669 | 76.0k | __m128i y_0_8x16b, y_1_8x16b; |
670 | | |
671 | 76.0k | do |
672 | 186k | { |
673 | 186k | y_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src); |
674 | 186k | y_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd)); |
675 | | |
676 | 186k | y_0_8x16b = _mm_cvtepu8_epi16(y_0_16x8b); |
677 | 186k | y_1_8x16b = _mm_cvtepu8_epi16(y_1_16x8b); |
678 | | |
679 | 186k | y_0_8x16b = _mm_mullo_epi16(y_0_8x16b, wt_8x16b); |
680 | 186k | y_1_8x16b = _mm_mullo_epi16(y_1_8x16b, wt_8x16b); |
681 | | |
682 | 186k | y_0_8x16b = _mm_adds_epi16(round_8x16b, y_0_8x16b); |
683 | 186k | y_1_8x16b = _mm_adds_epi16(round_8x16b, y_1_8x16b); |
684 | | |
685 | 186k | y_0_8x16b = _mm_srai_epi16(y_0_8x16b, log_wd); |
686 | 186k | y_1_8x16b = _mm_srai_epi16(y_1_8x16b, log_wd); |
687 | | |
688 | 186k | y_0_8x16b = _mm_adds_epi16(ofst_8x16b, y_0_8x16b); |
689 | 186k | y_1_8x16b = _mm_adds_epi16(ofst_8x16b, y_1_8x16b); |
690 | | |
691 | 186k | y_0_16x8b = _mm_packus_epi16(y_0_8x16b, y_1_8x16b); |
692 | 186k | y_1_16x8b = _mm_srli_si128(y_0_16x8b, 8); |
693 | | |
694 | 186k | _mm_storel_epi64((__m128i *)pu1_dst, y_0_16x8b); |
695 | 186k | _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), y_1_16x8b); |
696 | | |
697 | 186k | ht -= 2; |
698 | 186k | pu1_src += src_strd << 1; |
699 | 186k | pu1_dst += dst_strd << 1; |
700 | 186k | } |
701 | 186k | while(ht > 0); |
702 | 76.0k | } |
703 | 5.93M | else // wd == 16 |
704 | 5.93M | { |
705 | 5.93M | __m128i y_2_16x8b, y_3_16x8b; |
706 | 5.93M | __m128i y_0L_8x16b, y_1L_8x16b, y_2L_8x16b, y_3L_8x16b; |
707 | 5.93M | __m128i y_0H_8x16b, y_1H_8x16b, y_2H_8x16b, y_3H_8x16b; |
708 | | |
709 | 5.93M | __m128i zero_16x8b; |
710 | 5.93M | zero_16x8b = _mm_set1_epi8(0); |
711 | | |
712 | 5.93M | do |
713 | 11.8M | { |
714 | 11.8M | y_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src); |
715 | 11.8M | y_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd)); |
716 | 11.8M | y_2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + (src_strd << 1))); |
717 | 11.8M | y_3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd * 3)); |
718 | | |
719 | 11.8M | y_0L_8x16b = _mm_cvtepu8_epi16(y_0_16x8b); |
720 | 11.8M | y_0H_8x16b = _mm_unpackhi_epi8(y_0_16x8b, zero_16x8b); |
721 | 11.8M | y_1L_8x16b = _mm_cvtepu8_epi16(y_1_16x8b); |
722 | 11.8M | y_1H_8x16b = _mm_unpackhi_epi8(y_1_16x8b, zero_16x8b); |
723 | 11.8M | y_2L_8x16b = _mm_cvtepu8_epi16(y_2_16x8b); |
724 | 11.8M | y_2H_8x16b = _mm_unpackhi_epi8(y_2_16x8b, zero_16x8b); |
725 | 11.8M | y_3L_8x16b = _mm_cvtepu8_epi16(y_3_16x8b); |
726 | 11.8M | y_3H_8x16b = _mm_unpackhi_epi8(y_3_16x8b, zero_16x8b); |
727 | | |
728 | 11.8M | y_0L_8x16b = _mm_mullo_epi16(y_0L_8x16b, wt_8x16b); |
729 | 11.8M | y_0H_8x16b = _mm_mullo_epi16(y_0H_8x16b, wt_8x16b); |
730 | 11.8M | y_1L_8x16b = _mm_mullo_epi16(y_1L_8x16b, wt_8x16b); |
731 | 11.8M | y_1H_8x16b = _mm_mullo_epi16(y_1H_8x16b, wt_8x16b); |
732 | 11.8M | y_2L_8x16b = _mm_mullo_epi16(y_2L_8x16b, wt_8x16b); |
733 | 11.8M | y_2H_8x16b = _mm_mullo_epi16(y_2H_8x16b, wt_8x16b); |
734 | 11.8M | y_3L_8x16b = _mm_mullo_epi16(y_3L_8x16b, wt_8x16b); |
735 | 11.8M | y_3H_8x16b = _mm_mullo_epi16(y_3H_8x16b, wt_8x16b); |
736 | | |
737 | 11.8M | y_0L_8x16b = _mm_adds_epi16(round_8x16b, y_0L_8x16b); |
738 | 11.8M | y_0H_8x16b = _mm_adds_epi16(round_8x16b, y_0H_8x16b); |
739 | 11.8M | y_1L_8x16b = _mm_adds_epi16(round_8x16b, y_1L_8x16b); |
740 | 11.8M | y_1H_8x16b = _mm_adds_epi16(round_8x16b, y_1H_8x16b); |
741 | 11.8M | y_2L_8x16b = _mm_adds_epi16(round_8x16b, y_2L_8x16b); |
742 | 11.8M | y_2H_8x16b = _mm_adds_epi16(round_8x16b, y_2H_8x16b); |
743 | 11.8M | y_3L_8x16b = _mm_adds_epi16(round_8x16b, y_3L_8x16b); |
744 | 11.8M | y_3H_8x16b = _mm_adds_epi16(round_8x16b, y_3H_8x16b); |
745 | | |
746 | 11.8M | y_0L_8x16b = _mm_srai_epi16(y_0L_8x16b, log_wd); |
747 | 11.8M | y_0H_8x16b = _mm_srai_epi16(y_0H_8x16b, log_wd); |
748 | 11.8M | y_1L_8x16b = _mm_srai_epi16(y_1L_8x16b, log_wd); |
749 | 11.8M | y_1H_8x16b = _mm_srai_epi16(y_1H_8x16b, log_wd); |
750 | 11.8M | y_2L_8x16b = _mm_srai_epi16(y_2L_8x16b, log_wd); |
751 | 11.8M | y_2H_8x16b = _mm_srai_epi16(y_2H_8x16b, log_wd); |
752 | 11.8M | y_3L_8x16b = _mm_srai_epi16(y_3L_8x16b, log_wd); |
753 | 11.8M | y_3H_8x16b = _mm_srai_epi16(y_3H_8x16b, log_wd); |
754 | | |
755 | 11.8M | y_0L_8x16b = _mm_adds_epi16(ofst_8x16b, y_0L_8x16b); |
756 | 11.8M | y_0H_8x16b = _mm_adds_epi16(ofst_8x16b, y_0H_8x16b); |
757 | 11.8M | y_1L_8x16b = _mm_adds_epi16(ofst_8x16b, y_1L_8x16b); |
758 | 11.8M | y_1H_8x16b = _mm_adds_epi16(ofst_8x16b, y_1H_8x16b); |
759 | 11.8M | y_2L_8x16b = _mm_adds_epi16(ofst_8x16b, y_2L_8x16b); |
760 | 11.8M | y_2H_8x16b = _mm_adds_epi16(ofst_8x16b, y_2H_8x16b); |
761 | 11.8M | y_3L_8x16b = _mm_adds_epi16(ofst_8x16b, y_3L_8x16b); |
762 | 11.8M | y_3H_8x16b = _mm_adds_epi16(ofst_8x16b, y_3H_8x16b); |
763 | | |
764 | 11.8M | y_0_16x8b = _mm_packus_epi16(y_0L_8x16b, y_0H_8x16b); |
765 | 11.8M | y_1_16x8b = _mm_packus_epi16(y_1L_8x16b, y_1H_8x16b); |
766 | 11.8M | y_2_16x8b = _mm_packus_epi16(y_2L_8x16b, y_2H_8x16b); |
767 | 11.8M | y_3_16x8b = _mm_packus_epi16(y_3L_8x16b, y_3H_8x16b); |
768 | | |
769 | 11.8M | _mm_storeu_si128((__m128i *)pu1_dst, y_0_16x8b); |
770 | 11.8M | _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), y_1_16x8b); |
771 | 11.8M | _mm_storeu_si128((__m128i *)(pu1_dst + (dst_strd << 1)), y_2_16x8b); |
772 | 11.8M | _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 3), y_3_16x8b); |
773 | | |
774 | 11.8M | ht -= 4; |
775 | 11.8M | pu1_src += src_strd << 2; |
776 | 11.8M | pu1_dst += dst_strd << 2; |
777 | 11.8M | } |
778 | 11.8M | while(ht > 0); |
779 | 5.93M | } |
780 | 6.08M | } |
781 | | |
782 | | /*****************************************************************************/ |
783 | | /* */ |
784 | | /* Function Name : ih264_weighted_bi_pred_luma_sse42 */ |
785 | | /* */ |
786 | | /* Description : This function performs the weighted biprediction as */ |
787 | | /* described in sec 8.4.2.3.2 titled "Weighted sample */ |
788 | | /* prediction process" for luma. The function gets two */ |
789 | | /* ht x wd blocks, weights them, adds them, rounds off the */ |
790 | | /* sum, offsets it, saturates it to unsigned 8-bit and */ |
791 | | /* stores it in the destination block. (ht,wd) can be */ |
792 | | /* (4,4), (8,4), (4,8), (8,8), (16,8), (8,16) or (16,16). */ |
793 | | /* */ |
794 | | /* Inputs : pu1_src1 - Pointer to source 1 */ |
795 | | /* pu1_src2 - Pointer to source 2 */ |
796 | | /* pu1_dst - Pointer to destination */ |
797 | | /* src_strd1 - stride for source 1 */ |
798 | | /* src_strd2 - stride for source 2 */ |
/*                   dst_strd - stride for destination                       */
800 | | /* log_wd - number of bits to be rounded off */ |
801 | | /* wt1 - weight value for source 1 */ |
802 | | /* wt2 - weight value for source 2 */ |
803 | | /* ofst1 - offset value for source 1 */ |
804 | | /* ofst2 - offset value for source 2 */ |
805 | | /* ht - height of the block */ |
806 | | /* wd - width of the block */ |
807 | | /* */ |
808 | | /* Issues : None */ |
809 | | /* */ |
810 | | /* Revision History: */ |
811 | | /* */ |
812 | | /* DD MM YYYY Author(s) Changes */ |
813 | | /* 04 02 2015 Kaushik Initial Version */ |
814 | | /* Senthoor */ |
815 | | /* */ |
816 | | /*****************************************************************************/ |
void ih264_weighted_bi_pred_luma_sse42(UWORD8 *pu1_src1,
                                       UWORD8 *pu1_src2,
                                       UWORD8 *pu1_dst,
                                       WORD32 src_strd1,
                                       WORD32 src_strd2,
                                       WORD32 dst_strd,
                                       WORD32 log_wd,
                                       WORD32 wt1,
                                       WORD32 wt2,
                                       WORD32 ofst1,
                                       WORD32 ofst2,
                                       WORD32 ht,
                                       WORD32 wd)
{
    /* Computes, per sample (sec 8.4.2.3.2 weighted bi-prediction):         */
    /*   dst = sat_u8(((src1*wt1 + src2*wt2 + (1 << log_wd))                */
    /*                 >> (log_wd + 1)) + ((ofst1 + ofst2 + 1) >> 1))       */
    /* All arithmetic is done in 16-bit lanes with saturating adds.         */
    __m128i y1_0_16x8b, y1_1_16x8b;
    __m128i y2_0_16x8b, y2_1_16x8b;

    __m128i wt1_8x16b, wt2_8x16b;
    __m128i ofst_8x16b, round_8x16b;

    WORD32 ofst;
    WORD32 round_val, shft;

    /* Weights arrive packed in the low 16 bits of the WORD32 arguments;    */
    /* the WORD16 cast sign-extends so negative weights are preserved.      */
    wt1 = (WORD16)(wt1 & 0xffff);
    wt2 = (WORD16)(wt2 & 0xffff);
    round_val = 1 << log_wd;   /* rounding term added before the shift      */
    shft = log_wd + 1;         /* bi-pred shifts one bit more than uni-pred */
    /* Offsets arrive in the low byte; sign-extend, then average the two    */
    /* offsets with upward rounding as the spec prescribes for bi-pred.     */
    ofst1 = (WORD8)(ofst1 & 0xff);
    ofst2 = (WORD8)(ofst2 & 0xff);
    ofst = (ofst1 + ofst2 + 1) >> 1;

    /* Broadcast the scalar constants across all eight 16-bit lanes. */
    wt1_8x16b = _mm_set1_epi16(wt1);
    wt2_8x16b = _mm_set1_epi16(wt2);
    round_8x16b = _mm_set1_epi16(round_val);
    ofst_8x16b = _mm_set1_epi16(ofst);

    /* The loops below consume 4 (wd==4/8) or 2 (wd==16) rows per          */
    /* iteration, so ht must be a multiple of that count; per the header,   */
    /* ht is one of {4, 8, 16}.                                             */
    if(wd == 4)
    {
        __m128i y1_2_16x8b, y1_3_16x8b;
        __m128i y2_2_16x8b, y2_3_16x8b;

        __m128i y1_0_8x16b, y1_2_8x16b;
        __m128i y2_0_8x16b, y2_2_8x16b;

        do
        {
            /* Load four 4-pixel rows from each source (only the low 4      */
            /* bytes of each 8-byte load are used).                         */
            y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
            y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));
            y1_2_16x8b = _mm_loadl_epi64(
                            (__m128i *)(pu1_src1 + (src_strd1 << 1)));
            y1_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1 * 3));

            y2_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
            y2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));
            y2_2_16x8b = _mm_loadl_epi64(
                            (__m128i *)(pu1_src2 + (src_strd2 << 1)));
            y2_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2 * 3));

            /* Pair up rows (0,1) and (2,3) so each register holds 8 bytes. */
            y1_0_16x8b = _mm_unpacklo_epi32(y1_0_16x8b, y1_1_16x8b);
            y1_2_16x8b = _mm_unpacklo_epi32(y1_2_16x8b, y1_3_16x8b);
            y2_0_16x8b = _mm_unpacklo_epi32(y2_0_16x8b, y2_1_16x8b);
            y2_2_16x8b = _mm_unpacklo_epi32(y2_2_16x8b, y2_3_16x8b);

            /* Widen u8 -> s16 for the weighted arithmetic. */
            y1_0_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b);
            y1_2_8x16b = _mm_cvtepu8_epi16(y1_2_16x8b);
            y2_0_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b);
            y2_2_8x16b = _mm_cvtepu8_epi16(y2_2_16x8b);

            /* src1*wt1 and src2*wt2 */
            y1_0_8x16b = _mm_mullo_epi16(y1_0_8x16b, wt1_8x16b);
            y2_0_8x16b = _mm_mullo_epi16(y2_0_8x16b, wt2_8x16b);
            y1_2_8x16b = _mm_mullo_epi16(y1_2_8x16b, wt1_8x16b);
            y2_2_8x16b = _mm_mullo_epi16(y2_2_8x16b, wt2_8x16b);

            /* Sum the two weighted predictions (saturating). */
            y1_0_8x16b = _mm_adds_epi16(y1_0_8x16b, y2_0_8x16b);
            y1_2_8x16b = _mm_adds_epi16(y1_2_8x16b, y2_2_8x16b);

            /* + round, >> (log_wd + 1), + averaged offset */
            y1_0_8x16b = _mm_adds_epi16(round_8x16b, y1_0_8x16b);
            y1_2_8x16b = _mm_adds_epi16(round_8x16b, y1_2_8x16b);

            y1_0_8x16b = _mm_srai_epi16(y1_0_8x16b, shft);
            y1_2_8x16b = _mm_srai_epi16(y1_2_8x16b, shft);

            y1_0_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0_8x16b);
            y1_2_8x16b = _mm_adds_epi16(ofst_8x16b, y1_2_8x16b);

            /* Pack back to u8 with saturation; the four rows now sit in    */
            /* consecutive 4-byte fields, shifted out one row at a time.    */
            y1_0_16x8b = _mm_packus_epi16(y1_0_8x16b, y1_2_8x16b);
            y1_1_16x8b = _mm_srli_si128(y1_0_16x8b, 4);
            y1_2_16x8b = _mm_srli_si128(y1_0_16x8b, 8);
            y1_3_16x8b = _mm_srli_si128(y1_0_16x8b, 12);

            /* NOTE(review): 4-byte stores through a WORD32* assume the     */
            /* destination tolerates unaligned 32-bit access (true on x86). */
            *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(y1_0_16x8b);
            *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(y1_1_16x8b);
            *((WORD32 *)(pu1_dst + (dst_strd << 1))) = _mm_cvtsi128_si32(y1_2_16x8b);
            *((WORD32 *)(pu1_dst + dst_strd * 3)) = _mm_cvtsi128_si32(y1_3_16x8b);


            ht -= 4;
            pu1_src1 += src_strd1 << 2;
            pu1_src2 += src_strd2 << 2;
            pu1_dst += dst_strd << 2;
        }
        while(ht > 0);
    }
    else if(wd == 8)
    {
        __m128i y1_2_16x8b, y1_3_16x8b;
        __m128i y2_2_16x8b, y2_3_16x8b;

        __m128i y1_0_8x16b, y1_1_8x16b, y1_2_8x16b, y1_3_8x16b;
        __m128i y2_0_8x16b, y2_1_8x16b, y2_2_8x16b, y2_3_8x16b;

        do
        {
            /* Load four 8-pixel rows from each source. */
            y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
            y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));
            y1_2_16x8b = _mm_loadl_epi64(
                            (__m128i *)(pu1_src1 + (src_strd1 << 1)));
            y1_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1 * 3));

            y2_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
            y2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));
            y2_2_16x8b = _mm_loadl_epi64(
                            (__m128i *)(pu1_src2 + (src_strd2 << 1)));
            y2_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2 * 3));

            /* Widen u8 -> s16; one register per row. */
            y1_0_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b);
            y1_1_8x16b = _mm_cvtepu8_epi16(y1_1_16x8b);
            y1_2_8x16b = _mm_cvtepu8_epi16(y1_2_16x8b);
            y1_3_8x16b = _mm_cvtepu8_epi16(y1_3_16x8b);

            y2_0_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b);
            y2_1_8x16b = _mm_cvtepu8_epi16(y2_1_16x8b);
            y2_2_8x16b = _mm_cvtepu8_epi16(y2_2_16x8b);
            y2_3_8x16b = _mm_cvtepu8_epi16(y2_3_16x8b);

            /* src1*wt1 and src2*wt2 per row */
            y1_0_8x16b = _mm_mullo_epi16(y1_0_8x16b, wt1_8x16b);
            y2_0_8x16b = _mm_mullo_epi16(y2_0_8x16b, wt2_8x16b);
            y1_1_8x16b = _mm_mullo_epi16(y1_1_8x16b, wt1_8x16b);
            y2_1_8x16b = _mm_mullo_epi16(y2_1_8x16b, wt2_8x16b);

            y1_2_8x16b = _mm_mullo_epi16(y1_2_8x16b, wt1_8x16b);
            y2_2_8x16b = _mm_mullo_epi16(y2_2_8x16b, wt2_8x16b);
            y1_3_8x16b = _mm_mullo_epi16(y1_3_8x16b, wt1_8x16b);
            y2_3_8x16b = _mm_mullo_epi16(y2_3_8x16b, wt2_8x16b);

            /* Sum the two weighted predictions (saturating). */
            y1_0_8x16b = _mm_adds_epi16(y1_0_8x16b, y2_0_8x16b);
            y1_1_8x16b = _mm_adds_epi16(y1_1_8x16b, y2_1_8x16b);
            y1_2_8x16b = _mm_adds_epi16(y1_2_8x16b, y2_2_8x16b);
            y1_3_8x16b = _mm_adds_epi16(y1_3_8x16b, y2_3_8x16b);

            /* + round, >> (log_wd + 1), + averaged offset */
            y1_0_8x16b = _mm_adds_epi16(round_8x16b, y1_0_8x16b);
            y1_1_8x16b = _mm_adds_epi16(round_8x16b, y1_1_8x16b);
            y1_2_8x16b = _mm_adds_epi16(round_8x16b, y1_2_8x16b);
            y1_3_8x16b = _mm_adds_epi16(round_8x16b, y1_3_8x16b);

            y1_0_8x16b = _mm_srai_epi16(y1_0_8x16b, shft);
            y1_1_8x16b = _mm_srai_epi16(y1_1_8x16b, shft);
            y1_2_8x16b = _mm_srai_epi16(y1_2_8x16b, shft);
            y1_3_8x16b = _mm_srai_epi16(y1_3_8x16b, shft);

            y1_0_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0_8x16b);
            y1_1_8x16b = _mm_adds_epi16(ofst_8x16b, y1_1_8x16b);
            y1_2_8x16b = _mm_adds_epi16(ofst_8x16b, y1_2_8x16b);
            y1_3_8x16b = _mm_adds_epi16(ofst_8x16b, y1_3_8x16b);

            /* Pack back to u8 with saturation; rows 1 and 3 end up in the  */
            /* high halves and are shifted down before storing.             */
            y1_0_16x8b = _mm_packus_epi16(y1_0_8x16b, y1_1_8x16b);
            y1_2_16x8b = _mm_packus_epi16(y1_2_8x16b, y1_3_8x16b);
            y1_1_16x8b = _mm_srli_si128(y1_0_16x8b, 8);
            y1_3_16x8b = _mm_srli_si128(y1_2_16x8b, 8);

            _mm_storel_epi64((__m128i *)pu1_dst, y1_0_16x8b);
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), y1_1_16x8b);
            _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd << 1)), y1_2_16x8b);
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd * 3), y1_3_16x8b);

            ht -= 4;
            pu1_src1 += src_strd1 << 2;
            pu1_src2 += src_strd2 << 2;
            pu1_dst += dst_strd << 2;
        }
        while(ht > 0);
    }
    else // wd == 16
    {
        __m128i y1_0L_8x16b, y1_0H_8x16b, y1_1L_8x16b, y1_1H_8x16b;
        __m128i y2_0L_8x16b, y2_0H_8x16b, y2_1L_8x16b, y2_1H_8x16b;

        __m128i zero_16x8b;
        zero_16x8b = _mm_set1_epi8(0);

        /* Two 16-pixel rows per iteration; each row is split into a low    */
        /* (L) and high (H) 8-lane half for 16-bit arithmetic.              */
        do
        {
            y1_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src1);
            y1_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1));
            y2_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src2);
            y2_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2));

            /* Widen u8 -> s16: cvtepu8 for the low half, unpackhi with     */
            /* zero for the high half.                                      */
            y1_0L_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b);
            y1_0H_8x16b = _mm_unpackhi_epi8(y1_0_16x8b, zero_16x8b);
            y1_1L_8x16b = _mm_cvtepu8_epi16(y1_1_16x8b);
            y1_1H_8x16b = _mm_unpackhi_epi8(y1_1_16x8b, zero_16x8b);

            y2_0L_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b);
            y2_0H_8x16b = _mm_unpackhi_epi8(y2_0_16x8b, zero_16x8b);
            y2_1L_8x16b = _mm_cvtepu8_epi16(y2_1_16x8b);
            y2_1H_8x16b = _mm_unpackhi_epi8(y2_1_16x8b, zero_16x8b);

            /* src1*wt1 and src2*wt2 per half-row */
            y1_0L_8x16b = _mm_mullo_epi16(y1_0L_8x16b, wt1_8x16b);
            y1_0H_8x16b = _mm_mullo_epi16(y1_0H_8x16b, wt1_8x16b);
            y1_1L_8x16b = _mm_mullo_epi16(y1_1L_8x16b, wt1_8x16b);
            y1_1H_8x16b = _mm_mullo_epi16(y1_1H_8x16b, wt1_8x16b);

            y2_0L_8x16b = _mm_mullo_epi16(y2_0L_8x16b, wt2_8x16b);
            y2_0H_8x16b = _mm_mullo_epi16(y2_0H_8x16b, wt2_8x16b);
            y2_1L_8x16b = _mm_mullo_epi16(y2_1L_8x16b, wt2_8x16b);
            y2_1H_8x16b = _mm_mullo_epi16(y2_1H_8x16b, wt2_8x16b);

            /* Sum the two weighted predictions (saturating). */
            y1_0L_8x16b = _mm_adds_epi16(y1_0L_8x16b, y2_0L_8x16b);
            y1_0H_8x16b = _mm_adds_epi16(y1_0H_8x16b, y2_0H_8x16b);
            y1_1L_8x16b = _mm_adds_epi16(y1_1L_8x16b, y2_1L_8x16b);
            y1_1H_8x16b = _mm_adds_epi16(y1_1H_8x16b, y2_1H_8x16b);

            /* + round, >> (log_wd + 1), + averaged offset */
            y1_0L_8x16b = _mm_adds_epi16(round_8x16b, y1_0L_8x16b);
            y1_0H_8x16b = _mm_adds_epi16(round_8x16b, y1_0H_8x16b);
            y1_1L_8x16b = _mm_adds_epi16(round_8x16b, y1_1L_8x16b);
            y1_1H_8x16b = _mm_adds_epi16(round_8x16b, y1_1H_8x16b);

            y1_0L_8x16b = _mm_srai_epi16(y1_0L_8x16b, shft);
            y1_0H_8x16b = _mm_srai_epi16(y1_0H_8x16b, shft);
            y1_1L_8x16b = _mm_srai_epi16(y1_1L_8x16b, shft);
            y1_1H_8x16b = _mm_srai_epi16(y1_1H_8x16b, shft);

            y1_0L_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0L_8x16b);
            y1_0H_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0H_8x16b);
            y1_1L_8x16b = _mm_adds_epi16(ofst_8x16b, y1_1L_8x16b);
            y1_1H_8x16b = _mm_adds_epi16(ofst_8x16b, y1_1H_8x16b);

            /* Re-join the halves as saturated u8 rows and store. */
            y1_0_16x8b = _mm_packus_epi16(y1_0L_8x16b, y1_0H_8x16b);
            y1_1_16x8b = _mm_packus_epi16(y1_1L_8x16b, y1_1H_8x16b);

            _mm_storeu_si128((__m128i *)pu1_dst, y1_0_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), y1_1_16x8b);

            ht -= 2;
            pu1_src1 += src_strd1 << 1;
            pu1_src2 += src_strd2 << 1;
            pu1_dst += dst_strd << 1;
        }
        while(ht > 0);
    }
}
1068 | | |
1069 | | /*****************************************************************************/ |
1070 | | /* */ |
1071 | | /* Function Name : ih264_weighted_bi_pred_chroma_sse42 */ |
1072 | | /* */ |
1073 | | /* Description : This function performs the weighted biprediction as */ |
1074 | | /* described in sec 8.4.2.3.2 titled "Weighted sample */ |
1075 | | /* prediction process" for chroma. The function gets two */ |
1076 | | /* ht x wd blocks, weights them, adds them, rounds off the */ |
1077 | | /* sum, offsets it, saturates it to unsigned 8-bit and */ |
1078 | | /* stores it in the destination block. (ht,wd) can be */ |
1079 | | /* (2,2), (4,2), (2,4), (4,4), (8,4), (4,8) or (8,8). */ |
1080 | | /* */ |
1081 | | /* Inputs : pu1_src1 - Pointer to source 1 */ |
1082 | | /* pu1_src2 - Pointer to source 2 */ |
1083 | | /* pu1_dst - Pointer to destination */ |
1084 | | /* src_strd1 - stride for source 1 */ |
1085 | | /* src_strd2 - stride for source 2 */ |
/*                   dst_strd - stride for destination                       */
1087 | | /* log_wd - number of bits to be rounded off */ |
1088 | | /* wt1 - weight values for u and v in source 1 */ |
1089 | | /* wt2 - weight values for u and v in source 2 */ |
1090 | | /* ofst1 - offset value for u and v in source 1 */ |
1091 | | /* ofst2 - offset value for u and v in source 2 */ |
1092 | | /* ht - height of the block */ |
1093 | | /* wd - width of the block */ |
1094 | | /* */ |
1095 | | /* Issues : None */ |
1096 | | /* */ |
1097 | | /* Revision History: */ |
1098 | | /* */ |
1099 | | /* DD MM YYYY Author(s) Changes */ |
1100 | | /* 04 02 2015 Kaushik Initial Version */ |
1101 | | /* Senthoor */ |
1102 | | /* */ |
1103 | | /*****************************************************************************/ |
void ih264_weighted_bi_pred_chroma_sse42(UWORD8 *pu1_src1,
                                         UWORD8 *pu1_src2,
                                         UWORD8 *pu1_dst,
                                         WORD32 src_strd1,
                                         WORD32 src_strd2,
                                         WORD32 dst_strd,
                                         WORD32 log_wd,
                                         WORD32 wt1,
                                         WORD32 wt2,
                                         WORD32 ofst1,
                                         WORD32 ofst2,
                                         WORD32 ht,
                                         WORD32 wd)
{
    /* Weighted bi-prediction for interleaved-UV chroma (sec 8.4.2.3.2):    */
    /*   dst = sat_u8(((src1*wt1 + src2*wt2 + (1 << log_wd))                */
    /*                 >> (log_wd + 1)) + ((ofst1 + ofst2 + 1) >> 1))       */
    /* wt1/wt2 pack the u and v weights (per the header), ofst1/ofst2 pack  */
    /* the u offset in the low byte and the v offset in the next byte.      */
    __m128i y1_0_16x8b, y1_1_16x8b;
    __m128i y2_0_16x8b, y2_1_16x8b;

    __m128i wt1_8x16b, wt2_8x16b;
    __m128i ofst_8x16b, round_8x16b;

    WORD32 ofst1_u, ofst2_u, ofst_u;
    WORD32 ofst1_v, ofst2_v, ofst_v;
    WORD32 round_val, shft, ofst_val;

    round_val = 1 << log_wd;   /* rounding term added before the shift      */
    shft = log_wd + 1;         /* bi-pred shifts one bit more than uni-pred */

    /* Extract and sign-extend the per-plane offsets. */
    ofst1_u = (WORD8)(ofst1 & 0xff);
    ofst1_v = (WORD8)(ofst1 >> 8);
    ofst2_u = (WORD8)(ofst2 & 0xff);
    ofst2_v = (WORD8)(ofst2 >> 8);

    /* set1_epi32 replicates the packed (u, v) 16-bit weight pair so each   */
    /* lane of the weight vector lines up with the interleaved UV samples.  */
    wt1_8x16b = _mm_set1_epi32(wt1);
    wt2_8x16b = _mm_set1_epi32(wt2);

    /* Average the source-1 and source-2 offsets per plane with upward      */
    /* rounding, then repack them as alternating (u, v) 16-bit lanes.       */
    ofst_u = (ofst1_u + ofst2_u + 1) >> 1;
    ofst_v = (ofst1_v + ofst2_v + 1) >> 1;
    ofst_val = (ofst_u & 0xffff) | (ofst_v << 16);

    round_8x16b = _mm_set1_epi16(round_val);
    ofst_8x16b = _mm_set1_epi32(ofst_val);

    /* wd counts UV pairs; every loop consumes 2 rows, so ht must be even   */
    /* (per the header, ht is one of {2, 4, 8}).                            */
    if(wd == 2)
    {
        __m128i y1_0_8x16b, y2_0_8x16b;

        do
        {
            /* Load two rows from each source; only the low 4 bytes         */
            /* (2 UV pairs) of each 8-byte load are used.                   */
            y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
            y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));

            y2_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
            y2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));

            /* Merge the two rows into one register, widen u8 -> s16. */
            y1_0_16x8b = _mm_unpacklo_epi32(y1_0_16x8b, y1_1_16x8b);
            y2_0_16x8b = _mm_unpacklo_epi32(y2_0_16x8b, y2_1_16x8b);

            y1_0_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b);
            y2_0_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b);

            /* src1*wt1 + src2*wt2, + round, >> (log_wd+1), + offset        */
            /* (all adds saturating).                                       */
            y1_0_8x16b = _mm_mullo_epi16(y1_0_8x16b, wt1_8x16b);
            y2_0_8x16b = _mm_mullo_epi16(y2_0_8x16b, wt2_8x16b);

            y1_0_8x16b = _mm_adds_epi16(y1_0_8x16b, y2_0_8x16b);
            y1_0_8x16b = _mm_adds_epi16(round_8x16b, y1_0_8x16b);

            y1_0_8x16b = _mm_srai_epi16(y1_0_8x16b, shft);
            y1_0_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0_8x16b);

            /* Pack to u8; the second row sits in bytes 4..7. */
            y1_0_16x8b = _mm_packus_epi16(y1_0_8x16b, y1_0_8x16b);
            y1_1_16x8b = _mm_srli_si128(y1_0_16x8b, 4);

            /* NOTE(review): 4-byte stores through a WORD32* assume the     */
            /* destination tolerates unaligned 32-bit access (true on x86). */
            *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(y1_0_16x8b);
            *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(y1_1_16x8b);

            ht -= 2;
            pu1_src1 += src_strd1 << 1;
            pu1_src2 += src_strd2 << 1;
            pu1_dst += dst_strd << 1;
        }
        while(ht > 0);
    }
    else if(wd == 4)
    {
        __m128i y1_0_8x16b, y1_1_8x16b;
        __m128i y2_0_8x16b, y2_1_8x16b;

        do
        {
            /* Load two 8-byte rows (4 UV pairs each) from each source. */
            y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
            y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));

            y2_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
            y2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));

            /* Widen u8 -> s16; one register per row. */
            y1_0_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b);
            y1_1_8x16b = _mm_cvtepu8_epi16(y1_1_16x8b);

            y2_0_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b);
            y2_1_8x16b = _mm_cvtepu8_epi16(y2_1_16x8b);

            /* src1*wt1 and src2*wt2 */
            y1_0_8x16b = _mm_mullo_epi16(y1_0_8x16b, wt1_8x16b);
            y2_0_8x16b = _mm_mullo_epi16(y2_0_8x16b, wt2_8x16b);
            y1_1_8x16b = _mm_mullo_epi16(y1_1_8x16b, wt1_8x16b);
            y2_1_8x16b = _mm_mullo_epi16(y2_1_8x16b, wt2_8x16b);

            /* Sum, + round, >> (log_wd + 1), + offset (saturating adds). */
            y1_0_8x16b = _mm_adds_epi16(y1_0_8x16b, y2_0_8x16b);
            y1_1_8x16b = _mm_adds_epi16(y1_1_8x16b, y2_1_8x16b);

            y1_0_8x16b = _mm_adds_epi16(round_8x16b, y1_0_8x16b);
            y1_1_8x16b = _mm_adds_epi16(round_8x16b, y1_1_8x16b);

            y1_0_8x16b = _mm_srai_epi16(y1_0_8x16b, shft);
            y1_1_8x16b = _mm_srai_epi16(y1_1_8x16b, shft);

            y1_0_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0_8x16b);
            y1_1_8x16b = _mm_adds_epi16(ofst_8x16b, y1_1_8x16b);

            /* Pack to u8; row 1 occupies the high 8 bytes. */
            y1_0_16x8b = _mm_packus_epi16(y1_0_8x16b, y1_1_8x16b);
            y1_1_16x8b = _mm_srli_si128(y1_0_16x8b, 8);

            _mm_storel_epi64((__m128i *)pu1_dst, y1_0_16x8b);
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), y1_1_16x8b);

            ht -= 2;
            pu1_src1 += src_strd1 << 1;
            pu1_src2 += src_strd2 << 1;
            pu1_dst += dst_strd << 1;
        }
        while(ht > 0);
    }
    else // wd == 8
    {
        __m128i y1_0L_8x16b, y1_0H_8x16b, y1_1L_8x16b, y1_1H_8x16b;
        __m128i y2_0L_8x16b, y2_0H_8x16b, y2_1L_8x16b, y2_1H_8x16b;

        __m128i zero_16x8b;
        zero_16x8b = _mm_set1_epi8(0);

        /* Two 16-byte rows (8 UV pairs) per iteration; each row is split   */
        /* into a low (L) and high (H) 8-lane half for 16-bit arithmetic.   */
        do
        {
            y1_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src1);
            y1_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1));
            y2_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src2);
            y2_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2));

            /* Widen u8 -> s16: cvtepu8 for the low half, unpackhi with     */
            /* zero for the high half.                                      */
            y1_0L_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b);
            y1_0H_8x16b = _mm_unpackhi_epi8(y1_0_16x8b, zero_16x8b);
            y1_1L_8x16b = _mm_cvtepu8_epi16(y1_1_16x8b);
            y1_1H_8x16b = _mm_unpackhi_epi8(y1_1_16x8b, zero_16x8b);

            y2_0L_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b);
            y2_0H_8x16b = _mm_unpackhi_epi8(y2_0_16x8b, zero_16x8b);
            y2_1L_8x16b = _mm_cvtepu8_epi16(y2_1_16x8b);
            y2_1H_8x16b = _mm_unpackhi_epi8(y2_1_16x8b, zero_16x8b);

            /* src1*wt1 and src2*wt2 per half-row */
            y1_0L_8x16b = _mm_mullo_epi16(y1_0L_8x16b, wt1_8x16b);
            y1_0H_8x16b = _mm_mullo_epi16(y1_0H_8x16b, wt1_8x16b);
            y1_1L_8x16b = _mm_mullo_epi16(y1_1L_8x16b, wt1_8x16b);
            y1_1H_8x16b = _mm_mullo_epi16(y1_1H_8x16b, wt1_8x16b);

            y2_0L_8x16b = _mm_mullo_epi16(y2_0L_8x16b, wt2_8x16b);
            y2_0H_8x16b = _mm_mullo_epi16(y2_0H_8x16b, wt2_8x16b);
            y2_1L_8x16b = _mm_mullo_epi16(y2_1L_8x16b, wt2_8x16b);
            y2_1H_8x16b = _mm_mullo_epi16(y2_1H_8x16b, wt2_8x16b);

            /* Sum the two weighted predictions (saturating). */
            y1_0L_8x16b = _mm_adds_epi16(y1_0L_8x16b, y2_0L_8x16b);
            y1_0H_8x16b = _mm_adds_epi16(y1_0H_8x16b, y2_0H_8x16b);
            y1_1L_8x16b = _mm_adds_epi16(y1_1L_8x16b, y2_1L_8x16b);
            y1_1H_8x16b = _mm_adds_epi16(y1_1H_8x16b, y2_1H_8x16b);

            /* + round, >> (log_wd + 1), + offset */
            y1_0L_8x16b = _mm_adds_epi16(round_8x16b, y1_0L_8x16b);
            y1_0H_8x16b = _mm_adds_epi16(round_8x16b, y1_0H_8x16b);
            y1_1L_8x16b = _mm_adds_epi16(round_8x16b, y1_1L_8x16b);
            y1_1H_8x16b = _mm_adds_epi16(round_8x16b, y1_1H_8x16b);

            y1_0L_8x16b = _mm_srai_epi16(y1_0L_8x16b, shft);
            y1_0H_8x16b = _mm_srai_epi16(y1_0H_8x16b, shft);
            y1_1L_8x16b = _mm_srai_epi16(y1_1L_8x16b, shft);
            y1_1H_8x16b = _mm_srai_epi16(y1_1H_8x16b, shft);

            y1_0L_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0L_8x16b);
            y1_0H_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0H_8x16b);
            y1_1L_8x16b = _mm_adds_epi16(ofst_8x16b, y1_1L_8x16b);
            y1_1H_8x16b = _mm_adds_epi16(ofst_8x16b, y1_1H_8x16b);

            /* Re-join the halves as saturated u8 rows and store. */
            y1_0_16x8b = _mm_packus_epi16(y1_0L_8x16b, y1_0H_8x16b);
            y1_1_16x8b = _mm_packus_epi16(y1_1L_8x16b, y1_1H_8x16b);

            _mm_storeu_si128((__m128i *)pu1_dst, y1_0_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), y1_1_16x8b);

            ht -= 2;
            pu1_src1 += src_strd1 << 1;
            pu1_src2 += src_strd2 << 1;
            pu1_dst += dst_strd << 1;
        }
        while(ht > 0);
    }
}