/src/libavc/common/x86/ih264_weighted_pred_avx2.c
Line | Count | Source |
1 | | /****************************************************************************** |
2 | | * |
3 | | * Copyright (C) 2015 The Android Open Source Project |
4 | | * |
5 | | * Licensed under the Apache License, Version 2.0 (the "License"); |
6 | | * you may not use this file except in compliance with the License. |
7 | | * You may obtain a copy of the License at: |
8 | | * |
9 | | * http://www.apache.org/licenses/LICENSE-2.0 |
10 | | * |
11 | | * Unless required by applicable law or agreed to in writing, software |
12 | | * distributed under the License is distributed on an "AS IS" BASIS, |
13 | | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
14 | | * See the License for the specific language governing permissions and |
15 | | * limitations under the License. |
16 | | * |
17 | | ***************************************************************************** |
18 | | * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore |
19 | | */ |
20 | | /*****************************************************************************/ |
21 | | /*****************************************************************************/ |
22 | | /* File Includes */ |
23 | | /*****************************************************************************/ |
24 | | |
25 | | #include <immintrin.h> |
26 | | #include "ih264_typedefs.h" |
27 | | #include "ih264_macros.h" |
28 | | #include "ih264_platform_macros.h" |
29 | | #include "ih264_weighted_pred.h" |
30 | | #include <stdint.h> |
31 | | #include <string.h> |
32 | | |
33 | | #include <stdio.h> |
34 | | |
35 | | |
36 | | /*****************************************************************************/ |
37 | | /* */ |
38 | | /* Function Name : ih264_weighted_bi_pred_luma_avx2 */ |
39 | | /* */ |
40 | | /* Description : This function performs the weighted biprediction as */ |
41 | | /* described in sec 8.4.2.3.2 titled "Weighted sample */ |
42 | | /* prediction process" for luma. The function gets two */ |
43 | | /* ht x wd blocks, weights them, adds them, rounds off the */ |
44 | | /* sum, offsets it, saturates it to unsigned 8-bit and */ |
45 | | /* stores it in the destination block. (ht,wd) can be */ |
46 | | /* (4,4), (8,4), (4,8), (8,8), (16,8), (8,16) or (16,16). */ |
47 | | /* */ |
48 | | /* Inputs : pu1_src1 - Pointer to source 1 */ |
49 | | /* pu1_src2 - Pointer to source 2 */ |
50 | | /* pu1_dst - Pointer to destination */ |
51 | | /* src_strd1 - stride for source 1 */ |
52 | | /* src_strd2 - stride for source 2 */ |
53 | | /* dst_strd2 - stride for destination */ |
54 | | /* log_wd - number of bits to be rounded off */ |
55 | | /* wt1 - weight value for source 1 */ |
56 | | /* wt2 - weight value for source 2 */ |
57 | | /* ofst1 - offset value for source 1 */ |
58 | | /* ofst2 - offset value for source 2 */ |
59 | | /* ht - height of the block */ |
60 | | /* wd - width of the block */ |
61 | | /* */ |
62 | | /* Issues : None */ |
63 | | /* */ |
64 | | /* Revision History: */ |
65 | | /* */ |
66 | | /* DD MM YYYY Author(s) Changes */ |
67 | | /* 04 02 2015 Kaushik Initial Version */ |
68 | | /* Senthoor */ |
69 | | /* 15 09 2020 Priyanka Bose AVX2 Intel Intrinsics Support */ |
70 | | /*****************************************************************************/ |
71 | | void ih264_weighted_bi_pred_luma_avx2(UWORD8 *pu1_src1, |
72 | | UWORD8 *pu1_src2, |
73 | | UWORD8 *pu1_dst, |
74 | | WORD32 src_strd1, |
75 | | WORD32 src_strd2, |
76 | | WORD32 dst_strd, |
77 | | WORD32 log_wd, |
78 | | WORD32 wt1, |
79 | | WORD32 wt2, |
80 | | WORD32 ofst1, |
81 | | WORD32 ofst2, |
82 | | WORD32 ht, |
83 | | WORD32 wd) |
84 | 1.85k | { |
85 | | |
86 | 1.85k | __m256i wt1_8x32b, wt2_8x32b; |
87 | 1.85k | __m256i ofst_8x32b, round_8x32b; |
88 | 1.85k | __m256i zero; |
89 | 1.85k | zero = _mm256_set1_epi8(0); |
90 | | |
91 | 1.85k | WORD32 ofst; |
92 | 1.85k | WORD32 round_val, shft; |
93 | | |
94 | 1.85k | wt1 = (WORD16)(wt1 & 0xffff); |
95 | 1.85k | wt2 = (WORD16)(wt2 & 0xffff); |
96 | 1.85k | round_val = 1 << log_wd; |
97 | 1.85k | shft = log_wd + 1; |
98 | 1.85k | ofst1 = (WORD8)(ofst1 & 0xff); |
99 | 1.85k | ofst2 = (WORD8)(ofst2 & 0xff); |
100 | 1.85k | ofst = (ofst1 + ofst2 + 1) >> 1; |
101 | | |
102 | 1.85k | wt1_8x32b = _mm256_set1_epi16(wt1); |
103 | 1.85k | wt2_8x32b = _mm256_set1_epi16(wt2); |
104 | 1.85k | round_8x32b = _mm256_set1_epi16(round_val); |
105 | 1.85k | ofst_8x32b = _mm256_set1_epi16(ofst); |
106 | | |
107 | | |
108 | 1.85k | if(wd == 4) |
109 | 0 | { |
110 | 0 | __m128i y1_2_16x8b, y1_3_16x8b; |
111 | 0 | __m128i y2_2_16x8b, y2_3_16x8b; |
112 | |
|
113 | 0 | __m256i y1_02_32x8b,y1_13_32x8b,y2_02_32x8b,y2_13_32x8b,y1_0_32x8b,y2_0_32x8b,y1_0_8x32b,y2_1_8x32b,y2_0_8x32b; |
114 | 0 | __m128i y1_0_16x8b_128,y2_0_16x8b_128,y1_1_16x8b_128,y1_2_16x8b_128,y1_3_16x8b_128; |
115 | |
|
116 | 0 | do |
117 | 0 | { |
118 | 0 | y1_02_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_src1 + (src_strd1 << 1)), (__m128i *)(pu1_src1)); |
119 | 0 | y1_13_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_src1 + src_strd1 * 3), (__m128i *)(pu1_src1 + src_strd1)); |
120 | |
|
121 | 0 | y2_02_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_src2 + (src_strd2 << 1)), (__m128i *)(pu1_src2)); |
122 | 0 | y2_13_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_src2 + src_strd2 * 3), (__m128i *)(pu1_src2 + src_strd2)); |
123 | |
|
124 | 0 | y1_02_32x8b = _mm256_unpacklo_epi64(y1_02_32x8b, zero); |
125 | 0 | y1_13_32x8b = _mm256_unpacklo_epi64(y1_13_32x8b, zero); |
126 | 0 | y2_02_32x8b = _mm256_unpacklo_epi64(y2_02_32x8b, zero); |
127 | 0 | y2_13_32x8b = _mm256_unpacklo_epi64(y2_13_32x8b, zero); |
128 | |
|
129 | 0 | y1_0_32x8b = _mm256_unpacklo_epi32(y1_02_32x8b, y1_13_32x8b); |
130 | 0 | y2_0_32x8b = _mm256_unpacklo_epi32(y2_02_32x8b, y2_13_32x8b); |
131 | 0 | y1_0_16x8b_128 = _mm256_castsi256_si128(_mm256_permute4x64_epi64(y1_0_32x8b, 0xD8)); |
132 | 0 | y2_0_16x8b_128 = _mm256_castsi256_si128(_mm256_permute4x64_epi64(y2_0_32x8b, 0xD8)); |
133 | |
|
134 | 0 | y1_0_8x32b = _mm256_cvtepu8_epi16(y1_0_16x8b_128); // 8 to 16 |
135 | 0 | y2_0_8x32b = _mm256_cvtepu8_epi16(y2_0_16x8b_128); |
136 | |
|
137 | 0 | y1_0_8x32b = _mm256_mullo_epi16(y1_0_8x32b, wt1_8x32b); |
138 | 0 | y2_0_8x32b = _mm256_mullo_epi16(y2_0_8x32b, wt2_8x32b); |
139 | |
|
140 | 0 | y1_0_8x32b = _mm256_adds_epi16(y1_0_8x32b, y2_0_8x32b); |
141 | |
|
142 | 0 | y1_0_8x32b = _mm256_srai_epi16(y1_0_8x32b, shft); |
143 | |
|
144 | 0 | y1_0_8x32b = _mm256_adds_epi16(ofst_8x32b, y1_0_8x32b); |
145 | |
|
146 | 0 | y1_0_16x8b_128 = _mm256_castsi256_si128(_mm256_packus_epi16(y1_0_8x32b, y1_0_8x32b)); |
147 | 0 | y1_2_16x8b_128 = _mm_srli_si128(y1_0_16x8b_128, 4); |
148 | 0 | y1_1_16x8b_128 = _mm_srli_si128(y1_0_16x8b_128, 8); |
149 | 0 | y1_3_16x8b_128 = _mm_srli_si128(y1_0_16x8b_128, 12); |
150 | |
|
151 | 0 | *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(y1_0_16x8b_128); |
152 | 0 | *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(y1_1_16x8b_128); |
153 | 0 | *((WORD32 *)(pu1_dst + (dst_strd << 1))) = _mm_cvtsi128_si32(y1_2_16x8b_128); |
154 | 0 | *((WORD32 *)(pu1_dst + dst_strd * 3)) = _mm_cvtsi128_si32(y1_3_16x8b_128); |
155 | |
|
156 | 0 | ht -= 4; |
157 | 0 | pu1_src1 += src_strd1 << 2; |
158 | 0 | pu1_src2 += src_strd2 << 2; |
159 | 0 | pu1_dst += dst_strd << 2; |
160 | 0 | } |
161 | 0 | while(ht > 0); |
162 | 0 | } |
163 | 1.85k | else if(wd == 8) |
164 | 184 | { |
165 | 184 | __m128i y1_0_16x8b_128,y2_0_16x8b_128,y1_2_16x8b_128,y1_1_16x8b_128,y1_3_16x8b_128; |
166 | 184 | __m256i y1_02_32x8b,y1_13_32x8b,y2_02_32x8b,y2_13_32x8b,y1_0_32x8b,y2_0_32x8b,y1_0_8x32b; |
167 | 184 | __m256i y1_1_8x32b,y2_0_8x32b,y2_1_8x32b; |
168 | | |
169 | 184 | do |
170 | 736 | { |
171 | | |
172 | 736 | y1_02_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_src1 + (src_strd1 << 1)), (__m128i *)(pu1_src1)); |
173 | 736 | y1_13_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_src1 + src_strd1 * 3), (__m128i *)(pu1_src1 + src_strd1)); |
174 | | |
175 | 736 | y2_02_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_src2 + (src_strd2 << 1)), (__m128i *)(pu1_src2)); |
176 | 736 | y2_13_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_src2 + src_strd2 * 3), (__m128i *)(pu1_src2 + src_strd2)); |
177 | | |
178 | 736 | y1_02_32x8b = _mm256_unpacklo_epi64(y1_02_32x8b, zero); |
179 | 736 | y1_13_32x8b = _mm256_unpacklo_epi64(y1_13_32x8b, zero); |
180 | 736 | y2_02_32x8b = _mm256_unpacklo_epi64(y2_02_32x8b, zero); |
181 | 736 | y2_13_32x8b = _mm256_unpacklo_epi64(y2_13_32x8b, zero); |
182 | | |
183 | 736 | y1_0_32x8b = _mm256_unpacklo_epi64(y1_02_32x8b, y1_13_32x8b); |
184 | 736 | y2_0_32x8b = _mm256_unpacklo_epi64(y2_02_32x8b, y2_13_32x8b); |
185 | | |
186 | 736 | y1_0_8x32b = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(y1_0_32x8b)); |
187 | 736 | y1_0_16x8b_128 = _mm256_castsi256_si128(_mm256_permute2x128_si256(y1_0_32x8b,y1_0_32x8b,0x1)); |
188 | 736 | y1_1_8x32b = _mm256_cvtepu8_epi16(y1_0_16x8b_128); |
189 | | |
190 | 736 | y2_0_8x32b = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(y2_0_32x8b)); |
191 | 736 | y2_0_16x8b_128 = _mm256_castsi256_si128(_mm256_permute2x128_si256(y2_0_32x8b,y2_0_32x8b,0x1)); |
192 | 736 | y2_1_8x32b = _mm256_cvtepu8_epi16(y2_0_16x8b_128); |
193 | | |
194 | | |
195 | 736 | y1_0_8x32b = _mm256_mullo_epi16(y1_0_8x32b, wt1_8x32b); |
196 | 736 | y2_0_8x32b = _mm256_mullo_epi16(y2_0_8x32b, wt2_8x32b); |
197 | 736 | y1_1_8x32b = _mm256_mullo_epi16(y1_1_8x32b, wt1_8x32b); |
198 | 736 | y2_1_8x32b = _mm256_mullo_epi16(y2_1_8x32b, wt2_8x32b); |
199 | | |
200 | 736 | y1_0_8x32b = _mm256_adds_epi16(y1_0_8x32b, y2_0_8x32b); |
201 | 736 | y1_1_8x32b = _mm256_adds_epi16(y1_1_8x32b, y2_1_8x32b); |
202 | | |
203 | 736 | y1_0_8x32b = _mm256_srai_epi16(y1_0_8x32b, shft); |
204 | 736 | y1_1_8x32b = _mm256_srai_epi16(y1_1_8x32b, shft); |
205 | | |
206 | 736 | y1_0_8x32b = _mm256_adds_epi16(ofst_8x32b, y1_0_8x32b); |
207 | 736 | y1_1_8x32b = _mm256_adds_epi16(ofst_8x32b, y1_1_8x32b); |
208 | | |
209 | 736 | y1_0_32x8b = _mm256_packus_epi16(y1_0_8x32b, y1_1_8x32b); |
210 | 736 | y1_0_16x8b_128 = _mm256_castsi256_si128(y1_0_32x8b); |
211 | 736 | y1_2_16x8b_128 = _mm256_castsi256_si128(_mm256_srli_si256(y1_0_32x8b, 8)); |
212 | | |
213 | 736 | y1_0_32x8b = _mm256_permute2x128_si256(y1_0_32x8b,y1_0_32x8b,1); |
214 | 736 | y1_1_16x8b_128 = _mm256_castsi256_si128(y1_0_32x8b); |
215 | 736 | y1_3_16x8b_128 = _mm256_castsi256_si128(_mm256_srli_si256(y1_0_32x8b, 8)); |
216 | | |
217 | 736 | _mm_storel_epi64((__m128i *)pu1_dst, y1_0_16x8b_128); |
218 | 736 | _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), y1_1_16x8b_128); |
219 | 736 | _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd << 1)), y1_2_16x8b_128); |
220 | 736 | _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd * 3), y1_3_16x8b_128); |
221 | | |
222 | 736 | ht -= 4; |
223 | 736 | pu1_src1 += src_strd1 << 2; |
224 | 736 | pu1_src2 += src_strd2 << 2; |
225 | 736 | pu1_dst += dst_strd << 2; |
226 | | |
227 | 736 | } |
228 | 736 | while(ht > 0); |
229 | 184 | } |
230 | 1.66k | else // wd == 16 |
231 | 1.66k | { |
232 | 1.66k | __m256i y1_0L_8x32b, y1_0H_8x32b, y1_1L_8x32b, y1_1H_8x32b; |
233 | 1.66k | __m256i y2_0L_8x32b, y2_0H_8x32b, y2_1L_8x32b, y2_1H_8x32b; |
234 | | |
235 | 1.66k | __m256i zero_32x8b,y1_0_32x8b,y2_0_32x8b; |
236 | 1.66k | zero_32x8b = _mm256_set1_epi8(0); |
237 | | |
238 | 1.66k | do |
239 | 10.2k | { |
240 | | |
241 | 10.2k | y1_0_32x8b = _mm256_loadu_si256((__m256i *)pu1_src1); |
242 | 10.2k | y2_0_32x8b = _mm256_loadu_si256((__m256i *)pu1_src2); |
243 | | |
244 | 10.2k | y1_0L_8x32b = _mm256_unpacklo_epi8(y1_0_32x8b, zero_32x8b); |
245 | 10.2k | y1_0H_8x32b = _mm256_unpackhi_epi8(y1_0_32x8b, zero_32x8b); |
246 | | |
247 | 10.2k | y2_0L_8x32b = _mm256_unpacklo_epi8(y2_0_32x8b,zero_32x8b); |
248 | 10.2k | y2_0H_8x32b = _mm256_unpackhi_epi8(y2_0_32x8b, zero_32x8b); |
249 | | |
250 | 10.2k | y1_0L_8x32b = _mm256_mullo_epi16(y1_0L_8x32b, wt1_8x32b); |
251 | 10.2k | y1_0H_8x32b = _mm256_mullo_epi16(y1_0H_8x32b, wt1_8x32b); |
252 | | |
253 | 10.2k | y2_0L_8x32b = _mm256_mullo_epi16(y2_0L_8x32b, wt2_8x32b); |
254 | 10.2k | y2_0H_8x32b = _mm256_mullo_epi16(y2_0H_8x32b, wt2_8x32b); |
255 | | |
256 | 10.2k | y1_0L_8x32b = _mm256_adds_epi16(y1_0L_8x32b, y2_0L_8x32b); |
257 | 10.2k | y1_0H_8x32b = _mm256_adds_epi16(y1_0H_8x32b, y2_0H_8x32b); |
258 | | |
259 | 10.2k | y1_0L_8x32b = _mm256_adds_epi16(round_8x32b, y1_0L_8x32b); |
260 | 10.2k | y1_0H_8x32b = _mm256_adds_epi16(round_8x32b, y1_0H_8x32b); |
261 | | |
262 | 10.2k | y1_0L_8x32b = _mm256_srai_epi16(y1_0L_8x32b, shft); |
263 | 10.2k | y1_0H_8x32b = _mm256_srai_epi16(y1_0H_8x32b, shft); |
264 | | |
265 | 10.2k | y1_0L_8x32b = _mm256_adds_epi16(ofst_8x32b, y1_0L_8x32b); |
266 | 10.2k | y1_0H_8x32b = _mm256_adds_epi16(ofst_8x32b, y1_0H_8x32b); |
267 | | |
268 | 10.2k | y1_0_32x8b = _mm256_packus_epi16(y1_0L_8x32b, y1_0H_8x32b); |
269 | | |
270 | 10.2k | _mm256_storeu_si256((__m256i *)pu1_dst, y1_0_32x8b); |
271 | | |
272 | 10.2k | ht -= 2; |
273 | 10.2k | pu1_src1 += src_strd1 << 1; |
274 | 10.2k | pu1_src2 += src_strd2 << 1; |
275 | 10.2k | pu1_dst += dst_strd << 1; |
276 | 10.2k | } |
277 | 10.2k | while(ht > 0); |
278 | 1.66k | } |
279 | 1.85k | } |
280 | | |
281 | | |
282 | | /*****************************************************************************/ |
283 | | /* */ |
284 | | /* Function Name : ih264_weighted_bi_pred_chroma_avx2 */ |
285 | | /* */ |
286 | | /* Description : This function performs the weighted biprediction as */ |
287 | | /* described in sec 8.4.2.3.2 titled "Weighted sample */ |
288 | | /* prediction process" for chroma. The function gets two */ |
289 | | /* ht x wd blocks, weights them, adds them, rounds off the */ |
290 | | /* sum, offsets it, saturates it to unsigned 8-bit and */ |
291 | | /* stores it in the destination block. (ht,wd) can be */ |
292 | | /* (2,2), (4,2), (2,4), (4,4), (8,4), (4,8) or (8,8). */ |
293 | | /* */ |
294 | | /* Inputs : pu1_src1 - Pointer to source 1 */ |
295 | | /* pu1_src2 - Pointer to source 2 */ |
296 | | /* pu1_dst - Pointer to destination */ |
297 | | /* src_strd1 - stride for source 1 */ |
298 | | /* src_strd2 - stride for source 2 */ |
299 | | /* dst_strd2 - stride for destination */ |
300 | | /* log_wd - number of bits to be rounded off */ |
301 | | /* wt1 - weight values for u and v in source 1 */ |
302 | | /* wt2 - weight values for u and v in source 2 */ |
303 | | /* ofst1 - offset value for u and v in source 1 */ |
304 | | /* ofst2 - offset value for u and v in source 2 */ |
305 | | /* ht - height of the block */ |
306 | | /* wd - width of the block */ |
307 | | /* */ |
308 | | /* Issues : None */ |
309 | | /* */ |
310 | | /* Revision History: */ |
311 | | /* */ |
312 | | /* DD MM YYYY Author(s) Changes */ |
313 | | /* 04 02 2015 Kaushik Initial Version */ |
314 | | /* Senthoor */ |
315 | | /* 15 09 2020 Priyanka Bose AVX2 Intel Intrinsics Support */ |
316 | | /*****************************************************************************/ |
317 | | void ih264_weighted_bi_pred_chroma_avx2(UWORD8 *pu1_src1, |
318 | | UWORD8 *pu1_src2, |
319 | | UWORD8 *pu1_dst, |
320 | | WORD32 src_strd1, |
321 | | WORD32 src_strd2, |
322 | | WORD32 dst_strd, |
323 | | WORD32 log_wd, |
324 | | WORD32 wt1, |
325 | | WORD32 wt2, |
326 | | WORD32 ofst1, |
327 | | WORD32 ofst2, |
328 | | WORD32 ht, |
329 | | WORD32 wd) |
330 | 1.85k | { |
331 | | |
332 | 1.85k | __m128i y1_0_16x8b, y1_1_16x8b; |
333 | 1.85k | __m128i y2_0_16x8b, y2_1_16x8b; |
334 | | |
335 | 1.85k | __m128i wt1_8x16b, wt2_8x16b; |
336 | 1.85k | __m128i ofst_8x16b, round_8x16b; |
337 | | |
338 | 1.85k | WORD32 ofst1_u, ofst2_u, ofst_u; |
339 | 1.85k | WORD32 ofst1_v, ofst2_v, ofst_v; |
340 | 1.85k | WORD32 round_val, shft, ofst_val,ofst_val_256; |
341 | | |
342 | 1.85k | round_val = 1 << log_wd; |
343 | 1.85k | shft = log_wd + 1; |
344 | | |
345 | 1.85k | ofst1_u = (WORD8)(ofst1 & 0xff); |
346 | 1.85k | ofst1_v = (WORD8)(ofst1 >> 8); |
347 | 1.85k | ofst2_u = (WORD8)(ofst2 & 0xff); |
348 | 1.85k | ofst2_v = (WORD8)(ofst2 >> 8); |
349 | | |
350 | 1.85k | wt1_8x16b = _mm_set1_epi32(wt1); |
351 | 1.85k | wt2_8x16b = _mm_set1_epi32(wt2); |
352 | | |
353 | 1.85k | ofst_u = (ofst1_u + ofst2_u + 1) >> 1; |
354 | 1.85k | ofst_v = (ofst1_v + ofst2_v + 1) >> 1; |
355 | 1.85k | ofst_val = (ofst_u & 0xffff) | (ofst_v << 16); |
356 | 1.85k | ofst_val_256 = (ofst_u & 0xffff) | (ofst_v << 16); |
357 | | |
358 | 1.85k | round_8x16b = _mm_set1_epi16(round_val); |
359 | 1.85k | ofst_8x16b = _mm_set1_epi32(ofst_val); |
360 | | |
361 | 1.85k | if(wd == 2) |
362 | 0 | { |
363 | 0 | __m128i y1_0_8x16b, y2_0_8x16b; |
364 | |
|
365 | 0 | do |
366 | 0 | { |
367 | 0 | y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1); //Loading 64 bits from diff location |
368 | 0 | y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1)); |
369 | |
|
370 | 0 | y2_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2); |
371 | 0 | y2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2)); |
372 | |
|
373 | 0 | y1_0_16x8b = _mm_unpacklo_epi32(y1_0_16x8b, y1_1_16x8b); |
374 | 0 | y2_0_16x8b = _mm_unpacklo_epi32(y2_0_16x8b, y2_1_16x8b); |
375 | |
|
376 | 0 | y1_0_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b); |
377 | 0 | y2_0_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b); |
378 | |
|
379 | 0 | y1_0_8x16b = _mm_mullo_epi16(y1_0_8x16b, wt1_8x16b); |
380 | 0 | y2_0_8x16b = _mm_mullo_epi16(y2_0_8x16b, wt2_8x16b); |
381 | |
|
382 | 0 | y1_0_8x16b = _mm_adds_epi16(y1_0_8x16b, y2_0_8x16b); |
383 | 0 | y1_0_8x16b = _mm_adds_epi16(round_8x16b, y1_0_8x16b); |
384 | |
|
385 | 0 | y1_0_8x16b = _mm_srai_epi16(y1_0_8x16b, shft); |
386 | 0 | y1_0_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0_8x16b); |
387 | |
|
388 | 0 | y1_0_16x8b = _mm_packus_epi16(y1_0_8x16b, y1_0_8x16b); |
389 | 0 | y1_1_16x8b = _mm_srli_si128(y1_0_16x8b, 4); |
390 | |
|
391 | 0 | *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(y1_0_16x8b); |
392 | 0 | *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(y1_1_16x8b); |
393 | |
|
394 | 0 | ht -= 2; |
395 | 0 | pu1_src1 += src_strd1 << 1; |
396 | 0 | pu1_src2 += src_strd2 << 1; |
397 | 0 | pu1_dst += dst_strd << 1; |
398 | 0 | } |
399 | 0 | while(ht > 0); |
400 | 0 | } |
401 | 1.85k | else if(wd == 4) |
402 | 184 | { |
403 | 184 | __m128i y1_0_8x16b, y1_1_8x16b; |
404 | 184 | __m128i y2_0_8x16b, y2_1_8x16b; |
405 | | |
406 | 184 | do |
407 | 736 | { |
408 | 736 | y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1); //Loading 64 bits from diff location |
409 | 736 | y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1)); |
410 | | |
411 | 736 | y2_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2); |
412 | 736 | y2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2)); |
413 | | |
414 | 736 | y1_0_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b); |
415 | 736 | y1_1_8x16b = _mm_cvtepu8_epi16(y1_1_16x8b); |
416 | | |
417 | 736 | y2_0_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b); |
418 | 736 | y2_1_8x16b = _mm_cvtepu8_epi16(y2_1_16x8b); |
419 | | |
420 | 736 | y1_0_8x16b = _mm_mullo_epi16(y1_0_8x16b, wt1_8x16b); |
421 | 736 | y2_0_8x16b = _mm_mullo_epi16(y2_0_8x16b, wt2_8x16b); |
422 | 736 | y1_1_8x16b = _mm_mullo_epi16(y1_1_8x16b, wt1_8x16b); |
423 | 736 | y2_1_8x16b = _mm_mullo_epi16(y2_1_8x16b, wt2_8x16b); |
424 | | |
425 | 736 | y1_0_8x16b = _mm_adds_epi16(y1_0_8x16b, y2_0_8x16b); |
426 | 736 | y1_1_8x16b = _mm_adds_epi16(y1_1_8x16b, y2_1_8x16b); |
427 | | |
428 | 736 | y1_0_8x16b = _mm_adds_epi16(round_8x16b, y1_0_8x16b); |
429 | 736 | y1_1_8x16b = _mm_adds_epi16(round_8x16b, y1_1_8x16b); |
430 | | |
431 | 736 | y1_0_8x16b = _mm_srai_epi16(y1_0_8x16b, shft); |
432 | 736 | y1_1_8x16b = _mm_srai_epi16(y1_1_8x16b, shft); |
433 | | |
434 | 736 | y1_0_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0_8x16b); |
435 | 736 | y1_1_8x16b = _mm_adds_epi16(ofst_8x16b, y1_1_8x16b); |
436 | | |
437 | 736 | y1_0_16x8b = _mm_packus_epi16(y1_0_8x16b, y1_1_8x16b); |
438 | 736 | y1_1_16x8b = _mm_srli_si128(y1_0_16x8b, 8); |
439 | | |
440 | 736 | _mm_storel_epi64((__m128i *)pu1_dst, y1_0_16x8b); |
441 | 736 | _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), y1_1_16x8b); |
442 | | |
443 | 736 | ht -= 2; |
444 | 736 | pu1_src1 += src_strd1 << 1; |
445 | 736 | pu1_src2 += src_strd2 << 1; |
446 | 736 | pu1_dst += dst_strd << 1; |
447 | 736 | } |
448 | 736 | while(ht > 0); |
449 | 184 | } |
450 | 1.66k | else // wd == 8 |
451 | 1.66k | { |
452 | 1.66k | __m256i y1_0L_8x32b, y1_0H_8x32b, y1_1L_8x32b, y1_1H_8x32b; |
453 | 1.66k | __m256i y2_0L_8x32b, y2_0H_8x32b, y2_1L_8x32b, y2_1H_8x32b; |
454 | 1.66k | __m256i y1_0_32x8b,y2_0_32x8b,ofst_8x32b,round_8x32b; |
455 | 1.66k | __m256i wt1_8x32b, wt2_8x32b; |
456 | 1.66k | __m256i zero_32x8b; |
457 | | |
458 | 1.66k | wt1_8x32b = _mm256_set1_epi16(wt1); |
459 | 1.66k | wt2_8x32b = _mm256_set1_epi16(wt2); |
460 | 1.66k | round_8x32b = _mm256_set1_epi16(round_val); |
461 | 1.66k | ofst_8x32b = _mm256_set1_epi32(ofst_val_256); |
462 | 1.66k | zero_32x8b = _mm256_set1_epi8(0); |
463 | | |
464 | 1.66k | do |
465 | 5.12k | { |
466 | 5.12k | y1_0_32x8b = _mm256_loadu_si256((__m256i *)pu1_src1); |
467 | 5.12k | y2_0_32x8b = _mm256_loadu_si256((__m256i *)pu1_src2); |
468 | 5.12k | y1_0L_8x32b = _mm256_unpacklo_epi8(y1_0_32x8b, zero_32x8b); |
469 | 5.12k | y1_0H_8x32b = _mm256_unpackhi_epi8(y1_0_32x8b, zero_32x8b); |
470 | 5.12k | y2_0L_8x32b = _mm256_unpacklo_epi8(y2_0_32x8b, zero_32x8b); |
471 | 5.12k | y2_0H_8x32b = _mm256_unpackhi_epi8(y2_0_32x8b, zero_32x8b); |
472 | 5.12k | y1_0L_8x32b = _mm256_mullo_epi16(y1_0L_8x32b, wt1_8x32b); |
473 | 5.12k | y1_0H_8x32b = _mm256_mullo_epi16(y1_0H_8x32b, wt1_8x32b); |
474 | | |
475 | 5.12k | y2_0L_8x32b = _mm256_mullo_epi16(y2_0L_8x32b, wt2_8x32b); |
476 | 5.12k | y2_0H_8x32b = _mm256_mullo_epi16(y2_0H_8x32b, wt2_8x32b); |
477 | | |
478 | 5.12k | y1_0L_8x32b = _mm256_adds_epi16(y1_0L_8x32b, y2_0L_8x32b); |
479 | 5.12k | y1_0H_8x32b = _mm256_adds_epi16(y1_0H_8x32b, y2_0H_8x32b); |
480 | | |
481 | 5.12k | y1_0L_8x32b = _mm256_adds_epi16(round_8x32b, y1_0L_8x32b); |
482 | 5.12k | y1_0H_8x32b = _mm256_adds_epi16(round_8x32b, y1_0H_8x32b); |
483 | | |
484 | 5.12k | y1_0L_8x32b = _mm256_srai_epi16(y1_0L_8x32b, shft); |
485 | 5.12k | y1_0H_8x32b = _mm256_srai_epi16(y1_0H_8x32b, shft); |
486 | | |
487 | 5.12k | y1_0L_8x32b = _mm256_adds_epi16(ofst_8x32b, y1_0L_8x32b); |
488 | 5.12k | y1_0H_8x32b = _mm256_adds_epi16(ofst_8x32b, y1_0H_8x32b); |
489 | | |
490 | | |
491 | 5.12k | y1_0_32x8b = _mm256_packus_epi16(y1_0L_8x32b, y1_0H_8x32b); |
492 | 5.12k | _mm256_storeu_si256((__m256i *)pu1_dst, y1_0_32x8b); |
493 | | |
494 | 5.12k | ht -= 2; |
495 | 5.12k | pu1_src1 += src_strd1 << 1; |
496 | 5.12k | pu1_src2 += src_strd2 << 1; |
497 | 5.12k | pu1_dst += dst_strd << 1; |
498 | 5.12k | } |
499 | 5.12k | while(ht > 0); |
500 | 1.66k | } |
501 | 1.85k | } |