/src/aom/av1/common/x86/jnt_convolve_avx2.c
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) 2018, Alliance for Open Media. All rights reserved. |
3 | | * |
4 | | * This source code is subject to the terms of the BSD 2 Clause License and |
5 | | * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
6 | | * was not distributed with this source code in the LICENSE file, you can |
7 | | * obtain it at www.aomedia.org/license/software. If the Alliance for Open |
8 | | * Media Patent License 1.0 was not distributed with this source code in the |
9 | | * PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
10 | | */ |
11 | | |
12 | | #include <emmintrin.h> |
13 | | #include <immintrin.h> |
14 | | |
15 | | #include "config/av1_rtcd.h" |
16 | | |
17 | | #include "aom_dsp/aom_dsp_common.h" |
18 | | #include "aom_dsp/aom_filter.h" |
19 | | #include "aom_dsp/x86/convolve_avx2.h" |
20 | | #include "aom_dsp/x86/convolve_common_intrin.h" |
21 | | #include "aom_dsp/x86/convolve_sse4_1.h" |
22 | | #include "aom_dsp/x86/mem_sse2.h" |
23 | | #include "aom_dsp/x86/synonyms_avx2.h" |
24 | | |
25 | | #include "av1/common/convolve.h" |
26 | | |
/* Builds the weight vector used for distance-weighted compound averaging.
 *
 * conv_params->fwd_offset and conv_params->bck_offset are each truncated to
 * int16_t, broadcast across a 256-bit register, and then interleaved with
 * _mm256_unpacklo_epi16, so the returned register holds alternating
 * (fwd_offset, bck_offset) 16-bit pairs.
 *
 * conv_params: only fwd_offset and bck_offset are read here.
 * Returns the interleaved weight register. */
27 | 1.36M | static inline __m256i unpack_weights_avx2(ConvolveParams *conv_params) { |
28 | 1.36M | const int w0 = conv_params->fwd_offset; |
29 | 1.36M | const int w1 = conv_params->bck_offset; |
30 | 1.36M | const __m256i wt0 = _mm256_set1_epi16((int16_t)w0); |
31 | 1.36M | const __m256i wt1 = _mm256_set1_epi16((int16_t)w1); |
32 | 1.36M | const __m256i wt = _mm256_unpacklo_epi16(wt0, wt1); |
33 | 1.36M | return wt; |
34 | 1.36M | } |
35 | | |
/* Loads two independent 16-byte lines into one 256-bit register.
 *
 * Both loads are unaligned (_mm_loadu_si128). The permute control 0x20 places
 * the 16 bytes read from `a` in the lower 128 bits and the 16 bytes read from
 * `b` in the upper 128 bits of the result.
 *
 * a, b: addresses of at least 16 readable bytes each; the const qualifier is
 *       dropped by the casts, but the memory is only read here. */
36 | 4.80M | static inline __m256i load_line2_avx2(const void *a, const void *b) { |
37 | 4.80M | return _mm256_permute2x128_si256( |
38 | 4.80M | _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)a)), |
39 | 4.80M | _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)b)), 0x20); |
40 | 4.80M | } |
41 | | |
/* Horizontal-only pass of the distance-weighted compound convolution for
 * 8-bit input (bd is fixed to 8 below).
 *
 * The block is processed two rows per outer iteration (i) and 8 columns per
 * inner iteration (j). Each filtered value is rounded, shifted left by
 * FILTER_BITS - round_1 and biased by offset_const. Then:
 *   - do_average == 0: the 16-bit intermediates are stored to
 *     conv_params->dst (stride conv_params->dst_stride);
 *   - do_average != 0: they are combined with the values already held in
 *     conv_params->dst through comp_avg() and convolve_rounding(), packed to
 *     8 bits and stored to dst0 (stride dst_stride0).
 *
 * src, src_stride: source pixels; reads begin fo_horiz pixels left of src.
 * dst0, dst_stride0: 8-bit output, written only when do_average is set.
 * w, h: block size. NOTE(review): every iteration also touches row i + 1, so
 *       h is presumably always even -- confirm against the callers.
 * filter_params_x, subpel_x_qn: forwarded to prepare_coeffs_lowbd().
 * conv_params: supplies dst, dst_stride, round_0, round_1, do_average,
 *              use_dist_wtd_comp_avg and the forward/backward weights. */
42 | | void av1_dist_wtd_convolve_x_avx2(const uint8_t *src, int src_stride, |
43 | | uint8_t *dst0, int dst_stride0, int w, int h, |
44 | | const InterpFilterParams *filter_params_x, |
45 | | const int subpel_x_qn, |
46 | 82.0k | ConvolveParams *conv_params) { |
47 | 82.0k | CONV_BUF_TYPE *dst = conv_params->dst; |
48 | 82.0k | int dst_stride = conv_params->dst_stride; |
49 | 82.0k | const int bd = 8; |
50 | 82.0k | int i, j, is_horiz_4tap = 0; |
51 | 82.0k | const int bits = FILTER_BITS - conv_params->round_1; |
52 | 82.0k | const __m256i wt = unpack_weights_avx2(conv_params); |
53 | 82.0k | const int do_average = conv_params->do_average; |
54 | 82.0k | const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; |
// offset = 1.5 * 2^offset_0; it is added to every intermediate before the
// value is stored or averaged.
55 | 82.0k | const int offset_0 = |
56 | 82.0k | bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; |
57 | 82.0k | const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); |
58 | 82.0k | const __m256i offset_const = _mm256_set1_epi16(offset); |
59 | 82.0k | const int rounding_shift = |
60 | 82.0k | 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; |
61 | 82.0k | const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1); |
62 | | |
63 | 82.0k | assert(bits >= 0); |
64 | 82.0k | assert(conv_params->round_0 > 0); |
65 | | |
// The first-stage rounding uses round_0 - 1 rather than round_0.
// NOTE(review): presumably because prepare_coeffs_lowbd() halves the filter
// coefficients (the y variant's left_shift comment says so) -- confirm.
66 | 82.0k | const __m256i round_const = |
67 | 82.0k | _mm256_set1_epi16((1 << (conv_params->round_0 - 1)) >> 1); |
68 | 82.0k | const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0 - 1); |
69 | | |
70 | 82.0k | __m256i filt[4], coeffs[4]; |
71 | | |
72 | 82.0k | filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2); |
73 | 82.0k | filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); |
74 | | |
75 | 82.0k | prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs); |
76 | | |
// The 4-tap path is selected when the lowest 32 bits of
// (coeffs[0] | coeffs[3]) are zero.
77 | | // Condition for checking valid horz_filt taps |
78 | 82.0k | if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs[0], coeffs[3]), 0))) |
79 | 35.6k | is_horiz_4tap = 1; |
80 | | |
81 | | // horz_filt as 4 tap |
82 | 82.0k | if (is_horiz_4tap) { |
83 | 35.6k | const int fo_horiz = 1; |
84 | 35.6k | const uint8_t *const src_ptr = src - fo_horiz; |
85 | 233k | for (i = 0; i < h; i += 2) { |
86 | 198k | const uint8_t *src_data = src_ptr + i * src_stride; |
87 | 198k | CONV_BUF_TYPE *dst_data = dst + i * dst_stride; |
88 | 907k | for (j = 0; j < w; j += 8) { |
// Row i goes to the lower 128 bits, row i + 1 to the upper 128 bits.
89 | 709k | const __m256i data = |
90 | 709k | load_line2_avx2(&src_data[j], &src_data[j + src_stride]); |
91 | | |
92 | 709k | __m256i res = convolve_lowbd_x_4tap(data, coeffs + 1, filt); |
93 | 709k | res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const), round_shift); |
94 | 709k | res = _mm256_slli_epi16(res, bits); |
95 | | |
96 | 709k | const __m256i res_unsigned = _mm256_add_epi16(res, offset_const); |
97 | | |
98 | | // Accumulate values into the destination buffer |
99 | 709k | if (do_average) { |
100 | 210k | const __m256i data_ref_0 = |
101 | 210k | load_line2_avx2(&dst_data[j], &dst_data[j + dst_stride]); |
102 | 210k | const __m256i comp_avg_res = |
103 | 210k | comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); |
104 | | |
105 | 210k | const __m256i round_result = convolve_rounding( |
106 | 210k | &comp_avg_res, &offset_const, &rounding_const, rounding_shift); |
107 | | |
108 | 210k | const __m256i res_8 = _mm256_packus_epi16(round_result, round_result); |
109 | 210k | const __m128i res_0 = _mm256_castsi256_si128(res_8); |
110 | 210k | const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); |
111 | | |
// Wide blocks store 8 output bytes per row; otherwise only 4 bytes per row.
112 | 210k | if (w > 4) { |
113 | 188k | _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); |
114 | 188k | _mm_storel_epi64( |
115 | 188k | (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); |
116 | 188k | } else { |
// NOTE(review): 4-byte store through an int* cast of a uint8_t pointer; this
// depends on the compiler tolerating the aliasing/unaligned access. A
// memcpy-based 32-bit store would avoid that.
117 | 22.5k | *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0); |
118 | 22.5k | *(int *)(&dst0[i * dst_stride0 + j + dst_stride0]) = |
119 | 22.5k | _mm_cvtsi128_si32(res_1); |
120 | 22.5k | } |
121 | 498k | } else { |
// _mm_store_si128 is an aligned store: each destination address must be
// 16-byte aligned, and 8 16-bit values are written per row.
122 | 498k | const __m128i res_0 = _mm256_castsi256_si128(res_unsigned); |
123 | 498k | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); |
124 | | |
125 | 498k | const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1); |
126 | 498k | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), |
127 | 498k | res_1); |
128 | 498k | } |
129 | 709k | } |
130 | 198k | } |
131 | 46.4k | } else { |
// General path: same pipeline as above, but with the full coefficient set and
// all four shuffle masks.
132 | 46.4k | const int fo_horiz = filter_params_x->taps / 2 - 1; |
133 | 46.4k | const uint8_t *const src_ptr = src - fo_horiz; |
134 | | |
135 | 46.4k | filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); |
136 | 46.4k | filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); |
137 | 425k | for (i = 0; i < h; i += 2) { |
138 | 378k | const uint8_t *src_data = src_ptr + i * src_stride; |
139 | 378k | CONV_BUF_TYPE *dst_data = dst + i * dst_stride; |
140 | 2.02M | for (j = 0; j < w; j += 8) { |
141 | 1.64M | const __m256i data = |
142 | 1.64M | load_line2_avx2(&src_data[j], &src_data[j + src_stride]); |
143 | | |
144 | 1.64M | __m256i res = convolve_lowbd_x(data, coeffs, filt); |
145 | | |
146 | 1.64M | res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const), round_shift); |
147 | | |
148 | 1.64M | res = _mm256_slli_epi16(res, bits); |
149 | | |
150 | 1.64M | const __m256i res_unsigned = _mm256_add_epi16(res, offset_const); |
151 | | |
152 | | // Accumulate values into the destination buffer |
153 | 1.64M | if (do_average) { |
154 | 641k | const __m256i data_ref_0 = |
155 | 641k | load_line2_avx2(&dst_data[j], &dst_data[j + dst_stride]); |
156 | 641k | const __m256i comp_avg_res = |
157 | 641k | comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); |
158 | | |
159 | 641k | const __m256i round_result = convolve_rounding( |
160 | 641k | &comp_avg_res, &offset_const, &rounding_const, rounding_shift); |
161 | | |
162 | 641k | const __m256i res_8 = _mm256_packus_epi16(round_result, round_result); |
163 | 641k | const __m128i res_0 = _mm256_castsi256_si128(res_8); |
164 | 641k | const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); |
165 | | |
166 | 641k | if (w > 4) { |
167 | 641k | _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); |
168 | 641k | _mm_storel_epi64( |
169 | 641k | (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); |
// NOTE(review): the "18.4E" counts on the branch below are not plausible hit
// counts; the branch was presumably never taken in this run (narrow blocks
// seem to always use the 4-tap path) -- confirm. The same int* store caveat
// as in the 4-tap path applies here.
170 | 18.4E | } else { |
171 | 18.4E | *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0); |
172 | 18.4E | *(int *)(&dst0[i * dst_stride0 + j + dst_stride0]) = |
173 | 18.4E | _mm_cvtsi128_si32(res_1); |
174 | 18.4E | } |
175 | 1.00M | } else { |
176 | 1.00M | const __m128i res_0 = _mm256_castsi256_si128(res_unsigned); |
177 | 1.00M | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); |
178 | | |
179 | 1.00M | const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1); |
180 | 1.00M | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), |
181 | 1.00M | res_1); |
182 | 1.00M | } |
183 | 1.64M | } |
184 | 378k | } |
185 | 46.4k | } |
186 | 82.0k | } |
187 | | |
/* Vertical-only pass of the distance-weighted compound convolution for 8-bit
 * input (bd is fixed to 8 below).
 *
 * The block is walked in 16-column strips (outer j loop), producing two
 * output rows per step (inner i loop). Consecutive source rows are paired
 * into the lower/upper 128-bit halves of a register and byte-interleaved, so
 * s[] holds a sliding window of row pairs: the unpacklo results cover the
 * first 8 columns of the strip and the unpackhi results the next 8. At the
 * end of each step the window is shifted down by one entry.
 *
 * Each filtered value is biased by offset_const_1, widened to 32 bits,
 * shifted left by left_shift, rounded by round_1, packed back to 16 bits and
 * biased by offset_const_2. Then:
 *   - do_average == 0: the 16-bit intermediates are stored to
 *     conv_params->dst;
 *   - do_average != 0: they are combined with the values already held in
 *     conv_params->dst through comp_avg() and convolve_rounding(), packed to
 *     8 bits and stored to dst0.
 * When fewer than 16 columns remain in a strip (w - j < 16) only the first
 * 8 columns are computed and stored.
 *
 * src, src_stride: source pixels; reads begin fo_vert rows above src.
 * dst0, dst_stride0: 8-bit output, written only when do_average is set.
 * w, h: block size. NOTE(review): every step also produces row i + 1, so h
 *       is presumably always even -- confirm against the callers.
 * filter_params_y, subpel_y_qn: forwarded to prepare_coeffs_lowbd().
 * conv_params: supplies dst, dst_stride, round_0, round_1, do_average,
 *              use_dist_wtd_comp_avg and the forward/backward weights. */
188 | | void av1_dist_wtd_convolve_y_avx2(const uint8_t *src, int src_stride, |
189 | | uint8_t *dst0, int dst_stride0, int w, int h, |
190 | | const InterpFilterParams *filter_params_y, |
191 | | const int subpel_y_qn, |
192 | 61.9k | ConvolveParams *conv_params) { |
193 | 61.9k | CONV_BUF_TYPE *dst = conv_params->dst; |
194 | 61.9k | int dst_stride = conv_params->dst_stride; |
195 | 61.9k | const int bd = 8; |
196 | 61.9k | int i, j, is_vert_4tap = 0; |
197 | | // +1 to compensate for dividing the filter coeffs by 2 |
198 | 61.9k | const int left_shift = FILTER_BITS - conv_params->round_0 + 1; |
199 | 61.9k | const __m256i round_const = |
200 | 61.9k | _mm256_set1_epi32((1 << conv_params->round_1) >> 1); |
201 | 61.9k | const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1); |
202 | 61.9k | const __m256i wt = unpack_weights_avx2(conv_params); |
203 | 61.9k | const int do_average = conv_params->do_average; |
204 | 61.9k | const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; |
// offset = 1.5 * 2^offset_0 is handed to convolve_rounding(); offset_const_1
// and offset_const_2 are the biases added before and after the 32-bit
// shift/round stage.
205 | 61.9k | const int offset_0 = |
206 | 61.9k | bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; |
207 | 61.9k | const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); |
208 | 61.9k | const __m256i offset_const = _mm256_set1_epi16(offset); |
209 | 61.9k | const int offset_1 = (1 << (bd + FILTER_BITS - 2)); |
210 | 61.9k | const __m256i offset_const_1 = _mm256_set1_epi16(offset_1); |
211 | 61.9k | const __m256i offset_const_2 = _mm256_set1_epi16((1 << offset_0)); |
212 | 61.9k | const int rounding_shift = |
213 | 61.9k | 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; |
214 | 61.9k | const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1); |
215 | 61.9k | const __m256i zero = _mm256_setzero_si256(); |
216 | 61.9k | __m256i coeffs[4], s[8]; |
217 | | |
218 | 61.9k | assert((FILTER_BITS - conv_params->round_0) >= 0); |
219 | | |
220 | 61.9k | prepare_coeffs_lowbd(filter_params_y, subpel_y_qn, coeffs); |
221 | | |
// The 4-tap path is selected when the lowest 32 bits of
// (coeffs[0] | coeffs[3]) are zero.
222 | | // Condition for checking valid vert_filt taps |
223 | 61.9k | if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs[0], coeffs[3]), 0))) |
224 | 32.9k | is_vert_4tap = 1; |
225 | | |
226 | 61.9k | if (is_vert_4tap) { |
227 | 32.9k | const int fo_vert = 1; |
228 | 32.9k | const uint8_t *const src_ptr = src - fo_vert * src_stride; |
229 | 68.4k | for (j = 0; j < w; j += 16) { |
230 | 35.5k | const uint8_t *data = &src_ptr[j]; |
231 | 35.5k | __m256i src4; |
// Prime the window with the first five rows of the strip: s[0], s[1] hold
// the first 8 columns and s[3], s[4] the next 8.
232 | | // Load lines a and b. Line a to lower 128, line b to upper 128 |
233 | 35.5k | { |
234 | 35.5k | __m256i src_ab[4]; |
235 | 35.5k | __m256i src_a[5]; |
236 | 35.5k | src_a[0] = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); |
237 | 177k | for (int kk = 0; kk < 4; ++kk) { |
238 | 142k | data += src_stride; |
239 | 142k | src_a[kk + 1] = |
240 | 142k | _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); |
241 | 142k | src_ab[kk] = |
242 | 142k | _mm256_permute2x128_si256(src_a[kk], src_a[kk + 1], 0x20); |
243 | 142k | } |
244 | 35.5k | src4 = src_a[4]; |
245 | 35.5k | s[0] = _mm256_unpacklo_epi8(src_ab[0], src_ab[1]); |
246 | 35.5k | s[1] = _mm256_unpacklo_epi8(src_ab[2], src_ab[3]); |
247 | | |
248 | 35.5k | s[3] = _mm256_unpackhi_epi8(src_ab[0], src_ab[1]); |
249 | 35.5k | s[4] = _mm256_unpackhi_epi8(src_ab[2], src_ab[3]); |
250 | 35.5k | } |
251 | | |
252 | 241k | for (i = 0; i < h; i += 2) { |
// Fetch the next two rows and append them to the window as s[2] / s[5].
253 | 205k | data = &src_ptr[(i + 5) * src_stride + j]; |
254 | 205k | const __m256i src5 = |
255 | 205k | _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); |
256 | 205k | const __m256i src_45a = _mm256_permute2x128_si256(src4, src5, 0x20); |
257 | | |
258 | 205k | src4 = _mm256_castsi128_si256( |
259 | 205k | _mm_loadu_si128((__m128i *)(data + src_stride))); |
260 | 205k | const __m256i src_56a = _mm256_permute2x128_si256(src5, src4, 0x20); |
261 | | |
262 | 205k | s[2] = _mm256_unpacklo_epi8(src_45a, src_56a); |
263 | 205k | s[5] = _mm256_unpackhi_epi8(src_45a, src_56a); |
264 | | |
265 | 205k | __m256i res_lo = convolve_lowbd_4tap(s, coeffs + 1); |
266 | | |
267 | 205k | res_lo = _mm256_add_epi16(res_lo, offset_const_1); |
268 | | |
// Zero-extend to 32 bits, shift left, round by round_1, then pack back to
// 16 bits with signed saturation.
269 | 205k | const __m256i res_lo_0_32b = _mm256_unpacklo_epi16(res_lo, zero); |
270 | 205k | const __m256i res_lo_0_shift = |
271 | 205k | _mm256_slli_epi32(res_lo_0_32b, left_shift); |
272 | 205k | const __m256i res_lo_0_round = _mm256_sra_epi32( |
273 | 205k | _mm256_add_epi32(res_lo_0_shift, round_const), round_shift); |
274 | | |
275 | 205k | const __m256i res_lo_1_32b = _mm256_unpackhi_epi16(res_lo, zero); |
276 | 205k | const __m256i res_lo_1_shift = |
277 | 205k | _mm256_slli_epi32(res_lo_1_32b, left_shift); |
278 | 205k | const __m256i res_lo_1_round = _mm256_sra_epi32( |
279 | 205k | _mm256_add_epi32(res_lo_1_shift, round_const), round_shift); |
280 | | |
281 | 205k | const __m256i res_lo_round = |
282 | 205k | _mm256_packs_epi32(res_lo_0_round, res_lo_1_round); |
283 | | |
284 | 205k | const __m256i res_lo_unsigned = |
285 | 205k | _mm256_add_epi16(res_lo_round, offset_const_2); |
286 | | |
// Fewer than 16 columns left in this strip: only the first 8 are handled.
287 | 205k | if (w - j < 16) { |
288 | 81.9k | if (do_average) { |
289 | 35.3k | const __m256i data_ref_0 = |
290 | 35.3k | load_line2_avx2(&dst[i * dst_stride + j], |
291 | 35.3k | &dst[i * dst_stride + j + dst_stride]); |
292 | 35.3k | const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_lo_unsigned, |
293 | 35.3k | &wt, use_dist_wtd_comp_avg); |
294 | | |
295 | 35.3k | const __m256i round_result = convolve_rounding( |
296 | 35.3k | &comp_avg_res, &offset_const, &rounding_const, rounding_shift); |
297 | | |
298 | 35.3k | const __m256i res_8 = |
299 | 35.3k | _mm256_packus_epi16(round_result, round_result); |
300 | 35.3k | const __m128i res_0 = _mm256_castsi256_si128(res_8); |
301 | 35.3k | const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); |
302 | | |
303 | 35.3k | if (w - j > 4) { |
304 | 19.3k | _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); |
305 | 19.3k | _mm_storel_epi64( |
306 | 19.3k | (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), |
307 | 19.3k | res_1); |
308 | 19.3k | } else { |
// NOTE(review): 4-byte store through an int* cast of a uint8_t pointer; this
// depends on the compiler tolerating the aliasing/unaligned access. A
// memcpy-based 32-bit store would avoid that.
309 | 15.9k | *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0); |
310 | 15.9k | *(int *)(&dst0[i * dst_stride0 + j + dst_stride0]) = |
311 | 15.9k | _mm_cvtsi128_si32(res_1); |
312 | 15.9k | } |
313 | 46.6k | } else { |
314 | 46.6k | const __m128i res_0 = _mm256_castsi256_si128(res_lo_unsigned); |
315 | 46.6k | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); |
316 | | |
317 | 46.6k | const __m128i res_1 = _mm256_extracti128_si256(res_lo_unsigned, 1); |
318 | 46.6k | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), |
319 | 46.6k | res_1); |
320 | 46.6k | } |
321 | 123k | } else { |
// Full 16-column strip: run the same pipeline on the upper 8 columns.
322 | 123k | __m256i res_hi = convolve_lowbd_4tap(s + 3, coeffs + 1); |
323 | | |
324 | 123k | res_hi = _mm256_add_epi16(res_hi, offset_const_1); |
325 | | |
326 | 123k | const __m256i res_hi_0_32b = _mm256_unpacklo_epi16(res_hi, zero); |
327 | 123k | const __m256i res_hi_0_shift = |
328 | 123k | _mm256_slli_epi32(res_hi_0_32b, left_shift); |
329 | 123k | const __m256i res_hi_0_round = _mm256_sra_epi32( |
330 | 123k | _mm256_add_epi32(res_hi_0_shift, round_const), round_shift); |
331 | | |
332 | 123k | const __m256i res_hi_1_32b = _mm256_unpackhi_epi16(res_hi, zero); |
333 | 123k | const __m256i res_hi_1_shift = |
334 | 123k | _mm256_slli_epi32(res_hi_1_32b, left_shift); |
335 | 123k | const __m256i res_hi_1_round = _mm256_sra_epi32( |
336 | 123k | _mm256_add_epi32(res_hi_1_shift, round_const), round_shift); |
337 | | |
338 | 123k | const __m256i res_hi_round = |
339 | 123k | _mm256_packs_epi32(res_hi_0_round, res_hi_1_round); |
340 | | |
341 | 123k | const __m256i res_hi_unsigned = |
342 | 123k | _mm256_add_epi16(res_hi_round, offset_const_2); |
343 | | |
344 | 123k | if (do_average) { |
345 | 47.8k | const __m256i data_ref_0_lo = |
346 | 47.8k | load_line2_avx2(&dst[i * dst_stride + j], |
347 | 47.8k | &dst[i * dst_stride + j + dst_stride]); |
348 | | |
349 | 47.8k | const __m256i data_ref_0_hi = |
350 | 47.8k | load_line2_avx2(&dst[i * dst_stride + j + 8], |
351 | 47.8k | &dst[i * dst_stride + j + 8 + dst_stride]); |
352 | | |
353 | 47.8k | const __m256i comp_avg_res_lo = comp_avg( |
354 | 47.8k | &data_ref_0_lo, &res_lo_unsigned, &wt, use_dist_wtd_comp_avg); |
355 | | |
356 | 47.8k | const __m256i comp_avg_res_hi = comp_avg( |
357 | 47.8k | &data_ref_0_hi, &res_hi_unsigned, &wt, use_dist_wtd_comp_avg); |
358 | | |
359 | 47.8k | const __m256i round_result_lo = |
360 | 47.8k | convolve_rounding(&comp_avg_res_lo, &offset_const, |
361 | 47.8k | &rounding_const, rounding_shift); |
362 | | |
363 | 47.8k | const __m256i round_result_hi = |
364 | 47.8k | convolve_rounding(&comp_avg_res_hi, &offset_const, |
365 | 47.8k | &rounding_const, rounding_shift); |
366 | | |
367 | 47.8k | const __m256i res_8 = |
368 | 47.8k | _mm256_packus_epi16(round_result_lo, round_result_hi); |
369 | 47.8k | const __m128i res_0 = _mm256_castsi256_si128(res_8); |
370 | 47.8k | const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); |
371 | | |
// NOTE(review): these are aligned 16-byte stores into the 8-bit output, so
// dst0 + offsets must be 16-byte aligned here -- confirm the caller
// guarantees that.
372 | 47.8k | _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); |
373 | 47.8k | _mm_store_si128( |
374 | 47.8k | (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); |
375 | | |
376 | 75.9k | } else { |
377 | 75.9k | const __m128i res_lo_0 = _mm256_castsi256_si128(res_lo_unsigned); |
378 | 75.9k | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_lo_0); |
379 | | |
380 | 75.9k | const __m128i res_lo_1 = |
381 | 75.9k | _mm256_extracti128_si256(res_lo_unsigned, 1); |
382 | 75.9k | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), |
383 | 75.9k | res_lo_1); |
384 | | |
385 | 75.9k | const __m128i res_hi_0 = _mm256_castsi256_si128(res_hi_unsigned); |
386 | 75.9k | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + 8]), |
387 | 75.9k | res_hi_0); |
388 | | |
389 | 75.9k | const __m128i res_hi_1 = |
390 | 75.9k | _mm256_extracti128_si256(res_hi_unsigned, 1); |
391 | 75.9k | _mm_store_si128( |
392 | 75.9k | (__m128i *)(&dst[i * dst_stride + j + 8 + dst_stride]), |
393 | 75.9k | res_hi_1); |
394 | 75.9k | } |
395 | 123k | } |
// Slide the row-pair window down by one entry for the next two output rows.
396 | 205k | s[0] = s[1]; |
397 | 205k | s[1] = s[2]; |
398 | | |
399 | 205k | s[3] = s[4]; |
400 | 205k | s[4] = s[5]; |
401 | 205k | } |
402 | 35.5k | } |
403 | 32.9k | } else { |
// General path: same structure with a wider window; s[0..3] hold the first
// 8 columns and s[4..7] the next 8.
404 | 29.0k | const int fo_vert = filter_params_y->taps / 2 - 1; |
405 | 29.0k | const uint8_t *const src_ptr = src - fo_vert * src_stride; |
406 | 67.9k | for (j = 0; j < w; j += 16) { |
407 | 38.9k | const uint8_t *data = &src_ptr[j]; |
408 | 38.9k | __m256i src6; |
// Prime the window with the first seven rows of the strip. Only
// src_ab[0..5] are written and read; src_ab[6] is unused.
409 | | // Load lines a and b. Line a to lower 128, line b to upper 128 |
410 | 38.9k | { |
411 | 38.9k | __m256i src_ab[7]; |
412 | 38.9k | __m256i src_a[7]; |
413 | 38.9k | src_a[0] = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); |
414 | 272k | for (int kk = 0; kk < 6; ++kk) { |
415 | 233k | data += src_stride; |
416 | 233k | src_a[kk + 1] = |
417 | 233k | _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); |
418 | 233k | src_ab[kk] = |
419 | 233k | _mm256_permute2x128_si256(src_a[kk], src_a[kk + 1], 0x20); |
420 | 233k | } |
421 | 38.9k | src6 = src_a[6]; |
422 | 38.9k | s[0] = _mm256_unpacklo_epi8(src_ab[0], src_ab[1]); |
423 | 38.9k | s[1] = _mm256_unpacklo_epi8(src_ab[2], src_ab[3]); |
424 | 38.9k | s[2] = _mm256_unpacklo_epi8(src_ab[4], src_ab[5]); |
425 | 38.9k | s[4] = _mm256_unpackhi_epi8(src_ab[0], src_ab[1]); |
426 | 38.9k | s[5] = _mm256_unpackhi_epi8(src_ab[2], src_ab[3]); |
427 | 38.9k | s[6] = _mm256_unpackhi_epi8(src_ab[4], src_ab[5]); |
428 | 38.9k | } |
429 | | |
430 | 477k | for (i = 0; i < h; i += 2) { |
// Fetch the next two rows and append them to the window as s[3] / s[7].
431 | 438k | data = &src_ptr[(i + 7) * src_stride + j]; |
432 | 438k | const __m256i src7 = |
433 | 438k | _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); |
434 | 438k | const __m256i src_67a = _mm256_permute2x128_si256(src6, src7, 0x20); |
435 | | |
436 | 438k | src6 = _mm256_castsi128_si256( |
437 | 438k | _mm_loadu_si128((__m128i *)(data + src_stride))); |
438 | 438k | const __m256i src_78a = _mm256_permute2x128_si256(src7, src6, 0x20); |
439 | | |
440 | 438k | s[3] = _mm256_unpacklo_epi8(src_67a, src_78a); |
441 | 438k | s[7] = _mm256_unpackhi_epi8(src_67a, src_78a); |
442 | | |
443 | 438k | __m256i res_lo = convolve_lowbd(s, coeffs); |
444 | | |
445 | 438k | res_lo = _mm256_add_epi16(res_lo, offset_const_1); |
446 | | |
447 | 438k | const __m256i res_lo_0_32b = _mm256_unpacklo_epi16(res_lo, zero); |
448 | 438k | const __m256i res_lo_0_shift = |
449 | 438k | _mm256_slli_epi32(res_lo_0_32b, left_shift); |
450 | 438k | const __m256i res_lo_0_round = _mm256_sra_epi32( |
451 | 438k | _mm256_add_epi32(res_lo_0_shift, round_const), round_shift); |
452 | | |
453 | 438k | const __m256i res_lo_1_32b = _mm256_unpackhi_epi16(res_lo, zero); |
454 | 438k | const __m256i res_lo_1_shift = |
455 | 438k | _mm256_slli_epi32(res_lo_1_32b, left_shift); |
456 | 438k | const __m256i res_lo_1_round = _mm256_sra_epi32( |
457 | 438k | _mm256_add_epi32(res_lo_1_shift, round_const), round_shift); |
458 | | |
459 | 438k | const __m256i res_lo_round = |
460 | 438k | _mm256_packs_epi32(res_lo_0_round, res_lo_1_round); |
461 | | |
462 | 438k | const __m256i res_lo_unsigned = |
463 | 438k | _mm256_add_epi16(res_lo_round, offset_const_2); |
464 | | |
// Fewer than 16 columns left in this strip: only the first 8 are handled.
465 | 438k | if (w - j < 16) { |
466 | 86.5k | if (do_average) { |
467 | 39.3k | const __m256i data_ref_0 = |
468 | 39.3k | load_line2_avx2(&dst[i * dst_stride + j], |
469 | 39.3k | &dst[i * dst_stride + j + dst_stride]); |
470 | 39.3k | const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_lo_unsigned, |
471 | 39.3k | &wt, use_dist_wtd_comp_avg); |
472 | | |
473 | 39.3k | const __m256i round_result = convolve_rounding( |
474 | 39.3k | &comp_avg_res, &offset_const, &rounding_const, rounding_shift); |
475 | | |
476 | 39.3k | const __m256i res_8 = |
477 | 39.3k | _mm256_packus_epi16(round_result, round_result); |
478 | 39.3k | const __m128i res_0 = _mm256_castsi256_si128(res_8); |
479 | 39.3k | const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); |
480 | | |
481 | 39.3k | if (w - j > 4) { |
482 | 30.9k | _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); |
483 | 30.9k | _mm_storel_epi64( |
484 | 30.9k | (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), |
485 | 30.9k | res_1); |
486 | 30.9k | } else { |
// NOTE(review): same int* store caveat as in the 4-tap path above.
487 | 8.41k | *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0); |
488 | 8.41k | *(int *)(&dst0[i * dst_stride0 + j + dst_stride0]) = |
489 | 8.41k | _mm_cvtsi128_si32(res_1); |
490 | 8.41k | } |
491 | 47.1k | } else { |
492 | 47.1k | const __m128i res_0 = _mm256_castsi256_si128(res_lo_unsigned); |
493 | 47.1k | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); |
494 | | |
495 | 47.1k | const __m128i res_1 = _mm256_extracti128_si256(res_lo_unsigned, 1); |
496 | 47.1k | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), |
497 | 47.1k | res_1); |
498 | 47.1k | } |
499 | 351k | } else { |
// Full 16-column strip: run the same pipeline on the upper 8 columns.
500 | 351k | __m256i res_hi = convolve_lowbd(s + 4, coeffs); |
501 | | |
502 | 351k | res_hi = _mm256_add_epi16(res_hi, offset_const_1); |
503 | | |
504 | 351k | const __m256i res_hi_0_32b = _mm256_unpacklo_epi16(res_hi, zero); |
505 | 351k | const __m256i res_hi_0_shift = |
506 | 351k | _mm256_slli_epi32(res_hi_0_32b, left_shift); |
507 | 351k | const __m256i res_hi_0_round = _mm256_sra_epi32( |
508 | 351k | _mm256_add_epi32(res_hi_0_shift, round_const), round_shift); |
509 | | |
510 | 351k | const __m256i res_hi_1_32b = _mm256_unpackhi_epi16(res_hi, zero); |
511 | 351k | const __m256i res_hi_1_shift = |
512 | 351k | _mm256_slli_epi32(res_hi_1_32b, left_shift); |
513 | 351k | const __m256i res_hi_1_round = _mm256_sra_epi32( |
514 | 351k | _mm256_add_epi32(res_hi_1_shift, round_const), round_shift); |
515 | | |
516 | 351k | const __m256i res_hi_round = |
517 | 351k | _mm256_packs_epi32(res_hi_0_round, res_hi_1_round); |
518 | | |
519 | 351k | const __m256i res_hi_unsigned = |
520 | 351k | _mm256_add_epi16(res_hi_round, offset_const_2); |
521 | | |
522 | 351k | if (do_average) { |
523 | 142k | const __m256i data_ref_0_lo = |
524 | 142k | load_line2_avx2(&dst[i * dst_stride + j], |
525 | 142k | &dst[i * dst_stride + j + dst_stride]); |
526 | | |
527 | 142k | const __m256i data_ref_0_hi = |
528 | 142k | load_line2_avx2(&dst[i * dst_stride + j + 8], |
529 | 142k | &dst[i * dst_stride + j + 8 + dst_stride]); |
530 | | |
531 | 142k | const __m256i comp_avg_res_lo = comp_avg( |
532 | 142k | &data_ref_0_lo, &res_lo_unsigned, &wt, use_dist_wtd_comp_avg); |
533 | | |
534 | 142k | const __m256i comp_avg_res_hi = comp_avg( |
535 | 142k | &data_ref_0_hi, &res_hi_unsigned, &wt, use_dist_wtd_comp_avg); |
536 | | |
537 | 142k | const __m256i round_result_lo = |
538 | 142k | convolve_rounding(&comp_avg_res_lo, &offset_const, |
539 | 142k | &rounding_const, rounding_shift); |
540 | | |
541 | 142k | const __m256i round_result_hi = |
542 | 142k | convolve_rounding(&comp_avg_res_hi, &offset_const, |
543 | 142k | &rounding_const, rounding_shift); |
544 | | |
545 | 142k | const __m256i res_8 = |
546 | 142k | _mm256_packus_epi16(round_result_lo, round_result_hi); |
547 | 142k | const __m128i res_0 = _mm256_castsi256_si128(res_8); |
548 | 142k | const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); |
549 | | |
// NOTE(review): aligned 16-byte stores into the 8-bit output; dst0 + offsets
// must be 16-byte aligned here -- confirm the caller guarantees that.
550 | 142k | _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); |
551 | 142k | _mm_store_si128( |
552 | 142k | (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); |
553 | | |
554 | 209k | } else { |
555 | 209k | const __m128i res_lo_0 = _mm256_castsi256_si128(res_lo_unsigned); |
556 | 209k | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_lo_0); |
557 | | |
558 | 209k | const __m128i res_lo_1 = |
559 | 209k | _mm256_extracti128_si256(res_lo_unsigned, 1); |
560 | 209k | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), |
561 | 209k | res_lo_1); |
562 | | |
563 | 209k | const __m128i res_hi_0 = _mm256_castsi256_si128(res_hi_unsigned); |
564 | 209k | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + 8]), |
565 | 209k | res_hi_0); |
566 | | |
567 | 209k | const __m128i res_hi_1 = |
568 | 209k | _mm256_extracti128_si256(res_hi_unsigned, 1); |
569 | 209k | _mm_store_si128( |
570 | 209k | (__m128i *)(&dst[i * dst_stride + j + 8 + dst_stride]), |
571 | 209k | res_hi_1); |
572 | 209k | } |
573 | 351k | } |
// Slide the row-pair window down by one entry for the next two output rows.
574 | 438k | s[0] = s[1]; |
575 | 438k | s[1] = s[2]; |
576 | 438k | s[2] = s[3]; |
577 | | |
578 | 438k | s[4] = s[5]; |
579 | 438k | s[5] = s[6]; |
580 | 438k | s[6] = s[7]; |
581 | 438k | } |
582 | 38.9k | } |
583 | 29.0k | } |
584 | 61.9k | } |
585 | | |
586 | | void av1_dist_wtd_convolve_2d_avx2(const uint8_t *src, int src_stride, |
587 | | uint8_t *dst0, int dst_stride0, int w, int h, |
588 | | const InterpFilterParams *filter_params_x, |
589 | | const InterpFilterParams *filter_params_y, |
590 | | const int subpel_x_qn, const int subpel_y_qn, |
591 | 162k | ConvolveParams *conv_params) { |
592 | 162k | CONV_BUF_TYPE *dst = conv_params->dst; |
593 | 162k | int dst_stride = conv_params->dst_stride; |
594 | 162k | const int bd = 8; |
595 | | |
596 | 162k | DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]); |
597 | | |
598 | 162k | int im_stride = 8; |
599 | 162k | int i, is_horiz_4tap = 0, is_vert_4tap = 0; |
600 | 162k | const __m256i wt = unpack_weights_avx2(conv_params); |
601 | 162k | const int do_average = conv_params->do_average; |
602 | 162k | const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; |
603 | 162k | const int offset_0 = |
604 | 162k | bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; |
605 | 162k | const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); |
606 | 162k | const __m256i offset_const = _mm256_set1_epi16(offset); |
607 | 162k | const int rounding_shift = |
608 | 162k | 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; |
609 | 162k | const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1); |
610 | | |
611 | 162k | assert(conv_params->round_0 > 0); |
612 | | |
613 | 162k | const __m256i round_const_h = _mm256_set1_epi16( |
614 | 162k | ((1 << (conv_params->round_0 - 1)) >> 1) + (1 << (bd + FILTER_BITS - 2))); |
615 | 162k | const __m128i round_shift_h = _mm_cvtsi32_si128(conv_params->round_0 - 1); |
616 | | |
617 | 162k | const __m256i round_const_v = _mm256_set1_epi32( |
618 | 162k | ((1 << conv_params->round_1) >> 1) - |
619 | 162k | (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1))); |
620 | 162k | const __m128i round_shift_v = _mm_cvtsi32_si128(conv_params->round_1); |
621 | | |
622 | 162k | __m256i filt[4], coeffs_x[4], coeffs_y[4]; |
623 | | |
624 | 162k | filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2); |
625 | 162k | filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); |
626 | | |
627 | 162k | prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs_x); |
628 | 162k | prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y); |
629 | | |
630 | | // Condition for checking valid horz_filt taps |
631 | 162k | if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs_x[0], coeffs_x[3]), 0))) |
632 | 68.9k | is_horiz_4tap = 1; |
633 | | |
634 | | // Condition for checking valid vert_filt taps |
635 | 162k | if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs_y[0], coeffs_y[3]), 0))) |
636 | 76.9k | is_vert_4tap = 1; |
637 | | |
638 | 162k | if (is_horiz_4tap) { |
639 | 68.9k | int im_h = h + filter_params_y->taps - 1; |
640 | 68.9k | const int fo_vert = filter_params_y->taps / 2 - 1; |
641 | 68.9k | const int fo_horiz = 1; |
642 | 68.9k | const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; |
643 | 152k | for (int j = 0; j < w; j += 8) { |
644 | | /* Horizontal filter */ |
645 | 83.6k | const uint8_t *src_h = src_ptr + j; |
646 | 897k | for (i = 0; i < im_h; i += 2) { |
647 | 814k | __m256i data = |
648 | 814k | _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)src_h)); |
649 | 814k | if (i + 1 < im_h) |
650 | 730k | data = _mm256_inserti128_si256( |
651 | 814k | data, _mm_loadu_si128((__m128i *)(src_h + src_stride)), 1); |
652 | 814k | src_h += (src_stride << 1); |
653 | 814k | __m256i res = convolve_lowbd_x_4tap(data, coeffs_x + 1, filt); |
654 | | |
655 | 814k | res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), |
656 | 814k | round_shift_h); |
657 | | |
658 | 814k | _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); |
659 | 814k | } |
660 | 83.6k | DIST_WTD_CONVOLVE_VERTICAL_FILTER_8TAP; |
661 | 83.6k | } |
662 | 93.8k | } else if (is_vert_4tap) { |
663 | 22.3k | int im_h = h + 3; |
664 | 22.3k | const int fo_vert = 1; |
665 | 22.3k | const int fo_horiz = filter_params_x->taps / 2 - 1; |
666 | 22.3k | const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; |
667 | | |
668 | 22.3k | filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); |
669 | 22.3k | filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); |
670 | | |
671 | 53.8k | for (int j = 0; j < w; j += 8) { |
672 | | /* Horizontal filter */ |
673 | 31.5k | const uint8_t *src_h = src_ptr + j; |
674 | 31.5k | DIST_WTD_CONVOLVE_HORIZONTAL_FILTER_8TAP; |
675 | | |
676 | | /* Vertical filter */ |
677 | 31.5k | __m256i s[6]; |
678 | 31.5k | __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); |
679 | 31.5k | __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); |
680 | 31.5k | __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); |
681 | 31.5k | __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); |
682 | | |
683 | 31.5k | s[0] = _mm256_unpacklo_epi16(s0, s1); |
684 | 31.5k | s[1] = _mm256_unpacklo_epi16(s2, s3); |
685 | | |
686 | 31.5k | s[3] = _mm256_unpackhi_epi16(s0, s1); |
687 | 31.5k | s[4] = _mm256_unpackhi_epi16(s2, s3); |
688 | | |
689 | 143k | for (i = 0; i < h; i += 2) { |
690 | 111k | const int16_t *data = &im_block[i * im_stride]; |
691 | | |
692 | 111k | const __m256i s4 = |
693 | 111k | _mm256_loadu_si256((__m256i *)(data + 4 * im_stride)); |
694 | 111k | const __m256i s5 = |
695 | 111k | _mm256_loadu_si256((__m256i *)(data + 5 * im_stride)); |
696 | | |
697 | 111k | s[2] = _mm256_unpacklo_epi16(s4, s5); |
698 | 111k | s[5] = _mm256_unpackhi_epi16(s4, s5); |
699 | | |
700 | 111k | const __m256i res_a = convolve_4tap(s, coeffs_y + 1); |
701 | 111k | const __m256i res_a_round = _mm256_sra_epi32( |
702 | 111k | _mm256_add_epi32(res_a, round_const_v), round_shift_v); |
703 | | |
704 | 111k | if (w - j > 4) { |
705 | 111k | const __m256i res_b = convolve_4tap(s + 3, coeffs_y + 1); |
706 | 111k | const __m256i res_b_round = _mm256_sra_epi32( |
707 | 111k | _mm256_add_epi32(res_b, round_const_v), round_shift_v); |
708 | 111k | const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_b_round); |
709 | 111k | const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const); |
710 | | |
711 | 111k | if (do_average) { |
712 | 44.3k | const __m256i data_ref_0 = |
713 | 44.3k | load_line2_avx2(&dst[i * dst_stride + j], |
714 | 44.3k | &dst[i * dst_stride + j + dst_stride]); |
715 | 44.3k | const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned, |
716 | 44.3k | &wt, use_dist_wtd_comp_avg); |
717 | | |
718 | 44.3k | const __m256i round_result = convolve_rounding( |
719 | 44.3k | &comp_avg_res, &offset_const, &rounding_const, rounding_shift); |
720 | | |
721 | 44.3k | const __m256i res_8 = |
722 | 44.3k | _mm256_packus_epi16(round_result, round_result); |
723 | 44.3k | const __m128i res_0 = _mm256_castsi256_si128(res_8); |
724 | 44.3k | const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); |
725 | | |
726 | 44.3k | _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); |
727 | 44.3k | _mm_storel_epi64( |
728 | 44.3k | (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); |
729 | 67.2k | } else { |
730 | 67.2k | const __m128i res_0 = _mm256_castsi256_si128(res_unsigned); |
731 | 67.2k | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); |
732 | | |
733 | 67.2k | const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1); |
734 | 67.2k | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), |
735 | 67.2k | res_1); |
736 | 67.2k | } |
737 | 111k | } else { |
738 | 0 | const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_a_round); |
739 | 0 | const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const); |
740 | |
|
741 | 0 | if (do_average) { |
742 | 0 | const __m256i data_ref_0 = |
743 | 0 | load_line2_avx2(&dst[i * dst_stride + j], |
744 | 0 | &dst[i * dst_stride + j + dst_stride]); |
745 | |
|
746 | 0 | const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned, |
747 | 0 | &wt, use_dist_wtd_comp_avg); |
748 | |
|
749 | 0 | const __m256i round_result = convolve_rounding( |
750 | 0 | &comp_avg_res, &offset_const, &rounding_const, rounding_shift); |
751 | |
|
752 | 0 | const __m256i res_8 = |
753 | 0 | _mm256_packus_epi16(round_result, round_result); |
754 | 0 | const __m128i res_0 = _mm256_castsi256_si128(res_8); |
755 | 0 | const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); |
756 | |
|
757 | 0 | *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0); |
758 | 0 | *(int *)(&dst0[i * dst_stride0 + j + dst_stride0]) = |
759 | 0 | _mm_cvtsi128_si32(res_1); |
760 | |
|
761 | 0 | } else { |
762 | 0 | const __m128i res_0 = _mm256_castsi256_si128(res_unsigned); |
763 | 0 | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); |
764 | |
|
765 | 0 | const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1); |
766 | 0 | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), |
767 | 0 | res_1); |
768 | 0 | } |
769 | 0 | } |
770 | 111k | s[0] = s[1]; |
771 | 111k | s[1] = s[2]; |
772 | 111k | s[3] = s[4]; |
773 | 111k | s[4] = s[5]; |
774 | 111k | } |
775 | 31.5k | } |
776 | 71.4k | } else { |
777 | 71.4k | int im_h = h + filter_params_y->taps - 1; |
778 | 71.4k | const int fo_vert = filter_params_y->taps / 2 - 1; |
779 | 71.4k | const int fo_horiz = filter_params_x->taps / 2 - 1; |
780 | 71.4k | const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; |
781 | | |
782 | 71.4k | filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); |
783 | 71.4k | filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); |
784 | | |
785 | 238k | for (int j = 0; j < w; j += 8) { |
786 | | /* Horizontal filter */ |
787 | 166k | const uint8_t *src_h = src_ptr + j; |
788 | 166k | DIST_WTD_CONVOLVE_HORIZONTAL_FILTER_8TAP; |
789 | | |
790 | 166k | DIST_WTD_CONVOLVE_VERTICAL_FILTER_8TAP; |
791 | 166k | } |
792 | 71.4k | } |
793 | 162k | } |
794 | | |
/*
 * DO_NO_AVG_2D_COPY_4X16(r0, c0, r1, c1, r2, c2, r3, c3)
 *
 * Processes four groups of 16 pixels, one group per (row, column) pair.
 * For each pair (rK, cK) it:
 *   1. loads 16 bytes from &src[rK * src_stride + cK] (unaligned load),
 *   2. zero-extends every byte to a 16-bit lane,
 *   3. shifts each lane left by LEFT_SHIFT and adds offset_const,
 *   4. writes the 16 results to &dst[rK * dst_stride + cK].
 *
 * The macro takes everything except the row/column numbers from the
 * expansion site: src, src_stride, dst, dst_stride, offset_const and the
 * __m256i temporaries src_0..src_3. LEFT_SHIFT is defined after this macro
 * but before the first expansion.
 *
 * The arguments are pasted unparenthesized into "rK * stride + cK", so they
 * should be plain constants or identifiers.
 *
 * NOTE(review): step 4 uses _mm256_store_si256, the aligned-store intrinsic,
 * so every dst address produced here is expected to be 32-byte aligned.
 * Confirm against the callers' dst and dst_stride.
 */
795 | | #define DO_NO_AVG_2D_COPY_4X16(r0, c0, r1, c1, r2, c2, r3, c3) \ |
796 | 40.7M | do { \ |
797 | 40.7M | src_0 = _mm256_cvtepu8_epi16( \ |
798 | 40.7M | _mm_loadu_si128((__m128i *)(&src[r0 * src_stride + c0]))); \ |
799 | 40.7M | src_1 = _mm256_cvtepu8_epi16( \ |
800 | 40.7M | _mm_loadu_si128((__m128i *)(&src[r1 * src_stride + c1]))); \ |
801 | 40.7M | src_2 = _mm256_cvtepu8_epi16( \ |
802 | 40.7M | _mm_loadu_si128((__m128i *)(&src[r2 * src_stride + c2]))); \ |
803 | 40.7M | src_3 = _mm256_cvtepu8_epi16( \ |
804 | 40.7M | _mm_loadu_si128((__m128i *)(&src[r3 * src_stride + c3]))); \ |
805 | 40.7M | \ |
806 | 40.7M | src_0 = _mm256_slli_epi16(src_0, LEFT_SHIFT); \ |
807 | 40.7M | src_1 = _mm256_slli_epi16(src_1, LEFT_SHIFT); \ |
808 | 40.7M | src_2 = _mm256_slli_epi16(src_2, LEFT_SHIFT); \ |
809 | 40.7M | src_3 = _mm256_slli_epi16(src_3, LEFT_SHIFT); \ |
810 | 40.7M | \ |
811 | 40.7M | src_0 = _mm256_add_epi16(src_0, offset_const); \ |
812 | 40.7M | src_1 = _mm256_add_epi16(src_1, offset_const); \ |
813 | 40.7M | src_2 = _mm256_add_epi16(src_2, offset_const); \ |
814 | 40.7M | src_3 = _mm256_add_epi16(src_3, offset_const); \ |
815 | 40.7M | \ |
816 | 40.7M | _mm256_store_si256((__m256i *)(&dst[r0 * dst_stride + c0]), src_0); \ |
817 | 40.7M | _mm256_store_si256((__m256i *)(&dst[r1 * dst_stride + c1]), src_1); \ |
818 | 40.7M | _mm256_store_si256((__m256i *)(&dst[r2 * dst_stride + c2]), src_2); \ |
819 | 40.7M | _mm256_store_si256((__m256i *)(&dst[r3 * dst_stride + c3]), src_3); \ |
820 | 40.7M | } while (0) |
821 | | |
/*
 * Left shift applied to every widened source pixel by the copy kernels below.
 * The literals 3 and 7 are the round_0 and round_1 values asserted in
 * av1_dist_wtd_convolve_2d_copy_avx2, so this equals that function's
 * rounding_shift (2 * FILTER_BITS - round_0 - round_1).
 */
822 | 325M | #define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7) |
/*
 * Non-averaging copy: for each source pixel, writes
 * (pixel << LEFT_SHIFT) + offset_const as a 16-bit value into dst.
 * Nothing is read from dst.
 *
 * src, src_stride  8-bit source block and its row stride.
 * dst, dst_stride  destination buffer and its row stride in elements.
 * w, h             block width and height.
 * offset_const     value added to every shifted pixel.
 *
 * Width dispatch:
 *   w == 128, 64   one row per iteration, 16 columns per macro group.
 *   w == 32        two rows per iteration.
 *   w == 16        four rows per iteration.
 *   other w >= 16  no branch matches, so nothing is written.
 *   w < 16         four rows per iteration; each row loads 8 source bytes
 *                  and stores 8 results, whatever the exact value of w.
 *
 * Every loop is a do/while that ends only when i reaches exactly 0, so h
 * must be a positive multiple of the rows consumed per iteration.
 *
 * NOTE(review): all stores are aligned-store intrinsics
 * (_mm256_store_si256 / _mm_store_si128). Confirm that dst and dst_stride
 * guarantee the required alignment for every row.
 */
823 | | static inline void av1_dist_wtd_convolve_2d_no_avg_copy_avx2( |
824 | | const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, |
825 | 580k | int w, int h, const __m256i offset_const) { |
826 | 580k | int i = h; | // rows left to process; each loop must land exactly on 0
827 | 580k | if (w >= 16) { |
828 | 447k | __m256i src_0, src_1, src_2, src_3; |
829 | 447k | if (w == 128) { |
830 | 10.7M | do { |
831 | 10.7M | DO_NO_AVG_2D_COPY_4X16(0, 0, 0, 16, 0, 32, 0, 48); |
832 | 10.7M | DO_NO_AVG_2D_COPY_4X16(0, 64, 0, 80, 0, 96, 0, 112); |
833 | 10.7M | src += 1 * src_stride; |
834 | 10.7M | dst += 1 * dst_stride; |
835 | 10.7M | i -= 1; |
836 | 10.7M | } while (i); |
837 | 363k | } else if (w == 64) { |
838 | 16.2M | do { |
839 | 16.2M | DO_NO_AVG_2D_COPY_4X16(0, 0, 0, 16, 0, 32, 0, 48); |
840 | 16.2M | src += 1 * src_stride; |
841 | 16.2M | dst += 1 * dst_stride; |
842 | 16.2M | i -= 1; |
843 | 16.2M | } while (i); |
844 | 212k | } else if (w == 32) { |
845 | 2.87M | do { |
846 | 2.87M | DO_NO_AVG_2D_COPY_4X16(0, 0, 1, 0, 0, 16, 1, 16); | // rows 0 and 1, columns 0 and 16
847 | 2.87M | src += 2 * src_stride; |
848 | 2.87M | dst += 2 * dst_stride; |
849 | 2.87M | i -= 2; |
850 | 2.87M | } while (i); |
851 | 101k | } else if (w == 16) { |
852 | 234k | do { |
853 | 234k | DO_NO_AVG_2D_COPY_4X16(0, 0, 1, 0, 2, 0, 3, 0); | // rows 0..3, column 0
854 | 234k | src += 4 * src_stride; |
855 | 234k | dst += 4 * dst_stride; |
856 | 234k | i -= 4; |
857 | 234k | } while (i); |
858 | 49.7k | } |
859 | 447k | } else { | // w < 16: four rows per iteration, 8 pixels per row
860 | 132k | const __m256i zero = _mm256_setzero_si256(); |
861 | 334k | do { |
862 | 334k | const __m128i src_row_0 = |
863 | 334k | _mm_loadl_epi64((__m128i *)(&src[0 * src_stride])); |
864 | 334k | const __m128i src_row_1 = |
865 | 334k | _mm_loadl_epi64((__m128i *)(&src[1 * src_stride])); |
866 | 334k | const __m128i src_row_2 = |
867 | 334k | _mm_loadl_epi64((__m128i *)(&src[2 * src_stride])); |
868 | 334k | const __m128i src_row_3 = |
869 | 334k | _mm_loadl_epi64((__m128i *)(&src[3 * src_stride])); |
870 | | 
871 | 334k | __m256i src_10 = _mm256_insertf128_si256( |
872 | 334k | _mm256_castsi128_si256(src_row_0), src_row_1, 1); |
873 | 334k | __m256i src_32 = _mm256_insertf128_si256( |
874 | 334k | _mm256_castsi128_si256(src_row_2), src_row_3, 1); |
875 | | 
876 | 334k | src_10 = _mm256_unpacklo_epi8(src_10, zero); |
877 | 334k | src_32 = _mm256_unpacklo_epi8(src_32, zero); |
878 | | 
879 | 334k | src_10 = _mm256_slli_epi16(src_10, LEFT_SHIFT); |
880 | 334k | src_32 = _mm256_slli_epi16(src_32, LEFT_SHIFT); |
881 | | 
882 | 334k | src_10 = _mm256_add_epi16(src_10, offset_const); |
883 | 334k | src_32 = _mm256_add_epi16(src_32, offset_const); |
884 | | 
885 | | // Store the four rows to the destination buffer (plain overwrite; dst is not read) |
886 | 334k | _mm_store_si128((__m128i *)(&dst[0 * dst_stride]), |
887 | 334k | _mm256_castsi256_si128(src_10)); |
888 | 334k | _mm_store_si128((__m128i *)(&dst[1 * dst_stride]), |
889 | 334k | _mm256_extracti128_si256(src_10, 1)); |
890 | 334k | _mm_store_si128((__m128i *)(&dst[2 * dst_stride]), |
891 | 334k | _mm256_castsi256_si128(src_32)); |
892 | 334k | _mm_store_si128((__m128i *)(&dst[3 * dst_stride]), |
893 | 334k | _mm256_extracti128_si256(src_32, 1)); |
894 | | 
895 | 334k | src += 4 * src_stride; |
896 | 334k | dst += 4 * dst_stride; |
897 | 334k | i -= 4; |
898 | 334k | } while (i); |
899 | 132k | } |
900 | 580k | } |
901 | | |
/*
 * DO_AVG_2D_COPY_4X16(USE_DIST_WEIGHTED, r0, c0, r1, c1, r2, c2, r3, c3)
 *
 * Averaging counterpart of the 4x16 copy step. For each of the four
 * (row, column) pairs it:
 *   1. loads 16 source bytes, zero-extends them to 16 bits, shifts left by
 *      LEFT_SHIFT and adds offset_const,
 *   2. loads the 16 values already stored at the same position in dst
 *      (unaligned load),
 *   3. combines the two with comp_avg(ref, src, wt, USE_DIST_WEIGHTED),
 *   4. passes the result through convolve_rounding(..., offset_const,
 *      rounding_const, rounding_shift).
 * Groups 0/1 and 2/3 are then packed to unsigned 8 bit with
 * _mm256_packus_epi16. The 0xD8 permute reorders the 64-bit quarters to
 * 0, 2, 1, 3, so the low 128 bits hold the 16 pixels of the first group and
 * the high 128 bits those of the second. Each half is written to dst0 at
 * the matching (row, column) using dst_stride0.
 *
 * Names taken from the expansion site: src, src_stride, dst, dst_stride,
 * dst0, dst_stride0, wt, offset_const, rounding_const, rounding_shift and
 * the __m256i temporaries src_0..3, ref_0..3, res_0..3, res_10, res_32.
 * The row/column arguments are pasted unparenthesized, so they should be
 * plain constants or identifiers.
 *
 * NOTE(review): the dst0 writes use _mm_store_si128, the aligned 16-byte
 * store. Confirm dst0 and dst_stride0 guarantee that alignment.
 */
902 | | #define DO_AVG_2D_COPY_4X16(USE_DIST_WEIGHTED, r0, c0, r1, c1, r2, c2, r3, c3) \ |
903 | 40.4M | do { \ |
904 | 40.4M | src_0 = _mm256_cvtepu8_epi16( \ |
905 | 40.4M | _mm_loadu_si128((__m128i *)(&src[r0 * src_stride + c0]))); \ |
906 | 40.4M | src_1 = _mm256_cvtepu8_epi16( \ |
907 | 40.4M | _mm_loadu_si128((__m128i *)(&src[r1 * src_stride + c1]))); \ |
908 | 40.4M | src_2 = _mm256_cvtepu8_epi16( \ |
909 | 40.4M | _mm_loadu_si128((__m128i *)(&src[r2 * src_stride + c2]))); \ |
910 | 40.4M | src_3 = _mm256_cvtepu8_epi16( \ |
911 | 40.4M | _mm_loadu_si128((__m128i *)(&src[r3 * src_stride + c3]))); \ |
912 | 40.4M | \ |
913 | 40.4M | src_0 = _mm256_slli_epi16(src_0, LEFT_SHIFT); \ |
914 | 40.4M | src_1 = _mm256_slli_epi16(src_1, LEFT_SHIFT); \ |
915 | 40.4M | src_2 = _mm256_slli_epi16(src_2, LEFT_SHIFT); \ |
916 | 40.4M | src_3 = _mm256_slli_epi16(src_3, LEFT_SHIFT); \ |
917 | 40.4M | src_0 = _mm256_add_epi16(src_0, offset_const); \ |
918 | 40.4M | src_1 = _mm256_add_epi16(src_1, offset_const); \ |
919 | 40.4M | src_2 = _mm256_add_epi16(src_2, offset_const); \ |
920 | 40.4M | src_3 = _mm256_add_epi16(src_3, offset_const); \ |
921 | 40.4M | \ |
922 | 40.4M | ref_0 = _mm256_loadu_si256((__m256i *)(&dst[r0 * dst_stride + c0])); \ |
923 | 40.4M | ref_1 = _mm256_loadu_si256((__m256i *)(&dst[r1 * dst_stride + c1])); \ |
924 | 40.4M | ref_2 = _mm256_loadu_si256((__m256i *)(&dst[r2 * dst_stride + c2])); \ |
925 | 40.4M | ref_3 = _mm256_loadu_si256((__m256i *)(&dst[r3 * dst_stride + c3])); \ |
926 | 40.4M | \ |
927 | 40.4M | res_0 = comp_avg(&ref_0, &src_0, &wt, USE_DIST_WEIGHTED); \ |
928 | 40.4M | res_1 = comp_avg(&ref_1, &src_1, &wt, USE_DIST_WEIGHTED); \ |
929 | 40.4M | res_2 = comp_avg(&ref_2, &src_2, &wt, USE_DIST_WEIGHTED); \ |
930 | 40.4M | res_3 = comp_avg(&ref_3, &src_3, &wt, USE_DIST_WEIGHTED); \ |
931 | 40.4M | \ |
932 | 40.4M | res_0 = convolve_rounding(&res_0, &offset_const, &rounding_const, \ |
933 | 40.4M | rounding_shift); \ |
934 | 40.4M | res_1 = convolve_rounding(&res_1, &offset_const, &rounding_const, \ |
935 | 40.4M | rounding_shift); \ |
936 | 40.4M | res_2 = convolve_rounding(&res_2, &offset_const, &rounding_const, \ |
937 | 40.4M | rounding_shift); \ |
938 | 40.4M | res_3 = convolve_rounding(&res_3, &offset_const, &rounding_const, \ |
939 | 40.4M | rounding_shift); \ |
940 | 40.4M | \ |
941 | 40.4M | res_10 = _mm256_packus_epi16(res_0, res_1); \ |
942 | 40.4M | res_32 = _mm256_packus_epi16(res_2, res_3); \ |
943 | 40.4M | res_10 = _mm256_permute4x64_epi64(res_10, 0xD8); \ |
944 | 40.4M | res_32 = _mm256_permute4x64_epi64(res_32, 0xD8); \ |
945 | 40.4M | \ |
946 | 40.4M | _mm_store_si128((__m128i *)(&dst0[r0 * dst_stride0 + c0]), \ |
947 | 40.4M | _mm256_castsi256_si128(res_10)); \ |
948 | 40.4M | _mm_store_si128((__m128i *)(&dst0[r1 * dst_stride0 + c1]), \ |
949 | 40.4M | _mm256_extracti128_si256(res_10, 1)); \ |
950 | 40.4M | _mm_store_si128((__m128i *)(&dst0[r2 * dst_stride0 + c2]), \ |
951 | 40.4M | _mm256_castsi256_si128(res_32)); \ |
952 | 40.4M | _mm_store_si128((__m128i *)(&dst0[r3 * dst_stride0 + c3]), \ |
953 | 40.4M | _mm256_extracti128_si256(res_32, 1)); \ |
954 | 40.4M | } while (0) |
955 | | |
/*
 * DO_AVG_2D_COPY(USE_DIST_WEIGHTED)
 *
 * Body of the averaging copy path, written as a macro so the
 * USE_DIST_WEIGHTED argument reaches every comp_avg call as a literal.
 *
 * The expansion is a sequence of statements, not a do { } while (0) block:
 * it declares "int i = h;" and then runs one if/else chain. It can
 * therefore be expanded only once per scope. It advances the expansion
 * site's src, dst and dst0 pointers as it goes.
 *
 * Names taken from the expansion site: src, src_stride, dst, dst_stride,
 * dst0, dst_stride0, w, h, wt, zero, offset_const, rounding_const,
 * rounding_shift.
 *
 * Width dispatch:
 *   w == 128, 64   one row per iteration via DO_AVG_2D_COPY_4X16.
 *   w == 32        two rows per iteration.
 *   other w >= 16  asserted to be 16; four rows per iteration.
 *   w == 8         four rows per iteration. Rows 0/1 and 2/3 are paired in
 *                  the two 128-bit halves. After _mm256_packus_epi16 the
 *                  low half holds rows 0 and 2 (res_20) and the high half
 *                  rows 1 and 3 (res_31), hence the storel/storeh order.
 *   otherwise      asserted to be w == 4; four rows per iteration. Rows 0/1
 *                  share the low half and rows 2/3 the high half. Each dst
 *                  row is read as one int64_t, and each output row is
 *                  written to dst0 as one int through a pointer cast.
 *
 * Every loop is a do/while that ends only when i reaches exactly 0, so h
 * must be a positive multiple of the rows consumed per iteration.
 *
 * NOTE(review): the w == 4 path assumes one int64_t covers exactly the four
 * CONV_BUF_TYPE values of a row, and it uses int and int64_t pointer casts
 * for memory access. Confirm the element size and that these casts are
 * acceptable for the supported compilers.
 */
956 | | #define DO_AVG_2D_COPY(USE_DIST_WEIGHTED) \ |
957 | 483k | int i = h; \ |
958 | 483k | if (w >= 16) { \ |
959 | 414k | __m256i src_0, src_1, src_2, src_3; \ |
960 | 414k | __m256i ref_0, ref_1, ref_2, ref_3; \ |
961 | 414k | __m256i res_0, res_1, res_2, res_3; \ |
962 | 414k | __m256i res_10, res_32; \ |
963 | 414k | if (w == 128) { \ |
964 | 10.7M | do { \ |
965 | 10.7M | DO_AVG_2D_COPY_4X16(USE_DIST_WEIGHTED, 0, 0, 0, 16, 0, 32, 0, 48); \ |
966 | 10.7M | DO_AVG_2D_COPY_4X16(USE_DIST_WEIGHTED, 0, 64, 0, 80, 0, 96, 0, 112); \ |
967 | 10.7M | i -= 1; \ |
968 | 10.7M | src += 1 * src_stride; \ |
969 | 10.7M | dst += 1 * dst_stride; \ |
970 | 10.7M | dst0 += 1 * dst_stride0; \ |
971 | 10.7M | } while (i); \ |
972 | 330k | } else if (w == 64) { \ |
973 | 16.1M | do { \ |
974 | 16.1M | DO_AVG_2D_COPY_4X16(USE_DIST_WEIGHTED, 0, 0, 0, 16, 0, 32, 0, 48); \ |
975 | 16.1M | \ |
976 | 16.1M | i -= 1; \ |
977 | 16.1M | src += 1 * src_stride; \ |
978 | 16.1M | dst += 1 * dst_stride; \ |
979 | 16.1M | dst0 += 1 * dst_stride0; \ |
980 | 16.1M | } while (i); \ |
981 | 210k | } else if (w == 32) { \ |
982 | 2.77M | do { \ |
983 | 2.77M | DO_AVG_2D_COPY_4X16(USE_DIST_WEIGHTED, 0, 0, 1, 0, 0, 16, 1, 16); \ |
984 | 2.77M | \ |
985 | 2.77M | i -= 2; \ |
986 | 2.77M | src += 2 * src_stride; \ |
987 | 2.77M | dst += 2 * dst_stride; \ |
988 | 2.77M | dst0 += 2 * dst_stride0; \ |
989 | 2.77M | } while (i); \ |
990 | 93.5k | } else { \ |
991 | 26.1k | assert(w == 16); \ |
992 | 103k | do { \ |
993 | 103k | DO_AVG_2D_COPY_4X16(USE_DIST_WEIGHTED, 0, 0, 1, 0, 2, 0, 3, 0); \ |
994 | 103k | \ |
995 | 103k | i -= 4; \ |
996 | 103k | src += 4 * src_stride; \ |
997 | 103k | dst += 4 * dst_stride; \ |
998 | 103k | dst0 += 4 * dst_stride0; \ |
999 | 103k | } while (i); \ |
1000 | 26.3k | } \ |
1001 | 414k | } else if (w == 8) { \ |
1002 | 111k | do { \ |
1003 | 111k | const __m128i src_0 = \ |
1004 | 111k | _mm_loadl_epi64((__m128i *)(&src[0 * src_stride])); \ |
1005 | 111k | const __m128i src_1 = \ |
1006 | 111k | _mm_loadl_epi64((__m128i *)(&src[1 * src_stride])); \ |
1007 | 111k | const __m128i src_2 = \ |
1008 | 111k | _mm_loadl_epi64((__m128i *)(&src[2 * src_stride])); \ |
1009 | 111k | const __m128i src_3 = \ |
1010 | 111k | _mm_loadl_epi64((__m128i *)(&src[3 * src_stride])); \ |
1011 | 111k | __m256i src_10 = \ |
1012 | 111k | _mm256_insertf128_si256(_mm256_castsi128_si256(src_0), src_1, 1); \ |
1013 | 111k | __m256i src_32 = \ |
1014 | 111k | _mm256_insertf128_si256(_mm256_castsi128_si256(src_2), src_3, 1); \ |
1015 | 111k | \ |
1016 | 111k | src_10 = _mm256_unpacklo_epi8(src_10, zero); \ |
1017 | 111k | src_32 = _mm256_unpacklo_epi8(src_32, zero); \ |
1018 | 111k | \ |
1019 | 111k | src_10 = _mm256_slli_epi16(src_10, LEFT_SHIFT); \ |
1020 | 111k | src_32 = _mm256_slli_epi16(src_32, LEFT_SHIFT); \ |
1021 | 111k | \ |
1022 | 111k | src_10 = _mm256_add_epi16(src_10, offset_const); \ |
1023 | 111k | src_32 = _mm256_add_epi16(src_32, offset_const); \ |
1024 | 111k | \ |
1025 | 111k | const __m256i ref_10 = \ |
1026 | 111k | load_line2_avx2(&dst[0 * dst_stride], &dst[1 * dst_stride]); \ |
1027 | 111k | const __m256i ref_32 = \ |
1028 | 111k | load_line2_avx2(&dst[2 * dst_stride], &dst[3 * dst_stride]); \ |
1029 | 111k | __m256i res_10 = comp_avg(&ref_10, &src_10, &wt, USE_DIST_WEIGHTED); \ |
1030 | 111k | __m256i res_32 = comp_avg(&ref_32, &src_32, &wt, USE_DIST_WEIGHTED); \ |
1031 | 111k | \ |
1032 | 111k | res_10 = convolve_rounding(&res_10, &offset_const, &rounding_const, \ |
1033 | 111k | rounding_shift); \ |
1034 | 111k | res_32 = convolve_rounding(&res_32, &offset_const, &rounding_const, \ |
1035 | 111k | rounding_shift); \ |
1036 | 111k | \ |
1037 | 111k | __m256i res = _mm256_packus_epi16(res_10, res_32); \ |
1038 | 111k | const __m128i res_20 = _mm256_castsi256_si128(res); \ |
1039 | 111k | const __m128i res_31 = _mm256_extracti128_si256(res, 1); \ |
1040 | 111k | \ |
1041 | 111k | _mm_storel_epi64((__m128i *)(&dst0[0 * dst_stride0]), res_20); \ |
1042 | 111k | _mm_storel_epi64((__m128i *)((&dst0[1 * dst_stride0])), res_31); \ |
1043 | 111k | _mm_storeh_epi64((__m128i *)(&dst0[2 * dst_stride0]), res_20); \ |
1044 | 111k | _mm_storeh_epi64((__m128i *)((&dst0[3 * dst_stride0])), res_31); \ |
1045 | 111k | i -= 4; \ |
1046 | 111k | src += 4 * src_stride; \ |
1047 | 111k | dst += 4 * dst_stride; \ |
1048 | 111k | dst0 += 4 * dst_stride0; \ |
1049 | 111k | } while (i); \ |
1050 | 41.2k | } else { \ |
1051 | 28.1k | assert(w == 4); \ |
1052 | 44.0k | do { \ |
1053 | 44.0k | __m256i src_3210_8bit = \ |
1054 | 44.0k | _mm256_setr_epi32(loadu_int32(src + 0 * src_stride), \ |
1055 | 44.0k | loadu_int32(src + 1 * src_stride), 0, 0, \ |
1056 | 44.0k | loadu_int32(src + 2 * src_stride), \ |
1057 | 44.0k | loadu_int32(src + 3 * src_stride), 0, 0); \ |
1058 | 44.0k | \ |
1059 | 44.0k | __m256i src_3210 = _mm256_unpacklo_epi8(src_3210_8bit, zero); \ |
1060 | 44.0k | src_3210 = _mm256_slli_epi16(src_3210, LEFT_SHIFT); \ |
1061 | 44.0k | src_3210 = _mm256_add_epi16(src_3210, offset_const); \ |
1062 | 44.0k | \ |
1063 | 44.0k | __m256i ref_3210 = \ |
1064 | 44.0k | _mm256_setr_epi64x(*(int64_t *)(dst + 0 * dst_stride), \ |
1065 | 44.0k | *(int64_t *)(dst + 1 * dst_stride), \ |
1066 | 44.0k | *(int64_t *)(dst + 2 * dst_stride), \ |
1067 | 44.0k | *(int64_t *)(dst + 3 * dst_stride)); \ |
1068 | 44.0k | __m256i res_3210 = \ |
1069 | 44.0k | comp_avg(&ref_3210, &src_3210, &wt, USE_DIST_WEIGHTED); \ |
1070 | 44.0k | \ |
1071 | 44.0k | res_3210 = convolve_rounding(&res_3210, &offset_const, &rounding_const, \ |
1072 | 44.0k | rounding_shift); \ |
1073 | 44.0k | \ |
1074 | 44.0k | res_3210 = _mm256_packus_epi16(res_3210, res_3210); \ |
1075 | 44.0k | const __m128i res_10 = _mm256_castsi256_si128(res_3210); \ |
1076 | 44.0k | const __m128i res_32 = _mm256_extracti128_si256(res_3210, 1); \ |
1077 | 44.0k | \ |
1078 | 44.0k | *(int *)(&dst0[0 * dst_stride0]) = _mm_cvtsi128_si32(res_10); \ |
1079 | 44.0k | *(int *)(&dst0[2 * dst_stride0]) = _mm_cvtsi128_si32(res_32); \ |
1080 | 44.0k | *(int *)(&dst0[1 * dst_stride0]) = _mm_extract_epi32(res_10, 1); \ |
1081 | 44.0k | *(int *)(&dst0[3 * dst_stride0]) = _mm_extract_epi32(res_32, 1); \ |
1082 | 44.0k | i -= 4; \ |
1083 | 44.0k | src += 4 * src_stride; \ |
1084 | 44.0k | dst += 4 * dst_stride; \ |
1085 | 44.0k | dst0 += 4 * dst_stride0; \ |
1086 | 44.0k | } while (i); \ |
1087 | 28.1k | } |
1088 | | |
/*
 * Copy (no filtering) variant of the dist-weighted 2D convolve.
 *
 * src, src_stride    8-bit source block and its row stride.
 * dst0, dst_stride0  8-bit output, written only when do_average is set.
 * w, h               block size; both are asserted to be multiples of 4.
 * conv_params        supplies the intermediate buffer (dst, dst_stride),
 *                    do_average, use_dist_wtd_comp_avg and the weights read
 *                    by unpack_weights_avx2. round_0 and round_1 are
 *                    asserted to be 3 and 7, the values hard-coded in
 *                    LEFT_SHIFT.
 *
 * With bd fixed at 8:
 *   offset_0       = bd + 2 * FILTER_BITS - round_0 - round_1
 *   offset         = (1 << offset_0) + (1 << (offset_0 - 1))
 *   rounding_shift = 2 * FILTER_BITS - round_0 - round_1
 *
 * Dispatch:
 *   do_average == 0  av1_dist_wtd_convolve_2d_no_avg_copy_avx2 writes the
 *                    shifted and offset pixels into conv_params->dst; dst0
 *                    is not passed to it.
 *   do_average != 0  DO_AVG_2D_COPY combines the shifted source with the
 *                    values already in conv_params->dst and writes 8-bit
 *                    results to dst0. The macro is expanded once per branch
 *                    with a literal 1 or 0, so use_dist_wtd_comp_avg only
 *                    selects the expansion.
 *
 * The locals wt, zero, rounding_const and rounding_shift are referenced
 * only inside the DO_AVG_2D_COPY expansions.
 */
1089 | | void av1_dist_wtd_convolve_2d_copy_avx2(const uint8_t *src, int src_stride, |
1090 | | uint8_t *dst0, int dst_stride0, int w, |
1091 | 1.06M | int h, ConvolveParams *conv_params) { |
1092 | 1.06M | const int bd = 8; |
1093 | 1.06M | CONV_BUF_TYPE *dst = conv_params->dst; |
1094 | 1.06M | int dst_stride = conv_params->dst_stride; |
1095 | 1.06M | assert(conv_params->round_0 == 3); |
1096 | 1.06M | assert(conv_params->round_1 == 7); |
1097 | 1.06M | assert(w % 4 == 0); |
1098 | 1.06M | assert(h % 4 == 0); |
1099 | | 
1100 | 1.06M | const int do_average = conv_params->do_average; |
1101 | 1.06M | const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; |
1102 | 1.06M | const __m256i wt = unpack_weights_avx2(conv_params); |
1103 | 1.06M | const __m256i zero = _mm256_setzero_si256(); |
1104 | | 
1105 | 1.06M | const int offset_0 = |
1106 | 1.06M | bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; |
1107 | 1.06M | const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); |
1108 | 1.06M | const __m256i offset_const = _mm256_set1_epi16(offset); |
1109 | 1.06M | const int rounding_shift = |
1110 | 1.06M | 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; |
1111 | 1.06M | const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1); |
1112 | | 
1113 | 1.06M | if (do_average) { |
1114 | 483k | if (use_dist_wtd_comp_avg) { |
1115 | 50.2k | DO_AVG_2D_COPY(1) |
1116 | 433k | } else { |
1117 | 433k | DO_AVG_2D_COPY(0) |
1118 | 433k | } |
1119 | 579k | } else { |
1120 | 579k | av1_dist_wtd_convolve_2d_no_avg_copy_avx2(src, src_stride, dst, dst_stride, |
1121 | 579k | w, h, offset_const); |
1122 | 579k | } |
1123 | 1.06M | } |
1124 | | #undef LEFT_SHIFT | // LEFT_SHIFT is private to the copy kernels above