/src/aom/av1/common/x86/jnt_convolve_avx2.c
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) 2018, Alliance for Open Media. All rights reserved. |
3 | | * |
4 | | * This source code is subject to the terms of the BSD 2 Clause License and |
5 | | * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
6 | | * was not distributed with this source code in the LICENSE file, you can |
7 | | * obtain it at www.aomedia.org/license/software. If the Alliance for Open |
8 | | * Media Patent License 1.0 was not distributed with this source code in the |
9 | | * PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
10 | | */ |
11 | | |
12 | | #include <emmintrin.h> |
13 | | #include <immintrin.h> |
14 | | |
15 | | #include "config/av1_rtcd.h" |
16 | | |
17 | | #include "aom_dsp/aom_dsp_common.h" |
18 | | #include "aom_dsp/aom_filter.h" |
19 | | #include "aom_dsp/x86/convolve_avx2.h" |
20 | | #include "aom_dsp/x86/convolve_common_intrin.h" |
21 | | #include "aom_dsp/x86/convolve_sse4_1.h" |
22 | | #include "aom_dsp/x86/mem_sse2.h" |
23 | | #include "aom_dsp/x86/synonyms_avx2.h" |
24 | | |
25 | | #include "av1/common/convolve.h" |
26 | | |
27 | 792k | static inline __m256i unpack_weights_avx2(ConvolveParams *conv_params) { |
28 | 792k | const int w0 = conv_params->fwd_offset; |
29 | 792k | const int w1 = conv_params->bck_offset; |
30 | 792k | const __m256i wt0 = _mm256_set1_epi16((int16_t)w0); |
31 | 792k | const __m256i wt1 = _mm256_set1_epi16((int16_t)w1); |
32 | 792k | const __m256i wt = _mm256_unpacklo_epi16(wt0, wt1); |
33 | 792k | return wt; |
34 | 792k | } |
35 | | |
36 | 4.08M | static inline __m256i load_line2_avx2(const void *a, const void *b) { |
37 | 4.08M | return _mm256_permute2x128_si256( |
38 | 4.08M | _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)a)), |
39 | 4.08M | _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)b)), 0x20); |
40 | 4.08M | } |
41 | | |
42 | | void av1_dist_wtd_convolve_x_avx2(const uint8_t *src, int src_stride, |
43 | | uint8_t *dst0, int dst_stride0, int w, int h, |
44 | | const InterpFilterParams *filter_params_x, |
45 | | const int subpel_x_qn, |
46 | 75.1k | ConvolveParams *conv_params) { |
47 | 75.1k | CONV_BUF_TYPE *dst = conv_params->dst; |
48 | 75.1k | int dst_stride = conv_params->dst_stride; |
49 | 75.1k | const int bd = 8; |
50 | 75.1k | int i, j, is_horiz_4tap = 0; |
51 | 75.1k | const int bits = FILTER_BITS - conv_params->round_1; |
52 | 75.1k | const __m256i wt = unpack_weights_avx2(conv_params); |
53 | 75.1k | const int do_average = conv_params->do_average; |
54 | 75.1k | const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; |
55 | 75.1k | const int offset_0 = |
56 | 75.1k | bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; |
57 | 75.1k | const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); |
58 | 75.1k | const __m256i offset_const = _mm256_set1_epi16(offset); |
59 | 75.1k | const int rounding_shift = |
60 | 75.1k | 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; |
61 | 75.1k | const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1); |
62 | | |
63 | 75.1k | assert(bits >= 0); |
64 | 75.1k | assert(conv_params->round_0 > 0); |
65 | | |
66 | 75.1k | const __m256i round_const = |
67 | 75.1k | _mm256_set1_epi16((1 << (conv_params->round_0 - 1)) >> 1); |
68 | 75.1k | const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0 - 1); |
69 | | |
70 | 75.1k | __m256i filt[4], coeffs[4]; |
71 | | |
72 | 75.1k | filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2); |
73 | 75.1k | filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); |
74 | | |
75 | 75.1k | prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs); |
76 | | |
77 | | // Condition for checking valid horz_filt taps |
78 | 75.1k | if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs[0], coeffs[3]), 0))) |
79 | 25.8k | is_horiz_4tap = 1; |
80 | | |
81 | | // horz_filt as 4 tap |
82 | 75.1k | if (is_horiz_4tap) { |
83 | 25.8k | const int fo_horiz = 1; |
84 | 25.8k | const uint8_t *const src_ptr = src - fo_horiz; |
85 | 160k | for (i = 0; i < h; i += 2) { |
86 | 134k | const uint8_t *src_data = src_ptr + i * src_stride; |
87 | 134k | CONV_BUF_TYPE *dst_data = dst + i * dst_stride; |
88 | 637k | for (j = 0; j < w; j += 8) { |
89 | 502k | const __m256i data = |
90 | 502k | load_line2_avx2(&src_data[j], &src_data[j + src_stride]); |
91 | | |
92 | 502k | __m256i res = convolve_lowbd_x_4tap(data, coeffs + 1, filt); |
93 | 502k | res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const), round_shift); |
94 | 502k | res = _mm256_slli_epi16(res, bits); |
95 | | |
96 | 502k | const __m256i res_unsigned = _mm256_add_epi16(res, offset_const); |
97 | | |
98 | | // Accumulate values into the destination buffer |
99 | 502k | if (do_average) { |
100 | 133k | const __m256i data_ref_0 = |
101 | 133k | load_line2_avx2(&dst_data[j], &dst_data[j + dst_stride]); |
102 | 133k | const __m256i comp_avg_res = |
103 | 133k | comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); |
104 | | |
105 | 133k | const __m256i round_result = convolve_rounding( |
106 | 133k | &comp_avg_res, &offset_const, &rounding_const, rounding_shift); |
107 | | |
108 | 133k | const __m256i res_8 = _mm256_packus_epi16(round_result, round_result); |
109 | 133k | const __m128i res_0 = _mm256_castsi256_si128(res_8); |
110 | 133k | const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); |
111 | | |
112 | 133k | if (w > 4) { |
113 | 111k | _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); |
114 | 111k | _mm_storel_epi64( |
115 | 111k | (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); |
116 | 111k | } else { |
117 | 21.3k | *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0); |
118 | 21.3k | *(int *)(&dst0[i * dst_stride0 + j + dst_stride0]) = |
119 | 21.3k | _mm_cvtsi128_si32(res_1); |
120 | 21.3k | } |
121 | 369k | } else { |
122 | 369k | const __m128i res_0 = _mm256_castsi256_si128(res_unsigned); |
123 | 369k | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); |
124 | | |
125 | 369k | const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1); |
126 | 369k | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), |
127 | 369k | res_1); |
128 | 369k | } |
129 | 502k | } |
130 | 134k | } |
131 | 49.3k | } else { |
132 | 49.3k | const int fo_horiz = filter_params_x->taps / 2 - 1; |
133 | 49.3k | const uint8_t *const src_ptr = src - fo_horiz; |
134 | | |
135 | 49.3k | filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); |
136 | 49.3k | filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); |
137 | 428k | for (i = 0; i < h; i += 2) { |
138 | 379k | const uint8_t *src_data = src_ptr + i * src_stride; |
139 | 379k | CONV_BUF_TYPE *dst_data = dst + i * dst_stride; |
140 | 1.88M | for (j = 0; j < w; j += 8) { |
141 | 1.51M | const __m256i data = |
142 | 1.51M | load_line2_avx2(&src_data[j], &src_data[j + src_stride]); |
143 | | |
144 | 1.51M | __m256i res = convolve_lowbd_x(data, coeffs, filt); |
145 | | |
146 | 1.51M | res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const), round_shift); |
147 | | |
148 | 1.51M | res = _mm256_slli_epi16(res, bits); |
149 | | |
150 | 1.51M | const __m256i res_unsigned = _mm256_add_epi16(res, offset_const); |
151 | | |
152 | | // Accumulate values into the destination buffer |
153 | 1.51M | if (do_average) { |
154 | 614k | const __m256i data_ref_0 = |
155 | 614k | load_line2_avx2(&dst_data[j], &dst_data[j + dst_stride]); |
156 | 614k | const __m256i comp_avg_res = |
157 | 614k | comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); |
158 | | |
159 | 614k | const __m256i round_result = convolve_rounding( |
160 | 614k | &comp_avg_res, &offset_const, &rounding_const, rounding_shift); |
161 | | |
162 | 614k | const __m256i res_8 = _mm256_packus_epi16(round_result, round_result); |
163 | 614k | const __m128i res_0 = _mm256_castsi256_si128(res_8); |
164 | 614k | const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); |
165 | | |
166 | 614k | if (w > 4) { |
167 | 614k | _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); |
168 | 614k | _mm_storel_epi64( |
169 | 614k | (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); |
170 | 614k | } else { |
171 | 8 | *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0); |
172 | 8 | *(int *)(&dst0[i * dst_stride0 + j + dst_stride0]) = |
173 | 8 | _mm_cvtsi128_si32(res_1); |
174 | 8 | } |
175 | 896k | } else { |
176 | 896k | const __m128i res_0 = _mm256_castsi256_si128(res_unsigned); |
177 | 896k | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); |
178 | | |
179 | 896k | const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1); |
180 | 896k | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), |
181 | 896k | res_1); |
182 | 896k | } |
183 | 1.51M | } |
184 | 379k | } |
185 | 49.3k | } |
186 | 75.1k | } |
187 | | |
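Before the vertical kernel, a scalar model of one sample from the horizontal-only path above may help when reading the SIMD (sketch only, helper and parameter names made up). The taps are assumed pre-halved, as prepare_coeffs_lowbd() does for the maddubs-based kernels, which is why the code shifts by (round_0 - 1) instead of round_0:

static inline CONV_BUF_TYPE dist_wtd_x_sample_scalar(const uint8_t *src_px,
                                                     const int8_t *half_taps,
                                                     int ntaps,
                                                     const ConvolveParams *cp) {
  const int bd = 8;
  const int bits = FILTER_BITS - cp->round_1;
  const int offset_0 = bd + 2 * FILTER_BITS - cp->round_0 - cp->round_1;
  const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
  int sum = 0;
  for (int k = 0; k < ntaps; ++k) sum += half_taps[k] * src_px[k];
  // Round by (round_0 - 1) because the taps were halved, scale back up by
  // 'bits', then add the compound offset, matching offset_const above.
  const int res = ROUND_POWER_OF_TWO(sum, cp->round_0 - 1) << bits;
  return (CONV_BUF_TYPE)(res + offset);
}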
188 | | void av1_dist_wtd_convolve_y_avx2(const uint8_t *src, int src_stride, |
189 | | uint8_t *dst0, int dst_stride0, int w, int h, |
190 | | const InterpFilterParams *filter_params_y, |
191 | | const int subpel_y_qn, |
192 | 45.2k | ConvolveParams *conv_params) { |
193 | 45.2k | CONV_BUF_TYPE *dst = conv_params->dst; |
194 | 45.2k | int dst_stride = conv_params->dst_stride; |
195 | 45.2k | const int bd = 8; |
196 | 45.2k | int i, j, is_vert_4tap = 0; |
197 | | // +1 to compensate for dividing the filter coeffs by 2 |
198 | 45.2k | const int left_shift = FILTER_BITS - conv_params->round_0 + 1; |
199 | 45.2k | const __m256i round_const = |
200 | 45.2k | _mm256_set1_epi32((1 << conv_params->round_1) >> 1); |
201 | 45.2k | const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1); |
202 | 45.2k | const __m256i wt = unpack_weights_avx2(conv_params); |
203 | 45.2k | const int do_average = conv_params->do_average; |
204 | 45.2k | const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; |
205 | 45.2k | const int offset_0 = |
206 | 45.2k | bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; |
207 | 45.2k | const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); |
208 | 45.2k | const __m256i offset_const = _mm256_set1_epi16(offset); |
209 | 45.2k | const int offset_1 = (1 << (bd + FILTER_BITS - 2)); |
210 | 45.2k | const __m256i offset_const_1 = _mm256_set1_epi16(offset_1); |
211 | 45.2k | const __m256i offset_const_2 = _mm256_set1_epi16((1 << offset_0)); |
212 | 45.2k | const int rounding_shift = |
213 | 45.2k | 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; |
214 | 45.2k | const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1); |
215 | 45.2k | const __m256i zero = _mm256_setzero_si256(); |
216 | 45.2k | __m256i coeffs[4], s[8]; |
217 | | |
218 | 45.2k | assert((FILTER_BITS - conv_params->round_0) >= 0); |
219 | | |
220 | 45.2k | prepare_coeffs_lowbd(filter_params_y, subpel_y_qn, coeffs); |
221 | | |
222 | | // Condition for checking valid vert_filt taps |
223 | 45.2k | if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs[0], coeffs[3]), 0))) |
224 | 19.9k | is_vert_4tap = 1; |
225 | | |
226 | 45.2k | if (is_vert_4tap) { |
227 | 19.9k | const int fo_vert = 1; |
228 | 19.9k | const uint8_t *const src_ptr = src - fo_vert * src_stride; |
229 | 40.6k | for (j = 0; j < w; j += 16) { |
230 | 20.7k | const uint8_t *data = &src_ptr[j]; |
231 | 20.7k | __m256i src4; |
232 | | // Load lines a and b. Line a to lower 128, line b to upper 128 |
233 | 20.7k | { |
234 | 20.7k | __m256i src_ab[4]; |
235 | 20.7k | __m256i src_a[5]; |
236 | 20.7k | src_a[0] = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); |
237 | 103k | for (int kk = 0; kk < 4; ++kk) { |
238 | 83.0k | data += src_stride; |
239 | 83.0k | src_a[kk + 1] = |
240 | 83.0k | _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); |
241 | 83.0k | src_ab[kk] = |
242 | 83.0k | _mm256_permute2x128_si256(src_a[kk], src_a[kk + 1], 0x20); |
243 | 83.0k | } |
244 | 20.7k | src4 = src_a[4]; |
245 | 20.7k | s[0] = _mm256_unpacklo_epi8(src_ab[0], src_ab[1]); |
246 | 20.7k | s[1] = _mm256_unpacklo_epi8(src_ab[2], src_ab[3]); |
247 | | |
248 | 20.7k | s[3] = _mm256_unpackhi_epi8(src_ab[0], src_ab[1]); |
249 | 20.7k | s[4] = _mm256_unpackhi_epi8(src_ab[2], src_ab[3]); |
250 | 20.7k | } |
251 | | |
252 | 110k | for (i = 0; i < h; i += 2) { |
253 | 89.7k | data = &src_ptr[(i + 5) * src_stride + j]; |
254 | 89.7k | const __m256i src5 = |
255 | 89.7k | _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); |
256 | 89.7k | const __m256i src_45a = _mm256_permute2x128_si256(src4, src5, 0x20); |
257 | | |
258 | 89.7k | src4 = _mm256_castsi128_si256( |
259 | 89.7k | _mm_loadu_si128((__m128i *)(data + src_stride))); |
260 | 89.7k | const __m256i src_56a = _mm256_permute2x128_si256(src5, src4, 0x20); |
261 | | |
262 | 89.7k | s[2] = _mm256_unpacklo_epi8(src_45a, src_56a); |
263 | 89.7k | s[5] = _mm256_unpackhi_epi8(src_45a, src_56a); |
264 | | |
265 | 89.7k | __m256i res_lo = convolve_lowbd_4tap(s, coeffs + 1); |
266 | | |
267 | 89.7k | res_lo = _mm256_add_epi16(res_lo, offset_const_1); |
268 | | |
269 | 89.7k | const __m256i res_lo_0_32b = _mm256_unpacklo_epi16(res_lo, zero); |
270 | 89.7k | const __m256i res_lo_0_shift = |
271 | 89.7k | _mm256_slli_epi32(res_lo_0_32b, left_shift); |
272 | 89.7k | const __m256i res_lo_0_round = _mm256_sra_epi32( |
273 | 89.7k | _mm256_add_epi32(res_lo_0_shift, round_const), round_shift); |
274 | | |
275 | 89.7k | const __m256i res_lo_1_32b = _mm256_unpackhi_epi16(res_lo, zero); |
276 | 89.7k | const __m256i res_lo_1_shift = |
277 | 89.7k | _mm256_slli_epi32(res_lo_1_32b, left_shift); |
278 | 89.7k | const __m256i res_lo_1_round = _mm256_sra_epi32( |
279 | 89.7k | _mm256_add_epi32(res_lo_1_shift, round_const), round_shift); |
280 | | |
281 | 89.7k | const __m256i res_lo_round = |
282 | 89.7k | _mm256_packs_epi32(res_lo_0_round, res_lo_1_round); |
283 | | |
284 | 89.7k | const __m256i res_lo_unsigned = |
285 | 89.7k | _mm256_add_epi16(res_lo_round, offset_const_2); |
286 | | |
287 | 89.7k | if (w - j < 16) { |
288 | 46.5k | if (do_average) { |
289 | 26.0k | const __m256i data_ref_0 = |
290 | 26.0k | load_line2_avx2(&dst[i * dst_stride + j], |
291 | 26.0k | &dst[i * dst_stride + j + dst_stride]); |
292 | 26.0k | const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_lo_unsigned, |
293 | 26.0k | &wt, use_dist_wtd_comp_avg); |
294 | | |
295 | 26.0k | const __m256i round_result = convolve_rounding( |
296 | 26.0k | &comp_avg_res, &offset_const, &rounding_const, rounding_shift); |
297 | | |
298 | 26.0k | const __m256i res_8 = |
299 | 26.0k | _mm256_packus_epi16(round_result, round_result); |
300 | 26.0k | const __m128i res_0 = _mm256_castsi256_si128(res_8); |
301 | 26.0k | const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); |
302 | | |
303 | 26.0k | if (w - j > 4) { |
304 | 13.6k | _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); |
305 | 13.6k | _mm_storel_epi64( |
306 | 13.6k | (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), |
307 | 13.6k | res_1); |
308 | 13.6k | } else { |
309 | 12.3k | *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0); |
310 | 12.3k | *(int *)(&dst0[i * dst_stride0 + j + dst_stride0]) = |
311 | 12.3k | _mm_cvtsi128_si32(res_1); |
312 | 12.3k | } |
313 | 26.0k | } else { |
314 | 20.4k | const __m128i res_0 = _mm256_castsi256_si128(res_lo_unsigned); |
315 | 20.4k | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); |
316 | | |
317 | 20.4k | const __m128i res_1 = _mm256_extracti128_si256(res_lo_unsigned, 1); |
318 | 20.4k | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), |
319 | 20.4k | res_1); |
320 | 20.4k | } |
321 | 46.5k | } else { |
322 | 43.2k | __m256i res_hi = convolve_lowbd_4tap(s + 3, coeffs + 1); |
323 | | |
324 | 43.2k | res_hi = _mm256_add_epi16(res_hi, offset_const_1); |
325 | | |
326 | 43.2k | const __m256i res_hi_0_32b = _mm256_unpacklo_epi16(res_hi, zero); |
327 | 43.2k | const __m256i res_hi_0_shift = |
328 | 43.2k | _mm256_slli_epi32(res_hi_0_32b, left_shift); |
329 | 43.2k | const __m256i res_hi_0_round = _mm256_sra_epi32( |
330 | 43.2k | _mm256_add_epi32(res_hi_0_shift, round_const), round_shift); |
331 | | |
332 | 43.2k | const __m256i res_hi_1_32b = _mm256_unpackhi_epi16(res_hi, zero); |
333 | 43.2k | const __m256i res_hi_1_shift = |
334 | 43.2k | _mm256_slli_epi32(res_hi_1_32b, left_shift); |
335 | 43.2k | const __m256i res_hi_1_round = _mm256_sra_epi32( |
336 | 43.2k | _mm256_add_epi32(res_hi_1_shift, round_const), round_shift); |
337 | | |
338 | 43.2k | const __m256i res_hi_round = |
339 | 43.2k | _mm256_packs_epi32(res_hi_0_round, res_hi_1_round); |
340 | | |
341 | 43.2k | const __m256i res_hi_unsigned = |
342 | 43.2k | _mm256_add_epi16(res_hi_round, offset_const_2); |
343 | | |
344 | 43.2k | if (do_average) { |
345 | 17.4k | const __m256i data_ref_0_lo = |
346 | 17.4k | load_line2_avx2(&dst[i * dst_stride + j], |
347 | 17.4k | &dst[i * dst_stride + j + dst_stride]); |
348 | | |
349 | 17.4k | const __m256i data_ref_0_hi = |
350 | 17.4k | load_line2_avx2(&dst[i * dst_stride + j + 8], |
351 | 17.4k | &dst[i * dst_stride + j + 8 + dst_stride]); |
352 | | |
353 | 17.4k | const __m256i comp_avg_res_lo = comp_avg( |
354 | 17.4k | &data_ref_0_lo, &res_lo_unsigned, &wt, use_dist_wtd_comp_avg); |
355 | | |
356 | 17.4k | const __m256i comp_avg_res_hi = comp_avg( |
357 | 17.4k | &data_ref_0_hi, &res_hi_unsigned, &wt, use_dist_wtd_comp_avg); |
358 | | |
359 | 17.4k | const __m256i round_result_lo = |
360 | 17.4k | convolve_rounding(&comp_avg_res_lo, &offset_const, |
361 | 17.4k | &rounding_const, rounding_shift); |
362 | | |
363 | 17.4k | const __m256i round_result_hi = |
364 | 17.4k | convolve_rounding(&comp_avg_res_hi, &offset_const, |
365 | 17.4k | &rounding_const, rounding_shift); |
366 | | |
367 | 17.4k | const __m256i res_8 = |
368 | 17.4k | _mm256_packus_epi16(round_result_lo, round_result_hi); |
369 | 17.4k | const __m128i res_0 = _mm256_castsi256_si128(res_8); |
370 | 17.4k | const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); |
371 | | |
372 | 17.4k | _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); |
373 | 17.4k | _mm_store_si128( |
374 | 17.4k | (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); |
375 | | |
376 | 25.7k | } else { |
377 | 25.7k | const __m128i res_lo_0 = _mm256_castsi256_si128(res_lo_unsigned); |
378 | 25.7k | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_lo_0); |
379 | | |
380 | 25.7k | const __m128i res_lo_1 = |
381 | 25.7k | _mm256_extracti128_si256(res_lo_unsigned, 1); |
382 | 25.7k | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), |
383 | 25.7k | res_lo_1); |
384 | | |
385 | 25.7k | const __m128i res_hi_0 = _mm256_castsi256_si128(res_hi_unsigned); |
386 | 25.7k | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + 8]), |
387 | 25.7k | res_hi_0); |
388 | | |
389 | 25.7k | const __m128i res_hi_1 = |
390 | 25.7k | _mm256_extracti128_si256(res_hi_unsigned, 1); |
391 | 25.7k | _mm_store_si128( |
392 | 25.7k | (__m128i *)(&dst[i * dst_stride + j + 8 + dst_stride]), |
393 | 25.7k | res_hi_1); |
394 | 25.7k | } |
395 | 43.2k | } |
396 | 89.7k | s[0] = s[1]; |
397 | 89.7k | s[1] = s[2]; |
398 | | |
399 | 89.7k | s[3] = s[4]; |
400 | 89.7k | s[4] = s[5]; |
401 | 89.7k | } |
402 | 20.7k | } |
403 | 25.3k | } else { |
404 | 25.3k | const int fo_vert = filter_params_y->taps / 2 - 1; |
405 | 25.3k | const uint8_t *const src_ptr = src - fo_vert * src_stride; |
406 | 59.2k | for (j = 0; j < w; j += 16) { |
407 | 33.9k | const uint8_t *data = &src_ptr[j]; |
408 | 33.9k | __m256i src6; |
409 | | // Load lines a and b. Line a to lower 128, line b to upper 128 |
410 | 33.9k | { |
411 | 33.9k | __m256i src_ab[7]; |
412 | 33.9k | __m256i src_a[7]; |
413 | 33.9k | src_a[0] = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); |
414 | 237k | for (int kk = 0; kk < 6; ++kk) { |
415 | 203k | data += src_stride; |
416 | 203k | src_a[kk + 1] = |
417 | 203k | _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); |
418 | 203k | src_ab[kk] = |
419 | 203k | _mm256_permute2x128_si256(src_a[kk], src_a[kk + 1], 0x20); |
420 | 203k | } |
421 | 33.9k | src6 = src_a[6]; |
422 | 33.9k | s[0] = _mm256_unpacklo_epi8(src_ab[0], src_ab[1]); |
423 | 33.9k | s[1] = _mm256_unpacklo_epi8(src_ab[2], src_ab[3]); |
424 | 33.9k | s[2] = _mm256_unpacklo_epi8(src_ab[4], src_ab[5]); |
425 | 33.9k | s[4] = _mm256_unpackhi_epi8(src_ab[0], src_ab[1]); |
426 | 33.9k | s[5] = _mm256_unpackhi_epi8(src_ab[2], src_ab[3]); |
427 | 33.9k | s[6] = _mm256_unpackhi_epi8(src_ab[4], src_ab[5]); |
428 | 33.9k | } |
429 | | |
430 | 422k | for (i = 0; i < h; i += 2) { |
431 | 388k | data = &src_ptr[(i + 7) * src_stride + j]; |
432 | 388k | const __m256i src7 = |
433 | 388k | _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); |
434 | 388k | const __m256i src_67a = _mm256_permute2x128_si256(src6, src7, 0x20); |
435 | | |
436 | 388k | src6 = _mm256_castsi128_si256( |
437 | 388k | _mm_loadu_si128((__m128i *)(data + src_stride))); |
438 | 388k | const __m256i src_78a = _mm256_permute2x128_si256(src7, src6, 0x20); |
439 | | |
440 | 388k | s[3] = _mm256_unpacklo_epi8(src_67a, src_78a); |
441 | 388k | s[7] = _mm256_unpackhi_epi8(src_67a, src_78a); |
442 | | |
443 | 388k | __m256i res_lo = convolve_lowbd(s, coeffs); |
444 | | |
445 | 388k | res_lo = _mm256_add_epi16(res_lo, offset_const_1); |
446 | | |
447 | 388k | const __m256i res_lo_0_32b = _mm256_unpacklo_epi16(res_lo, zero); |
448 | 388k | const __m256i res_lo_0_shift = |
449 | 388k | _mm256_slli_epi32(res_lo_0_32b, left_shift); |
450 | 388k | const __m256i res_lo_0_round = _mm256_sra_epi32( |
451 | 388k | _mm256_add_epi32(res_lo_0_shift, round_const), round_shift); |
452 | | |
453 | 388k | const __m256i res_lo_1_32b = _mm256_unpackhi_epi16(res_lo, zero); |
454 | 388k | const __m256i res_lo_1_shift = |
455 | 388k | _mm256_slli_epi32(res_lo_1_32b, left_shift); |
456 | 388k | const __m256i res_lo_1_round = _mm256_sra_epi32( |
457 | 388k | _mm256_add_epi32(res_lo_1_shift, round_const), round_shift); |
458 | | |
459 | 388k | const __m256i res_lo_round = |
460 | 388k | _mm256_packs_epi32(res_lo_0_round, res_lo_1_round); |
461 | | |
462 | 388k | const __m256i res_lo_unsigned = |
463 | 388k | _mm256_add_epi16(res_lo_round, offset_const_2); |
464 | | |
465 | 388k | if (w - j < 16) { |
466 | 68.8k | if (do_average) { |
467 | 33.7k | const __m256i data_ref_0 = |
468 | 33.7k | load_line2_avx2(&dst[i * dst_stride + j], |
469 | 33.7k | &dst[i * dst_stride + j + dst_stride]); |
470 | 33.7k | const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_lo_unsigned, |
471 | 33.7k | &wt, use_dist_wtd_comp_avg); |
472 | | |
473 | 33.7k | const __m256i round_result = convolve_rounding( |
474 | 33.7k | &comp_avg_res, &offset_const, &rounding_const, rounding_shift); |
475 | | |
476 | 33.7k | const __m256i res_8 = |
477 | 33.7k | _mm256_packus_epi16(round_result, round_result); |
478 | 33.7k | const __m128i res_0 = _mm256_castsi256_si128(res_8); |
479 | 33.7k | const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); |
480 | | |
481 | 33.7k | if (w - j > 4) { |
482 | 27.0k | _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); |
483 | 27.0k | _mm_storel_epi64( |
484 | 27.0k | (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), |
485 | 27.0k | res_1); |
486 | 27.0k | } else { |
487 | 6.78k | *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0); |
488 | 6.78k | *(int *)(&dst0[i * dst_stride0 + j + dst_stride0]) = |
489 | 6.78k | _mm_cvtsi128_si32(res_1); |
490 | 6.78k | } |
491 | 35.0k | } else { |
492 | 35.0k | const __m128i res_0 = _mm256_castsi256_si128(res_lo_unsigned); |
493 | 35.0k | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); |
494 | | |
495 | 35.0k | const __m128i res_1 = _mm256_extracti128_si256(res_lo_unsigned, 1); |
496 | 35.0k | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), |
497 | 35.0k | res_1); |
498 | 35.0k | } |
499 | 319k | } else { |
500 | 319k | __m256i res_hi = convolve_lowbd(s + 4, coeffs); |
501 | | |
502 | 319k | res_hi = _mm256_add_epi16(res_hi, offset_const_1); |
503 | | |
504 | 319k | const __m256i res_hi_0_32b = _mm256_unpacklo_epi16(res_hi, zero); |
505 | 319k | const __m256i res_hi_0_shift = |
506 | 319k | _mm256_slli_epi32(res_hi_0_32b, left_shift); |
507 | 319k | const __m256i res_hi_0_round = _mm256_sra_epi32( |
508 | 319k | _mm256_add_epi32(res_hi_0_shift, round_const), round_shift); |
509 | | |
510 | 319k | const __m256i res_hi_1_32b = _mm256_unpackhi_epi16(res_hi, zero); |
511 | 319k | const __m256i res_hi_1_shift = |
512 | 319k | _mm256_slli_epi32(res_hi_1_32b, left_shift); |
513 | 319k | const __m256i res_hi_1_round = _mm256_sra_epi32( |
514 | 319k | _mm256_add_epi32(res_hi_1_shift, round_const), round_shift); |
515 | | |
516 | 319k | const __m256i res_hi_round = |
517 | 319k | _mm256_packs_epi32(res_hi_0_round, res_hi_1_round); |
518 | | |
519 | 319k | const __m256i res_hi_unsigned = |
520 | 319k | _mm256_add_epi16(res_hi_round, offset_const_2); |
521 | | |
522 | 319k | if (do_average) { |
523 | 137k | const __m256i data_ref_0_lo = |
524 | 137k | load_line2_avx2(&dst[i * dst_stride + j], |
525 | 137k | &dst[i * dst_stride + j + dst_stride]); |
526 | | |
527 | 137k | const __m256i data_ref_0_hi = |
528 | 137k | load_line2_avx2(&dst[i * dst_stride + j + 8], |
529 | 137k | &dst[i * dst_stride + j + 8 + dst_stride]); |
530 | | |
531 | 137k | const __m256i comp_avg_res_lo = comp_avg( |
532 | 137k | &data_ref_0_lo, &res_lo_unsigned, &wt, use_dist_wtd_comp_avg); |
533 | | |
534 | 137k | const __m256i comp_avg_res_hi = comp_avg( |
535 | 137k | &data_ref_0_hi, &res_hi_unsigned, &wt, use_dist_wtd_comp_avg); |
536 | | |
537 | 137k | const __m256i round_result_lo = |
538 | 137k | convolve_rounding(&comp_avg_res_lo, &offset_const, |
539 | 137k | &rounding_const, rounding_shift); |
540 | | |
541 | 137k | const __m256i round_result_hi = |
542 | 137k | convolve_rounding(&comp_avg_res_hi, &offset_const, |
543 | 137k | &rounding_const, rounding_shift); |
544 | | |
545 | 137k | const __m256i res_8 = |
546 | 137k | _mm256_packus_epi16(round_result_lo, round_result_hi); |
547 | 137k | const __m128i res_0 = _mm256_castsi256_si128(res_8); |
548 | 137k | const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); |
549 | | |
550 | 137k | _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); |
551 | 137k | _mm_store_si128( |
552 | 137k | (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); |
553 | | |
554 | 182k | } else { |
555 | 182k | const __m128i res_lo_0 = _mm256_castsi256_si128(res_lo_unsigned); |
556 | 182k | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_lo_0); |
557 | | |
558 | 182k | const __m128i res_lo_1 = |
559 | 182k | _mm256_extracti128_si256(res_lo_unsigned, 1); |
560 | 182k | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), |
561 | 182k | res_lo_1); |
562 | | |
563 | 182k | const __m128i res_hi_0 = _mm256_castsi256_si128(res_hi_unsigned); |
564 | 182k | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + 8]), |
565 | 182k | res_hi_0); |
566 | | |
567 | 182k | const __m128i res_hi_1 = |
568 | 182k | _mm256_extracti128_si256(res_hi_unsigned, 1); |
569 | 182k | _mm_store_si128( |
570 | 182k | (__m128i *)(&dst[i * dst_stride + j + 8 + dst_stride]), |
571 | 182k | res_hi_1); |
572 | 182k | } |
573 | 319k | } |
574 | 388k | s[0] = s[1]; |
575 | 388k | s[1] = s[2]; |
576 | 388k | s[2] = s[3]; |
577 | | |
578 | 388k | s[4] = s[5]; |
579 | 388k | s[5] = s[6]; |
580 | 388k | s[6] = s[7]; |
581 | 388k | } |
582 | 33.9k | } |
583 | 25.3k | } |
584 | 45.2k | } |
585 | | |
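In the vertical path above, the compound-prediction bias is split in two: offset_const_1 is added before the widening shift and rounding so the 16-bit maddubs sums stay non-negative, and offset_const_2 is added afterwards; together they contribute (1 << offset_0) + (1 << (offset_0 - 1)), the same offset the horizontal path adds in one step. A scalar sketch of one output sample under that scheme (illustrative helper, taps assumed pre-halved as in prepare_coeffs_lowbd()):

static inline CONV_BUF_TYPE dist_wtd_y_sample_scalar(const uint8_t *col,
                                                     int stride,
                                                     const int8_t *half_taps,
                                                     int ntaps,
                                                     const ConvolveParams *cp) {
  const int bd = 8;
  const int left_shift = FILTER_BITS - cp->round_0 + 1;  // +1: halved taps
  const int offset_0 = bd + 2 * FILTER_BITS - cp->round_0 - cp->round_1;
  int32_t sum = 1 << (bd + FILTER_BITS - 2);  // offset_const_1
  for (int k = 0; k < ntaps; ++k) sum += half_taps[k] * col[k * stride];
  const int32_t res = ROUND_POWER_OF_TWO(sum << left_shift, cp->round_1);
  return (CONV_BUF_TYPE)(res + (1 << offset_0));  // offset_const_2
}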
586 | | void av1_dist_wtd_convolve_2d_avx2(const uint8_t *src, int src_stride, |
587 | | uint8_t *dst0, int dst_stride0, int w, int h, |
588 | | const InterpFilterParams *filter_params_x, |
589 | | const InterpFilterParams *filter_params_y, |
590 | | const int subpel_x_qn, const int subpel_y_qn, |
591 | 145k | ConvolveParams *conv_params) { |
592 | 145k | CONV_BUF_TYPE *dst = conv_params->dst; |
593 | 145k | int dst_stride = conv_params->dst_stride; |
594 | 145k | const int bd = 8; |
595 | | |
596 | 145k | DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]); |
597 | | |
598 | 145k | int im_stride = 8; |
599 | 145k | int i, is_horiz_4tap = 0, is_vert_4tap = 0; |
600 | 145k | const __m256i wt = unpack_weights_avx2(conv_params); |
601 | 145k | const int do_average = conv_params->do_average; |
602 | 145k | const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; |
603 | 145k | const int offset_0 = |
604 | 145k | bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; |
605 | 145k | const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); |
606 | 145k | const __m256i offset_const = _mm256_set1_epi16(offset); |
607 | 145k | const int rounding_shift = |
608 | 145k | 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; |
609 | 145k | const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1); |
610 | | |
611 | 145k | assert(conv_params->round_0 > 0); |
612 | | |
613 | 145k | const __m256i round_const_h = _mm256_set1_epi16( |
614 | 145k | ((1 << (conv_params->round_0 - 1)) >> 1) + (1 << (bd + FILTER_BITS - 2))); |
615 | 145k | const __m128i round_shift_h = _mm_cvtsi32_si128(conv_params->round_0 - 1); |
616 | | |
617 | 145k | const __m256i round_const_v = _mm256_set1_epi32( |
618 | 145k | ((1 << conv_params->round_1) >> 1) - |
619 | 145k | (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1))); |
620 | 145k | const __m128i round_shift_v = _mm_cvtsi32_si128(conv_params->round_1); |
621 | | |
622 | 145k | __m256i filt[4], coeffs_x[4], coeffs_y[4]; |
623 | | |
624 | 145k | filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2); |
625 | 145k | filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); |
626 | | |
627 | 145k | prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs_x); |
628 | 145k | prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y); |
629 | | |
630 | | // Condition for checking valid horz_filt taps |
631 | 145k | if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs_x[0], coeffs_x[3]), 0))) |
632 | 53.5k | is_horiz_4tap = 1; |
633 | | |
634 | | // Condition for checking valid vert_filt taps |
635 | 145k | if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs_y[0], coeffs_y[3]), 0))) |
636 | 63.2k | is_vert_4tap = 1; |
637 | | |
638 | 145k | if (is_horiz_4tap) { |
639 | 53.5k | int im_h = h + filter_params_y->taps - 1; |
640 | 53.5k | const int fo_vert = filter_params_y->taps / 2 - 1; |
641 | 53.5k | const int fo_horiz = 1; |
642 | 53.5k | const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; |
643 | 115k | for (int j = 0; j < w; j += 8) { |
644 | | /* Horizontal filter */ |
645 | 61.4k | const uint8_t *src_h = src_ptr + j; |
646 | 596k | for (i = 0; i < im_h; i += 2) { |
647 | 535k | __m256i data = |
648 | 535k | _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)src_h)); |
649 | 535k | if (i + 1 < im_h) |
650 | 473k | data = _mm256_inserti128_si256( |
651 | 535k | data, _mm_loadu_si128((__m128i *)(src_h + src_stride)), 1); |
652 | 535k | src_h += (src_stride << 1); |
653 | 535k | __m256i res = convolve_lowbd_x_4tap(data, coeffs_x + 1, filt); |
654 | | |
655 | 535k | res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), |
656 | 535k | round_shift_h); |
657 | | |
658 | 535k | _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); |
659 | 535k | } |
660 | 61.4k | DIST_WTD_CONVOLVE_VERTICAL_FILTER_8TAP; |
661 | 61.4k | } |
662 | 91.8k | } else if (is_vert_4tap) { |
663 | 22.3k | int im_h = h + 3; |
664 | 22.3k | const int fo_vert = 1; |
665 | 22.3k | const int fo_horiz = filter_params_x->taps / 2 - 1; |
666 | 22.3k | const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; |
667 | | |
668 | 22.3k | filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); |
669 | 22.3k | filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); |
670 | | |
671 | 52.9k | for (int j = 0; j < w; j += 8) { |
672 | | /* Horizontal filter */ |
673 | 30.5k | const uint8_t *src_h = src_ptr + j; |
674 | 30.5k | DIST_WTD_CONVOLVE_HORIZONTAL_FILTER_8TAP; |
675 | | |
676 | | /* Vertical filter */ |
677 | 30.5k | __m256i s[6]; |
678 | 30.5k | __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); |
679 | 30.5k | __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); |
680 | 30.5k | __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); |
681 | 30.5k | __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); |
682 | | |
683 | 30.5k | s[0] = _mm256_unpacklo_epi16(s0, s1); |
684 | 30.5k | s[1] = _mm256_unpacklo_epi16(s2, s3); |
685 | | |
686 | 30.5k | s[3] = _mm256_unpackhi_epi16(s0, s1); |
687 | 30.5k | s[4] = _mm256_unpackhi_epi16(s2, s3); |
688 | | |
689 | 110k | for (i = 0; i < h; i += 2) { |
690 | 80.4k | const int16_t *data = &im_block[i * im_stride]; |
691 | | |
692 | 80.4k | const __m256i s4 = |
693 | 80.4k | _mm256_loadu_si256((__m256i *)(data + 4 * im_stride)); |
694 | 80.4k | const __m256i s5 = |
695 | 80.4k | _mm256_loadu_si256((__m256i *)(data + 5 * im_stride)); |
696 | | |
697 | 80.4k | s[2] = _mm256_unpacklo_epi16(s4, s5); |
698 | 80.4k | s[5] = _mm256_unpackhi_epi16(s4, s5); |
699 | | |
700 | 80.4k | const __m256i res_a = convolve_4tap(s, coeffs_y + 1); |
701 | 80.4k | const __m256i res_a_round = _mm256_sra_epi32( |
702 | 80.4k | _mm256_add_epi32(res_a, round_const_v), round_shift_v); |
703 | | |
704 | 80.4k | if (w - j > 4) { |
705 | 80.4k | const __m256i res_b = convolve_4tap(s + 3, coeffs_y + 1); |
706 | 80.4k | const __m256i res_b_round = _mm256_sra_epi32( |
707 | 80.4k | _mm256_add_epi32(res_b, round_const_v), round_shift_v); |
708 | 80.4k | const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_b_round); |
709 | 80.4k | const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const); |
710 | | |
711 | 80.4k | if (do_average) { |
712 | 34.4k | const __m256i data_ref_0 = |
713 | 34.4k | load_line2_avx2(&dst[i * dst_stride + j], |
714 | 34.4k | &dst[i * dst_stride + j + dst_stride]); |
715 | 34.4k | const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned, |
716 | 34.4k | &wt, use_dist_wtd_comp_avg); |
717 | | |
718 | 34.4k | const __m256i round_result = convolve_rounding( |
719 | 34.4k | &comp_avg_res, &offset_const, &rounding_const, rounding_shift); |
720 | | |
721 | 34.4k | const __m256i res_8 = |
722 | 34.4k | _mm256_packus_epi16(round_result, round_result); |
723 | 34.4k | const __m128i res_0 = _mm256_castsi256_si128(res_8); |
724 | 34.4k | const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); |
725 | | |
726 | 34.4k | _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); |
727 | 34.4k | _mm_storel_epi64( |
728 | 34.4k | (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); |
729 | 45.9k | } else { |
730 | 45.9k | const __m128i res_0 = _mm256_castsi256_si128(res_unsigned); |
731 | 45.9k | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); |
732 | | |
733 | 45.9k | const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1); |
734 | 45.9k | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), |
735 | 45.9k | res_1); |
736 | 45.9k | } |
737 | 80.4k | } else { |
738 | 0 | const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_a_round); |
739 | 0 | const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const); |
740 | |

741 | 0 | if (do_average) { |
742 | 0 | const __m256i data_ref_0 = |
743 | 0 | load_line2_avx2(&dst[i * dst_stride + j], |
744 | 0 | &dst[i * dst_stride + j + dst_stride]); |
745 | |
746 | 0 | const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned, |
747 | 0 | &wt, use_dist_wtd_comp_avg); |
748 | |
749 | 0 | const __m256i round_result = convolve_rounding( |
750 | 0 | &comp_avg_res, &offset_const, &rounding_const, rounding_shift); |
751 | |
752 | 0 | const __m256i res_8 = |
753 | 0 | _mm256_packus_epi16(round_result, round_result); |
754 | 0 | const __m128i res_0 = _mm256_castsi256_si128(res_8); |
755 | 0 | const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); |
756 | |
757 | 0 | *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0); |
758 | 0 | *(int *)(&dst0[i * dst_stride0 + j + dst_stride0]) = |
759 | 0 | _mm_cvtsi128_si32(res_1); |
760 | |
761 | 0 | } else { |
762 | 0 | const __m128i res_0 = _mm256_castsi256_si128(res_unsigned); |
763 | 0 | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); |
764 | |
765 | 0 | const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1); |
766 | 0 | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), |
767 | 0 | res_1); |
768 | 0 | } |
769 | 0 | } |
770 | 80.4k | s[0] = s[1]; |
771 | 80.4k | s[1] = s[2]; |
772 | 80.4k | s[3] = s[4]; |
773 | 80.4k | s[4] = s[5]; |
774 | 80.4k | } |
775 | 30.5k | } |
776 | 69.5k | } else { |
777 | 69.5k | int im_h = h + filter_params_y->taps - 1; |
778 | 69.5k | const int fo_vert = filter_params_y->taps / 2 - 1; |
779 | 69.5k | const int fo_horiz = filter_params_x->taps / 2 - 1; |
780 | 69.5k | const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; |
781 | | |
782 | 69.5k | filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); |
783 | 69.5k | filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); |
784 | | |
785 | 221k | for (int j = 0; j < w; j += 8) { |
786 | | /* Horizontal filter */ |
787 | 152k | const uint8_t *src_h = src_ptr + j; |
788 | 152k | DIST_WTD_CONVOLVE_HORIZONTAL_FILTER_8TAP; |
789 | | |
790 | 152k | DIST_WTD_CONVOLVE_VERTICAL_FILTER_8TAP; |
791 | 152k | } |
792 | 69.5k | } |
793 | 145k | } |
794 | | |
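In the 2D path above, round_const_h folds a bias of (1 << (bd + FILTER_BITS - 2)) into every im_block row, and the negative term inside round_const_v removes it again once the vertical taps (which sum to 1 << FILTER_BITS) have scaled it up. A small sketch of that cancellation, written against the same ConvolveParams fields (helper name made up; it returns 1 for the rounding configurations used here):

static inline int dist_wtd_2d_bias_cancels(const ConvolveParams *cp) {
  const int bd = 8;
  // Bias left in each im_block sample by round_const_h after the horizontal
  // shift of (round_0 - 1).
  const int64_t im_bias = 1 << (bd + FILTER_BITS - 2 - (cp->round_0 - 1));
  // The vertical taps from prepare_coeffs() sum to (1 << FILTER_BITS), so the
  // bias reaching the vertical rounding stage is im_bias << FILTER_BITS,
  // exactly the term that round_const_v subtracts.
  return (im_bias << FILTER_BITS) ==
         ((int64_t)1 << (bd + 2 * FILTER_BITS - cp->round_0 - 1));
}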
795 | | #define DO_NO_AVG_2D_COPY_4X16(r0, c0, r1, c1, r2, c2, r3, c3) \ |
796 | 16.8M | do { \ |
797 | 16.8M | src_0 = _mm256_cvtepu8_epi16( \ |
798 | 16.8M | _mm_loadu_si128((__m128i *)(&src[r0 * src_stride + c0]))); \ |
799 | 16.8M | src_1 = _mm256_cvtepu8_epi16( \ |
800 | 16.8M | _mm_loadu_si128((__m128i *)(&src[r1 * src_stride + c1]))); \ |
801 | 16.8M | src_2 = _mm256_cvtepu8_epi16( \ |
802 | 16.8M | _mm_loadu_si128((__m128i *)(&src[r2 * src_stride + c2]))); \ |
803 | 16.8M | src_3 = _mm256_cvtepu8_epi16( \ |
804 | 16.8M | _mm_loadu_si128((__m128i *)(&src[r3 * src_stride + c3]))); \ |
805 | 16.8M | \ |
806 | 16.8M | src_0 = _mm256_slli_epi16(src_0, LEFT_SHIFT); \ |
807 | 16.8M | src_1 = _mm256_slli_epi16(src_1, LEFT_SHIFT); \ |
808 | 16.8M | src_2 = _mm256_slli_epi16(src_2, LEFT_SHIFT); \ |
809 | 16.8M | src_3 = _mm256_slli_epi16(src_3, LEFT_SHIFT); \ |
810 | 16.8M | \ |
811 | 16.8M | src_0 = _mm256_add_epi16(src_0, offset_const); \ |
812 | 16.8M | src_1 = _mm256_add_epi16(src_1, offset_const); \ |
813 | 16.8M | src_2 = _mm256_add_epi16(src_2, offset_const); \ |
814 | 16.8M | src_3 = _mm256_add_epi16(src_3, offset_const); \ |
815 | 16.8M | \ |
816 | 16.8M | _mm256_store_si256((__m256i *)(&dst[r0 * dst_stride + c0]), src_0); \ |
817 | 16.8M | _mm256_store_si256((__m256i *)(&dst[r1 * dst_stride + c1]), src_1); \ |
818 | 16.8M | _mm256_store_si256((__m256i *)(&dst[r2 * dst_stride + c2]), src_2); \ |
819 | 16.8M | _mm256_store_si256((__m256i *)(&dst[r3 * dst_stride + c3]), src_3); \ |
820 | 16.8M | } while (0) |
821 | | |
822 | 134M | #define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7) |
823 | | static inline void av1_dist_wtd_convolve_2d_no_avg_copy_avx2( |
824 | | const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, |
825 | 287k | int w, int h, const __m256i offset_const) { |
826 | 287k | int i = h; |
827 | 287k | if (w >= 16) { |
828 | 203k | __m256i src_0, src_1, src_2, src_3; |
829 | 203k | if (w == 128) { |
830 | 4.37M | do { |
831 | 4.37M | DO_NO_AVG_2D_COPY_4X16(0, 0, 0, 16, 0, 32, 0, 48); |
832 | 4.37M | DO_NO_AVG_2D_COPY_4X16(0, 64, 0, 80, 0, 96, 0, 112); |
833 | 4.37M | src += 1 * src_stride; |
834 | 4.37M | dst += 1 * dst_stride; |
835 | 4.37M | i -= 1; |
836 | 4.37M | } while (i); |
837 | 168k | } else if (w == 64) { |
838 | 6.67M | do { |
839 | 6.67M | DO_NO_AVG_2D_COPY_4X16(0, 0, 0, 16, 0, 32, 0, 48); |
840 | 6.67M | src += 1 * src_stride; |
841 | 6.67M | dst += 1 * dst_stride; |
842 | 6.67M | i -= 1; |
843 | 6.67M | } while (i); |
844 | 88.0k | } else if (w == 32) { |
845 | 1.24M | do { |
846 | 1.24M | DO_NO_AVG_2D_COPY_4X16(0, 0, 1, 0, 0, 16, 1, 16); |
847 | 1.24M | src += 2 * src_stride; |
848 | 1.24M | dst += 2 * dst_stride; |
849 | 1.24M | i -= 2; |
850 | 1.24M | } while (i); |
851 | 47.5k | } else if (w == 16) { |
852 | 139k | do { |
853 | 139k | DO_NO_AVG_2D_COPY_4X16(0, 0, 1, 0, 2, 0, 3, 0); |
854 | 139k | src += 4 * src_stride; |
855 | 139k | dst += 4 * dst_stride; |
856 | 139k | i -= 4; |
857 | 139k | } while (i); |
858 | 33.7k | } |
859 | 203k | } else { |
860 | 84.3k | const __m256i zero = _mm256_setzero_si256(); |
861 | 198k | do { |
862 | 198k | const __m128i src_row_0 = |
863 | 198k | _mm_loadl_epi64((__m128i *)(&src[0 * src_stride])); |
864 | 198k | const __m128i src_row_1 = |
865 | 198k | _mm_loadl_epi64((__m128i *)(&src[1 * src_stride])); |
866 | 198k | const __m128i src_row_2 = |
867 | 198k | _mm_loadl_epi64((__m128i *)(&src[2 * src_stride])); |
868 | 198k | const __m128i src_row_3 = |
869 | 198k | _mm_loadl_epi64((__m128i *)(&src[3 * src_stride])); |
870 | | |
871 | 198k | __m256i src_10 = _mm256_insertf128_si256( |
872 | 198k | _mm256_castsi128_si256(src_row_0), src_row_1, 1); |
873 | 198k | __m256i src_32 = _mm256_insertf128_si256( |
874 | 198k | _mm256_castsi128_si256(src_row_2), src_row_3, 1); |
875 | | |
876 | 198k | src_10 = _mm256_unpacklo_epi8(src_10, zero); |
877 | 198k | src_32 = _mm256_unpacklo_epi8(src_32, zero); |
878 | | |
879 | 198k | src_10 = _mm256_slli_epi16(src_10, LEFT_SHIFT); |
880 | 198k | src_32 = _mm256_slli_epi16(src_32, LEFT_SHIFT); |
881 | | |
882 | 198k | src_10 = _mm256_add_epi16(src_10, offset_const); |
883 | 198k | src_32 = _mm256_add_epi16(src_32, offset_const); |
884 | | |
885 | | // Accumulate values into the destination buffer |
886 | 198k | _mm_store_si128((__m128i *)(&dst[0 * dst_stride]), |
887 | 198k | _mm256_castsi256_si128(src_10)); |
888 | 198k | _mm_store_si128((__m128i *)(&dst[1 * dst_stride]), |
889 | 198k | _mm256_extracti128_si256(src_10, 1)); |
890 | 198k | _mm_store_si128((__m128i *)(&dst[2 * dst_stride]), |
891 | 198k | _mm256_castsi256_si128(src_32)); |
892 | 198k | _mm_store_si128((__m128i *)(&dst[3 * dst_stride]), |
893 | 198k | _mm256_extracti128_si256(src_32, 1)); |
894 | | |
895 | 198k | src += 4 * src_stride; |
896 | 198k | dst += 4 * dst_stride; |
897 | 198k | i -= 4; |
898 | 198k | } while (i); |
899 | 84.3k | } |
900 | 287k | } |
901 | | |
902 | | #define DO_AVG_2D_COPY_4X16(USE_DIST_WEIGHTED, r0, c0, r1, c1, r2, c2, r3, c3) \ |
903 | 16.5M | do { \ |
904 | 16.5M | src_0 = _mm256_cvtepu8_epi16( \ |
905 | 16.5M | _mm_loadu_si128((__m128i *)(&src[r0 * src_stride + c0]))); \ |
906 | 16.5M | src_1 = _mm256_cvtepu8_epi16( \ |
907 | 16.5M | _mm_loadu_si128((__m128i *)(&src[r1 * src_stride + c1]))); \ |
908 | 16.5M | src_2 = _mm256_cvtepu8_epi16( \ |
909 | 16.5M | _mm_loadu_si128((__m128i *)(&src[r2 * src_stride + c2]))); \ |
910 | 16.5M | src_3 = _mm256_cvtepu8_epi16( \ |
911 | 16.5M | _mm_loadu_si128((__m128i *)(&src[r3 * src_stride + c3]))); \ |
912 | 16.5M | \ |
913 | 16.5M | src_0 = _mm256_slli_epi16(src_0, LEFT_SHIFT); \ |
914 | 16.5M | src_1 = _mm256_slli_epi16(src_1, LEFT_SHIFT); \ |
915 | 16.5M | src_2 = _mm256_slli_epi16(src_2, LEFT_SHIFT); \ |
916 | 16.5M | src_3 = _mm256_slli_epi16(src_3, LEFT_SHIFT); \ |
917 | 16.5M | src_0 = _mm256_add_epi16(src_0, offset_const); \ |
918 | 16.5M | src_1 = _mm256_add_epi16(src_1, offset_const); \ |
919 | 16.5M | src_2 = _mm256_add_epi16(src_2, offset_const); \ |
920 | 16.5M | src_3 = _mm256_add_epi16(src_3, offset_const); \ |
921 | 16.5M | \ |
922 | 16.5M | ref_0 = _mm256_loadu_si256((__m256i *)(&dst[r0 * dst_stride + c0])); \ |
923 | 16.5M | ref_1 = _mm256_loadu_si256((__m256i *)(&dst[r1 * dst_stride + c1])); \ |
924 | 16.5M | ref_2 = _mm256_loadu_si256((__m256i *)(&dst[r2 * dst_stride + c2])); \ |
925 | 16.5M | ref_3 = _mm256_loadu_si256((__m256i *)(&dst[r3 * dst_stride + c3])); \ |
926 | 16.5M | \ |
927 | 16.5M | res_0 = comp_avg(&ref_0, &src_0, &wt, USE_DIST_WEIGHTED); \ |
928 | 16.5M | res_1 = comp_avg(&ref_1, &src_1, &wt, USE_DIST_WEIGHTED); \ |
929 | 16.5M | res_2 = comp_avg(&ref_2, &src_2, &wt, USE_DIST_WEIGHTED); \ |
930 | 16.5M | res_3 = comp_avg(&ref_3, &src_3, &wt, USE_DIST_WEIGHTED); \ |
931 | 16.5M | \ |
932 | 16.5M | res_0 = convolve_rounding(&res_0, &offset_const, &rounding_const, \ |
933 | 16.5M | rounding_shift); \ |
934 | 16.5M | res_1 = convolve_rounding(&res_1, &offset_const, &rounding_const, \ |
935 | 16.5M | rounding_shift); \ |
936 | 16.5M | res_2 = convolve_rounding(&res_2, &offset_const, &rounding_const, \ |
937 | 16.5M | rounding_shift); \ |
938 | 16.5M | res_3 = convolve_rounding(&res_3, &offset_const, &rounding_const, \ |
939 | 16.5M | rounding_shift); \ |
940 | 16.5M | \ |
941 | 16.5M | res_10 = _mm256_packus_epi16(res_0, res_1); \ |
942 | 16.5M | res_32 = _mm256_packus_epi16(res_2, res_3); \ |
943 | 16.5M | res_10 = _mm256_permute4x64_epi64(res_10, 0xD8); \ |
944 | 16.5M | res_32 = _mm256_permute4x64_epi64(res_32, 0xD8); \ |
945 | 16.5M | \ |
946 | 16.5M | _mm_store_si128((__m128i *)(&dst0[r0 * dst_stride0 + c0]), \ |
947 | 16.5M | _mm256_castsi256_si128(res_10)); \ |
948 | 16.5M | _mm_store_si128((__m128i *)(&dst0[r1 * dst_stride0 + c1]), \ |
949 | 16.5M | _mm256_extracti128_si256(res_10, 1)); \ |
950 | 16.5M | _mm_store_si128((__m128i *)(&dst0[r2 * dst_stride0 + c2]), \ |
951 | 16.5M | _mm256_castsi256_si128(res_32)); \ |
952 | 16.5M | _mm_store_si128((__m128i *)(&dst0[r3 * dst_stride0 + c3]), \ |
953 | 16.5M | _mm256_extracti128_si256(res_32, 1)); \ |
954 | 16.5M | } while (0) |
955 | | |
956 | | #define DO_AVG_2D_COPY(USE_DIST_WEIGHTED) \ |
957 | 239k | int i = h; \ |
958 | 239k | if (w >= 16) { \ |
959 | 185k | __m256i src_0, src_1, src_2, src_3; \ |
960 | 185k | __m256i ref_0, ref_1, ref_2, ref_3; \ |
961 | 185k | __m256i res_0, res_1, res_2, res_3; \ |
962 | 185k | __m256i res_10, res_32; \ |
963 | 185k | if (w == 128) { \ |
964 | 4.36M | do { \ |
965 | 4.36M | DO_AVG_2D_COPY_4X16(USE_DIST_WEIGHTED, 0, 0, 0, 16, 0, 32, 0, 48); \ |
966 | 4.36M | DO_AVG_2D_COPY_4X16(USE_DIST_WEIGHTED, 0, 64, 0, 80, 0, 96, 0, 112); \ |
967 | 4.36M | i -= 1; \ |
968 | 4.36M | src += 1 * src_stride; \ |
969 | 4.36M | dst += 1 * dst_stride; \ |
970 | 4.36M | dst0 += 1 * dst_stride0; \ |
971 | 4.36M | } while (i); \ |
972 | 151k | } else if (w == 64) { \ |
973 | 6.58M | do { \ |
974 | 6.58M | DO_AVG_2D_COPY_4X16(USE_DIST_WEIGHTED, 0, 0, 0, 16, 0, 32, 0, 48); \ |
975 | 6.58M | \ |
976 | 6.58M | i -= 1; \ |
977 | 6.58M | src += 1 * src_stride; \ |
978 | 6.58M | dst += 1 * dst_stride; \ |
979 | 6.58M | dst0 += 1 * dst_stride0; \ |
980 | 6.58M | } while (i); \ |
981 | 86.5k | } else if (w == 32) { \ |
982 | 1.17M | do { \ |
983 | 1.17M | DO_AVG_2D_COPY_4X16(USE_DIST_WEIGHTED, 0, 0, 1, 0, 0, 16, 1, 16); \ |
984 | 1.17M | \ |
985 | 1.17M | i -= 2; \ |
986 | 1.17M | src += 2 * src_stride; \ |
987 | 1.17M | dst += 2 * dst_stride; \ |
988 | 1.17M | dst0 += 2 * dst_stride0; \ |
989 | 1.17M | } while (i); \ |
990 | 42.4k | } else { \ |
991 | 22.2k | assert(w == 16); \ |
992 | 85.4k | do { \ |
993 | 85.4k | DO_AVG_2D_COPY_4X16(USE_DIST_WEIGHTED, 0, 0, 1, 0, 2, 0, 3, 0); \ |
994 | 85.4k | \ |
995 | 85.4k | i -= 4; \ |
996 | 85.4k | src += 4 * src_stride; \ |
997 | 85.4k | dst += 4 * dst_stride; \ |
998 | 85.4k | dst0 += 4 * dst_stride0; \ |
999 | 85.4k | } while (i); \ |
1000 | 22.3k | } \ |
1001 | 185k | } else if (w == 8) { \ |
1002 | 87.7k | do { \ |
1003 | 87.7k | const __m128i src_0 = \ |
1004 | 87.7k | _mm_loadl_epi64((__m128i *)(&src[0 * src_stride])); \ |
1005 | 87.7k | const __m128i src_1 = \ |
1006 | 87.7k | _mm_loadl_epi64((__m128i *)(&src[1 * src_stride])); \ |
1007 | 87.7k | const __m128i src_2 = \ |
1008 | 87.7k | _mm_loadl_epi64((__m128i *)(&src[2 * src_stride])); \ |
1009 | 87.7k | const __m128i src_3 = \ |
1010 | 87.7k | _mm_loadl_epi64((__m128i *)(&src[3 * src_stride])); \ |
1011 | 87.7k | __m256i src_10 = \ |
1012 | 87.7k | _mm256_insertf128_si256(_mm256_castsi128_si256(src_0), src_1, 1); \ |
1013 | 87.7k | __m256i src_32 = \ |
1014 | 87.7k | _mm256_insertf128_si256(_mm256_castsi128_si256(src_2), src_3, 1); \ |
1015 | 87.7k | \ |
1016 | 87.7k | src_10 = _mm256_unpacklo_epi8(src_10, zero); \ |
1017 | 87.7k | src_32 = _mm256_unpacklo_epi8(src_32, zero); \ |
1018 | 87.7k | \ |
1019 | 87.7k | src_10 = _mm256_slli_epi16(src_10, LEFT_SHIFT); \ |
1020 | 87.7k | src_32 = _mm256_slli_epi16(src_32, LEFT_SHIFT); \ |
1021 | 87.7k | \ |
1022 | 87.7k | src_10 = _mm256_add_epi16(src_10, offset_const); \ |
1023 | 87.7k | src_32 = _mm256_add_epi16(src_32, offset_const); \ |
1024 | 87.7k | \ |
1025 | 87.7k | const __m256i ref_10 = \ |
1026 | 87.7k | load_line2_avx2(&dst[0 * dst_stride], &dst[1 * dst_stride]); \ |
1027 | 87.7k | const __m256i ref_32 = \ |
1028 | 87.7k | load_line2_avx2(&dst[2 * dst_stride], &dst[3 * dst_stride]); \ |
1029 | 87.7k | __m256i res_10 = comp_avg(&ref_10, &src_10, &wt, USE_DIST_WEIGHTED); \ |
1030 | 87.7k | __m256i res_32 = comp_avg(&ref_32, &src_32, &wt, USE_DIST_WEIGHTED); \ |
1031 | 87.7k | \ |
1032 | 87.7k | res_10 = convolve_rounding(&res_10, &offset_const, &rounding_const, \ |
1033 | 87.7k | rounding_shift); \ |
1034 | 87.7k | res_32 = convolve_rounding(&res_32, &offset_const, &rounding_const, \ |
1035 | 87.7k | rounding_shift); \ |
1036 | 87.7k | \ |
1037 | 87.7k | __m256i res = _mm256_packus_epi16(res_10, res_32); \ |
1038 | 87.7k | const __m128i res_20 = _mm256_castsi256_si128(res); \ |
1039 | 87.7k | const __m128i res_31 = _mm256_extracti128_si256(res, 1); \ |
1040 | 87.7k | \ |
1041 | 87.7k | _mm_storel_epi64((__m128i *)(&dst0[0 * dst_stride0]), res_20); \ |
1042 | 87.7k | _mm_storel_epi64((__m128i *)((&dst0[1 * dst_stride0])), res_31); \ |
1043 | 87.7k | _mm_storeh_epi64((__m128i *)(&dst0[2 * dst_stride0]), res_20); \ |
1044 | 87.7k | _mm_storeh_epi64((__m128i *)((&dst0[3 * dst_stride0])), res_31); \ |
1045 | 87.7k | i -= 4; \ |
1046 | 87.7k | src += 4 * src_stride; \ |
1047 | 87.7k | dst += 4 * dst_stride; \ |
1048 | 87.7k | dst0 += 4 * dst_stride0; \ |
1049 | 87.7k | } while (i); \ |
1050 | 33.1k | } else { \ |
1051 | 21.3k | assert(w == 4); \ |
1052 | 33.4k | do { \ |
1053 | 33.4k | __m256i src_3210_8bit = \ |
1054 | 33.4k | _mm256_setr_epi32(loadu_int32(src + 0 * src_stride), \ |
1055 | 33.4k | loadu_int32(src + 1 * src_stride), 0, 0, \ |
1056 | 33.4k | loadu_int32(src + 2 * src_stride), \ |
1057 | 33.4k | loadu_int32(src + 3 * src_stride), 0, 0); \ |
1058 | 33.4k | \ |
1059 | 33.4k | __m256i src_3210 = _mm256_unpacklo_epi8(src_3210_8bit, zero); \ |
1060 | 33.4k | src_3210 = _mm256_slli_epi16(src_3210, LEFT_SHIFT); \ |
1061 | 33.4k | src_3210 = _mm256_add_epi16(src_3210, offset_const); \ |
1062 | 33.4k | \ |
1063 | 33.4k | __m256i ref_3210 = \ |
1064 | 33.4k | _mm256_setr_epi64x(*(int64_t *)(dst + 0 * dst_stride), \ |
1065 | 33.4k | *(int64_t *)(dst + 1 * dst_stride), \ |
1066 | 33.4k | *(int64_t *)(dst + 2 * dst_stride), \ |
1067 | 33.4k | *(int64_t *)(dst + 3 * dst_stride)); \ |
1068 | 33.4k | __m256i res_3210 = \ |
1069 | 33.4k | comp_avg(&ref_3210, &src_3210, &wt, USE_DIST_WEIGHTED); \ |
1070 | 33.4k | \ |
1071 | 33.4k | res_3210 = convolve_rounding(&res_3210, &offset_const, &rounding_const, \ |
1072 | 33.4k | rounding_shift); \ |
1073 | 33.4k | \ |
1074 | 33.4k | res_3210 = _mm256_packus_epi16(res_3210, res_3210); \ |
1075 | 33.4k | const __m128i res_10 = _mm256_castsi256_si128(res_3210); \ |
1076 | 33.4k | const __m128i res_32 = _mm256_extracti128_si256(res_3210, 1); \ |
1077 | 33.4k | \ |
1078 | 33.4k | *(int *)(&dst0[0 * dst_stride0]) = _mm_cvtsi128_si32(res_10); \ |
1079 | 33.4k | *(int *)(&dst0[2 * dst_stride0]) = _mm_cvtsi128_si32(res_32); \ |
1080 | 33.4k | *(int *)(&dst0[1 * dst_stride0]) = _mm_extract_epi32(res_10, 1); \ |
1081 | 33.4k | *(int *)(&dst0[3 * dst_stride0]) = _mm_extract_epi32(res_32, 1); \ |
1082 | 33.4k | i -= 4; \ |
1083 | 33.4k | src += 4 * src_stride; \ |
1084 | 33.4k | dst += 4 * dst_stride; \ |
1085 | 33.4k | dst0 += 4 * dst_stride0; \ |
1086 | 33.4k | } while (i); \ |
1087 | 21.3k | } |
1088 | | |
1089 | | void av1_dist_wtd_convolve_2d_copy_avx2(const uint8_t *src, int src_stride, |
1090 | | uint8_t *dst0, int dst_stride0, int w, |
1091 | 527k | int h, ConvolveParams *conv_params) { |
1092 | 527k | const int bd = 8; |
1093 | 527k | CONV_BUF_TYPE *dst = conv_params->dst; |
1094 | 527k | int dst_stride = conv_params->dst_stride; |
1095 | 527k | assert(conv_params->round_0 == 3); |
1096 | 527k | assert(conv_params->round_1 == 7); |
1097 | 527k | assert(w % 4 == 0); |
1098 | 527k | assert(h % 4 == 0); |
1099 | | |
1100 | 527k | const int do_average = conv_params->do_average; |
1101 | 527k | const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; |
1102 | 527k | const __m256i wt = unpack_weights_avx2(conv_params); |
1103 | 527k | const __m256i zero = _mm256_setzero_si256(); |
1104 | | |
1105 | 527k | const int offset_0 = |
1106 | 527k | bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; |
1107 | 527k | const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); |
1108 | 527k | const __m256i offset_const = _mm256_set1_epi16(offset); |
1109 | 527k | const int rounding_shift = |
1110 | 527k | 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; |
1111 | 527k | const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1); |
1112 | | |
1113 | 527k | if (do_average) { |
1114 | 239k | if (use_dist_wtd_comp_avg) { |
1115 | 42.7k | DO_AVG_2D_COPY(1) |
1116 | 197k | } else { |
1117 | 197k | DO_AVG_2D_COPY(0) |
1118 | 197k | } |
1119 | 287k | } else { |
1120 | 287k | av1_dist_wtd_convolve_2d_no_avg_copy_avx2(src, src_stride, dst, dst_stride, |
1121 | 287k | w, h, offset_const); |
1122 | 287k | } |
1123 | 527k | } |
1124 | | #undef LEFT_SHIFT |
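Finally, for the copy kernels above: with round_0 == 3 and round_1 == 7, as asserted in av1_dist_wtd_convolve_2d_copy_avx2(), LEFT_SHIFT evaluates to 4, so each pixel enters the compound buffer as (pel << 4) plus the same offset used by the filtering paths, and the averaging path simply reverses that via comp_avg() and convolve_rounding(). A one-pixel scalar sketch (helper name made up):

static inline CONV_BUF_TYPE dist_wtd_copy_sample_scalar(uint8_t pel) {
  const int bd = 8, round_0 = 3, round_1 = 7;
  const int offset_0 = bd + 2 * FILTER_BITS - round_0 - round_1;  // 12
  const int offset = (1 << offset_0) + (1 << (offset_0 - 1));     // 6144
  // LEFT_SHIFT above expands to 2 * FILTER_BITS - 3 - 7 == 4.
  return (CONV_BUF_TYPE)((pel << 4) + offset);
}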