/src/aom/av1/common/x86/jnt_convolve_avx2.c
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) 2018, Alliance for Open Media. All rights reserved. |
3 | | * |
4 | | * This source code is subject to the terms of the BSD 2 Clause License and |
5 | | * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
6 | | * was not distributed with this source code in the LICENSE file, you can |
7 | | * obtain it at www.aomedia.org/license/software. If the Alliance for Open |
8 | | * Media Patent License 1.0 was not distributed with this source code in the |
9 | | * PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
10 | | */ |
11 | | |
12 | | #include <emmintrin.h> |
13 | | #include <immintrin.h> |
14 | | |
15 | | #include "config/av1_rtcd.h" |
16 | | |
17 | | #include "aom_dsp/aom_dsp_common.h" |
18 | | #include "aom_dsp/aom_filter.h" |
19 | | #include "aom_dsp/x86/convolve_avx2.h" |
20 | | #include "aom_dsp/x86/convolve_common_intrin.h" |
21 | | #include "aom_dsp/x86/convolve_sse4_1.h" |
22 | | #include "aom_dsp/x86/mem_sse2.h" |
23 | | #include "aom_dsp/x86/synonyms_avx2.h" |
24 | | |
25 | | #include "av1/common/convolve.h" |
26 | | |
27 | 1.44M | static inline __m256i unpack_weights_avx2(ConvolveParams *conv_params) { |
28 | 1.44M | const int w0 = conv_params->fwd_offset; |
29 | 1.44M | const int w1 = conv_params->bck_offset; |
30 | 1.44M | const __m256i wt0 = _mm256_set1_epi16((int16_t)w0); |
31 | 1.44M | const __m256i wt1 = _mm256_set1_epi16((int16_t)w1); |
32 | 1.44M | const __m256i wt = _mm256_unpacklo_epi16(wt0, wt1); |
33 | 1.44M | return wt; |
34 | 1.44M | } |
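
The interleaved (w0, w1) pairs built here feed comp_avg() in the kernels below. As a rough scalar model of that blend (an illustrative sketch, not the AVX2 path; the use of DIST_PRECISION_BITS and the exact rounding are assumptions based on AV1's distance-weighted compound prediction):

// Hypothetical scalar sketch of the compound average driven by these weights.
// ref: value already in the CONV_BUF_TYPE destination; res: new prediction.
static int scalar_dist_wtd_avg(int ref, int res, int w0, int w1,
                               int use_dist_wtd_comp_avg) {
  if (use_dist_wtd_comp_avg)
    return (ref * w0 + res * w1) >> DIST_PRECISION_BITS;  // assumed precision
  return (ref + res) >> 1;  // plain average when distance weighting is off
}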
35 | | |
36 | 8.94M | static inline __m256i load_line2_avx2(const void *a, const void *b) { |
37 | 8.94M | return _mm256_permute2x128_si256( |
38 | 8.94M | _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)a)), |
39 | 8.94M | _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)b)), 0x20); |
40 | 8.94M | } |
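
A small hypothetical self-check (not part of this file) of the layout load_line2_avx2() produces: line a lands in the low 128-bit lane and line b in the high lane, which is what lets the loops below emit two output rows per iteration.

static void load_line2_layout_check(void) {
  uint8_t a[16], b[16], out[32];
  for (int k = 0; k < 16; ++k) {
    a[k] = (uint8_t)k;
    b[k] = (uint8_t)(0x80 + k);
  }
  const __m256i two_rows = load_line2_avx2(a, b);
  _mm256_storeu_si256((__m256i *)out, two_rows);
  assert(out[0] == a[0] && out[15] == a[15]);   // low lane  = line a
  assert(out[16] == b[0] && out[31] == b[15]);  // high lane = line b
}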
41 | | |
42 | | void av1_dist_wtd_convolve_x_avx2(const uint8_t *src, int src_stride, |
43 | | uint8_t *dst0, int dst_stride0, int w, int h, |
44 | | const InterpFilterParams *filter_params_x, |
45 | | const int subpel_x_qn, |
46 | 153k | ConvolveParams *conv_params) { |
47 | 153k | CONV_BUF_TYPE *dst = conv_params->dst; |
48 | 153k | int dst_stride = conv_params->dst_stride; |
49 | 153k | const int bd = 8; |
50 | 153k | int i, j, is_horiz_4tap = 0; |
51 | 153k | const int bits = FILTER_BITS - conv_params->round_1; |
52 | 153k | const __m256i wt = unpack_weights_avx2(conv_params); |
53 | 153k | const int do_average = conv_params->do_average; |
54 | 153k | const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; |
55 | 153k | const int offset_0 = |
56 | 153k | bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; |
57 | 153k | const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); |
58 | 153k | const __m256i offset_const = _mm256_set1_epi16(offset); |
59 | 153k | const int rounding_shift = |
60 | 153k | 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; |
61 | 153k | const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1); |
62 | | |
63 | 153k | assert(bits >= 0); |
64 | 153k | assert(conv_params->round_0 > 0); |
65 | | |
66 | 153k | const __m256i round_const = |
67 | 153k | _mm256_set1_epi16((1 << (conv_params->round_0 - 1)) >> 1); |
68 | 153k | const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0 - 1); |
69 | | |
70 | 153k | __m256i filt[4], coeffs[4]; |
71 | | |
72 | 153k | filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2); |
73 | 153k | filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); |
74 | | |
75 | 153k | prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs); |
76 | | |
77 | | // Condition for checking valid horz_filt taps |
78 | 153k | if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs[0], coeffs[3]), 0))) |
79 | 58.8k | is_horiz_4tap = 1; |
80 | | |
81 | | // horz_filt as 4 tap |
82 | 153k | if (is_horiz_4tap) { |
83 | 58.8k | const int fo_horiz = 1; |
84 | 58.8k | const uint8_t *const src_ptr = src - fo_horiz; |
85 | 463k | for (i = 0; i < h; i += 2) { |
86 | 404k | const uint8_t *src_data = src_ptr + i * src_stride; |
87 | 404k | CONV_BUF_TYPE *dst_data = dst + i * dst_stride; |
88 | 2.06M | for (j = 0; j < w; j += 8) { |
89 | 1.66M | const __m256i data = |
90 | 1.66M | load_line2_avx2(&src_data[j], &src_data[j + src_stride]); |
91 | | |
92 | 1.66M | __m256i res = convolve_lowbd_x_4tap(data, coeffs + 1, filt); |
93 | 1.66M | res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const), round_shift); |
94 | 1.66M | res = _mm256_slli_epi16(res, bits); |
95 | | |
96 | 1.66M | const __m256i res_unsigned = _mm256_add_epi16(res, offset_const); |
97 | | |
98 | | // Accumulate values into the destination buffer |
99 | 1.66M | if (do_average) { |
100 | 461k | const __m256i data_ref_0 = |
101 | 461k | load_line2_avx2(&dst_data[j], &dst_data[j + dst_stride]); |
102 | 461k | const __m256i comp_avg_res = |
103 | 461k | comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); |
104 | | |
105 | 461k | const __m256i round_result = convolve_rounding( |
106 | 461k | &comp_avg_res, &offset_const, &rounding_const, rounding_shift); |
107 | | |
108 | 461k | const __m256i res_8 = _mm256_packus_epi16(round_result, round_result); |
109 | 461k | const __m128i res_0 = _mm256_castsi256_si128(res_8); |
110 | 461k | const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); |
111 | | |
112 | 461k | if (w > 4) { |
113 | 424k | _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); |
114 | 424k | _mm_storel_epi64( |
115 | 424k | (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); |
116 | 424k | } else { |
117 | 37.3k | *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0); |
118 | 37.3k | *(int *)(&dst0[i * dst_stride0 + j + dst_stride0]) = |
119 | 37.3k | _mm_cvtsi128_si32(res_1); |
120 | 37.3k | } |
121 | 1.20M | } else { |
122 | 1.20M | const __m128i res_0 = _mm256_castsi256_si128(res_unsigned); |
123 | 1.20M | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); |
124 | | |
125 | 1.20M | const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1); |
126 | 1.20M | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), |
127 | 1.20M | res_1); |
128 | 1.20M | } |
129 | 1.66M | } |
130 | 404k | } |
131 | 94.9k | } else { |
132 | 94.9k | const int fo_horiz = filter_params_x->taps / 2 - 1; |
133 | 94.9k | const uint8_t *const src_ptr = src - fo_horiz; |
134 | | |
135 | 94.9k | filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); |
136 | 94.9k | filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); |
137 | 879k | for (i = 0; i < h; i += 2) { |
138 | 784k | const uint8_t *src_data = src_ptr + i * src_stride; |
139 | 784k | CONV_BUF_TYPE *dst_data = dst + i * dst_stride; |
140 | 3.89M | for (j = 0; j < w; j += 8) { |
141 | 3.10M | const __m256i data = |
142 | 3.10M | load_line2_avx2(&src_data[j], &src_data[j + src_stride]); |
143 | | |
144 | 3.10M | __m256i res = convolve_lowbd_x(data, coeffs, filt); |
145 | | |
146 | 3.10M | res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const), round_shift); |
147 | | |
148 | 3.10M | res = _mm256_slli_epi16(res, bits); |
149 | | |
150 | 3.10M | const __m256i res_unsigned = _mm256_add_epi16(res, offset_const); |
151 | | |
152 | | // Accumulate values into the destination buffer |
153 | 3.10M | if (do_average) { |
154 | 1.41M | const __m256i data_ref_0 = |
155 | 1.41M | load_line2_avx2(&dst_data[j], &dst_data[j + dst_stride]); |
156 | 1.41M | const __m256i comp_avg_res = |
157 | 1.41M | comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); |
158 | | |
159 | 1.41M | const __m256i round_result = convolve_rounding( |
160 | 1.41M | &comp_avg_res, &offset_const, &rounding_const, rounding_shift); |
161 | | |
162 | 1.41M | const __m256i res_8 = _mm256_packus_epi16(round_result, round_result); |
163 | 1.41M | const __m128i res_0 = _mm256_castsi256_si128(res_8); |
164 | 1.41M | const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); |
165 | | |
166 | 1.41M | if (w > 4) { |
167 | 1.41M | _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); |
168 | 1.41M | _mm_storel_epi64( |
169 | 1.41M | (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); |
170 | 1.41M | } else { |
171 | 2.30k | *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0); |
172 | 2.30k | *(int *)(&dst0[i * dst_stride0 + j + dst_stride0]) = |
173 | 2.30k | _mm_cvtsi128_si32(res_1); |
174 | 2.30k | } |
175 | 1.68M | } else { |
176 | 1.68M | const __m128i res_0 = _mm256_castsi256_si128(res_unsigned); |
177 | 1.68M | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); |
178 | | |
179 | 1.68M | const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1); |
180 | 1.68M | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), |
181 | 1.68M | res_1); |
182 | 1.68M | } |
183 | 3.10M | } |
184 | 784k | } |
185 | 94.9k | } |
186 | 153k | } |
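
For readers tracing the fixed-point chain above, a per-sample scalar model of the horizontal path (an illustrative sketch, not the vectorized code; sum stands for the raw filter sum using the halved coefficients produced by prepare_coeffs_lowbd(), which is assumed to be why the shift is round_0 - 1 rather than round_0):

// Hypothetical scalar model of one CONV_BUF sample from the x-only path.
static CONV_BUF_TYPE scalar_dist_wtd_x_sample(int sum,
                                              const ConvolveParams *p) {
  const int bd = 8;
  const int bits = FILTER_BITS - p->round_1;
  const int offset_0 = bd + 2 * FILTER_BITS - p->round_0 - p->round_1;
  const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
  int res = (sum + ((1 << (p->round_0 - 1)) >> 1)) >> (p->round_0 - 1);
  res <<= bits;
  return (CONV_BUF_TYPE)(res + offset);  // value written to conv_params->dst
}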
187 | | |
188 | | void av1_dist_wtd_convolve_y_avx2(const uint8_t *src, int src_stride, |
189 | | uint8_t *dst0, int dst_stride0, int w, int h, |
190 | | const InterpFilterParams *filter_params_y, |
191 | | const int subpel_y_qn, |
192 | 79.3k | ConvolveParams *conv_params) { |
193 | 79.3k | CONV_BUF_TYPE *dst = conv_params->dst; |
194 | 79.3k | int dst_stride = conv_params->dst_stride; |
195 | 79.3k | const int bd = 8; |
196 | 79.3k | int i, j, is_vert_4tap = 0; |
197 | | // +1 to compensate for dividing the filter coeffs by 2 |
198 | 79.3k | const int left_shift = FILTER_BITS - conv_params->round_0 + 1; |
199 | 79.3k | const __m256i round_const = |
200 | 79.3k | _mm256_set1_epi32((1 << conv_params->round_1) >> 1); |
201 | 79.3k | const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1); |
202 | 79.3k | const __m256i wt = unpack_weights_avx2(conv_params); |
203 | 79.3k | const int do_average = conv_params->do_average; |
204 | 79.3k | const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; |
205 | 79.3k | const int offset_0 = |
206 | 79.3k | bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; |
207 | 79.3k | const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); |
208 | 79.3k | const __m256i offset_const = _mm256_set1_epi16(offset); |
209 | 79.3k | const int offset_1 = (1 << (bd + FILTER_BITS - 2)); |
210 | 79.3k | const __m256i offset_const_1 = _mm256_set1_epi16(offset_1); |
211 | 79.3k | const __m256i offset_const_2 = _mm256_set1_epi16((1 << offset_0)); |
212 | 79.3k | const int rounding_shift = |
213 | 79.3k | 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; |
214 | 79.3k | const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1); |
215 | 79.3k | const __m256i zero = _mm256_setzero_si256(); |
216 | 79.3k | __m256i coeffs[4], s[8]; |
217 | | |
218 | 79.3k | assert((FILTER_BITS - conv_params->round_0) >= 0); |
219 | | |
220 | 79.3k | prepare_coeffs_lowbd(filter_params_y, subpel_y_qn, coeffs); |
221 | | |
222 | | // Condition for checking valid vert_filt taps |
223 | 79.3k | if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs[0], coeffs[3]), 0))) |
224 | 35.2k | is_vert_4tap = 1; |
225 | | |
226 | 79.3k | if (is_vert_4tap) { |
227 | 35.2k | const int fo_vert = 1; |
228 | 35.2k | const uint8_t *const src_ptr = src - fo_vert * src_stride; |
229 | 72.8k | for (j = 0; j < w; j += 16) { |
230 | 37.5k | const uint8_t *data = &src_ptr[j]; |
231 | 37.5k | __m256i src4; |
232 | | // Load lines a and b. Line a to lower 128, line b to upper 128 |
233 | 37.5k | { |
234 | 37.5k | __m256i src_ab[4]; |
235 | 37.5k | __m256i src_a[5]; |
236 | 37.5k | src_a[0] = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); |
237 | 187k | for (int kk = 0; kk < 4; ++kk) { |
238 | 150k | data += src_stride; |
239 | 150k | src_a[kk + 1] = |
240 | 150k | _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); |
241 | 150k | src_ab[kk] = |
242 | 150k | _mm256_permute2x128_si256(src_a[kk], src_a[kk + 1], 0x20); |
243 | 150k | } |
244 | 37.5k | src4 = src_a[4]; |
245 | 37.5k | s[0] = _mm256_unpacklo_epi8(src_ab[0], src_ab[1]); |
246 | 37.5k | s[1] = _mm256_unpacklo_epi8(src_ab[2], src_ab[3]); |
247 | | |
248 | 37.5k | s[3] = _mm256_unpackhi_epi8(src_ab[0], src_ab[1]); |
249 | 37.5k | s[4] = _mm256_unpackhi_epi8(src_ab[2], src_ab[3]); |
250 | 37.5k | } |
251 | | |
252 | 238k | for (i = 0; i < h; i += 2) { |
253 | 201k | data = &src_ptr[(i + 5) * src_stride + j]; |
254 | 201k | const __m256i src5 = |
255 | 201k | _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); |
256 | 201k | const __m256i src_45a = _mm256_permute2x128_si256(src4, src5, 0x20); |
257 | | |
258 | 201k | src4 = _mm256_castsi128_si256( |
259 | 201k | _mm_loadu_si128((__m128i *)(data + src_stride))); |
260 | 201k | const __m256i src_56a = _mm256_permute2x128_si256(src5, src4, 0x20); |
261 | | |
262 | 201k | s[2] = _mm256_unpacklo_epi8(src_45a, src_56a); |
263 | 201k | s[5] = _mm256_unpackhi_epi8(src_45a, src_56a); |
264 | | |
265 | 201k | __m256i res_lo = convolve_lowbd_4tap(s, coeffs + 1); |
266 | | |
267 | 201k | res_lo = _mm256_add_epi16(res_lo, offset_const_1); |
268 | | |
269 | 201k | const __m256i res_lo_0_32b = _mm256_unpacklo_epi16(res_lo, zero); |
270 | 201k | const __m256i res_lo_0_shift = |
271 | 201k | _mm256_slli_epi32(res_lo_0_32b, left_shift); |
272 | 201k | const __m256i res_lo_0_round = _mm256_sra_epi32( |
273 | 201k | _mm256_add_epi32(res_lo_0_shift, round_const), round_shift); |
274 | | |
275 | 201k | const __m256i res_lo_1_32b = _mm256_unpackhi_epi16(res_lo, zero); |
276 | 201k | const __m256i res_lo_1_shift = |
277 | 201k | _mm256_slli_epi32(res_lo_1_32b, left_shift); |
278 | 201k | const __m256i res_lo_1_round = _mm256_sra_epi32( |
279 | 201k | _mm256_add_epi32(res_lo_1_shift, round_const), round_shift); |
280 | | |
281 | 201k | const __m256i res_lo_round = |
282 | 201k | _mm256_packs_epi32(res_lo_0_round, res_lo_1_round); |
283 | | |
284 | 201k | const __m256i res_lo_unsigned = |
285 | 201k | _mm256_add_epi16(res_lo_round, offset_const_2); |
286 | | |
287 | 201k | if (w - j < 16) { |
288 | 87.3k | if (do_average) { |
289 | 59.1k | const __m256i data_ref_0 = |
290 | 59.1k | load_line2_avx2(&dst[i * dst_stride + j], |
291 | 59.1k | &dst[i * dst_stride + j + dst_stride]); |
292 | 59.1k | const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_lo_unsigned, |
293 | 59.1k | &wt, use_dist_wtd_comp_avg); |
294 | | |
295 | 59.1k | const __m256i round_result = convolve_rounding( |
296 | 59.1k | &comp_avg_res, &offset_const, &rounding_const, rounding_shift); |
297 | | |
298 | 59.1k | const __m256i res_8 = |
299 | 59.1k | _mm256_packus_epi16(round_result, round_result); |
300 | 59.1k | const __m128i res_0 = _mm256_castsi256_si128(res_8); |
301 | 59.1k | const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); |
302 | | |
303 | 59.1k | if (w - j > 4) { |
304 | 31.0k | _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); |
305 | 31.0k | _mm_storel_epi64( |
306 | 31.0k | (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), |
307 | 31.0k | res_1); |
308 | 31.0k | } else { |
309 | 28.0k | *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0); |
310 | 28.0k | *(int *)(&dst0[i * dst_stride0 + j + dst_stride0]) = |
311 | 28.0k | _mm_cvtsi128_si32(res_1); |
312 | 28.0k | } |
313 | 59.1k | } else { |
314 | 28.2k | const __m128i res_0 = _mm256_castsi256_si128(res_lo_unsigned); |
315 | 28.2k | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); |
316 | | |
317 | 28.2k | const __m128i res_1 = _mm256_extracti128_si256(res_lo_unsigned, 1); |
318 | 28.2k | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), |
319 | 28.2k | res_1); |
320 | 28.2k | } |
321 | 114k | } else { |
322 | 114k | __m256i res_hi = convolve_lowbd_4tap(s + 3, coeffs + 1); |
323 | | |
324 | 114k | res_hi = _mm256_add_epi16(res_hi, offset_const_1); |
325 | | |
326 | 114k | const __m256i res_hi_0_32b = _mm256_unpacklo_epi16(res_hi, zero); |
327 | 114k | const __m256i res_hi_0_shift = |
328 | 114k | _mm256_slli_epi32(res_hi_0_32b, left_shift); |
329 | 114k | const __m256i res_hi_0_round = _mm256_sra_epi32( |
330 | 114k | _mm256_add_epi32(res_hi_0_shift, round_const), round_shift); |
331 | | |
332 | 114k | const __m256i res_hi_1_32b = _mm256_unpackhi_epi16(res_hi, zero); |
333 | 114k | const __m256i res_hi_1_shift = |
334 | 114k | _mm256_slli_epi32(res_hi_1_32b, left_shift); |
335 | 114k | const __m256i res_hi_1_round = _mm256_sra_epi32( |
336 | 114k | _mm256_add_epi32(res_hi_1_shift, round_const), round_shift); |
337 | | |
338 | 114k | const __m256i res_hi_round = |
339 | 114k | _mm256_packs_epi32(res_hi_0_round, res_hi_1_round); |
340 | | |
341 | 114k | const __m256i res_hi_unsigned = |
342 | 114k | _mm256_add_epi16(res_hi_round, offset_const_2); |
343 | | |
344 | 114k | if (do_average) { |
345 | 34.3k | const __m256i data_ref_0_lo = |
346 | 34.3k | load_line2_avx2(&dst[i * dst_stride + j], |
347 | 34.3k | &dst[i * dst_stride + j + dst_stride]); |
348 | | |
349 | 34.3k | const __m256i data_ref_0_hi = |
350 | 34.3k | load_line2_avx2(&dst[i * dst_stride + j + 8], |
351 | 34.3k | &dst[i * dst_stride + j + 8 + dst_stride]); |
352 | | |
353 | 34.3k | const __m256i comp_avg_res_lo = comp_avg( |
354 | 34.3k | &data_ref_0_lo, &res_lo_unsigned, &wt, use_dist_wtd_comp_avg); |
355 | | |
356 | 34.3k | const __m256i comp_avg_res_hi = comp_avg( |
357 | 34.3k | &data_ref_0_hi, &res_hi_unsigned, &wt, use_dist_wtd_comp_avg); |
358 | | |
359 | 34.3k | const __m256i round_result_lo = |
360 | 34.3k | convolve_rounding(&comp_avg_res_lo, &offset_const, |
361 | 34.3k | &rounding_const, rounding_shift); |
362 | | |
363 | 34.3k | const __m256i round_result_hi = |
364 | 34.3k | convolve_rounding(&comp_avg_res_hi, &offset_const, |
365 | 34.3k | &rounding_const, rounding_shift); |
366 | | |
367 | 34.3k | const __m256i res_8 = |
368 | 34.3k | _mm256_packus_epi16(round_result_lo, round_result_hi); |
369 | 34.3k | const __m128i res_0 = _mm256_castsi256_si128(res_8); |
370 | 34.3k | const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); |
371 | | |
372 | 34.3k | _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); |
373 | 34.3k | _mm_store_si128( |
374 | 34.3k | (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); |
375 | | |
376 | 79.7k | } else { |
377 | 79.7k | const __m128i res_lo_0 = _mm256_castsi256_si128(res_lo_unsigned); |
378 | 79.7k | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_lo_0); |
379 | | |
380 | 79.7k | const __m128i res_lo_1 = |
381 | 79.7k | _mm256_extracti128_si256(res_lo_unsigned, 1); |
382 | 79.7k | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), |
383 | 79.7k | res_lo_1); |
384 | | |
385 | 79.7k | const __m128i res_hi_0 = _mm256_castsi256_si128(res_hi_unsigned); |
386 | 79.7k | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + 8]), |
387 | 79.7k | res_hi_0); |
388 | | |
389 | 79.7k | const __m128i res_hi_1 = |
390 | 79.7k | _mm256_extracti128_si256(res_hi_unsigned, 1); |
391 | 79.7k | _mm_store_si128( |
392 | 79.7k | (__m128i *)(&dst[i * dst_stride + j + 8 + dst_stride]), |
393 | 79.7k | res_hi_1); |
394 | 79.7k | } |
395 | 114k | } |
396 | 201k | s[0] = s[1]; |
397 | 201k | s[1] = s[2]; |
398 | | |
399 | 201k | s[3] = s[4]; |
400 | 201k | s[4] = s[5]; |
401 | 201k | } |
402 | 37.5k | } |
403 | 44.1k | } else { |
404 | 44.1k | const int fo_vert = filter_params_y->taps / 2 - 1; |
405 | 44.1k | const uint8_t *const src_ptr = src - fo_vert * src_stride; |
406 | 103k | for (j = 0; j < w; j += 16) { |
407 | 59.6k | const uint8_t *data = &src_ptr[j]; |
408 | 59.6k | __m256i src6; |
409 | | // Load lines a and b. Line a to lower 128, line b to upper 128 |
410 | 59.6k | { |
411 | 59.6k | __m256i src_ab[7]; |
412 | 59.6k | __m256i src_a[7]; |
413 | 59.6k | src_a[0] = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); |
414 | 417k | for (int kk = 0; kk < 6; ++kk) { |
415 | 357k | data += src_stride; |
416 | 357k | src_a[kk + 1] = |
417 | 357k | _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); |
418 | 357k | src_ab[kk] = |
419 | 357k | _mm256_permute2x128_si256(src_a[kk], src_a[kk + 1], 0x20); |
420 | 357k | } |
421 | 59.6k | src6 = src_a[6]; |
422 | 59.6k | s[0] = _mm256_unpacklo_epi8(src_ab[0], src_ab[1]); |
423 | 59.6k | s[1] = _mm256_unpacklo_epi8(src_ab[2], src_ab[3]); |
424 | 59.6k | s[2] = _mm256_unpacklo_epi8(src_ab[4], src_ab[5]); |
425 | 59.6k | s[4] = _mm256_unpackhi_epi8(src_ab[0], src_ab[1]); |
426 | 59.6k | s[5] = _mm256_unpackhi_epi8(src_ab[2], src_ab[3]); |
427 | 59.6k | s[6] = _mm256_unpackhi_epi8(src_ab[4], src_ab[5]); |
428 | 59.6k | } |
429 | | |
430 | 697k | for (i = 0; i < h; i += 2) { |
431 | 638k | data = &src_ptr[(i + 7) * src_stride + j]; |
432 | 638k | const __m256i src7 = |
433 | 638k | _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); |
434 | 638k | const __m256i src_67a = _mm256_permute2x128_si256(src6, src7, 0x20); |
435 | | |
436 | 638k | src6 = _mm256_castsi128_si256( |
437 | 638k | _mm_loadu_si128((__m128i *)(data + src_stride))); |
438 | 638k | const __m256i src_78a = _mm256_permute2x128_si256(src7, src6, 0x20); |
439 | | |
440 | 638k | s[3] = _mm256_unpacklo_epi8(src_67a, src_78a); |
441 | 638k | s[7] = _mm256_unpackhi_epi8(src_67a, src_78a); |
442 | | |
443 | 638k | __m256i res_lo = convolve_lowbd(s, coeffs); |
444 | | |
445 | 638k | res_lo = _mm256_add_epi16(res_lo, offset_const_1); |
446 | | |
447 | 638k | const __m256i res_lo_0_32b = _mm256_unpacklo_epi16(res_lo, zero); |
448 | 638k | const __m256i res_lo_0_shift = |
449 | 638k | _mm256_slli_epi32(res_lo_0_32b, left_shift); |
450 | 638k | const __m256i res_lo_0_round = _mm256_sra_epi32( |
451 | 638k | _mm256_add_epi32(res_lo_0_shift, round_const), round_shift); |
452 | | |
453 | 638k | const __m256i res_lo_1_32b = _mm256_unpackhi_epi16(res_lo, zero); |
454 | 638k | const __m256i res_lo_1_shift = |
455 | 638k | _mm256_slli_epi32(res_lo_1_32b, left_shift); |
456 | 638k | const __m256i res_lo_1_round = _mm256_sra_epi32( |
457 | 638k | _mm256_add_epi32(res_lo_1_shift, round_const), round_shift); |
458 | | |
459 | 638k | const __m256i res_lo_round = |
460 | 638k | _mm256_packs_epi32(res_lo_0_round, res_lo_1_round); |
461 | | |
462 | 638k | const __m256i res_lo_unsigned = |
463 | 638k | _mm256_add_epi16(res_lo_round, offset_const_2); |
464 | | |
465 | 638k | if (w - j < 16) { |
466 | 118k | if (do_average) { |
467 | 57.9k | const __m256i data_ref_0 = |
468 | 57.9k | load_line2_avx2(&dst[i * dst_stride + j], |
469 | 57.9k | &dst[i * dst_stride + j + dst_stride]); |
470 | 57.9k | const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_lo_unsigned, |
471 | 57.9k | &wt, use_dist_wtd_comp_avg); |
472 | | |
473 | 57.9k | const __m256i round_result = convolve_rounding( |
474 | 57.9k | &comp_avg_res, &offset_const, &rounding_const, rounding_shift); |
475 | | |
476 | 57.9k | const __m256i res_8 = |
477 | 57.9k | _mm256_packus_epi16(round_result, round_result); |
478 | 57.9k | const __m128i res_0 = _mm256_castsi256_si128(res_8); |
479 | 57.9k | const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); |
480 | | |
481 | 57.9k | if (w - j > 4) { |
482 | 46.5k | _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); |
483 | 46.5k | _mm_storel_epi64( |
484 | 46.5k | (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), |
485 | 46.5k | res_1); |
486 | 46.5k | } else { |
487 | 11.4k | *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0); |
488 | 11.4k | *(int *)(&dst0[i * dst_stride0 + j + dst_stride0]) = |
489 | 11.4k | _mm_cvtsi128_si32(res_1); |
490 | 11.4k | } |
491 | 60.4k | } else { |
492 | 60.4k | const __m128i res_0 = _mm256_castsi256_si128(res_lo_unsigned); |
493 | 60.4k | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); |
494 | | |
495 | 60.4k | const __m128i res_1 = _mm256_extracti128_si256(res_lo_unsigned, 1); |
496 | 60.4k | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), |
497 | 60.4k | res_1); |
498 | 60.4k | } |
499 | 519k | } else { |
500 | 519k | __m256i res_hi = convolve_lowbd(s + 4, coeffs); |
501 | | |
502 | 519k | res_hi = _mm256_add_epi16(res_hi, offset_const_1); |
503 | | |
504 | 519k | const __m256i res_hi_0_32b = _mm256_unpacklo_epi16(res_hi, zero); |
505 | 519k | const __m256i res_hi_0_shift = |
506 | 519k | _mm256_slli_epi32(res_hi_0_32b, left_shift); |
507 | 519k | const __m256i res_hi_0_round = _mm256_sra_epi32( |
508 | 519k | _mm256_add_epi32(res_hi_0_shift, round_const), round_shift); |
509 | | |
510 | 519k | const __m256i res_hi_1_32b = _mm256_unpackhi_epi16(res_hi, zero); |
511 | 519k | const __m256i res_hi_1_shift = |
512 | 519k | _mm256_slli_epi32(res_hi_1_32b, left_shift); |
513 | 519k | const __m256i res_hi_1_round = _mm256_sra_epi32( |
514 | 519k | _mm256_add_epi32(res_hi_1_shift, round_const), round_shift); |
515 | | |
516 | 519k | const __m256i res_hi_round = |
517 | 519k | _mm256_packs_epi32(res_hi_0_round, res_hi_1_round); |
518 | | |
519 | 519k | const __m256i res_hi_unsigned = |
520 | 519k | _mm256_add_epi16(res_hi_round, offset_const_2); |
521 | | |
522 | 519k | if (do_average) { |
523 | 235k | const __m256i data_ref_0_lo = |
524 | 235k | load_line2_avx2(&dst[i * dst_stride + j], |
525 | 235k | &dst[i * dst_stride + j + dst_stride]); |
526 | | |
527 | 235k | const __m256i data_ref_0_hi = |
528 | 235k | load_line2_avx2(&dst[i * dst_stride + j + 8], |
529 | 235k | &dst[i * dst_stride + j + 8 + dst_stride]); |
530 | | |
531 | 235k | const __m256i comp_avg_res_lo = comp_avg( |
532 | 235k | &data_ref_0_lo, &res_lo_unsigned, &wt, use_dist_wtd_comp_avg); |
533 | | |
534 | 235k | const __m256i comp_avg_res_hi = comp_avg( |
535 | 235k | &data_ref_0_hi, &res_hi_unsigned, &wt, use_dist_wtd_comp_avg); |
536 | | |
537 | 235k | const __m256i round_result_lo = |
538 | 235k | convolve_rounding(&comp_avg_res_lo, &offset_const, |
539 | 235k | &rounding_const, rounding_shift); |
540 | | |
541 | 235k | const __m256i round_result_hi = |
542 | 235k | convolve_rounding(&comp_avg_res_hi, &offset_const, |
543 | 235k | &rounding_const, rounding_shift); |
544 | | |
545 | 235k | const __m256i res_8 = |
546 | 235k | _mm256_packus_epi16(round_result_lo, round_result_hi); |
547 | 235k | const __m128i res_0 = _mm256_castsi256_si128(res_8); |
548 | 235k | const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); |
549 | | |
550 | 235k | _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); |
551 | 235k | _mm_store_si128( |
552 | 235k | (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); |
553 | | |
554 | 284k | } else { |
555 | 284k | const __m128i res_lo_0 = _mm256_castsi256_si128(res_lo_unsigned); |
556 | 284k | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_lo_0); |
557 | | |
558 | 284k | const __m128i res_lo_1 = |
559 | 284k | _mm256_extracti128_si256(res_lo_unsigned, 1); |
560 | 284k | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), |
561 | 284k | res_lo_1); |
562 | | |
563 | 284k | const __m128i res_hi_0 = _mm256_castsi256_si128(res_hi_unsigned); |
564 | 284k | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + 8]), |
565 | 284k | res_hi_0); |
566 | | |
567 | 284k | const __m128i res_hi_1 = |
568 | 284k | _mm256_extracti128_si256(res_hi_unsigned, 1); |
569 | 284k | _mm_store_si128( |
570 | 284k | (__m128i *)(&dst[i * dst_stride + j + 8 + dst_stride]), |
571 | 284k | res_hi_1); |
572 | 284k | } |
573 | 519k | } |
574 | 638k | s[0] = s[1]; |
575 | 638k | s[1] = s[2]; |
576 | 638k | s[2] = s[3]; |
577 | | |
578 | 638k | s[4] = s[5]; |
579 | 638k | s[5] = s[6]; |
580 | 638k | s[6] = s[7]; |
581 | 638k | } |
582 | 59.6k | } |
583 | 44.1k | } |
584 | 79.3k | } |
585 | | |
586 | | void av1_dist_wtd_convolve_2d_avx2(const uint8_t *src, int src_stride, |
587 | | uint8_t *dst0, int dst_stride0, int w, int h, |
588 | | const InterpFilterParams *filter_params_x, |
589 | | const InterpFilterParams *filter_params_y, |
590 | | const int subpel_x_qn, const int subpel_y_qn, |
591 | 238k | ConvolveParams *conv_params) { |
592 | 238k | CONV_BUF_TYPE *dst = conv_params->dst; |
593 | 238k | int dst_stride = conv_params->dst_stride; |
594 | 238k | const int bd = 8; |
595 | | |
596 | 238k | DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]); |
597 | | |
598 | 238k | int im_stride = 8; |
599 | 238k | int i, is_horiz_4tap = 0, is_vert_4tap = 0; |
600 | 238k | const __m256i wt = unpack_weights_avx2(conv_params); |
601 | 238k | const int do_average = conv_params->do_average; |
602 | 238k | const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; |
603 | 238k | const int offset_0 = |
604 | 238k | bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; |
605 | 238k | const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); |
606 | 238k | const __m256i offset_const = _mm256_set1_epi16(offset); |
607 | 238k | const int rounding_shift = |
608 | 238k | 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; |
609 | 238k | const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1); |
610 | | |
611 | 238k | assert(conv_params->round_0 > 0); |
612 | | |
613 | 238k | const __m256i round_const_h = _mm256_set1_epi16( |
614 | 238k | ((1 << (conv_params->round_0 - 1)) >> 1) + (1 << (bd + FILTER_BITS - 2))); |
615 | 238k | const __m128i round_shift_h = _mm_cvtsi32_si128(conv_params->round_0 - 1); |
616 | | |
617 | 238k | const __m256i round_const_v = _mm256_set1_epi32( |
618 | 238k | ((1 << conv_params->round_1) >> 1) - |
619 | 238k | (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1))); |
620 | 238k | const __m128i round_shift_v = _mm_cvtsi32_si128(conv_params->round_1); |
621 | | |
622 | 238k | __m256i filt[4], coeffs_x[4], coeffs_y[4]; |
623 | | |
624 | 238k | filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2); |
625 | 238k | filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); |
626 | | |
627 | 238k | prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs_x); |
628 | 238k | prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y); |
629 | | |
630 | | // Condition for checking valid horz_filt taps |
631 | 238k | if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs_x[0], coeffs_x[3]), 0))) |
632 | 107k | is_horiz_4tap = 1; |
633 | | |
634 | | // Condition for checking valid vert_filt taps |
635 | 238k | if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs_y[0], coeffs_y[3]), 0))) |
636 | 119k | is_vert_4tap = 1; |
637 | | |
638 | 238k | if (is_horiz_4tap) { |
639 | 107k | int im_h = h + filter_params_y->taps - 1; |
640 | 107k | const int fo_vert = filter_params_y->taps / 2 - 1; |
641 | 107k | const int fo_horiz = 1; |
642 | 107k | const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; |
643 | 238k | for (int j = 0; j < w; j += 8) { |
644 | | /* Horizontal filter */ |
645 | 131k | const uint8_t *src_h = src_ptr + j; |
646 | 1.46M | for (i = 0; i < im_h; i += 2) { |
647 | 1.33M | __m256i data = |
648 | 1.33M | _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)src_h)); |
649 | 1.33M | if (i + 1 < im_h) |
650 | 1.20M | data = _mm256_inserti128_si256( |
651 | 1.33M | data, _mm_loadu_si128((__m128i *)(src_h + src_stride)), 1); |
652 | 1.33M | src_h += (src_stride << 1); |
653 | 1.33M | __m256i res = convolve_lowbd_x_4tap(data, coeffs_x + 1, filt); |
654 | | |
655 | 1.33M | res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), |
656 | 1.33M | round_shift_h); |
657 | | |
658 | 1.33M | _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); |
659 | 1.33M | } |
660 | 131k | DIST_WTD_CONVOLVE_VERTICAL_FILTER_8TAP; |
661 | 131k | } |
662 | 130k | } else if (is_vert_4tap) { |
663 | 31.4k | int im_h = h + 3; |
664 | 31.4k | const int fo_vert = 1; |
665 | 31.4k | const int fo_horiz = filter_params_x->taps / 2 - 1; |
666 | 31.4k | const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; |
667 | | |
668 | 31.4k | filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); |
669 | 31.4k | filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); |
670 | | |
671 | 76.7k | for (int j = 0; j < w; j += 8) { |
672 | | /* Horizontal filter */ |
673 | 45.2k | const uint8_t *src_h = src_ptr + j; |
674 | 45.2k | DIST_WTD_CONVOLVE_HORIZONTAL_FILTER_8TAP; |
675 | | |
676 | | /* Vertical filter */ |
677 | 45.2k | __m256i s[6]; |
678 | 45.2k | __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); |
679 | 45.2k | __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); |
680 | 45.2k | __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); |
681 | 45.2k | __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); |
682 | | |
683 | 45.2k | s[0] = _mm256_unpacklo_epi16(s0, s1); |
684 | 45.2k | s[1] = _mm256_unpacklo_epi16(s2, s3); |
685 | | |
686 | 45.2k | s[3] = _mm256_unpackhi_epi16(s0, s1); |
687 | 45.2k | s[4] = _mm256_unpackhi_epi16(s2, s3); |
688 | | |
689 | 189k | for (i = 0; i < h; i += 2) { |
690 | 144k | const int16_t *data = &im_block[i * im_stride]; |
691 | | |
692 | 144k | const __m256i s4 = |
693 | 144k | _mm256_loadu_si256((__m256i *)(data + 4 * im_stride)); |
694 | 144k | const __m256i s5 = |
695 | 144k | _mm256_loadu_si256((__m256i *)(data + 5 * im_stride)); |
696 | | |
697 | 144k | s[2] = _mm256_unpacklo_epi16(s4, s5); |
698 | 144k | s[5] = _mm256_unpackhi_epi16(s4, s5); |
699 | | |
700 | 144k | const __m256i res_a = convolve_4tap(s, coeffs_y + 1); |
701 | 144k | const __m256i res_a_round = _mm256_sra_epi32( |
702 | 144k | _mm256_add_epi32(res_a, round_const_v), round_shift_v); |
703 | | |
704 | 144k | if (w - j > 4) { |
705 | 144k | const __m256i res_b = convolve_4tap(s + 3, coeffs_y + 1); |
706 | 144k | const __m256i res_b_round = _mm256_sra_epi32( |
707 | 144k | _mm256_add_epi32(res_b, round_const_v), round_shift_v); |
708 | 144k | const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_b_round); |
709 | 144k | const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const); |
710 | | |
711 | 144k | if (do_average) { |
712 | 59.6k | const __m256i data_ref_0 = |
713 | 59.6k | load_line2_avx2(&dst[i * dst_stride + j], |
714 | 59.6k | &dst[i * dst_stride + j + dst_stride]); |
715 | 59.6k | const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned, |
716 | 59.6k | &wt, use_dist_wtd_comp_avg); |
717 | | |
718 | 59.6k | const __m256i round_result = convolve_rounding( |
719 | 59.6k | &comp_avg_res, &offset_const, &rounding_const, rounding_shift); |
720 | | |
721 | 59.6k | const __m256i res_8 = |
722 | 59.6k | _mm256_packus_epi16(round_result, round_result); |
723 | 59.6k | const __m128i res_0 = _mm256_castsi256_si128(res_8); |
724 | 59.6k | const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); |
725 | | |
726 | 59.6k | _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); |
727 | 59.6k | _mm_storel_epi64( |
728 | 59.6k | (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); |
729 | 84.7k | } else { |
730 | 84.7k | const __m128i res_0 = _mm256_castsi256_si128(res_unsigned); |
731 | 84.7k | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); |
732 | | |
733 | 84.7k | const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1); |
734 | 84.7k | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), |
735 | 84.7k | res_1); |
736 | 84.7k | } |
737 | 144k | } else { |
738 | 1 | const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_a_round); |
739 | 1 | const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const); |
740 | | |
741 | 1 | if (do_average) { |
742 | 0 | const __m256i data_ref_0 = |
743 | 0 | load_line2_avx2(&dst[i * dst_stride + j], |
744 | 0 | &dst[i * dst_stride + j + dst_stride]); |
745 | |
746 | 0 | const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned, |
747 | 0 | &wt, use_dist_wtd_comp_avg); |
748 | |
749 | 0 | const __m256i round_result = convolve_rounding( |
750 | 0 | &comp_avg_res, &offset_const, &rounding_const, rounding_shift); |
751 | |
752 | 0 | const __m256i res_8 = |
753 | 0 | _mm256_packus_epi16(round_result, round_result); |
754 | 0 | const __m128i res_0 = _mm256_castsi256_si128(res_8); |
755 | 0 | const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); |
756 | |
757 | 0 | *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0); |
758 | 0 | *(int *)(&dst0[i * dst_stride0 + j + dst_stride0]) = |
759 | 0 | _mm_cvtsi128_si32(res_1); |
760 | |
761 | 1 | } else { |
762 | 1 | const __m128i res_0 = _mm256_castsi256_si128(res_unsigned); |
763 | 1 | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); |
764 | | |
765 | 1 | const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1); |
766 | 1 | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), |
767 | 1 | res_1); |
768 | 1 | } |
769 | 1 | } |
770 | 144k | s[0] = s[1]; |
771 | 144k | s[1] = s[2]; |
772 | 144k | s[3] = s[4]; |
773 | 144k | s[4] = s[5]; |
774 | 144k | } |
775 | 45.2k | } |
776 | 99.1k | } else { |
777 | 99.1k | int im_h = h + filter_params_y->taps - 1; |
778 | 99.1k | const int fo_vert = filter_params_y->taps / 2 - 1; |
779 | 99.1k | const int fo_horiz = filter_params_x->taps / 2 - 1; |
780 | 99.1k | const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; |
781 | | |
782 | 99.1k | filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); |
783 | 99.1k | filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); |
784 | | |
785 | 335k | for (int j = 0; j < w; j += 8) { |
786 | | /* Horizontal filter */ |
787 | 236k | const uint8_t *src_h = src_ptr + j; |
788 | 236k | DIST_WTD_CONVOLVE_HORIZONTAL_FILTER_8TAP; |
789 | | |
790 | 236k | DIST_WTD_CONVOLVE_VERTICAL_FILTER_8TAP; |
791 | 236k | } |
792 | 99.1k | } |
793 | 238k | } |
794 | | |
795 | | #define DO_NO_AVG_2D_COPY_4X16(r0, c0, r1, c1, r2, c2, r3, c3) \ |
796 | 32.3M | do { \ |
797 | 32.3M | src_0 = _mm256_cvtepu8_epi16( \ |
798 | 32.3M | _mm_loadu_si128((__m128i *)(&src[r0 * src_stride + c0]))); \ |
799 | 32.3M | src_1 = _mm256_cvtepu8_epi16( \ |
800 | 32.3M | _mm_loadu_si128((__m128i *)(&src[r1 * src_stride + c1]))); \ |
801 | 32.3M | src_2 = _mm256_cvtepu8_epi16( \ |
802 | 32.3M | _mm_loadu_si128((__m128i *)(&src[r2 * src_stride + c2]))); \ |
803 | 32.3M | src_3 = _mm256_cvtepu8_epi16( \ |
804 | 32.3M | _mm_loadu_si128((__m128i *)(&src[r3 * src_stride + c3]))); \ |
805 | 32.3M | \ |
806 | 32.3M | src_0 = _mm256_slli_epi16(src_0, LEFT_SHIFT); \ |
807 | 32.3M | src_1 = _mm256_slli_epi16(src_1, LEFT_SHIFT); \ |
808 | 32.3M | src_2 = _mm256_slli_epi16(src_2, LEFT_SHIFT); \ |
809 | 32.3M | src_3 = _mm256_slli_epi16(src_3, LEFT_SHIFT); \ |
810 | 32.3M | \ |
811 | 32.3M | src_0 = _mm256_add_epi16(src_0, offset_const); \ |
812 | 32.3M | src_1 = _mm256_add_epi16(src_1, offset_const); \ |
813 | 32.3M | src_2 = _mm256_add_epi16(src_2, offset_const); \ |
814 | 32.3M | src_3 = _mm256_add_epi16(src_3, offset_const); \ |
815 | 32.3M | \ |
816 | 32.3M | _mm256_store_si256((__m256i *)(&dst[r0 * dst_stride + c0]), src_0); \ |
817 | 32.3M | _mm256_store_si256((__m256i *)(&dst[r1 * dst_stride + c1]), src_1); \ |
818 | 32.3M | _mm256_store_si256((__m256i *)(&dst[r2 * dst_stride + c2]), src_2); \ |
819 | 32.3M | _mm256_store_si256((__m256i *)(&dst[r3 * dst_stride + c3]), src_3); \ |
820 | 32.3M | } while (0) |
821 | | |
822 | 257M | #define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7) |
823 | | static inline void av1_dist_wtd_convolve_2d_no_avg_copy_avx2( |
824 | | const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, |
825 | 534k | int w, int h, const __m256i offset_const) { |
826 | 534k | int i = h; |
827 | 534k | if (w >= 16) { |
828 | 385k | __m256i src_0, src_1, src_2, src_3; |
829 | 385k | if (w == 128) { |
830 | 8.39M | do { |
831 | 8.39M | DO_NO_AVG_2D_COPY_4X16(0, 0, 0, 16, 0, 32, 0, 48); |
832 | 8.39M | DO_NO_AVG_2D_COPY_4X16(0, 64, 0, 80, 0, 96, 0, 112); |
833 | 8.39M | src += 1 * src_stride; |
834 | 8.39M | dst += 1 * dst_stride; |
835 | 8.39M | i -= 1; |
836 | 8.39M | } while (i); |
837 | 319k | } else if (w == 64) { |
838 | 12.8M | do { |
839 | 12.8M | DO_NO_AVG_2D_COPY_4X16(0, 0, 0, 16, 0, 32, 0, 48); |
840 | 12.8M | src += 1 * src_stride; |
841 | 12.8M | dst += 1 * dst_stride; |
842 | 12.8M | i -= 1; |
843 | 12.8M | } while (i); |
844 | 169k | } else if (w == 32) { |
845 | 2.38M | do { |
846 | 2.38M | DO_NO_AVG_2D_COPY_4X16(0, 0, 1, 0, 0, 16, 1, 16); |
847 | 2.38M | src += 2 * src_stride; |
848 | 2.38M | dst += 2 * dst_stride; |
849 | 2.38M | i -= 2; |
850 | 2.38M | } while (i); |
851 | 90.4k | } else if (w == 16) { |
852 | 293k | do { |
853 | 293k | DO_NO_AVG_2D_COPY_4X16(0, 0, 1, 0, 2, 0, 3, 0); |
854 | 293k | src += 4 * src_stride; |
855 | 293k | dst += 4 * dst_stride; |
856 | 293k | i -= 4; |
857 | 293k | } while (i); |
858 | 59.7k | } |
859 | 385k | } else { |
860 | 149k | const __m256i zero = _mm256_setzero_si256(); |
861 | 385k | do { |
862 | 385k | const __m128i src_row_0 = |
863 | 385k | _mm_loadl_epi64((__m128i *)(&src[0 * src_stride])); |
864 | 385k | const __m128i src_row_1 = |
865 | 385k | _mm_loadl_epi64((__m128i *)(&src[1 * src_stride])); |
866 | 385k | const __m128i src_row_2 = |
867 | 385k | _mm_loadl_epi64((__m128i *)(&src[2 * src_stride])); |
868 | 385k | const __m128i src_row_3 = |
869 | 385k | _mm_loadl_epi64((__m128i *)(&src[3 * src_stride])); |
870 | | |
871 | 385k | __m256i src_10 = _mm256_insertf128_si256( |
872 | 385k | _mm256_castsi128_si256(src_row_0), src_row_1, 1); |
873 | 385k | __m256i src_32 = _mm256_insertf128_si256( |
874 | 385k | _mm256_castsi128_si256(src_row_2), src_row_3, 1); |
875 | | |
876 | 385k | src_10 = _mm256_unpacklo_epi8(src_10, zero); |
877 | 385k | src_32 = _mm256_unpacklo_epi8(src_32, zero); |
878 | | |
879 | 385k | src_10 = _mm256_slli_epi16(src_10, LEFT_SHIFT); |
880 | 385k | src_32 = _mm256_slli_epi16(src_32, LEFT_SHIFT); |
881 | | |
882 | 385k | src_10 = _mm256_add_epi16(src_10, offset_const); |
883 | 385k | src_32 = _mm256_add_epi16(src_32, offset_const); |
884 | | |
885 | | // Accumulate values into the destination buffer |
886 | 385k | _mm_store_si128((__m128i *)(&dst[0 * dst_stride]), |
887 | 385k | _mm256_castsi256_si128(src_10)); |
888 | 385k | _mm_store_si128((__m128i *)(&dst[1 * dst_stride]), |
889 | 385k | _mm256_extracti128_si256(src_10, 1)); |
890 | 385k | _mm_store_si128((__m128i *)(&dst[2 * dst_stride]), |
891 | 385k | _mm256_castsi256_si128(src_32)); |
892 | 385k | _mm_store_si128((__m128i *)(&dst[3 * dst_stride]), |
893 | 385k | _mm256_extracti128_si256(src_32, 1)); |
894 | | |
895 | 385k | src += 4 * src_stride; |
896 | 385k | dst += 4 * dst_stride; |
897 | 385k | i -= 4; |
898 | 385k | } while (i); |
899 | 149k | } |
900 | 534k | } |
901 | | |
902 | | #define DO_AVG_2D_COPY_4X16(USE_DIST_WEIGHTED, r0, c0, r1, c1, r2, c2, r3, c3) \ |
903 | 31.8M | do { \ |
904 | 31.8M | src_0 = _mm256_cvtepu8_epi16( \ |
905 | 31.8M | _mm_loadu_si128((__m128i *)(&src[r0 * src_stride + c0]))); \ |
906 | 31.8M | src_1 = _mm256_cvtepu8_epi16( \ |
907 | 31.8M | _mm_loadu_si128((__m128i *)(&src[r1 * src_stride + c1]))); \ |
908 | 31.8M | src_2 = _mm256_cvtepu8_epi16( \ |
909 | 31.8M | _mm_loadu_si128((__m128i *)(&src[r2 * src_stride + c2]))); \ |
910 | 31.8M | src_3 = _mm256_cvtepu8_epi16( \ |
911 | 31.8M | _mm_loadu_si128((__m128i *)(&src[r3 * src_stride + c3]))); \ |
912 | 31.8M | \ |
913 | 31.8M | src_0 = _mm256_slli_epi16(src_0, LEFT_SHIFT); \ |
914 | 31.8M | src_1 = _mm256_slli_epi16(src_1, LEFT_SHIFT); \ |
915 | 31.8M | src_2 = _mm256_slli_epi16(src_2, LEFT_SHIFT); \ |
916 | 31.8M | src_3 = _mm256_slli_epi16(src_3, LEFT_SHIFT); \ |
917 | 31.8M | src_0 = _mm256_add_epi16(src_0, offset_const); \ |
918 | 31.8M | src_1 = _mm256_add_epi16(src_1, offset_const); \ |
919 | 31.8M | src_2 = _mm256_add_epi16(src_2, offset_const); \ |
920 | 31.8M | src_3 = _mm256_add_epi16(src_3, offset_const); \ |
921 | 31.8M | \ |
922 | 31.8M | ref_0 = _mm256_loadu_si256((__m256i *)(&dst[r0 * dst_stride + c0])); \ |
923 | 31.8M | ref_1 = _mm256_loadu_si256((__m256i *)(&dst[r1 * dst_stride + c1])); \ |
924 | 31.8M | ref_2 = _mm256_loadu_si256((__m256i *)(&dst[r2 * dst_stride + c2])); \ |
925 | 31.8M | ref_3 = _mm256_loadu_si256((__m256i *)(&dst[r3 * dst_stride + c3])); \ |
926 | 31.8M | \ |
927 | 31.8M | res_0 = comp_avg(&ref_0, &src_0, &wt, USE_DIST_WEIGHTED); \ |
928 | 31.8M | res_1 = comp_avg(&ref_1, &src_1, &wt, USE_DIST_WEIGHTED); \ |
929 | 31.8M | res_2 = comp_avg(&ref_2, &src_2, &wt, USE_DIST_WEIGHTED); \ |
930 | 31.8M | res_3 = comp_avg(&ref_3, &src_3, &wt, USE_DIST_WEIGHTED); \ |
931 | 31.8M | \ |
932 | 31.8M | res_0 = convolve_rounding(&res_0, &offset_const, &rounding_const, \ |
933 | 31.8M | rounding_shift); \ |
934 | 31.8M | res_1 = convolve_rounding(&res_1, &offset_const, &rounding_const, \ |
935 | 31.8M | rounding_shift); \ |
936 | 31.8M | res_2 = convolve_rounding(&res_2, &offset_const, &rounding_const, \ |
937 | 31.8M | rounding_shift); \ |
938 | 31.8M | res_3 = convolve_rounding(&res_3, &offset_const, &rounding_const, \ |
939 | 31.8M | rounding_shift); \ |
940 | 31.8M | \ |
941 | 31.8M | res_10 = _mm256_packus_epi16(res_0, res_1); \ |
942 | 31.8M | res_32 = _mm256_packus_epi16(res_2, res_3); \ |
943 | 31.8M | res_10 = _mm256_permute4x64_epi64(res_10, 0xD8); \ |
944 | 31.8M | res_32 = _mm256_permute4x64_epi64(res_32, 0xD8); \ |
945 | 31.8M | \ |
946 | 31.8M | _mm_store_si128((__m128i *)(&dst0[r0 * dst_stride0 + c0]), \ |
947 | 31.8M | _mm256_castsi256_si128(res_10)); \ |
948 | 31.8M | _mm_store_si128((__m128i *)(&dst0[r1 * dst_stride0 + c1]), \ |
949 | 31.8M | _mm256_extracti128_si256(res_10, 1)); \ |
950 | 31.8M | _mm_store_si128((__m128i *)(&dst0[r2 * dst_stride0 + c2]), \ |
951 | 31.8M | _mm256_castsi256_si128(res_32)); \ |
952 | 31.8M | _mm_store_si128((__m128i *)(&dst0[r3 * dst_stride0 + c3]), \ |
953 | 31.8M | _mm256_extracti128_si256(res_32, 1)); \ |
954 | 31.8M | } while (0) |
955 | | |
956 | | #define DO_AVG_2D_COPY(USE_DIST_WEIGHTED) \ |
957 | 441k | int i = h; \ |
958 | 441k | if (w >= 16) { \ |
959 | 352k | __m256i src_0, src_1, src_2, src_3; \ |
960 | 352k | __m256i ref_0, ref_1, ref_2, ref_3; \ |
961 | 352k | __m256i res_0, res_1, res_2, res_3; \ |
962 | 352k | __m256i res_10, res_32; \ |
963 | 352k | if (w == 128) { \ |
964 | 8.36M | do { \ |
965 | 8.36M | DO_AVG_2D_COPY_4X16(USE_DIST_WEIGHTED, 0, 0, 0, 16, 0, 32, 0, 48); \ |
966 | 8.36M | DO_AVG_2D_COPY_4X16(USE_DIST_WEIGHTED, 0, 64, 0, 80, 0, 96, 0, 112); \ |
967 | 8.36M | i -= 1; \ |
968 | 8.36M | src += 1 * src_stride; \ |
969 | 8.36M | dst += 1 * dst_stride; \ |
970 | 8.36M | dst0 += 1 * dst_stride0; \ |
971 | 8.36M | } while (i); \ |
972 | 286k | } else if (w == 64) { \ |
973 | 12.7M | do { \ |
974 | 12.7M | DO_AVG_2D_COPY_4X16(USE_DIST_WEIGHTED, 0, 0, 0, 16, 0, 32, 0, 48); \ |
975 | 12.7M | \ |
976 | 12.7M | i -= 1; \ |
977 | 12.7M | src += 1 * src_stride; \ |
978 | 12.7M | dst += 1 * dst_stride; \ |
979 | 12.7M | dst0 += 1 * dst_stride0; \ |
980 | 12.7M | } while (i); \ |
981 | 167k | } else if (w == 32) { \ |
982 | 2.27M | do { \ |
983 | 2.27M | DO_AVG_2D_COPY_4X16(USE_DIST_WEIGHTED, 0, 0, 1, 0, 0, 16, 1, 16); \ |
984 | 2.27M | \ |
985 | 2.27M | i -= 2; \ |
986 | 2.27M | src += 2 * src_stride; \ |
987 | 2.27M | dst += 2 * dst_stride; \ |
988 | 2.27M | dst0 += 2 * dst_stride0; \ |
989 | 2.27M | } while (i); \ |
990 | 82.1k | } else { \ |
991 | 36.9k | assert(w == 16); \ |
992 | 147k | do { \ |
993 | 147k | DO_AVG_2D_COPY_4X16(USE_DIST_WEIGHTED, 0, 0, 1, 0, 2, 0, 3, 0); \ |
994 | 147k | \ |
995 | 147k | i -= 4; \ |
996 | 147k | src += 4 * src_stride; \ |
997 | 147k | dst += 4 * dst_stride; \ |
998 | 147k | dst0 += 4 * dst_stride0; \ |
999 | 147k | } while (i); \ |
1000 | 37.1k | } \ |
1001 | 352k | } else if (w == 8) { \ |
1002 | 142k | do { \ |
1003 | 142k | const __m128i src_0 = \ |
1004 | 142k | _mm_loadl_epi64((__m128i *)(&src[0 * src_stride])); \ |
1005 | 142k | const __m128i src_1 = \ |
1006 | 142k | _mm_loadl_epi64((__m128i *)(&src[1 * src_stride])); \ |
1007 | 142k | const __m128i src_2 = \ |
1008 | 142k | _mm_loadl_epi64((__m128i *)(&src[2 * src_stride])); \ |
1009 | 142k | const __m128i src_3 = \ |
1010 | 142k | _mm_loadl_epi64((__m128i *)(&src[3 * src_stride])); \ |
1011 | 142k | __m256i src_10 = \ |
1012 | 142k | _mm256_insertf128_si256(_mm256_castsi128_si256(src_0), src_1, 1); \ |
1013 | 142k | __m256i src_32 = \ |
1014 | 142k | _mm256_insertf128_si256(_mm256_castsi128_si256(src_2), src_3, 1); \ |
1015 | 142k | \ |
1016 | 142k | src_10 = _mm256_unpacklo_epi8(src_10, zero); \ |
1017 | 142k | src_32 = _mm256_unpacklo_epi8(src_32, zero); \ |
1018 | 142k | \ |
1019 | 142k | src_10 = _mm256_slli_epi16(src_10, LEFT_SHIFT); \ |
1020 | 142k | src_32 = _mm256_slli_epi16(src_32, LEFT_SHIFT); \ |
1021 | 142k | \ |
1022 | 142k | src_10 = _mm256_add_epi16(src_10, offset_const); \ |
1023 | 142k | src_32 = _mm256_add_epi16(src_32, offset_const); \ |
1024 | 142k | \ |
1025 | 142k | const __m256i ref_10 = \ |
1026 | 142k | load_line2_avx2(&dst[0 * dst_stride], &dst[1 * dst_stride]); \ |
1027 | 142k | const __m256i ref_32 = \ |
1028 | 142k | load_line2_avx2(&dst[2 * dst_stride], &dst[3 * dst_stride]); \ |
1029 | 142k | __m256i res_10 = comp_avg(&ref_10, &src_10, &wt, USE_DIST_WEIGHTED); \ |
1030 | 142k | __m256i res_32 = comp_avg(&ref_32, &src_32, &wt, USE_DIST_WEIGHTED); \ |
1031 | 142k | \ |
1032 | 142k | res_10 = convolve_rounding(&res_10, &offset_const, &rounding_const, \ |
1033 | 142k | rounding_shift); \ |
1034 | 142k | res_32 = convolve_rounding(&res_32, &offset_const, &rounding_const, \ |
1035 | 142k | rounding_shift); \ |
1036 | 142k | \ |
1037 | 142k | __m256i res = _mm256_packus_epi16(res_10, res_32); \ |
1038 | 142k | const __m128i res_20 = _mm256_castsi256_si128(res); \ |
1039 | 142k | const __m128i res_31 = _mm256_extracti128_si256(res, 1); \ |
1040 | 142k | \ |
1041 | 142k | _mm_storel_epi64((__m128i *)(&dst0[0 * dst_stride0]), res_20); \ |
1042 | 142k | _mm_storel_epi64((__m128i *)((&dst0[1 * dst_stride0])), res_31); \ |
1043 | 142k | _mm_storeh_epi64((__m128i *)(&dst0[2 * dst_stride0]), res_20); \ |
1044 | 142k | _mm_storeh_epi64((__m128i *)((&dst0[3 * dst_stride0])), res_31); \ |
1045 | 142k | i -= 4; \ |
1046 | 142k | src += 4 * src_stride; \ |
1047 | 142k | dst += 4 * dst_stride; \ |
1048 | 142k | dst0 += 4 * dst_stride0; \ |
1049 | 142k | } while (i); \ |
1050 | 54.5k | } else { \ |
1051 | 34.6k | assert(w == 4); \ |
1052 | 52.5k | do { \ |
1053 | 52.5k | __m256i src_3210_8bit = \ |
1054 | 52.5k | _mm256_setr_epi32(loadu_int32(src + 0 * src_stride), \ |
1055 | 52.5k | loadu_int32(src + 1 * src_stride), 0, 0, \ |
1056 | 52.5k | loadu_int32(src + 2 * src_stride), \ |
1057 | 52.5k | loadu_int32(src + 3 * src_stride), 0, 0); \ |
1058 | 52.5k | \ |
1059 | 52.5k | __m256i src_3210 = _mm256_unpacklo_epi8(src_3210_8bit, zero); \ |
1060 | 52.5k | src_3210 = _mm256_slli_epi16(src_3210, LEFT_SHIFT); \ |
1061 | 52.5k | src_3210 = _mm256_add_epi16(src_3210, offset_const); \ |
1062 | 52.5k | \ |
1063 | 52.5k | __m256i ref_3210 = \ |
1064 | 52.5k | _mm256_setr_epi64x(*(int64_t *)(dst + 0 * dst_stride), \ |
1065 | 52.5k | *(int64_t *)(dst + 1 * dst_stride), \ |
1066 | 52.5k | *(int64_t *)(dst + 2 * dst_stride), \ |
1067 | 52.5k | *(int64_t *)(dst + 3 * dst_stride)); \ |
1068 | 52.5k | __m256i res_3210 = \ |
1069 | 52.5k | comp_avg(&ref_3210, &src_3210, &wt, USE_DIST_WEIGHTED); \ |
1070 | 52.5k | \ |
1071 | 52.5k | res_3210 = convolve_rounding(&res_3210, &offset_const, &rounding_const, \ |
1072 | 52.5k | rounding_shift); \ |
1073 | 52.5k | \ |
1074 | 52.5k | res_3210 = _mm256_packus_epi16(res_3210, res_3210); \ |
1075 | 52.5k | const __m128i res_10 = _mm256_castsi256_si128(res_3210); \ |
1076 | 52.5k | const __m128i res_32 = _mm256_extracti128_si256(res_3210, 1); \ |
1077 | 52.5k | \ |
1078 | 52.5k | *(int *)(&dst0[0 * dst_stride0]) = _mm_cvtsi128_si32(res_10); \ |
1079 | 52.5k | *(int *)(&dst0[2 * dst_stride0]) = _mm_cvtsi128_si32(res_32); \ |
1080 | 52.5k | *(int *)(&dst0[1 * dst_stride0]) = _mm_extract_epi32(res_10, 1); \ |
1081 | 52.5k | *(int *)(&dst0[3 * dst_stride0]) = _mm_extract_epi32(res_32, 1); \ |
1082 | 52.5k | i -= 4; \ |
1083 | 52.5k | src += 4 * src_stride; \ |
1084 | 52.5k | dst += 4 * dst_stride; \ |
1085 | 52.5k | dst0 += 4 * dst_stride0; \ |
1086 | 52.5k | } while (i); \ |
1087 | 34.6k | } |
1088 | | |
1089 | | void av1_dist_wtd_convolve_2d_copy_avx2(const uint8_t *src, int src_stride, |
1090 | | uint8_t *dst0, int dst_stride0, int w, |
1091 | 975k | int h, ConvolveParams *conv_params) { |
1092 | 975k | const int bd = 8; |
1093 | 975k | CONV_BUF_TYPE *dst = conv_params->dst; |
1094 | 975k | int dst_stride = conv_params->dst_stride; |
1095 | 975k | assert(conv_params->round_0 == 3); |
1096 | 975k | assert(conv_params->round_1 == 7); |
1097 | 975k | assert(w % 4 == 0); |
1098 | 975k | assert(h % 4 == 0); |
1099 | | |
1100 | 975k | const int do_average = conv_params->do_average; |
1101 | 975k | const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; |
1102 | 975k | const __m256i wt = unpack_weights_avx2(conv_params); |
1103 | 975k | const __m256i zero = _mm256_setzero_si256(); |
1104 | | |
1105 | 975k | const int offset_0 = |
1106 | 975k | bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; |
1107 | 975k | const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); |
1108 | 975k | const __m256i offset_const = _mm256_set1_epi16(offset); |
1109 | 975k | const int rounding_shift = |
1110 | 975k | 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; |
1111 | 975k | const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1); |
1112 | | |
1113 | 975k | if (do_average) { |
1114 | 441k | if (use_dist_wtd_comp_avg) { |
1115 | 74.3k | DO_AVG_2D_COPY(1) |
1116 | 366k | } else { |
1117 | 366k | DO_AVG_2D_COPY(0) |
1118 | 366k | } |
1119 | 534k | } else { |
1120 | 534k | av1_dist_wtd_convolve_2d_no_avg_copy_avx2(src, src_stride, dst, dst_stride, |
1121 | 534k | w, h, offset_const); |
1122 | 534k | } |
1123 | 975k | } |
1124 | | #undef LEFT_SHIFT |
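
As a closing note, a scalar view of the copy kernels above (illustration only, assuming the rounds asserted in av1_dist_wtd_convolve_2d_copy_avx2: round_0 == 3, round_1 == 7, bd == 8): each pixel is scaled by 2 * FILTER_BITS - 3 - 7 == 4 bits and biased so that the later compound average stays in the unsigned 16-bit CONV_BUF domain.

// Hypothetical scalar model of one pixel entering the CONV_BUF in the copy path.
static CONV_BUF_TYPE scalar_copy_to_conv_buf(uint8_t px) {
  const int bd = 8;
  const int offset_0 = bd + 2 * FILTER_BITS - 3 - 7;           // 12
  const int offset = (1 << offset_0) + (1 << (offset_0 - 1));  // 4096 + 2048
  const int left_shift = 2 * FILTER_BITS - 3 - 7;              // 4
  return (CONV_BUF_TYPE)((px << left_shift) + offset);
}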