/src/aom/av1/common/x86/jnt_convolve_avx2.c
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) 2018, Alliance for Open Media. All rights reserved |
3 | | * |
4 | | * This source code is subject to the terms of the BSD 2 Clause License and |
5 | | * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
6 | | * was not distributed with this source code in the LICENSE file, you can |
7 | | * obtain it at www.aomedia.org/license/software. If the Alliance for Open |
8 | | * Media Patent License 1.0 was not distributed with this source code in the |
9 | | * PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
10 | | */ |
11 | | |
12 | | #include <emmintrin.h> |
13 | | #include <immintrin.h> |
14 | | |
15 | | #include "config/aom_dsp_rtcd.h" |
16 | | |
17 | | #include "aom_dsp/aom_dsp_common.h" |
18 | | #include "aom_dsp/aom_filter.h" |
19 | | #include "aom_dsp/x86/convolve_avx2.h" |
20 | | #include "aom_dsp/x86/convolve_common_intrin.h" |
21 | | #include "aom_dsp/x86/convolve_sse4_1.h" |
22 | | #include "aom_dsp/x86/mem_sse2.h" |
23 | | #include "aom_dsp/x86/synonyms_avx2.h" |
24 | | |
25 | | #include "av1/common/convolve.h" |
26 | | |
27 | 2.52M | static INLINE __m256i unpack_weights_avx2(ConvolveParams *conv_params) { |
28 | 2.52M | const int w0 = conv_params->fwd_offset; |
29 | 2.52M | const int w1 = conv_params->bck_offset; |
30 | 2.52M | const __m256i wt0 = _mm256_set1_epi16((int16_t)w0); |
31 | 2.52M | const __m256i wt1 = _mm256_set1_epi16((int16_t)w1); |
32 | 2.52M | const __m256i wt = _mm256_unpacklo_epi16(wt0, wt1); |
33 | 2.52M | return wt; |
34 | 2.52M | } |
35 | | |
36 | 16.2M | static INLINE __m256i load_line2_avx2(const void *a, const void *b) { |
37 | 16.2M | return _mm256_permute2x128_si256( |
38 | 16.2M | _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)a)), |
39 | 16.2M | _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)b)), 0x20); |
40 | 16.2M | } |
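The two helpers above define the data layout used throughout this file: unpack_weights_avx2() interleaves fwd_offset and bck_offset so that comp_avg() can apply both weights with a single _mm256_madd_epi16, and load_line2_avx2() places one source row in each 128-bit lane (permute selector 0x20) so every loop iteration below produces two output rows. A minimal scalar sketch of the blend those weights feed, assuming the AV1 convention that the two weights sum to 1 << DIST_PRECISION_BITS; the helper name is illustrative only and not part of the original file:

/* Scalar sketch of the per-pixel blend that comp_avg() vectorizes.  p0 is the
 * sample already in the CONV_BUF, p1 the newly filtered sample. */
static INLINE int dist_wtd_avg_scalar_sketch(int p0, int p1, int w0, int w1,
                                             int use_dist_wtd_comp_avg) {
  if (use_dist_wtd_comp_avg)
    return (p0 * w0 + p1 * w1) >> DIST_PRECISION_BITS;  // w0 + w1 == 16
  return (p0 + p1) >> 1;  // plain compound average
}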
41 | | |
42 | | void av1_dist_wtd_convolve_x_avx2(const uint8_t *src, int src_stride, |
43 | | uint8_t *dst0, int dst_stride0, int w, int h, |
44 | | const InterpFilterParams *filter_params_x, |
45 | | const int subpel_x_qn, |
46 | 367k | ConvolveParams *conv_params) { |
47 | 367k | CONV_BUF_TYPE *dst = conv_params->dst; |
48 | 367k | int dst_stride = conv_params->dst_stride; |
49 | 367k | const int bd = 8; |
50 | 367k | int i, j, is_horiz_4tap = 0; |
51 | 367k | const int bits = FILTER_BITS - conv_params->round_1; |
52 | 367k | const __m256i wt = unpack_weights_avx2(conv_params); |
53 | 367k | const int do_average = conv_params->do_average; |
54 | 367k | const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; |
55 | 367k | const int offset_0 = |
56 | 367k | bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; |
57 | 367k | const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); |
58 | 367k | const __m256i offset_const = _mm256_set1_epi16(offset); |
59 | 367k | const int rounding_shift = |
60 | 367k | 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; |
61 | 367k | const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1); |
62 | | |
63 | 367k | assert(bits >= 0); |
64 | 0 | assert(conv_params->round_0 > 0); |
65 | | |
66 | 0 | const __m256i round_const = |
67 | 367k | _mm256_set1_epi16((1 << (conv_params->round_0 - 1)) >> 1); |
68 | 367k | const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0 - 1); |
69 | | |
70 | 367k | __m256i filt[4], coeffs[4]; |
71 | | |
72 | 367k | filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2); |
73 | 367k | filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); |
74 | | |
75 | 367k | prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs); |
76 | | |
77 | | // Condition for checking valid horz_filt taps |
78 | 367k | if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs[0], coeffs[3]), 0))) |
79 | 116k | is_horiz_4tap = 1; |
80 | | |
81 | | // horz_filt as 4 tap |
82 | 367k | if (is_horiz_4tap) { |
83 | 116k | const int fo_horiz = 1; |
84 | 116k | const uint8_t *const src_ptr = src - fo_horiz; |
85 | 615k | for (i = 0; i < h; i += 2) { |
86 | 498k | const uint8_t *src_data = src_ptr + i * src_stride; |
87 | 498k | CONV_BUF_TYPE *dst_data = dst + i * dst_stride; |
88 | 1.86M | for (j = 0; j < w; j += 8) { |
89 | 1.37M | const __m256i data = |
90 | 1.37M | load_line2_avx2(&src_data[j], &src_data[j + src_stride]); |
91 | | |
92 | 1.37M | __m256i res = convolve_lowbd_x_4tap(data, coeffs + 1, filt); |
93 | 1.37M | res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const), round_shift); |
94 | 1.37M | res = _mm256_slli_epi16(res, bits); |
95 | | |
96 | 1.37M | const __m256i res_unsigned = _mm256_add_epi16(res, offset_const); |
97 | | |
98 | | // Accumulate values into the destination buffer |
99 | 1.37M | if (do_average) { |
100 | 351k | const __m256i data_ref_0 = |
101 | 351k | load_line2_avx2(&dst_data[j], &dst_data[j + dst_stride]); |
102 | 351k | const __m256i comp_avg_res = |
103 | 351k | comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); |
104 | | |
105 | 351k | const __m256i round_result = convolve_rounding( |
106 | 351k | &comp_avg_res, &offset_const, &rounding_const, rounding_shift); |
107 | | |
108 | 351k | const __m256i res_8 = _mm256_packus_epi16(round_result, round_result); |
109 | 351k | const __m128i res_0 = _mm256_castsi256_si128(res_8); |
110 | 351k | const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); |
111 | | |
112 | 351k | if (w > 4) { |
113 | 271k | _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); |
114 | 271k | _mm_storel_epi64( |
115 | 271k | (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); |
116 | 271k | } else { |
117 | 80.0k | *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0); |
118 | 80.0k | *(int *)(&dst0[i * dst_stride0 + j + dst_stride0]) = |
119 | 80.0k | _mm_cvtsi128_si32(res_1); |
120 | 80.0k | } |
121 | 1.01M | } else { |
122 | 1.01M | const __m128i res_0 = _mm256_castsi256_si128(res_unsigned); |
123 | 1.01M | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); |
124 | | |
125 | 1.01M | const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1); |
126 | 1.01M | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), |
127 | 1.01M | res_1); |
128 | 1.01M | } |
129 | 1.37M | } |
130 | 498k | } |
131 | 251k | } else { |
132 | 251k | const int fo_horiz = filter_params_x->taps / 2 - 1; |
133 | 251k | const uint8_t *const src_ptr = src - fo_horiz; |
134 | | |
135 | 251k | filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); |
136 | 251k | filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); |
137 | 2.08M | for (i = 0; i < h; i += 2) { |
138 | 1.83M | const uint8_t *src_data = src_ptr + i * src_stride; |
139 | 1.83M | CONV_BUF_TYPE *dst_data = dst + i * dst_stride; |
140 | 8.48M | for (j = 0; j < w; j += 8) { |
141 | 6.65M | const __m256i data = |
142 | 6.65M | load_line2_avx2(&src_data[j], &src_data[j + src_stride]); |
143 | | |
144 | 6.65M | __m256i res = convolve_lowbd_x(data, coeffs, filt); |
145 | | |
146 | 6.65M | res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const), round_shift); |
147 | | |
148 | 6.65M | res = _mm256_slli_epi16(res, bits); |
149 | | |
150 | 6.65M | const __m256i res_unsigned = _mm256_add_epi16(res, offset_const); |
151 | | |
152 | | // Accumulate values into the destination buffer |
153 | 6.65M | if (do_average) { |
154 | 2.00M | const __m256i data_ref_0 = |
155 | 2.00M | load_line2_avx2(&dst_data[j], &dst_data[j + dst_stride]); |
156 | 2.00M | const __m256i comp_avg_res = |
157 | 2.00M | comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); |
158 | | |
159 | 2.00M | const __m256i round_result = convolve_rounding( |
160 | 2.00M | &comp_avg_res, &offset_const, &rounding_const, rounding_shift); |
161 | | |
162 | 2.00M | const __m256i res_8 = _mm256_packus_epi16(round_result, round_result); |
163 | 2.00M | const __m128i res_0 = _mm256_castsi256_si128(res_8); |
164 | 2.00M | const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); |
165 | | |
166 | 2.00M | if (w > 4) { |
167 | 2.00M | _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); |
168 | 2.00M | _mm_storel_epi64( |
169 | 2.00M | (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); |
170 | 2.00M | } else { |
171 | 131 | *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0); |
172 | 131 | *(int *)(&dst0[i * dst_stride0 + j + dst_stride0]) = |
173 | 131 | _mm_cvtsi128_si32(res_1); |
174 | 131 | } |
175 | 4.64M | } else { |
176 | 4.64M | const __m128i res_0 = _mm256_castsi256_si128(res_unsigned); |
177 | 4.64M | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); |
178 | | |
179 | 4.64M | const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1); |
180 | 4.64M | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), |
181 | 4.64M | res_1); |
182 | 4.64M | } |
183 | 6.65M | } |
184 | 1.83M | } |
185 | 251k | } |
186 | 367k | } |
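A rough scalar model of the value each lane of the x-only path writes to the CONV_BUF; this is a sketch under the assumptions bd == 8, an 8-tap filter and FILTER_BITS == 7, not the reference C implementation. The shift by round_0 - 1 with a halved round constant above compensates for prepare_coeffs_lowbd() having divided the (all-even) filter taps by 2; in the do_average branch this value is then blended with the existing CONV_BUF sample, the offset bias is removed, and the result is rounded by rounding_shift and packed back to 8 bits.

/* Hypothetical scalar helper, for illustration only. */
static INLINE CONV_BUF_TYPE dist_wtd_x_sample_sketch(const uint8_t *src_px,
                                                     const int16_t *filter,
                                                     const ConvolveParams *cp) {
  const int bits = FILTER_BITS - cp->round_1;
  const int offset_0 = 8 + 2 * FILTER_BITS - cp->round_0 - cp->round_1;
  const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
  int32_t sum = 0;
  for (int k = 0; k < 8; ++k) sum += filter[k] * src_px[k - 3];  // fo_horiz == 3
  const int32_t res = ((sum + (1 << (cp->round_0 - 1))) >> cp->round_0) << bits;
  return (CONV_BUF_TYPE)(res + offset);  // bias keeps the 16-bit value non-negative
}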
187 | | |
188 | | void av1_dist_wtd_convolve_y_avx2(const uint8_t *src, int src_stride, |
189 | | uint8_t *dst0, int dst_stride0, int w, int h, |
190 | | const InterpFilterParams *filter_params_y, |
191 | | const int subpel_y_qn, |
192 | 288k | ConvolveParams *conv_params) { |
193 | 288k | CONV_BUF_TYPE *dst = conv_params->dst; |
194 | 288k | int dst_stride = conv_params->dst_stride; |
195 | 288k | const int bd = 8; |
196 | 288k | int i, j, is_vert_4tap = 0; |
197 | | // +1 to compensate for dividing the filter coeffs by 2 |
198 | 288k | const int left_shift = FILTER_BITS - conv_params->round_0 + 1; |
199 | 288k | const __m256i round_const = |
200 | 288k | _mm256_set1_epi32((1 << conv_params->round_1) >> 1); |
201 | 288k | const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1); |
202 | 288k | const __m256i wt = unpack_weights_avx2(conv_params); |
203 | 288k | const int do_average = conv_params->do_average; |
204 | 288k | const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; |
205 | 288k | const int offset_0 = |
206 | 288k | bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; |
207 | 288k | const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); |
208 | 288k | const __m256i offset_const = _mm256_set1_epi16(offset); |
209 | 288k | const int offset_1 = (1 << (bd + FILTER_BITS - 2)); |
210 | 288k | const __m256i offset_const_1 = _mm256_set1_epi16(offset_1); |
211 | 288k | const __m256i offset_const_2 = _mm256_set1_epi16((1 << offset_0)); |
212 | 288k | const int rounding_shift = |
213 | 288k | 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; |
214 | 288k | const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1); |
215 | 288k | const __m256i zero = _mm256_setzero_si256(); |
216 | 288k | __m256i coeffs[4], s[8]; |
217 | | |
218 | 288k | assert((FILTER_BITS - conv_params->round_0) >= 0); |
219 | | |
220 | 0 | prepare_coeffs_lowbd(filter_params_y, subpel_y_qn, coeffs); |
221 | | |
222 | | // Condition for checking valid vert_filt taps |
223 | 288k | if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs[0], coeffs[3]), 0))) |
224 | 102k | is_vert_4tap = 1; |
225 | | |
226 | 288k | if (is_vert_4tap) { |
227 | 102k | const int fo_vert = 1; |
228 | 102k | const uint8_t *const src_ptr = src - fo_vert * src_stride; |
229 | 208k | for (j = 0; j < w; j += 16) { |
230 | 105k | const uint8_t *data = &src_ptr[j]; |
231 | 105k | __m256i src4; |
232 | | // Load lines a and b. Line a to lower 128, line b to upper 128 |
233 | 105k | { |
234 | 105k | __m256i src_ab[4]; |
235 | 105k | __m256i src_a[5]; |
236 | 105k | src_a[0] = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); |
237 | 528k | for (int kk = 0; kk < 4; ++kk) { |
238 | 423k | data += src_stride; |
239 | 423k | src_a[kk + 1] = |
240 | 423k | _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); |
241 | 423k | src_ab[kk] = |
242 | 423k | _mm256_permute2x128_si256(src_a[kk], src_a[kk + 1], 0x20); |
243 | 423k | } |
244 | 105k | src4 = src_a[4]; |
245 | 105k | s[0] = _mm256_unpacklo_epi8(src_ab[0], src_ab[1]); |
246 | 105k | s[1] = _mm256_unpacklo_epi8(src_ab[2], src_ab[3]); |
247 | | |
248 | 105k | s[3] = _mm256_unpackhi_epi8(src_ab[0], src_ab[1]); |
249 | 105k | s[4] = _mm256_unpackhi_epi8(src_ab[2], src_ab[3]); |
250 | 105k | } |
251 | | |
252 | 464k | for (i = 0; i < h; i += 2) { |
253 | 358k | data = &src_ptr[(i + 5) * src_stride + j]; |
254 | 358k | const __m256i src5 = |
255 | 358k | _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); |
256 | 358k | const __m256i src_45a = _mm256_permute2x128_si256(src4, src5, 0x20); |
257 | | |
258 | 358k | src4 = _mm256_castsi128_si256( |
259 | 358k | _mm_loadu_si128((__m128i *)(data + src_stride))); |
260 | 358k | const __m256i src_56a = _mm256_permute2x128_si256(src5, src4, 0x20); |
261 | | |
262 | 358k | s[2] = _mm256_unpacklo_epi8(src_45a, src_56a); |
263 | 358k | s[5] = _mm256_unpackhi_epi8(src_45a, src_56a); |
264 | | |
265 | 358k | __m256i res_lo = convolve_lowbd_4tap(s, coeffs + 1); |
266 | | |
267 | 358k | res_lo = _mm256_add_epi16(res_lo, offset_const_1); |
268 | | |
269 | 358k | const __m256i res_lo_0_32b = _mm256_unpacklo_epi16(res_lo, zero); |
270 | 358k | const __m256i res_lo_0_shift = |
271 | 358k | _mm256_slli_epi32(res_lo_0_32b, left_shift); |
272 | 358k | const __m256i res_lo_0_round = _mm256_sra_epi32( |
273 | 358k | _mm256_add_epi32(res_lo_0_shift, round_const), round_shift); |
274 | | |
275 | 358k | const __m256i res_lo_1_32b = _mm256_unpackhi_epi16(res_lo, zero); |
276 | 358k | const __m256i res_lo_1_shift = |
277 | 358k | _mm256_slli_epi32(res_lo_1_32b, left_shift); |
278 | 358k | const __m256i res_lo_1_round = _mm256_sra_epi32( |
279 | 358k | _mm256_add_epi32(res_lo_1_shift, round_const), round_shift); |
280 | | |
281 | 358k | const __m256i res_lo_round = |
282 | 358k | _mm256_packs_epi32(res_lo_0_round, res_lo_1_round); |
283 | | |
284 | 358k | const __m256i res_lo_unsigned = |
285 | 358k | _mm256_add_epi16(res_lo_round, offset_const_2); |
286 | | |
287 | 358k | if (w - j < 16) { |
288 | 205k | if (do_average) { |
289 | 80.4k | const __m256i data_ref_0 = |
290 | 80.4k | load_line2_avx2(&dst[i * dst_stride + j], |
291 | 80.4k | &dst[i * dst_stride + j + dst_stride]); |
292 | 80.4k | const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_lo_unsigned, |
293 | 80.4k | &wt, use_dist_wtd_comp_avg); |
294 | | |
295 | 80.4k | const __m256i round_result = convolve_rounding( |
296 | 80.4k | &comp_avg_res, &offset_const, &rounding_const, rounding_shift); |
297 | | |
298 | 80.4k | const __m256i res_8 = |
299 | 80.4k | _mm256_packus_epi16(round_result, round_result); |
300 | 80.4k | const __m128i res_0 = _mm256_castsi256_si128(res_8); |
301 | 80.4k | const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); |
302 | | |
303 | 80.4k | if (w - j > 4) { |
304 | 38.7k | _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); |
305 | 38.7k | _mm_storel_epi64( |
306 | 38.7k | (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), |
307 | 38.7k | res_1); |
308 | 41.6k | } else { |
309 | 41.6k | *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0); |
310 | 41.6k | *(int *)(&dst0[i * dst_stride0 + j + dst_stride0]) = |
311 | 41.6k | _mm_cvtsi128_si32(res_1); |
312 | 41.6k | } |
313 | 125k | } else { |
314 | 125k | const __m128i res_0 = _mm256_castsi256_si128(res_lo_unsigned); |
315 | 125k | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); |
316 | | |
317 | 125k | const __m128i res_1 = _mm256_extracti128_si256(res_lo_unsigned, 1); |
318 | 125k | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), |
319 | 125k | res_1); |
320 | 125k | } |
321 | 205k | } else { |
322 | 153k | __m256i res_hi = convolve_lowbd_4tap(s + 3, coeffs + 1); |
323 | | |
324 | 153k | res_hi = _mm256_add_epi16(res_hi, offset_const_1); |
325 | | |
326 | 153k | const __m256i res_hi_0_32b = _mm256_unpacklo_epi16(res_hi, zero); |
327 | 153k | const __m256i res_hi_0_shift = |
328 | 153k | _mm256_slli_epi32(res_hi_0_32b, left_shift); |
329 | 153k | const __m256i res_hi_0_round = _mm256_sra_epi32( |
330 | 153k | _mm256_add_epi32(res_hi_0_shift, round_const), round_shift); |
331 | | |
332 | 153k | const __m256i res_hi_1_32b = _mm256_unpackhi_epi16(res_hi, zero); |
333 | 153k | const __m256i res_hi_1_shift = |
334 | 153k | _mm256_slli_epi32(res_hi_1_32b, left_shift); |
335 | 153k | const __m256i res_hi_1_round = _mm256_sra_epi32( |
336 | 153k | _mm256_add_epi32(res_hi_1_shift, round_const), round_shift); |
337 | | |
338 | 153k | const __m256i res_hi_round = |
339 | 153k | _mm256_packs_epi32(res_hi_0_round, res_hi_1_round); |
340 | | |
341 | 153k | const __m256i res_hi_unsigned = |
342 | 153k | _mm256_add_epi16(res_hi_round, offset_const_2); |
343 | | |
344 | 153k | if (do_average) { |
345 | 50.4k | const __m256i data_ref_0_lo = |
346 | 50.4k | load_line2_avx2(&dst[i * dst_stride + j], |
347 | 50.4k | &dst[i * dst_stride + j + dst_stride]); |
348 | | |
349 | 50.4k | const __m256i data_ref_0_hi = |
350 | 50.4k | load_line2_avx2(&dst[i * dst_stride + j + 8], |
351 | 50.4k | &dst[i * dst_stride + j + 8 + dst_stride]); |
352 | | |
353 | 50.4k | const __m256i comp_avg_res_lo = comp_avg( |
354 | 50.4k | &data_ref_0_lo, &res_lo_unsigned, &wt, use_dist_wtd_comp_avg); |
355 | | |
356 | 50.4k | const __m256i comp_avg_res_hi = comp_avg( |
357 | 50.4k | &data_ref_0_hi, &res_hi_unsigned, &wt, use_dist_wtd_comp_avg); |
358 | | |
359 | 50.4k | const __m256i round_result_lo = |
360 | 50.4k | convolve_rounding(&comp_avg_res_lo, &offset_const, |
361 | 50.4k | &rounding_const, rounding_shift); |
362 | | |
363 | 50.4k | const __m256i round_result_hi = |
364 | 50.4k | convolve_rounding(&comp_avg_res_hi, &offset_const, |
365 | 50.4k | &rounding_const, rounding_shift); |
366 | | |
367 | 50.4k | const __m256i res_8 = |
368 | 50.4k | _mm256_packus_epi16(round_result_lo, round_result_hi); |
369 | 50.4k | const __m128i res_0 = _mm256_castsi256_si128(res_8); |
370 | 50.4k | const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); |
371 | | |
372 | 50.4k | _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); |
373 | 50.4k | _mm_store_si128( |
374 | 50.4k | (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); |
375 | | |
376 | 102k | } else { |
377 | 102k | const __m128i res_lo_0 = _mm256_castsi256_si128(res_lo_unsigned); |
378 | 102k | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_lo_0); |
379 | | |
380 | 102k | const __m128i res_lo_1 = |
381 | 102k | _mm256_extracti128_si256(res_lo_unsigned, 1); |
382 | 102k | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), |
383 | 102k | res_lo_1); |
384 | | |
385 | 102k | const __m128i res_hi_0 = _mm256_castsi256_si128(res_hi_unsigned); |
386 | 102k | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + 8]), |
387 | 102k | res_hi_0); |
388 | | |
389 | 102k | const __m128i res_hi_1 = |
390 | 102k | _mm256_extracti128_si256(res_hi_unsigned, 1); |
391 | 102k | _mm_store_si128( |
392 | 102k | (__m128i *)(&dst[i * dst_stride + j + 8 + dst_stride]), |
393 | 102k | res_hi_1); |
394 | 102k | } |
395 | 153k | } |
396 | 358k | s[0] = s[1]; |
397 | 358k | s[1] = s[2]; |
398 | | |
399 | 358k | s[3] = s[4]; |
400 | 358k | s[4] = s[5]; |
401 | 358k | } |
402 | 105k | } |
403 | 185k | } else { |
404 | 185k | const int fo_vert = filter_params_y->taps / 2 - 1; |
405 | 185k | const uint8_t *const src_ptr = src - fo_vert * src_stride; |
406 | 420k | for (j = 0; j < w; j += 16) { |
407 | 234k | const uint8_t *data = &src_ptr[j]; |
408 | 234k | __m256i src6; |
409 | | // Load lines a and b. Line a to lower 128, line b to upper 128 |
410 | 234k | { |
411 | 234k | __m256i src_ab[7]; |
412 | 234k | __m256i src_a[7]; |
413 | 234k | src_a[0] = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); |
414 | 1.64M | for (int kk = 0; kk < 6; ++kk) { |
415 | 1.40M | data += src_stride; |
416 | 1.40M | src_a[kk + 1] = |
417 | 1.40M | _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); |
418 | 1.40M | src_ab[kk] = |
419 | 1.40M | _mm256_permute2x128_si256(src_a[kk], src_a[kk + 1], 0x20); |
420 | 1.40M | } |
421 | 234k | src6 = src_a[6]; |
422 | 234k | s[0] = _mm256_unpacklo_epi8(src_ab[0], src_ab[1]); |
423 | 234k | s[1] = _mm256_unpacklo_epi8(src_ab[2], src_ab[3]); |
424 | 234k | s[2] = _mm256_unpacklo_epi8(src_ab[4], src_ab[5]); |
425 | 234k | s[4] = _mm256_unpackhi_epi8(src_ab[0], src_ab[1]); |
426 | 234k | s[5] = _mm256_unpackhi_epi8(src_ab[2], src_ab[3]); |
427 | 234k | s[6] = _mm256_unpackhi_epi8(src_ab[4], src_ab[5]); |
428 | 234k | } |
429 | | |
430 | 2.63M | for (i = 0; i < h; i += 2) { |
431 | 2.40M | data = &src_ptr[(i + 7) * src_stride + j]; |
432 | 2.40M | const __m256i src7 = |
433 | 2.40M | _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); |
434 | 2.40M | const __m256i src_67a = _mm256_permute2x128_si256(src6, src7, 0x20); |
435 | | |
436 | 2.40M | src6 = _mm256_castsi128_si256( |
437 | 2.40M | _mm_loadu_si128((__m128i *)(data + src_stride))); |
438 | 2.40M | const __m256i src_78a = _mm256_permute2x128_si256(src7, src6, 0x20); |
439 | | |
440 | 2.40M | s[3] = _mm256_unpacklo_epi8(src_67a, src_78a); |
441 | 2.40M | s[7] = _mm256_unpackhi_epi8(src_67a, src_78a); |
442 | | |
443 | 2.40M | __m256i res_lo = convolve_lowbd(s, coeffs); |
444 | | |
445 | 2.40M | res_lo = _mm256_add_epi16(res_lo, offset_const_1); |
446 | | |
447 | 2.40M | const __m256i res_lo_0_32b = _mm256_unpacklo_epi16(res_lo, zero); |
448 | 2.40M | const __m256i res_lo_0_shift = |
449 | 2.40M | _mm256_slli_epi32(res_lo_0_32b, left_shift); |
450 | 2.40M | const __m256i res_lo_0_round = _mm256_sra_epi32( |
451 | 2.40M | _mm256_add_epi32(res_lo_0_shift, round_const), round_shift); |
452 | | |
453 | 2.40M | const __m256i res_lo_1_32b = _mm256_unpackhi_epi16(res_lo, zero); |
454 | 2.40M | const __m256i res_lo_1_shift = |
455 | 2.40M | _mm256_slli_epi32(res_lo_1_32b, left_shift); |
456 | 2.40M | const __m256i res_lo_1_round = _mm256_sra_epi32( |
457 | 2.40M | _mm256_add_epi32(res_lo_1_shift, round_const), round_shift); |
458 | | |
459 | 2.40M | const __m256i res_lo_round = |
460 | 2.40M | _mm256_packs_epi32(res_lo_0_round, res_lo_1_round); |
461 | | |
462 | 2.40M | const __m256i res_lo_unsigned = |
463 | 2.40M | _mm256_add_epi16(res_lo_round, offset_const_2); |
464 | | |
465 | 2.40M | if (w - j < 16) { |
466 | 577k | if (do_average) { |
467 | 178k | const __m256i data_ref_0 = |
468 | 178k | load_line2_avx2(&dst[i * dst_stride + j], |
469 | 178k | &dst[i * dst_stride + j + dst_stride]); |
470 | 178k | const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_lo_unsigned, |
471 | 178k | &wt, use_dist_wtd_comp_avg); |
472 | | |
473 | 178k | const __m256i round_result = convolve_rounding( |
474 | 178k | &comp_avg_res, &offset_const, &rounding_const, rounding_shift); |
475 | | |
476 | 178k | const __m256i res_8 = |
477 | 178k | _mm256_packus_epi16(round_result, round_result); |
478 | 178k | const __m128i res_0 = _mm256_castsi256_si128(res_8); |
479 | 178k | const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); |
480 | | |
481 | 178k | if (w - j > 4) { |
482 | 139k | _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); |
483 | 139k | _mm_storel_epi64( |
484 | 139k | (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), |
485 | 139k | res_1); |
486 | 139k | } else { |
487 | 38.6k | *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0); |
488 | 38.6k | *(int *)(&dst0[i * dst_stride0 + j + dst_stride0]) = |
489 | 38.6k | _mm_cvtsi128_si32(res_1); |
490 | 38.6k | } |
491 | 399k | } else { |
492 | 399k | const __m128i res_0 = _mm256_castsi256_si128(res_lo_unsigned); |
493 | 399k | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); |
494 | | |
495 | 399k | const __m128i res_1 = _mm256_extracti128_si256(res_lo_unsigned, 1); |
496 | 399k | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), |
497 | 399k | res_1); |
498 | 399k | } |
499 | 1.82M | } else { |
500 | 1.82M | __m256i res_hi = convolve_lowbd(s + 4, coeffs); |
501 | | |
502 | 1.82M | res_hi = _mm256_add_epi16(res_hi, offset_const_1); |
503 | | |
504 | 1.82M | const __m256i res_hi_0_32b = _mm256_unpacklo_epi16(res_hi, zero); |
505 | 1.82M | const __m256i res_hi_0_shift = |
506 | 1.82M | _mm256_slli_epi32(res_hi_0_32b, left_shift); |
507 | 1.82M | const __m256i res_hi_0_round = _mm256_sra_epi32( |
508 | 1.82M | _mm256_add_epi32(res_hi_0_shift, round_const), round_shift); |
509 | | |
510 | 1.82M | const __m256i res_hi_1_32b = _mm256_unpackhi_epi16(res_hi, zero); |
511 | 1.82M | const __m256i res_hi_1_shift = |
512 | 1.82M | _mm256_slli_epi32(res_hi_1_32b, left_shift); |
513 | 1.82M | const __m256i res_hi_1_round = _mm256_sra_epi32( |
514 | 1.82M | _mm256_add_epi32(res_hi_1_shift, round_const), round_shift); |
515 | | |
516 | 1.82M | const __m256i res_hi_round = |
517 | 1.82M | _mm256_packs_epi32(res_hi_0_round, res_hi_1_round); |
518 | | |
519 | 1.82M | const __m256i res_hi_unsigned = |
520 | 1.82M | _mm256_add_epi16(res_hi_round, offset_const_2); |
521 | | |
522 | 1.82M | if (do_average) { |
523 | 575k | const __m256i data_ref_0_lo = |
524 | 575k | load_line2_avx2(&dst[i * dst_stride + j], |
525 | 575k | &dst[i * dst_stride + j + dst_stride]); |
526 | | |
527 | 575k | const __m256i data_ref_0_hi = |
528 | 575k | load_line2_avx2(&dst[i * dst_stride + j + 8], |
529 | 575k | &dst[i * dst_stride + j + 8 + dst_stride]); |
530 | | |
531 | 575k | const __m256i comp_avg_res_lo = comp_avg( |
532 | 575k | &data_ref_0_lo, &res_lo_unsigned, &wt, use_dist_wtd_comp_avg); |
533 | | |
534 | 575k | const __m256i comp_avg_res_hi = comp_avg( |
535 | 575k | &data_ref_0_hi, &res_hi_unsigned, &wt, use_dist_wtd_comp_avg); |
536 | | |
537 | 575k | const __m256i round_result_lo = |
538 | 575k | convolve_rounding(&comp_avg_res_lo, &offset_const, |
539 | 575k | &rounding_const, rounding_shift); |
540 | | |
541 | 575k | const __m256i round_result_hi = |
542 | 575k | convolve_rounding(&comp_avg_res_hi, &offset_const, |
543 | 575k | &rounding_const, rounding_shift); |
544 | | |
545 | 575k | const __m256i res_8 = |
546 | 575k | _mm256_packus_epi16(round_result_lo, round_result_hi); |
547 | 575k | const __m128i res_0 = _mm256_castsi256_si128(res_8); |
548 | 575k | const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); |
549 | | |
550 | 575k | _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); |
551 | 575k | _mm_store_si128( |
552 | 575k | (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); |
553 | | |
554 | 1.24M | } else { |
555 | 1.24M | const __m128i res_lo_0 = _mm256_castsi256_si128(res_lo_unsigned); |
556 | 1.24M | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_lo_0); |
557 | | |
558 | 1.24M | const __m128i res_lo_1 = |
559 | 1.24M | _mm256_extracti128_si256(res_lo_unsigned, 1); |
560 | 1.24M | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), |
561 | 1.24M | res_lo_1); |
562 | | |
563 | 1.24M | const __m128i res_hi_0 = _mm256_castsi256_si128(res_hi_unsigned); |
564 | 1.24M | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + 8]), |
565 | 1.24M | res_hi_0); |
566 | | |
567 | 1.24M | const __m128i res_hi_1 = |
568 | 1.24M | _mm256_extracti128_si256(res_hi_unsigned, 1); |
569 | 1.24M | _mm_store_si128( |
570 | 1.24M | (__m128i *)(&dst[i * dst_stride + j + 8 + dst_stride]), |
571 | 1.24M | res_hi_1); |
572 | 1.24M | } |
573 | 1.82M | } |
574 | 2.40M | s[0] = s[1]; |
575 | 2.40M | s[1] = s[2]; |
576 | 2.40M | s[2] = s[3]; |
577 | | |
578 | 2.40M | s[4] = s[5]; |
579 | 2.40M | s[5] = s[6]; |
580 | 2.40M | s[6] = s[7]; |
581 | 2.40M | } |
582 | 234k | } |
583 | 185k | } |
584 | 288k | } |
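A worked check of the bias bookkeeping in the vertical path above, derived from the constants in the function rather than added code:

/* offset_const_1 = 1 << (bd + FILTER_BITS - 2) is added before widening so the
 * 16-bit intermediate stays non-negative.  After << left_shift and the rounded
 * >> round_1 that bias becomes
 *   1 << (bd + 2 * FILTER_BITS - 1 - round_0 - round_1) == 1 << (offset_0 - 1),
 * and adding offset_const_2 = 1 << offset_0 brings the stored value to the same
 * (1 << offset_0) + (1 << (offset_0 - 1)) bias that convolve_rounding()
 * subtracts, matching the horizontal path's `offset`.  The "+ 1" in left_shift
 * restores the factor of 2 removed when prepare_coeffs_lowbd() halved the
 * (all-even) filter taps. */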
585 | | |
586 | | void av1_dist_wtd_convolve_2d_avx2(const uint8_t *src, int src_stride, |
587 | | uint8_t *dst0, int dst_stride0, int w, int h, |
588 | | const InterpFilterParams *filter_params_x, |
589 | | const InterpFilterParams *filter_params_y, |
590 | | const int subpel_x_qn, const int subpel_y_qn, |
591 | 852k | ConvolveParams *conv_params) { |
592 | 852k | CONV_BUF_TYPE *dst = conv_params->dst; |
593 | 852k | int dst_stride = conv_params->dst_stride; |
594 | 852k | const int bd = 8; |
595 | | |
596 | 852k | DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]); |
597 | | |
598 | 852k | int im_stride = 8; |
599 | 852k | int i, is_horiz_4tap = 0, is_vert_4tap = 0; |
600 | 852k | const __m256i wt = unpack_weights_avx2(conv_params); |
601 | 852k | const int do_average = conv_params->do_average; |
602 | 852k | const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; |
603 | 852k | const int offset_0 = |
604 | 852k | bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; |
605 | 852k | const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); |
606 | 852k | const __m256i offset_const = _mm256_set1_epi16(offset); |
607 | 852k | const int rounding_shift = |
608 | 852k | 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; |
609 | 852k | const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1); |
610 | | |
611 | 852k | assert(conv_params->round_0 > 0); |
612 | | |
613 | 0 | const __m256i round_const_h = _mm256_set1_epi16( |
614 | 852k | ((1 << (conv_params->round_0 - 1)) >> 1) + (1 << (bd + FILTER_BITS - 2))); |
615 | 852k | const __m128i round_shift_h = _mm_cvtsi32_si128(conv_params->round_0 - 1); |
616 | | |
617 | 852k | const __m256i round_const_v = _mm256_set1_epi32( |
618 | 852k | ((1 << conv_params->round_1) >> 1) - |
619 | 852k | (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1))); |
620 | 852k | const __m128i round_shift_v = _mm_cvtsi32_si128(conv_params->round_1); |
621 | | |
622 | 852k | __m256i filt[4], coeffs_x[4], coeffs_y[4]; |
623 | | |
624 | 852k | filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2); |
625 | 852k | filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); |
626 | | |
627 | 852k | prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs_x); |
628 | 852k | prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y); |
629 | | |
630 | | // Condition for checking valid horz_filt taps |
631 | 852k | if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs_x[0], coeffs_x[3]), 0))) |
632 | 303k | is_horiz_4tap = 1; |
633 | | |
634 | | // Condition for checking valid vert_filt taps |
635 | 852k | if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs_y[0], coeffs_y[3]), 0))) |
636 | 338k | is_vert_4tap = 1; |
637 | | |
638 | 852k | if (is_horiz_4tap) { |
639 | 303k | int im_h = h + filter_params_y->taps - 1; |
640 | 303k | const int fo_vert = filter_params_y->taps / 2 - 1; |
641 | 303k | const int fo_horiz = 1; |
642 | 303k | const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; |
643 | 645k | for (int j = 0; j < w; j += 8) { |
644 | | /* Horizontal filter */ |
645 | 341k | const uint8_t *src_h = src_ptr + j; |
646 | 3.35M | for (i = 0; i < im_h; i += 2) { |
647 | 3.01M | __m256i data = |
648 | 3.01M | _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)src_h)); |
649 | 3.01M | if (i + 1 < im_h) |
650 | 2.67M | data = _mm256_inserti128_si256( |
651 | 3.01M | data, _mm_loadu_si128((__m128i *)(src_h + src_stride)), 1); |
652 | 3.01M | src_h += (src_stride << 1); |
653 | 3.01M | __m256i res = convolve_lowbd_x_4tap(data, coeffs_x + 1, filt); |
654 | | |
655 | 3.01M | res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), |
656 | 3.01M | round_shift_h); |
657 | | |
658 | 3.01M | _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); |
659 | 3.01M | } |
660 | 341k | DIST_WTD_CONVOLVE_VERTICAL_FILTER_8TAP; |
661 | 341k | } |
662 | 548k | } else if (is_vert_4tap) { |
663 | 127k | int im_h = h + 3; |
664 | 127k | const int fo_vert = 1; |
665 | 127k | const int fo_horiz = filter_params_x->taps / 2 - 1; |
666 | 127k | const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; |
667 | | |
668 | 127k | filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); |
669 | 127k | filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); |
670 | | |
671 | 301k | for (int j = 0; j < w; j += 8) { |
672 | | /* Horizontal filter */ |
673 | 174k | const uint8_t *src_h = src_ptr + j; |
674 | 174k | DIST_WTD_CONVOLVE_HORIZONTAL_FILTER_8TAP; |
675 | | |
676 | | /* Vertical filter */ |
677 | 174k | __m256i s[6]; |
678 | 174k | __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); |
679 | 174k | __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); |
680 | 174k | __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); |
681 | 174k | __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); |
682 | | |
683 | 174k | s[0] = _mm256_unpacklo_epi16(s0, s1); |
684 | 174k | s[1] = _mm256_unpacklo_epi16(s2, s3); |
685 | | |
686 | 174k | s[3] = _mm256_unpackhi_epi16(s0, s1); |
687 | 174k | s[4] = _mm256_unpackhi_epi16(s2, s3); |
688 | | |
689 | 615k | for (i = 0; i < h; i += 2) { |
690 | 441k | const int16_t *data = &im_block[i * im_stride]; |
691 | | |
692 | 441k | const __m256i s4 = |
693 | 441k | _mm256_loadu_si256((__m256i *)(data + 4 * im_stride)); |
694 | 441k | const __m256i s5 = |
695 | 441k | _mm256_loadu_si256((__m256i *)(data + 5 * im_stride)); |
696 | | |
697 | 441k | s[2] = _mm256_unpacklo_epi16(s4, s5); |
698 | 441k | s[5] = _mm256_unpackhi_epi16(s4, s5); |
699 | | |
700 | 441k | const __m256i res_a = convolve_4tap(s, coeffs_y + 1); |
701 | 441k | const __m256i res_a_round = _mm256_sra_epi32( |
702 | 441k | _mm256_add_epi32(res_a, round_const_v), round_shift_v); |
703 | | |
704 | 441k | if (w - j > 4) { |
705 | 441k | const __m256i res_b = convolve_4tap(s + 3, coeffs_y + 1); |
706 | 441k | const __m256i res_b_round = _mm256_sra_epi32( |
707 | 441k | _mm256_add_epi32(res_b, round_const_v), round_shift_v); |
708 | 441k | const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_b_round); |
709 | 441k | const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const); |
710 | | |
711 | 441k | if (do_average) { |
712 | 133k | const __m256i data_ref_0 = |
713 | 133k | load_line2_avx2(&dst[i * dst_stride + j], |
714 | 133k | &dst[i * dst_stride + j + dst_stride]); |
715 | 133k | const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned, |
716 | 133k | &wt, use_dist_wtd_comp_avg); |
717 | | |
718 | 133k | const __m256i round_result = convolve_rounding( |
719 | 133k | &comp_avg_res, &offset_const, &rounding_const, rounding_shift); |
720 | | |
721 | 133k | const __m256i res_8 = |
722 | 133k | _mm256_packus_epi16(round_result, round_result); |
723 | 133k | const __m128i res_0 = _mm256_castsi256_si128(res_8); |
724 | 133k | const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); |
725 | | |
726 | 133k | _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); |
727 | 133k | _mm_storel_epi64( |
728 | 133k | (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); |
729 | 307k | } else { |
730 | 307k | const __m128i res_0 = _mm256_castsi256_si128(res_unsigned); |
731 | 307k | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); |
732 | | |
733 | 307k | const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1); |
734 | 307k | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), |
735 | 307k | res_1); |
736 | 307k | } |
737 | 441k | } else { |
738 | 0 | const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_a_round); |
739 | 0 | const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const); |
740 | |
741 | 0 | if (do_average) { |
742 | 0 | const __m256i data_ref_0 = |
743 | 0 | load_line2_avx2(&dst[i * dst_stride + j], |
744 | 0 | &dst[i * dst_stride + j + dst_stride]); |
745 | |
746 | 0 | const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned, |
747 | 0 | &wt, use_dist_wtd_comp_avg); |
748 | |
749 | 0 | const __m256i round_result = convolve_rounding( |
750 | 0 | &comp_avg_res, &offset_const, &rounding_const, rounding_shift); |
751 | |
752 | 0 | const __m256i res_8 = |
753 | 0 | _mm256_packus_epi16(round_result, round_result); |
754 | 0 | const __m128i res_0 = _mm256_castsi256_si128(res_8); |
755 | 0 | const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); |
756 | |
757 | 0 | *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0); |
758 | 0 | *(int *)(&dst0[i * dst_stride0 + j + dst_stride0]) = |
759 | 0 | _mm_cvtsi128_si32(res_1); |
760 | |
761 | 0 | } else { |
762 | 0 | const __m128i res_0 = _mm256_castsi256_si128(res_unsigned); |
763 | 0 | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); |
764 | |
765 | 0 | const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1); |
766 | 0 | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), |
767 | 0 | res_1); |
768 | 0 | } |
769 | 0 | } |
770 | 441k | s[0] = s[1]; |
771 | 441k | s[1] = s[2]; |
772 | 441k | s[3] = s[4]; |
773 | 441k | s[4] = s[5]; |
774 | 441k | } |
775 | 174k | } |
776 | 421k | } else { |
777 | 421k | int im_h = h + filter_params_y->taps - 1; |
778 | 421k | const int fo_vert = filter_params_y->taps / 2 - 1; |
779 | 421k | const int fo_horiz = filter_params_x->taps / 2 - 1; |
780 | 421k | const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; |
781 | | |
782 | 421k | filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); |
783 | 421k | filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); |
784 | | |
785 | 1.31M | for (int j = 0; j < w; j += 8) { |
786 | | /* Horizontal filter */ |
787 | 897k | const uint8_t *src_h = src_ptr + j; |
788 | 897k | DIST_WTD_CONVOLVE_HORIZONTAL_FILTER_8TAP; |
789 | | |
790 | 897k | DIST_WTD_CONVOLVE_VERTICAL_FILTER_8TAP; |
791 | 897k | } |
792 | 421k | } |
793 | 852k | } |
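The 2-D path depends on the horizontal and vertical rounding constants cancelling each other's bias; a worked check, assuming the vertical filter taps sum to 1 << FILTER_BITS:

/* round_const_h adds 1 << (bd + FILTER_BITS - 2) before the (round_0 - 1)
 * shift, so each im_block sample carries a bias of
 *   1 << (bd + FILTER_BITS - 1 - round_0).
 * The vertical taps scale that bias by 1 << FILTER_BITS, giving
 *   1 << (bd + 2 * FILTER_BITS - round_0 - 1),
 * which is exactly the term that round_const_v subtracts again before the
 * round_1 shift. */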
794 | | |
795 | | #define DO_NO_AVG_2D_COPY_4X16(r0, c0, r1, c1, r2, c2, r3, c3) \ |
796 | 18.4M | do { \ |
797 | 18.4M | src_0 = _mm256_cvtepu8_epi16( \ |
798 | 18.4M | _mm_loadu_si128((__m128i *)(&src[r0 * src_stride + c0]))); \ |
799 | 18.4M | src_1 = _mm256_cvtepu8_epi16( \ |
800 | 18.4M | _mm_loadu_si128((__m128i *)(&src[r1 * src_stride + c1]))); \ |
801 | 18.4M | src_2 = _mm256_cvtepu8_epi16( \ |
802 | 18.4M | _mm_loadu_si128((__m128i *)(&src[r2 * src_stride + c2]))); \ |
803 | 18.4M | src_3 = _mm256_cvtepu8_epi16( \ |
804 | 18.4M | _mm_loadu_si128((__m128i *)(&src[r3 * src_stride + c3]))); \ |
805 | 18.4M | \ |
806 | 18.4M | src_0 = _mm256_slli_epi16(src_0, LEFT_SHIFT); \ |
807 | 18.4M | src_1 = _mm256_slli_epi16(src_1, LEFT_SHIFT); \ |
808 | 18.4M | src_2 = _mm256_slli_epi16(src_2, LEFT_SHIFT); \ |
809 | 18.4M | src_3 = _mm256_slli_epi16(src_3, LEFT_SHIFT); \ |
810 | 18.4M | \ |
811 | 18.4M | src_0 = _mm256_add_epi16(src_0, offset_const); \ |
812 | 18.4M | src_1 = _mm256_add_epi16(src_1, offset_const); \ |
813 | 18.4M | src_2 = _mm256_add_epi16(src_2, offset_const); \ |
814 | 18.4M | src_3 = _mm256_add_epi16(src_3, offset_const); \ |
815 | 18.4M | \ |
816 | 18.4M | _mm256_store_si256((__m256i *)(&dst[r0 * dst_stride + c0]), src_0); \ |
817 | 18.4M | _mm256_store_si256((__m256i *)(&dst[r1 * dst_stride + c1]), src_1); \ |
818 | 18.4M | _mm256_store_si256((__m256i *)(&dst[r2 * dst_stride + c2]), src_2); \ |
819 | 18.4M | _mm256_store_si256((__m256i *)(&dst[r3 * dst_stride + c3]), src_3); \ |
820 | 18.4M | } while (0) |
821 | | |
822 | 143M | #define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7) |
823 | | static AOM_INLINE void av1_dist_wtd_convolve_2d_no_avg_copy_avx2( |
824 | | const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, |
825 | 610k | int w, int h, const __m256i offset_const) { |
826 | 610k | int i = h; |
827 | 610k | if (w >= 16) { |
828 | 324k | __m256i src_0, src_1, src_2, src_3; |
829 | 324k | if (w == 128) { |
830 | 4.61M | do { |
831 | 4.61M | DO_NO_AVG_2D_COPY_4X16(0, 0, 0, 16, 0, 32, 0, 48); |
832 | 4.61M | DO_NO_AVG_2D_COPY_4X16(0, 64, 0, 80, 0, 96, 0, 112); |
833 | 4.61M | src += 1 * src_stride; |
834 | 4.61M | dst += 1 * dst_stride; |
835 | 4.61M | i -= 1; |
836 | 4.61M | } while (i); |
837 | 287k | } else if (w == 64) { |
838 | 7.19M | do { |
839 | 7.19M | DO_NO_AVG_2D_COPY_4X16(0, 0, 0, 16, 0, 32, 0, 48); |
840 | 7.19M | src += 1 * src_stride; |
841 | 7.19M | dst += 1 * dst_stride; |
842 | 7.19M | i -= 1; |
843 | 7.19M | } while (i); |
844 | 188k | } else if (w == 32) { |
845 | 1.63M | do { |
846 | 1.63M | DO_NO_AVG_2D_COPY_4X16(0, 0, 1, 0, 0, 16, 1, 16); |
847 | 1.63M | src += 2 * src_stride; |
848 | 1.63M | dst += 2 * dst_stride; |
849 | 1.63M | i -= 2; |
850 | 1.63M | } while (i); |
851 | 108k | } else if (w == 16) { |
852 | 388k | do { |
853 | 388k | DO_NO_AVG_2D_COPY_4X16(0, 0, 1, 0, 2, 0, 3, 0); |
854 | 388k | src += 4 * src_stride; |
855 | 388k | dst += 4 * dst_stride; |
856 | 388k | i -= 4; |
857 | 388k | } while (i); |
858 | 108k | } |
859 | 324k | } else { |
860 | 286k | const __m256i zero = _mm256_setzero_si256(); |
861 | 589k | do { |
862 | 589k | const __m128i src_row_0 = |
863 | 589k | _mm_loadl_epi64((__m128i *)(&src[0 * src_stride])); |
864 | 589k | const __m128i src_row_1 = |
865 | 589k | _mm_loadl_epi64((__m128i *)(&src[1 * src_stride])); |
866 | 589k | const __m128i src_row_2 = |
867 | 589k | _mm_loadl_epi64((__m128i *)(&src[2 * src_stride])); |
868 | 589k | const __m128i src_row_3 = |
869 | 589k | _mm_loadl_epi64((__m128i *)(&src[3 * src_stride])); |
870 | | |
871 | 589k | __m256i src_10 = _mm256_insertf128_si256( |
872 | 589k | _mm256_castsi128_si256(src_row_0), src_row_1, 1); |
873 | 589k | __m256i src_32 = _mm256_insertf128_si256( |
874 | 589k | _mm256_castsi128_si256(src_row_2), src_row_3, 1); |
875 | | |
876 | 589k | src_10 = _mm256_unpacklo_epi8(src_10, zero); |
877 | 589k | src_32 = _mm256_unpacklo_epi8(src_32, zero); |
878 | | |
879 | 589k | src_10 = _mm256_slli_epi16(src_10, LEFT_SHIFT); |
880 | 589k | src_32 = _mm256_slli_epi16(src_32, LEFT_SHIFT); |
881 | | |
882 | 589k | src_10 = _mm256_add_epi16(src_10, offset_const); |
883 | 589k | src_32 = _mm256_add_epi16(src_32, offset_const); |
884 | | |
885 | | // Accumulate values into the destination buffer |
886 | 589k | _mm_store_si128((__m128i *)(&dst[0 * dst_stride]), |
887 | 589k | _mm256_castsi256_si128(src_10)); |
888 | 589k | _mm_store_si128((__m128i *)(&dst[1 * dst_stride]), |
889 | 589k | _mm256_extracti128_si256(src_10, 1)); |
890 | 589k | _mm_store_si128((__m128i *)(&dst[2 * dst_stride]), |
891 | 589k | _mm256_castsi256_si128(src_32)); |
892 | 589k | _mm_store_si128((__m128i *)(&dst[3 * dst_stride]), |
893 | 589k | _mm256_extracti128_si256(src_32, 1)); |
894 | | |
895 | 589k | src += 4 * src_stride; |
896 | 589k | dst += 4 * dst_stride; |
897 | 589k | i -= 4; |
898 | 589k | } while (i); |
899 | 286k | } |
900 | 610k | } |
901 | | |
902 | | #define DO_AVG_2D_COPY_4X16(USE_DIST_WEIGHTED, r0, c0, r1, c1, r2, c2, r3, c3) \ |
903 | 16.9M | do { \ |
904 | 16.9M | src_0 = _mm256_cvtepu8_epi16( \ |
905 | 16.9M | _mm_loadu_si128((__m128i *)(&src[r0 * src_stride + c0]))); \ |
906 | 16.9M | src_1 = _mm256_cvtepu8_epi16( \ |
907 | 16.9M | _mm_loadu_si128((__m128i *)(&src[r1 * src_stride + c1]))); \ |
908 | 16.9M | src_2 = _mm256_cvtepu8_epi16( \ |
909 | 16.9M | _mm_loadu_si128((__m128i *)(&src[r2 * src_stride + c2]))); \ |
910 | 16.9M | src_3 = _mm256_cvtepu8_epi16( \ |
911 | 16.9M | _mm_loadu_si128((__m128i *)(&src[r3 * src_stride + c3]))); \ |
912 | 16.9M | \ |
913 | 16.9M | src_0 = _mm256_slli_epi16(src_0, LEFT_SHIFT); \ |
914 | 16.9M | src_1 = _mm256_slli_epi16(src_1, LEFT_SHIFT); \ |
915 | 16.9M | src_2 = _mm256_slli_epi16(src_2, LEFT_SHIFT); \ |
916 | 16.9M | src_3 = _mm256_slli_epi16(src_3, LEFT_SHIFT); \ |
917 | 16.9M | src_0 = _mm256_add_epi16(src_0, offset_const); \ |
918 | 16.9M | src_1 = _mm256_add_epi16(src_1, offset_const); \ |
919 | 16.9M | src_2 = _mm256_add_epi16(src_2, offset_const); \ |
920 | 16.9M | src_3 = _mm256_add_epi16(src_3, offset_const); \ |
921 | 16.9M | \ |
922 | 16.9M | ref_0 = _mm256_loadu_si256((__m256i *)(&dst[r0 * dst_stride + c0])); \ |
923 | 16.9M | ref_1 = _mm256_loadu_si256((__m256i *)(&dst[r1 * dst_stride + c1])); \ |
924 | 16.9M | ref_2 = _mm256_loadu_si256((__m256i *)(&dst[r2 * dst_stride + c2])); \ |
925 | 16.9M | ref_3 = _mm256_loadu_si256((__m256i *)(&dst[r3 * dst_stride + c3])); \ |
926 | 16.9M | \ |
927 | 16.9M | res_0 = comp_avg(&ref_0, &src_0, &wt, USE_DIST_WEIGHTED); \ |
928 | 16.9M | res_1 = comp_avg(&ref_1, &src_1, &wt, USE_DIST_WEIGHTED); \ |
929 | 16.9M | res_2 = comp_avg(&ref_2, &src_2, &wt, USE_DIST_WEIGHTED); \ |
930 | 16.9M | res_3 = comp_avg(&ref_3, &src_3, &wt, USE_DIST_WEIGHTED); \ |
931 | 16.9M | \ |
932 | 16.9M | res_0 = convolve_rounding(&res_0, &offset_const, &rounding_const, \ |
933 | 16.9M | rounding_shift); \ |
934 | 16.9M | res_1 = convolve_rounding(&res_1, &offset_const, &rounding_const, \ |
935 | 16.9M | rounding_shift); \ |
936 | 16.9M | res_2 = convolve_rounding(&res_2, &offset_const, &rounding_const, \ |
937 | 16.9M | rounding_shift); \ |
938 | 16.9M | res_3 = convolve_rounding(&res_3, &offset_const, &rounding_const, \ |
939 | 16.9M | rounding_shift); \ |
940 | 16.9M | \ |
941 | 16.9M | res_10 = _mm256_packus_epi16(res_0, res_1); \ |
942 | 16.9M | res_32 = _mm256_packus_epi16(res_2, res_3); \ |
943 | 16.9M | res_10 = _mm256_permute4x64_epi64(res_10, 0xD8); \ |
944 | 16.9M | res_32 = _mm256_permute4x64_epi64(res_32, 0xD8); \ |
945 | 16.9M | \ |
946 | 16.9M | _mm_store_si128((__m128i *)(&dst0[r0 * dst_stride0 + c0]), \ |
947 | 16.9M | _mm256_castsi256_si128(res_10)); \ |
948 | 16.9M | _mm_store_si128((__m128i *)(&dst0[r1 * dst_stride0 + c1]), \ |
949 | 16.9M | _mm256_extracti128_si256(res_10, 1)); \ |
950 | 16.9M | _mm_store_si128((__m128i *)(&dst0[r2 * dst_stride0 + c2]), \ |
951 | 16.9M | _mm256_castsi256_si128(res_32)); \ |
952 | 16.9M | _mm_store_si128((__m128i *)(&dst0[r3 * dst_stride0 + c3]), \ |
953 | 16.9M | _mm256_extracti128_si256(res_32, 1)); \ |
954 | 16.9M | } while (0) |
955 | | |
956 | | #define DO_AVG_2D_COPY(USE_DIST_WEIGHTED) \ |
957 | 404k | int i = h; \ |
958 | 404k | if (w >= 16) { \ |
959 | 243k | __m256i src_0, src_1, src_2, src_3; \ |
960 | 243k | __m256i ref_0, ref_1, ref_2, ref_3; \ |
961 | 243k | __m256i res_0, res_1, res_2, res_3; \ |
962 | 243k | __m256i res_10, res_32; \ |
963 | 243k | if (w == 128) { \ |
964 | 4.33M | do { \ |
965 | 4.33M | DO_AVG_2D_COPY_4X16(USE_DIST_WEIGHTED, 0, 0, 0, 16, 0, 32, 0, 48); \ |
966 | 4.33M | DO_AVG_2D_COPY_4X16(USE_DIST_WEIGHTED, 0, 64, 0, 80, 0, 96, 0, 112); \ |
967 | 4.33M | i -= 1; \ |
968 | 4.33M | src += 1 * src_stride; \ |
969 | 4.33M | dst += 1 * dst_stride; \ |
970 | 4.33M | dst0 += 1 * dst_stride0; \ |
971 | 4.33M | } while (i); \ |
972 | 209k | } else if (w == 64) { \ |
973 | 6.68M | do { \ |
974 | 6.68M | DO_AVG_2D_COPY_4X16(USE_DIST_WEIGHTED, 0, 0, 0, 16, 0, 32, 0, 48); \ |
975 | 6.68M | \ |
976 | 6.68M | i -= 1; \ |
977 | 6.68M | src += 1 * src_stride; \ |
978 | 6.68M | dst += 1 * dst_stride; \ |
979 | 6.68M | dst0 += 1 * dst_stride0; \ |
980 | 6.68M | } while (i); \ |
981 | 120k | } else if (w == 32) { \ |
982 | 1.38M | do { \ |
983 | 1.38M | DO_AVG_2D_COPY_4X16(USE_DIST_WEIGHTED, 0, 0, 1, 0, 0, 16, 1, 16); \ |
984 | 1.38M | \ |
985 | 1.38M | i -= 2; \ |
986 | 1.38M | src += 2 * src_stride; \ |
987 | 1.38M | dst += 2 * dst_stride; \ |
988 | 1.38M | dst0 += 2 * dst_stride0; \ |
989 | 1.38M | } while (i); \ |
990 | 60.7k | } else { \ |
991 | 60.7k | assert(w == 16); \ |
992 | 224k | do { \ |
993 | 224k | DO_AVG_2D_COPY_4X16(USE_DIST_WEIGHTED, 0, 0, 1, 0, 2, 0, 3, 0); \ |
994 | 224k | \ |
995 | 224k | i -= 4; \ |
996 | 224k | src += 4 * src_stride; \ |
997 | 224k | dst += 4 * dst_stride; \ |
998 | 224k | dst0 += 4 * dst_stride0; \ |
999 | 224k | } while (i); \ |
1000 | 60.7k | } \ |
1001 | 243k | } else if (w == 8) { \ |
1002 | 239k | do { \ |
1003 | 239k | const __m128i src_0 = \ |
1004 | 239k | _mm_loadl_epi64((__m128i *)(&src[0 * src_stride])); \ |
1005 | 239k | const __m128i src_1 = \ |
1006 | 239k | _mm_loadl_epi64((__m128i *)(&src[1 * src_stride])); \ |
1007 | 239k | const __m128i src_2 = \ |
1008 | 239k | _mm_loadl_epi64((__m128i *)(&src[2 * src_stride])); \ |
1009 | 239k | const __m128i src_3 = \ |
1010 | 239k | _mm_loadl_epi64((__m128i *)(&src[3 * src_stride])); \ |
1011 | 239k | __m256i src_10 = \ |
1012 | 239k | _mm256_insertf128_si256(_mm256_castsi128_si256(src_0), src_1, 1); \ |
1013 | 239k | __m256i src_32 = \ |
1014 | 239k | _mm256_insertf128_si256(_mm256_castsi128_si256(src_2), src_3, 1); \ |
1015 | 239k | \ |
1016 | 239k | src_10 = _mm256_unpacklo_epi8(src_10, zero); \ |
1017 | 239k | src_32 = _mm256_unpacklo_epi8(src_32, zero); \ |
1018 | 239k | \ |
1019 | 239k | src_10 = _mm256_slli_epi16(src_10, LEFT_SHIFT); \ |
1020 | 239k | src_32 = _mm256_slli_epi16(src_32, LEFT_SHIFT); \ |
1021 | 239k | \ |
1022 | 239k | src_10 = _mm256_add_epi16(src_10, offset_const); \ |
1023 | 239k | src_32 = _mm256_add_epi16(src_32, offset_const); \ |
1024 | 239k | \ |
1025 | 239k | const __m256i ref_10 = \ |
1026 | 239k | load_line2_avx2(&dst[0 * dst_stride], &dst[1 * dst_stride]); \ |
1027 | 239k | const __m256i ref_32 = \ |
1028 | 239k | load_line2_avx2(&dst[2 * dst_stride], &dst[3 * dst_stride]); \ |
1029 | 239k | __m256i res_10 = comp_avg(&ref_10, &src_10, &wt, USE_DIST_WEIGHTED); \ |
1030 | 239k | __m256i res_32 = comp_avg(&ref_32, &src_32, &wt, USE_DIST_WEIGHTED); \ |
1031 | 239k | \ |
1032 | 239k | res_10 = convolve_rounding(&res_10, &offset_const, &rounding_const, \ |
1033 | 239k | rounding_shift); \ |
1034 | 239k | res_32 = convolve_rounding(&res_32, &offset_const, &rounding_const, \ |
1035 | 239k | rounding_shift); \ |
1036 | 239k | \ |
1037 | 239k | __m256i res = _mm256_packus_epi16(res_10, res_32); \ |
1038 | 239k | const __m128i res_20 = _mm256_castsi256_si128(res); \ |
1039 | 239k | const __m128i res_31 = _mm256_extracti128_si256(res, 1); \ |
1040 | 239k | \ |
1041 | 239k | _mm_storel_epi64((__m128i *)(&dst0[0 * dst_stride0]), res_20); \ |
1042 | 239k | _mm_storel_epi64((__m128i *)((&dst0[1 * dst_stride0])), res_31); \ |
1043 | 239k | _mm_storeh_epi64((__m128i *)(&dst0[2 * dst_stride0]), res_20); \ |
1044 | 239k | _mm_storeh_epi64((__m128i *)((&dst0[3 * dst_stride0])), res_31); \ |
1045 | 239k | i -= 4; \ |
1046 | 239k | src += 4 * src_stride; \ |
1047 | 239k | dst += 4 * dst_stride; \ |
1048 | 239k | dst0 += 4 * dst_stride0; \ |
1049 | 239k | } while (i); \ |
1050 | 93.8k | } else { \ |
1051 | 67.2k | assert(w == 4); \ |
1052 | 99.4k | do { \ |
1053 | 99.4k | __m256i src_3210_8bit = \ |
1054 | 99.4k | _mm256_setr_epi32(loadu_int32(src + 0 * src_stride), \ |
1055 | 99.4k | loadu_int32(src + 1 * src_stride), 0, 0, \ |
1056 | 99.4k | loadu_int32(src + 2 * src_stride), \ |
1057 | 99.4k | loadu_int32(src + 3 * src_stride), 0, 0); \ |
1058 | 99.4k | \ |
1059 | 99.4k | __m256i src_3210 = _mm256_unpacklo_epi8(src_3210_8bit, zero); \ |
1060 | 99.4k | src_3210 = _mm256_slli_epi16(src_3210, LEFT_SHIFT); \ |
1061 | 99.4k | src_3210 = _mm256_add_epi16(src_3210, offset_const); \ |
1062 | 99.4k | \ |
1063 | 99.4k | __m256i ref_3210 = \ |
1064 | 99.4k | _mm256_setr_epi64x(*(int64_t *)(dst + 0 * dst_stride), \ |
1065 | 99.4k | *(int64_t *)(dst + 1 * dst_stride), \ |
1066 | 99.4k | *(int64_t *)(dst + 2 * dst_stride), \ |
1067 | 99.4k | *(int64_t *)(dst + 3 * dst_stride)); \ |
1068 | 99.4k | __m256i res_3210 = \ |
1069 | 99.4k | comp_avg(&ref_3210, &src_3210, &wt, USE_DIST_WEIGHTED); \ |
1070 | 99.4k | \ |
1071 | 99.4k | res_3210 = convolve_rounding(&res_3210, &offset_const, &rounding_const, \ |
1072 | 99.4k | rounding_shift); \ |
1073 | 99.4k | \ |
1074 | 99.4k | res_3210 = _mm256_packus_epi16(res_3210, res_3210); \ |
1075 | 99.4k | const __m128i res_10 = _mm256_castsi256_si128(res_3210); \ |
1076 | 99.4k | const __m128i res_32 = _mm256_extracti128_si256(res_3210, 1); \ |
1077 | 99.4k | \ |
1078 | 99.4k | *(int *)(&dst0[0 * dst_stride0]) = _mm_cvtsi128_si32(res_10); \ |
1079 | 99.4k | *(int *)(&dst0[2 * dst_stride0]) = _mm_cvtsi128_si32(res_32); \ |
1080 | 99.4k | *(int *)(&dst0[1 * dst_stride0]) = _mm_extract_epi32(res_10, 1); \ |
1081 | 99.4k | *(int *)(&dst0[3 * dst_stride0]) = _mm_extract_epi32(res_32, 1); \ |
1082 | 99.4k | i -= 4; \ |
1083 | 99.4k | src += 4 * src_stride; \ |
1084 | 99.4k | dst += 4 * dst_stride; \ |
1085 | 99.4k | dst0 += 4 * dst_stride0; \ |
1086 | 99.4k | } while (i); \ |
1087 | 67.2k | } |
1088 | | |
1089 | | void av1_dist_wtd_convolve_2d_copy_avx2(const uint8_t *src, int src_stride, |
1090 | | uint8_t *dst0, int dst_stride0, int w, |
1091 | 1.01M | int h, ConvolveParams *conv_params) { |
1092 | 1.01M | const int bd = 8; |
1093 | 1.01M | CONV_BUF_TYPE *dst = conv_params->dst; |
1094 | 1.01M | int dst_stride = conv_params->dst_stride; |
1095 | 1.01M | assert(conv_params->round_0 == 3); |
1096 | 0 | assert(conv_params->round_1 == 7); |
1097 | 0 | assert(w % 4 == 0); |
1098 | 0 | assert(h % 4 == 0); |
1099 | | |
1100 | 0 | const int do_average = conv_params->do_average; |
1101 | 1.01M | const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; |
1102 | 1.01M | const __m256i wt = unpack_weights_avx2(conv_params); |
1103 | 1.01M | const __m256i zero = _mm256_setzero_si256(); |
1104 | | |
1105 | 1.01M | const int offset_0 = |
1106 | 1.01M | bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; |
1107 | 1.01M | const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); |
1108 | 1.01M | const __m256i offset_const = _mm256_set1_epi16(offset); |
1109 | 1.01M | const int rounding_shift = |
1110 | 1.01M | 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; |
1111 | 1.01M | const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1); |
1112 | | |
1113 | 1.01M | if (do_average) { |
1114 | 404k | if (use_dist_wtd_comp_avg) { |
1115 | 82.3k | DO_AVG_2D_COPY(1) |
1116 | 322k | } else { |
1117 | 322k | DO_AVG_2D_COPY(0) |
1118 | 322k | } |
1119 | 610k | } else { |
1120 | 610k | av1_dist_wtd_convolve_2d_no_avg_copy_avx2(src, src_stride, dst, dst_stride, |
1121 | 610k | w, h, offset_const); |
1122 | 610k | } |
1123 | 1.01M | } |
1124 | | #undef LEFT_SHIFT |
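With the round values asserted in av1_dist_wtd_convolve_2d_copy_avx2 (round_0 == 3, round_1 == 7), bd == 8 and FILTER_BITS == 7, the copy-path constants reduce to fixed numbers; a worked evaluation:

/* LEFT_SHIFT     = 2 * 7 - 3 - 7          = 4
 * offset_0       = 8 + 2 * 7 - 3 - 7      = 12
 * offset         = (1 << 12) + (1 << 11)  = 6144
 * rounding_shift = 2 * 7 - 3 - 7          = 4
 * rounding_const = (1 << 4) >> 1          = 8
 * Each 8-bit source sample is therefore stored as 16 * src + 6144 in the
 * CONV_BUF; the averaging path later subtracts 6144 and shifts right by 4
 * (with rounding) before packing back to 8 bits. */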