/src/aom/av1/common/x86/jnt_convolve_avx2.c
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) 2018, Alliance for Open Media. All rights reserved. |
3 | | * |
4 | | * This source code is subject to the terms of the BSD 2 Clause License and |
5 | | * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
6 | | * was not distributed with this source code in the LICENSE file, you can |
7 | | * obtain it at www.aomedia.org/license/software. If the Alliance for Open |
8 | | * Media Patent License 1.0 was not distributed with this source code in the |
9 | | * PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
10 | | */ |
11 | | |
#include <emmintrin.h>
#include <immintrin.h>
#include <string.h>
14 | | |
15 | | #include "config/av1_rtcd.h" |
16 | | |
17 | | #include "aom_dsp/aom_dsp_common.h" |
18 | | #include "aom_dsp/aom_filter.h" |
19 | | #include "aom_dsp/x86/convolve_avx2.h" |
20 | | #include "aom_dsp/x86/convolve_common_intrin.h" |
21 | | #include "aom_dsp/x86/convolve_sse4_1.h" |
22 | | #include "aom_dsp/x86/mem_sse2.h" |
23 | | #include "aom_dsp/x86/synonyms_avx2.h" |
24 | | |
25 | | #include "av1/common/convolve.h" |
26 | | |
27 | 1.73M | static inline __m256i unpack_weights_avx2(ConvolveParams *conv_params) { |
28 | 1.73M | const int w0 = conv_params->fwd_offset; |
29 | 1.73M | const int w1 = conv_params->bck_offset; |
30 | 1.73M | const __m256i wt0 = _mm256_set1_epi16((int16_t)w0); |
31 | 1.73M | const __m256i wt1 = _mm256_set1_epi16((int16_t)w1); |
32 | 1.73M | const __m256i wt = _mm256_unpacklo_epi16(wt0, wt1); |
33 | 1.73M | return wt; |
34 | 1.73M | } |
35 | | |
36 | 10.5M | static inline __m256i load_line2_avx2(const void *a, const void *b) { |
37 | 10.5M | return _mm256_permute2x128_si256( |
38 | 10.5M | _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)a)), |
39 | 10.5M | _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)b)), 0x20); |
40 | 10.5M | } |
41 | | |
42 | | void av1_dist_wtd_convolve_x_avx2(const uint8_t *src, int src_stride, |
43 | | uint8_t *dst0, int dst_stride0, int w, int h, |
44 | | const InterpFilterParams *filter_params_x, |
45 | | const int subpel_x_qn, |
46 | 103k | ConvolveParams *conv_params) { |
47 | 103k | CONV_BUF_TYPE *dst = conv_params->dst; |
48 | 103k | int dst_stride = conv_params->dst_stride; |
49 | 103k | const int bd = 8; |
50 | 103k | int i, j, is_horiz_4tap = 0; |
51 | 103k | const int bits = FILTER_BITS - conv_params->round_1; |
52 | 103k | const __m256i wt = unpack_weights_avx2(conv_params); |
53 | 103k | const int do_average = conv_params->do_average; |
54 | 103k | const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; |
55 | 103k | const int offset_0 = |
56 | 103k | bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; |
57 | 103k | const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); |
58 | 103k | const __m256i offset_const = _mm256_set1_epi16(offset); |
59 | 103k | const int rounding_shift = |
60 | 103k | 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; |
61 | 103k | const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1); |
62 | | |
63 | 103k | assert(bits >= 0); |
64 | 103k | assert(conv_params->round_0 > 0); |
65 | | |
66 | 103k | const __m256i round_const = |
67 | 103k | _mm256_set1_epi16((1 << (conv_params->round_0 - 1)) >> 1); |
68 | 103k | const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0 - 1); |
69 | | |
70 | 103k | __m256i filt[4], coeffs[4]; |
71 | | |
72 | 103k | filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2); |
73 | 103k | filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); |
74 | | |
75 | 103k | prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs); |
76 | | |
77 | | // Condition for checking valid horz_filt taps |
78 | 103k | if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs[0], coeffs[3]), 0))) |
79 | 40.3k | is_horiz_4tap = 1; |
80 | | |
81 | | // horz_filt as 4 tap |
82 | 103k | if (is_horiz_4tap) { |
83 | 40.3k | const int fo_horiz = 1; |
84 | 40.3k | const uint8_t *const src_ptr = src - fo_horiz; |
85 | 277k | for (i = 0; i < h; i += 2) { |
86 | 236k | const uint8_t *src_data = src_ptr + i * src_stride; |
87 | 236k | CONV_BUF_TYPE *dst_data = dst + i * dst_stride; |
88 | 1.20M | for (j = 0; j < w; j += 8) { |
89 | 972k | const __m256i data = |
90 | 972k | load_line2_avx2(&src_data[j], &src_data[j + src_stride]); |
91 | | |
92 | 972k | __m256i res = convolve_lowbd_x_4tap(data, coeffs + 1, filt); |
93 | 972k | res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const), round_shift); |
94 | 972k | res = _mm256_slli_epi16(res, bits); |
95 | | |
96 | 972k | const __m256i res_unsigned = _mm256_add_epi16(res, offset_const); |
97 | | |
98 | | // Accumulate values into the destination buffer |
99 | 972k | if (do_average) { |
100 | 302k | const __m256i data_ref_0 = |
101 | 302k | load_line2_avx2(&dst_data[j], &dst_data[j + dst_stride]); |
102 | 302k | const __m256i comp_avg_res = |
103 | 302k | comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); |
104 | | |
105 | 302k | const __m256i round_result = convolve_rounding( |
106 | 302k | &comp_avg_res, &offset_const, &rounding_const, rounding_shift); |
107 | | |
108 | 302k | const __m256i res_8 = _mm256_packus_epi16(round_result, round_result); |
109 | 302k | const __m128i res_0 = _mm256_castsi256_si128(res_8); |
110 | 302k | const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); |
111 | | |
112 | 302k | if (w > 4) { |
113 | 273k | _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); |
114 | 273k | _mm_storel_epi64( |
115 | 273k | (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); |
116 | 273k | } else { |
117 | 29.1k | *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0); |
118 | 29.1k | *(int *)(&dst0[i * dst_stride0 + j + dst_stride0]) = |
119 | 29.1k | _mm_cvtsi128_si32(res_1); |
120 | 29.1k | } |
121 | 669k | } else { |
122 | 669k | const __m128i res_0 = _mm256_castsi256_si128(res_unsigned); |
123 | 669k | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); |
124 | | |
125 | 669k | const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1); |
126 | 669k | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), |
127 | 669k | res_1); |
128 | 669k | } |
129 | 972k | } |
130 | 236k | } |
131 | 63.4k | } else { |
132 | 63.4k | const int fo_horiz = filter_params_x->taps / 2 - 1; |
133 | 63.4k | const uint8_t *const src_ptr = src - fo_horiz; |
134 | | |
135 | 63.4k | filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); |
136 | 63.4k | filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); |
137 | 577k | for (i = 0; i < h; i += 2) { |
138 | 513k | const uint8_t *src_data = src_ptr + i * src_stride; |
139 | 513k | CONV_BUF_TYPE *dst_data = dst + i * dst_stride; |
140 | 2.81M | for (j = 0; j < w; j += 8) { |
141 | 2.30M | const __m256i data = |
142 | 2.30M | load_line2_avx2(&src_data[j], &src_data[j + src_stride]); |
143 | | |
144 | 2.30M | __m256i res = convolve_lowbd_x(data, coeffs, filt); |
145 | | |
146 | 2.30M | res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const), round_shift); |
147 | | |
148 | 2.30M | res = _mm256_slli_epi16(res, bits); |
149 | | |
150 | 2.30M | const __m256i res_unsigned = _mm256_add_epi16(res, offset_const); |
151 | | |
152 | | // Accumulate values into the destination buffer |
153 | 2.30M | if (do_average) { |
154 | 940k | const __m256i data_ref_0 = |
155 | 940k | load_line2_avx2(&dst_data[j], &dst_data[j + dst_stride]); |
156 | 940k | const __m256i comp_avg_res = |
157 | 940k | comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); |
158 | | |
159 | 940k | const __m256i round_result = convolve_rounding( |
160 | 940k | &comp_avg_res, &offset_const, &rounding_const, rounding_shift); |
161 | | |
162 | 940k | const __m256i res_8 = _mm256_packus_epi16(round_result, round_result); |
163 | 940k | const __m128i res_0 = _mm256_castsi256_si128(res_8); |
164 | 940k | const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); |
165 | | |
166 | 940k | if (w > 4) { |
167 | 940k | _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); |
168 | 940k | _mm_storel_epi64( |
169 | 940k | (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); |
170 | 940k | } else { |
171 | 118 | *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0); |
172 | 118 | *(int *)(&dst0[i * dst_stride0 + j + dst_stride0]) = |
173 | 118 | _mm_cvtsi128_si32(res_1); |
174 | 118 | } |
175 | 1.36M | } else { |
176 | 1.36M | const __m128i res_0 = _mm256_castsi256_si128(res_unsigned); |
177 | 1.36M | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); |
178 | | |
179 | 1.36M | const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1); |
180 | 1.36M | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), |
181 | 1.36M | res_1); |
182 | 1.36M | } |
183 | 2.30M | } |
184 | 513k | } |
185 | 63.4k | } |
186 | 103k | } |
187 | | |
188 | | void av1_dist_wtd_convolve_y_avx2(const uint8_t *src, int src_stride, |
189 | | uint8_t *dst0, int dst_stride0, int w, int h, |
190 | | const InterpFilterParams *filter_params_y, |
191 | | const int subpel_y_qn, |
192 | 70.1k | ConvolveParams *conv_params) { |
193 | 70.1k | CONV_BUF_TYPE *dst = conv_params->dst; |
194 | 70.1k | int dst_stride = conv_params->dst_stride; |
195 | 70.1k | const int bd = 8; |
196 | 70.1k | int i, j, is_vert_4tap = 0; |
197 | | // +1 to compensate for dividing the filter coeffs by 2 |
198 | 70.1k | const int left_shift = FILTER_BITS - conv_params->round_0 + 1; |
199 | 70.1k | const __m256i round_const = |
200 | 70.1k | _mm256_set1_epi32((1 << conv_params->round_1) >> 1); |
201 | 70.1k | const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1); |
202 | 70.1k | const __m256i wt = unpack_weights_avx2(conv_params); |
203 | 70.1k | const int do_average = conv_params->do_average; |
204 | 70.1k | const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; |
205 | 70.1k | const int offset_0 = |
206 | 70.1k | bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; |
207 | 70.1k | const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); |
208 | 70.1k | const __m256i offset_const = _mm256_set1_epi16(offset); |
209 | 70.1k | const int offset_1 = (1 << (bd + FILTER_BITS - 2)); |
210 | 70.1k | const __m256i offset_const_1 = _mm256_set1_epi16(offset_1); |
211 | 70.1k | const __m256i offset_const_2 = _mm256_set1_epi16((1 << offset_0)); |
212 | 70.1k | const int rounding_shift = |
213 | 70.1k | 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; |
214 | 70.1k | const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1); |
215 | 70.1k | const __m256i zero = _mm256_setzero_si256(); |
216 | 70.1k | __m256i coeffs[4], s[8]; |
217 | | |
218 | 70.1k | assert((FILTER_BITS - conv_params->round_0) >= 0); |
219 | | |
220 | 70.1k | prepare_coeffs_lowbd(filter_params_y, subpel_y_qn, coeffs); |
221 | | |
222 | | // Condition for checking valid vert_filt taps |
223 | 70.1k | if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs[0], coeffs[3]), 0))) |
224 | 33.5k | is_vert_4tap = 1; |
225 | | |
226 | 70.1k | if (is_vert_4tap) { |
227 | 33.5k | const int fo_vert = 1; |
228 | 33.5k | const uint8_t *const src_ptr = src - fo_vert * src_stride; |
229 | 69.5k | for (j = 0; j < w; j += 16) { |
230 | 35.9k | const uint8_t *data = &src_ptr[j]; |
231 | 35.9k | __m256i src4; |
232 | | // Load lines a and b. Line a to lower 128, line b to upper 128 |
233 | 35.9k | { |
234 | 35.9k | __m256i src_ab[4]; |
235 | 35.9k | __m256i src_a[5]; |
236 | 35.9k | src_a[0] = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); |
237 | 179k | for (int kk = 0; kk < 4; ++kk) { |
238 | 143k | data += src_stride; |
239 | 143k | src_a[kk + 1] = |
240 | 143k | _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); |
241 | 143k | src_ab[kk] = |
242 | 143k | _mm256_permute2x128_si256(src_a[kk], src_a[kk + 1], 0x20); |
243 | 143k | } |
244 | 35.9k | src4 = src_a[4]; |
245 | 35.9k | s[0] = _mm256_unpacklo_epi8(src_ab[0], src_ab[1]); |
246 | 35.9k | s[1] = _mm256_unpacklo_epi8(src_ab[2], src_ab[3]); |
247 | | |
248 | 35.9k | s[3] = _mm256_unpackhi_epi8(src_ab[0], src_ab[1]); |
249 | 35.9k | s[4] = _mm256_unpackhi_epi8(src_ab[2], src_ab[3]); |
250 | 35.9k | } |
251 | | |
252 | 232k | for (i = 0; i < h; i += 2) { |
253 | 196k | data = &src_ptr[(i + 5) * src_stride + j]; |
254 | 196k | const __m256i src5 = |
255 | 196k | _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); |
256 | 196k | const __m256i src_45a = _mm256_permute2x128_si256(src4, src5, 0x20); |
257 | | |
258 | 196k | src4 = _mm256_castsi128_si256( |
259 | 196k | _mm_loadu_si128((__m128i *)(data + src_stride))); |
260 | 196k | const __m256i src_56a = _mm256_permute2x128_si256(src5, src4, 0x20); |
261 | | |
262 | 196k | s[2] = _mm256_unpacklo_epi8(src_45a, src_56a); |
263 | 196k | s[5] = _mm256_unpackhi_epi8(src_45a, src_56a); |
264 | | |
265 | 196k | __m256i res_lo = convolve_lowbd_4tap(s, coeffs + 1); |
266 | | |
267 | 196k | res_lo = _mm256_add_epi16(res_lo, offset_const_1); |
268 | | |
269 | 196k | const __m256i res_lo_0_32b = _mm256_unpacklo_epi16(res_lo, zero); |
270 | 196k | const __m256i res_lo_0_shift = |
271 | 196k | _mm256_slli_epi32(res_lo_0_32b, left_shift); |
272 | 196k | const __m256i res_lo_0_round = _mm256_sra_epi32( |
273 | 196k | _mm256_add_epi32(res_lo_0_shift, round_const), round_shift); |
274 | | |
275 | 196k | const __m256i res_lo_1_32b = _mm256_unpackhi_epi16(res_lo, zero); |
276 | 196k | const __m256i res_lo_1_shift = |
277 | 196k | _mm256_slli_epi32(res_lo_1_32b, left_shift); |
278 | 196k | const __m256i res_lo_1_round = _mm256_sra_epi32( |
279 | 196k | _mm256_add_epi32(res_lo_1_shift, round_const), round_shift); |
280 | | |
281 | 196k | const __m256i res_lo_round = |
282 | 196k | _mm256_packs_epi32(res_lo_0_round, res_lo_1_round); |
283 | | |
284 | 196k | const __m256i res_lo_unsigned = |
285 | 196k | _mm256_add_epi16(res_lo_round, offset_const_2); |
286 | | |
287 | 196k | if (w - j < 16) { |
288 | 77.6k | if (do_average) { |
289 | 35.0k | const __m256i data_ref_0 = |
290 | 35.0k | load_line2_avx2(&dst[i * dst_stride + j], |
291 | 35.0k | &dst[i * dst_stride + j + dst_stride]); |
292 | 35.0k | const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_lo_unsigned, |
293 | 35.0k | &wt, use_dist_wtd_comp_avg); |
294 | | |
295 | 35.0k | const __m256i round_result = convolve_rounding( |
296 | 35.0k | &comp_avg_res, &offset_const, &rounding_const, rounding_shift); |
297 | | |
298 | 35.0k | const __m256i res_8 = |
299 | 35.0k | _mm256_packus_epi16(round_result, round_result); |
300 | 35.0k | const __m128i res_0 = _mm256_castsi256_si128(res_8); |
301 | 35.0k | const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); |
302 | | |
303 | 35.0k | if (w - j > 4) { |
304 | 18.8k | _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); |
305 | 18.8k | _mm_storel_epi64( |
306 | 18.8k | (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), |
307 | 18.8k | res_1); |
308 | 18.8k | } else { |
309 | 16.1k | *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0); |
310 | 16.1k | *(int *)(&dst0[i * dst_stride0 + j + dst_stride0]) = |
311 | 16.1k | _mm_cvtsi128_si32(res_1); |
312 | 16.1k | } |
313 | 42.5k | } else { |
314 | 42.5k | const __m128i res_0 = _mm256_castsi256_si128(res_lo_unsigned); |
315 | 42.5k | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); |
316 | | |
317 | 42.5k | const __m128i res_1 = _mm256_extracti128_si256(res_lo_unsigned, 1); |
318 | 42.5k | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), |
319 | 42.5k | res_1); |
320 | 42.5k | } |
321 | 119k | } else { |
322 | 119k | __m256i res_hi = convolve_lowbd_4tap(s + 3, coeffs + 1); |
323 | | |
324 | 119k | res_hi = _mm256_add_epi16(res_hi, offset_const_1); |
325 | | |
326 | 119k | const __m256i res_hi_0_32b = _mm256_unpacklo_epi16(res_hi, zero); |
327 | 119k | const __m256i res_hi_0_shift = |
328 | 119k | _mm256_slli_epi32(res_hi_0_32b, left_shift); |
329 | 119k | const __m256i res_hi_0_round = _mm256_sra_epi32( |
330 | 119k | _mm256_add_epi32(res_hi_0_shift, round_const), round_shift); |
331 | | |
332 | 119k | const __m256i res_hi_1_32b = _mm256_unpackhi_epi16(res_hi, zero); |
333 | 119k | const __m256i res_hi_1_shift = |
334 | 119k | _mm256_slli_epi32(res_hi_1_32b, left_shift); |
335 | 119k | const __m256i res_hi_1_round = _mm256_sra_epi32( |
336 | 119k | _mm256_add_epi32(res_hi_1_shift, round_const), round_shift); |
337 | | |
338 | 119k | const __m256i res_hi_round = |
339 | 119k | _mm256_packs_epi32(res_hi_0_round, res_hi_1_round); |
340 | | |
341 | 119k | const __m256i res_hi_unsigned = |
342 | 119k | _mm256_add_epi16(res_hi_round, offset_const_2); |
343 | | |
344 | 119k | if (do_average) { |
345 | 52.8k | const __m256i data_ref_0_lo = |
346 | 52.8k | load_line2_avx2(&dst[i * dst_stride + j], |
347 | 52.8k | &dst[i * dst_stride + j + dst_stride]); |
348 | | |
349 | 52.8k | const __m256i data_ref_0_hi = |
350 | 52.8k | load_line2_avx2(&dst[i * dst_stride + j + 8], |
351 | 52.8k | &dst[i * dst_stride + j + 8 + dst_stride]); |
352 | | |
353 | 52.8k | const __m256i comp_avg_res_lo = comp_avg( |
354 | 52.8k | &data_ref_0_lo, &res_lo_unsigned, &wt, use_dist_wtd_comp_avg); |
355 | | |
356 | 52.8k | const __m256i comp_avg_res_hi = comp_avg( |
357 | 52.8k | &data_ref_0_hi, &res_hi_unsigned, &wt, use_dist_wtd_comp_avg); |
358 | | |
359 | 52.8k | const __m256i round_result_lo = |
360 | 52.8k | convolve_rounding(&comp_avg_res_lo, &offset_const, |
361 | 52.8k | &rounding_const, rounding_shift); |
362 | | |
363 | 52.8k | const __m256i round_result_hi = |
364 | 52.8k | convolve_rounding(&comp_avg_res_hi, &offset_const, |
365 | 52.8k | &rounding_const, rounding_shift); |
366 | | |
367 | 52.8k | const __m256i res_8 = |
368 | 52.8k | _mm256_packus_epi16(round_result_lo, round_result_hi); |
369 | 52.8k | const __m128i res_0 = _mm256_castsi256_si128(res_8); |
370 | 52.8k | const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); |
371 | | |
372 | 52.8k | _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); |
373 | 52.8k | _mm_store_si128( |
374 | 52.8k | (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); |
375 | | |
376 | 66.3k | } else { |
377 | 66.3k | const __m128i res_lo_0 = _mm256_castsi256_si128(res_lo_unsigned); |
378 | 66.3k | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_lo_0); |
379 | | |
380 | 66.3k | const __m128i res_lo_1 = |
381 | 66.3k | _mm256_extracti128_si256(res_lo_unsigned, 1); |
382 | 66.3k | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), |
383 | 66.3k | res_lo_1); |
384 | | |
385 | 66.3k | const __m128i res_hi_0 = _mm256_castsi256_si128(res_hi_unsigned); |
386 | 66.3k | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + 8]), |
387 | 66.3k | res_hi_0); |
388 | | |
389 | 66.3k | const __m128i res_hi_1 = |
390 | 66.3k | _mm256_extracti128_si256(res_hi_unsigned, 1); |
391 | 66.3k | _mm_store_si128( |
392 | 66.3k | (__m128i *)(&dst[i * dst_stride + j + 8 + dst_stride]), |
393 | 66.3k | res_hi_1); |
394 | 66.3k | } |
395 | 119k | } |
396 | 196k | s[0] = s[1]; |
397 | 196k | s[1] = s[2]; |
398 | | |
399 | 196k | s[3] = s[4]; |
400 | 196k | s[4] = s[5]; |
401 | 196k | } |
402 | 35.9k | } |
403 | 36.5k | } else { |
404 | 36.5k | const int fo_vert = filter_params_y->taps / 2 - 1; |
405 | 36.5k | const uint8_t *const src_ptr = src - fo_vert * src_stride; |
406 | 86.3k | for (j = 0; j < w; j += 16) { |
407 | 49.8k | const uint8_t *data = &src_ptr[j]; |
408 | 49.8k | __m256i src6; |
409 | | // Load lines a and b. Line a to lower 128, line b to upper 128 |
410 | 49.8k | { |
411 | 49.8k | __m256i src_ab[7]; |
412 | 49.8k | __m256i src_a[7]; |
413 | 49.8k | src_a[0] = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); |
414 | 348k | for (int kk = 0; kk < 6; ++kk) { |
415 | 299k | data += src_stride; |
416 | 299k | src_a[kk + 1] = |
417 | 299k | _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); |
418 | 299k | src_ab[kk] = |
419 | 299k | _mm256_permute2x128_si256(src_a[kk], src_a[kk + 1], 0x20); |
420 | 299k | } |
421 | 49.8k | src6 = src_a[6]; |
422 | 49.8k | s[0] = _mm256_unpacklo_epi8(src_ab[0], src_ab[1]); |
423 | 49.8k | s[1] = _mm256_unpacklo_epi8(src_ab[2], src_ab[3]); |
424 | 49.8k | s[2] = _mm256_unpacklo_epi8(src_ab[4], src_ab[5]); |
425 | 49.8k | s[4] = _mm256_unpackhi_epi8(src_ab[0], src_ab[1]); |
426 | 49.8k | s[5] = _mm256_unpackhi_epi8(src_ab[2], src_ab[3]); |
427 | 49.8k | s[6] = _mm256_unpackhi_epi8(src_ab[4], src_ab[5]); |
428 | 49.8k | } |
429 | | |
430 | 599k | for (i = 0; i < h; i += 2) { |
431 | 549k | data = &src_ptr[(i + 7) * src_stride + j]; |
432 | 549k | const __m256i src7 = |
433 | 549k | _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); |
434 | 549k | const __m256i src_67a = _mm256_permute2x128_si256(src6, src7, 0x20); |
435 | | |
436 | 549k | src6 = _mm256_castsi128_si256( |
437 | 549k | _mm_loadu_si128((__m128i *)(data + src_stride))); |
438 | 549k | const __m256i src_78a = _mm256_permute2x128_si256(src7, src6, 0x20); |
439 | | |
440 | 549k | s[3] = _mm256_unpacklo_epi8(src_67a, src_78a); |
441 | 549k | s[7] = _mm256_unpackhi_epi8(src_67a, src_78a); |
442 | | |
443 | 549k | __m256i res_lo = convolve_lowbd(s, coeffs); |
444 | | |
445 | 549k | res_lo = _mm256_add_epi16(res_lo, offset_const_1); |
446 | | |
447 | 549k | const __m256i res_lo_0_32b = _mm256_unpacklo_epi16(res_lo, zero); |
448 | 549k | const __m256i res_lo_0_shift = |
449 | 549k | _mm256_slli_epi32(res_lo_0_32b, left_shift); |
450 | 549k | const __m256i res_lo_0_round = _mm256_sra_epi32( |
451 | 549k | _mm256_add_epi32(res_lo_0_shift, round_const), round_shift); |
452 | | |
453 | 549k | const __m256i res_lo_1_32b = _mm256_unpackhi_epi16(res_lo, zero); |
454 | 549k | const __m256i res_lo_1_shift = |
455 | 549k | _mm256_slli_epi32(res_lo_1_32b, left_shift); |
456 | 549k | const __m256i res_lo_1_round = _mm256_sra_epi32( |
457 | 549k | _mm256_add_epi32(res_lo_1_shift, round_const), round_shift); |
458 | | |
459 | 549k | const __m256i res_lo_round = |
460 | 549k | _mm256_packs_epi32(res_lo_0_round, res_lo_1_round); |
461 | | |
462 | 549k | const __m256i res_lo_unsigned = |
463 | 549k | _mm256_add_epi16(res_lo_round, offset_const_2); |
464 | | |
465 | 549k | if (w - j < 16) { |
466 | 101k | if (do_average) { |
467 | 47.3k | const __m256i data_ref_0 = |
468 | 47.3k | load_line2_avx2(&dst[i * dst_stride + j], |
469 | 47.3k | &dst[i * dst_stride + j + dst_stride]); |
470 | 47.3k | const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_lo_unsigned, |
471 | 47.3k | &wt, use_dist_wtd_comp_avg); |
472 | | |
473 | 47.3k | const __m256i round_result = convolve_rounding( |
474 | 47.3k | &comp_avg_res, &offset_const, &rounding_const, rounding_shift); |
475 | | |
476 | 47.3k | const __m256i res_8 = |
477 | 47.3k | _mm256_packus_epi16(round_result, round_result); |
478 | 47.3k | const __m128i res_0 = _mm256_castsi256_si128(res_8); |
479 | 47.3k | const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); |
480 | | |
481 | 47.3k | if (w - j > 4) { |
482 | 37.0k | _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); |
483 | 37.0k | _mm_storel_epi64( |
484 | 37.0k | (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), |
485 | 37.0k | res_1); |
486 | 37.0k | } else { |
487 | 10.3k | *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0); |
488 | 10.3k | *(int *)(&dst0[i * dst_stride0 + j + dst_stride0]) = |
489 | 10.3k | _mm_cvtsi128_si32(res_1); |
490 | 10.3k | } |
491 | 54.5k | } else { |
492 | 54.5k | const __m128i res_0 = _mm256_castsi256_si128(res_lo_unsigned); |
493 | 54.5k | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); |
494 | | |
495 | 54.5k | const __m128i res_1 = _mm256_extracti128_si256(res_lo_unsigned, 1); |
496 | 54.5k | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), |
497 | 54.5k | res_1); |
498 | 54.5k | } |
499 | 447k | } else { |
500 | 447k | __m256i res_hi = convolve_lowbd(s + 4, coeffs); |
501 | | |
502 | 447k | res_hi = _mm256_add_epi16(res_hi, offset_const_1); |
503 | | |
504 | 447k | const __m256i res_hi_0_32b = _mm256_unpacklo_epi16(res_hi, zero); |
505 | 447k | const __m256i res_hi_0_shift = |
506 | 447k | _mm256_slli_epi32(res_hi_0_32b, left_shift); |
507 | 447k | const __m256i res_hi_0_round = _mm256_sra_epi32( |
508 | 447k | _mm256_add_epi32(res_hi_0_shift, round_const), round_shift); |
509 | | |
510 | 447k | const __m256i res_hi_1_32b = _mm256_unpackhi_epi16(res_hi, zero); |
511 | 447k | const __m256i res_hi_1_shift = |
512 | 447k | _mm256_slli_epi32(res_hi_1_32b, left_shift); |
513 | 447k | const __m256i res_hi_1_round = _mm256_sra_epi32( |
514 | 447k | _mm256_add_epi32(res_hi_1_shift, round_const), round_shift); |
515 | | |
516 | 447k | const __m256i res_hi_round = |
517 | 447k | _mm256_packs_epi32(res_hi_0_round, res_hi_1_round); |
518 | | |
519 | 447k | const __m256i res_hi_unsigned = |
520 | 447k | _mm256_add_epi16(res_hi_round, offset_const_2); |
521 | | |
522 | 447k | if (do_average) { |
523 | 180k | const __m256i data_ref_0_lo = |
524 | 180k | load_line2_avx2(&dst[i * dst_stride + j], |
525 | 180k | &dst[i * dst_stride + j + dst_stride]); |
526 | | |
527 | 180k | const __m256i data_ref_0_hi = |
528 | 180k | load_line2_avx2(&dst[i * dst_stride + j + 8], |
529 | 180k | &dst[i * dst_stride + j + 8 + dst_stride]); |
530 | | |
531 | 180k | const __m256i comp_avg_res_lo = comp_avg( |
532 | 180k | &data_ref_0_lo, &res_lo_unsigned, &wt, use_dist_wtd_comp_avg); |
533 | | |
534 | 180k | const __m256i comp_avg_res_hi = comp_avg( |
535 | 180k | &data_ref_0_hi, &res_hi_unsigned, &wt, use_dist_wtd_comp_avg); |
536 | | |
537 | 180k | const __m256i round_result_lo = |
538 | 180k | convolve_rounding(&comp_avg_res_lo, &offset_const, |
539 | 180k | &rounding_const, rounding_shift); |
540 | | |
541 | 180k | const __m256i round_result_hi = |
542 | 180k | convolve_rounding(&comp_avg_res_hi, &offset_const, |
543 | 180k | &rounding_const, rounding_shift); |
544 | | |
545 | 180k | const __m256i res_8 = |
546 | 180k | _mm256_packus_epi16(round_result_lo, round_result_hi); |
547 | 180k | const __m128i res_0 = _mm256_castsi256_si128(res_8); |
548 | 180k | const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); |
549 | | |
550 | 180k | _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); |
551 | 180k | _mm_store_si128( |
552 | 180k | (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); |
553 | | |
554 | 267k | } else { |
555 | 267k | const __m128i res_lo_0 = _mm256_castsi256_si128(res_lo_unsigned); |
556 | 267k | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_lo_0); |
557 | | |
558 | 267k | const __m128i res_lo_1 = |
559 | 267k | _mm256_extracti128_si256(res_lo_unsigned, 1); |
560 | 267k | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), |
561 | 267k | res_lo_1); |
562 | | |
563 | 267k | const __m128i res_hi_0 = _mm256_castsi256_si128(res_hi_unsigned); |
564 | 267k | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + 8]), |
565 | 267k | res_hi_0); |
566 | | |
567 | 267k | const __m128i res_hi_1 = |
568 | 267k | _mm256_extracti128_si256(res_hi_unsigned, 1); |
569 | 267k | _mm_store_si128( |
570 | 267k | (__m128i *)(&dst[i * dst_stride + j + 8 + dst_stride]), |
571 | 267k | res_hi_1); |
572 | 267k | } |
573 | 447k | } |
574 | 549k | s[0] = s[1]; |
575 | 549k | s[1] = s[2]; |
576 | 549k | s[2] = s[3]; |
577 | | |
578 | 549k | s[4] = s[5]; |
579 | 549k | s[5] = s[6]; |
580 | 549k | s[6] = s[7]; |
581 | 549k | } |
582 | 49.8k | } |
583 | 36.5k | } |
584 | 70.1k | } |
585 | | |
586 | | #define JNT_CONVOLVE_2D_VERTICAL_FILTER_4TAP(last_4, mode) \ |
587 | 114k | do { \ |
588 | 114k | __m256i s0_reg = \ |
589 | 114k | _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); \ |
590 | 114k | __m256i s2_reg = \ |
591 | 114k | _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); \ |
592 | 114k | __m256i s4_init = \ |
593 | 114k | _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride)); \ |
594 | 114k | __m256i s1_reg = _mm256_permute2x128_si256(s0_reg, s2_reg, 0x21); \ |
595 | 114k | __m256i s3_reg = _mm256_permute2x128_si256(s2_reg, s4_init, 0x21); \ |
596 | 114k | __m256i s0 = _mm256_unpacklo_epi16(s0_reg, s1_reg); \ |
597 | 114k | __m256i s1 = _mm256_unpacklo_epi16(s2_reg, s3_reg); \ |
598 | 114k | __m256i s3 = _mm256_setzero_si256(); \ |
599 | 114k | __m256i s4 = _mm256_setzero_si256(); \ |
600 | 114k | if (!(last_4)) { \ |
601 | 74.7k | s3 = _mm256_unpackhi_epi16(s0_reg, s1_reg); \ |
602 | 74.7k | s4 = _mm256_unpackhi_epi16(s2_reg, s3_reg); \ |
603 | 74.7k | } \ |
604 | 647k | for (i = 0; i < h; i += 2) { \ |
605 | 533k | const int16_t *data = &im_block[i * im_stride]; \ |
606 | 533k | \ |
607 | 533k | const __m256i s4_reg = \ |
608 | 533k | _mm256_loadu_si256((__m256i *)(data + 4 * im_stride)); \ |
609 | 533k | const __m256i s5_reg = \ |
610 | 533k | _mm256_loadu_si256((__m256i *)(data + 5 * im_stride)); \ |
611 | 533k | \ |
612 | 533k | __m256i s2 = _mm256_unpacklo_epi16(s4_reg, s5_reg); \ |
613 | 533k | \ |
614 | 533k | const __m256i res_a_1 = _mm256_madd_epi16(s0, coeffs_y[1]); \ |
615 | 533k | const __m256i res_a_2 = _mm256_madd_epi16(s1, coeffs_y[2]); \ |
616 | 533k | const __m256i res_a = _mm256_add_epi32(res_a_1, res_a_2); \ |
617 | 533k | const __m256i res_a_round = \ |
618 | 533k | _mm256_srai_epi32(_mm256_add_epi32(res_a, round_offset_const_v), 7); \ |
619 | 533k | \ |
620 | 533k | __m256i res_unsigned; \ |
621 | 533k | if (last_4) { \ |
622 | 92.8k | res_unsigned = _mm256_packus_epi32(res_a_round, res_a_round); \ |
623 | 440k | } else { \ |
624 | 440k | __m256i s5 = _mm256_unpackhi_epi16(s4_reg, s5_reg); \ |
625 | 440k | const __m256i res_b_1 = _mm256_madd_epi16(s3, coeffs_y[1]); \ |
626 | 440k | const __m256i res_b_2 = _mm256_madd_epi16(s4, coeffs_y[2]); \ |
627 | 440k | const __m256i res_b = _mm256_add_epi32(res_b_1, res_b_2); \ |
628 | 440k | const __m256i res_b_round = _mm256_srai_epi32( \ |
629 | 440k | _mm256_add_epi32(res_b, round_offset_const_v), 7); \ |
630 | 440k | res_unsigned = _mm256_packus_epi32(res_a_round, res_b_round); \ |
631 | 440k | s3 = s4; \ |
632 | 440k | s4 = s5; \ |
633 | 440k | } \ |
634 | 533k | s0 = s1; \ |
635 | 533k | s1 = s2; \ |
636 | 533k | \ |
637 | 533k | if (mode == 0) { \ |
638 | 106k | const __m256i comp_const = _mm256_set1_epi32(-16 * offset + 128); \ |
639 | 106k | const __m256i wt = unpack_weights_avx2(conv_params); \ |
640 | 106k | __m256i round_result; \ |
641 | 106k | const __m256i data_ref_0 = load_line2_avx2( \ |
642 | 106k | &dst[i * dst_stride + j], &dst[i * dst_stride + j + dst_stride]); \ |
643 | 106k | const __m256i data_lo = \ |
644 | 106k | _mm256_unpacklo_epi16(data_ref_0, res_unsigned); \ |
645 | 106k | const __m256i wt_res_lo = _mm256_madd_epi16(data_lo, wt); \ |
646 | 106k | const __m256i fused_lo = \ |
647 | 106k | _mm256_srai_epi32(_mm256_add_epi32(wt_res_lo, comp_const), 8); \ |
648 | 106k | if (last_4) { \ |
649 | 20.5k | round_result = _mm256_packs_epi32(fused_lo, fused_lo); \ |
650 | 86.0k | } else { \ |
651 | 86.0k | const __m256i data_hi = \ |
652 | 86.0k | _mm256_unpackhi_epi16(data_ref_0, res_unsigned); \ |
653 | 86.0k | const __m256i wt_res_hi = _mm256_madd_epi16(data_hi, wt); \ |
654 | 86.0k | const __m256i fused_hi = \ |
655 | 86.0k | _mm256_srai_epi32(_mm256_add_epi32(wt_res_hi, comp_const), 8); \ |
656 | 86.0k | round_result = _mm256_packs_epi32(fused_lo, fused_hi); \ |
657 | 86.0k | } \ |
658 | 106k | const __m256i res_8 = _mm256_packus_epi16(round_result, round_result); \ |
659 | 106k | const __m128i res_0 = _mm256_castsi256_si128(res_8); \ |
660 | 106k | const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); \ |
661 | 106k | if (last_4) { \ |
662 | 20.5k | *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0); \ |
663 | 20.5k | *(int *)(&dst0[i * dst_stride0 + j + dst_stride0]) = \ |
664 | 20.5k | _mm_cvtsi128_si32(res_1); \ |
665 | 86.0k | } else { \ |
666 | 86.0k | _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); \ |
667 | 86.0k | _mm_storel_epi64( \ |
668 | 86.0k | (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); \ |
669 | 86.0k | } \ |
670 | 426k | } else if (mode == 1) { \ |
671 | 131k | const int rounding_shift = \ |
672 | 131k | 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; \ |
673 | 131k | const __m256i offset_const = _mm256_set1_epi16(offset); \ |
674 | 131k | const __m256i rounding_const = \ |
675 | 131k | _mm256_set1_epi16((1 << rounding_shift) >> 1); \ |
676 | 131k | const __m256i data_ref_0 = load_line2_avx2( \ |
677 | 131k | &dst[i * dst_stride + j], &dst[i * dst_stride + j + dst_stride]); \ |
678 | 131k | const __m256i wt_res = _mm256_add_epi16(data_ref_0, res_unsigned); \ |
679 | 131k | const __m256i comp_avg_res = _mm256_srai_epi16(wt_res, 1); \ |
680 | 131k | const __m256i res_signed = \ |
681 | 131k | _mm256_sub_epi16(comp_avg_res, offset_const); \ |
682 | 131k | const __m256i round_result = _mm256_srai_epi16( \ |
683 | 131k | _mm256_add_epi16(res_signed, rounding_const), 4); \ |
684 | 131k | const __m256i res_8 = _mm256_packus_epi16(round_result, round_result); \ |
685 | 131k | const __m128i res_0 = _mm256_castsi256_si128(res_8); \ |
686 | 131k | const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); \ |
687 | 131k | if (last_4) { \ |
688 | 20.5k | *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0); \ |
689 | 20.5k | *(int *)(&dst0[i * dst_stride0 + j + dst_stride0]) = \ |
690 | 20.5k | _mm_cvtsi128_si32(res_1); \ |
691 | 110k | } else { \ |
692 | 110k | _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); \ |
693 | 110k | _mm_storel_epi64( \ |
694 | 110k | (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); \ |
695 | 110k | } \ |
696 | 295k | } else { \ |
697 | 295k | const __m128i res_0 = _mm256_castsi256_si128(res_unsigned); \ |
698 | 295k | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); \ |
699 | 295k | const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1); \ |
700 | 295k | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), \ |
701 | 295k | res_1); \ |
702 | 295k | } \ |
703 | 533k | } \ |
704 | 114k | } while (0) |
705 | | |
// av1_dist_wtd_convolve_2d_avx2: AVX2 2-D convolve entry point for 8-bit input
// (bd is hard-coded to 8 below).
//
// This function only sets up constants and selects one of four branches; the
// filtering itself is done by JNT_CONVOLVE_* macros that expand in place and
// pick up locals declared here by name. The visible tail of
// JNT_CONVOLVE_2D_VERTICAL_FILTER_4TAP uses dst, dst_stride, dst0, dst_stride0,
// i, j, offset, conv_params and round_offset_const_v.
// NOTE(review): JNT_CONVOLVE_HORIZONTAL_FILTER and
// JNT_CONVOLVE_VERTICAL_FILTER_8TAP are defined outside this view; im_block,
// im_stride, im_h, filt, coeffs_*, round_const_h and round_const_v are
// presumably consumed there - confirm against those definitions.
//
// Branch selection (im_h is h plus the extra rows set in each branch):
//   horizontal 4-tap and vertical 4-tap : convolve_lowbd_x_4tap + 4TAP vertical, im_h = h + 4
//   horizontal 4-tap only               : convolve_lowbd_x_4tap + 8TAP vertical, im_h = h + 8
//   vertical 4-tap only                 : convolve_lowbd_x      + 4TAP vertical, im_h = h + 4
//   neither                             : convolve_lowbd_x      + 8TAP vertical, im_h = h + 8
// Columns are processed in strips of 8 (j += 8).
//
// JNT_CONVOLVE_2D_VERTICAL_FILTER_4TAP(a, b): b is 0 when do_average and
// use_dist_wtd_comp_avg are both set, 1 when only do_average is set, 2 when
// not averaging. In the visible part of that macro, modes 0 and 1 read
// conv_params->dst and write 8-bit pixels to dst0; mode 2 writes the
// intermediate result to conv_params->dst. The first argument is presumably
// the macro's last_4 flag (only 4 output columns stored) - confirm against the
// macro header.
706 | | void av1_dist_wtd_convolve_2d_avx2(const uint8_t *src, int src_stride,
707 | | uint8_t *dst0, int dst_stride0, int w, int h,
708 | | const InterpFilterParams *filter_params_x,
709 | | const InterpFilterParams *filter_params_y,
710 | | const int subpel_x_qn, const int subpel_y_qn,
711 | 201k | ConvolveParams *conv_params) {
712 | 201k | CONV_BUF_TYPE *dst = conv_params->dst;
713 | 201k | int dst_stride = conv_params->dst_stride;
714 | 201k | const int bd = 8;
715 | |
// 32-byte aligned scratch, 8 int16_t per row (im_stride). It is not referenced
// directly in this function body; presumably the filter macros fill and read it.
716 | 201k | DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
717 | |
718 | 201k | int im_stride = 8;
719 | 201k | int i, is_horiz_4tap = 0, is_vert_4tap = 0;
720 | 201k | const int do_average = conv_params->do_average;
721 | 201k | const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
// offset = 2^offset_0 + 2^(offset_0 - 1); it is subtracted again in the
// averaging modes of the vertical macro before packing to 8 bits.
722 | 201k | const int offset_0 =
723 | 201k | bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
724 | 201k | const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
725 | |
// round_0 - 1 is used as a shift count just below, so round_0 must be >= 1.
726 | 201k | assert(conv_params->round_0 > 0);
727 | |
728 | 201k | const __m256i round_const_h = _mm256_set1_epi16(
729 | 201k | ((1 << (conv_params->round_0 - 1)) >> 1) + (1 << (bd + FILTER_BITS - 2)));
730 | 201k | const int round_const_v_val =
731 | 201k | ((1 << conv_params->round_1) >> 1) -
732 | 201k | (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1));
733 | 201k | const __m256i round_const_v = _mm256_set1_epi32(round_const_v_val);
// Vertical rounding constant with offset pre-scaled by 2^7 folded in; the
// 4-tap vertical macro adds it before its arithmetic shift right by 7.
734 | 201k | const __m256i round_offset_const_v =
735 | 201k | _mm256_set1_epi32(round_const_v_val + (offset << 7));
736 | |
737 | 201k | DECLARE_ALIGNED(32, __m256i, filt[4]);
738 | 201k | DECLARE_ALIGNED(32, __m256i, coeffs_x[4]);
739 | 201k | DECLARE_ALIGNED(32, __m256i, coeffs_y[4]);
740 | |
// Only filt[0] and filt[1] are loaded here; filt[2] and filt[3] are loaded
// just in the branches that use convolve_lowbd_x.
741 | 201k | filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2);
742 | 201k | filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
743 | |
744 | 201k | prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs_x);
745 | 201k | prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y);
746 | |
// The two checks below test the low 32-bit element of (coeffs[0] | coeffs[3]).
// When it is zero the kernel is treated as 4-tap (presumably coeffs[0] and
// coeffs[3] hold the outermost tap pairs - confirm in prepare_coeffs*).
747 | | // Condition for checking valid horz_filt taps
748 | 201k | if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs_x[0], coeffs_x[3]), 0)))
749 | 76.7k | is_horiz_4tap = 1;
750 | |
751 | | // Condition for checking valid vert_filt taps
752 | 201k | if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs_y[0], coeffs_y[3]), 0)))
753 | 89.3k | is_vert_4tap = 1;
754 | |
// Branch 1: both kernels are 4-tap. One row above / one column to the left of
// src is the first sample read (fo_vert = fo_horiz = 1).
755 | 201k | if (is_horiz_4tap && is_vert_4tap) {
756 | 58.0k | int im_h = h + 4;
757 | 58.0k | const int fo_vert = 1;
758 | 58.0k | const int fo_horiz = 1;
759 | 58.0k | const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
760 | 58.0k | if (w > 4) {
761 | 49.3k | for (int j = 0; j < w; j += 8) {
762 | 31.0k | JNT_CONVOLVE_HORIZONTAL_FILTER(src_ptr + j, convolve_lowbd_x_4tap,
763 | 31.0k | coeffs_x + 1);
764 | 31.0k | if (do_average) {
765 | 14.3k | if (use_dist_wtd_comp_avg) {
766 | 6.15k | JNT_CONVOLVE_2D_VERTICAL_FILTER_4TAP(0, 0);
767 | 8.21k | } else {
768 | 8.21k | JNT_CONVOLVE_2D_VERTICAL_FILTER_4TAP(0, 1);
769 | 8.21k | }
770 | 16.6k | } else {
771 | 16.6k | JNT_CONVOLVE_2D_VERTICAL_FILTER_4TAP(0, 2);
772 | 16.6k | }
773 | 31.0k | }
774 | 39.8k | } else {
// w <= 4: a single strip; j is a constant because the macros reference it by name.
775 | 39.8k | const int j = 0;
776 | 39.8k | JNT_CONVOLVE_HORIZONTAL_FILTER(src_ptr, convolve_lowbd_x_4tap,
777 | 39.8k | coeffs_x + 1);
778 | 39.8k | if (do_average) {
779 | 17.5k | if (use_dist_wtd_comp_avg) {
780 | 9.03k | JNT_CONVOLVE_2D_VERTICAL_FILTER_4TAP(1, 0);
781 | 9.03k | } else {
782 | 8.51k | JNT_CONVOLVE_2D_VERTICAL_FILTER_4TAP(1, 1);
783 | 8.51k | }
784 | 22.2k | } else {
785 | 22.2k | JNT_CONVOLVE_2D_VERTICAL_FILTER_4TAP(1, 2);
786 | 22.2k | }
787 | 39.8k | }
// Branch 2: 4-tap horizontal kernel, vertical kernel handled by the 8TAP macro.
// wt, offset_const, rounding_shift and rounding_const are not used in the
// visible code of this branch; presumably JNT_CONVOLVE_VERTICAL_FILTER_8TAP
// reads them by name.
788 | 143k | } else if (is_horiz_4tap) {
789 | 18.6k | int im_h = h + 8;
790 | 18.6k | const int fo_vert = filter_params_y->taps / 2 - 1;
791 | 18.6k | const int fo_horiz = 1;
792 | 18.6k | const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
793 | 18.6k | const __m256i wt = unpack_weights_avx2(conv_params);
794 | 18.6k | const __m256i offset_const = _mm256_set1_epi16(offset);
795 | 18.6k | const int rounding_shift =
796 | 18.6k | 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
797 | 18.6k | const __m256i rounding_const =
798 | 18.6k | _mm256_set1_epi16((1 << rounding_shift) >> 1);
799 | 40.0k | for (int j = 0; j < w; j += 8) {
800 | 21.3k | JNT_CONVOLVE_HORIZONTAL_FILTER(src_ptr + j, convolve_lowbd_x_4tap,
801 | 21.3k | coeffs_x + 1);
802 | 21.3k | JNT_CONVOLVE_VERTICAL_FILTER_8TAP;
803 | 21.3k | }
// Branch 3: 4-tap vertical kernel, horizontal kernel handled by
// convolve_lowbd_x with all four filt[] entries loaded. The first macro
// argument (w - j <= 4) is true only for a strip with at most 4 columns left.
804 | 124k | } else if (is_vert_4tap) {
805 | 31.2k | int im_h = h + 4;
806 | 31.2k | const int fo_vert = 1;
807 | 31.2k | const int fo_horiz = filter_params_x->taps / 2 - 1;
808 | 31.2k | const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
809 | |
810 | 31.2k | filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
811 | 31.2k | filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
812 | |
813 | 74.8k | for (int j = 0; j < w; j += 8) {
814 | 43.6k | JNT_CONVOLVE_HORIZONTAL_FILTER(src_ptr + j, convolve_lowbd_x, coeffs_x);
815 | 43.6k | if (do_average) {
816 | 18.9k | if (use_dist_wtd_comp_avg) {
817 | 7.98k | JNT_CONVOLVE_2D_VERTICAL_FILTER_4TAP((w - j <= 4), 0);
818 | 10.9k | } else {
819 | 10.9k | JNT_CONVOLVE_2D_VERTICAL_FILTER_4TAP((w - j <= 4), 1);
820 | 10.9k | }
821 | 24.6k | } else {
822 | 24.6k | JNT_CONVOLVE_2D_VERTICAL_FILTER_4TAP((w - j <= 4), 2);
823 | 24.6k | }
824 | 43.6k | }
// Branch 4: neither kernel is 4-tap. Same local setup as branch 2, plus the
// filt[2] / filt[3] loads needed by convolve_lowbd_x.
825 | 93.6k | } else {
826 | 93.6k | int im_h = h + 8;
827 | 93.6k | const int fo_vert = filter_params_y->taps / 2 - 1;
828 | 93.6k | const int fo_horiz = filter_params_x->taps / 2 - 1;
829 | 93.6k | const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
830 | 93.6k | const __m256i wt = unpack_weights_avx2(conv_params);
831 | 93.6k | const __m256i offset_const = _mm256_set1_epi16(offset);
832 | 93.6k | const int rounding_shift =
833 | 93.6k | 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
834 | 93.6k | const __m256i rounding_const =
835 | 93.6k | _mm256_set1_epi16((1 << rounding_shift) >> 1);
836 | |
837 | 93.6k | filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
838 | 93.6k | filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
839 | |
840 | 310k | for (int j = 0; j < w; j += 8) {
841 | 217k | JNT_CONVOLVE_HORIZONTAL_FILTER(src_ptr + j, convolve_lowbd_x, coeffs_x);
842 | |
843 | 217k | JNT_CONVOLVE_VERTICAL_FILTER_8TAP;
844 | 217k | }
845 | 93.6k | }
846 | 201k | }
847 | | #undef JNT_CONVOLVE_2D_VERTICAL_FILTER_4TAP |
848 | | |
// DO_NO_AVG_2D_COPY_4X16(r0, c0, ..., r3, c3): converts four groups of 16
// source pixels, located at (row rK, column cK) for K = 0..3, and stores them
// at the same (row, column) positions in dst.
//
// Per group: load 16 bytes from src (unaligned), zero-extend to 16-bit lanes,
// shift left by LEFT_SHIFT, add offset_const, then store 16 lanes to dst with
// _mm256_store_si256 (an aligned store, so each destination address must be
// 32-byte aligned).
//
// The macro uses these names from the expansion site: src, src_stride, dst,
// dst_stride, offset_const, and the __m256i temporaries src_0..src_3.
// The row/column arguments are pasted unparenthesized into rK * stride + cK,
// so they should be simple constants or identifiers, as in the visible callers.
849 | | #define DO_NO_AVG_2D_COPY_4X16(r0, c0, r1, c1, r2, c2, r3, c3) \ |
850 | 54.5M | do { \ |
851 | 54.5M | src_0 = _mm256_cvtepu8_epi16( \ |
852 | 54.5M | _mm_loadu_si128((__m128i *)(&src[r0 * src_stride + c0]))); \ |
853 | 54.5M | src_1 = _mm256_cvtepu8_epi16( \ |
854 | 54.5M | _mm_loadu_si128((__m128i *)(&src[r1 * src_stride + c1]))); \ |
855 | 54.5M | src_2 = _mm256_cvtepu8_epi16( \ |
856 | 54.5M | _mm_loadu_si128((__m128i *)(&src[r2 * src_stride + c2]))); \ |
857 | 54.5M | src_3 = _mm256_cvtepu8_epi16( \ |
858 | 54.5M | _mm_loadu_si128((__m128i *)(&src[r3 * src_stride + c3]))); \ |
859 | 54.5M | \ |
860 | 54.5M | src_0 = _mm256_slli_epi16(src_0, LEFT_SHIFT); \ |
861 | 54.5M | src_1 = _mm256_slli_epi16(src_1, LEFT_SHIFT); \ |
862 | 54.5M | src_2 = _mm256_slli_epi16(src_2, LEFT_SHIFT); \ |
863 | 54.5M | src_3 = _mm256_slli_epi16(src_3, LEFT_SHIFT); \ |
864 | 54.5M | \ |
865 | 54.5M | src_0 = _mm256_add_epi16(src_0, offset_const); \ |
866 | 54.5M | src_1 = _mm256_add_epi16(src_1, offset_const); \ |
867 | 54.5M | src_2 = _mm256_add_epi16(src_2, offset_const); \ |
868 | 54.5M | src_3 = _mm256_add_epi16(src_3, offset_const); \ |
869 | 54.5M | \ |
870 | 54.5M | _mm256_store_si256((__m256i *)(&dst[r0 * dst_stride + c0]), src_0); \ |
871 | 54.5M | _mm256_store_si256((__m256i *)(&dst[r1 * dst_stride + c1]), src_1); \ |
872 | 54.5M | _mm256_store_si256((__m256i *)(&dst[r2 * dst_stride + c2]), src_2); \ |
873 | 54.5M | _mm256_store_si256((__m256i *)(&dst[r3 * dst_stride + c3]), src_3); \ |
874 | 54.5M | } while (0)
875 | | |
// Left shift applied to source pixels in the copy paths below. It has the same
// form as rounding_shift (2 * FILTER_BITS - round_0 - round_1) with the two
// rounds hard-coded to 3 and 7, matching the round_0 == 3 and round_1 == 7
// asserts in av1_dist_wtd_convolve_2d_copy_avx2. Undefined again after that
// function.
876 | 435M | #define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7)
// Copy path without averaging: every source pixel p becomes
// (p << LEFT_SHIFT) + offset_const in a 16-bit lane and is written to dst.
// Nothing is read back from dst.
//
// w >= 16: only w == 128, 64, 32 and 16 are handled, covering 1, 1, 2 and 4
//          rows per loop iteration respectively. Any other width >= 16 matches
//          no branch and nothing is written.
// w <  16: 4 rows per iteration; each row loads 8 source bytes and stores
//          8 16-bit values, independent of w.
//          NOTE(review): for w == 4 this still touches 8 columns of src and
//          dst - presumably the caller's buffers allow it; confirm.
//
// Every loop is a do/while that ends when i reaches exactly 0, so h must be a
// positive multiple of the per-iteration row count. All stores to dst use
// aligned store intrinsics (_mm256_store_si256 / _mm_store_si128).
// src and dst are advanced locally; the caller's pointers are unaffected.
877 | | static inline void av1_dist_wtd_convolve_2d_no_avg_copy_avx2(
878 | | const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride,
879 | 723k | int w, int h, const __m256i offset_const) {
880 | 723k | int i = h;
881 | 723k | if (w >= 16) {
882 | 585k | __m256i src_0, src_1, src_2, src_3;
883 | 585k | if (w == 128) {
// One row per iteration: two macro calls cover columns 0..63 and 64..127.
884 | 14.3M | do {
885 | 14.3M | DO_NO_AVG_2D_COPY_4X16(0, 0, 0, 16, 0, 32, 0, 48);
886 | 14.3M | DO_NO_AVG_2D_COPY_4X16(0, 64, 0, 80, 0, 96, 0, 112);
887 | 14.3M | src += 1 * src_stride;
888 | 14.3M | dst += 1 * dst_stride;
889 | 14.3M | i -= 1;
890 | 14.3M | } while (i);
891 | 472k | } else if (w == 64) {
892 | 21.7M | do {
893 | 21.7M | DO_NO_AVG_2D_COPY_4X16(0, 0, 0, 16, 0, 32, 0, 48);
894 | 21.7M | src += 1 * src_stride;
895 | 21.7M | dst += 1 * dst_stride;
896 | 21.7M | i -= 1;
897 | 21.7M | } while (i);
898 | 285k | } else if (w == 32) {
// Two rows per iteration: columns 0..15 of rows 0 and 1, then columns 16..31.
899 | 3.83M | do {
900 | 3.83M | DO_NO_AVG_2D_COPY_4X16(0, 0, 1, 0, 0, 16, 1, 16);
901 | 3.83M | src += 2 * src_stride;
902 | 3.83M | dst += 2 * dst_stride;
903 | 3.83M | i -= 2;
904 | 3.83M | } while (i);
905 | 133k | } else if (w == 16) {
// Four rows per iteration, 16 columns each.
906 | 240k | do {
907 | 240k | DO_NO_AVG_2D_COPY_4X16(0, 0, 1, 0, 2, 0, 3, 0);
908 | 240k | src += 4 * src_stride;
909 | 240k | dst += 4 * dst_stride;
910 | 240k | i -= 4;
911 | 240k | } while (i);
912 | 54.3k | }
913 | 585k | } else {
914 | 138k | const __m256i zero = _mm256_setzero_si256();
915 | 335k | do {
916 | 335k | const __m128i src_row_0 =
917 | 335k | _mm_loadl_epi64((__m128i *)(&src[0 * src_stride]));
918 | 335k | const __m128i src_row_1 =
919 | 335k | _mm_loadl_epi64((__m128i *)(&src[1 * src_stride]));
920 | 335k | const __m128i src_row_2 =
921 | 335k | _mm_loadl_epi64((__m128i *)(&src[2 * src_stride]));
922 | 335k | const __m128i src_row_3 =
923 | 335k | _mm_loadl_epi64((__m128i *)(&src[3 * src_stride]));
924 | |
// Pair the rows: low 128-bit lane = even row, high lane = odd row.
925 | 335k | __m256i src_10 = _mm256_insertf128_si256(
926 | 335k | _mm256_castsi128_si256(src_row_0), src_row_1, 1);
927 | 335k | __m256i src_32 = _mm256_insertf128_si256(
928 | 335k | _mm256_castsi128_si256(src_row_2), src_row_3, 1);
929 | |
// Interleaving with zero widens the low 8 bytes of each lane to 8 16-bit values.
930 | 335k | src_10 = _mm256_unpacklo_epi8(src_10, zero);
931 | 335k | src_32 = _mm256_unpacklo_epi8(src_32, zero);
932 | |
933 | 335k | src_10 = _mm256_slli_epi16(src_10, LEFT_SHIFT);
934 | 335k | src_32 = _mm256_slli_epi16(src_32, LEFT_SHIFT);
935 | |
936 | 335k | src_10 = _mm256_add_epi16(src_10, offset_const);
937 | 335k | src_32 = _mm256_add_epi16(src_32, offset_const);
938 | |
939 | | // Store the four converted rows into the destination buffer (overwrite)
940 | 335k | _mm_store_si128((__m128i *)(&dst[0 * dst_stride]),
941 | 335k | _mm256_castsi256_si128(src_10));
942 | 335k | _mm_store_si128((__m128i *)(&dst[1 * dst_stride]),
943 | 335k | _mm256_extracti128_si256(src_10, 1));
944 | 335k | _mm_store_si128((__m128i *)(&dst[2 * dst_stride]),
945 | 335k | _mm256_castsi256_si128(src_32));
946 | 335k | _mm_store_si128((__m128i *)(&dst[3 * dst_stride]),
947 | 335k | _mm256_extracti128_si256(src_32, 1));
948 | |
949 | 335k | src += 4 * src_stride;
950 | 335k | dst += 4 * dst_stride;
951 | 335k | i -= 4;
952 | 335k | } while (i);
953 | 138k | }
954 | 723k | }
955 | | |
// DO_AVG_2D_COPY_4X16(USE_DIST_WEIGHTED, r0, c0, ..., r3, c3): averaging
// counterpart of DO_NO_AVG_2D_COPY_4X16 for four groups of 16 pixels located
// at (row rK, column cK), K = 0..3.
//
// Per group:
//   1. load 16 source bytes, zero-extend to 16-bit lanes, shift left by
//      LEFT_SHIFT and add offset_const;
//   2. load the 16 matching 16-bit values from dst (unaligned load);
//   3. combine both with comp_avg(&ref, &src, &wt, USE_DIST_WEIGHTED);
//   4. pass the result through convolve_rounding(..., &offset_const,
//      &rounding_const, rounding_shift).
// Groups 0/1 and 2/3 are then packed to 8 bits with unsigned saturation.
// _mm256_packus_epi16 packs within each 128-bit lane, so the 64-bit quarters
// come out as [a.lo, b.lo, a.hi, b.hi]; the 0xD8 permute reorders them to
// [a.lo, a.hi, b.lo, b.hi] so that each 128-bit half holds one complete group.
// Each half is written to dst0 with _mm_store_si128 (an aligned 16-byte store).
// NOTE(review): this requires every dst0 address used here to be 16-byte
// aligned - confirm that callers guarantee it.
//
// Names taken from the expansion site: src, src_stride, dst, dst_stride, dst0,
// dst_stride0, wt, offset_const, rounding_const, rounding_shift, and the
// __m256i temporaries src_0..3, ref_0..3, res_0..3, res_10, res_32.
956 | | #define DO_AVG_2D_COPY_4X16(USE_DIST_WEIGHTED, r0, c0, r1, c1, r2, c2, r3, c3) \ |
957 | 54.0M | do { \ |
958 | 54.0M | src_0 = _mm256_cvtepu8_epi16( \ |
959 | 54.0M | _mm_loadu_si128((__m128i *)(&src[r0 * src_stride + c0]))); \ |
960 | 54.0M | src_1 = _mm256_cvtepu8_epi16( \ |
961 | 54.0M | _mm_loadu_si128((__m128i *)(&src[r1 * src_stride + c1]))); \ |
962 | 54.0M | src_2 = _mm256_cvtepu8_epi16( \ |
963 | 54.0M | _mm_loadu_si128((__m128i *)(&src[r2 * src_stride + c2]))); \ |
964 | 54.0M | src_3 = _mm256_cvtepu8_epi16( \ |
965 | 54.0M | _mm_loadu_si128((__m128i *)(&src[r3 * src_stride + c3]))); \ |
966 | 54.0M | \ |
967 | 54.0M | src_0 = _mm256_slli_epi16(src_0, LEFT_SHIFT); \ |
968 | 54.0M | src_1 = _mm256_slli_epi16(src_1, LEFT_SHIFT); \ |
969 | 54.0M | src_2 = _mm256_slli_epi16(src_2, LEFT_SHIFT); \ |
970 | 54.0M | src_3 = _mm256_slli_epi16(src_3, LEFT_SHIFT); \ |
971 | 54.0M | src_0 = _mm256_add_epi16(src_0, offset_const); \ |
972 | 54.0M | src_1 = _mm256_add_epi16(src_1, offset_const); \ |
973 | 54.0M | src_2 = _mm256_add_epi16(src_2, offset_const); \ |
974 | 54.0M | src_3 = _mm256_add_epi16(src_3, offset_const); \ |
975 | 54.0M | \ |
976 | 54.0M | ref_0 = _mm256_loadu_si256((__m256i *)(&dst[r0 * dst_stride + c0])); \ |
977 | 54.0M | ref_1 = _mm256_loadu_si256((__m256i *)(&dst[r1 * dst_stride + c1])); \ |
978 | 54.0M | ref_2 = _mm256_loadu_si256((__m256i *)(&dst[r2 * dst_stride + c2])); \ |
979 | 54.0M | ref_3 = _mm256_loadu_si256((__m256i *)(&dst[r3 * dst_stride + c3])); \ |
980 | 54.0M | \ |
981 | 54.0M | res_0 = comp_avg(&ref_0, &src_0, &wt, USE_DIST_WEIGHTED); \ |
982 | 54.0M | res_1 = comp_avg(&ref_1, &src_1, &wt, USE_DIST_WEIGHTED); \ |
983 | 54.0M | res_2 = comp_avg(&ref_2, &src_2, &wt, USE_DIST_WEIGHTED); \ |
984 | 54.0M | res_3 = comp_avg(&ref_3, &src_3, &wt, USE_DIST_WEIGHTED); \ |
985 | 54.0M | \ |
986 | 54.0M | res_0 = convolve_rounding(&res_0, &offset_const, &rounding_const, \ |
987 | 54.0M | rounding_shift); \ |
988 | 54.0M | res_1 = convolve_rounding(&res_1, &offset_const, &rounding_const, \ |
989 | 54.0M | rounding_shift); \ |
990 | 54.0M | res_2 = convolve_rounding(&res_2, &offset_const, &rounding_const, \ |
991 | 54.0M | rounding_shift); \ |
992 | 54.0M | res_3 = convolve_rounding(&res_3, &offset_const, &rounding_const, \ |
993 | 54.0M | rounding_shift); \ |
994 | 54.0M | \ |
995 | 54.0M | res_10 = _mm256_packus_epi16(res_0, res_1); \ |
996 | 54.0M | res_32 = _mm256_packus_epi16(res_2, res_3); \ |
997 | 54.0M | res_10 = _mm256_permute4x64_epi64(res_10, 0xD8); \ |
998 | 54.0M | res_32 = _mm256_permute4x64_epi64(res_32, 0xD8); \ |
999 | 54.0M | \ |
1000 | 54.0M | _mm_store_si128((__m128i *)(&dst0[r0 * dst_stride0 + c0]), \ |
1001 | 54.0M | _mm256_castsi256_si128(res_10)); \ |
1002 | 54.0M | _mm_store_si128((__m128i *)(&dst0[r1 * dst_stride0 + c1]), \ |
1003 | 54.0M | _mm256_extracti128_si256(res_10, 1)); \ |
1004 | 54.0M | _mm_store_si128((__m128i *)(&dst0[r2 * dst_stride0 + c2]), \ |
1005 | 54.0M | _mm256_castsi256_si128(res_32)); \ |
1006 | 54.0M | _mm_store_si128((__m128i *)(&dst0[r3 * dst_stride0 + c3]), \ |
1007 | 54.0M | _mm256_extracti128_si256(res_32, 1)); \ |
1008 | 54.0M | } while (0)
1009 | | |
// DO_AVG_2D_COPY(USE_DIST_WEIGHTED): averaging variant of the copy path.
//
// Expansion shape: a declaration (int i = h;) followed by an if / else-if /
// else chain. It is NOT wrapped in do { } while (0), so it must be expanded
// inside a block of its own where no other i is declared, and it is used
// without a trailing semicolon by the visible caller.
//
// Names taken from the expansion site: src, dst, dst0 (all three are advanced
// by the macro), src_stride, dst_stride, dst_stride0, w, h, wt, zero,
// offset_const, rounding_const, rounding_shift.
//
// Per pixel: (p << LEFT_SHIFT) + offset_const is combined with the 16-bit value
// already in dst through comp_avg(..., USE_DIST_WEIGHTED), passed through
// convolve_rounding, packed to 8 bits with unsigned saturation and written to
// dst0. dst itself is only read.
//
// Width dispatch:
//   w >= 16 : DO_AVG_2D_COPY_4X16 with 1 row (w == 128, 64), 2 rows (w == 32)
//             or 4 rows per iteration (else branch, asserts w == 16).
//   w == 8  : 4 rows per iteration. After _mm256_packus_epi16(res_10, res_32)
//             the low 128-bit lane holds rows 0 and 2 and the high lane holds
//             rows 1 and 3, hence the names res_20 / res_31; the low 8 bytes
//             of each go to rows 0 / 1 via _mm_storel_epi64, and
//             _mm_storeh_epi64 (a helper not defined in this view, presumably
//             storing the upper 8 bytes) writes rows 2 / 3.
//   else    : asserts w == 4, 4 rows per iteration. Source rows are gathered
//             as 32-bit loads so that the low lane holds rows 0, 1 and the
//             high lane rows 2, 3; the reference rows are read as 64-bit
//             values in the same order. After packing, 32-bit element 0 of
//             each lane is the even row and element 1 the odd row.
//
// All loops are do/while loops ending when i reaches exactly 0, so h must be a
// positive multiple of the per-iteration row count.
// NOTE(review): the w == 4 branch reads dst through *(int64_t *) and writes
// dst0 through *(int *); this relies on the compiler tolerating type-punned,
// possibly unaligned accesses.
1010 | | #define DO_AVG_2D_COPY(USE_DIST_WEIGHTED) \ |
1011 | 623k | int i = h; \ |
1012 | 623k | if (w >= 16) { \ |
1013 | 547k | __m256i src_0, src_1, src_2, src_3; \ |
1014 | 547k | __m256i ref_0, ref_1, ref_2, ref_3; \ |
1015 | 547k | __m256i res_0, res_1, res_2, res_3; \ |
1016 | 547k | __m256i res_10, res_32; \ |
1017 | 547k | if (w == 128) { \ |
1018 | 14.3M | do { \ |
1019 | 14.3M | DO_AVG_2D_COPY_4X16(USE_DIST_WEIGHTED, 0, 0, 0, 16, 0, 32, 0, 48); \ |
1020 | 14.3M | DO_AVG_2D_COPY_4X16(USE_DIST_WEIGHTED, 0, 64, 0, 80, 0, 96, 0, 112); \ |
1021 | 14.3M | i -= 1; \ |
1022 | 14.3M | src += 1 * src_stride; \ |
1023 | 14.3M | dst += 1 * dst_stride; \ |
1024 | 14.3M | dst0 += 1 * dst_stride0; \ |
1025 | 14.3M | } while (i); \ |
1026 | 435k | } else if (w == 64) { \ |
1027 | 21.5M | do { \ |
1028 | 21.5M | DO_AVG_2D_COPY_4X16(USE_DIST_WEIGHTED, 0, 0, 0, 16, 0, 32, 0, 48); \ |
1029 | 21.5M | \ |
1030 | 21.5M | i -= 1; \ |
1031 | 21.5M | src += 1 * src_stride; \ |
1032 | 21.5M | dst += 1 * dst_stride; \ |
1033 | 21.5M | dst0 += 1 * dst_stride0; \ |
1034 | 21.5M | } while (i); \ |
1035 | 282k | } else if (w == 32) { \ |
1036 | 3.70M | do { \ |
1037 | 3.70M | DO_AVG_2D_COPY_4X16(USE_DIST_WEIGHTED, 0, 0, 1, 0, 0, 16, 1, 16); \ |
1038 | 3.70M | \ |
1039 | 3.70M | i -= 2; \ |
1040 | 3.70M | src += 2 * src_stride; \ |
1041 | 3.70M | dst += 2 * dst_stride; \ |
1042 | 3.70M | dst0 += 2 * dst_stride0; \ |
1043 | 3.70M | } while (i); \ |
1044 | 123k | } else { \ |
1045 | 29.4k | assert(w == 16); \ |
1046 | 120k | do { \ |
1047 | 120k | DO_AVG_2D_COPY_4X16(USE_DIST_WEIGHTED, 0, 0, 1, 0, 2, 0, 3, 0); \ |
1048 | 120k | \ |
1049 | 120k | i -= 4; \ |
1050 | 120k | src += 4 * src_stride; \ |
1051 | 120k | dst += 4 * dst_stride; \ |
1052 | 120k | dst0 += 4 * dst_stride0; \ |
1053 | 120k | } while (i); \ |
1054 | 30.4k | } \ |
1055 | 547k | } else if (w == 8) { \ |
1056 | 123k | do { \ |
1057 | 123k | const __m128i src_0 = \ |
1058 | 123k | _mm_loadl_epi64((__m128i *)(&src[0 * src_stride])); \ |
1059 | 123k | const __m128i src_1 = \ |
1060 | 123k | _mm_loadl_epi64((__m128i *)(&src[1 * src_stride])); \ |
1061 | 123k | const __m128i src_2 = \ |
1062 | 123k | _mm_loadl_epi64((__m128i *)(&src[2 * src_stride])); \ |
1063 | 123k | const __m128i src_3 = \ |
1064 | 123k | _mm_loadl_epi64((__m128i *)(&src[3 * src_stride])); \ |
1065 | 123k | __m256i src_10 = \ |
1066 | 123k | _mm256_insertf128_si256(_mm256_castsi128_si256(src_0), src_1, 1); \ |
1067 | 123k | __m256i src_32 = \ |
1068 | 123k | _mm256_insertf128_si256(_mm256_castsi128_si256(src_2), src_3, 1); \ |
1069 | 123k | \ |
1070 | 123k | src_10 = _mm256_unpacklo_epi8(src_10, zero); \ |
1071 | 123k | src_32 = _mm256_unpacklo_epi8(src_32, zero); \ |
1072 | 123k | \ |
1073 | 123k | src_10 = _mm256_slli_epi16(src_10, LEFT_SHIFT); \ |
1074 | 123k | src_32 = _mm256_slli_epi16(src_32, LEFT_SHIFT); \ |
1075 | 123k | \ |
1076 | 123k | src_10 = _mm256_add_epi16(src_10, offset_const); \ |
1077 | 123k | src_32 = _mm256_add_epi16(src_32, offset_const); \ |
1078 | 123k | \ |
1079 | 123k | const __m256i ref_10 = \ |
1080 | 123k | load_line2_avx2(&dst[0 * dst_stride], &dst[1 * dst_stride]); \ |
1081 | 123k | const __m256i ref_32 = \ |
1082 | 123k | load_line2_avx2(&dst[2 * dst_stride], &dst[3 * dst_stride]); \ |
1083 | 123k | __m256i res_10 = comp_avg(&ref_10, &src_10, &wt, USE_DIST_WEIGHTED); \ |
1084 | 123k | __m256i res_32 = comp_avg(&ref_32, &src_32, &wt, USE_DIST_WEIGHTED); \ |
1085 | 123k | \ |
1086 | 123k | res_10 = convolve_rounding(&res_10, &offset_const, &rounding_const, \ |
1087 | 123k | rounding_shift); \ |
1088 | 123k | res_32 = convolve_rounding(&res_32, &offset_const, &rounding_const, \ |
1089 | 123k | rounding_shift); \ |
1090 | 123k | \ |
1091 | 123k | __m256i res = _mm256_packus_epi16(res_10, res_32); \ |
1092 | 123k | const __m128i res_20 = _mm256_castsi256_si128(res); \ |
1093 | 123k | const __m128i res_31 = _mm256_extracti128_si256(res, 1); \ |
1094 | 123k | \ |
1095 | 123k | _mm_storel_epi64((__m128i *)(&dst0[0 * dst_stride0]), res_20); \ |
1096 | 123k | _mm_storel_epi64((__m128i *)((&dst0[1 * dst_stride0])), res_31); \ |
1097 | 123k | _mm_storeh_epi64((__m128i *)(&dst0[2 * dst_stride0]), res_20); \ |
1098 | 123k | _mm_storeh_epi64((__m128i *)((&dst0[3 * dst_stride0])), res_31); \ |
1099 | 123k | i -= 4; \ |
1100 | 123k | src += 4 * src_stride; \ |
1101 | 123k | dst += 4 * dst_stride; \ |
1102 | 123k | dst0 += 4 * dst_stride0; \ |
1103 | 123k | } while (i); \ |
1104 | 45.8k | } else { \ |
1105 | 29.7k | assert(w == 4); \ |
1106 | 46.3k | do { \ |
1107 | 46.3k | __m256i src_3210_8bit = \ |
1108 | 46.3k | _mm256_setr_epi32(loadu_int32(src + 0 * src_stride), \ |
1109 | 46.3k | loadu_int32(src + 1 * src_stride), 0, 0, \ |
1110 | 46.3k | loadu_int32(src + 2 * src_stride), \ |
1111 | 46.3k | loadu_int32(src + 3 * src_stride), 0, 0); \ |
1112 | 46.3k | \ |
1113 | 46.3k | __m256i src_3210 = _mm256_unpacklo_epi8(src_3210_8bit, zero); \ |
1114 | 46.3k | src_3210 = _mm256_slli_epi16(src_3210, LEFT_SHIFT); \ |
1115 | 46.3k | src_3210 = _mm256_add_epi16(src_3210, offset_const); \ |
1116 | 46.3k | \ |
1117 | 46.3k | __m256i ref_3210 = \ |
1118 | 46.3k | _mm256_setr_epi64x(*(int64_t *)(dst + 0 * dst_stride), \ |
1119 | 46.3k | *(int64_t *)(dst + 1 * dst_stride), \ |
1120 | 46.3k | *(int64_t *)(dst + 2 * dst_stride), \ |
1121 | 46.3k | *(int64_t *)(dst + 3 * dst_stride)); \ |
1122 | 46.3k | __m256i res_3210 = \ |
1123 | 46.3k | comp_avg(&ref_3210, &src_3210, &wt, USE_DIST_WEIGHTED); \ |
1124 | 46.3k | \ |
1125 | 46.3k | res_3210 = convolve_rounding(&res_3210, &offset_const, &rounding_const, \ |
1126 | 46.3k | rounding_shift); \ |
1127 | 46.3k | \ |
1128 | 46.3k | res_3210 = _mm256_packus_epi16(res_3210, res_3210); \ |
1129 | 46.3k | const __m128i res_10 = _mm256_castsi256_si128(res_3210); \ |
1130 | 46.3k | const __m128i res_32 = _mm256_extracti128_si256(res_3210, 1); \ |
1131 | 46.3k | \ |
1132 | 46.3k | *(int *)(&dst0[0 * dst_stride0]) = _mm_cvtsi128_si32(res_10); \ |
1133 | 46.3k | *(int *)(&dst0[2 * dst_stride0]) = _mm_cvtsi128_si32(res_32); \ |
1134 | 46.3k | *(int *)(&dst0[1 * dst_stride0]) = _mm_extract_epi32(res_10, 1); \ |
1135 | 46.3k | *(int *)(&dst0[3 * dst_stride0]) = _mm_extract_epi32(res_32, 1); \ |
1136 | 46.3k | i -= 4; \ |
1137 | 46.3k | src += 4 * src_stride; \ |
1138 | 46.3k | dst += 4 * dst_stride; \ |
1139 | 46.3k | dst0 += 4 * dst_stride0; \ |
1140 | 46.3k | } while (i); \ |
1141 | 29.7k | }
1142 | | |
// av1_dist_wtd_convolve_2d_copy_avx2: copy variant of the 2-D convolve, with
// no filter parameters; pixels are only rescaled and, optionally, averaged.
// Fixed to 8-bit input (bd = 8).
//
// Preconditions (checked with assert only, so unchecked in builds where assert
// is compiled out): conv_params->round_0 == 3, conv_params->round_1 == 7,
// w and h multiples of 4. The round values must match the constants
// hard-coded in LEFT_SHIFT.
//
// Behaviour:
//   do_average == 0 : source pixels are converted and written to
//                     conv_params->dst; dst0 is not touched.
//   do_average != 0 : source pixels are combined with the values already in
//                     conv_params->dst and the 8-bit result is written to
//                     dst0, via DO_AVG_2D_COPY(1) when use_dist_wtd_comp_avg
//                     is set and DO_AVG_2D_COPY(0) otherwise.
1143 | | void av1_dist_wtd_convolve_2d_copy_avx2(const uint8_t *src, int src_stride,
1144 | | uint8_t *dst0, int dst_stride0, int w,
1145 | 1.34M | int h, ConvolveParams *conv_params) {
1146 | 1.34M | const int bd = 8;
1147 | 1.34M | CONV_BUF_TYPE *dst = conv_params->dst;
1148 | 1.34M | int dst_stride = conv_params->dst_stride;
1149 | 1.34M | assert(conv_params->round_0 == 3);
1150 | 1.34M | assert(conv_params->round_1 == 7);
1151 | 1.34M | assert(w % 4 == 0);
1152 | 1.34M | assert(h % 4 == 0);
1153 | |
1154 | 1.34M | const int do_average = conv_params->do_average;
1155 | 1.34M | const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
// wt and zero are only referenced inside the DO_AVG_2D_COPY expansions.
1156 | 1.34M | const __m256i wt = unpack_weights_avx2(conv_params);
1157 | 1.34M | const __m256i zero = _mm256_setzero_si256();
1158 | |
// offset = 2^offset_0 + 2^(offset_0 - 1), broadcast to 16-bit lanes; it is
// added to every converted source pixel in both the averaging and the
// non-averaging path.
1159 | 1.34M | const int offset_0 =
1160 | 1.34M | bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
1161 | 1.34M | const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
1162 | 1.34M | const __m256i offset_const = _mm256_set1_epi16(offset);
1163 | 1.34M | const int rounding_shift =
1164 | 1.34M | 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
1165 | 1.34M | const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1);
1166 | |
// Each DO_AVG_2D_COPY expansion declares its own int i, so each one sits in
// its own braces. The macro also advances this function's src, dst and dst0.
1167 | 1.34M | if (do_average) {
1168 | 623k | if (use_dist_wtd_comp_avg) {
1169 | 58.2k | DO_AVG_2D_COPY(1)
1170 | 565k | } else {
1171 | 565k | DO_AVG_2D_COPY(0)
1172 | 565k | }
1173 | 721k | } else {
1174 | 721k | av1_dist_wtd_convolve_2d_no_avg_copy_avx2(src, src_stride, dst, dst_stride,
1175 | 721k | w, h, offset_const);
1176 | 721k | }
1177 | 1.34M | }
1178 | | #undef LEFT_SHIFT |