/src/aom/av1/common/x86/convolve_avx2.c
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) 2017, Alliance for Open Media. All rights reserved. |
3 | | * |
4 | | * This source code is subject to the terms of the BSD 2 Clause License and |
5 | | * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
6 | | * was not distributed with this source code in the LICENSE file, you can |
7 | | * obtain it at www.aomedia.org/license/software. If the Alliance for Open |
8 | | * Media Patent License 1.0 was not distributed with this source code in the |
9 | | * PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
10 | | */ |
11 | | |
12 | | #include <immintrin.h> |
13 | | |
14 | | #include "config/av1_rtcd.h" |
15 | | |
16 | | #if CONFIG_SVT_AV1 |
17 | | #include "third_party/SVT-AV1/convolve_avx2.h" |
18 | | #endif |
19 | | |
20 | | #include "aom_dsp/aom_dsp_common.h" |
21 | | #include "aom_dsp/x86/convolve_avx2.h" |
22 | | #include "aom_dsp/x86/convolve_common_intrin.h" |
23 | | #include "aom_dsp/x86/synonyms.h" |
24 | | |
// Vertical (y-direction) single-reference convolution, AVX2 low-bitdepth path.
// Dispatches on the effective vertical tap count (4 / 6 / 12 / default 8).
// The 4-, 6- and 8-tap paths filter in 16-bit precision with filter
// coefficients pre-divided by 2 (hence the FILTER_BITS - 1 rounding shift);
// the 12-tap path needs full 32-bit accumulation and rounds by FILTER_BITS.
// Each path processes two output rows per inner-loop iteration by packing
// row pairs into the two 128-bit lanes of a 256-bit register.
static inline void av1_convolve_y_sr_general_avx2(
    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn) {
  // right shift is F-1 because we are already dividing
  // filter co-efficients by 2
  const int right_shift_bits = (FILTER_BITS - 1);
  __m128i right_shift = _mm_cvtsi32_si128(right_shift_bits);
  __m256i right_shift_const = _mm256_set1_epi16((1 << right_shift_bits) >> 1);

  // coeffs: filter coefficient pairs; s[]: unpacked adjacent-row pairs
  // (lo lanes in s[0..], hi lanes offset by half the array); d[]: raw rows.
  __m256i coeffs[6], s[12];
  __m128i d[10];

  int i, vert_tap = get_filter_tap(filter_params_y, subpel_y_qn);

  if (vert_tap == 6)
    prepare_coeffs_6t_lowbd(filter_params_y, subpel_y_qn, coeffs);
  else if (vert_tap == 12) {
    prepare_coeffs_12taps(filter_params_y, subpel_y_qn, coeffs);
  } else {
    prepare_coeffs_lowbd(filter_params_y, subpel_y_qn, coeffs);
  }

  // vert_filt as 4 tap
  if (vert_tap == 4) {
    const int fo_vert = 1;  // filter offset: taps/2 - 1 for a 4-tap filter
    const uint8_t *const src_ptr = src - fo_vert * src_stride;
    for (int j = 0; j < w; j += 16) {
      const uint8_t *data = &src_ptr[j];
      // Prime the sliding window with the first 5 source rows.
      d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
      d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
      d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
      d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
      d[4] = _mm_loadu_si128((__m128i *)(data + 4 * src_stride));

      // Load lines a and b. Line a to lower 128, line b to upper 128
      const __m256i src_01a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20);

      const __m256i src_12a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20);

      const __m256i src_23a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20);

      const __m256i src_34a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[3]), _mm256_castsi128_si256(d[4]), 0x20);

      // Interleave adjacent rows byte-wise so maddubs-style multiplies can
      // combine two taps at once; s[0..2] cover pixels 0-7, s[3..5] 8-15.
      s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
      s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);

      s[3] = _mm256_unpackhi_epi8(src_01a, src_12a);
      s[4] = _mm256_unpackhi_epi8(src_23a, src_34a);

      for (i = 0; i < h; i += 2) {
        data = &src_ptr[i * src_stride + j];
        // d[4]/d[5] alternate as a rolling 2-entry window over new rows.
        d[5] = _mm_loadu_si128((__m128i *)(data + 5 * src_stride));
        const __m256i src_45a = _mm256_permute2x128_si256(
            _mm256_castsi128_si256(d[4]), _mm256_castsi128_si256(d[5]), 0x20);

        d[4] = _mm_loadu_si128((__m128i *)(data + 6 * src_stride));
        const __m256i src_56a = _mm256_permute2x128_si256(
            _mm256_castsi128_si256(d[5]), _mm256_castsi128_si256(d[4]), 0x20);

        s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
        s[5] = _mm256_unpackhi_epi8(src_45a, src_56a);

        // coeffs + 1: the 4-tap filter occupies the middle coefficient
        // pairs of the 8-tap layout — assumed from prepare_coeffs_lowbd
        // layout; TODO confirm.
        const __m256i res_lo = convolve_lowbd_4tap(s, coeffs + 1);
        /* rounding code */
        // shift by F - 1
        const __m256i res_16b_lo = _mm256_sra_epi16(
            _mm256_add_epi16(res_lo, right_shift_const), right_shift);
        // 8 bit conversion and saturation to uint8
        __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);

        if (w - j > 8) {
          // Wide case: also filter pixels 8-15 and store a full 16 bytes.
          const __m256i res_hi = convolve_lowbd_4tap(s + 3, coeffs + 1);

          /* rounding code */
          // shift by F - 1
          const __m256i res_16b_hi = _mm256_sra_epi16(
              _mm256_add_epi16(res_hi, right_shift_const), right_shift);
          // 8 bit conversion and saturation to uint8
          __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);

          __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi);

          const __m128i res_0 = _mm256_castsi256_si128(res_a);
          const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);

          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0);
          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
                           res_1);
        } else {
          // Narrow tails: store 8, 4 or 2 bytes per row as width allows.
          const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo);
          const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
          if (w - j > 4) {
            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
                             res_1);
          } else if (w - j > 2) {
            xx_storel_32(&dst[i * dst_stride + j], res_0);
            xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1);
          } else {
            __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
            __m128i *const p_1 =
                (__m128i *)&dst[i * dst_stride + j + dst_stride];
            *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);
            *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);
          }
        }
        // Slide the row-pair window down by two rows.
        s[0] = s[1];
        s[1] = s[2];

        s[3] = s[4];
        s[4] = s[5];
      }
    }
  } else if (vert_tap == 6) {
    const int fo_vert = vert_tap / 2 - 1;
    const uint8_t *const src_ptr = src - fo_vert * src_stride;

    for (int j = 0; j < w; j += 16) {
      const uint8_t *data = &src_ptr[j];
      __m256i src6;  // carries the newest loaded row across iterations

      d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
      d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
      d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
      d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
      // Load lines a and b. Line a to lower 128, line b to upper 128
      const __m256i src_01a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20);

      const __m256i src_12a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20);

      const __m256i src_23a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20);

      src6 = _mm256_castsi128_si256(
          _mm_loadu_si128((__m128i *)(data + 4 * src_stride)));
      const __m256i src_34a =
          _mm256_permute2x128_si256(_mm256_castsi128_si256(d[3]), src6, 0x20);

      s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
      s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);

      s[3] = _mm256_unpackhi_epi8(src_01a, src_12a);
      s[4] = _mm256_unpackhi_epi8(src_23a, src_34a);

      for (i = 0; i < h; i += 2) {
        data = &src_ptr[i * src_stride + j];
        const __m256i src_45a = _mm256_permute2x128_si256(
            src6,
            _mm256_castsi128_si256(
                _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
            0x20);

        src6 = _mm256_castsi128_si256(
            _mm_loadu_si128((__m128i *)(data + 6 * src_stride)));
        // NOTE(review): row 5 is loaded a second time here instead of being
        // reused from src_45a above; redundant but harmless (the compiler
        // can CSE the load).
        const __m256i src_56a = _mm256_permute2x128_si256(
            _mm256_castsi128_si256(
                _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
            src6, 0x20);

        s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
        s[5] = _mm256_unpackhi_epi8(src_45a, src_56a);

        const __m256i res_lo = convolve_lowbd_6tap(s, coeffs);

        /* rounding code */
        // shift by F - 1
        const __m256i res_16b_lo = _mm256_sra_epi16(
            _mm256_add_epi16(res_lo, right_shift_const), right_shift);
        // 8 bit conversion and saturation to uint8
        __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);

        if (w - j > 8) {
          const __m256i res_hi = convolve_lowbd_6tap(s + 3, coeffs);

          /* rounding code */
          // shift by F - 1
          const __m256i res_16b_hi = _mm256_sra_epi16(
              _mm256_add_epi16(res_hi, right_shift_const), right_shift);
          // 8 bit conversion and saturation to uint8
          __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);

          __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi);

          const __m128i res_0 = _mm256_castsi256_si128(res_a);
          const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);

          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0);
          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
                           res_1);
        } else {
          const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo);
          const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
          if (w - j > 4) {
            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
                             res_1);
          } else if (w - j > 2) {
            xx_storel_32(&dst[i * dst_stride + j], res_0);
            xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1);
          } else {
            __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
            __m128i *const p_1 =
                (__m128i *)&dst[i * dst_stride + j + dst_stride];
            *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);
            *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);
          }
        }
        // Slide the row-pair window down by two rows.
        s[0] = s[1];
        s[1] = s[2];
        s[3] = s[4];
        s[4] = s[5];
      }
    }
  } else if (vert_tap == 12) {  // vert_tap == 12
    const int fo_vert = filter_params_y->taps / 2 - 1;
    const uint8_t *const src_ptr = src - fo_vert * src_stride;
    const __m256i v_zero = _mm256_setzero_si256();
    // 12-tap accumulates in 32 bits, so round by the full FILTER_BITS.
    right_shift = _mm_cvtsi32_si128(FILTER_BITS);
    right_shift_const = _mm256_set1_epi32((1 << FILTER_BITS) >> 1);

    // Only 8 pixels per column strip here (64-bit row loads widened to
    // 16-bit samples).
    for (int j = 0; j < w; j += 8) {
      const uint8_t *data = &src_ptr[j];
      __m256i src10;  // carries the newest loaded row across iterations

      d[0] = _mm_loadl_epi64((__m128i *)(data + 0 * src_stride));
      d[1] = _mm_loadl_epi64((__m128i *)(data + 1 * src_stride));
      d[2] = _mm_loadl_epi64((__m128i *)(data + 2 * src_stride));
      d[3] = _mm_loadl_epi64((__m128i *)(data + 3 * src_stride));
      d[4] = _mm_loadl_epi64((__m128i *)(data + 4 * src_stride));
      d[5] = _mm_loadl_epi64((__m128i *)(data + 5 * src_stride));
      d[6] = _mm_loadl_epi64((__m128i *)(data + 6 * src_stride));
      d[7] = _mm_loadl_epi64((__m128i *)(data + 7 * src_stride));
      d[8] = _mm_loadl_epi64((__m128i *)(data + 8 * src_stride));
      d[9] = _mm_loadl_epi64((__m128i *)(data + 9 * src_stride));
      // Load lines a and b. Line a to lower 128, line b to upper 128
      const __m256i src_01a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20);

      const __m256i src_12a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20);

      const __m256i src_23a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20);

      const __m256i src_34a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[3]), _mm256_castsi128_si256(d[4]), 0x20);

      const __m256i src_45a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[4]), _mm256_castsi128_si256(d[5]), 0x20);

      const __m256i src_56a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[5]), _mm256_castsi128_si256(d[6]), 0x20);

      const __m256i src_67a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[6]), _mm256_castsi128_si256(d[7]), 0x20);

      const __m256i src_78a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[7]), _mm256_castsi128_si256(d[8]), 0x20);

      const __m256i src_89a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[8]), _mm256_castsi128_si256(d[9]), 0x20);

      src10 = _mm256_castsi128_si256(
          _mm_loadl_epi64((__m128i *)(data + 10 * src_stride)));
      const __m256i src_910a =
          _mm256_permute2x128_si256(_mm256_castsi128_si256(d[9]), src10, 0x20);

      // Zero-extend bytes to 16-bit samples for the 32-bit madd path.
      const __m256i src_01 = _mm256_unpacklo_epi8(src_01a, v_zero);
      const __m256i src_12 = _mm256_unpacklo_epi8(src_12a, v_zero);
      const __m256i src_23 = _mm256_unpacklo_epi8(src_23a, v_zero);
      const __m256i src_34 = _mm256_unpacklo_epi8(src_34a, v_zero);
      const __m256i src_45 = _mm256_unpacklo_epi8(src_45a, v_zero);
      const __m256i src_56 = _mm256_unpacklo_epi8(src_56a, v_zero);
      const __m256i src_67 = _mm256_unpacklo_epi8(src_67a, v_zero);
      const __m256i src_78 = _mm256_unpacklo_epi8(src_78a, v_zero);
      const __m256i src_89 = _mm256_unpacklo_epi8(src_89a, v_zero);
      const __m256i src_910 = _mm256_unpacklo_epi8(src_910a, v_zero);

      // s[0..5]: low 4 pixels; s[6..11]: high 4 pixels.
      s[0] = _mm256_unpacklo_epi16(src_01, src_12);
      s[1] = _mm256_unpacklo_epi16(src_23, src_34);
      s[2] = _mm256_unpacklo_epi16(src_45, src_56);
      s[3] = _mm256_unpacklo_epi16(src_67, src_78);
      s[4] = _mm256_unpacklo_epi16(src_89, src_910);

      s[6] = _mm256_unpackhi_epi16(src_01, src_12);
      s[7] = _mm256_unpackhi_epi16(src_23, src_34);
      s[8] = _mm256_unpackhi_epi16(src_45, src_56);
      s[9] = _mm256_unpackhi_epi16(src_67, src_78);
      s[10] = _mm256_unpackhi_epi16(src_89, src_910);

      for (i = 0; i < h; i += 2) {
        data = &src_ptr[i * src_stride + j];
        const __m256i src_1011a = _mm256_permute2x128_si256(
            src10,
            _mm256_castsi128_si256(
                _mm_loadl_epi64((__m128i *)(data + 11 * src_stride))),
            0x20);

        src10 = _mm256_castsi128_si256(
            _mm_loadl_epi64((__m128i *)(data + 12 * src_stride)));

        // NOTE(review): row 11 is loaded a second time here instead of being
        // reused; redundant but harmless (the compiler can CSE the load).
        const __m256i src_1112a = _mm256_permute2x128_si256(
            _mm256_castsi128_si256(
                _mm_loadl_epi64((__m128i *)(data + 11 * src_stride))),
            src10, 0x20);

        const __m256i src_1011 = _mm256_unpacklo_epi8(src_1011a, v_zero);
        const __m256i src_1112 = _mm256_unpacklo_epi8(src_1112a, v_zero);

        s[5] = _mm256_unpacklo_epi16(src_1011, src_1112);
        s[11] = _mm256_unpackhi_epi16(src_1011, src_1112);

        const __m256i res_lo = convolve_12taps(s, coeffs);

        const __m256i res_32b_lo = _mm256_sra_epi32(
            _mm256_add_epi32(res_lo, right_shift_const), right_shift);
        // 8 bit conversion and saturation to uint8
        __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo);
        __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);

        if (w - j > 4) {
          const __m256i res_hi = convolve_12taps(s + 6, coeffs);

          const __m256i res_32b_hi = _mm256_sra_epi32(
              _mm256_add_epi32(res_hi, right_shift_const), right_shift);
          __m256i res_16b_hi = _mm256_packs_epi32(res_32b_hi, res_32b_hi);
          // 8 bit conversion and saturation to uint8
          __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);

          __m256i res_a = _mm256_unpacklo_epi32(res_8b_lo, res_8b_hi);

          const __m128i res_0 = _mm256_extracti128_si256(res_a, 0);
          const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);

          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
                           res_1);
        } else {
          const __m128i res_0 = _mm256_extracti128_si256(res_8b_lo, 0);
          const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
          if (w - j > 2) {
            *(int *)&dst[i * dst_stride + j] = _mm_cvtsi128_si32(res_0);
            *(int *)&dst[i * dst_stride + j + dst_stride] =
                _mm_cvtsi128_si32(res_1);
          } else {
            *(uint16_t *)&dst[i * dst_stride + j] =
                (uint16_t)_mm_cvtsi128_si32(res_0);
            *(uint16_t *)&dst[i * dst_stride + j + dst_stride] =
                (uint16_t)_mm_cvtsi128_si32(res_1);
          }
        }
        // Slide the row-pair window down by two rows (both pixel halves).
        s[0] = s[1];
        s[1] = s[2];
        s[2] = s[3];
        s[3] = s[4];
        s[4] = s[5];

        s[6] = s[7];
        s[7] = s[8];
        s[8] = s[9];
        s[9] = s[10];
        s[10] = s[11];
      }
    }
  } else {
    // Default: 8-tap vertical filter.
    const int fo_vert = filter_params_y->taps / 2 - 1;
    const uint8_t *const src_ptr = src - fo_vert * src_stride;

    for (int j = 0; j < w; j += 16) {
      const uint8_t *data = &src_ptr[j];
      __m256i src6;  // carries the newest loaded row across iterations

      d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
      d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
      d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
      d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
      d[4] = _mm_loadu_si128((__m128i *)(data + 4 * src_stride));
      d[5] = _mm_loadu_si128((__m128i *)(data + 5 * src_stride));
      // Load lines a and b. Line a to lower 128, line b to upper 128
      const __m256i src_01a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20);

      const __m256i src_12a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20);

      const __m256i src_23a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20);

      const __m256i src_34a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[3]), _mm256_castsi128_si256(d[4]), 0x20);

      const __m256i src_45a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[4]), _mm256_castsi128_si256(d[5]), 0x20);

      src6 = _mm256_castsi128_si256(
          _mm_loadu_si128((__m128i *)(data + 6 * src_stride)));
      const __m256i src_56a =
          _mm256_permute2x128_si256(_mm256_castsi128_si256(d[5]), src6, 0x20);

      // s[0..3]: pixels 0-7; s[4..7]: pixels 8-15.
      s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
      s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);
      s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);

      s[4] = _mm256_unpackhi_epi8(src_01a, src_12a);
      s[5] = _mm256_unpackhi_epi8(src_23a, src_34a);
      s[6] = _mm256_unpackhi_epi8(src_45a, src_56a);

      for (i = 0; i < h; i += 2) {
        data = &src_ptr[i * src_stride + j];
        const __m256i src_67a = _mm256_permute2x128_si256(
            src6,
            _mm256_castsi128_si256(
                _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
            0x20);

        src6 = _mm256_castsi128_si256(
            _mm_loadu_si128((__m128i *)(data + 8 * src_stride)));
        // NOTE(review): row 7 is loaded a second time here instead of being
        // reused; redundant but harmless (the compiler can CSE the load).
        const __m256i src_78a = _mm256_permute2x128_si256(
            _mm256_castsi128_si256(
                _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
            src6, 0x20);

        s[3] = _mm256_unpacklo_epi8(src_67a, src_78a);
        s[7] = _mm256_unpackhi_epi8(src_67a, src_78a);

        const __m256i res_lo = convolve_lowbd(s, coeffs);

        /* rounding code */
        // shift by F - 1
        const __m256i res_16b_lo = _mm256_sra_epi16(
            _mm256_add_epi16(res_lo, right_shift_const), right_shift);
        // 8 bit conversion and saturation to uint8
        __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);

        if (w - j > 8) {
          const __m256i res_hi = convolve_lowbd(s + 4, coeffs);

          /* rounding code */
          // shift by F - 1
          const __m256i res_16b_hi = _mm256_sra_epi16(
              _mm256_add_epi16(res_hi, right_shift_const), right_shift);
          // 8 bit conversion and saturation to uint8
          __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);

          __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi);

          const __m128i res_0 = _mm256_castsi256_si128(res_a);
          const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);

          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0);
          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
                           res_1);
        } else {
          const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo);
          const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
          if (w - j > 4) {
            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
                             res_1);
          } else if (w - j > 2) {
            xx_storel_32(&dst[i * dst_stride + j], res_0);
            xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1);
          } else {
            __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
            __m128i *const p_1 =
                (__m128i *)&dst[i * dst_stride + j + dst_stride];
            *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);
            *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);
          }
        }
        // Slide the row-pair window down by two rows (both pixel halves).
        s[0] = s[1];
        s[1] = s[2];
        s[2] = s[3];

        s[4] = s[5];
        s[5] = s[6];
        s[6] = s[7];
      }
    }
  }
}
512 | | |
513 | | void av1_convolve_y_sr_avx2(const uint8_t *src, int32_t src_stride, |
514 | | uint8_t *dst, int32_t dst_stride, int32_t w, |
515 | | int32_t h, |
516 | | const InterpFilterParams *filter_params_y, |
517 | 705k | const int32_t subpel_y_qn) { |
518 | 705k | #if CONFIG_SVT_AV1 |
519 | 705k | const int vert_tap = get_filter_tap(filter_params_y, subpel_y_qn); |
520 | | |
521 | 705k | if (vert_tap == 12) { |
522 | 0 | av1_convolve_y_sr_general_avx2(src, src_stride, dst, dst_stride, w, h, |
523 | 0 | filter_params_y, subpel_y_qn); |
524 | 705k | } else { |
525 | 705k | av1_convolve_y_sr_specialized_avx2(src, src_stride, dst, dst_stride, w, h, |
526 | 705k | filter_params_y, subpel_y_qn); |
527 | 705k | } |
528 | | #else |
529 | | av1_convolve_y_sr_general_avx2(src, src_stride, dst, dst_stride, w, h, |
530 | | filter_params_y, subpel_y_qn); |
531 | | #endif |
532 | 705k | } |
533 | | |
534 | | static inline void av1_convolve_x_sr_general_avx2( |
535 | | const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, |
536 | | int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, |
537 | 748k | ConvolveParams *conv_params) { |
538 | 748k | const int bits = FILTER_BITS - conv_params->round_0; |
539 | 748k | int i, j, horiz_tap = get_filter_tap(filter_params_x, subpel_x_qn); |
540 | | |
541 | 748k | assert(bits >= 0); |
542 | 748k | assert((FILTER_BITS - conv_params->round_1) >= 0 || |
543 | 748k | ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS)); |
544 | 748k | assert(conv_params->round_0 > 0); |
545 | | |
546 | 748k | assert(horiz_tap == 2 || horiz_tap == 4 || horiz_tap == 6 || horiz_tap == 8 || |
547 | 748k | horiz_tap == 12); |
548 | 748k | assert((!(w % 2)) || (w <= 128)); |
549 | 748k | assert((h % 2) == 0); |
550 | | |
551 | 748k | __m256i coeffs[6] = { 0 }, filt[4] = { 0 }; |
552 | 748k | __m128i coeffs_128[4] = { 0 }; |
553 | | |
554 | 748k | i = 0; |
555 | | // horz_filt as 4 tap |
556 | 748k | if (horiz_tap == 4) { |
557 | | // since fo_horiz = 1 |
558 | 311k | const uint8_t *src_ptr = src - 1; |
559 | 311k | if (w == 2) { |
560 | 53.0k | prepare_coeffs_4t_ssse3(filter_params_x, subpel_x_qn, coeffs_128); |
561 | 138k | do { |
562 | 138k | const __m128i res = |
563 | 138k | convolve_x_4tap_2x2_ssse3(src_ptr, src_stride, coeffs_128); |
564 | 138k | const __m128i reg = round_sr_x_ssse3(res); |
565 | 138k | pack_store_x_2x2_sse2(reg, dst, dst_stride); |
566 | 138k | src_ptr += 2 * src_stride; |
567 | 138k | dst += 2 * dst_stride; |
568 | 138k | h -= 2; |
569 | 138k | } while (h); |
570 | 258k | } else if (w == 4) { |
571 | 232k | prepare_coeffs_4t_ssse3(filter_params_x, subpel_x_qn, coeffs_128); |
572 | 776k | do { |
573 | 776k | const __m128i reg = |
574 | 776k | convolve_x_4tap_4x2_ssse3(src_ptr, src_stride, coeffs_128); |
575 | 776k | const __m128i res = round_sr_x_ssse3(reg); |
576 | 776k | pack_store_x_4x2_sse2(res, dst, dst_stride); |
577 | 776k | src_ptr += 2 * src_stride; |
578 | 776k | dst += 2 * dst_stride; |
579 | 776k | h -= 2; |
580 | 776k | } while (h); |
581 | 232k | } else if (w == 8) { |
582 | 14.7k | prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs); |
583 | 14.7k | filt[0] = _mm256_load_si256((__m256i const *)(filt_global_avx2)); |
584 | 14.7k | filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); |
585 | 57.0k | do { |
586 | 57.0k | const __m256i data = _mm256_setr_m128i( |
587 | 57.0k | _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride])), |
588 | 57.0k | _mm_loadu_si128( |
589 | 57.0k | (__m128i *)(&src_ptr[i * src_stride + src_stride]))); |
590 | | |
591 | 57.0k | __m256i res_16b = convolve_lowbd_x_4tap(data, coeffs + 1, filt); |
592 | | |
593 | 57.0k | res_16b = round_sr_x_avx2(res_16b); |
594 | | |
595 | | /* rounding code */ |
596 | | // 8 bit conversion and saturation to uint8 |
597 | 57.0k | __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b); |
598 | | |
599 | 57.0k | const __m128i res_0 = _mm256_castsi256_si128(res_8b); |
600 | 57.0k | const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1); |
601 | | |
602 | 57.0k | _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0); |
603 | 57.0k | _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1); |
604 | 57.0k | i += 2; |
605 | 57.0k | } while (i < h); |
606 | 14.7k | } else { |
607 | 11.4k | assert(!(w % 16)); |
608 | 11.4k | prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs); |
609 | 11.4k | filt[0] = _mm256_load_si256((__m256i const *)(filt_global_avx2)); |
610 | 11.4k | filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); |
611 | 237k | do { |
612 | 237k | j = 0; |
613 | 754k | do { |
614 | | // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17 |
615 | | // 18 19 20 21 22 23 |
616 | 754k | const __m256i data = _mm256_inserti128_si256( |
617 | 754k | _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]), |
618 | 754k | _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]), |
619 | 754k | 1); |
620 | | |
621 | 754k | __m256i res_16b = convolve_lowbd_x_4tap(data, coeffs + 1, filt); |
622 | | |
623 | 754k | res_16b = round_sr_x_avx2(res_16b); |
624 | | |
625 | | /* rounding code */ |
626 | | // 8 bit conversion and saturation to uint8 |
627 | 754k | __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b); |
628 | | |
629 | | // Store values into the destination buffer |
630 | | // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 |
631 | 754k | res_8b = _mm256_permute4x64_epi64(res_8b, 216); |
632 | 754k | __m128i res = _mm256_castsi256_si128(res_8b); |
633 | 754k | _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res); |
634 | 754k | j += 16; |
635 | 754k | } while (j < w); |
636 | 237k | i++; |
637 | 237k | } while (i < h); |
638 | 11.4k | } |
639 | 436k | } else if (horiz_tap == 6) { |
640 | | // since (horiz_tap/2 - 1 == 2) |
641 | 373k | const uint8_t *src_ptr = src - 2; |
642 | 373k | prepare_coeffs_6t_lowbd(filter_params_x, subpel_x_qn, coeffs); |
643 | 373k | filt[0] = _mm256_load_si256((__m256i const *)(filt_global_avx2)); |
644 | 373k | filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); |
645 | 373k | filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); |
646 | 373k | if (w == 8) { |
647 | 798k | do { |
648 | 798k | const __m256i data = _mm256_setr_m128i( |
649 | 798k | _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride])), |
650 | 798k | _mm_loadu_si128( |
651 | 798k | (__m128i *)(&src_ptr[i * src_stride + src_stride]))); |
652 | | |
653 | 798k | __m256i res_16b = convolve_lowbd_x_6tap(data, coeffs, filt); |
654 | | |
655 | 798k | res_16b = round_sr_x_avx2(res_16b); |
656 | | |
657 | | /* rounding code */ |
658 | | // 8 bit conversion and saturation to uint8 |
659 | 798k | __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b); |
660 | | |
661 | 798k | const __m128i res_0 = _mm256_castsi256_si128(res_8b); |
662 | 798k | const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1); |
663 | 798k | _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0); |
664 | 798k | _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1); |
665 | 798k | i += 2; |
666 | 798k | } while (i < h); |
667 | 212k | } else if (w == 16) { |
668 | 645k | do { |
669 | 645k | __m256i data[2] = { 0 }; |
670 | | |
671 | 645k | load_convolve_6tap_16x2_avx2(src_ptr, src_stride, coeffs, filt, data); |
672 | 645k | round_pack_store_16x2_avx2(data, dst, dst_stride); |
673 | 645k | src_ptr += 2 * src_stride; |
674 | 645k | dst += 2 * dst_stride; |
675 | 645k | h -= 2; |
676 | 645k | } while (h); |
677 | 125k | } else if (w == 32) { |
678 | 589k | do { |
679 | 589k | convolve_sr_store_6tap_32_avx2(src_ptr, coeffs, filt, dst); |
680 | 589k | src_ptr += src_stride; |
681 | 589k | dst += dst_stride; |
682 | 589k | } while ((--h) > 0); |
683 | 30.5k | } else if (w == 64) { |
684 | 233k | do { |
685 | 233k | convolve_sr_store_6tap_32_avx2(src_ptr, coeffs, filt, dst); |
686 | 233k | convolve_sr_store_6tap_32_avx2(src_ptr + 32, coeffs, filt, dst + 32); |
687 | 233k | src_ptr += src_stride; |
688 | 233k | dst += dst_stride; |
689 | 233k | } while ((--h) > 0); |
690 | 4.64k | } else { |
691 | 816 | assert(w == 128); |
692 | | |
693 | 97.9k | do { |
694 | 97.9k | convolve_sr_store_6tap_32_avx2(src_ptr, coeffs, filt, dst); |
695 | 97.9k | convolve_sr_store_6tap_32_avx2(src_ptr + SECOND_32_BLK, coeffs, filt, |
696 | 97.9k | dst + SECOND_32_BLK); |
697 | 97.9k | convolve_sr_store_6tap_32_avx2(src_ptr + THIRD_32_BLK, coeffs, filt, |
698 | 97.9k | dst + THIRD_32_BLK); |
699 | 97.9k | convolve_sr_store_6tap_32_avx2(src_ptr + FOURTH_32_BLK, coeffs, filt, |
700 | 97.9k | dst + FOURTH_32_BLK); |
701 | 97.9k | src_ptr += src_stride; |
702 | 97.9k | dst += dst_stride; |
703 | 97.9k | } while ((--h) > 0); |
704 | 842 | } |
705 | 373k | } else if (horiz_tap == 8) { |
706 | | // since (horiz_tap / 2 - 1) == 3 |
707 | 18.0k | const uint8_t *src_ptr = src - 3; |
708 | 18.0k | prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs); |
709 | 18.0k | filt[0] = _mm256_load_si256((__m256i const *)(filt_global_avx2)); |
710 | 18.0k | filt[1] = |
711 | 18.0k | _mm256_load_si256((__m256i const *)(filt_global_avx2 + SECOND_32_BLK)); |
712 | 18.0k | filt[2] = |
713 | 18.0k | _mm256_load_si256((__m256i const *)(filt_global_avx2 + THIRD_32_BLK)); |
714 | 18.0k | filt[3] = |
715 | 18.0k | _mm256_load_si256((__m256i const *)(filt_global_avx2 + FOURTH_32_BLK)); |
716 | | |
717 | 18.0k | if (w == 8) { |
718 | 39.6k | do { |
719 | 39.6k | const __m256i data = _mm256_setr_m128i( |
720 | 39.6k | _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride])), |
721 | 39.6k | _mm_loadu_si128( |
722 | 39.6k | (__m128i *)(&src_ptr[i * src_stride + src_stride]))); |
723 | | |
724 | 39.6k | __m256i res_16b = convolve_lowbd_x(data, coeffs, filt); |
725 | | |
726 | 39.6k | res_16b = round_sr_x_avx2(res_16b); |
727 | | |
728 | | /* rounding code */ |
729 | | // 8 bit conversion and saturation to uint8 |
730 | 39.6k | __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b); |
731 | | |
732 | 39.6k | const __m128i res_0 = _mm256_castsi256_si128(res_8b); |
733 | 39.6k | const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1); |
734 | 39.6k | _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0); |
735 | 39.6k | _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1); |
736 | 39.6k | i += 2; |
737 | 39.6k | } while (i < h); |
738 | 9.32k | } else if (w == 16) { |
739 | 35.6k | do { |
740 | 35.6k | __m256i data[2] = { 0 }; |
741 | | |
742 | 35.6k | load_convolve_8tap_16x2_avx2(src_ptr, src_stride, coeffs, filt, data); |
743 | 35.6k | round_pack_store_16x2_avx2(data, dst, dst_stride); |
744 | 35.6k | src_ptr += 2 * src_stride; |
745 | 35.6k | dst += 2 * dst_stride; |
746 | 35.6k | h -= 2; |
747 | 35.6k | } while (h); |
748 | 6.08k | } else if (w == 32) { |
749 | 42.5k | do { |
750 | 42.5k | load_convolve_round_8tap_32_avx2(src_ptr, coeffs, filt, dst); |
751 | 42.5k | src_ptr += src_stride; |
752 | 42.5k | dst += dst_stride; |
753 | 42.5k | } while ((--h) > 0); |
754 | 1.84k | } else if (w == 64) { |
755 | 30.8k | do { |
756 | 30.8k | load_convolve_round_8tap_32_avx2(src_ptr, coeffs, filt, dst); |
757 | 30.8k | load_convolve_round_8tap_32_avx2(src_ptr + 32, coeffs, filt, dst + 32); |
758 | 30.8k | src_ptr += src_stride; |
759 | 30.8k | dst += dst_stride; |
760 | 30.8k | } while ((--h) > 0); |
761 | 630 | } else { |
762 | 152 | assert(w == 128); |
763 | 15.2k | do { |
764 | 15.2k | load_convolve_round_8tap_32_avx2(src_ptr, coeffs, filt, dst); |
765 | 15.2k | load_convolve_round_8tap_32_avx2(src_ptr + SECOND_32_BLK, coeffs, filt, |
766 | 15.2k | dst + SECOND_32_BLK); |
767 | 15.2k | load_convolve_round_8tap_32_avx2(src_ptr + THIRD_32_BLK, coeffs, filt, |
768 | 15.2k | dst + THIRD_32_BLK); |
769 | 15.2k | load_convolve_round_8tap_32_avx2(src_ptr + FOURTH_32_BLK, coeffs, filt, |
770 | 15.2k | dst + FOURTH_32_BLK); |
771 | 15.2k | src_ptr += src_stride; |
772 | 15.2k | dst += dst_stride; |
773 | 15.2k | } while ((--h) > 0); |
774 | 152 | } |
775 | 44.5k | } else if (horiz_tap == 12) { // horiz_tap == 12 |
776 | 0 | const int fo_horiz = filter_params_x->taps / 2 - 1; |
777 | 0 | prepare_coeffs_12taps(filter_params_x, subpel_x_qn, coeffs); |
778 | 0 | const __m128i round_shift = _mm_cvtsi32_si128(bits); |
779 | 0 | const uint8_t *const src_ptr = src - fo_horiz; |
780 | 0 | const __m256i v_zero = _mm256_setzero_si256(); |
781 | 0 | __m256i round_0_const = |
782 | 0 | _mm256_set1_epi32((1 << (conv_params->round_0)) >> 1); |
783 | 0 | __m256i round_const = _mm256_set1_epi32((1 << bits) >> 1); |
784 | 0 | __m128i round_0_shift = _mm_cvtsi32_si128(conv_params->round_0); |
785 | 0 | __m256i s[6] = { 0 }; |
786 | |
|
787 | 0 | if (w <= 4) { |
788 | 0 | do { |
789 | 0 | const __m256i data = _mm256_permute2x128_si256( |
790 | 0 | _mm256_castsi128_si256( |
791 | 0 | _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))), |
792 | 0 | _mm256_castsi128_si256(_mm_loadu_si128( |
793 | 0 | (__m128i *)(&src_ptr[i * src_stride + src_stride]))), |
794 | 0 | 0x20); |
795 | | // row0 0..7 row1 0..7 |
796 | 0 | const __m256i s_16lo = _mm256_unpacklo_epi8(data, v_zero); |
797 | | // row0 8..F row1 8..F |
798 | 0 | const __m256i s_16hi = _mm256_unpackhi_epi8(data, v_zero); |
799 | | |
800 | | // row0 00 00 01 01 .. 03 03 row1 00 00 01 01 .. 03 03 |
801 | 0 | const __m256i s_lolo = _mm256_unpacklo_epi16(s_16lo, s_16lo); |
802 | | // row0 04 04 .. 07 07 row1 04 04 .. 07 07 |
803 | 0 | const __m256i s_lohi = _mm256_unpackhi_epi16(s_16lo, s_16lo); |
804 | | |
805 | | // row0 08 08 09 09 .. 0B 0B row1 08 08 09 09 .. 0B 0B |
806 | 0 | const __m256i s_hilo = _mm256_unpacklo_epi16(s_16hi, s_16hi); |
807 | | // row0 0C 0C .. 0F 0F row1 0C 0C .. 0F 0F |
808 | 0 | const __m256i s_hihi = _mm256_unpackhi_epi16(s_16hi, s_16hi); |
809 | | |
810 | | // 00 01 01 02 02 03 03 04 10 11 11 12 12 13 13 14 |
811 | 0 | s[0] = _mm256_alignr_epi8(s_lohi, s_lolo, 2); |
812 | | // 02 03 03 04 04 05 05 06 12 13 13 14 14 15 15 16 |
813 | 0 | s[1] = _mm256_alignr_epi8(s_lohi, s_lolo, 10); |
814 | | // 04 05 05 06 06 07 07 08 14 15 15 16 16 17 17 18 |
815 | 0 | s[2] = _mm256_alignr_epi8(s_hilo, s_lohi, 2); |
816 | | // 06 07 07 08 08 09 09 0A 16 17 17 18 18 19 19 1A |
817 | 0 | s[3] = _mm256_alignr_epi8(s_hilo, s_lohi, 10); |
818 | | // 08 09 09 0A 0A 0B 0B 0C 18 19 19 1A 1A 1B 1B 1C |
819 | 0 | s[4] = _mm256_alignr_epi8(s_hihi, s_hilo, 2); |
820 | | // 0A 0B 0B 0C 0C 0D 0D 0E 1A 1B 1B 1C 1C 1D 1D 1E |
821 | 0 | s[5] = _mm256_alignr_epi8(s_hihi, s_hilo, 10); |
822 | |
|
823 | 0 | const __m256i res_lo = convolve_12taps(s, coeffs); |
824 | |
|
825 | 0 | __m256i res_32b_lo = _mm256_sra_epi32( |
826 | 0 | _mm256_add_epi32(res_lo, round_0_const), round_0_shift); |
827 | | |
828 | | // 00 01 02 03 10 12 13 14 |
829 | 0 | res_32b_lo = _mm256_sra_epi32(_mm256_add_epi32(res_32b_lo, round_const), |
830 | 0 | round_shift); |
831 | | // 8 bit conversion and saturation to uint8 |
832 | | // 00 01 02 03 00 01 02 03 10 11 12 13 10 11 12 13 |
833 | 0 | __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo); |
834 | | // 00 01 02 03 00 01 02 03 00 01 02 03 00 01 02 03 |
835 | | // 10 11 12 13 10 11 12 13 10 11 12 13 10 11 12 13 |
836 | 0 | __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo); |
837 | | |
838 | | // 00 01 02 03 00 01 02 03 00 01 02 03 00 01 02 03 |
839 | 0 | const __m128i res_0 = _mm256_extracti128_si256(res_8b_lo, 0); |
840 | | // 10 11 12 13 10 11 12 13 10 11 12 13 10 11 12 13 |
841 | 0 | const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1); |
842 | 0 | if (w > 2) { |
843 | | // 00 01 02 03 |
844 | 0 | *(int *)&dst[i * dst_stride] = _mm_cvtsi128_si32(res_0); |
845 | | // 10 11 12 13 |
846 | 0 | *(int *)&dst[i * dst_stride + dst_stride] = _mm_cvtsi128_si32(res_1); |
847 | 0 | } else { |
848 | | // 00 01 |
849 | 0 | *(uint16_t *)&dst[i * dst_stride] = |
850 | 0 | (uint16_t)_mm_cvtsi128_si32(res_0); |
851 | | // 10 11 |
852 | 0 | *(uint16_t *)&dst[i * dst_stride + dst_stride] = |
853 | 0 | (uint16_t)_mm_cvtsi128_si32(res_1); |
854 | 0 | } |
855 | 0 | i += 2; |
856 | 0 | } while (i < h); |
857 | 0 | } else { |
858 | 0 | assert(!(w % 8)); |
859 | 0 | do { |
860 | 0 | j = 0; |
861 | 0 | do { |
862 | 0 | const __m256i data = _mm256_permute2x128_si256( |
863 | 0 | _mm256_castsi128_si256( |
864 | 0 | _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride + j]))), |
865 | 0 | _mm256_castsi128_si256(_mm_loadu_si128( |
866 | 0 | (__m128i *)(&src_ptr[i * src_stride + j + 4]))), |
867 | 0 | 0x20); |
868 | | // row0 0..7 4..B |
869 | 0 | const __m256i s_16lo = _mm256_unpacklo_epi8(data, v_zero); |
870 | | // row0 8..F C..13 |
871 | 0 | const __m256i s_16hi = _mm256_unpackhi_epi8(data, v_zero); |
872 | | |
873 | | // row0 00 00 01 01 .. 03 03 04 04 05 05 .. 07 07 |
874 | 0 | const __m256i s_lolo = _mm256_unpacklo_epi16(s_16lo, s_16lo); |
875 | | // row0 04 04 .. 07 07 08 08 .. 0B 0B |
876 | 0 | const __m256i s_lohi = _mm256_unpackhi_epi16(s_16lo, s_16lo); |
877 | | |
878 | | // row0 08 08 09 09 .. 0B 0B 0C 0C 0D 0D .. 0F 0F |
879 | 0 | const __m256i s_hilo = _mm256_unpacklo_epi16(s_16hi, s_16hi); |
880 | | // row0 0C 0C 0D 0D .. 0F 0F 10 10 11 11 .. 13 13 |
881 | 0 | const __m256i s_hihi = _mm256_unpackhi_epi16(s_16hi, s_16hi); |
882 | |
|
883 | 0 | s[0] = _mm256_alignr_epi8(s_lohi, s_lolo, 2); |
884 | 0 | s[1] = _mm256_alignr_epi8(s_lohi, s_lolo, 10); |
885 | 0 | s[2] = _mm256_alignr_epi8(s_hilo, s_lohi, 2); |
886 | 0 | s[3] = _mm256_alignr_epi8(s_hilo, s_lohi, 10); |
887 | 0 | s[4] = _mm256_alignr_epi8(s_hihi, s_hilo, 2); |
888 | 0 | s[5] = _mm256_alignr_epi8(s_hihi, s_hilo, 10); |
889 | |
|
890 | 0 | const __m256i res_lo = convolve_12taps(s, coeffs); |
891 | |
|
892 | 0 | __m256i res_32b_lo = _mm256_sra_epi32( |
893 | 0 | _mm256_add_epi32(res_lo, round_0_const), round_0_shift); |
894 | |
|
895 | 0 | res_32b_lo = _mm256_sra_epi32( |
896 | 0 | _mm256_add_epi32(res_32b_lo, round_const), round_shift); |
897 | | // 8 bit conversion and saturation to uint8 |
898 | 0 | __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo); |
899 | 0 | __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo); |
900 | 0 | const __m128i res_0 = _mm256_extracti128_si256(res_8b_lo, 0); |
901 | 0 | const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1); |
902 | 0 | *(int *)&dst[i * dst_stride + j] = _mm_cvtsi128_si32(res_0); |
903 | 0 | *(int *)&dst[i * dst_stride + j + 4] = _mm_cvtsi128_si32(res_1); |
904 | |
|
905 | 0 | j += 8; |
906 | 0 | } while (j < w); |
907 | 0 | i++; |
908 | 0 | } while (i < h); |
909 | 0 | } |
910 | 44.5k | } else { |
911 | 44.5k | assert(horiz_tap == 2); |
912 | | // since (filter_params_x->taps / 2 - 1) == 0 |
913 | 44.5k | const uint8_t *src_ptr = src; |
914 | 44.5k | if (subpel_x_qn != 8) { |
915 | 13.5k | if (w <= 8) { |
916 | 9.95k | prepare_coeffs_2t_ssse3(filter_params_x, subpel_x_qn, coeffs_128); |
917 | | |
918 | 9.95k | if (w == 2) { |
919 | 3.28k | do { |
920 | 3.28k | const __m128i data = |
921 | 3.28k | convolve_x_2tap_2x2_ssse3(src_ptr, src_stride, coeffs_128); |
922 | 3.28k | const __m128i reg = round_sr_x_ssse3(data); |
923 | 3.28k | pack_store_x_2x2_sse2(reg, dst, dst_stride); |
924 | 3.28k | src_ptr += 2 * src_stride; |
925 | 3.28k | dst += 2 * dst_stride; |
926 | 3.28k | h -= 2; |
927 | 3.28k | } while (h); |
928 | 8.47k | } else if (w == 4) { |
929 | 13.9k | do { |
930 | 13.9k | const __m128i data = |
931 | 13.9k | convolve_x_2tap_4x2_ssse3(src_ptr, src_stride, coeffs_128); |
932 | 13.9k | const __m128i reg = round_sr_x_ssse3(data); |
933 | 13.9k | pack_store_4x2_sse2(reg, dst, dst_stride); |
934 | 13.9k | src_ptr += 2 * src_stride; |
935 | 13.9k | dst += 2 * dst_stride; |
936 | 13.9k | h -= 2; |
937 | 13.9k | } while (h); |
938 | 4.57k | } else { |
939 | 3.89k | assert(w == 8); |
940 | | |
941 | 13.5k | do { |
942 | 13.5k | __m128i data[2] = { 0 }; |
943 | | |
944 | 13.5k | convolve_x_2tap_8x2_ssse3(src_ptr, src_stride, coeffs_128, data); |
945 | 13.5k | data[0] = round_sr_x_ssse3(data[0]); |
946 | 13.5k | data[1] = round_sr_x_ssse3(data[1]); |
947 | 13.5k | const __m128i reg = _mm_packus_epi16(data[0], data[1]); |
948 | 13.5k | _mm_storel_epi64((__m128i *)dst, reg); |
949 | 13.5k | _mm_storeh_epi64((__m128i *)(dst + dst_stride), reg); |
950 | | |
951 | 13.5k | src_ptr += 2 * src_stride; |
952 | 13.5k | dst += 2 * dst_stride; |
953 | 13.5k | h -= 2; |
954 | 13.5k | } while (h); |
955 | 3.89k | } |
956 | 9.95k | } else { |
957 | 3.60k | prepare_coeffs_2t_lowbd(filter_params_x, subpel_x_qn, coeffs); |
958 | | |
959 | 3.60k | if (w == 16) { |
960 | 10.4k | do { |
961 | 10.4k | __m256i data[2] = { 0 }; |
962 | | |
963 | 10.4k | convolve_x_2tap_16x2_avx2(src_ptr, src_stride, coeffs, data); |
964 | 10.4k | round_pack_store_16x2_avx2(data, dst, dst_stride); |
965 | 10.4k | src_ptr += 2 * src_stride; |
966 | 10.4k | dst += 2 * dst_stride; |
967 | 10.4k | h -= 2; |
968 | 10.4k | } while (h); |
969 | 2.09k | } else if (w == 32) { |
970 | 18.4k | do { |
971 | 18.4k | convolve_round_2tap_32_avx2(src_ptr, coeffs, dst); |
972 | 18.4k | src_ptr += src_stride; |
973 | 18.4k | dst += dst_stride; |
974 | 18.4k | } while ((--h) > 0); |
975 | 778 | } else if (w == 64) { |
976 | 26.9k | do { |
977 | 26.9k | convolve_round_2tap_32_avx2(src_ptr, coeffs, dst); |
978 | 26.9k | convolve_round_2tap_32_avx2(src_ptr + SECOND_32_BLK, coeffs, |
979 | 26.9k | dst + SECOND_32_BLK); |
980 | 26.9k | src_ptr += src_stride; |
981 | 26.9k | dst += dst_stride; |
982 | 26.9k | } while ((--h) > 0); |
983 | 569 | } else { |
984 | 160 | assert(w == 128); |
985 | | |
986 | 14.2k | do { |
987 | 14.2k | convolve_round_2tap_32_avx2(src_ptr, coeffs, dst); |
988 | 14.2k | convolve_round_2tap_32_avx2(src_ptr + (SECOND_32_BLK), coeffs, |
989 | 14.2k | dst + (SECOND_32_BLK)); |
990 | 14.2k | convolve_round_2tap_32_avx2(src_ptr + (THIRD_32_BLK), coeffs, |
991 | 14.2k | dst + (THIRD_32_BLK)); |
992 | 14.2k | convolve_round_2tap_32_avx2(src_ptr + (FOURTH_32_BLK), coeffs, |
993 | 14.2k | dst + (FOURTH_32_BLK)); |
994 | 14.2k | src_ptr += src_stride; |
995 | 14.2k | dst += dst_stride; |
996 | 14.2k | } while ((--h) > 0); |
997 | 160 | } |
998 | 3.60k | } |
999 | 30.9k | } else { |
1000 | 30.9k | if (w == 2) { |
1001 | 8.67k | do { |
1002 | 8.67k | __m128i data = load_x_u8_4x2_sse4(src_ptr, src_stride); |
1003 | 8.67k | const __m128i reg1 = _mm_srli_si128(data, 1); |
1004 | 8.67k | const __m128i reg2 = _mm_avg_epu8(data, reg1); |
1005 | 8.67k | *(uint16_t *)dst = (uint16_t)_mm_cvtsi128_si32(reg2); |
1006 | 8.67k | *(uint16_t *)(dst + dst_stride) = _mm_extract_epi16(reg2, 2); |
1007 | | |
1008 | 8.67k | src_ptr += 2 * src_stride; |
1009 | 8.67k | dst += 2 * dst_stride; |
1010 | 8.67k | h -= 2; |
1011 | 8.67k | } while (h); |
1012 | 26.7k | } else if (w == 4) { |
1013 | 34.8k | do { |
1014 | 34.8k | __m128i data = load_8bit_8x2_to_1_reg_sse2( |
1015 | 34.8k | src_ptr, (int)(sizeof(*src_ptr) * src_stride)); |
1016 | 34.8k | const __m128i reg1 = _mm_srli_si128(data, 1); |
1017 | 34.8k | const __m128i reg2 = _mm_avg_epu8(data, reg1); |
1018 | 34.8k | xx_storel_32(dst, reg2); |
1019 | 34.8k | *(int32_t *)(dst + dst_stride) = _mm_extract_epi32(reg2, 2); |
1020 | | |
1021 | 34.8k | src_ptr += 2 * src_stride; |
1022 | 34.8k | dst += 2 * dst_stride; |
1023 | 34.8k | h -= 2; |
1024 | 34.8k | } while (h); |
1025 | 13.8k | } else if (w == 8) { |
1026 | 28.4k | do { |
1027 | 28.4k | const __m128i data00 = _mm_loadu_si128((__m128i *)src_ptr); |
1028 | 28.4k | const __m128i data10 = |
1029 | 28.4k | _mm_loadu_si128((__m128i *)(src_ptr + src_stride)); |
1030 | 28.4k | const __m128i data01 = _mm_srli_si128(data00, 1); |
1031 | 28.4k | const __m128i data11 = _mm_srli_si128(data10, 1); |
1032 | 28.4k | const __m128i reg0 = _mm_avg_epu8(data00, data01); |
1033 | 28.4k | const __m128i reg1 = _mm_avg_epu8(data10, data11); |
1034 | 28.4k | _mm_storel_epi64((__m128i *)dst, reg0); |
1035 | 28.4k | _mm_storel_epi64((__m128i *)(dst + dst_stride), reg1); |
1036 | | |
1037 | 28.4k | src_ptr += 2 * src_stride; |
1038 | 28.4k | dst += 2 * dst_stride; |
1039 | 28.4k | h -= 2; |
1040 | 28.4k | } while (h); |
1041 | 9.14k | } else if (w == 16) { |
1042 | 17.0k | do { |
1043 | 17.0k | const __m128i data00 = _mm_loadu_si128((__m128i *)src_ptr); |
1044 | 17.0k | const __m128i data01 = _mm_loadu_si128((__m128i *)(src_ptr + 1)); |
1045 | 17.0k | const __m128i data10 = |
1046 | 17.0k | _mm_loadu_si128((__m128i *)(src_ptr + src_stride)); |
1047 | 17.0k | const __m128i data11 = |
1048 | 17.0k | _mm_loadu_si128((__m128i *)(src_ptr + src_stride + 1)); |
1049 | 17.0k | const __m128i reg0 = _mm_avg_epu8(data00, data01); |
1050 | 17.0k | const __m128i reg1 = _mm_avg_epu8(data10, data11); |
1051 | 17.0k | _mm_storeu_si128((__m128i *)dst, reg0); |
1052 | 17.0k | _mm_storeu_si128((__m128i *)(dst + dst_stride), reg1); |
1053 | | |
1054 | 17.0k | src_ptr += 2 * src_stride; |
1055 | 17.0k | dst += 2 * dst_stride; |
1056 | 17.0k | h -= 2; |
1057 | 17.0k | } while (h); |
1058 | 3.25k | } else if (w == 32) { |
1059 | 23.3k | do { |
1060 | 23.3k | load_avg_store_2tap_32_avx2(src_ptr, dst); |
1061 | 23.3k | src_ptr += src_stride; |
1062 | 23.3k | dst += dst_stride; |
1063 | 23.3k | } while ((--h) > 0); |
1064 | 1.01k | } else if (w == 64) { |
1065 | 13.6k | do { |
1066 | 13.6k | load_avg_store_2tap_32_avx2(src_ptr, dst); |
1067 | 13.6k | load_avg_store_2tap_32_avx2(src_ptr + (SECOND_32_BLK), |
1068 | 13.6k | dst + (SECOND_32_BLK)); |
1069 | 13.6k | src_ptr += src_stride; |
1070 | 13.6k | dst += dst_stride; |
1071 | 13.6k | } while ((--h) > 0); |
1072 | 282 | } else { |
1073 | 129 | assert(w == 128); |
1074 | | |
1075 | 12.8k | do { |
1076 | 12.8k | load_avg_store_2tap_32_avx2(src_ptr, dst); |
1077 | 12.8k | load_avg_store_2tap_32_avx2(src_ptr + (SECOND_32_BLK), |
1078 | 12.8k | dst + (SECOND_32_BLK)); |
1079 | 12.8k | load_avg_store_2tap_32_avx2(src_ptr + (THIRD_32_BLK), |
1080 | 12.8k | dst + (THIRD_32_BLK)); |
1081 | 12.8k | load_avg_store_2tap_32_avx2(src_ptr + (FOURTH_32_BLK), |
1082 | 12.8k | dst + (FOURTH_32_BLK)); |
1083 | 12.8k | src_ptr += src_stride; |
1084 | 12.8k | dst += dst_stride; |
1085 | 12.8k | } while ((--h) > 0); |
1086 | 129 | } |
1087 | 30.9k | } |
1088 | 44.5k | } |
1089 | 748k | } |
1090 | | |
// Public AVX2 entry point for horizontal (x-direction) single-reference
// convolution of 8-bit pixels.
//
// Thin wrapper: all real work is done by av1_convolve_x_sr_general_avx2(),
// which dispatches on the horizontal filter tap count (2/6/8/12 taps) and on
// the block width (w = 2/4/8/16/32/64/128), including a fast pixel-averaging
// path for the 2-tap half-pel case.
//
// src, src_stride:   source pixel buffer and its stride (in pixels).
// dst, dst_stride:   destination pixel buffer and its stride (in pixels).
// w, h:              block width and height.
// filter_params_x:   horizontal interpolation filter parameters.
// subpel_x_qn:       sub-pixel x phase index selecting the filter kernel
//                    (presumably 1/16-pel "qn" units — confirm against the
//                    av1_rtcd prototype).
// conv_params:       convolution rounding parameters (round_0 etc.) consumed
//                    by the general implementation.
void av1_convolve_x_sr_avx2(const uint8_t *src, int32_t src_stride,
                            uint8_t *dst, int32_t dst_stride, int32_t w,
                            int32_t h,
                            const InterpFilterParams *filter_params_x,
                            const int32_t subpel_x_qn,
                            ConvolveParams *conv_params) {
  av1_convolve_x_sr_general_avx2(src, src_stride, dst, dst_stride, w, h,
                                 filter_params_x, subpel_x_qn, conv_params);
}