/src/aom/av1/common/x86/convolve_avx2.c
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) 2017, Alliance for Open Media. All rights reserved. |
3 | | * |
4 | | * This source code is subject to the terms of the BSD 2 Clause License and |
5 | | * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
6 | | * was not distributed with this source code in the LICENSE file, you can |
7 | | * obtain it at www.aomedia.org/license/software. If the Alliance for Open |
8 | | * Media Patent License 1.0 was not distributed with this source code in the |
9 | | * PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
10 | | */ |
11 | | |
12 | | #include <immintrin.h> |
13 | | |
14 | | #include "config/av1_rtcd.h" |
15 | | |
16 | | #if CONFIG_SVT_AV1 |
17 | | #include "third_party/SVT-AV1/convolve_avx2.h" |
18 | | #endif |
19 | | |
20 | | #include "aom_dsp/aom_dsp_common.h" |
21 | | #include "aom_dsp/x86/convolve_avx2.h" |
22 | | #include "aom_dsp/x86/convolve_common_intrin.h" |
23 | | #include "aom_dsp/x86/synonyms.h" |
24 | | |
25 | | static inline void av1_convolve_y_sr_general_avx2( |
26 | | const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, |
27 | 0 | int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn) { |
28 | | // right shift is FILTER_BITS - 1 because the filter |
29 | | // coefficients have already been divided by 2 |
30 | 0 | const int right_shift_bits = (FILTER_BITS - 1); |
31 | 0 | __m128i right_shift = _mm_cvtsi32_si128(right_shift_bits); |
32 | 0 | __m256i right_shift_const = _mm256_set1_epi16((1 << right_shift_bits) >> 1); |
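 | | // right_shift_const == 1 << (right_shift_bits - 1), so adding it before |
 | | // the arithmetic shift rounds to nearest. In scalar terms each output is |
 | | //   out = sat_u8((sum + (1 << (FILTER_BITS - 2))) >> (FILTER_BITS - 1)) |
 | | // where sum uses the halved taps and sat_u8 is the uint8 saturation |
 | | // performed by _mm256_packus_epi16 below. |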
33 | |
34 | 0 | __m256i coeffs[6], s[12]; |
35 | 0 | __m128i d[10]; |
36 | |
37 | 0 | int i, vert_tap = get_filter_tap(filter_params_y, subpel_y_qn); |
38 | |
39 | 0 | if (vert_tap == 6) |
40 | 0 | prepare_coeffs_6t_lowbd(filter_params_y, subpel_y_qn, coeffs); |
41 | 0 | else if (vert_tap == 12) { |
42 | 0 | prepare_coeffs_12taps(filter_params_y, subpel_y_qn, coeffs); |
43 | 0 | } else { |
44 | 0 | prepare_coeffs_lowbd(filter_params_y, subpel_y_qn, coeffs); |
45 | 0 | } |
46 | | |
47 | | // 4-tap vertical filter |
48 | 0 | if (vert_tap == 4) { |
49 | 0 | const int fo_vert = 1; |
50 | 0 | const uint8_t *const src_ptr = src - fo_vert * src_stride; |
51 | 0 | for (int j = 0; j < w; j += 16) { |
52 | 0 | const uint8_t *data = &src_ptr[j]; |
53 | 0 | d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride)); |
54 | 0 | d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride)); |
55 | 0 | d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride)); |
56 | 0 | d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride)); |
57 | 0 | d[4] = _mm_loadu_si128((__m128i *)(data + 4 * src_stride)); |
58 | | |
59 | | // Load lines a and b. Line a to lower 128, line b to upper 128 |
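 | | // Pairing consecutive rows in one YMM register and byte-interleaving |
 | | // adjacent pairs lets the convolve helpers below apply two taps to two |
 | | // output rows with a single _mm256_maddubs_epi16 per coefficient pair. |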
60 | 0 | const __m256i src_01a = _mm256_permute2x128_si256( |
61 | 0 | _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20); |
62 | |
63 | 0 | const __m256i src_12a = _mm256_permute2x128_si256( |
64 | 0 | _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20); |
65 | |
66 | 0 | const __m256i src_23a = _mm256_permute2x128_si256( |
67 | 0 | _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20); |
68 | |
69 | 0 | const __m256i src_34a = _mm256_permute2x128_si256( |
70 | 0 | _mm256_castsi128_si256(d[3]), _mm256_castsi128_si256(d[4]), 0x20); |
71 | |
72 | 0 | s[0] = _mm256_unpacklo_epi8(src_01a, src_12a); |
73 | 0 | s[1] = _mm256_unpacklo_epi8(src_23a, src_34a); |
74 | |
75 | 0 | s[3] = _mm256_unpackhi_epi8(src_01a, src_12a); |
76 | 0 | s[4] = _mm256_unpackhi_epi8(src_23a, src_34a); |
77 | |
78 | 0 | for (i = 0; i < h; i += 2) { |
79 | 0 | data = &src_ptr[i * src_stride + j]; |
80 | 0 | d[5] = _mm_loadu_si128((__m128i *)(data + 5 * src_stride)); |
81 | 0 | const __m256i src_45a = _mm256_permute2x128_si256( |
82 | 0 | _mm256_castsi128_si256(d[4]), _mm256_castsi128_si256(d[5]), 0x20); |
83 | |
84 | 0 | d[4] = _mm_loadu_si128((__m128i *)(data + 6 * src_stride)); |
85 | 0 | const __m256i src_56a = _mm256_permute2x128_si256( |
86 | 0 | _mm256_castsi128_si256(d[5]), _mm256_castsi128_si256(d[4]), 0x20); |
87 | |
88 | 0 | s[2] = _mm256_unpacklo_epi8(src_45a, src_56a); |
89 | 0 | s[5] = _mm256_unpackhi_epi8(src_45a, src_56a); |
90 | |
91 | 0 | const __m256i res_lo = convolve_lowbd_4tap(s, coeffs + 1); |
92 | | /* rounding code */ |
93 | | // shift by F - 1 |
94 | 0 | const __m256i res_16b_lo = _mm256_sra_epi16( |
95 | 0 | _mm256_add_epi16(res_lo, right_shift_const), right_shift); |
96 | | // 8 bit conversion and saturation to uint8 |
97 | 0 | __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo); |
98 | |
99 | 0 | if (w - j > 8) { |
100 | 0 | const __m256i res_hi = convolve_lowbd_4tap(s + 3, coeffs + 1); |
101 | | |
102 | | /* rounding code */ |
103 | | // shift by F - 1 |
104 | 0 | const __m256i res_16b_hi = _mm256_sra_epi16( |
105 | 0 | _mm256_add_epi16(res_hi, right_shift_const), right_shift); |
106 | | // 8 bit conversion and saturation to uint8 |
107 | 0 | __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi); |
108 | |
109 | 0 | __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi); |
110 | |
111 | 0 | const __m128i res_0 = _mm256_castsi256_si128(res_a); |
112 | 0 | const __m128i res_1 = _mm256_extracti128_si256(res_a, 1); |
113 | |
114 | 0 | _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0); |
115 | 0 | _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride], |
116 | 0 | res_1); |
117 | 0 | } else { |
118 | 0 | const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo); |
119 | 0 | const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1); |
120 | 0 | if (w - j > 4) { |
121 | 0 | _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0); |
122 | 0 | _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride], |
123 | 0 | res_1); |
124 | 0 | } else if (w - j > 2) { |
125 | 0 | xx_storel_32(&dst[i * dst_stride + j], res_0); |
126 | 0 | xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1); |
127 | 0 | } else { |
128 | 0 | __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j]; |
129 | 0 | __m128i *const p_1 = |
130 | 0 | (__m128i *)&dst[i * dst_stride + j + dst_stride]; |
131 | 0 | *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0); |
132 | 0 | *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1); |
133 | 0 | } |
134 | 0 | } |
135 | 0 | s[0] = s[1]; |
136 | 0 | s[1] = s[2]; |
137 | |
138 | 0 | s[3] = s[4]; |
139 | 0 | s[4] = s[5]; |
140 | 0 | } |
141 | 0 | } |
142 | 0 | } else if (vert_tap == 6) { |
143 | 0 | const int fo_vert = vert_tap / 2 - 1; |
144 | 0 | const uint8_t *const src_ptr = src - fo_vert * src_stride; |
145 | |
146 | 0 | for (int j = 0; j < w; j += 16) { |
147 | 0 | const uint8_t *data = &src_ptr[j]; |
148 | 0 | __m256i src6; |
149 | |
150 | 0 | d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride)); |
151 | 0 | d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride)); |
152 | 0 | d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride)); |
153 | 0 | d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride)); |
154 | | // Load lines a and b. Line a to lower 128, line b to upper 128 |
155 | 0 | const __m256i src_01a = _mm256_permute2x128_si256( |
156 | 0 | _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20); |
157 | |
158 | 0 | const __m256i src_12a = _mm256_permute2x128_si256( |
159 | 0 | _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20); |
160 | |
161 | 0 | const __m256i src_23a = _mm256_permute2x128_si256( |
162 | 0 | _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20); |
163 | |
164 | 0 | src6 = _mm256_castsi128_si256( |
165 | 0 | _mm_loadu_si128((__m128i *)(data + 4 * src_stride))); |
166 | 0 | const __m256i src_34a = |
167 | 0 | _mm256_permute2x128_si256(_mm256_castsi128_si256(d[3]), src6, 0x20); |
168 | |
169 | 0 | s[0] = _mm256_unpacklo_epi8(src_01a, src_12a); |
170 | 0 | s[1] = _mm256_unpacklo_epi8(src_23a, src_34a); |
171 | |
172 | 0 | s[3] = _mm256_unpackhi_epi8(src_01a, src_12a); |
173 | 0 | s[4] = _mm256_unpackhi_epi8(src_23a, src_34a); |
174 | |
175 | 0 | for (i = 0; i < h; i += 2) { |
176 | 0 | data = &src_ptr[i * src_stride + j]; |
177 | 0 | const __m256i src_45a = _mm256_permute2x128_si256( |
178 | 0 | src6, |
179 | 0 | _mm256_castsi128_si256( |
180 | 0 | _mm_loadu_si128((__m128i *)(data + 5 * src_stride))), |
181 | 0 | 0x20); |
182 | |
183 | 0 | src6 = _mm256_castsi128_si256( |
184 | 0 | _mm_loadu_si128((__m128i *)(data + 6 * src_stride))); |
185 | 0 | const __m256i src_56a = _mm256_permute2x128_si256( |
186 | 0 | _mm256_castsi128_si256( |
187 | 0 | _mm_loadu_si128((__m128i *)(data + 5 * src_stride))), |
188 | 0 | src6, 0x20); |
189 | |
190 | 0 | s[2] = _mm256_unpacklo_epi8(src_45a, src_56a); |
191 | 0 | s[5] = _mm256_unpackhi_epi8(src_45a, src_56a); |
192 | |
193 | 0 | const __m256i res_lo = convolve_lowbd_6tap(s, coeffs); |
194 | | |
195 | | /* rounding code */ |
196 | | // shift by F - 1 |
197 | 0 | const __m256i res_16b_lo = _mm256_sra_epi16( |
198 | 0 | _mm256_add_epi16(res_lo, right_shift_const), right_shift); |
199 | | // 8 bit conversion and saturation to uint8 |
200 | 0 | __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo); |
201 | |
202 | 0 | if (w - j > 8) { |
203 | 0 | const __m256i res_hi = convolve_lowbd_6tap(s + 3, coeffs); |
204 | | |
205 | | /* rounding code */ |
206 | | // shift by F - 1 |
207 | 0 | const __m256i res_16b_hi = _mm256_sra_epi16( |
208 | 0 | _mm256_add_epi16(res_hi, right_shift_const), right_shift); |
209 | | // 8 bit conversion and saturation to uint8 |
210 | 0 | __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi); |
211 | |
212 | 0 | __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi); |
213 | |
214 | 0 | const __m128i res_0 = _mm256_castsi256_si128(res_a); |
215 | 0 | const __m128i res_1 = _mm256_extracti128_si256(res_a, 1); |
216 | |
217 | 0 | _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0); |
218 | 0 | _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride], |
219 | 0 | res_1); |
220 | 0 | } else { |
221 | 0 | const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo); |
222 | 0 | const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1); |
223 | 0 | if (w - j > 4) { |
224 | 0 | _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0); |
225 | 0 | _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride], |
226 | 0 | res_1); |
227 | 0 | } else if (w - j > 2) { |
228 | 0 | xx_storel_32(&dst[i * dst_stride + j], res_0); |
229 | 0 | xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1); |
230 | 0 | } else { |
231 | 0 | __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j]; |
232 | 0 | __m128i *const p_1 = |
233 | 0 | (__m128i *)&dst[i * dst_stride + j + dst_stride]; |
234 | 0 | *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0); |
235 | 0 | *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1); |
236 | 0 | } |
237 | 0 | } |
238 | 0 | s[0] = s[1]; |
239 | 0 | s[1] = s[2]; |
240 | 0 | s[3] = s[4]; |
241 | 0 | s[4] = s[5]; |
242 | 0 | } |
243 | 0 | } |
244 | 0 | } else if (vert_tap == 12) { |
245 | 0 | const int fo_vert = filter_params_y->taps / 2 - 1; |
246 | 0 | const uint8_t *const src_ptr = src - fo_vert * src_stride; |
247 | 0 | const __m256i v_zero = _mm256_setzero_si256(); |
248 | 0 | right_shift = _mm_cvtsi32_si128(FILTER_BITS); |
249 | 0 | right_shift_const = _mm256_set1_epi32((1 << FILTER_BITS) >> 1); |
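 | | // A 12-tap sum would overflow the 8-bit maddubs path, so this branch |
 | | // widens pixels to 16 bits and accumulates in 32 bits; hence the 32-bit |
 | | // rounding constant and the full FILTER_BITS shift (the taps are not |
 | | // halved here). |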
250 | |
251 | 0 | for (int j = 0; j < w; j += 8) { |
252 | 0 | const uint8_t *data = &src_ptr[j]; |
253 | 0 | __m256i src10; |
254 | |
255 | 0 | d[0] = _mm_loadl_epi64((__m128i *)(data + 0 * src_stride)); |
256 | 0 | d[1] = _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)); |
257 | 0 | d[2] = _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)); |
258 | 0 | d[3] = _mm_loadl_epi64((__m128i *)(data + 3 * src_stride)); |
259 | 0 | d[4] = _mm_loadl_epi64((__m128i *)(data + 4 * src_stride)); |
260 | 0 | d[5] = _mm_loadl_epi64((__m128i *)(data + 5 * src_stride)); |
261 | 0 | d[6] = _mm_loadl_epi64((__m128i *)(data + 6 * src_stride)); |
262 | 0 | d[7] = _mm_loadl_epi64((__m128i *)(data + 7 * src_stride)); |
263 | 0 | d[8] = _mm_loadl_epi64((__m128i *)(data + 8 * src_stride)); |
264 | 0 | d[9] = _mm_loadl_epi64((__m128i *)(data + 9 * src_stride)); |
265 | | // Load lines a and b. Line a to lower 128, line b to upper 128 |
266 | 0 | const __m256i src_01a = _mm256_permute2x128_si256( |
267 | 0 | _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20); |
268 | |
269 | 0 | const __m256i src_12a = _mm256_permute2x128_si256( |
270 | 0 | _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20); |
271 | |
272 | 0 | const __m256i src_23a = _mm256_permute2x128_si256( |
273 | 0 | _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20); |
274 | |
275 | 0 | const __m256i src_34a = _mm256_permute2x128_si256( |
276 | 0 | _mm256_castsi128_si256(d[3]), _mm256_castsi128_si256(d[4]), 0x20); |
277 | |
278 | 0 | const __m256i src_45a = _mm256_permute2x128_si256( |
279 | 0 | _mm256_castsi128_si256(d[4]), _mm256_castsi128_si256(d[5]), 0x20); |
280 | |
281 | 0 | const __m256i src_56a = _mm256_permute2x128_si256( |
282 | 0 | _mm256_castsi128_si256(d[5]), _mm256_castsi128_si256(d[6]), 0x20); |
283 | |
284 | 0 | const __m256i src_67a = _mm256_permute2x128_si256( |
285 | 0 | _mm256_castsi128_si256(d[6]), _mm256_castsi128_si256(d[7]), 0x20); |
286 | |
287 | 0 | const __m256i src_78a = _mm256_permute2x128_si256( |
288 | 0 | _mm256_castsi128_si256(d[7]), _mm256_castsi128_si256(d[8]), 0x20); |
289 | |
290 | 0 | const __m256i src_89a = _mm256_permute2x128_si256( |
291 | 0 | _mm256_castsi128_si256(d[8]), _mm256_castsi128_si256(d[9]), 0x20); |
292 | |
293 | 0 | src10 = _mm256_castsi128_si256( |
294 | 0 | _mm_loadl_epi64((__m128i *)(data + 10 * src_stride))); |
295 | 0 | const __m256i src_910a = |
296 | 0 | _mm256_permute2x128_si256(_mm256_castsi128_si256(d[9]), src10, 0x20); |
297 | |
298 | 0 | const __m256i src_01 = _mm256_unpacklo_epi8(src_01a, v_zero); |
299 | 0 | const __m256i src_12 = _mm256_unpacklo_epi8(src_12a, v_zero); |
300 | 0 | const __m256i src_23 = _mm256_unpacklo_epi8(src_23a, v_zero); |
301 | 0 | const __m256i src_34 = _mm256_unpacklo_epi8(src_34a, v_zero); |
302 | 0 | const __m256i src_45 = _mm256_unpacklo_epi8(src_45a, v_zero); |
303 | 0 | const __m256i src_56 = _mm256_unpacklo_epi8(src_56a, v_zero); |
304 | 0 | const __m256i src_67 = _mm256_unpacklo_epi8(src_67a, v_zero); |
305 | 0 | const __m256i src_78 = _mm256_unpacklo_epi8(src_78a, v_zero); |
306 | 0 | const __m256i src_89 = _mm256_unpacklo_epi8(src_89a, v_zero); |
307 | 0 | const __m256i src_910 = _mm256_unpacklo_epi8(src_910a, v_zero); |
308 | |
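 | | // The rows above are now 16-bit; interleaving adjacent rows below gives |
 | | // convolve_12taps pairs it can feed to _mm256_madd_epi16, one tap pair |
 | | // per multiply. |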
309 | 0 | s[0] = _mm256_unpacklo_epi16(src_01, src_12); |
310 | 0 | s[1] = _mm256_unpacklo_epi16(src_23, src_34); |
311 | 0 | s[2] = _mm256_unpacklo_epi16(src_45, src_56); |
312 | 0 | s[3] = _mm256_unpacklo_epi16(src_67, src_78); |
313 | 0 | s[4] = _mm256_unpacklo_epi16(src_89, src_910); |
314 | |
315 | 0 | s[6] = _mm256_unpackhi_epi16(src_01, src_12); |
316 | 0 | s[7] = _mm256_unpackhi_epi16(src_23, src_34); |
317 | 0 | s[8] = _mm256_unpackhi_epi16(src_45, src_56); |
318 | 0 | s[9] = _mm256_unpackhi_epi16(src_67, src_78); |
319 | 0 | s[10] = _mm256_unpackhi_epi16(src_89, src_910); |
320 | |
321 | 0 | for (i = 0; i < h; i += 2) { |
322 | 0 | data = &src_ptr[i * src_stride + j]; |
323 | 0 | const __m256i src_1011a = _mm256_permute2x128_si256( |
324 | 0 | src10, |
325 | 0 | _mm256_castsi128_si256( |
326 | 0 | _mm_loadl_epi64((__m128i *)(data + 11 * src_stride))), |
327 | 0 | 0x20); |
328 | |
329 | 0 | src10 = _mm256_castsi128_si256( |
330 | 0 | _mm_loadl_epi64((__m128i *)(data + 12 * src_stride))); |
331 | |
332 | 0 | const __m256i src_1112a = _mm256_permute2x128_si256( |
333 | 0 | _mm256_castsi128_si256( |
334 | 0 | _mm_loadl_epi64((__m128i *)(data + 11 * src_stride))), |
335 | 0 | src10, 0x20); |
336 | |
337 | 0 | const __m256i src_1011 = _mm256_unpacklo_epi8(src_1011a, v_zero); |
338 | 0 | const __m256i src_1112 = _mm256_unpacklo_epi8(src_1112a, v_zero); |
339 | |
340 | 0 | s[5] = _mm256_unpacklo_epi16(src_1011, src_1112); |
341 | 0 | s[11] = _mm256_unpackhi_epi16(src_1011, src_1112); |
342 | |
343 | 0 | const __m256i res_lo = convolve_12taps(s, coeffs); |
344 | |
345 | 0 | const __m256i res_32b_lo = _mm256_sra_epi32( |
346 | 0 | _mm256_add_epi32(res_lo, right_shift_const), right_shift); |
347 | | // 8 bit conversion and saturation to uint8 |
348 | 0 | __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo); |
349 | 0 | __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo); |
350 | |
351 | 0 | if (w - j > 4) { |
352 | 0 | const __m256i res_hi = convolve_12taps(s + 6, coeffs); |
353 | |
354 | 0 | const __m256i res_32b_hi = _mm256_sra_epi32( |
355 | 0 | _mm256_add_epi32(res_hi, right_shift_const), right_shift); |
356 | 0 | __m256i res_16b_hi = _mm256_packs_epi32(res_32b_hi, res_32b_hi); |
357 | | // 8 bit conversion and saturation to uint8 |
358 | 0 | __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi); |
359 | |
360 | 0 | __m256i res_a = _mm256_unpacklo_epi32(res_8b_lo, res_8b_hi); |
361 | |
362 | 0 | const __m128i res_0 = _mm256_extracti128_si256(res_a, 0); |
363 | 0 | const __m128i res_1 = _mm256_extracti128_si256(res_a, 1); |
364 | |
365 | 0 | _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0); |
366 | 0 | _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride], |
367 | 0 | res_1); |
368 | 0 | } else { |
369 | 0 | const __m128i res_0 = _mm256_extracti128_si256(res_8b_lo, 0); |
370 | 0 | const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1); |
371 | 0 | if (w - j > 2) { |
372 | 0 | *(int *)&dst[i * dst_stride + j] = _mm_cvtsi128_si32(res_0); |
373 | 0 | *(int *)&dst[i * dst_stride + j + dst_stride] = |
374 | 0 | _mm_cvtsi128_si32(res_1); |
375 | 0 | } else { |
376 | 0 | *(uint16_t *)&dst[i * dst_stride + j] = |
377 | 0 | (uint16_t)_mm_cvtsi128_si32(res_0); |
378 | 0 | *(uint16_t *)&dst[i * dst_stride + j + dst_stride] = |
379 | 0 | (uint16_t)_mm_cvtsi128_si32(res_1); |
380 | 0 | } |
381 | 0 | } |
382 | 0 | s[0] = s[1]; |
383 | 0 | s[1] = s[2]; |
384 | 0 | s[2] = s[3]; |
385 | 0 | s[3] = s[4]; |
386 | 0 | s[4] = s[5]; |
387 | |
388 | 0 | s[6] = s[7]; |
389 | 0 | s[7] = s[8]; |
390 | 0 | s[8] = s[9]; |
391 | 0 | s[9] = s[10]; |
392 | 0 | s[10] = s[11]; |
393 | 0 | } |
394 | 0 | } |
395 | 0 | } else { |
396 | 0 | const int fo_vert = filter_params_y->taps / 2 - 1; |
397 | 0 | const uint8_t *const src_ptr = src - fo_vert * src_stride; |
398 | |
399 | 0 | for (int j = 0; j < w; j += 16) { |
400 | 0 | const uint8_t *data = &src_ptr[j]; |
401 | 0 | __m256i src6; |
402 | |
403 | 0 | d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride)); |
404 | 0 | d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride)); |
405 | 0 | d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride)); |
406 | 0 | d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride)); |
407 | 0 | d[4] = _mm_loadu_si128((__m128i *)(data + 4 * src_stride)); |
408 | 0 | d[5] = _mm_loadu_si128((__m128i *)(data + 5 * src_stride)); |
409 | | // Load lines a and b. Line a to lower 128, line b to upper 128 |
410 | 0 | const __m256i src_01a = _mm256_permute2x128_si256( |
411 | 0 | _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20); |
412 | |
413 | 0 | const __m256i src_12a = _mm256_permute2x128_si256( |
414 | 0 | _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20); |
415 | |
416 | 0 | const __m256i src_23a = _mm256_permute2x128_si256( |
417 | 0 | _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20); |
418 | |
419 | 0 | const __m256i src_34a = _mm256_permute2x128_si256( |
420 | 0 | _mm256_castsi128_si256(d[3]), _mm256_castsi128_si256(d[4]), 0x20); |
421 | |
422 | 0 | const __m256i src_45a = _mm256_permute2x128_si256( |
423 | 0 | _mm256_castsi128_si256(d[4]), _mm256_castsi128_si256(d[5]), 0x20); |
424 | |
425 | 0 | src6 = _mm256_castsi128_si256( |
426 | 0 | _mm_loadu_si128((__m128i *)(data + 6 * src_stride))); |
427 | 0 | const __m256i src_56a = |
428 | 0 | _mm256_permute2x128_si256(_mm256_castsi128_si256(d[5]), src6, 0x20); |
429 | |
430 | 0 | s[0] = _mm256_unpacklo_epi8(src_01a, src_12a); |
431 | 0 | s[1] = _mm256_unpacklo_epi8(src_23a, src_34a); |
432 | 0 | s[2] = _mm256_unpacklo_epi8(src_45a, src_56a); |
433 | |
434 | 0 | s[4] = _mm256_unpackhi_epi8(src_01a, src_12a); |
435 | 0 | s[5] = _mm256_unpackhi_epi8(src_23a, src_34a); |
436 | 0 | s[6] = _mm256_unpackhi_epi8(src_45a, src_56a); |
437 | |
438 | 0 | for (i = 0; i < h; i += 2) { |
439 | 0 | data = &src_ptr[i * src_stride + j]; |
440 | 0 | const __m256i src_67a = _mm256_permute2x128_si256( |
441 | 0 | src6, |
442 | 0 | _mm256_castsi128_si256( |
443 | 0 | _mm_loadu_si128((__m128i *)(data + 7 * src_stride))), |
444 | 0 | 0x20); |
445 | |
446 | 0 | src6 = _mm256_castsi128_si256( |
447 | 0 | _mm_loadu_si128((__m128i *)(data + 8 * src_stride))); |
448 | 0 | const __m256i src_78a = _mm256_permute2x128_si256( |
449 | 0 | _mm256_castsi128_si256( |
450 | 0 | _mm_loadu_si128((__m128i *)(data + 7 * src_stride))), |
451 | 0 | src6, 0x20); |
452 | |
453 | 0 | s[3] = _mm256_unpacklo_epi8(src_67a, src_78a); |
454 | 0 | s[7] = _mm256_unpackhi_epi8(src_67a, src_78a); |
455 | |
456 | 0 | const __m256i res_lo = convolve_lowbd(s, coeffs); |
457 | | |
458 | | /* rounding code */ |
459 | | // shift by F - 1 |
460 | 0 | const __m256i res_16b_lo = _mm256_sra_epi16( |
461 | 0 | _mm256_add_epi16(res_lo, right_shift_const), right_shift); |
462 | | // 8 bit conversion and saturation to uint8 |
463 | 0 | __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo); |
464 | |
465 | 0 | if (w - j > 8) { |
466 | 0 | const __m256i res_hi = convolve_lowbd(s + 4, coeffs); |
467 | | |
468 | | /* rounding code */ |
469 | | // shift by F - 1 |
470 | 0 | const __m256i res_16b_hi = _mm256_sra_epi16( |
471 | 0 | _mm256_add_epi16(res_hi, right_shift_const), right_shift); |
472 | | // 8 bit conversion and saturation to uint8 |
473 | 0 | __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi); |
474 | |
475 | 0 | __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi); |
476 | |
477 | 0 | const __m128i res_0 = _mm256_castsi256_si128(res_a); |
478 | 0 | const __m128i res_1 = _mm256_extracti128_si256(res_a, 1); |
479 | |
480 | 0 | _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0); |
481 | 0 | _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride], |
482 | 0 | res_1); |
483 | 0 | } else { |
484 | 0 | const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo); |
485 | 0 | const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1); |
486 | 0 | if (w - j > 4) { |
487 | 0 | _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0); |
488 | 0 | _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride], |
489 | 0 | res_1); |
490 | 0 | } else if (w - j > 2) { |
491 | 0 | xx_storel_32(&dst[i * dst_stride + j], res_0); |
492 | 0 | xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1); |
493 | 0 | } else { |
494 | 0 | __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j]; |
495 | 0 | __m128i *const p_1 = |
496 | 0 | (__m128i *)&dst[i * dst_stride + j + dst_stride]; |
497 | 0 | *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0); |
498 | 0 | *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1); |
499 | 0 | } |
500 | 0 | } |
501 | 0 | s[0] = s[1]; |
502 | 0 | s[1] = s[2]; |
503 | 0 | s[2] = s[3]; |
504 | |
505 | 0 | s[4] = s[5]; |
506 | 0 | s[5] = s[6]; |
507 | 0 | s[6] = s[7]; |
508 | 0 | } |
509 | 0 | } |
510 | 0 | } |
511 | 0 | } |
512 | | |
513 | | void av1_convolve_y_sr_avx2(const uint8_t *src, int32_t src_stride, |
514 | | uint8_t *dst, int32_t dst_stride, int32_t w, |
515 | | int32_t h, |
516 | | const InterpFilterParams *filter_params_y, |
517 | 747k | const int32_t subpel_y_qn) { |
518 | 747k | #if CONFIG_SVT_AV1 |
519 | 747k | const int vert_tap = get_filter_tap(filter_params_y, subpel_y_qn); |
520 | | |
521 | 747k | if (vert_tap == 12) { |
522 | 0 | av1_convolve_y_sr_general_avx2(src, src_stride, dst, dst_stride, w, h, |
523 | 0 | filter_params_y, subpel_y_qn); |
524 | 747k | } else { |
525 | 747k | av1_convolve_y_sr_specialized_avx2(src, src_stride, dst, dst_stride, w, h, |
526 | 747k | filter_params_y, subpel_y_qn); |
527 | 747k | } |
528 | | #else |
529 | | av1_convolve_y_sr_general_avx2(src, src_stride, dst, dst_stride, w, h, |
530 | | filter_params_y, subpel_y_qn); |
531 | | #endif |
532 | 747k | } |
533 | | |
534 | | static inline void av1_convolve_x_sr_general_avx2( |
535 | | const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, |
536 | | int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, |
537 | 0 | ConvolveParams *conv_params) { |
538 | 0 | const int bits = FILTER_BITS - conv_params->round_0; |
539 | 0 | const __m128i round_shift = _mm_cvtsi32_si128(bits); |
540 | 0 | __m256i round_0_const = |
541 | 0 | _mm256_set1_epi16((1 << (conv_params->round_0 - 1)) >> 1); |
542 | 0 | __m128i round_0_shift = _mm_cvtsi32_si128(conv_params->round_0 - 1); |
543 | 0 | __m256i round_const = _mm256_set1_epi16((1 << bits) >> 1); |
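 | | // Two rounding stages: first by (round_0 - 1), one less than round_0 |
 | | // because the lowbd coefficients are pre-halved, then by |
 | | // bits = FILTER_BITS - round_0, for a total shift of FILTER_BITS - 1. |
 | | // Scalar sketch: out = (((sum + R0) >> (round_0 - 1)) + R1) >> bits, |
 | | // with R0 = 1 << (round_0 - 2) and R1 = 1 << (bits - 1). |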
544 | 0 | int i, horiz_tap = get_filter_tap(filter_params_x, subpel_x_qn); |
545 | |
546 | 0 | assert(bits >= 0); |
547 | 0 | assert((FILTER_BITS - conv_params->round_1) >= 0 || |
548 | 0 | ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS)); |
549 | 0 | assert(conv_params->round_0 > 0); |
550 | | |
551 | 0 | __m256i coeffs[6], filt[4]; |
552 | 0 | filt[0] = _mm256_load_si256((__m256i const *)(filt_global_avx2)); |
553 | 0 | filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); |
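 | | // filt_global_avx2 holds the byte-shuffle masks used by the |
 | | // convolve_lowbd_x helpers: each 32-byte mask gathers overlapping |
 | | // two-pixel windows, advancing by two pixels per mask. |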
554 | |
555 | 0 | if (horiz_tap == 6) |
556 | 0 | prepare_coeffs_6t_lowbd(filter_params_x, subpel_x_qn, coeffs); |
557 | 0 | else if (horiz_tap == 12) { |
558 | 0 | prepare_coeffs_12taps(filter_params_x, subpel_x_qn, coeffs); |
559 | 0 | } else { |
560 | 0 | prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs); |
561 | 0 | } |
562 | | |
563 | | // 4-tap horizontal filter |
564 | 0 | if (horiz_tap == 4) { |
565 | 0 | const int fo_horiz = 1; |
566 | 0 | const uint8_t *const src_ptr = src - fo_horiz; |
567 | 0 | if (w <= 8) { |
568 | 0 | for (i = 0; i < h; i += 2) { |
569 | 0 | const __m256i data = _mm256_permute2x128_si256( |
570 | 0 | _mm256_castsi128_si256( |
571 | 0 | _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))), |
572 | 0 | _mm256_castsi128_si256(_mm_loadu_si128( |
573 | 0 | (__m128i *)(&src_ptr[i * src_stride + src_stride]))), |
574 | 0 | 0x20); |
575 | |
576 | 0 | __m256i res_16b = convolve_lowbd_x_4tap(data, coeffs + 1, filt); |
577 | |
578 | 0 | res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const), |
579 | 0 | round_0_shift); |
580 | |
581 | 0 | res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const), |
582 | 0 | round_shift); |
583 | | |
584 | | /* rounding code */ |
585 | | // 8 bit conversion and saturation to uint8 |
586 | 0 | __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b); |
587 | |
588 | 0 | const __m128i res_0 = _mm256_castsi256_si128(res_8b); |
589 | 0 | const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1); |
590 | |
591 | 0 | if (w > 4) { |
592 | 0 | _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0); |
593 | 0 | _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1); |
594 | 0 | } else if (w > 2) { |
595 | 0 | xx_storel_32(&dst[i * dst_stride], res_0); |
596 | 0 | xx_storel_32(&dst[i * dst_stride + dst_stride], res_1); |
597 | 0 | } else { |
598 | 0 | __m128i *const p_0 = (__m128i *)&dst[i * dst_stride]; |
599 | 0 | __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + dst_stride]; |
600 | 0 | *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0); |
601 | 0 | *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1); |
602 | 0 | } |
603 | 0 | } |
604 | 0 | } else { |
605 | 0 | for (i = 0; i < h; ++i) { |
606 | 0 | for (int j = 0; j < w; j += 16) { |
607 | | // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17 |
608 | | // 18 19 20 21 22 23 |
609 | 0 | const __m256i data = _mm256_inserti128_si256( |
610 | 0 | _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]), |
611 | 0 | _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]), |
612 | 0 | 1); |
613 | |
614 | 0 | __m256i res_16b = convolve_lowbd_x_4tap(data, coeffs + 1, filt); |
615 | |
616 | 0 | res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const), |
617 | 0 | round_0_shift); |
618 | |
619 | 0 | res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const), |
620 | 0 | round_shift); |
621 | | |
622 | | /* rounding code */ |
623 | | // 8 bit conversion and saturation to uint8 |
624 | 0 | __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b); |
625 | | |
626 | | // Store values into the destination buffer |
627 | | // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 |
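 | | // 216 == _MM_SHUFFLE(3, 1, 2, 0): reorder the 64-bit lanes to 0, 2, 1, 3 |
 | | // so the valid low qword of each 128-bit lane lands in the low half. |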
628 | 0 | res_8b = _mm256_permute4x64_epi64(res_8b, 216); |
629 | 0 | __m128i res = _mm256_castsi256_si128(res_8b); |
630 | 0 | _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res); |
631 | 0 | } |
632 | 0 | } |
633 | 0 | } |
634 | 0 | } else if (horiz_tap == 6) { |
635 | 0 | const int fo_horiz = horiz_tap / 2 - 1; |
636 | 0 | const uint8_t *const src_ptr = src - fo_horiz; |
637 | 0 | filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); |
638 | 0 | filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); |
639 | |
640 | 0 | if (w <= 8) { |
641 | 0 | for (i = 0; i < h; i += 2) { |
642 | 0 | const __m256i data = _mm256_permute2x128_si256( |
643 | 0 | _mm256_castsi128_si256( |
644 | 0 | _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))), |
645 | 0 | _mm256_castsi128_si256(_mm_loadu_si128( |
646 | 0 | (__m128i *)(&src_ptr[i * src_stride + src_stride]))), |
647 | 0 | 0x20); |
648 | |
649 | 0 | __m256i res_16b = convolve_lowbd_x_6tap(data, coeffs, filt); |
650 | |
651 | 0 | res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const), |
652 | 0 | round_0_shift); |
653 | |
654 | 0 | res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const), |
655 | 0 | round_shift); |
656 | | |
657 | | /* rounding code */ |
658 | | // 8 bit conversion and saturation to uint8 |
659 | 0 | __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b); |
660 | |
661 | 0 | const __m128i res_0 = _mm256_castsi256_si128(res_8b); |
662 | 0 | const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1); |
663 | 0 | if (w > 4) { |
664 | 0 | _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0); |
665 | 0 | _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1); |
666 | 0 | } else if (w > 2) { |
667 | 0 | xx_storel_32(&dst[i * dst_stride], res_0); |
668 | 0 | xx_storel_32(&dst[i * dst_stride + dst_stride], res_1); |
669 | 0 | } else { |
670 | 0 | __m128i *const p_0 = (__m128i *)&dst[i * dst_stride]; |
671 | 0 | __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + dst_stride]; |
672 | 0 | *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0); |
673 | 0 | *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1); |
674 | 0 | } |
675 | 0 | } |
676 | 0 | } else { |
677 | 0 | for (i = 0; i < h; ++i) { |
678 | 0 | for (int j = 0; j < w; j += 16) { |
679 | | // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17 |
680 | | // 18 19 20 21 22 23 |
681 | 0 | const __m256i data = _mm256_inserti128_si256( |
682 | 0 | _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]), |
683 | 0 | _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]), |
684 | 0 | 1); |
685 | |
686 | 0 | __m256i res_16b = convolve_lowbd_x_6tap(data, coeffs, filt); |
687 | |
688 | 0 | res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const), |
689 | 0 | round_0_shift); |
690 | |
691 | 0 | res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const), |
692 | 0 | round_shift); |
693 | | |
694 | | /* rounding code */ |
695 | | // 8 bit conversion and saturation to uint8 |
696 | 0 | __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b); |
697 | | |
698 | | // Store values into the destination buffer |
699 | | // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 |
700 | 0 | res_8b = _mm256_permute4x64_epi64(res_8b, 216); |
701 | 0 | __m128i res = _mm256_castsi256_si128(res_8b); |
702 | 0 | _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res); |
703 | 0 | } |
704 | 0 | } |
705 | 0 | } |
706 | 0 | } else if (horiz_tap == 12) { |
707 | 0 | const int fo_horiz = filter_params_x->taps / 2 - 1; |
708 | 0 | const uint8_t *const src_ptr = src - fo_horiz; |
709 | 0 | const __m256i v_zero = _mm256_setzero_si256(); |
710 | 0 | round_0_const = _mm256_set1_epi32((1 << (conv_params->round_0)) >> 1); |
711 | 0 | round_const = _mm256_set1_epi32((1 << bits) >> 1); |
712 | 0 | round_0_shift = _mm_cvtsi32_si128(conv_params->round_0); |
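 | | // As in the vertical 12-tap path, sums are 32-bit here and the taps are |
 | | // not pre-halved, so the first stage shifts by the full round_0 and uses |
 | | // 32-bit rounding constants. |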
713 | 0 | __m256i s[6]; |
714 | |
715 | 0 | if (w <= 4) { |
716 | 0 | for (i = 0; i < h; i += 2) { |
717 | 0 | const __m256i data = _mm256_permute2x128_si256( |
718 | 0 | _mm256_castsi128_si256( |
719 | 0 | _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))), |
720 | 0 | _mm256_castsi128_si256(_mm_loadu_si128( |
721 | 0 | (__m128i *)(&src_ptr[i * src_stride + src_stride]))), |
722 | 0 | 0x20); |
723 | | // row0 0..7 row1 0..7 |
724 | 0 | const __m256i s_16lo = _mm256_unpacklo_epi8(data, v_zero); |
725 | | // row0 8..F row1 8..F |
726 | 0 | const __m256i s_16hi = _mm256_unpackhi_epi8(data, v_zero); |
727 | | |
728 | | // row0 00 00 01 01 .. 03 03 row1 00 00 01 01 .. 03 03 |
729 | 0 | const __m256i s_lolo = _mm256_unpacklo_epi16(s_16lo, s_16lo); |
730 | | // row0 04 04 .. 07 07 row1 04 04 .. 07 07 |
731 | 0 | const __m256i s_lohi = _mm256_unpackhi_epi16(s_16lo, s_16lo); |
732 | | |
733 | | // row0 08 08 09 09 .. 0B 0B row1 08 08 09 09 .. 0B 0B |
734 | 0 | const __m256i s_hilo = _mm256_unpacklo_epi16(s_16hi, s_16hi); |
735 | | // row0 0C 0C .. 0F 0F row1 0C 0C .. 0F 0F |
736 | 0 | const __m256i s_hihi = _mm256_unpackhi_epi16(s_16hi, s_16hi); |
737 | | |
738 | | // 00 01 01 02 02 03 03 04 10 11 11 12 12 13 13 14 |
739 | 0 | s[0] = _mm256_alignr_epi8(s_lohi, s_lolo, 2); |
740 | | // 02 03 03 04 04 05 05 06 12 13 13 14 14 15 15 16 |
741 | 0 | s[1] = _mm256_alignr_epi8(s_lohi, s_lolo, 10); |
742 | | // 04 05 05 06 06 07 07 08 14 15 15 16 16 17 17 18 |
743 | 0 | s[2] = _mm256_alignr_epi8(s_hilo, s_lohi, 2); |
744 | | // 06 07 07 08 08 09 09 0A 16 17 17 18 18 19 19 1A |
745 | 0 | s[3] = _mm256_alignr_epi8(s_hilo, s_lohi, 10); |
746 | | // 08 09 09 0A 0A 0B 0B 0C 18 19 19 1A 1A 1B 1B 1C |
747 | 0 | s[4] = _mm256_alignr_epi8(s_hihi, s_hilo, 2); |
748 | | // 0A 0B 0B 0C 0C 0D 0D 0E 1A 1B 1B 1C 1C 1D 1D 1E |
749 | 0 | s[5] = _mm256_alignr_epi8(s_hihi, s_hilo, 10); |
750 | |
751 | 0 | const __m256i res_lo = convolve_12taps(s, coeffs); |
752 | |
753 | 0 | __m256i res_32b_lo = _mm256_sra_epi32( |
754 | 0 | _mm256_add_epi32(res_lo, round_0_const), round_0_shift); |
755 | | |
756 | | // 00 01 02 03 10 11 12 13 |
757 | 0 | res_32b_lo = _mm256_sra_epi32(_mm256_add_epi32(res_32b_lo, round_const), |
758 | 0 | round_shift); |
759 | | // 8 bit conversion and saturation to uint8 |
760 | | // 00 01 02 03 00 01 02 03 10 11 12 13 10 11 12 13 |
761 | 0 | __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo); |
762 | | // 00 01 02 03 00 01 02 03 00 01 02 03 00 01 02 03 |
763 | | // 10 11 12 13 10 11 12 13 10 11 12 13 10 11 12 13 |
764 | 0 | __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo); |
765 | | |
766 | | // 00 01 02 03 00 01 02 03 00 01 02 03 00 01 02 03 |
767 | 0 | const __m128i res_0 = _mm256_extracti128_si256(res_8b_lo, 0); |
768 | | // 10 11 12 13 10 11 12 13 10 11 12 13 10 11 12 13 |
769 | 0 | const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1); |
770 | 0 | if (w > 2) { |
771 | | // 00 01 02 03 |
772 | 0 | *(int *)&dst[i * dst_stride] = _mm_cvtsi128_si32(res_0); |
773 | | // 10 11 12 13 |
774 | 0 | *(int *)&dst[i * dst_stride + dst_stride] = _mm_cvtsi128_si32(res_1); |
775 | 0 | } else { |
776 | | // 00 01 |
777 | 0 | *(uint16_t *)&dst[i * dst_stride] = |
778 | 0 | (uint16_t)_mm_cvtsi128_si32(res_0); |
779 | | // 10 11 |
780 | 0 | *(uint16_t *)&dst[i * dst_stride + dst_stride] = |
781 | 0 | (uint16_t)_mm_cvtsi128_si32(res_1); |
782 | 0 | } |
783 | 0 | } |
784 | 0 | } else { |
785 | 0 | for (i = 0; i < h; i++) { |
786 | 0 | for (int j = 0; j < w; j += 8) { |
787 | 0 | const __m256i data = _mm256_permute2x128_si256( |
788 | 0 | _mm256_castsi128_si256( |
789 | 0 | _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride + j]))), |
790 | 0 | _mm256_castsi128_si256(_mm_loadu_si128( |
791 | 0 | (__m128i *)(&src_ptr[i * src_stride + j + 4]))), |
792 | 0 | 0x20); |
793 | | // row0 0..7 4..B |
794 | 0 | const __m256i s_16lo = _mm256_unpacklo_epi8(data, v_zero); |
795 | | // row0 8..F C..13 |
796 | 0 | const __m256i s_16hi = _mm256_unpackhi_epi8(data, v_zero); |
797 | | |
798 | | // row0 00 00 01 01 .. 03 03 04 04 05 05 .. 07 07 |
799 | 0 | const __m256i s_lolo = _mm256_unpacklo_epi16(s_16lo, s_16lo); |
800 | | // row0 04 04 .. 07 07 08 08 .. 0B 0B |
801 | 0 | const __m256i s_lohi = _mm256_unpackhi_epi16(s_16lo, s_16lo); |
802 | | |
803 | | // row0 08 08 09 09 .. 0B 0B 0C 0C 0D 0D .. 0F 0F |
804 | 0 | const __m256i s_hilo = _mm256_unpacklo_epi16(s_16hi, s_16hi); |
805 | | // row0 0C 0C 0D 0D .. 0F 0F 10 10 11 11 .. 13 13 |
806 | 0 | const __m256i s_hihi = _mm256_unpackhi_epi16(s_16hi, s_16hi); |
807 | |
808 | 0 | s[0] = _mm256_alignr_epi8(s_lohi, s_lolo, 2); |
809 | 0 | s[1] = _mm256_alignr_epi8(s_lohi, s_lolo, 10); |
810 | 0 | s[2] = _mm256_alignr_epi8(s_hilo, s_lohi, 2); |
811 | 0 | s[3] = _mm256_alignr_epi8(s_hilo, s_lohi, 10); |
812 | 0 | s[4] = _mm256_alignr_epi8(s_hihi, s_hilo, 2); |
813 | 0 | s[5] = _mm256_alignr_epi8(s_hihi, s_hilo, 10); |
814 | |
815 | 0 | const __m256i res_lo = convolve_12taps(s, coeffs); |
816 | |
817 | 0 | __m256i res_32b_lo = _mm256_sra_epi32( |
818 | 0 | _mm256_add_epi32(res_lo, round_0_const), round_0_shift); |
819 | |
820 | 0 | res_32b_lo = _mm256_sra_epi32( |
821 | 0 | _mm256_add_epi32(res_32b_lo, round_const), round_shift); |
822 | | // 8 bit conversion and saturation to uint8 |
823 | 0 | __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo); |
824 | 0 | __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo); |
825 | 0 | const __m128i res_0 = _mm256_extracti128_si256(res_8b_lo, 0); |
826 | 0 | const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1); |
827 | 0 | *(int *)&dst[i * dst_stride + j] = _mm_cvtsi128_si32(res_0); |
828 | 0 | *(int *)&dst[i * dst_stride + j + 4] = _mm_cvtsi128_si32(res_1); |
829 | 0 | } |
830 | 0 | } |
831 | 0 | } |
832 | 0 | } else { |
833 | 0 | const int fo_horiz = filter_params_x->taps / 2 - 1; |
834 | 0 | const uint8_t *const src_ptr = src - fo_horiz; |
835 | 0 | filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); |
836 | 0 | filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); |
837 | |
838 | 0 | if (w <= 8) { |
839 | 0 | for (i = 0; i < h; i += 2) { |
840 | 0 | const __m256i data = _mm256_permute2x128_si256( |
841 | 0 | _mm256_castsi128_si256( |
842 | 0 | _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))), |
843 | 0 | _mm256_castsi128_si256(_mm_loadu_si128( |
844 | 0 | (__m128i *)(&src_ptr[i * src_stride + src_stride]))), |
845 | 0 | 0x20); |
846 | |
847 | 0 | __m256i res_16b = convolve_lowbd_x(data, coeffs, filt); |
848 | |
849 | 0 | res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const), |
850 | 0 | round_0_shift); |
851 | |
852 | 0 | res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const), |
853 | 0 | round_shift); |
854 | | |
855 | | /* rounding code */ |
856 | | // 8 bit conversion and saturation to uint8 |
857 | 0 | __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b); |
858 | |
859 | 0 | const __m128i res_0 = _mm256_castsi256_si128(res_8b); |
860 | 0 | const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1); |
861 | 0 | if (w > 4) { |
862 | 0 | _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0); |
863 | 0 | _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1); |
864 | 0 | } else if (w > 2) { |
865 | 0 | xx_storel_32(&dst[i * dst_stride], res_0); |
866 | 0 | xx_storel_32(&dst[i * dst_stride + dst_stride], res_1); |
867 | 0 | } else { |
868 | 0 | __m128i *const p_0 = (__m128i *)&dst[i * dst_stride]; |
869 | 0 | __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + dst_stride]; |
870 | 0 | *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0); |
871 | 0 | *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1); |
872 | 0 | } |
873 | 0 | } |
874 | 0 | } else { |
875 | 0 | for (i = 0; i < h; ++i) { |
876 | 0 | for (int j = 0; j < w; j += 16) { |
877 | | // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17 |
878 | | // 18 19 20 21 22 23 |
879 | 0 | const __m256i data = _mm256_inserti128_si256( |
880 | 0 | _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]), |
881 | 0 | _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]), |
882 | 0 | 1); |
883 | |
884 | 0 | __m256i res_16b = convolve_lowbd_x(data, coeffs, filt); |
885 | |
886 | 0 | res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const), |
887 | 0 | round_0_shift); |
888 | |
889 | 0 | res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const), |
890 | 0 | round_shift); |
891 | | |
892 | | /* rounding code */ |
893 | | // 8 bit conversion and saturation to uint8 |
894 | 0 | __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b); |
895 | | |
896 | | // Store values into the destination buffer |
897 | | // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 |
898 | 0 | res_8b = _mm256_permute4x64_epi64(res_8b, 216); |
899 | 0 | __m128i res = _mm256_castsi256_si128(res_8b); |
900 | 0 | _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res); |
901 | 0 | } |
902 | 0 | } |
903 | 0 | } |
904 | 0 | } |
905 | 0 | } |
906 | | |
907 | | void av1_convolve_x_sr_avx2(const uint8_t *src, int32_t src_stride, |
908 | | uint8_t *dst, int32_t dst_stride, int32_t w, |
909 | | int32_t h, |
910 | | const InterpFilterParams *filter_params_x, |
911 | | const int32_t subpel_x_qn, |
912 | 828k | ConvolveParams *conv_params) { |
913 | 828k | #if CONFIG_SVT_AV1 |
914 | 828k | const int horz_tap = get_filter_tap(filter_params_x, subpel_x_qn); |
915 | | |
916 | 828k | if (horz_tap == 12) { |
917 | 0 | av1_convolve_x_sr_general_avx2(src, src_stride, dst, dst_stride, w, h, |
918 | 0 | filter_params_x, subpel_x_qn, conv_params); |
919 | 828k | } else { |
920 | 828k | av1_convolve_x_sr_specialized_avx2(src, src_stride, dst, dst_stride, w, h, |
921 | 828k | filter_params_x, subpel_x_qn, |
922 | 828k | conv_params); |
923 | 828k | } |
924 | | #else |
925 | | av1_convolve_x_sr_general_avx2(src, src_stride, dst, dst_stride, w, h, |
926 | | filter_params_x, subpel_x_qn, conv_params); |
927 | | #endif |
928 | 828k | } |