/src/aom/av1/common/x86/convolve_avx2.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright (c) 2017, Alliance for Open Media. All rights reserved |
3 | | * |
4 | | * This source code is subject to the terms of the BSD 2 Clause License and |
5 | | * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
6 | | * was not distributed with this source code in the LICENSE file, you can |
7 | | * obtain it at www.aomedia.org/license/software. If the Alliance for Open |
8 | | * Media Patent License 1.0 was not distributed with this source code in the |
9 | | * PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
10 | | */ |
11 | | |
12 | | #include <immintrin.h> |
13 | | |
14 | | #include "config/av1_rtcd.h" |
15 | | |
16 | | #include "third_party/SVT-AV1/convolve_avx2.h" |
17 | | |
18 | | #include "aom_dsp/aom_dsp_common.h" |
19 | | #include "aom_dsp/x86/convolve_avx2.h" |
20 | | #include "aom_dsp/x86/convolve_common_intrin.h" |
21 | | #include "aom_dsp/x86/synonyms.h" |
22 | | |
// General vertical-only (y) single-reference convolution for low bit-depth
// pixels, AVX2. Supports 4-, 6-, 8- and 12-tap vertical filters; the tap
// count is derived from the subpel phase via get_filter_tap(). The block is
// processed in column strips (16 pixels wide, 8 for the 12-tap path), two
// output rows per loop iteration, with the unpacked source rows kept in a
// rotating register pipeline s[]. Results are rounded and saturated to 8 bits.
static AOM_INLINE void av1_convolve_y_sr_general_avx2(
    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn) {
  // right shift is F-1 because we are already dividing
  // filter co-efficients by 2
  const int right_shift_bits = (FILTER_BITS - 1);
  __m128i right_shift = _mm_cvtsi32_si128(right_shift_bits);
  __m256i right_shift_const = _mm256_set1_epi16((1 << right_shift_bits) >> 1);

  __m256i coeffs[6], s[12];  // coeffs: filter pairs; s: unpacked row pipeline
  __m128i d[10];             // raw 128-bit row loads

  int i, vert_tap = get_filter_tap(filter_params_y, subpel_y_qn);

  // Pick the coefficient layout matching the kernel that will run below.
  if (vert_tap == 6)
    prepare_coeffs_6t_lowbd(filter_params_y, subpel_y_qn, coeffs);
  else if (vert_tap == 12) {
    prepare_coeffs_12taps(filter_params_y, subpel_y_qn, coeffs);
  } else {
    prepare_coeffs_lowbd(filter_params_y, subpel_y_qn, coeffs);
  }

  // vert_filt as 4 tap
  if (vert_tap == 4) {
    const int fo_vert = 1;  // filter offset: one row above the output row
    const uint8_t *const src_ptr = src - fo_vert * src_stride;
    for (int j = 0; j < w; j += 16) {
      const uint8_t *data = &src_ptr[j];
      // Prime the pipeline with the first five source rows.
      d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
      d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
      d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
      d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
      d[4] = _mm_loadu_si128((__m128i *)(data + 4 * src_stride));

      // Load lines a and b. Line a to lower 128, line b to upper 128
      const __m256i src_01a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20);

      const __m256i src_12a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20);

      const __m256i src_23a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20);

      const __m256i src_34a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[3]), _mm256_castsi128_si256(d[4]), 0x20);

      // Interleave adjacent rows byte-wise: s[0..2] hold the low 8 pixels,
      // s[3..5] the high 8 pixels of the 16-wide strip.
      s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
      s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);

      s[3] = _mm256_unpackhi_epi8(src_01a, src_12a);
      s[4] = _mm256_unpackhi_epi8(src_23a, src_34a);

      for (i = 0; i < h; i += 2) {
        data = &src_ptr[i * src_stride + j];
        d[5] = _mm_loadu_si128((__m128i *)(data + 5 * src_stride));
        const __m256i src_45a = _mm256_permute2x128_si256(
            _mm256_castsi128_si256(d[4]), _mm256_castsi128_si256(d[5]), 0x20);

        // d[4] doubles as the carry register: after the i += 2 advance, the
        // row loaded at data+6 here becomes row data+4 of the next iteration.
        d[4] = _mm_loadu_si128((__m128i *)(data + 6 * src_stride));
        const __m256i src_56a = _mm256_permute2x128_si256(
            _mm256_castsi128_si256(d[5]), _mm256_castsi128_si256(d[4]), 0x20);

        s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
        s[5] = _mm256_unpackhi_epi8(src_45a, src_56a);

        // coeffs + 1 skips the unused outer tap pair of the 8-tap layout.
        const __m256i res_lo = convolve_lowbd_4tap(s, coeffs + 1);
        /* rounding code */
        // shift by F - 1
        const __m256i res_16b_lo = _mm256_sra_epi16(
            _mm256_add_epi16(res_lo, right_shift_const), right_shift);
        // 8 bit conversion and saturation to uint8
        __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);

        if (w - j > 8) {
          // Full 16-wide strip: also filter the high 8 pixels.
          const __m256i res_hi = convolve_lowbd_4tap(s + 3, coeffs + 1);

          /* rounding code */
          // shift by F - 1
          const __m256i res_16b_hi = _mm256_sra_epi16(
              _mm256_add_epi16(res_hi, right_shift_const), right_shift);
          // 8 bit conversion and saturation to uint8
          __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);

          __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi);

          const __m128i res_0 = _mm256_castsi256_si128(res_a);
          const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);

          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0);
          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
                           res_1);
        } else {
          // Tail strip: store 8, 4 or 2 pixels per row.
          const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo);
          const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
          if (w - j > 4) {
            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
                             res_1);
          } else if (w - j > 2) {
            xx_storel_32(&dst[i * dst_stride + j], res_0);
            xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1);
          } else {
            __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
            __m128i *const p_1 =
                (__m128i *)&dst[i * dst_stride + j + dst_stride];
            *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);
            *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);
          }
        }
        // Rotate the pipeline down by one row pair for the next iteration.
        s[0] = s[1];
        s[1] = s[2];

        s[3] = s[4];
        s[4] = s[5];
      }
    }
  } else if (vert_tap == 6) {
    const int fo_vert = vert_tap / 2 - 1;
    const uint8_t *const src_ptr = src - fo_vert * src_stride;

    for (int j = 0; j < w; j += 16) {
      const uint8_t *data = &src_ptr[j];
      __m256i src6;  // carries the last loaded row across iterations

      // Prime the pipeline with the first five source rows.
      d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
      d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
      d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
      d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
      // Load lines a and b. Line a to lower 128, line b to upper 128
      const __m256i src_01a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20);

      const __m256i src_12a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20);

      const __m256i src_23a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20);

      src6 = _mm256_castsi128_si256(
          _mm_loadu_si128((__m128i *)(data + 4 * src_stride)));
      const __m256i src_34a =
          _mm256_permute2x128_si256(_mm256_castsi128_si256(d[3]), src6, 0x20);

      s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
      s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);

      s[3] = _mm256_unpackhi_epi8(src_01a, src_12a);
      s[4] = _mm256_unpackhi_epi8(src_23a, src_34a);

      for (i = 0; i < h; i += 2) {
        data = &src_ptr[i * src_stride + j];
        const __m256i src_45a = _mm256_permute2x128_si256(
            src6,
            _mm256_castsi128_si256(
                _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
            0x20);

        src6 = _mm256_castsi128_si256(
            _mm_loadu_si128((__m128i *)(data + 6 * src_stride)));
        // NOTE(review): row data+5 is loaded a second time here rather than
        // reused from src_45a above; redundant but harmless.
        const __m256i src_56a = _mm256_permute2x128_si256(
            _mm256_castsi128_si256(
                _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
            src6, 0x20);

        s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
        s[5] = _mm256_unpackhi_epi8(src_45a, src_56a);

        const __m256i res_lo = convolve_lowbd_6tap(s, coeffs);

        /* rounding code */
        // shift by F - 1
        const __m256i res_16b_lo = _mm256_sra_epi16(
            _mm256_add_epi16(res_lo, right_shift_const), right_shift);
        // 8 bit conversion and saturation to uint8
        __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);

        if (w - j > 8) {
          // Full 16-wide strip: also filter the high 8 pixels.
          const __m256i res_hi = convolve_lowbd_6tap(s + 3, coeffs);

          /* rounding code */
          // shift by F - 1
          const __m256i res_16b_hi = _mm256_sra_epi16(
              _mm256_add_epi16(res_hi, right_shift_const), right_shift);
          // 8 bit conversion and saturation to uint8
          __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);

          __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi);

          const __m128i res_0 = _mm256_castsi256_si128(res_a);
          const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);

          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0);
          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
                           res_1);
        } else {
          // Tail strip: store 8, 4 or 2 pixels per row.
          const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo);
          const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
          if (w - j > 4) {
            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
                             res_1);
          } else if (w - j > 2) {
            xx_storel_32(&dst[i * dst_stride + j], res_0);
            xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1);
          } else {
            __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
            __m128i *const p_1 =
                (__m128i *)&dst[i * dst_stride + j + dst_stride];
            *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);
            *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);
          }
        }
        // Rotate the pipeline down by one row pair for the next iteration.
        s[0] = s[1];
        s[1] = s[2];
        s[3] = s[4];
        s[4] = s[5];
      }
    }
  } else if (vert_tap == 12) {  // vert_tap == 12
    const int fo_vert = filter_params_y->taps / 2 - 1;
    const uint8_t *const src_ptr = src - fo_vert * src_stride;
    const __m256i v_zero = _mm256_setzero_si256();
    // The 12-tap kernel accumulates in 32 bits (coefficients are not halved
    // here), so the final rounding shift is the full FILTER_BITS.
    right_shift = _mm_cvtsi32_si128(FILTER_BITS);
    right_shift_const = _mm256_set1_epi32((1 << FILTER_BITS) >> 1);

    // 8-wide strips: rows are widened to 16 bits, halving the lane budget.
    for (int j = 0; j < w; j += 8) {
      const uint8_t *data = &src_ptr[j];
      __m256i src10;  // carries the last loaded row across iterations

      // Prime the pipeline with the first ten source rows (8 bytes each).
      d[0] = _mm_loadl_epi64((__m128i *)(data + 0 * src_stride));
      d[1] = _mm_loadl_epi64((__m128i *)(data + 1 * src_stride));
      d[2] = _mm_loadl_epi64((__m128i *)(data + 2 * src_stride));
      d[3] = _mm_loadl_epi64((__m128i *)(data + 3 * src_stride));
      d[4] = _mm_loadl_epi64((__m128i *)(data + 4 * src_stride));
      d[5] = _mm_loadl_epi64((__m128i *)(data + 5 * src_stride));
      d[6] = _mm_loadl_epi64((__m128i *)(data + 6 * src_stride));
      d[7] = _mm_loadl_epi64((__m128i *)(data + 7 * src_stride));
      d[8] = _mm_loadl_epi64((__m128i *)(data + 8 * src_stride));
      d[9] = _mm_loadl_epi64((__m128i *)(data + 9 * src_stride));
      // Load lines a and b. Line a to lower 128, line b to upper 128
      const __m256i src_01a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20);

      const __m256i src_12a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20);

      const __m256i src_23a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20);

      const __m256i src_34a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[3]), _mm256_castsi128_si256(d[4]), 0x20);

      const __m256i src_45a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[4]), _mm256_castsi128_si256(d[5]), 0x20);

      const __m256i src_56a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[5]), _mm256_castsi128_si256(d[6]), 0x20);

      const __m256i src_67a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[6]), _mm256_castsi128_si256(d[7]), 0x20);

      const __m256i src_78a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[7]), _mm256_castsi128_si256(d[8]), 0x20);

      const __m256i src_89a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[8]), _mm256_castsi128_si256(d[9]), 0x20);

      src10 = _mm256_castsi128_si256(
          _mm_loadl_epi64((__m128i *)(data + 10 * src_stride)));
      const __m256i src_910a =
          _mm256_permute2x128_si256(_mm256_castsi128_si256(d[9]), src10, 0x20);

      // Zero-extend bytes to 16 bits.
      const __m256i src_01 = _mm256_unpacklo_epi8(src_01a, v_zero);
      const __m256i src_12 = _mm256_unpacklo_epi8(src_12a, v_zero);
      const __m256i src_23 = _mm256_unpacklo_epi8(src_23a, v_zero);
      const __m256i src_34 = _mm256_unpacklo_epi8(src_34a, v_zero);
      const __m256i src_45 = _mm256_unpacklo_epi8(src_45a, v_zero);
      const __m256i src_56 = _mm256_unpacklo_epi8(src_56a, v_zero);
      const __m256i src_67 = _mm256_unpacklo_epi8(src_67a, v_zero);
      const __m256i src_78 = _mm256_unpacklo_epi8(src_78a, v_zero);
      const __m256i src_89 = _mm256_unpacklo_epi8(src_89a, v_zero);
      const __m256i src_910 = _mm256_unpacklo_epi8(src_910a, v_zero);

      // s[0..5] hold the low 4 pixels, s[6..11] the high 4 pixels.
      s[0] = _mm256_unpacklo_epi16(src_01, src_12);
      s[1] = _mm256_unpacklo_epi16(src_23, src_34);
      s[2] = _mm256_unpacklo_epi16(src_45, src_56);
      s[3] = _mm256_unpacklo_epi16(src_67, src_78);
      s[4] = _mm256_unpacklo_epi16(src_89, src_910);

      s[6] = _mm256_unpackhi_epi16(src_01, src_12);
      s[7] = _mm256_unpackhi_epi16(src_23, src_34);
      s[8] = _mm256_unpackhi_epi16(src_45, src_56);
      s[9] = _mm256_unpackhi_epi16(src_67, src_78);
      s[10] = _mm256_unpackhi_epi16(src_89, src_910);

      for (i = 0; i < h; i += 2) {
        data = &src_ptr[i * src_stride + j];
        const __m256i src_1011a = _mm256_permute2x128_si256(
            src10,
            _mm256_castsi128_si256(
                _mm_loadl_epi64((__m128i *)(data + 11 * src_stride))),
            0x20);

        src10 = _mm256_castsi128_si256(
            _mm_loadl_epi64((__m128i *)(data + 12 * src_stride)));

        // NOTE(review): row data+11 is loaded a second time here rather than
        // reused from src_1011a above; redundant but harmless.
        const __m256i src_1112a = _mm256_permute2x128_si256(
            _mm256_castsi128_si256(
                _mm_loadl_epi64((__m128i *)(data + 11 * src_stride))),
            src10, 0x20);

        const __m256i src_1011 = _mm256_unpacklo_epi8(src_1011a, v_zero);
        const __m256i src_1112 = _mm256_unpacklo_epi8(src_1112a, v_zero);

        s[5] = _mm256_unpacklo_epi16(src_1011, src_1112);
        s[11] = _mm256_unpackhi_epi16(src_1011, src_1112);

        const __m256i res_lo = convolve_12taps(s, coeffs);

        // Round the 32-bit accumulators, then narrow in two steps.
        const __m256i res_32b_lo = _mm256_sra_epi32(
            _mm256_add_epi32(res_lo, right_shift_const), right_shift);
        // 8 bit conversion and saturation to uint8
        __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo);
        __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);

        if (w - j > 4) {
          // Full 8-wide strip: also filter the high 4 pixels.
          const __m256i res_hi = convolve_12taps(s + 6, coeffs);

          const __m256i res_32b_hi = _mm256_sra_epi32(
              _mm256_add_epi32(res_hi, right_shift_const), right_shift);
          __m256i res_16b_hi = _mm256_packs_epi32(res_32b_hi, res_32b_hi);
          // 8 bit conversion and saturation to uint8
          __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);

          __m256i res_a = _mm256_unpacklo_epi32(res_8b_lo, res_8b_hi);

          const __m128i res_0 = _mm256_extracti128_si256(res_a, 0);
          const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);

          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
                           res_1);
        } else {
          // Tail strip: store 4 or 2 pixels per row.
          const __m128i res_0 = _mm256_extracti128_si256(res_8b_lo, 0);
          const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
          if (w - j > 2) {
            *(int *)&dst[i * dst_stride + j] = _mm_cvtsi128_si32(res_0);
            *(int *)&dst[i * dst_stride + j + dst_stride] =
                _mm_cvtsi128_si32(res_1);
          } else {
            *(uint16_t *)&dst[i * dst_stride + j] =
                (uint16_t)_mm_cvtsi128_si32(res_0);
            *(uint16_t *)&dst[i * dst_stride + j + dst_stride] =
                (uint16_t)_mm_cvtsi128_si32(res_1);
          }
        }
        // Rotate the pipeline down by one row pair for the next iteration.
        s[0] = s[1];
        s[1] = s[2];
        s[2] = s[3];
        s[3] = s[4];
        s[4] = s[5];

        s[6] = s[7];
        s[7] = s[8];
        s[8] = s[9];
        s[9] = s[10];
        s[10] = s[11];
      }
    }
  } else {
    // Default: 8-tap vertical filter.
    const int fo_vert = filter_params_y->taps / 2 - 1;
    const uint8_t *const src_ptr = src - fo_vert * src_stride;

    for (int j = 0; j < w; j += 16) {
      const uint8_t *data = &src_ptr[j];
      __m256i src6;  // carries the last loaded row across iterations

      // Prime the pipeline with the first seven source rows.
      d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
      d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
      d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
      d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
      d[4] = _mm_loadu_si128((__m128i *)(data + 4 * src_stride));
      d[5] = _mm_loadu_si128((__m128i *)(data + 5 * src_stride));
      // Load lines a and b. Line a to lower 128, line b to upper 128
      const __m256i src_01a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20);

      const __m256i src_12a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20);

      const __m256i src_23a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20);

      const __m256i src_34a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[3]), _mm256_castsi128_si256(d[4]), 0x20);

      const __m256i src_45a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[4]), _mm256_castsi128_si256(d[5]), 0x20);

      src6 = _mm256_castsi128_si256(
          _mm_loadu_si128((__m128i *)(data + 6 * src_stride)));
      const __m256i src_56a =
          _mm256_permute2x128_si256(_mm256_castsi128_si256(d[5]), src6, 0x20);

      // s[0..3] hold the low 8 pixels, s[4..7] the high 8 pixels.
      s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
      s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);
      s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);

      s[4] = _mm256_unpackhi_epi8(src_01a, src_12a);
      s[5] = _mm256_unpackhi_epi8(src_23a, src_34a);
      s[6] = _mm256_unpackhi_epi8(src_45a, src_56a);

      for (i = 0; i < h; i += 2) {
        data = &src_ptr[i * src_stride + j];
        const __m256i src_67a = _mm256_permute2x128_si256(
            src6,
            _mm256_castsi128_si256(
                _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
            0x20);

        src6 = _mm256_castsi128_si256(
            _mm_loadu_si128((__m128i *)(data + 8 * src_stride)));
        // NOTE(review): row data+7 is loaded a second time here rather than
        // reused from src_67a above; redundant but harmless.
        const __m256i src_78a = _mm256_permute2x128_si256(
            _mm256_castsi128_si256(
                _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
            src6, 0x20);

        s[3] = _mm256_unpacklo_epi8(src_67a, src_78a);
        s[7] = _mm256_unpackhi_epi8(src_67a, src_78a);

        const __m256i res_lo = convolve_lowbd(s, coeffs);

        /* rounding code */
        // shift by F - 1
        const __m256i res_16b_lo = _mm256_sra_epi16(
            _mm256_add_epi16(res_lo, right_shift_const), right_shift);
        // 8 bit conversion and saturation to uint8
        __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);

        if (w - j > 8) {
          // Full 16-wide strip: also filter the high 8 pixels.
          const __m256i res_hi = convolve_lowbd(s + 4, coeffs);

          /* rounding code */
          // shift by F - 1
          const __m256i res_16b_hi = _mm256_sra_epi16(
              _mm256_add_epi16(res_hi, right_shift_const), right_shift);
          // 8 bit conversion and saturation to uint8
          __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);

          __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi);

          const __m128i res_0 = _mm256_castsi256_si128(res_a);
          const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);

          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0);
          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
                           res_1);
        } else {
          // Tail strip: store 8, 4 or 2 pixels per row.
          const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo);
          const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
          if (w - j > 4) {
            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
                             res_1);
          } else if (w - j > 2) {
            xx_storel_32(&dst[i * dst_stride + j], res_0);
            xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1);
          } else {
            __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
            __m128i *const p_1 =
                (__m128i *)&dst[i * dst_stride + j + dst_stride];
            *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);
            *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);
          }
        }
        // Rotate the pipeline down by one row pair for the next iteration.
        s[0] = s[1];
        s[1] = s[2];
        s[2] = s[3];

        s[4] = s[5];
        s[5] = s[6];
        s[6] = s[7];
      }
    }
  }
}
510 | | |
511 | | void av1_convolve_y_sr_avx2(const uint8_t *src, int32_t src_stride, |
512 | | uint8_t *dst, int32_t dst_stride, int32_t w, |
513 | | int32_t h, |
514 | | const InterpFilterParams *filter_params_y, |
515 | 1.39M | const int32_t subpel_y_q4) { |
516 | 1.39M | const int vert_tap = get_filter_tap(filter_params_y, subpel_y_q4); |
517 | | |
518 | 1.39M | if (vert_tap == 12) { |
519 | 0 | av1_convolve_y_sr_general_avx2(src, src_stride, dst, dst_stride, w, h, |
520 | 0 | filter_params_y, subpel_y_q4); |
521 | 1.39M | } else { |
522 | 1.39M | av1_convolve_y_sr_specialized_avx2(src, src_stride, dst, dst_stride, w, h, |
523 | 1.39M | filter_params_y, subpel_y_q4); |
524 | 1.39M | } |
525 | 1.39M | } |
526 | | |
527 | | static AOM_INLINE void av1_convolve_x_sr_general_avx2( |
528 | | const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, |
529 | | int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, |
530 | 0 | ConvolveParams *conv_params) { |
531 | 0 | const int bits = FILTER_BITS - conv_params->round_0; |
532 | 0 | const __m128i round_shift = _mm_cvtsi32_si128(bits); |
533 | 0 | __m256i round_0_const = |
534 | 0 | _mm256_set1_epi16((1 << (conv_params->round_0 - 1)) >> 1); |
535 | 0 | __m128i round_0_shift = _mm_cvtsi32_si128(conv_params->round_0 - 1); |
536 | 0 | __m256i round_const = _mm256_set1_epi16((1 << bits) >> 1); |
537 | 0 | int i, horiz_tap = get_filter_tap(filter_params_x, subpel_x_qn); |
538 | |
|
539 | 0 | assert(bits >= 0); |
540 | 0 | assert((FILTER_BITS - conv_params->round_1) >= 0 || |
541 | 0 | ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS)); |
542 | 0 | assert(conv_params->round_0 > 0); |
543 | | |
544 | 0 | __m256i coeffs[6], filt[4]; |
545 | 0 | filt[0] = _mm256_load_si256((__m256i const *)(filt_global_avx2)); |
546 | 0 | filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); |
547 | |
|
548 | 0 | if (horiz_tap == 6) |
549 | 0 | prepare_coeffs_6t_lowbd(filter_params_x, subpel_x_qn, coeffs); |
550 | 0 | else if (horiz_tap == 12) { |
551 | 0 | prepare_coeffs_12taps(filter_params_x, subpel_x_qn, coeffs); |
552 | 0 | } else { |
553 | 0 | prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs); |
554 | 0 | } |
555 | | |
556 | | // horz_filt as 4 tap |
557 | 0 | if (horiz_tap == 4) { |
558 | 0 | const int fo_horiz = 1; |
559 | 0 | const uint8_t *const src_ptr = src - fo_horiz; |
560 | 0 | if (w <= 8) { |
561 | 0 | for (i = 0; i < h; i += 2) { |
562 | 0 | const __m256i data = _mm256_permute2x128_si256( |
563 | 0 | _mm256_castsi128_si256( |
564 | 0 | _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))), |
565 | 0 | _mm256_castsi128_si256(_mm_loadu_si128( |
566 | 0 | (__m128i *)(&src_ptr[i * src_stride + src_stride]))), |
567 | 0 | 0x20); |
568 | |
|
569 | 0 | __m256i res_16b = convolve_lowbd_x_4tap(data, coeffs + 1, filt); |
570 | |
|
571 | 0 | res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const), |
572 | 0 | round_0_shift); |
573 | |
|
574 | 0 | res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const), |
575 | 0 | round_shift); |
576 | | |
577 | | /* rounding code */ |
578 | | // 8 bit conversion and saturation to uint8 |
579 | 0 | __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b); |
580 | |
|
581 | 0 | const __m128i res_0 = _mm256_castsi256_si128(res_8b); |
582 | 0 | const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1); |
583 | |
|
584 | 0 | if (w > 4) { |
585 | 0 | _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0); |
586 | 0 | _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1); |
587 | 0 | } else if (w > 2) { |
588 | 0 | xx_storel_32(&dst[i * dst_stride], res_0); |
589 | 0 | xx_storel_32(&dst[i * dst_stride + dst_stride], res_1); |
590 | 0 | } else { |
591 | 0 | __m128i *const p_0 = (__m128i *)&dst[i * dst_stride]; |
592 | 0 | __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + dst_stride]; |
593 | 0 | *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0); |
594 | 0 | *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1); |
595 | 0 | } |
596 | 0 | } |
597 | 0 | } else { |
598 | 0 | for (i = 0; i < h; ++i) { |
599 | 0 | for (int j = 0; j < w; j += 16) { |
600 | | // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17 |
601 | | // 18 19 20 21 22 23 |
602 | 0 | const __m256i data = _mm256_inserti128_si256( |
603 | 0 | _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]), |
604 | 0 | _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]), |
605 | 0 | 1); |
606 | |
|
607 | 0 | __m256i res_16b = convolve_lowbd_x_4tap(data, coeffs + 1, filt); |
608 | |
|
609 | 0 | res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const), |
610 | 0 | round_0_shift); |
611 | |
|
612 | 0 | res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const), |
613 | 0 | round_shift); |
614 | | |
615 | | /* rounding code */ |
616 | | // 8 bit conversion and saturation to uint8 |
617 | 0 | __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b); |
618 | | |
619 | | // Store values into the destination buffer |
620 | | // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 |
621 | 0 | res_8b = _mm256_permute4x64_epi64(res_8b, 216); |
622 | 0 | __m128i res = _mm256_castsi256_si128(res_8b); |
623 | 0 | _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res); |
624 | 0 | } |
625 | 0 | } |
626 | 0 | } |
627 | 0 | } else if (horiz_tap == 6) { |
628 | 0 | const int fo_horiz = horiz_tap / 2 - 1; |
629 | 0 | const uint8_t *const src_ptr = src - fo_horiz; |
630 | 0 | filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); |
631 | 0 | filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); |
632 | |
|
633 | 0 | if (w <= 8) { |
634 | 0 | for (i = 0; i < h; i += 2) { |
635 | 0 | const __m256i data = _mm256_permute2x128_si256( |
636 | 0 | _mm256_castsi128_si256( |
637 | 0 | _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))), |
638 | 0 | _mm256_castsi128_si256(_mm_loadu_si128( |
639 | 0 | (__m128i *)(&src_ptr[i * src_stride + src_stride]))), |
640 | 0 | 0x20); |
641 | |
|
642 | 0 | __m256i res_16b = convolve_lowbd_x_6tap(data, coeffs, filt); |
643 | |
|
644 | 0 | res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const), |
645 | 0 | round_0_shift); |
646 | |
|
647 | 0 | res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const), |
648 | 0 | round_shift); |
649 | | |
650 | | /* rounding code */ |
651 | | // 8 bit conversion and saturation to uint8 |
652 | 0 | __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b); |
653 | |
|
654 | 0 | const __m128i res_0 = _mm256_castsi256_si128(res_8b); |
655 | 0 | const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1); |
656 | 0 | if (w > 4) { |
657 | 0 | _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0); |
658 | 0 | _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1); |
659 | 0 | } else if (w > 2) { |
660 | 0 | xx_storel_32(&dst[i * dst_stride], res_0); |
661 | 0 | xx_storel_32(&dst[i * dst_stride + dst_stride], res_1); |
662 | 0 | } else { |
663 | 0 | __m128i *const p_0 = (__m128i *)&dst[i * dst_stride]; |
664 | 0 | __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + dst_stride]; |
665 | 0 | *(uint16_t *)p_0 = _mm_cvtsi128_si32(res_0); |
666 | 0 | *(uint16_t *)p_1 = _mm_cvtsi128_si32(res_1); |
667 | 0 | } |
668 | 0 | } |
669 | 0 | } else { |
670 | 0 | for (i = 0; i < h; ++i) { |
671 | 0 | for (int j = 0; j < w; j += 16) { |
672 | | // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17 |
673 | | // 18 19 20 21 22 23 |
674 | 0 | const __m256i data = _mm256_inserti128_si256( |
675 | 0 | _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]), |
676 | 0 | _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]), |
677 | 0 | 1); |
678 | |
|
679 | 0 | __m256i res_16b = convolve_lowbd_x_6tap(data, coeffs, filt); |
680 | |
|
681 | 0 | res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const), |
682 | 0 | round_0_shift); |
683 | |
|
684 | 0 | res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const), |
685 | 0 | round_shift); |
686 | | |
687 | | /* rounding code */ |
688 | | // 8 bit conversion and saturation to uint8 |
689 | 0 | __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b); |
690 | | |
691 | | // Store values into the destination buffer |
692 | | // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 |
693 | 0 | res_8b = _mm256_permute4x64_epi64(res_8b, 216); |
694 | 0 | __m128i res = _mm256_castsi256_si128(res_8b); |
695 | 0 | _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res); |
696 | 0 | } |
697 | 0 | } |
698 | 0 | } |
699 | 0 | } else if (horiz_tap == 12) { // horiz_tap == 12 |
700 | 0 | const int fo_horiz = filter_params_x->taps / 2 - 1; |
701 | 0 | const uint8_t *const src_ptr = src - fo_horiz; |
702 | 0 | const __m256i v_zero = _mm256_setzero_si256(); |
703 | 0 | round_0_const = _mm256_set1_epi32((1 << (conv_params->round_0)) >> 1); |
704 | 0 | round_const = _mm256_set1_epi32((1 << bits) >> 1); |
705 | 0 | round_0_shift = _mm_cvtsi32_si128(conv_params->round_0); |
706 | 0 | __m256i s[6]; |
707 | |
|
708 | 0 | if (w <= 4) { |
709 | 0 | for (i = 0; i < h; i += 2) { |
710 | 0 | const __m256i data = _mm256_permute2x128_si256( |
711 | 0 | _mm256_castsi128_si256( |
712 | 0 | _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))), |
713 | 0 | _mm256_castsi128_si256(_mm_loadu_si128( |
714 | 0 | (__m128i *)(&src_ptr[i * src_stride + src_stride]))), |
715 | 0 | 0x20); |
716 | | // row0 0..7 row1 0..7 |
717 | 0 | const __m256i s_16lo = _mm256_unpacklo_epi8(data, v_zero); |
718 | | // row0 8..F row1 8..F |
719 | 0 | const __m256i s_16hi = _mm256_unpackhi_epi8(data, v_zero); |
720 | | |
721 | | // row0 00 00 01 01 .. 03 03 row1 00 00 01 01 .. 03 03 |
722 | 0 | const __m256i s_lolo = _mm256_unpacklo_epi16(s_16lo, s_16lo); |
723 | | // row0 04 04 .. 07 07 row1 04 04 .. 07 07 |
724 | 0 | const __m256i s_lohi = _mm256_unpackhi_epi16(s_16lo, s_16lo); |
725 | | |
726 | | // row0 08 08 09 09 .. 0B 0B row1 08 08 09 09 .. 0B 0B |
727 | 0 | const __m256i s_hilo = _mm256_unpacklo_epi16(s_16hi, s_16hi); |
728 | | // row0 0C 0C .. 0F 0F row1 0C 0C .. 0F 0F |
729 | 0 | const __m256i s_hihi = _mm256_unpackhi_epi16(s_16hi, s_16hi); |
730 | | |
731 | | // 00 01 01 02 02 03 03 04 10 11 11 12 12 13 13 14 |
732 | 0 | s[0] = _mm256_alignr_epi8(s_lohi, s_lolo, 2); |
733 | | // 02 03 03 04 04 05 05 06 12 13 13 14 14 15 15 16 |
734 | 0 | s[1] = _mm256_alignr_epi8(s_lohi, s_lolo, 10); |
735 | | // 04 05 05 06 06 07 07 08 14 15 15 16 16 17 17 18 |
736 | 0 | s[2] = _mm256_alignr_epi8(s_hilo, s_lohi, 2); |
737 | | // 06 07 07 08 08 09 09 0A 16 17 17 18 18 19 19 1A |
738 | 0 | s[3] = _mm256_alignr_epi8(s_hilo, s_lohi, 10); |
739 | | // 08 09 09 0A 0A 0B 0B 0C 18 19 19 1A 1A 1B 1B 1C |
740 | 0 | s[4] = _mm256_alignr_epi8(s_hihi, s_hilo, 2); |
741 | | // 0A 0B 0B 0C 0C 0D 0D 0E 1A 1B 1B 1C 1C 1D 1D 1E |
742 | 0 | s[5] = _mm256_alignr_epi8(s_hihi, s_hilo, 10); |
743 | |
|
744 | 0 | const __m256i res_lo = convolve_12taps(s, coeffs); |
745 | |
|
746 | 0 | __m256i res_32b_lo = _mm256_sra_epi32( |
747 | 0 | _mm256_add_epi32(res_lo, round_0_const), round_0_shift); |
748 | | |
749 | | // 00 01 02 03 10 12 13 14 |
750 | 0 | res_32b_lo = _mm256_sra_epi32(_mm256_add_epi32(res_32b_lo, round_const), |
751 | 0 | round_shift); |
752 | | // 8 bit conversion and saturation to uint8 |
753 | | // 00 01 02 03 00 01 02 03 10 11 12 13 10 11 12 13 |
754 | 0 | __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo); |
755 | | // 00 01 02 03 00 01 02 03 00 01 02 03 00 01 02 03 |
756 | | // 10 11 12 13 10 11 12 13 10 11 12 13 10 11 12 13 |
757 | 0 | __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo); |
758 | | |
759 | | // 00 01 02 03 00 01 02 03 00 01 02 03 00 01 02 03 |
760 | 0 | const __m128i res_0 = _mm256_extracti128_si256(res_8b_lo, 0); |
761 | | // 10 11 12 13 10 11 12 13 10 11 12 13 10 11 12 13 |
762 | 0 | const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1); |
763 | 0 | if (w > 2) { |
764 | | // 00 01 02 03 |
765 | 0 | *(int *)&dst[i * dst_stride] = _mm_cvtsi128_si32(res_0); |
766 | | // 10 11 12 13 |
767 | 0 | *(int *)&dst[i * dst_stride + dst_stride] = _mm_cvtsi128_si32(res_1); |
768 | 0 | } else { |
769 | | // 00 01 |
770 | 0 | *(uint16_t *)&dst[i * dst_stride] = |
771 | 0 | (uint16_t)_mm_cvtsi128_si32(res_0); |
772 | | // 10 11 |
773 | 0 | *(uint16_t *)&dst[i * dst_stride + dst_stride] = |
774 | 0 | (uint16_t)_mm_cvtsi128_si32(res_1); |
775 | 0 | } |
776 | 0 | } |
777 | 0 | } else { |
778 | 0 | for (i = 0; i < h; i++) { |
779 | 0 | for (int j = 0; j < w; j += 8) { |
780 | 0 | const __m256i data = _mm256_permute2x128_si256( |
781 | 0 | _mm256_castsi128_si256( |
782 | 0 | _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride + j]))), |
783 | 0 | _mm256_castsi128_si256(_mm_loadu_si128( |
784 | 0 | (__m128i *)(&src_ptr[i * src_stride + j + 4]))), |
785 | 0 | 0x20); |
786 | | // row0 0..7 4..B |
787 | 0 | const __m256i s_16lo = _mm256_unpacklo_epi8(data, v_zero); |
788 | | // row0 8..F C..13 |
789 | 0 | const __m256i s_16hi = _mm256_unpackhi_epi8(data, v_zero); |
790 | | |
791 | | // row0 00 00 01 01 .. 03 03 04 04 05 05 .. 07 07 |
792 | 0 | const __m256i s_lolo = _mm256_unpacklo_epi16(s_16lo, s_16lo); |
793 | | // row0 04 04 .. 07 07 08 08 .. 0B 0B |
794 | 0 | const __m256i s_lohi = _mm256_unpackhi_epi16(s_16lo, s_16lo); |
795 | | |
796 | | // row0 08 08 09 09 .. 0B 0B 0C 0C 0D 0D .. 0F 0F |
797 | 0 | const __m256i s_hilo = _mm256_unpacklo_epi16(s_16hi, s_16hi); |
798 | | // row0 0C 0C 0D 0D .. 0F 0F 10 10 11 11 .. 13 13 |
799 | 0 | const __m256i s_hihi = _mm256_unpackhi_epi16(s_16hi, s_16hi); |
800 | |
|
801 | 0 | s[0] = _mm256_alignr_epi8(s_lohi, s_lolo, 2); |
802 | 0 | s[1] = _mm256_alignr_epi8(s_lohi, s_lolo, 10); |
803 | 0 | s[2] = _mm256_alignr_epi8(s_hilo, s_lohi, 2); |
804 | 0 | s[3] = _mm256_alignr_epi8(s_hilo, s_lohi, 10); |
805 | 0 | s[4] = _mm256_alignr_epi8(s_hihi, s_hilo, 2); |
806 | 0 | s[5] = _mm256_alignr_epi8(s_hihi, s_hilo, 10); |
807 | |
|
808 | 0 | const __m256i res_lo = convolve_12taps(s, coeffs); |
809 | |
|
810 | 0 | __m256i res_32b_lo = _mm256_sra_epi32( |
811 | 0 | _mm256_add_epi32(res_lo, round_0_const), round_0_shift); |
812 | |
|
813 | 0 | res_32b_lo = _mm256_sra_epi32( |
814 | 0 | _mm256_add_epi32(res_32b_lo, round_const), round_shift); |
815 | | // 8 bit conversion and saturation to uint8 |
816 | 0 | __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo); |
817 | 0 | __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo); |
818 | 0 | const __m128i res_0 = _mm256_extracti128_si256(res_8b_lo, 0); |
819 | 0 | const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1); |
820 | 0 | *(int *)&dst[i * dst_stride + j] = _mm_cvtsi128_si32(res_0); |
821 | 0 | *(int *)&dst[i * dst_stride + j + 4] = _mm_cvtsi128_si32(res_1); |
822 | 0 | } |
823 | 0 | } |
824 | 0 | } |
825 | 0 | } else { |
826 | 0 | const int fo_horiz = filter_params_x->taps / 2 - 1; |
827 | 0 | const uint8_t *const src_ptr = src - fo_horiz; |
828 | 0 | filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); |
829 | 0 | filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); |
830 | |
|
831 | 0 | if (w <= 8) { |
832 | 0 | for (i = 0; i < h; i += 2) { |
833 | 0 | const __m256i data = _mm256_permute2x128_si256( |
834 | 0 | _mm256_castsi128_si256( |
835 | 0 | _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))), |
836 | 0 | _mm256_castsi128_si256(_mm_loadu_si128( |
837 | 0 | (__m128i *)(&src_ptr[i * src_stride + src_stride]))), |
838 | 0 | 0x20); |
839 | |
|
840 | 0 | __m256i res_16b = convolve_lowbd_x(data, coeffs, filt); |
841 | |
|
842 | 0 | res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const), |
843 | 0 | round_0_shift); |
844 | |
|
845 | 0 | res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const), |
846 | 0 | round_shift); |
847 | | |
848 | | /* rounding code */ |
849 | | // 8 bit conversion and saturation to uint8 |
850 | 0 | __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b); |
851 | |
|
852 | 0 | const __m128i res_0 = _mm256_castsi256_si128(res_8b); |
853 | 0 | const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1); |
854 | 0 | if (w > 4) { |
855 | 0 | _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0); |
856 | 0 | _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1); |
857 | 0 | } else if (w > 2) { |
858 | 0 | xx_storel_32(&dst[i * dst_stride], res_0); |
859 | 0 | xx_storel_32(&dst[i * dst_stride + dst_stride], res_1); |
860 | 0 | } else { |
861 | 0 | __m128i *const p_0 = (__m128i *)&dst[i * dst_stride]; |
862 | 0 | __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + dst_stride]; |
863 | 0 | *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0); |
864 | 0 | *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1); |
865 | 0 | } |
866 | 0 | } |
867 | 0 | } else { |
868 | 0 | for (i = 0; i < h; ++i) { |
869 | 0 | for (int j = 0; j < w; j += 16) { |
870 | | // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17 |
871 | | // 18 19 20 21 22 23 |
872 | 0 | const __m256i data = _mm256_inserti128_si256( |
873 | 0 | _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]), |
874 | 0 | _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]), |
875 | 0 | 1); |
876 | |
|
877 | 0 | __m256i res_16b = convolve_lowbd_x(data, coeffs, filt); |
878 | |
|
879 | 0 | res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const), |
880 | 0 | round_0_shift); |
881 | |
|
882 | 0 | res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const), |
883 | 0 | round_shift); |
884 | | |
885 | | /* rounding code */ |
886 | | // 8 bit conversion and saturation to uint8 |
887 | 0 | __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b); |
888 | | |
889 | | // Store values into the destination buffer |
890 | | // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 |
891 | 0 | res_8b = _mm256_permute4x64_epi64(res_8b, 216); |
892 | 0 | __m128i res = _mm256_castsi256_si128(res_8b); |
893 | 0 | _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res); |
894 | 0 | } |
895 | 0 | } |
896 | 0 | } |
897 | 0 | } |
898 | 0 | } |
899 | | |
900 | | void av1_convolve_x_sr_avx2(const uint8_t *src, int32_t src_stride, |
901 | | uint8_t *dst, int32_t dst_stride, int32_t w, |
902 | | int32_t h, |
903 | | const InterpFilterParams *filter_params_x, |
904 | | const int32_t subpel_x_q4, |
905 | 1.47M | ConvolveParams *conv_params) { |
906 | 1.47M | const int horz_tap = get_filter_tap(filter_params_x, subpel_x_q4); |
907 | | |
908 | 1.47M | if (horz_tap == 12) { |
909 | 0 | av1_convolve_x_sr_general_avx2(src, src_stride, dst, dst_stride, w, h, |
910 | 0 | filter_params_x, subpel_x_q4, conv_params); |
911 | 1.47M | } else { |
912 | 1.47M | av1_convolve_x_sr_specialized_avx2(src, src_stride, dst, dst_stride, w, h, |
913 | 1.47M | filter_params_x, subpel_x_q4, |
914 | 1.47M | conv_params); |
915 | 1.47M | } |
916 | 1.47M | } |