/src/aom/third_party/SVT-AV1/convolve_2d_avx2.h
/*
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef THIRD_PARTY_SVT_AV1_CONVOLVE_2D_AVX2_H_
#define THIRD_PARTY_SVT_AV1_CONVOLVE_2D_AVX2_H_

#include "convolve_avx2.h"
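
// The kernels below implement the 2D ("xy") sub-pixel convolution in two
// passes: a horizontal pass filters the source rows into the 16-bit
// intermediate buffer im_block, and a vertical pass filters im_block down
// into the 8-bit dst. Each horizontal kernel starts tap / 2 - 1 pixels to
// the left of src (offsets 0, 1, 2 and 3 for the 2-, 4-, 6- and 8-tap
// filters).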
static void convolve_2d_sr_hor_2tap_avx2(
    const uint8_t *const src, const int32_t src_stride, const int32_t w,
    const int32_t h, const InterpFilterParams *const filter_params_x,
    const int32_t subpel_x_q4, int16_t *const im_block) {
  const uint8_t *src_ptr = src;
  int32_t y = h;
  int16_t *im = im_block;

  if (w <= 8) {
    __m128i coeffs_128;

    prepare_half_coeffs_2tap_ssse3(filter_params_x, subpel_x_q4, &coeffs_128);

    if (w == 2) {
      do {
        const __m128i r =
            x_convolve_2tap_2x2_sse4_1(src_ptr, src_stride, &coeffs_128);
        xy_x_round_store_2x2_sse2(r, im);
        src_ptr += 2 * src_stride;
        im += 2 * 2;
        y -= 2;
      } while (y);
    } else if (w == 4) {
      do {
        const __m128i r =
            x_convolve_2tap_4x2_ssse3(src_ptr, src_stride, &coeffs_128);
        xy_x_round_store_4x2_sse2(r, im);
        src_ptr += 2 * src_stride;
        im += 2 * 4;
        y -= 2;
      } while (y);
    } else {
      assert(w == 8);

      do {
        __m128i r[2];

        x_convolve_2tap_8x2_ssse3(src_ptr, src_stride, &coeffs_128, r);
        xy_x_round_store_8x2_sse2(r, im);
        src_ptr += 2 * src_stride;
        im += 2 * 8;
        y -= 2;
      } while (y);
    }
  } else {
    __m256i coeffs_256;

    prepare_half_coeffs_2tap_avx2(filter_params_x, subpel_x_q4, &coeffs_256);

    if (w == 16) {
      do {
        __m256i r[2];

        x_convolve_2tap_16x2_avx2(src_ptr, src_stride, &coeffs_256, r);
        xy_x_round_store_32_avx2(r, im);
        src_ptr += 2 * src_stride;
        im += 2 * 16;
        y -= 2;
      } while (y);
    } else if (w == 32) {
      do {
        xy_x_2tap_32_avx2(src_ptr, &coeffs_256, im);
        src_ptr += src_stride;
        im += 32;
      } while (--y);
    } else if (w == 64) {
      do {
        xy_x_2tap_32_avx2(src_ptr + 0 * 32, &coeffs_256, im + 0 * 32);
        xy_x_2tap_32_avx2(src_ptr + 1 * 32, &coeffs_256, im + 1 * 32);
        src_ptr += src_stride;
        im += 64;
      } while (--y);
    } else {
      assert(w == 128);

      do {
        xy_x_2tap_32_avx2(src_ptr + 0 * 32, &coeffs_256, im + 0 * 32);
        xy_x_2tap_32_avx2(src_ptr + 1 * 32, &coeffs_256, im + 1 * 32);
        xy_x_2tap_32_avx2(src_ptr + 2 * 32, &coeffs_256, im + 2 * 32);
        xy_x_2tap_32_avx2(src_ptr + 3 * 32, &coeffs_256, im + 3 * 32);
        src_ptr += src_stride;
        im += 128;
      } while (--y);
    }
  }
}

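// Note: despite the _ssse3 suffix, only the w <= 4 paths below stay in
// 128-bit SSSE3 registers; the wider paths already use AVX2.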
static void convolve_2d_sr_hor_4tap_ssse3(
    const uint8_t *const src, const int32_t src_stride, const int32_t w,
    const int32_t h, const InterpFilterParams *const filter_params_x,
    const int32_t subpel_x_q4, int16_t *const im_block) {
  const uint8_t *src_ptr = src - 1;
  int32_t y = h;
  int16_t *im = im_block;

  if (w <= 4) {
    __m128i coeffs_128[2];

    prepare_half_coeffs_4tap_ssse3(filter_params_x, subpel_x_q4, coeffs_128);
    if (w == 2) {
      do {
        const __m128i r =
            x_convolve_4tap_2x2_ssse3(src_ptr, src_stride, coeffs_128);
        xy_x_round_store_2x2_sse2(r, im);
        src_ptr += 2 * src_stride;
        im += 2 * 2;
        y -= 2;
      } while (y);
    } else if (w == 4) {
      do {
        const __m128i r =
            x_convolve_4tap_4x2_ssse3(src_ptr, src_stride, coeffs_128);
        xy_x_round_store_4x2_sse2(r, im);
        src_ptr += 2 * src_stride;
        im += 2 * 4;
        y -= 2;
      } while (y);
    }
  } else {
    // TODO(chiyotsai@google.com): Add better optimization
    __m256i coeffs_256[2], filt_256[2];

    prepare_half_coeffs_4tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);
    filt_256[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2);
    filt_256[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2);

    if (w == 8) {
      do {
        __m256i res =
            x_convolve_4tap_8x2_avx2(src_ptr, src_stride, coeffs_256, filt_256);
        xy_x_round_store_8x2_avx2(res, im);

        src_ptr += 2 * src_stride;
        im += 2 * 8;
        y -= 2;
      } while (y);
    } else if (w == 16) {
      do {
        __m256i r[2];

        x_convolve_4tap_16x2_avx2(src_ptr, src_stride, coeffs_256, filt_256, r);
        xy_x_round_store_32_avx2(r, im);
        src_ptr += 2 * src_stride;
        im += 2 * 16;
        y -= 2;
      } while (y);
    } else if (w == 32) {
      do {
        xy_x_4tap_32_avx2(src_ptr, coeffs_256, filt_256, im);

        src_ptr += src_stride;
        im += 32;
      } while (--y);
    } else if (w == 64) {
      do {
        xy_x_4tap_32_avx2(src_ptr, coeffs_256, filt_256, im);
        xy_x_4tap_32_avx2(src_ptr + 32, coeffs_256, filt_256, im + 32);
        src_ptr += src_stride;
        im += 64;
      } while (--y);
    } else {
      assert(w == 128);

      do {
        xy_x_4tap_32_avx2(src_ptr, coeffs_256, filt_256, im);
        xy_x_4tap_32_avx2(src_ptr + 32, coeffs_256, filt_256, im + 32);
        xy_x_4tap_32_avx2(src_ptr + 64, coeffs_256, filt_256, im + 64);
        xy_x_4tap_32_avx2(src_ptr + 96, coeffs_256, filt_256, im + 96);
        src_ptr += src_stride;
        im += 128;
      } while (--y);
    }
  }
}

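// The w <= 4 branch below mirrors the other kernels for completeness; in
// practice AV1 selects 4-tap filters for blocks that small, so it is rarely,
// if ever, taken.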
static void convolve_2d_sr_hor_6tap_avx2(
    const uint8_t *const src, const int32_t src_stride, const int32_t w,
    const int32_t h, const InterpFilterParams *const filter_params_x,
    const int32_t subpel_x_q4, int16_t *const im_block) {
  const uint8_t *src_ptr = src - 2;
  int32_t y = h;
  int16_t *im = im_block;

  if (w <= 4) {
    __m128i coeffs_128[3];

    prepare_half_coeffs_6tap_ssse3(filter_params_x, subpel_x_q4, coeffs_128);
    if (w == 2) {
      do {
        const __m128i r =
            x_convolve_6tap_2x2_ssse3(src_ptr, src_stride, coeffs_128);
        xy_x_round_store_2x2_sse2(r, im);
        src_ptr += 2 * src_stride;
        im += 2 * 2;
        y -= 2;
      } while (y);
    } else if (w == 4) {
      do {
        const __m128i r =
            x_convolve_6tap_4x2_ssse3(src_ptr, src_stride, coeffs_128);
        xy_x_round_store_4x2_sse2(r, im);
        src_ptr += 2 * src_stride;
        im += 2 * 4;
        y -= 2;
      } while (y);
    }
  } else {
    __m256i coeffs_256[3], filt_256[3];

    filt_256[0] = _mm256_loadu_si256((__m256i const *)filt1_global_avx2);
    filt_256[1] = _mm256_loadu_si256((__m256i const *)filt2_global_avx2);
    filt_256[2] = _mm256_loadu_si256((__m256i const *)filt3_global_avx2);

    prepare_half_coeffs_6tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);

    if (w == 8) {
      do {
        const __m256i res =
            x_convolve_6tap_8x2_avx2(src_ptr, src_stride, coeffs_256, filt_256);
        xy_x_round_store_8x2_avx2(res, im);

        src_ptr += 2 * src_stride;
        im += 2 * 8;
        y -= 2;
      } while (y);
    } else if (w == 16) {
      do {
        __m256i r[2];

        x_convolve_6tap_16x2_avx2(src_ptr, src_stride, coeffs_256, filt_256, r);
        xy_x_round_store_32_avx2(r, im);
        src_ptr += 2 * src_stride;
        im += 2 * 16;
        y -= 2;
      } while (y);
    } else if (w == 32) {
      do {
        xy_x_6tap_32_avx2(src_ptr, coeffs_256, filt_256, im);
        src_ptr += src_stride;
        im += 32;
      } while (--y);
    } else if (w == 64) {
      do {
        xy_x_6tap_32_avx2(src_ptr, coeffs_256, filt_256, im);
        xy_x_6tap_32_avx2(src_ptr + 32, coeffs_256, filt_256, im + 32);
        src_ptr += src_stride;
        im += 64;
      } while (--y);
    } else {
      assert(w == 128);

      do {
        xy_x_6tap_32_avx2(src_ptr, coeffs_256, filt_256, im);
        xy_x_6tap_32_avx2(src_ptr + 32, coeffs_256, filt_256, im + 32);
        xy_x_6tap_32_avx2(src_ptr + 64, coeffs_256, filt_256, im + 64);
        xy_x_6tap_32_avx2(src_ptr + 96, coeffs_256, filt_256, im + 96);
        src_ptr += src_stride;
        im += 128;
      } while (--y);
    }
  }
}

static void convolve_2d_sr_hor_8tap_avx2(
    const uint8_t *const src, const int32_t src_stride, const int32_t w,
    const int32_t h, const InterpFilterParams *const filter_params_x,
    const int32_t subpel_x_q4, int16_t *const im_block) {
  const uint8_t *src_ptr = src - 3;
  int32_t y = h;
  int16_t *im = im_block;
  __m256i coeffs_256[4], filt_256[4];

  filt_256[0] = _mm256_loadu_si256((__m256i const *)filt1_global_avx2);
  filt_256[1] = _mm256_loadu_si256((__m256i const *)filt2_global_avx2);
  filt_256[2] = _mm256_loadu_si256((__m256i const *)filt3_global_avx2);
  filt_256[3] = _mm256_loadu_si256((__m256i const *)filt4_global_avx2);

  prepare_half_coeffs_8tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);

  if (w == 8) {
    do {
      const __m256i res =
          x_convolve_8tap_8x2_avx2(src_ptr, src_stride, coeffs_256, filt_256);
      xy_x_round_store_8x2_avx2(res, im);
      src_ptr += 2 * src_stride;
      im += 2 * 8;
      y -= 2;
    } while (y);
  } else if (w == 16) {
    do {
      __m256i r[2];

      x_convolve_8tap_16x2_avx2(src_ptr, src_stride, coeffs_256, filt_256, r);
      xy_x_round_store_32_avx2(r, im);
      src_ptr += 2 * src_stride;
      im += 2 * 16;
      y -= 2;
    } while (y);
  } else if (w == 32) {
    do {
      xy_x_8tap_32_avx2(src_ptr, coeffs_256, filt_256, im);
      src_ptr += src_stride;
      im += 32;
    } while (--y);
  } else if (w == 64) {
    do {
      xy_x_8tap_32_avx2(src_ptr, coeffs_256, filt_256, im);
      xy_x_8tap_32_avx2(src_ptr + 32, coeffs_256, filt_256, im + 32);
      src_ptr += src_stride;
      im += 64;
    } while (--y);
  } else {
    assert(w == 128);

    do {
      xy_x_8tap_32_avx2(src_ptr, coeffs_256, filt_256, im);
      xy_x_8tap_32_avx2(src_ptr + 32, coeffs_256, filt_256, im + 32);
      xy_x_8tap_32_avx2(src_ptr + 64, coeffs_256, filt_256, im + 64);
      xy_x_8tap_32_avx2(src_ptr + 96, coeffs_256, filt_256, im + 96);
      src_ptr += src_stride;
      im += 128;
    } while (--y);
  }
}

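// The vertical kernels below keep a sliding window of previously loaded
// im_block rows in registers (s_32/s_64/s_128/s_256), so each two-row
// iteration only has to load the rows that are new.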
static void convolve_2d_sr_ver_2tap_avx2(
    const int16_t *const im_block, const int32_t w, const int32_t h,
    const InterpFilterParams *const filter_params_y, const int32_t subpel_y_q4,
    uint8_t *dst, const int32_t dst_stride) {
  const int16_t *im = im_block;
  int32_t y = h;

  if (w <= 4) {
    __m128i coeffs_128;

    prepare_coeffs_2tap_sse2(filter_params_y, subpel_y_q4, &coeffs_128);

    if (w == 2) {
      __m128i s_32[2];

      s_32[0] = _mm_cvtsi32_si128(*(int32_t *)im);

      do {
        const __m128i res = xy_y_convolve_2tap_2x2_sse2(im, s_32, &coeffs_128);
        xy_y_round_store_2x2_sse2(res, dst, dst_stride);
        im += 2 * 2;
        dst += 2 * dst_stride;
        y -= 2;
      } while (y);
    } else {
      __m128i s_64[2], r[2];

      assert(w == 4);

      s_64[0] = _mm_loadl_epi64((__m128i *)im);

      do {
        xy_y_convolve_2tap_4x2_sse2(im, s_64, &coeffs_128, r);
        r[0] = xy_y_round_sse2(r[0]);
        r[1] = xy_y_round_sse2(r[1]);
        const __m128i rr = _mm_packs_epi32(r[0], r[1]);
        pack_store_4x2_sse2(rr, dst, dst_stride);
        im += 2 * 4;
        dst += 2 * dst_stride;
        y -= 2;
      } while (y);
    }
  } else {
    __m256i coeffs_256;

    prepare_coeffs_2tap_avx2(filter_params_y, subpel_y_q4, &coeffs_256);

    if (w == 8) {
      __m128i s_128[2];
      __m256i r[2];

      s_128[0] = _mm_loadu_si128((__m128i *)im);

      do {
        xy_y_convolve_2tap_8x2_avx2(im, s_128, &coeffs_256, r);
        xy_y_round_store_8x2_avx2(r, dst, dst_stride);
        im += 2 * 8;
        dst += 2 * dst_stride;
        y -= 2;
      } while (y);
    } else if (w == 16) {
      __m256i s_256[2], r[4];

      s_256[0] = _mm256_loadu_si256((__m256i *)im);

      do {
        xy_y_convolve_2tap_16x2_avx2(im, s_256, &coeffs_256, r);
        xy_y_round_store_16x2_avx2(r, dst, dst_stride);
        im += 2 * 16;
        dst += 2 * dst_stride;
        y -= 2;
      } while (y);
    } else if (w == 32) {
      __m256i s_256[2][2];

      s_256[0][0] = _mm256_loadu_si256((__m256i *)(im + 0 * 16));
      s_256[0][1] = _mm256_loadu_si256((__m256i *)(im + 1 * 16));

      do {
        xy_y_convolve_2tap_32_all_avx2(im + 32, s_256[0], s_256[1], &coeffs_256,
                                       dst);
        im += 2 * 32;
        xy_y_convolve_2tap_32_all_avx2(im, s_256[1], s_256[0], &coeffs_256,
                                       dst + dst_stride);
        dst += 2 * dst_stride;
        y -= 2;
      } while (y);
    } else if (w == 64) {
      __m256i s_256[2][4];

      s_256[0][0] = _mm256_loadu_si256((__m256i *)(im + 0 * 16));
      s_256[0][1] = _mm256_loadu_si256((__m256i *)(im + 1 * 16));
      s_256[0][2] = _mm256_loadu_si256((__m256i *)(im + 2 * 16));
      s_256[0][3] = _mm256_loadu_si256((__m256i *)(im + 3 * 16));

      do {
        xy_y_convolve_2tap_32_all_avx2(im + 64, s_256[0] + 0, s_256[1] + 0,
                                       &coeffs_256, dst);
        xy_y_convolve_2tap_32_all_avx2(im + 96, s_256[0] + 2, s_256[1] + 2,
                                       &coeffs_256, dst + 32);
        im += 2 * 64;
        xy_y_convolve_2tap_32_all_avx2(im, s_256[1] + 0, s_256[0] + 0,
                                       &coeffs_256, dst + dst_stride);
        xy_y_convolve_2tap_32_all_avx2(im + 32, s_256[1] + 2, s_256[0] + 2,
                                       &coeffs_256, dst + dst_stride + 32);
        dst += 2 * dst_stride;
        y -= 2;
      } while (y);
    } else {
      __m256i s_256[2][8];

      assert(w == 128);

      load_16bit_8rows_avx2(im, 16, s_256[0]);

      do {
        xy_y_convolve_2tap_32_all_avx2(im + 128, s_256[0] + 0, s_256[1] + 0,
                                       &coeffs_256, dst);
        xy_y_convolve_2tap_32_all_avx2(im + 160, s_256[0] + 2, s_256[1] + 2,
                                       &coeffs_256, dst + 1 * 32);
        xy_y_convolve_2tap_32_all_avx2(im + 192, s_256[0] + 4, s_256[1] + 4,
                                       &coeffs_256, dst + 2 * 32);
        xy_y_convolve_2tap_32_all_avx2(im + 224, s_256[0] + 6, s_256[1] + 6,
                                       &coeffs_256, dst + 3 * 32);
        im += 2 * 128;
        xy_y_convolve_2tap_32_all_avx2(im, s_256[1] + 0, s_256[0] + 0,
                                       &coeffs_256, dst + dst_stride);
        xy_y_convolve_2tap_32_all_avx2(im + 32, s_256[1] + 2, s_256[0] + 2,
                                       &coeffs_256, dst + dst_stride + 1 * 32);
        xy_y_convolve_2tap_32_all_avx2(im + 64, s_256[1] + 4, s_256[0] + 4,
                                       &coeffs_256, dst + dst_stride + 2 * 32);
        xy_y_convolve_2tap_32_all_avx2(im + 96, s_256[1] + 6, s_256[0] + 6,
                                       &coeffs_256, dst + dst_stride + 3 * 32);
        dst += 2 * dst_stride;
        y -= 2;
      } while (y);
    }
  }
}

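// Half-pel variant of the vertical 2-tap filter: with both taps equal, the
// filter reduces to an average of adjacent rows, so filter_params_y and
// subpel_y_q4 go unused (hence the (void) casts below).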
static void convolve_2d_sr_ver_2tap_half_avx2(
    const int16_t *const im_block, const int32_t w, const int32_t h,
    const InterpFilterParams *const filter_params_y, const int32_t subpel_y_q4,
    uint8_t *dst, const int32_t dst_stride) {
  const int16_t *im = im_block;
  int32_t y = h;

  (void)filter_params_y;
  (void)subpel_y_q4;

  if (w == 2) {
    __m128i s_32[2];

    s_32[0] = _mm_cvtsi32_si128(*(int32_t *)im);

    do {
      const __m128i res = xy_y_convolve_2tap_2x2_half_pel_sse2(im, s_32);
      const __m128i r = xy_y_round_half_pel_sse2(res);
      pack_store_2x2_sse2(r, dst, dst_stride);
      im += 2 * 2;
      dst += 2 * dst_stride;
      y -= 2;
    } while (y);
  } else if (w == 4) {
    __m128i s_64[2];

    s_64[0] = _mm_loadl_epi64((__m128i *)im);

    do {
      const __m128i res = xy_y_convolve_2tap_4x2_half_pel_sse2(im, s_64);
      const __m128i r = xy_y_round_half_pel_sse2(res);
      pack_store_4x2_sse2(r, dst, dst_stride);
      im += 2 * 4;
      dst += 2 * dst_stride;
      y -= 2;
    } while (y);
  } else if (w == 8) {
    __m128i s_128[2];

    s_128[0] = _mm_loadu_si128((__m128i *)im);

    do {
      const __m256i res = xy_y_convolve_2tap_8x2_half_pel_avx2(im, s_128);
      const __m256i r = xy_y_round_half_pel_avx2(res);
      pack_store_8x2_avx2(r, dst, dst_stride);
      im += 2 * 8;
      dst += 2 * dst_stride;
      y -= 2;
    } while (y);
  } else if (w == 16) {
    __m256i s_256[2], r[2];

    s_256[0] = _mm256_loadu_si256((__m256i *)im);

    do {
      xy_y_convolve_2tap_16x2_half_pel_avx2(im, s_256, r);
      r[0] = xy_y_round_half_pel_avx2(r[0]);
      r[1] = xy_y_round_half_pel_avx2(r[1]);
      xy_y_pack_store_16x2_avx2(r[0], r[1], dst, dst_stride);
      im += 2 * 16;
      dst += 2 * dst_stride;
      y -= 2;
    } while (y);
  } else if (w == 32) {
    __m256i s_256[2][2];

    s_256[0][0] = _mm256_loadu_si256((__m256i *)(im + 0 * 16));
    s_256[0][1] = _mm256_loadu_si256((__m256i *)(im + 1 * 16));

    do {
      xy_y_convolve_2tap_half_pel_32_all_avx2(im + 32, s_256[0], s_256[1], dst);
      xy_y_convolve_2tap_half_pel_32_all_avx2(im + 2 * 32, s_256[1], s_256[0],
                                              dst + dst_stride);
      im += 2 * 32;
      dst += 2 * dst_stride;
      y -= 2;
    } while (y);
  } else if (w == 64) {
    __m256i s_256[2][4];

    s_256[0][0] = _mm256_loadu_si256((__m256i *)(im + 0 * 16));
    s_256[0][1] = _mm256_loadu_si256((__m256i *)(im + 1 * 16));
    s_256[0][2] = _mm256_loadu_si256((__m256i *)(im + 2 * 16));
    s_256[0][3] = _mm256_loadu_si256((__m256i *)(im + 3 * 16));

    do {
      xy_y_convolve_2tap_half_pel_32_all_avx2(im + 64, s_256[0] + 0,
                                              s_256[1] + 0, dst);
      xy_y_convolve_2tap_half_pel_32_all_avx2(im + 96, s_256[0] + 2,
                                              s_256[1] + 2, dst + 32);
      im += 2 * 64;
      xy_y_convolve_2tap_half_pel_32_all_avx2(im, s_256[1] + 0, s_256[0] + 0,
                                              dst + dst_stride);
      xy_y_convolve_2tap_half_pel_32_all_avx2(
          im + 32, s_256[1] + 2, s_256[0] + 2, dst + dst_stride + 32);
      dst += 2 * dst_stride;
      y -= 2;
    } while (y);
  } else {
    __m256i s_256[2][8];

    assert(w == 128);

    load_16bit_8rows_avx2(im, 16, s_256[0]);

    do {
      xy_y_convolve_2tap_half_pel_32_all_avx2(im + 128, s_256[0] + 0,
                                              s_256[1] + 0, dst);
      xy_y_convolve_2tap_half_pel_32_all_avx2(im + 160, s_256[0] + 2,
                                              s_256[1] + 2, dst + 1 * 32);
      xy_y_convolve_2tap_half_pel_32_all_avx2(im + 192, s_256[0] + 4,
                                              s_256[1] + 4, dst + 2 * 32);
      xy_y_convolve_2tap_half_pel_32_all_avx2(im + 224, s_256[0] + 6,
                                              s_256[1] + 6, dst + 3 * 32);
      im += 2 * 128;
      xy_y_convolve_2tap_half_pel_32_all_avx2(im, s_256[1] + 0, s_256[0] + 0,
                                              dst + dst_stride);
      xy_y_convolve_2tap_half_pel_32_all_avx2(
          im + 32, s_256[1] + 2, s_256[0] + 2, dst + dst_stride + 1 * 32);
      xy_y_convolve_2tap_half_pel_32_all_avx2(
          im + 64, s_256[1] + 4, s_256[0] + 4, dst + dst_stride + 2 * 32);
      xy_y_convolve_2tap_half_pel_32_all_avx2(
          im + 96, s_256[1] + 6, s_256[0] + 6, dst + dst_stride + 3 * 32);
      dst += 2 * dst_stride;
      y -= 2;
    } while (y);
  }
}

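// The 4-tap vertical kernel also serves the half-pel case (subpel_y_q4 == 8)
// via dedicated _half_pel helpers on the w >= 8 paths, plus blocks with
// w % 32 == 0 as needed by OBMC (see the comment in the final branch).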
static void convolve_2d_sr_ver_4tap_avx2(
    const int16_t *const im_block, const int32_t w, const int32_t h,
    const InterpFilterParams *const filter_params_y, const int32_t subpel_y_q4,
    uint8_t *dst, const int32_t dst_stride) {
  const int16_t *im = im_block;
  int32_t y = h;

  if (w == 2) {
    __m128i coeffs_128[2], s_32[4], ss_128[2];

    prepare_coeffs_4tap_sse2(filter_params_y, subpel_y_q4, coeffs_128);

    s_32[0] = _mm_cvtsi32_si128(*(int32_t *)(im + 0 * 2));
    s_32[1] = _mm_cvtsi32_si128(*(int32_t *)(im + 1 * 2));
    s_32[2] = _mm_cvtsi32_si128(*(int32_t *)(im + 2 * 2));

    const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
    const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);

    ss_128[0] = _mm_unpacklo_epi16(src01, src12);

    do {
      const __m128i res =
          xy_y_convolve_4tap_2x2_sse2(im, s_32, ss_128, coeffs_128);
      xy_y_round_store_2x2_sse2(res, dst, dst_stride);
      im += 2 * 2;
      dst += 2 * dst_stride;
      y -= 2;
    } while (y);
  } else {
    __m256i coeffs_256[2];

    prepare_coeffs_4tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);

    if (w == 4) {
      __m128i s_64[4];
      __m256i s_256[2], ss_256[2];

      s_64[0] = _mm_loadl_epi64((__m128i *)(im + 0 * 4));
      s_64[1] = _mm_loadl_epi64((__m128i *)(im + 1 * 4));
      s_64[2] = _mm_loadl_epi64((__m128i *)(im + 2 * 4));

      // Load lines a and b. Line a to lower 128, line b to upper 128
      s_256[0] = _mm256_setr_m128i(s_64[0], s_64[1]);
      s_256[1] = _mm256_setr_m128i(s_64[1], s_64[2]);

      ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);

      do {
        const __m256i res =
            xy_y_convolve_4tap_4x2_avx2(im, s_64, ss_256, coeffs_256);
        xy_y_round_store_4x2_avx2(res, dst, dst_stride);
        im += 2 * 4;
        dst += 2 * dst_stride;
        y -= 2;
      } while (y);
    } else if (w == 8) {
      __m256i s_256[4], r[2];

      s_256[0] = _mm256_loadu_si256((__m256i *)(im + 0 * 8));
      s_256[1] = _mm256_loadu_si256((__m256i *)(im + 1 * 8));

      if (subpel_y_q4 != 8) {
        __m256i ss_256[4];

        ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
        ss_256[2] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);

        do {
          xy_y_convolve_4tap_8x2_avx2(im, ss_256, coeffs_256, r);
          xy_y_round_store_8x2_avx2(r, dst, dst_stride);
          im += 2 * 8;
          dst += 2 * dst_stride;
          y -= 2;
        } while (y);
      } else {
        do {
          xy_y_convolve_4tap_8x2_half_pel_avx2(im, coeffs_256, s_256, r);
          xy_y_round_store_8x2_avx2(r, dst, dst_stride);
          im += 2 * 8;
          dst += 2 * dst_stride;
          y -= 2;
        } while (y);
      }
    } else if (w == 16) {
      __m256i s_256[5];

      s_256[0] = _mm256_loadu_si256((__m256i *)(im + 0 * 16));
      s_256[1] = _mm256_loadu_si256((__m256i *)(im + 1 * 16));
      s_256[2] = _mm256_loadu_si256((__m256i *)(im + 2 * 16));

      if (subpel_y_q4 != 8) {
        __m256i ss_256[4], tt_256[4], r[4];

        ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
        ss_256[2] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);

        tt_256[0] = _mm256_unpacklo_epi16(s_256[1], s_256[2]);
        tt_256[2] = _mm256_unpackhi_epi16(s_256[1], s_256[2]);

        do {
          xy_y_convolve_4tap_16x2_avx2(im, s_256, ss_256, tt_256, coeffs_256,
                                       r);
          xy_y_round_store_16x2_avx2(r, dst, dst_stride);
          im += 2 * 16;
          dst += 2 * dst_stride;
          y -= 2;
        } while (y);
      } else {
        __m256i r[4];

        do {
          xy_y_convolve_4tap_16x2_half_pel_avx2(im, s_256, coeffs_256, r);
          xy_y_round_store_16x2_avx2(r, dst, dst_stride);
          im += 2 * 16;
          dst += 2 * dst_stride;
          y -= 2;
        } while (y);
      }
    } else {
      // Special case for OBMC. According to the AV1 spec, the 4-tap filter
      // is not used for w > 16. OBMC, however, halves the block height when
      // predicting from the above block (e.g. a 32x8 neighbor is predicted
      // as 32x4), which can pair a 4-tap filter with w >= 32.
      int32_t x = 0;

      assert(!(w % 32));

      __m256i s_256[2][4], ss_256[2][4], tt_256[2][4], r0[4], r1[4];
      do {
        const int16_t *s = im + x;
        uint8_t *d = dst + x;

        loadu_unpack_16bit_3rows_avx2(s, w, s_256[0], ss_256[0], tt_256[0]);
        loadu_unpack_16bit_3rows_avx2(s + 16, w, s_256[1], ss_256[1],
                                      tt_256[1]);

        y = h;
        do {
          xy_y_convolve_4tap_32x2_avx2(s, w, s_256[0], ss_256[0], tt_256[0],
                                       coeffs_256, r0);
          xy_y_convolve_4tap_32x2_avx2(s + 16, w, s_256[1], ss_256[1],
                                       tt_256[1], coeffs_256, r1);

          xy_y_round_store_32_avx2(r0 + 0, r1 + 0, d);
          xy_y_round_store_32_avx2(r0 + 2, r1 + 2, d + dst_stride);

          s += 2 * w;
          d += 2 * dst_stride;
          y -= 2;
        } while (y);

        x += 32;
      } while (x < w);
    }
  }
}

static void convolve_2d_sr_ver_6tap_avx2(
    const int16_t *const im_block, const int32_t w, const int32_t h,
    const InterpFilterParams *const filter_params_y, const int32_t subpel_y_q4,
    uint8_t *dst, const int32_t dst_stride) {
  const int16_t *im = im_block;
  int32_t y;

  if (w == 2) {
    __m128i coeffs_128[3], s_32[6], ss_128[3];

    prepare_coeffs_6tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);

    s_32[0] = _mm_cvtsi32_si128(*(int32_t *)(im + 0 * 2));
    s_32[1] = _mm_cvtsi32_si128(*(int32_t *)(im + 1 * 2));
    s_32[2] = _mm_cvtsi32_si128(*(int32_t *)(im + 2 * 2));
    s_32[3] = _mm_cvtsi32_si128(*(int32_t *)(im + 3 * 2));
    s_32[4] = _mm_cvtsi32_si128(*(int32_t *)(im + 4 * 2));

    const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
    const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
    const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
    const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[4]);

    ss_128[0] = _mm_unpacklo_epi16(src01, src12);
    ss_128[1] = _mm_unpacklo_epi16(src23, src34);

    y = h;
    do {
      const __m128i res =
          xy_y_convolve_6tap_2x2_sse2(im, s_32, ss_128, coeffs_128);
      xy_y_round_store_2x2_sse2(res, dst, dst_stride);
      im += 2 * 2;
      dst += 2 * dst_stride;
      y -= 2;
    } while (y);
  } else {
    __m256i coeffs_256[3];

    prepare_coeffs_6tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);

    if (w == 4) {
      __m128i s_64[6];
      __m256i s_256[6], ss_256[3];

      s_64[0] = _mm_loadl_epi64((__m128i *)(im + 0 * 4));
      s_64[1] = _mm_loadl_epi64((__m128i *)(im + 1 * 4));
      s_64[2] = _mm_loadl_epi64((__m128i *)(im + 2 * 4));
      s_64[3] = _mm_loadl_epi64((__m128i *)(im + 3 * 4));
      s_64[4] = _mm_loadl_epi64((__m128i *)(im + 4 * 4));

      // Load lines a and b. Line a to lower 128, line b to upper 128
      s_256[0] = _mm256_setr_m128i(s_64[0], s_64[1]);
      s_256[1] = _mm256_setr_m128i(s_64[1], s_64[2]);
      s_256[2] = _mm256_setr_m128i(s_64[2], s_64[3]);
      s_256[3] = _mm256_setr_m128i(s_64[3], s_64[4]);

      ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
      ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);

      y = h;
      do {
        const __m256i res =
            xy_y_convolve_6tap_4x2_avx2(im, s_64, ss_256, coeffs_256);
        xy_y_round_store_4x2_avx2(res, dst, dst_stride);
        im += 2 * 4;
        dst += 2 * dst_stride;
        y -= 2;
      } while (y);
    } else if (w == 8) {
      __m256i s_256[6], r[2];

      s_256[0] = _mm256_loadu_si256((__m256i *)(im + 0 * 8));
      s_256[1] = _mm256_loadu_si256((__m256i *)(im + 1 * 8));
      s_256[2] = _mm256_loadu_si256((__m256i *)(im + 2 * 8));
      s_256[3] = _mm256_loadu_si256((__m256i *)(im + 3 * 8));
      y = h;

      if (subpel_y_q4 != 8) {
        __m256i ss_256[6];

        ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
        ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);

        ss_256[3] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
        ss_256[4] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);

        do {
          xy_y_convolve_6tap_8x2_avx2(im, ss_256, coeffs_256, r);
          xy_y_round_store_8x2_avx2(r, dst, dst_stride);
          im += 2 * 8;
          dst += 2 * dst_stride;
          y -= 2;
        } while (y);
      } else {
        do {
          xy_y_convolve_6tap_8x2_half_pel_avx2(im, coeffs_256, s_256, r);
          xy_y_round_store_8x2_avx2(r, dst, dst_stride);
          im += 2 * 8;
          dst += 2 * dst_stride;
          y -= 2;
        } while (y);
      }
    } else if (w == 16) {
      __m256i s_256[6];

      s_256[0] = _mm256_loadu_si256((__m256i *)(im + 0 * 16));
      s_256[1] = _mm256_loadu_si256((__m256i *)(im + 1 * 16));
      s_256[2] = _mm256_loadu_si256((__m256i *)(im + 2 * 16));
      s_256[3] = _mm256_loadu_si256((__m256i *)(im + 3 * 16));
      s_256[4] = _mm256_loadu_si256((__m256i *)(im + 4 * 16));
      y = h;

      if (subpel_y_q4 != 8) {
        __m256i ss_256[6], tt_256[6], r[4];

        ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
        ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
        ss_256[3] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
        ss_256[4] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);

        tt_256[0] = _mm256_unpacklo_epi16(s_256[1], s_256[2]);
        tt_256[1] = _mm256_unpacklo_epi16(s_256[3], s_256[4]);
        tt_256[3] = _mm256_unpackhi_epi16(s_256[1], s_256[2]);
        tt_256[4] = _mm256_unpackhi_epi16(s_256[3], s_256[4]);

        do {
          xy_y_convolve_6tap_16x2_avx2(im, 16, s_256, ss_256, tt_256,
                                       coeffs_256, r);
          xy_y_round_store_16x2_avx2(r, dst, dst_stride);
          im += 2 * 16;
          dst += 2 * dst_stride;
          y -= 2;
        } while (y);
      } else {
        __m256i ss_256[4], r[4];

        do {
          xy_y_convolve_6tap_16x2_half_pel_avx2(im, 16, s_256, ss_256,
                                                coeffs_256, r);
          xy_y_round_store_16x2_avx2(r, dst, dst_stride);

          im += 2 * 16;
          dst += 2 * dst_stride;
          y -= 2;
        } while (y);
      }
    } else {
      int32_t x = 0;

      assert(!(w % 32));

      __m256i s_256[2][6], ss_256[2][6], tt_256[2][6], r0[4], r1[4];

      do {
        const int16_t *s = im + x;
        uint8_t *d = dst + x;

        loadu_unpack_16bit_5rows_avx2(s, w, s_256[0], ss_256[0], tt_256[0]);
        loadu_unpack_16bit_5rows_avx2(s + 16, w, s_256[1], ss_256[1],
                                      tt_256[1]);

        y = h;
        do {
          xy_y_convolve_6tap_16x2_avx2(s, w, s_256[0], ss_256[0], tt_256[0],
                                       coeffs_256, r0);
          xy_y_convolve_6tap_16x2_avx2(s + 16, w, s_256[1], ss_256[1],
                                       tt_256[1], coeffs_256, r1);

          xy_y_round_store_32_avx2(r0 + 0, r1 + 0, d);
          xy_y_round_store_32_avx2(r0 + 2, r1 + 2, d + dst_stride);

          s += 2 * w;
          d += 2 * dst_stride;
          y -= 2;
        } while (y);

        x += 32;
      } while (x < w);
    }
  }
}

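// The 8-tap vertical kernel walks w >= 32 blocks in 32-wide columns, running
// two 16-wide convolutions per row pair.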
static void convolve_2d_sr_ver_8tap_avx2(
    const int16_t *const im_block, const int32_t w, const int32_t h,
    const InterpFilterParams *const filter_params_y, const int32_t subpel_y_q4,
    uint8_t *dst, const int32_t dst_stride) {
  const int16_t *im = im_block;
  int32_t y;

  if (w == 2) {
    __m128i coeffs_128[4], s_32[8], ss_128[4];

    prepare_coeffs_8tap_sse2(filter_params_y, subpel_y_q4, coeffs_128);

    s_32[0] = _mm_cvtsi32_si128(*(int32_t *)(im + 0 * 2));
    s_32[1] = _mm_cvtsi32_si128(*(int32_t *)(im + 1 * 2));
    s_32[2] = _mm_cvtsi32_si128(*(int32_t *)(im + 2 * 2));
    s_32[3] = _mm_cvtsi32_si128(*(int32_t *)(im + 3 * 2));
    s_32[4] = _mm_cvtsi32_si128(*(int32_t *)(im + 4 * 2));
    s_32[5] = _mm_cvtsi32_si128(*(int32_t *)(im + 5 * 2));
    s_32[6] = _mm_cvtsi32_si128(*(int32_t *)(im + 6 * 2));

    const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
    const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
    const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
    const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[4]);
    const __m128i src45 = _mm_unpacklo_epi32(s_32[4], s_32[5]);
    const __m128i src56 = _mm_unpacklo_epi32(s_32[5], s_32[6]);

    ss_128[0] = _mm_unpacklo_epi16(src01, src12);
    ss_128[1] = _mm_unpacklo_epi16(src23, src34);
    ss_128[2] = _mm_unpacklo_epi16(src45, src56);

    y = h;
    do {
      const __m128i res =
          xy_y_convolve_8tap_2x2_sse2(im, s_32, ss_128, coeffs_128);
      xy_y_round_store_2x2_sse2(res, dst, dst_stride);
      im += 2 * 2;
      dst += 2 * dst_stride;
      y -= 2;
    } while (y);
  } else {
    __m256i coeffs_256[4];

    prepare_coeffs_8tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);

    if (w == 4) {
      __m128i s_64[8];
      __m256i s_256[8], ss_256[4];

      s_64[0] = _mm_loadl_epi64((__m128i *)(im + 0 * 4));
      s_64[1] = _mm_loadl_epi64((__m128i *)(im + 1 * 4));
      s_64[2] = _mm_loadl_epi64((__m128i *)(im + 2 * 4));
      s_64[3] = _mm_loadl_epi64((__m128i *)(im + 3 * 4));
      s_64[4] = _mm_loadl_epi64((__m128i *)(im + 4 * 4));
      s_64[5] = _mm_loadl_epi64((__m128i *)(im + 5 * 4));
      s_64[6] = _mm_loadl_epi64((__m128i *)(im + 6 * 4));

      // Load lines a and b. Line a to lower 128, line b to upper 128
      s_256[0] = _mm256_setr_m128i(s_64[0], s_64[1]);
      s_256[1] = _mm256_setr_m128i(s_64[1], s_64[2]);
      s_256[2] = _mm256_setr_m128i(s_64[2], s_64[3]);
      s_256[3] = _mm256_setr_m128i(s_64[3], s_64[4]);
      s_256[4] = _mm256_setr_m128i(s_64[4], s_64[5]);
      s_256[5] = _mm256_setr_m128i(s_64[5], s_64[6]);

      ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
      ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
      ss_256[2] = _mm256_unpacklo_epi16(s_256[4], s_256[5]);

      y = h;
      do {
        const __m256i res =
            xy_y_convolve_8tap_4x2_avx2(im, s_64, ss_256, coeffs_256);
        xy_y_round_store_4x2_avx2(res, dst, dst_stride);
        im += 2 * 4;
        dst += 2 * dst_stride;
        y -= 2;
      } while (y);
    } else if (w == 8) {
      __m256i s_256[8], r[2];

      s_256[0] = _mm256_loadu_si256((__m256i *)(im + 0 * 8));
      s_256[1] = _mm256_loadu_si256((__m256i *)(im + 1 * 8));
      s_256[2] = _mm256_loadu_si256((__m256i *)(im + 2 * 8));
      s_256[3] = _mm256_loadu_si256((__m256i *)(im + 3 * 8));
      s_256[4] = _mm256_loadu_si256((__m256i *)(im + 4 * 8));
      s_256[5] = _mm256_loadu_si256((__m256i *)(im + 5 * 8));
      y = h;

      if (subpel_y_q4 != 8) {
        __m256i ss_256[8];

        convolve_8tap_unpack_avx2(s_256, ss_256);

        do {
          xy_y_convolve_8tap_8x2_avx2(im, ss_256, coeffs_256, r);
          xy_y_round_store_8x2_avx2(r, dst, dst_stride);
          im += 2 * 8;
          dst += 2 * dst_stride;
          y -= 2;
        } while (y);
      } else {
        do {
          xy_y_convolve_8tap_8x2_half_pel_avx2(im, coeffs_256, s_256, r);
          xy_y_round_store_8x2_avx2(r, dst, dst_stride);
          im += 2 * 8;
          dst += 2 * dst_stride;
          y -= 2;
        } while (y);
      }
    } else if (w == 16) {
      __m256i s_256[8], r[4];

      load_16bit_7rows_avx2(im, 16, s_256);
      y = h;

      if (subpel_y_q4 != 8) {
        __m256i ss_256[8], tt_256[8];

        convolve_8tap_unpack_avx2(s_256, ss_256);
        convolve_8tap_unpack_avx2(s_256 + 1, tt_256);

        do {
          xy_y_convolve_8tap_16x2_avx2(im, 16, coeffs_256, s_256, ss_256,
                                       tt_256, r);
          xy_y_round_store_16x2_avx2(r, dst, dst_stride);

          im += 2 * 16;
          dst += 2 * dst_stride;
          y -= 2;
        } while (y);
      } else {
        do {
          xy_y_convolve_8tap_16x2_half_pel_avx2(im, 16, coeffs_256, s_256, r);
          xy_y_round_store_16x2_avx2(r, dst, dst_stride);

          im += 2 * 16;
          dst += 2 * dst_stride;
          y -= 2;
        } while (y);
      }
    } else {
      int32_t x = 0;
      __m256i s_256[2][8], r0[4], r1[4];

      assert(!(w % 32));

      __m256i ss_256[2][8], tt_256[2][8];

      do {
        const int16_t *s = im + x;
        uint8_t *d = dst + x;

        load_16bit_7rows_avx2(s, w, s_256[0]);
        convolve_8tap_unpack_avx2(s_256[0], ss_256[0]);
        convolve_8tap_unpack_avx2(s_256[0] + 1, tt_256[0]);

        load_16bit_7rows_avx2(s + 16, w, s_256[1]);
        convolve_8tap_unpack_avx2(s_256[1], ss_256[1]);
        convolve_8tap_unpack_avx2(s_256[1] + 1, tt_256[1]);

        y = h;
        do {
          xy_y_convolve_8tap_16x2_avx2(s, w, coeffs_256, s_256[0], ss_256[0],
                                       tt_256[0], r0);
          xy_y_convolve_8tap_16x2_avx2(s + 16, w, coeffs_256, s_256[1],
                                       ss_256[1], tt_256[1], r1);
          xy_y_round_store_32_avx2(r0 + 0, r1 + 0, d);
          xy_y_round_store_32_avx2(r0 + 2, r1 + 2, d + dst_stride);

          s += 2 * w;
          d += 2 * dst_stride;
          y -= 2;
        } while (y);

        x += 32;
      } while (x < w);
    }
  }
}

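// Dispatch tables for av1_convolve_2d_sr_specialized_avx2 below, indexed by
// filter tap count; NULL entries correspond to tap counts that cannot occur.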
typedef void (*Convolve2dSrHorTapFunc)(
    const uint8_t *const src, const int32_t src_stride, const int32_t w,
    const int32_t h, const InterpFilterParams *const filter_params_x,
    const int32_t subpel_x_q4, int16_t *const im_block);

typedef void (*Convolve2dSrVerTapFunc)(
    const int16_t *const im_block, const int32_t w, const int32_t h,
    const InterpFilterParams *const filter_params_y, const int32_t subpel_y_q4,
    uint8_t *dst, const int32_t dst_stride);

static AOM_FORCE_INLINE void av1_convolve_2d_sr_specialized_avx2(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int32_t w, int32_t h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int32_t subpel_x_q4,
    const int32_t subpel_y_q4, ConvolveParams *conv_params) {
  static const Convolve2dSrHorTapFunc
      convolve_2d_sr_hor_tap_func_table[MAX_FILTER_TAP + 1] = {
        NULL,
        NULL,
        convolve_2d_sr_hor_2tap_avx2,
        NULL,
        convolve_2d_sr_hor_4tap_ssse3,
        NULL,
        convolve_2d_sr_hor_6tap_avx2,
        NULL,
        convolve_2d_sr_hor_8tap_avx2
      };
  static const Convolve2dSrVerTapFunc
      convolve_2d_sr_ver_tap_func_table[MAX_FILTER_TAP + 1] = {
        NULL,
        convolve_2d_sr_ver_2tap_half_avx2,
        convolve_2d_sr_ver_2tap_avx2,
        convolve_2d_sr_ver_4tap_avx2,
        convolve_2d_sr_ver_4tap_avx2,
        convolve_2d_sr_ver_6tap_avx2,
        convolve_2d_sr_ver_6tap_avx2,
        convolve_2d_sr_ver_8tap_avx2,
        convolve_2d_sr_ver_8tap_avx2
      };
  const int32_t tap_x = get_filter_tap(filter_params_x, subpel_x_q4);
  const int32_t tap_y = get_filter_tap(filter_params_y, subpel_y_q4);

  assert(tap_x != 12 && tap_y != 12);

  const uint8_t *src_ptr = src - ((tap_y >> 1) - 1) * src_stride;
  // Note: im_block is 8-pixel interlaced for width 32 and up, to avoid data
  // permutation.
  DECLARE_ALIGNED(32, int16_t,
                  im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);

  (void)conv_params;

  assert(conv_params->round_0 == 3);
  assert(conv_params->round_1 == 11);

  // horizontal filter
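  // The intermediate buffer holds h + tap_y rows: the tap_y-tap vertical
  // filter needs h + tap_y - 1 input rows, and the extra row keeps the
  // count even for the kernels that process two rows per iteration.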
  int32_t hh = h + tap_y;
  assert(!(hh % 2));

  convolve_2d_sr_hor_tap_func_table[tap_x](
      src_ptr, src_stride, w, hh, filter_params_x, subpel_x_q4, im_block);

  // vertical filter
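  // Index tap_y - 1 selects the dedicated half-pel kernel for the 2-tap
  // filter (convolve_2d_sr_ver_2tap_half_avx2); for 4/6/8 taps it maps to
  // the same kernel as tap_y, which handles the half-pel case itself.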
  convolve_2d_sr_ver_tap_func_table[tap_y - (subpel_y_q4 == 8)](
      im_block, w, h, filter_params_y, subpel_y_q4, dst, dst_stride);
}

#endif  // THIRD_PARTY_SVT_AV1_CONVOLVE_2D_AVX2_H_