/src/aom/av1/common/x86/convolve_avx2.c
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) 2017, Alliance for Open Media. All rights reserved. |
3 | | * |
4 | | * This source code is subject to the terms of the BSD 2 Clause License and |
5 | | * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
6 | | * was not distributed with this source code in the LICENSE file, you can |
7 | | * obtain it at www.aomedia.org/license/software. If the Alliance for Open |
8 | | * Media Patent License 1.0 was not distributed with this source code in the |
9 | | * PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
10 | | */ |
11 | | |
12 | | #include <immintrin.h> |
13 | | |
14 | | #include "config/av1_rtcd.h" |
15 | | |
16 | | #include "aom_dsp/aom_dsp_common.h" |
17 | | #include "aom_dsp/x86/convolve_avx2.h" |
18 | | #include "aom_dsp/x86/convolve_common_intrin.h" |
19 | | #include "aom_dsp/x86/synonyms.h" |
20 | | |
21 | | void av1_convolve_y_sr_avx2(const uint8_t *src, int32_t src_stride, |
22 | | uint8_t *dst, int32_t dst_stride, int32_t w, |
23 | | int32_t h, |
24 | | const InterpFilterParams *filter_params_y, |
25 | 486k | const int32_t subpel_y_qn) { |
26 | 486k | __m128i coeffs_128[4]; |
27 | 486k | __m256i coeffs[6]; |
28 | 486k | int x = 0, y = h; |
29 | | |
30 | 486k | int i, vert_tap = get_filter_tap(filter_params_y, subpel_y_qn); |
31 | 486k | assert(vert_tap == 2 || vert_tap == 4 || vert_tap == 6 || vert_tap == 8 || |
32 | 486k | vert_tap == 12); |
33 | 486k | assert(!(w % 2)); |
34 | 486k | assert(!(h % 2)); |
35 | | |
36 | 486k | const int fo_vert = vert_tap / 2 - 1; |
37 | 486k | const uint8_t *const src_ptr = src - fo_vert * src_stride; |
38 | 486k | const uint8_t *data = src_ptr; |
39 | 486k | uint8_t *dst_ptr = dst; |
40 | | |
41 | 486k | if (vert_tap == 2) { |
42 | 37.3k | if (subpel_y_qn != 8) { |
43 | 13.5k | if (w <= 4) { |
44 | 6.71k | prepare_coeffs_2t_ssse3(filter_params_y, subpel_y_qn, coeffs_128); |
45 | 6.71k | __m128i d[2], res; |
46 | 6.71k | if (w == 2) { |
47 | 1.67k | d[0] = _mm_cvtsi32_si128(loadu_int16(data)); |
48 | | |
49 | 3.20k | do { |
50 | 3.20k | convolve_y_2tap_2x2_ssse3(data, src_stride, coeffs_128, d, &res); |
51 | 3.20k | res = round_sr_y_ssse3(res); |
52 | 3.20k | pack_store_u8_2x2_sse2(res, dst_ptr, dst_stride); |
53 | | |
54 | 3.20k | dst_ptr += 2 * dst_stride; |
55 | 3.20k | data += 2 * src_stride; |
56 | 3.20k | y -= 2; |
57 | 3.20k | } while (y > 0); |
58 | 5.04k | } else { |
59 | 5.04k | assert(w == 4); |
60 | 5.04k | d[0] = _mm_cvtsi32_si128(loadu_int32(data)); |
61 | | |
62 | 14.8k | do { |
63 | 14.8k | convolve_y_2tap_4x2_ssse3(data, src_stride, coeffs_128, d, &res); |
64 | 14.8k | res = round_sr_y_ssse3(res); |
65 | 14.8k | pack_store_u8_4x2_sse2(res, dst_ptr, dst_stride); |
66 | | |
67 | 14.8k | dst_ptr += 2 * dst_stride; |
68 | 14.8k | data += 2 * src_stride; |
69 | 14.8k | y -= 2; |
70 | 14.8k | } while (y > 0); |
71 | 5.04k | } |
72 | 6.84k | } else { |
73 | 6.84k | prepare_coeffs_2t_lowbd(filter_params_y, subpel_y_qn, coeffs); |
74 | | |
75 | 6.84k | if (w == 8) { |
76 | 3.84k | __m128i d[2]; |
77 | 3.84k | d[0] = _mm_loadl_epi64((__m128i *)data); |
78 | | |
79 | 12.5k | do { |
80 | 12.5k | __m256i res; |
81 | 12.5k | convolve_y_2tap_8x2_avx2(data, src_stride, coeffs, d, &res); |
82 | 12.5k | round_pack_store_y_8x2_avx2(res, dst_ptr, dst_stride); |
83 | | |
84 | 12.5k | dst_ptr += 2 * dst_stride; |
85 | 12.5k | data += 2 * src_stride; |
86 | 12.5k | y -= 2; |
87 | | |
88 | 12.5k | } while (y > 0); |
89 | | |
90 | 3.84k | } else if (w == 16) { |
91 | 1.86k | __m128i d[2]; |
92 | 1.86k | d[0] = _mm_loadu_si128((__m128i *)data); |
93 | | |
94 | 12.8k | do { |
95 | 12.8k | __m256i res[2]; |
96 | 12.8k | convolve_y_2tap_16x2_avx2(data, src_stride, coeffs, d, res); |
97 | 12.8k | round_pack_store_y_16x2_avx2(res, dst_ptr, dst_stride); |
98 | | |
99 | 12.8k | dst_ptr += 2 * dst_stride; |
100 | 12.8k | data += 2 * src_stride; |
101 | 12.8k | y -= 2; |
102 | 12.8k | } while (y > 0); |
103 | | |
104 | 1.86k | } else { |
105 | 1.13k | assert(!(w % 32)); |
106 | | |
107 | 1.13k | __m256i d[2]; |
108 | 1.68k | do { |
109 | 1.68k | data = src_ptr + x; |
110 | 1.68k | dst_ptr = dst + x; |
111 | 1.68k | y = h; |
112 | | |
113 | 1.68k | d[0] = _mm256_loadu_si256((__m256i *)data); |
114 | | |
115 | 39.6k | do { |
116 | 39.6k | __m256i res[4]; |
117 | 39.6k | convolve_y_2tap_32x2_avx2(data, src_stride, coeffs, d, res); |
118 | 39.6k | round_pack_store_y_32x2_avx2(res, dst_ptr, dst_stride); |
119 | | |
120 | 39.6k | dst_ptr += 2 * dst_stride; |
121 | 39.6k | data += 2 * src_stride; |
122 | 39.6k | y -= 2; |
123 | 39.6k | } while (y > 0); |
124 | | |
125 | 1.68k | x += 32; |
126 | 1.68k | } while (x < w); |
127 | 1.13k | } |
128 | 6.84k | } |
129 | 23.7k | } else { |
130 | 23.7k | if (w <= 16) { |
131 | 22.7k | __m128i s[2], res; |
132 | | |
133 | 22.7k | if (w == 2) { |
134 | 4.81k | s[0] = _mm_cvtsi32_si128(loadu_int16(data)); |
135 | | |
136 | 9.22k | do { |
137 | 9.22k | s[1] = _mm_cvtsi32_si128(loadu_int16(data + src_stride)); |
138 | 9.22k | res = _mm_avg_epu8(s[0], s[1]); |
139 | 9.22k | xx_storel_16(dst_ptr, res); |
140 | 9.22k | s[0] = _mm_cvtsi32_si128(loadu_int16(data + 2 * src_stride)); |
141 | 9.22k | res = _mm_avg_epu8(s[1], s[0]); |
142 | 9.22k | xx_storel_16(dst_ptr + dst_stride, res); |
143 | | |
144 | 9.22k | data += 2 * src_stride; |
145 | 9.22k | dst_ptr += 2 * dst_stride; |
146 | 9.22k | y -= 2; |
147 | 9.22k | } while (y > 0); |
148 | 17.9k | } else if (w == 4) { |
149 | 9.57k | s[0] = _mm_cvtsi32_si128(loadu_int32(data)); |
150 | | |
151 | 26.4k | do { |
152 | 26.4k | s[1] = _mm_cvtsi32_si128(loadu_int32(data + src_stride)); |
153 | 26.4k | res = _mm_avg_epu8(s[0], s[1]); |
154 | 26.4k | xx_storel_32(dst_ptr, res); |
155 | 26.4k | s[0] = _mm_cvtsi32_si128(loadu_int32(data + 2 * src_stride)); |
156 | 26.4k | res = _mm_avg_epu8(s[1], s[0]); |
157 | 26.4k | xx_storel_32(dst_ptr + dst_stride, res); |
158 | | |
159 | 26.4k | data += 2 * src_stride; |
160 | 26.4k | dst_ptr += 2 * dst_stride; |
161 | 26.4k | y -= 2; |
162 | 26.4k | } while (y > 0); |
163 | 9.57k | } else if (w == 8) { |
164 | 6.32k | s[0] = _mm_loadl_epi64((__m128i *)data); |
165 | | |
166 | 21.1k | do { |
167 | 21.1k | s[1] = _mm_loadl_epi64((__m128i *)(data + src_stride)); |
168 | 21.1k | res = _mm_avg_epu8(s[0], s[1]); |
169 | 21.1k | _mm_storel_epi64((__m128i *)dst_ptr, res); |
170 | 21.1k | s[0] = _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)); |
171 | 21.1k | res = _mm_avg_epu8(s[1], s[0]); |
172 | 21.1k | _mm_storel_epi64((__m128i *)(dst_ptr + dst_stride), res); |
173 | | |
174 | 21.1k | data += 2 * src_stride; |
175 | 21.1k | dst_ptr += 2 * dst_stride; |
176 | 21.1k | y -= 2; |
177 | 21.1k | } while (y > 0); |
178 | 6.32k | } else { |
179 | 2.07k | assert(w == 16); |
180 | | |
181 | 2.07k | s[0] = _mm_loadu_si128((__m128i *)data); |
182 | | |
183 | 11.8k | do { |
184 | 11.8k | s[1] = _mm_loadu_si128((__m128i *)(data + src_stride)); |
185 | 11.8k | res = _mm_avg_epu8(s[0], s[1]); |
186 | 11.8k | _mm_storeu_si128((__m128i *)dst_ptr, res); |
187 | 11.8k | s[0] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride)); |
188 | 11.8k | res = _mm_avg_epu8(s[1], s[0]); |
189 | 11.8k | _mm_storeu_si128((__m128i *)(dst_ptr + dst_stride), res); |
190 | | |
191 | 11.8k | data += 2 * src_stride; |
192 | 11.8k | dst_ptr += 2 * dst_stride; |
193 | 11.8k | y -= 2; |
194 | 11.8k | } while (y > 0); |
195 | 2.07k | } |
196 | 22.7k | } else { |
197 | 982 | assert(!(w % 32)); |
198 | | |
199 | 982 | __m256i s[2], res; |
200 | 1.45k | do { |
201 | 1.45k | data = src_ptr + x; |
202 | 1.45k | dst_ptr = dst + x; |
203 | 1.45k | y = h; |
204 | | |
205 | 1.45k | s[0] = _mm256_loadu_si256((__m256i *)data); |
206 | | |
207 | 37.1k | do { |
208 | 37.1k | s[1] = _mm256_loadu_si256((__m256i *)(data + src_stride)); |
209 | 37.1k | res = _mm256_avg_epu8(s[0], s[1]); |
210 | 37.1k | _mm256_storeu_si256((__m256i *)dst_ptr, res); |
211 | 37.1k | s[0] = _mm256_loadu_si256((__m256i *)(data + 2 * src_stride)); |
212 | 37.1k | res = _mm256_avg_epu8(s[1], s[0]); |
213 | 37.1k | _mm256_storeu_si256((__m256i *)(dst_ptr + dst_stride), res); |
214 | | |
215 | 37.1k | data += 2 * src_stride; |
216 | 37.1k | dst_ptr += 2 * dst_stride; |
217 | 37.1k | y -= 2; |
218 | 37.1k | } while (y > 0); |
219 | | |
220 | 1.45k | x += 32; |
221 | 1.45k | } while (x < w); |
222 | 982 | } |
223 | 23.7k | } |
224 | 449k | } else if (vert_tap == 4) { |
225 | 237k | if (w <= 4) { |
226 | 113k | prepare_coeffs_4t_ssse3(filter_params_y, subpel_y_qn, coeffs_128); |
227 | 113k | __m128i d[4], s[2]; |
228 | | |
229 | 113k | if (w == 2) { |
230 | 19.9k | d[0] = _mm_cvtsi32_si128(loadu_int16(data + 0 * src_stride)); |
231 | 19.9k | d[1] = _mm_cvtsi32_si128(loadu_int16(data + 1 * src_stride)); |
232 | 19.9k | d[2] = _mm_cvtsi32_si128(loadu_int16(data + 2 * src_stride)); |
233 | | |
234 | 19.9k | const __m128i src_01a = _mm_unpacklo_epi16(d[0], d[1]); |
235 | 19.9k | const __m128i src_12a = _mm_unpacklo_epi16(d[1], d[2]); |
236 | | |
237 | 19.9k | s[0] = _mm_unpacklo_epi8(src_01a, src_12a); |
238 | 33.6k | do { |
239 | 33.6k | __m128i res; |
240 | 33.6k | convolve_y_4tap_2x2_ssse3(data, src_stride, coeffs_128, d, s, &res); |
241 | 33.6k | res = round_sr_y_ssse3(res); |
242 | 33.6k | pack_store_u8_2x2_sse2(res, dst_ptr, dst_stride); |
243 | | |
244 | 33.6k | dst_ptr += 2 * dst_stride; |
245 | 33.6k | data += 2 * src_stride; |
246 | 33.6k | y -= 2; |
247 | | |
248 | 33.6k | s[0] = s[1]; |
249 | 33.6k | } while (y > 0); |
250 | | |
251 | 93.9k | } else { |
252 | 93.9k | assert(w == 4); |
253 | | |
254 | 93.9k | d[0] = _mm_cvtsi32_si128(loadu_int32(data + 0 * src_stride)); |
255 | 93.9k | d[1] = _mm_cvtsi32_si128(loadu_int32(data + 1 * src_stride)); |
256 | 93.9k | d[2] = _mm_cvtsi32_si128(loadu_int32(data + 2 * src_stride)); |
257 | | |
258 | 93.9k | const __m128i src_01a = _mm_unpacklo_epi32(d[0], d[1]); |
259 | 93.9k | const __m128i src_12a = _mm_unpacklo_epi32(d[1], d[2]); |
260 | | |
261 | 93.9k | s[0] = _mm_unpacklo_epi8(src_01a, src_12a); |
262 | 185k | do { |
263 | 185k | __m128i res; |
264 | 185k | convolve_y_4tap_4x2_ssse3(data, src_stride, coeffs_128, d, s, &res); |
265 | 185k | res = round_sr_y_ssse3(res); |
266 | 185k | pack_store_u8_4x2_sse2(res, dst_ptr, dst_stride); |
267 | | |
268 | 185k | dst_ptr += 2 * dst_stride; |
269 | 185k | data += 2 * src_stride; |
270 | 185k | y -= 2; |
271 | | |
272 | 185k | s[0] = s[1]; |
273 | 185k | } while (y > 0); |
274 | 93.9k | } |
275 | 123k | } else { |
276 | 123k | prepare_coeffs_4t_lowbd(filter_params_y, subpel_y_qn, coeffs); |
277 | | |
278 | 123k | if (w == 8) { |
279 | 81.8k | __m128i d[4]; |
280 | 81.8k | __m256i s[2]; |
281 | | |
282 | 81.8k | d[0] = _mm_loadl_epi64((__m128i *)(data + 0 * src_stride)); |
283 | 81.8k | d[1] = _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)); |
284 | 81.8k | d[2] = _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)); |
285 | | |
286 | 81.8k | const __m256i src_01a = _mm256_setr_m128i(d[0], d[1]); |
287 | 81.8k | const __m256i src_12a = _mm256_setr_m128i(d[1], d[2]); |
288 | | |
289 | 81.8k | s[0] = _mm256_unpacklo_epi8(src_01a, src_12a); |
290 | 160k | do { |
291 | 160k | __m256i res; |
292 | 160k | convolve_y_4tap_8x2_avx2(data, src_stride, coeffs, d, s, &res); |
293 | 160k | round_pack_store_y_8x2_avx2(res, dst_ptr, dst_stride); |
294 | | |
295 | 160k | dst_ptr += 2 * dst_stride; |
296 | 160k | data += 2 * src_stride; |
297 | 160k | y -= 2; |
298 | | |
299 | 160k | s[0] = s[1]; |
300 | 160k | } while (y > 0); |
301 | 81.8k | } else if (w == 16) { |
302 | 37.6k | __m128i d[4]; |
303 | 37.6k | __m256i s[4]; |
304 | | |
305 | 37.6k | d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride)); |
306 | 37.6k | d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride)); |
307 | 37.6k | d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride)); |
308 | | |
309 | 37.6k | const __m256i src_01a = _mm256_setr_m128i(d[0], d[1]); |
310 | 37.6k | const __m256i src_12a = _mm256_setr_m128i(d[1], d[2]); |
311 | | |
312 | 37.6k | s[0] = _mm256_unpacklo_epi8(src_01a, src_12a); |
313 | 37.6k | s[2] = _mm256_unpackhi_epi8(src_01a, src_12a); |
314 | | |
315 | 92.8k | do { |
316 | 92.8k | __m256i res[2]; |
317 | 92.8k | convolve_y_4tap_16x2_avx2(data, src_stride, coeffs, d, s, res); |
318 | 92.8k | round_pack_store_y_16x2_avx2(res, dst_ptr, dst_stride); |
319 | | |
320 | 92.8k | dst_ptr += 2 * dst_stride; |
321 | 92.8k | data += 2 * src_stride; |
322 | 92.8k | y -= 2; |
323 | | |
324 | 92.8k | s[0] = s[1]; |
325 | 92.8k | s[2] = s[3]; |
326 | 92.8k | } while (y > 0); |
327 | 37.6k | } else { |
328 | 3.81k | assert(!(w % 32)); |
329 | | |
330 | 3.81k | __m256i d[4], s1[4], s2[4]; |
331 | 5.60k | do { |
332 | 5.60k | data = src_ptr + x; |
333 | 5.60k | dst_ptr = dst + x; |
334 | 5.60k | y = h; |
335 | | |
336 | 5.60k | d[0] = _mm256_loadu_si256((__m256i *)(data + 0 * src_stride)); |
337 | 5.60k | d[1] = _mm256_loadu_si256((__m256i *)(data + 1 * src_stride)); |
338 | 5.60k | d[2] = _mm256_loadu_si256((__m256i *)(data + 2 * src_stride)); |
339 | | |
340 | 5.60k | s1[0] = _mm256_unpacklo_epi8(d[0], d[1]); |
341 | 5.60k | s1[2] = _mm256_unpackhi_epi8(d[0], d[1]); |
342 | | |
343 | 5.60k | s2[0] = _mm256_unpacklo_epi8(d[1], d[2]); |
344 | 5.60k | s2[2] = _mm256_unpackhi_epi8(d[1], d[2]); |
345 | | |
346 | 141k | do { |
347 | 141k | __m256i res[4]; |
348 | 141k | convolve_y_4tap_32x2_avx2(data, src_stride, coeffs, d, s1, s2, res); |
349 | 141k | round_pack_store_y_32x2_avx2(res, dst_ptr, dst_stride); |
350 | | |
351 | 141k | dst_ptr += 2 * dst_stride; |
352 | 141k | data += 2 * src_stride; |
353 | 141k | y -= 2; |
354 | | |
355 | 141k | s1[0] = s1[1]; |
356 | 141k | s1[2] = s1[3]; |
357 | | |
358 | 141k | s2[0] = s2[1]; |
359 | 141k | s2[2] = s2[3]; |
360 | 141k | } while (y > 0); |
361 | | |
362 | 5.60k | x += 32; |
363 | 5.60k | } while (x < w); |
364 | 3.81k | } |
365 | 123k | } |
366 | 237k | } else if (vert_tap == 6) { |
367 | 200k | if (w <= 4) { |
368 | 61.9k | prepare_coeffs_6t_ssse3(filter_params_y, subpel_y_qn, coeffs_128); |
369 | | |
370 | 61.9k | __m128i d[6], s[3]; |
371 | 61.9k | if (w == 2) { |
372 | 11.6k | d[0] = _mm_cvtsi32_si128(loadu_int16(data + 0 * src_stride)); |
373 | 11.6k | d[1] = _mm_cvtsi32_si128(loadu_int16(data + 1 * src_stride)); |
374 | 11.6k | d[2] = _mm_cvtsi32_si128(loadu_int16(data + 2 * src_stride)); |
375 | 11.6k | d[3] = _mm_cvtsi32_si128(loadu_int16(data + 3 * src_stride)); |
376 | 11.6k | d[4] = _mm_cvtsi32_si128(loadu_int16(data + 4 * src_stride)); |
377 | | |
378 | 11.6k | const __m128i src_01a = _mm_unpacklo_epi16(d[0], d[1]); |
379 | 11.6k | const __m128i src_12a = _mm_unpacklo_epi16(d[1], d[2]); |
380 | 11.6k | const __m128i src_23a = _mm_unpacklo_epi16(d[2], d[3]); |
381 | 11.6k | const __m128i src_34a = _mm_unpacklo_epi16(d[3], d[4]); |
382 | | |
383 | 11.6k | s[0] = _mm_unpacklo_epi8(src_01a, src_12a); |
384 | 11.6k | s[1] = _mm_unpacklo_epi8(src_23a, src_34a); |
385 | | |
386 | 46.5k | do { |
387 | 46.5k | __m128i res; |
388 | 46.5k | convolve_y_6tap_2x2_ssse3(data, src_stride, coeffs_128, d, s, &res); |
389 | 46.5k | res = round_sr_y_ssse3(res); |
390 | 46.5k | pack_store_u8_2x2_sse2(res, dst_ptr, dst_stride); |
391 | | |
392 | 46.5k | dst_ptr += 2 * dst_stride; |
393 | 46.5k | data += 2 * src_stride; |
394 | 46.5k | y -= 2; |
395 | | |
396 | 46.5k | s[0] = s[1]; |
397 | 46.5k | s[1] = s[2]; |
398 | 46.5k | } while (y > 0); |
399 | | |
400 | 50.2k | } else { |
401 | 50.2k | assert(w == 4); |
402 | 50.2k | d[0] = _mm_cvtsi32_si128(loadu_int32(data + 0 * src_stride)); |
403 | 50.2k | d[1] = _mm_cvtsi32_si128(loadu_int32(data + 1 * src_stride)); |
404 | 50.2k | d[2] = _mm_cvtsi32_si128(loadu_int32(data + 2 * src_stride)); |
405 | 50.2k | d[3] = _mm_cvtsi32_si128(loadu_int32(data + 3 * src_stride)); |
406 | 50.2k | d[4] = _mm_cvtsi32_si128(loadu_int32(data + 4 * src_stride)); |
407 | | |
408 | 50.2k | const __m128i src_01a = _mm_unpacklo_epi32(d[0], d[1]); |
409 | 50.2k | const __m128i src_12a = _mm_unpacklo_epi32(d[1], d[2]); |
410 | 50.2k | const __m128i src_23a = _mm_unpacklo_epi32(d[2], d[3]); |
411 | 50.2k | const __m128i src_34a = _mm_unpacklo_epi32(d[3], d[4]); |
412 | | |
413 | 50.2k | s[0] = _mm_unpacklo_epi8(src_01a, src_12a); |
414 | 50.2k | s[1] = _mm_unpacklo_epi8(src_23a, src_34a); |
415 | | |
416 | 274k | do { |
417 | 274k | __m128i res; |
418 | 274k | convolve_y_6tap_4x2_ssse3(data, src_stride, coeffs_128, d, s, &res); |
419 | 274k | res = round_sr_y_ssse3(res); |
420 | 274k | pack_store_u8_4x2_sse2(res, dst_ptr, dst_stride); |
421 | | |
422 | 274k | dst_ptr += 2 * dst_stride; |
423 | 274k | data += 2 * src_stride; |
424 | 274k | y -= 2; |
425 | | |
426 | 274k | s[0] = s[1]; |
427 | 274k | s[1] = s[2]; |
428 | 274k | } while (y > 0); |
429 | 50.2k | } |
430 | 138k | } else { |
431 | 138k | prepare_coeffs_6t_lowbd(filter_params_y, subpel_y_qn, coeffs); |
432 | | |
433 | 138k | if (w == 8) { |
434 | 66.7k | __m128i d[6]; |
435 | 66.7k | __m256i s[3]; |
436 | | |
437 | 66.7k | d[0] = _mm_loadl_epi64((__m128i *)(data + 0 * src_stride)); |
438 | 66.7k | d[1] = _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)); |
439 | 66.7k | d[2] = _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)); |
440 | 66.7k | d[3] = _mm_loadl_epi64((__m128i *)(data + 3 * src_stride)); |
441 | 66.7k | d[4] = _mm_loadl_epi64((__m128i *)(data + 4 * src_stride)); |
442 | | |
443 | 66.7k | const __m256i src_01a = _mm256_setr_m128i(d[0], d[1]); |
444 | 66.7k | const __m256i src_12a = _mm256_setr_m128i(d[1], d[2]); |
445 | 66.7k | const __m256i src_23a = _mm256_setr_m128i(d[2], d[3]); |
446 | 66.7k | const __m256i src_34a = _mm256_setr_m128i(d[3], d[4]); |
447 | | |
448 | 66.7k | s[0] = _mm256_unpacklo_epi8(src_01a, src_12a); |
449 | 66.7k | s[1] = _mm256_unpacklo_epi8(src_23a, src_34a); |
450 | | |
451 | 382k | do { |
452 | 382k | __m256i res; |
453 | 382k | convolve_y_6tap_8x2_avx2(data, src_stride, coeffs, d, s, &res); |
454 | 382k | round_pack_store_y_8x2_avx2(res, dst_ptr, dst_stride); |
455 | | |
456 | 382k | dst_ptr += 2 * dst_stride; |
457 | 382k | data += 2 * src_stride; |
458 | 382k | y -= 2; |
459 | | |
460 | 382k | s[0] = s[1]; |
461 | 382k | s[1] = s[2]; |
462 | 382k | } while (y > 0); |
463 | | |
464 | 71.3k | } else { |
465 | 71.3k | assert(!(w % 16)); |
466 | | |
467 | 71.3k | __m128i d[6]; |
468 | 71.3k | __m256i s[6]; |
469 | 101k | do { |
470 | 101k | data = src_ptr + x; |
471 | 101k | dst_ptr = dst + x; |
472 | 101k | y = h; |
473 | | |
474 | 101k | d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride)); |
475 | 101k | d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride)); |
476 | 101k | d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride)); |
477 | 101k | d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride)); |
478 | 101k | d[4] = _mm_loadu_si128((__m128i *)(data + 4 * src_stride)); |
479 | | |
480 | 101k | const __m256i src_01a = _mm256_setr_m128i(d[0], d[1]); |
481 | 101k | const __m256i src_12a = _mm256_setr_m128i(d[1], d[2]); |
482 | 101k | const __m256i src_23a = _mm256_setr_m128i(d[2], d[3]); |
483 | 101k | const __m256i src_34a = _mm256_setr_m128i(d[3], d[4]); |
484 | | |
485 | 101k | s[0] = _mm256_unpacklo_epi8(src_01a, src_12a); |
486 | 101k | s[1] = _mm256_unpacklo_epi8(src_23a, src_34a); |
487 | | |
488 | 101k | s[3] = _mm256_unpackhi_epi8(src_01a, src_12a); |
489 | 101k | s[4] = _mm256_unpackhi_epi8(src_23a, src_34a); |
490 | | |
491 | 1.24M | do { |
492 | 1.24M | __m256i res[2]; |
493 | 1.24M | convolve_y_6tap_16x2_avx2(data, src_stride, coeffs, d, s, res); |
494 | 1.24M | round_pack_store_y_16x2_avx2(res, dst_ptr, dst_stride); |
495 | | |
496 | 1.24M | dst_ptr += 2 * dst_stride; |
497 | 1.24M | data += 2 * src_stride; |
498 | 1.24M | y -= 2; |
499 | | |
500 | 1.24M | s[0] = s[1]; |
501 | 1.24M | s[1] = s[2]; |
502 | | |
503 | 1.24M | s[3] = s[4]; |
504 | 1.24M | s[4] = s[5]; |
505 | 1.24M | } while (y > 0); |
506 | | |
507 | 101k | x += 16; |
508 | 101k | } while (x < w); |
509 | 71.3k | } |
510 | 138k | } |
511 | 200k | } else if (vert_tap == 12) { // vert_tap == 12 |
512 | 0 | __m128i d[12]; |
513 | 0 | __m256i s[12]; |
514 | 0 | prepare_coeffs_12taps(filter_params_y, subpel_y_qn, coeffs); |
515 | 0 | const __m256i v_zero = _mm256_setzero_si256(); |
516 | 0 | __m128i right_shift = _mm_cvtsi32_si128(FILTER_BITS); |
517 | 0 | __m256i right_shift_const = _mm256_set1_epi32((1 << FILTER_BITS) >> 1); |
518 | |
519 | 0 | for (int j = 0; j < w; j += 8) { |
520 | 0 | data = &src_ptr[j]; |
521 | 0 | __m256i src10; |
522 | |
523 | 0 | d[0] = _mm_loadl_epi64((__m128i *)(data + 0 * src_stride)); |
524 | 0 | d[1] = _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)); |
525 | 0 | d[2] = _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)); |
526 | 0 | d[3] = _mm_loadl_epi64((__m128i *)(data + 3 * src_stride)); |
527 | 0 | d[4] = _mm_loadl_epi64((__m128i *)(data + 4 * src_stride)); |
528 | 0 | d[5] = _mm_loadl_epi64((__m128i *)(data + 5 * src_stride)); |
529 | 0 | d[6] = _mm_loadl_epi64((__m128i *)(data + 6 * src_stride)); |
530 | 0 | d[7] = _mm_loadl_epi64((__m128i *)(data + 7 * src_stride)); |
531 | 0 | d[8] = _mm_loadl_epi64((__m128i *)(data + 8 * src_stride)); |
532 | 0 | d[9] = _mm_loadl_epi64((__m128i *)(data + 9 * src_stride)); |
533 | | // Load lines a and b. Line a to lower 128, line b to upper 128 |
534 | 0 | const __m256i src_01a = _mm256_permute2x128_si256( |
535 | 0 | _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20); |
536 | |
537 | 0 | const __m256i src_12a = _mm256_permute2x128_si256( |
538 | 0 | _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20); |
539 | |
540 | 0 | const __m256i src_23a = _mm256_permute2x128_si256( |
541 | 0 | _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20); |
542 | |
543 | 0 | const __m256i src_34a = _mm256_permute2x128_si256( |
544 | 0 | _mm256_castsi128_si256(d[3]), _mm256_castsi128_si256(d[4]), 0x20); |
545 | |
546 | 0 | const __m256i src_45a = _mm256_permute2x128_si256( |
547 | 0 | _mm256_castsi128_si256(d[4]), _mm256_castsi128_si256(d[5]), 0x20); |
548 | |
549 | 0 | const __m256i src_56a = _mm256_permute2x128_si256( |
550 | 0 | _mm256_castsi128_si256(d[5]), _mm256_castsi128_si256(d[6]), 0x20); |
551 | |
552 | 0 | const __m256i src_67a = _mm256_permute2x128_si256( |
553 | 0 | _mm256_castsi128_si256(d[6]), _mm256_castsi128_si256(d[7]), 0x20); |
554 | |
555 | 0 | const __m256i src_78a = _mm256_permute2x128_si256( |
556 | 0 | _mm256_castsi128_si256(d[7]), _mm256_castsi128_si256(d[8]), 0x20); |
557 | |
558 | 0 | const __m256i src_89a = _mm256_permute2x128_si256( |
559 | 0 | _mm256_castsi128_si256(d[8]), _mm256_castsi128_si256(d[9]), 0x20); |
560 | |
561 | 0 | src10 = _mm256_castsi128_si256( |
562 | 0 | _mm_loadl_epi64((__m128i *)(data + 10 * src_stride))); |
563 | 0 | const __m256i src_910a = |
564 | 0 | _mm256_permute2x128_si256(_mm256_castsi128_si256(d[9]), src10, 0x20); |
565 | |
566 | 0 | const __m256i src_01 = _mm256_unpacklo_epi8(src_01a, v_zero); |
567 | 0 | const __m256i src_12 = _mm256_unpacklo_epi8(src_12a, v_zero); |
568 | 0 | const __m256i src_23 = _mm256_unpacklo_epi8(src_23a, v_zero); |
569 | 0 | const __m256i src_34 = _mm256_unpacklo_epi8(src_34a, v_zero); |
570 | 0 | const __m256i src_45 = _mm256_unpacklo_epi8(src_45a, v_zero); |
571 | 0 | const __m256i src_56 = _mm256_unpacklo_epi8(src_56a, v_zero); |
572 | 0 | const __m256i src_67 = _mm256_unpacklo_epi8(src_67a, v_zero); |
573 | 0 | const __m256i src_78 = _mm256_unpacklo_epi8(src_78a, v_zero); |
574 | 0 | const __m256i src_89 = _mm256_unpacklo_epi8(src_89a, v_zero); |
575 | 0 | const __m256i src_910 = _mm256_unpacklo_epi8(src_910a, v_zero); |
576 | |
577 | 0 | s[0] = _mm256_unpacklo_epi16(src_01, src_12); |
578 | 0 | s[1] = _mm256_unpacklo_epi16(src_23, src_34); |
579 | 0 | s[2] = _mm256_unpacklo_epi16(src_45, src_56); |
580 | 0 | s[3] = _mm256_unpacklo_epi16(src_67, src_78); |
581 | 0 | s[4] = _mm256_unpacklo_epi16(src_89, src_910); |
582 | |
583 | 0 | s[6] = _mm256_unpackhi_epi16(src_01, src_12); |
584 | 0 | s[7] = _mm256_unpackhi_epi16(src_23, src_34); |
585 | 0 | s[8] = _mm256_unpackhi_epi16(src_45, src_56); |
586 | 0 | s[9] = _mm256_unpackhi_epi16(src_67, src_78); |
587 | 0 | s[10] = _mm256_unpackhi_epi16(src_89, src_910); |
588 | |
589 | 0 | for (i = 0; i < h; i += 2) { |
590 | 0 | data = &src_ptr[i * src_stride + j]; |
591 | 0 | const __m256i src_1011a = _mm256_permute2x128_si256( |
592 | 0 | src10, |
593 | 0 | _mm256_castsi128_si256( |
594 | 0 | _mm_loadl_epi64((__m128i *)(data + 11 * src_stride))), |
595 | 0 | 0x20); |
596 | |
597 | 0 | src10 = _mm256_castsi128_si256( |
598 | 0 | _mm_loadl_epi64((__m128i *)(data + 12 * src_stride))); |
599 | |
600 | 0 | const __m256i src_1112a = _mm256_permute2x128_si256( |
601 | 0 | _mm256_castsi128_si256( |
602 | 0 | _mm_loadl_epi64((__m128i *)(data + 11 * src_stride))), |
603 | 0 | src10, 0x20); |
604 | |
605 | 0 | const __m256i src_1011 = _mm256_unpacklo_epi8(src_1011a, v_zero); |
606 | 0 | const __m256i src_1112 = _mm256_unpacklo_epi8(src_1112a, v_zero); |
607 | |
608 | 0 | s[5] = _mm256_unpacklo_epi16(src_1011, src_1112); |
609 | 0 | s[11] = _mm256_unpackhi_epi16(src_1011, src_1112); |
610 | |
611 | 0 | const __m256i res_lo = convolve_12taps(s, coeffs); |
612 | |
613 | 0 | const __m256i res_32b_lo = _mm256_sra_epi32( |
614 | 0 | _mm256_add_epi32(res_lo, right_shift_const), right_shift); |
615 | | // 8 bit conversion and saturation to uint8 |
616 | 0 | __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo); |
617 | 0 | __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo); |
618 | |
619 | 0 | if (w - j > 4) { |
620 | 0 | const __m256i res_hi = convolve_12taps(s + 6, coeffs); |
621 | |
622 | 0 | const __m256i res_32b_hi = _mm256_sra_epi32( |
623 | 0 | _mm256_add_epi32(res_hi, right_shift_const), right_shift); |
624 | 0 | __m256i res_16b_hi = _mm256_packs_epi32(res_32b_hi, res_32b_hi); |
625 | | // 8 bit conversion and saturation to uint8 |
626 | 0 | __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi); |
627 | |
628 | 0 | __m256i res_a = _mm256_unpacklo_epi32(res_8b_lo, res_8b_hi); |
629 | |
630 | 0 | const __m128i res_0 = _mm256_extracti128_si256(res_a, 0); |
631 | 0 | const __m128i res_1 = _mm256_extracti128_si256(res_a, 1); |
632 | |
633 | 0 | _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0); |
634 | 0 | _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride], |
635 | 0 | res_1); |
636 | 0 | } else { |
637 | 0 | const __m128i res_0 = _mm256_extracti128_si256(res_8b_lo, 0); |
638 | 0 | const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1); |
639 | 0 | if (w - j > 2) { |
640 | 0 | xx_storel_32(&dst[i * dst_stride + j], res_0); |
641 | 0 | xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1); |
642 | 0 | } else { |
643 | 0 | xx_storel_16(&dst[i * dst_stride + j], res_0); |
644 | 0 | xx_storel_16(&dst[i * dst_stride + j + dst_stride], res_1); |
645 | 0 | } |
646 | 0 | } |
647 | 0 | s[0] = s[1]; |
648 | 0 | s[1] = s[2]; |
649 | 0 | s[2] = s[3]; |
650 | 0 | s[3] = s[4]; |
651 | 0 | s[4] = s[5]; |
652 | |
653 | 0 | s[6] = s[7]; |
654 | 0 | s[7] = s[8]; |
655 | 0 | s[8] = s[9]; |
656 | 0 | s[9] = s[10]; |
657 | 0 | s[10] = s[11]; |
658 | 0 | } |
659 | 0 | } |
660 | 11.8k | } else { |
661 | 11.8k | assert(vert_tap == 8); |
662 | | |
663 | 11.8k | if (w <= 4) { |
664 | 5.08k | prepare_coeffs_ssse3(filter_params_y, subpel_y_qn, coeffs_128); |
665 | | |
666 | 5.08k | __m128i d[8], s[4], res; |
667 | 5.08k | if (w == 2) { |
668 | 1.11k | d[0] = _mm_cvtsi32_si128(loadu_int16(data + 0 * src_stride)); |
669 | 1.11k | d[1] = _mm_cvtsi32_si128(loadu_int16(data + 1 * src_stride)); |
670 | 1.11k | d[2] = _mm_cvtsi32_si128(loadu_int16(data + 2 * src_stride)); |
671 | 1.11k | d[3] = _mm_cvtsi32_si128(loadu_int16(data + 3 * src_stride)); |
672 | 1.11k | d[4] = _mm_cvtsi32_si128(loadu_int16(data + 4 * src_stride)); |
673 | 1.11k | d[5] = _mm_cvtsi32_si128(loadu_int16(data + 5 * src_stride)); |
674 | 1.11k | d[6] = _mm_cvtsi32_si128(loadu_int16(data + 6 * src_stride)); |
675 | | |
676 | 1.11k | const __m128i src_01a = _mm_unpacklo_epi16(d[0], d[1]); |
677 | 1.11k | const __m128i src_12a = _mm_unpacklo_epi16(d[1], d[2]); |
678 | 1.11k | const __m128i src_23a = _mm_unpacklo_epi16(d[2], d[3]); |
679 | 1.11k | const __m128i src_34a = _mm_unpacklo_epi16(d[3], d[4]); |
680 | 1.11k | const __m128i src_45a = _mm_unpacklo_epi16(d[4], d[5]); |
681 | 1.11k | const __m128i src_56a = _mm_unpacklo_epi16(d[5], d[6]); |
682 | | |
683 | 1.11k | s[0] = _mm_unpacklo_epi8(src_01a, src_12a); |
684 | 1.11k | s[1] = _mm_unpacklo_epi8(src_23a, src_34a); |
685 | 1.11k | s[2] = _mm_unpacklo_epi8(src_45a, src_56a); |
686 | | |
687 | 4.46k | do { |
688 | 4.46k | convolve_y_8tap_2x2_ssse3(data, src_stride, coeffs_128, d, s, &res); |
689 | 4.46k | res = round_sr_y_ssse3(res); |
690 | 4.46k | pack_store_u8_2x2_sse2(res, dst_ptr, dst_stride); |
691 | | |
692 | 4.46k | dst_ptr += 2 * dst_stride; |
693 | 4.46k | data += 2 * src_stride; |
694 | 4.46k | y -= 2; |
695 | | |
696 | 4.46k | s[0] = s[1]; |
697 | 4.46k | s[1] = s[2]; |
698 | 4.46k | s[2] = s[3]; |
699 | 4.46k | } while (y > 0); |
700 | | |
701 | 3.97k | } else { |
702 | 3.97k | assert(w == 4); |
703 | | |
704 | 3.97k | d[0] = _mm_cvtsi32_si128(loadu_int32(data + 0 * src_stride)); |
705 | 3.97k | d[1] = _mm_cvtsi32_si128(loadu_int32(data + 1 * src_stride)); |
706 | 3.97k | d[2] = _mm_cvtsi32_si128(loadu_int32(data + 2 * src_stride)); |
707 | 3.97k | d[3] = _mm_cvtsi32_si128(loadu_int32(data + 3 * src_stride)); |
708 | 3.97k | d[4] = _mm_cvtsi32_si128(loadu_int32(data + 4 * src_stride)); |
709 | 3.97k | d[5] = _mm_cvtsi32_si128(loadu_int32(data + 5 * src_stride)); |
710 | 3.97k | d[6] = _mm_cvtsi32_si128(loadu_int32(data + 6 * src_stride)); |
711 | | |
712 | 3.97k | const __m128i src_01a = _mm_unpacklo_epi32(d[0], d[1]); |
713 | 3.97k | const __m128i src_12a = _mm_unpacklo_epi32(d[1], d[2]); |
714 | 3.97k | const __m128i src_23a = _mm_unpacklo_epi32(d[2], d[3]); |
715 | 3.97k | const __m128i src_34a = _mm_unpacklo_epi32(d[3], d[4]); |
716 | 3.97k | const __m128i src_45a = _mm_unpacklo_epi32(d[4], d[5]); |
717 | 3.97k | const __m128i src_56a = _mm_unpacklo_epi32(d[5], d[6]); |
718 | | |
719 | 3.97k | s[0] = _mm_unpacklo_epi8(src_01a, src_12a); |
720 | 3.97k | s[1] = _mm_unpacklo_epi8(src_23a, src_34a); |
721 | 3.97k | s[2] = _mm_unpacklo_epi8(src_45a, src_56a); |
722 | | |
723 | 21.6k | do { |
724 | 21.6k | convolve_y_8tap_4x2_ssse3(data, src_stride, coeffs_128, d, s, &res); |
725 | 21.6k | res = round_sr_y_ssse3(res); |
726 | 21.6k | pack_store_u8_4x2_sse2(res, dst_ptr, dst_stride); |
727 | | |
728 | 21.6k | dst_ptr += 2 * dst_stride; |
729 | 21.6k | data += 2 * src_stride; |
730 | 21.6k | y -= 2; |
731 | | |
732 | 21.6k | s[0] = s[1]; |
733 | 21.6k | s[1] = s[2]; |
734 | 21.6k | s[2] = s[3]; |
735 | 21.6k | } while (y > 0); |
736 | 3.97k | } |
737 | 6.74k | } else { |
738 | 6.74k | prepare_coeffs_lowbd(filter_params_y, subpel_y_qn, coeffs); |
739 | | |
740 | 6.74k | if (w == 8) { |
741 | 3.43k | __m128i d[8]; |
742 | 3.43k | __m256i s[4]; |
743 | | |
744 | 3.43k | d[0] = _mm_loadl_epi64((__m128i *)(data + 0 * src_stride)); |
745 | 3.43k | d[1] = _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)); |
746 | 3.43k | d[2] = _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)); |
747 | 3.43k | d[3] = _mm_loadl_epi64((__m128i *)(data + 3 * src_stride)); |
748 | 3.43k | d[4] = _mm_loadl_epi64((__m128i *)(data + 4 * src_stride)); |
749 | 3.43k | d[5] = _mm_loadl_epi64((__m128i *)(data + 5 * src_stride)); |
750 | 3.43k | d[6] = _mm_loadl_epi64((__m128i *)(data + 6 * src_stride)); |
751 | | |
752 | 3.43k | const __m256i src_01a = _mm256_setr_m128i(d[0], d[1]); |
753 | 3.43k | const __m256i src_12a = _mm256_setr_m128i(d[1], d[2]); |
754 | 3.43k | const __m256i src_23a = _mm256_setr_m128i(d[2], d[3]); |
755 | 3.43k | const __m256i src_34a = _mm256_setr_m128i(d[3], d[4]); |
756 | 3.43k | const __m256i src_45a = _mm256_setr_m128i(d[4], d[5]); |
757 | 3.43k | const __m256i src_56a = _mm256_setr_m128i(d[5], d[6]); |
758 | | |
759 | 3.43k | s[0] = _mm256_unpacklo_epi8(src_01a, src_12a); |
760 | 3.43k | s[1] = _mm256_unpacklo_epi8(src_23a, src_34a); |
761 | 3.43k | s[2] = _mm256_unpacklo_epi8(src_45a, src_56a); |
762 | | |
763 | 20.9k | do { |
764 | 20.9k | __m256i res; |
765 | 20.9k | convolve_y_8tap_8x2_avx2(data, src_stride, coeffs, d, s, &res); |
766 | 20.9k | round_pack_store_y_8x2_avx2(res, dst_ptr, dst_stride); |
767 | | |
768 | 20.9k | dst_ptr += 2 * dst_stride; |
769 | 20.9k | data += 2 * src_stride; |
770 | 20.9k | y -= 2; |
771 | | |
772 | 20.9k | s[0] = s[1]; |
773 | 20.9k | s[1] = s[2]; |
774 | 20.9k | s[2] = s[3]; |
775 | 20.9k | } while (y > 0); |
776 | | |
777 | 3.43k | } else { |
778 | 3.31k | assert(!(w % 16)); |
779 | | |
780 | 3.31k | __m128i d[8]; |
781 | 3.31k | __m256i s[8]; |
782 | 5.39k | do { |
783 | 5.39k | data = src_ptr + x; |
784 | 5.39k | dst_ptr = dst + x; |
785 | 5.39k | y = h; |
786 | | |
787 | 5.39k | d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride)); |
788 | 5.39k | d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride)); |
789 | 5.39k | d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride)); |
790 | 5.39k | d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride)); |
791 | 5.39k | d[4] = _mm_loadu_si128((__m128i *)(data + 4 * src_stride)); |
792 | 5.39k | d[5] = _mm_loadu_si128((__m128i *)(data + 5 * src_stride)); |
793 | 5.39k | d[6] = _mm_loadu_si128((__m128i *)(data + 6 * src_stride)); |
794 | | |
795 | 5.39k | const __m256i src_01a = _mm256_setr_m128i(d[0], d[1]); |
796 | 5.39k | const __m256i src_12a = _mm256_setr_m128i(d[1], d[2]); |
797 | 5.39k | const __m256i src_23a = _mm256_setr_m128i(d[2], d[3]); |
798 | 5.39k | const __m256i src_34a = _mm256_setr_m128i(d[3], d[4]); |
799 | 5.39k | const __m256i src_45a = _mm256_setr_m128i(d[4], d[5]); |
800 | 5.39k | const __m256i src_56a = _mm256_setr_m128i(d[5], d[6]); |
801 | | |
802 | 5.39k | s[0] = _mm256_unpacklo_epi8(src_01a, src_12a); |
803 | 5.39k | s[1] = _mm256_unpacklo_epi8(src_23a, src_34a); |
804 | 5.39k | s[2] = _mm256_unpacklo_epi8(src_45a, src_56a); |
805 | | |
806 | 5.39k | s[4] = _mm256_unpackhi_epi8(src_01a, src_12a); |
807 | 5.39k | s[5] = _mm256_unpackhi_epi8(src_23a, src_34a); |
808 | 5.39k | s[6] = _mm256_unpackhi_epi8(src_45a, src_56a); |
809 | | |
810 | 79.4k | do { |
811 | 79.4k | __m256i res[2]; |
812 | 79.4k | convolve_y_8tap_16x2_avx2(data, src_stride, coeffs, d, s, res); |
813 | 79.4k | round_pack_store_y_16x2_avx2(res, dst_ptr, dst_stride); |
814 | | |
815 | 79.4k | dst_ptr += 2 * dst_stride; |
816 | 79.4k | data += 2 * src_stride; |
817 | 79.4k | y -= 2; |
818 | | |
819 | 79.4k | s[0] = s[1]; |
820 | 79.4k | s[1] = s[2]; |
821 | 79.4k | s[2] = s[3]; |
822 | | |
823 | 79.4k | s[4] = s[5]; |
824 | 79.4k | s[5] = s[6]; |
825 | 79.4k | s[6] = s[7]; |
826 | 79.4k | } while (y > 0); |
827 | | |
828 | 5.39k | x += 16; |
829 | 5.39k | } while (x < w); |
830 | 3.31k | } |
831 | 6.74k | } |
832 | 11.8k | } |
833 | 486k | } |
834 | | |
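Why the subpel_y_qn == 8 branch above needs no coefficient multiply: at the half-pel position the 2-tap bilinear kernel is { 64, 64 } and FILTER_BITS == 7, so (64 * a + 64 * b + 64) >> 7 reduces to (a + b + 1) >> 1 — exactly the per-byte rounding average that _mm_avg_epu8 / _mm256_avg_epu8 compute. A minimal sketch of that equivalence (the helper names here are illustrative, not part of libaom):

    #include <stdint.h>
    #include <emmintrin.h>  /* SSE2 */

    /* Half-pel 2-tap filter: taps { 64, 64 }, FILTER_BITS == 7, so the
     * convolution collapses to a round-to-nearest byte average. */
    static uint8_t vert_half_pel_scalar(uint8_t a, uint8_t b) {
      return (uint8_t)((a + b + 1) >> 1);  /* == (64*a + 64*b + 64) >> 7 */
    }

    /* One 16-wide row pair of the fast path, as in the w == 16 loop:
     * load two vertically adjacent rows and average them. */
    static void vert_half_pel_16(const uint8_t *src, int src_stride,
                                 uint8_t *dst) {
      const __m128i r0 = _mm_loadu_si128((const __m128i *)src);
      const __m128i r1 = _mm_loadu_si128((const __m128i *)(src + src_stride));
      _mm_storeu_si128((__m128i *)dst, _mm_avg_epu8(r0, r1));
    }

The longer-tap paths avoid reloading rows a different way: _mm256_setr_m128i(d[n], d[n + 1]) packs row n into the low 128-bit lane and row n + 1 into the high lane, so one multiply-add chain yields two output rows per iteration, and the trailing s[0] = s[1]; s[1] = s[2]; ... assignments slide that window down two rows so each source line is read only once.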
835 | | void av1_convolve_x_sr_avx2(const uint8_t *src, int32_t src_stride, |
836 | | uint8_t *dst, int32_t dst_stride, int32_t w, |
837 | | int32_t h, |
838 | | const InterpFilterParams *filter_params_x, |
839 | | const int32_t subpel_x_qn, |
840 | 501k | ConvolveParams *conv_params) { |
841 | 501k | const int bits = FILTER_BITS - conv_params->round_0; |
842 | 501k | int i, j, horiz_tap = get_filter_tap(filter_params_x, subpel_x_qn); |
843 | | |
844 | 501k | assert(bits >= 0); |
845 | 501k | assert((FILTER_BITS - conv_params->round_1) >= 0 || |
846 | 501k | ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS)); |
847 | 501k | assert(conv_params->round_0 > 0); |
848 | | |
849 | 501k | assert(horiz_tap == 2 || horiz_tap == 4 || horiz_tap == 6 || horiz_tap == 8 || |
850 | 501k | horiz_tap == 12); |
851 | 501k | assert((!(w % 2)) || (w <= 128)); |
852 | 501k | assert((h % 2) == 0); |
853 | | |
854 | 501k | __m256i coeffs[6] = { 0 }, filt[4] = { 0 }; |
855 | 501k | __m128i coeffs_128[4] = { 0 }; |
856 | | |
857 | 501k | i = 0; |
858 | | // horz_filt as 4 tap |
859 | 501k | if (horiz_tap == 4) { |
860 | | // since fo_horiz = 1 |
861 | 202k | const uint8_t *src_ptr = src - 1; |
862 | 202k | if (w == 2) { |
863 | 33.4k | prepare_coeffs_4t_ssse3(filter_params_x, subpel_x_qn, coeffs_128); |
864 | 88.0k | do { |
865 | 88.0k | const __m128i res = |
866 | 88.0k | convolve_x_4tap_2x2_ssse3(src_ptr, src_stride, coeffs_128); |
867 | 88.0k | const __m128i reg = round_sr_x_ssse3(res); |
868 | 88.0k | pack_store_u8_2x2_sse2(reg, dst, dst_stride); |
869 | 88.0k | src_ptr += 2 * src_stride; |
870 | 88.0k | dst += 2 * dst_stride; |
871 | 88.0k | h -= 2; |
872 | 88.0k | } while (h); |
873 | 168k | } else if (w == 4) { |
874 | 153k | prepare_coeffs_4t_ssse3(filter_params_x, subpel_x_qn, coeffs_128); |
875 | 507k | do { |
876 | 507k | const __m128i reg = |
877 | 507k | convolve_x_4tap_4x2_ssse3(src_ptr, src_stride, coeffs_128); |
878 | 507k | const __m128i res = round_sr_x_ssse3(reg); |
879 | 507k | pack_store_u8_4x2_sse2(res, dst, dst_stride); |
880 | 507k | src_ptr += 2 * src_stride; |
881 | 507k | dst += 2 * dst_stride; |
882 | 507k | h -= 2; |
883 | 507k | } while (h); |
884 | 153k | } else if (w == 8) { |
885 | 8.79k | prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs); |
886 | 8.79k | filt[0] = _mm256_load_si256((__m256i const *)(filt_global_avx2)); |
887 | 8.79k | filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); |
888 | 34.3k | do { |
889 | 34.3k | const __m256i data = _mm256_setr_m128i( |
890 | 34.3k | _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride])), |
891 | 34.3k | _mm_loadu_si128( |
892 | 34.3k | (__m128i *)(&src_ptr[i * src_stride + src_stride]))); |
893 | | |
894 | 34.3k | __m256i res_16b = convolve_lowbd_x_4tap(data, coeffs + 1, filt); |
895 | | |
896 | 34.3k | res_16b = round_sr_x_avx2(res_16b); |
897 | | |
898 | | /* rounding code */ |
899 | | // 8 bit conversion and saturation to uint8 |
900 | 34.3k | __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b); |
901 | | |
902 | 34.3k | const __m128i res_0 = _mm256_castsi256_si128(res_8b); |
903 | 34.3k | const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1); |
904 | | |
905 | 34.3k | _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0); |
906 | 34.3k | _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1); |
907 | 34.3k | i += 2; |
908 | 34.3k | } while (i < h); |
909 | 8.79k | } else { |
910 | 6.94k | assert(!(w % 16)); |
911 | 6.94k | prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs); |
912 | 6.94k | filt[0] = _mm256_load_si256((__m256i const *)(filt_global_avx2)); |
913 | 6.94k | filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); |
914 | 159k | do { |
915 | 159k | j = 0; |
916 | 554k | do { |
917 | | // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17 |
918 | | // 18 19 20 21 22 23 |
919 | 554k | const __m256i data = _mm256_inserti128_si256( |
920 | 554k | _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]), |
921 | 554k | _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]), |
922 | 554k | 1); |
923 | | |
924 | 554k | __m256i res_16b = convolve_lowbd_x_4tap(data, coeffs + 1, filt); |
925 | | |
926 | 554k | res_16b = round_sr_x_avx2(res_16b); |
927 | | |
928 | | /* rounding code */ |
929 | | // 8 bit conversion and saturation to uint8 |
930 | 554k | __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b); |
931 | | |
932 | | // Store values into the destination buffer |
933 | | // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 |
934 | 554k | res_8b = _mm256_permute4x64_epi64(res_8b, 216); |
935 | 554k | __m128i res = _mm256_castsi256_si128(res_8b); |
936 | 554k | _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res); |
937 | 554k | j += 16; |
938 | 554k | } while (j < w); |
939 | 159k | i++; |
940 | 159k | } while (i < h); |
941 | 6.94k | } |
942 | 299k | } else if (horiz_tap == 6) { |
943 | | // since (horiz_tap/2 - 1 == 2) |
944 | 245k | const uint8_t *src_ptr = src - 2; |
945 | 245k | prepare_coeffs_6t_lowbd(filter_params_x, subpel_x_qn, coeffs); |
946 | 245k | filt[0] = _mm256_load_si256((__m256i const *)(filt_global_avx2)); |
947 | 245k | filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); |
948 | 245k | filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); |
949 | 245k | if (w == 8) { |
950 | 522k | do { |
951 | 522k | const __m256i data = _mm256_setr_m128i( |
952 | 522k | _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride])), |
953 | 522k | _mm_loadu_si128( |
954 | 522k | (__m128i *)(&src_ptr[i * src_stride + src_stride]))); |
955 | | |
956 | 522k | __m256i res_16b = convolve_lowbd_x_6tap(data, coeffs, filt); |
957 | | |
958 | 522k | res_16b = round_sr_x_avx2(res_16b); |
959 | | |
960 | | /* rounding code */ |
961 | | // 8 bit conversion and saturation to uint8 |
962 | 522k | __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b); |
963 | | |
964 | 522k | const __m128i res_0 = _mm256_castsi256_si128(res_8b); |
965 | 522k | const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1); |
966 | 522k | _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0); |
967 | 522k | _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1); |
968 | 522k | i += 2; |
969 | 522k | } while (i < h); |
970 | 140k | } else if (w == 16) { |
971 | 415k | do { |
972 | 415k | __m256i data[2] = { 0 }; |
973 | | |
974 | 415k | load_convolve_6tap_16x2_avx2(src_ptr, src_stride, coeffs, filt, data); |
975 | 415k | round_pack_store_16x2_avx2(data, dst, dst_stride); |
976 | 415k | src_ptr += 2 * src_stride; |
977 | 415k | dst += 2 * dst_stride; |
978 | 415k | h -= 2; |
979 | 415k | } while (h); |
980 | 80.8k | } else if (w == 32) { |
981 | 386k | do { |
982 | 386k | convolve_sr_store_6tap_32_avx2(src_ptr, coeffs, filt, dst); |
983 | 386k | src_ptr += src_stride; |
984 | 386k | dst += dst_stride; |
985 | 386k | } while ((--h) > 0); |
986 | 19.7k | } else if (w == 64) { |
987 | 160k | do { |
988 | 160k | convolve_sr_store_6tap_32_avx2(src_ptr, coeffs, filt, dst); |
989 | 160k | convolve_sr_store_6tap_32_avx2(src_ptr + 32, coeffs, filt, dst + 32); |
990 | 160k | src_ptr += src_stride; |
991 | 160k | dst += dst_stride; |
992 | 160k | } while ((--h) > 0); |
993 | 3.25k | } else { |
994 | 529 | assert(w == 128); |
995 | | |
996 | 65.5k | do { |
997 | 65.5k | convolve_sr_store_6tap_32_avx2(src_ptr, coeffs, filt, dst); |
998 | 65.5k | convolve_sr_store_6tap_32_avx2(src_ptr + SECOND_32_BLK, coeffs, filt, |
999 | 65.5k | dst + SECOND_32_BLK); |
1000 | 65.5k | convolve_sr_store_6tap_32_avx2(src_ptr + THIRD_32_BLK, coeffs, filt, |
1001 | 65.5k | dst + THIRD_32_BLK); |
1002 | 65.5k | convolve_sr_store_6tap_32_avx2(src_ptr + FOURTH_32_BLK, coeffs, filt, |
1003 | 65.5k | dst + FOURTH_32_BLK); |
1004 | 65.5k | src_ptr += src_stride; |
1005 | 65.5k | dst += dst_stride; |
1006 | 65.5k | } while ((--h) > 0); |
1007 | 566 | } |
1008 | 245k | } else if (horiz_tap == 8) { |
1009 | | // since (horiz_tap / 2 - 1) == 3 |
1010 | 12.9k | const uint8_t *src_ptr = src - 3; |
1011 | 12.9k | prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs); |
1012 | 12.9k | filt[0] = _mm256_load_si256((__m256i const *)(filt_global_avx2)); |
1013 | 12.9k | filt[1] = |
1014 | 12.9k | _mm256_load_si256((__m256i const *)(filt_global_avx2 + SECOND_32_BLK)); |
1015 | 12.9k | filt[2] = |
1016 | 12.9k | _mm256_load_si256((__m256i const *)(filt_global_avx2 + THIRD_32_BLK)); |
1017 | 12.9k | filt[3] = |
1018 | 12.9k | _mm256_load_si256((__m256i const *)(filt_global_avx2 + FOURTH_32_BLK)); |
1019 | | |
1020 | 12.9k | if (w == 8) { |
1021 | 28.7k | do { |
1022 | 28.7k | const __m256i data = _mm256_setr_m128i( |
1023 | 28.7k | _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride])), |
1024 | 28.7k | _mm_loadu_si128( |
1025 | 28.7k | (__m128i *)(&src_ptr[i * src_stride + src_stride]))); |
1026 | | |
1027 | 28.7k | __m256i res_16b = convolve_lowbd_x(data, coeffs, filt); |
1028 | | |
1029 | 28.7k | res_16b = round_sr_x_avx2(res_16b); |
1030 | | |
1031 | | /* rounding code */ |
1032 | | // 8 bit conversion and saturation to uint8 |
1033 | 28.7k | __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b); |
1034 | | |
1035 | 28.7k | const __m128i res_0 = _mm256_castsi256_si128(res_8b); |
1036 | 28.7k | const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1); |
1037 | 28.7k | _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0); |
1038 | 28.7k | _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1); |
1039 | 28.7k | i += 2; |
1040 | 28.7k | } while (i < h); |
1041 | 6.96k | } else if (w == 16) { |
1042 | 23.2k | do { |
1043 | 23.2k | __m256i data[2] = { 0 }; |
1044 | | |
1045 | 23.2k | load_convolve_8tap_16x2_avx2(src_ptr, src_stride, coeffs, filt, data); |
1046 | 23.2k | round_pack_store_16x2_avx2(data, dst, dst_stride); |
1047 | 23.2k | src_ptr += 2 * src_stride; |
1048 | 23.2k | dst += 2 * dst_stride; |
1049 | 23.2k | h -= 2; |
1050 | 23.2k | } while (h); |
1051 | 4.12k | } else if (w == 32) { |
1052 | 27.9k | do { |
1053 | 27.9k | load_convolve_round_8tap_32_avx2(src_ptr, coeffs, filt, dst); |
1054 | 27.9k | src_ptr += src_stride; |
1055 | 27.9k | dst += dst_stride; |
1056 | 27.9k | } while ((--h) > 0); |
1057 | 1.26k | } else if (w == 64) { |
1058 | 22.2k | do { |
1059 | 22.2k | load_convolve_round_8tap_32_avx2(src_ptr, coeffs, filt, dst); |
1060 | 22.2k | load_convolve_round_8tap_32_avx2(src_ptr + 32, coeffs, filt, dst + 32); |
1061 | 22.2k | src_ptr += src_stride; |
1062 | 22.2k | dst += dst_stride; |
1063 | 22.2k | } while ((--h) > 0); |
1064 | 489 | } else { |
1065 | 148 | assert(w == 128); |
1066 | 14.0k | do { |
1067 | 14.0k | load_convolve_round_8tap_32_avx2(src_ptr, coeffs, filt, dst); |
1068 | 14.0k | load_convolve_round_8tap_32_avx2(src_ptr + SECOND_32_BLK, coeffs, filt, |
1069 | 14.0k | dst + SECOND_32_BLK); |
1070 | 14.0k | load_convolve_round_8tap_32_avx2(src_ptr + THIRD_32_BLK, coeffs, filt, |
1071 | 14.0k | dst + THIRD_32_BLK); |
1072 | 14.0k | load_convolve_round_8tap_32_avx2(src_ptr + FOURTH_32_BLK, coeffs, filt, |
1073 | 14.0k | dst + FOURTH_32_BLK); |
1074 | 14.0k | src_ptr += src_stride; |
1075 | 14.0k | dst += dst_stride; |
1076 | 14.0k | } while ((--h) > 0); |
1077 | 148 | } |
1078 | 41.1k | } else if (horiz_tap == 12) { // horiz_tap == 12 |
1079 | 0 | const int fo_horiz = filter_params_x->taps / 2 - 1; |
1080 | 0 | prepare_coeffs_12taps(filter_params_x, subpel_x_qn, coeffs); |
1081 | 0 | const __m128i round_shift = _mm_cvtsi32_si128(bits); |
1082 | 0 | const uint8_t *const src_ptr = src - fo_horiz; |
1083 | 0 | const __m256i v_zero = _mm256_setzero_si256(); |
1084 | 0 | __m256i round_0_const = |
1085 | 0 | _mm256_set1_epi32((1 << (conv_params->round_0)) >> 1); |
1086 | 0 | __m256i round_const = _mm256_set1_epi32((1 << bits) >> 1); |
1087 | 0 | __m128i round_0_shift = _mm_cvtsi32_si128(conv_params->round_0); |
1088 | 0 | __m256i s[6] = { 0 }; |
1089 | |
1090 | 0 | if (w <= 4) { |
1091 | 0 | do { |
1092 | 0 | const __m256i data = _mm256_permute2x128_si256( |
1093 | 0 | _mm256_castsi128_si256( |
1094 | 0 | _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))), |
1095 | 0 | _mm256_castsi128_si256(_mm_loadu_si128( |
1096 | 0 | (__m128i *)(&src_ptr[i * src_stride + src_stride]))), |
1097 | 0 | 0x20); |
1098 | | // row0 0..7 row1 0..7 |
1099 | 0 | const __m256i s_16lo = _mm256_unpacklo_epi8(data, v_zero); |
1100 | | // row0 8..F row1 8..F |
1101 | 0 | const __m256i s_16hi = _mm256_unpackhi_epi8(data, v_zero); |
1102 | | |
1103 | | // row0 00 00 01 01 .. 03 03 row1 00 00 01 01 .. 03 03 |
1104 | 0 | const __m256i s_lolo = _mm256_unpacklo_epi16(s_16lo, s_16lo); |
1105 | | // row0 04 04 .. 07 07 row1 04 04 .. 07 07 |
1106 | 0 | const __m256i s_lohi = _mm256_unpackhi_epi16(s_16lo, s_16lo); |
1107 | | |
1108 | | // row0 08 08 09 09 .. 0B 0B row1 08 08 09 09 .. 0B 0B |
1109 | 0 | const __m256i s_hilo = _mm256_unpacklo_epi16(s_16hi, s_16hi); |
1110 | | // row0 0C 0C .. 0F 0F row1 0C 0C .. 0F 0F |
1111 | 0 | const __m256i s_hihi = _mm256_unpackhi_epi16(s_16hi, s_16hi); |
1112 | | |
1113 | | // 00 01 01 02 02 03 03 04 10 11 11 12 12 13 13 14 |
1114 | 0 | s[0] = _mm256_alignr_epi8(s_lohi, s_lolo, 2); |
1115 | | // 02 03 03 04 04 05 05 06 12 13 13 14 14 15 15 16 |
1116 | 0 | s[1] = _mm256_alignr_epi8(s_lohi, s_lolo, 10); |
1117 | | // 04 05 05 06 06 07 07 08 14 15 15 16 16 17 17 18 |
1118 | 0 | s[2] = _mm256_alignr_epi8(s_hilo, s_lohi, 2); |
1119 | | // 06 07 07 08 08 09 09 0A 16 17 17 18 18 19 19 1A |
1120 | 0 | s[3] = _mm256_alignr_epi8(s_hilo, s_lohi, 10); |
1121 | | // 08 09 09 0A 0A 0B 0B 0C 18 19 19 1A 1A 1B 1B 1C |
1122 | 0 | s[4] = _mm256_alignr_epi8(s_hihi, s_hilo, 2); |
1123 | | // 0A 0B 0B 0C 0C 0D 0D 0E 1A 1B 1B 1C 1C 1D 1D 1E |
1124 | 0 | s[5] = _mm256_alignr_epi8(s_hihi, s_hilo, 10); |
1125 | |
1126 | 0 | const __m256i res_lo = convolve_12taps(s, coeffs); |
1127 | |
1128 | 0 | __m256i res_32b_lo = _mm256_sra_epi32( |
1129 | 0 | _mm256_add_epi32(res_lo, round_0_const), round_0_shift); |
1130 | | |
1131 | | // 00 01 02 03 10 11 12 13 |
1132 | 0 | res_32b_lo = _mm256_sra_epi32(_mm256_add_epi32(res_32b_lo, round_const), |
1133 | 0 | round_shift); |
1134 | | // 8 bit conversion and saturation to uint8 |
1135 | | // 00 01 02 03 00 01 02 03 10 11 12 13 10 11 12 13 |
1136 | 0 | __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo); |
1137 | | // 00 01 02 03 00 01 02 03 00 01 02 03 00 01 02 03 |
1138 | | // 10 11 12 13 10 11 12 13 10 11 12 13 10 11 12 13 |
1139 | 0 | __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo); |
1140 | | |
1141 | | // 00 01 02 03 00 01 02 03 00 01 02 03 00 01 02 03 |
1142 | 0 | const __m128i res_0 = _mm256_extracti128_si256(res_8b_lo, 0); |
1143 | | // 10 11 12 13 10 11 12 13 10 11 12 13 10 11 12 13 |
1144 | 0 | const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1); |
1145 | 0 | if (w > 2) { |
1146 | | // 00 01 02 03 |
1147 | 0 | xx_storel_32(&dst[i * dst_stride], res_0); |
1148 | | // 10 11 12 13 |
1149 | 0 | xx_storel_32(&dst[i * dst_stride + dst_stride], res_1); |
1150 | 0 | } else { |
1151 | | // 00 01 |
1152 | 0 | xx_storel_16(&dst[i * dst_stride], res_0); |
1153 | | // 10 11 |
1154 | 0 | xx_storel_16(&dst[i * dst_stride + dst_stride], res_1); |
1155 | 0 | } |
1156 | 0 | i += 2; |
1157 | 0 | } while (i < h); |
1158 | 0 | } else { |
1159 | 0 | assert(!(w % 8)); |
1160 | 0 | do { |
1161 | 0 | j = 0; |
1162 | 0 | do { |
1163 | 0 | const __m256i data = _mm256_permute2x128_si256( |
1164 | 0 | _mm256_castsi128_si256( |
1165 | 0 | _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride + j]))), |
1166 | 0 | _mm256_castsi128_si256(_mm_loadu_si128( |
1167 | 0 | (__m128i *)(&src_ptr[i * src_stride + j + 4]))), |
1168 | 0 | 0x20); |
1169 | | // row0 0..7 4..B |
1170 | 0 | const __m256i s_16lo = _mm256_unpacklo_epi8(data, v_zero); |
1171 | | // row0 8..F C..13 |
1172 | 0 | const __m256i s_16hi = _mm256_unpackhi_epi8(data, v_zero); |
1173 | | |
1174 | | // row0 00 00 01 01 .. 03 03 04 04 05 05 .. 07 07 |
1175 | 0 | const __m256i s_lolo = _mm256_unpacklo_epi16(s_16lo, s_16lo); |
1176 | | // row0 04 04 .. 07 07 08 08 .. 0B 0B |
1177 | 0 | const __m256i s_lohi = _mm256_unpackhi_epi16(s_16lo, s_16lo); |
1178 | | |
1179 | | // row0 08 08 09 09 .. 0B 0B 0C 0C 0D 0D .. 0F 0F |
1180 | 0 | const __m256i s_hilo = _mm256_unpacklo_epi16(s_16hi, s_16hi); |
1181 | | // row0 0C 0C 0D 0D .. 0F 0F 10 10 11 11 .. 13 13 |
1182 | 0 | const __m256i s_hihi = _mm256_unpackhi_epi16(s_16hi, s_16hi); |
1183 | |
1184 | 0 | s[0] = _mm256_alignr_epi8(s_lohi, s_lolo, 2); |
1185 | 0 | s[1] = _mm256_alignr_epi8(s_lohi, s_lolo, 10); |
1186 | 0 | s[2] = _mm256_alignr_epi8(s_hilo, s_lohi, 2); |
1187 | 0 | s[3] = _mm256_alignr_epi8(s_hilo, s_lohi, 10); |
1188 | 0 | s[4] = _mm256_alignr_epi8(s_hihi, s_hilo, 2); |
1189 | 0 | s[5] = _mm256_alignr_epi8(s_hihi, s_hilo, 10); |
1190 | |
1191 | 0 | const __m256i res_lo = convolve_12taps(s, coeffs); |
1192 | |
1193 | 0 | __m256i res_32b_lo = _mm256_sra_epi32( |
1194 | 0 | _mm256_add_epi32(res_lo, round_0_const), round_0_shift); |
1195 | |
1196 | 0 | res_32b_lo = _mm256_sra_epi32( |
1197 | 0 | _mm256_add_epi32(res_32b_lo, round_const), round_shift); |
1198 | | // 8 bit conversion and saturation to uint8 |
1199 | 0 | __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo); |
1200 | 0 | __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo); |
1201 | 0 | const __m128i res_0 = _mm256_extracti128_si256(res_8b_lo, 0); |
1202 | 0 | const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1); |
1203 | 0 | xx_storel_32(&dst[i * dst_stride + j], res_0); |
1204 | 0 | xx_storel_32(&dst[i * dst_stride + j + 4], res_1); |
1205 | |
1206 | 0 | j += 8; |
1207 | 0 | } while (j < w); |
1208 | 0 | i++; |
1209 | 0 | } while (i < h); |
1210 | 0 | } |
1211 | 41.1k | } else { |
1212 | 41.1k | assert(horiz_tap == 2); |
1213 | | // since (filter_params_x->taps / 2 - 1) == 0 |
1214 | 41.1k | const uint8_t *src_ptr = src; |
1215 | 41.1k | if (subpel_x_qn != 8) { |
1216 | 11.2k | if (w <= 8) { |
1217 | 8.17k | prepare_coeffs_2t_ssse3(filter_params_x, subpel_x_qn, coeffs_128); |
1218 | | |
1219 | 8.17k | if (w == 2) { |
1220 | 2.69k | do { |
1221 | 2.69k | const __m128i data = |
1222 | 2.69k | convolve_x_2tap_2x2_ssse3(src_ptr, src_stride, coeffs_128); |
1223 | 2.69k | const __m128i reg = round_sr_x_ssse3(data); |
1224 | 2.69k | pack_store_u8_2x2_sse2(reg, dst, dst_stride); |
1225 | 2.69k | src_ptr += 2 * src_stride; |
1226 | 2.69k | dst += 2 * dst_stride; |
1227 | 2.69k | h -= 2; |
1228 | 2.69k | } while (h); |
1229 | 6.97k | } else if (w == 4) { |
1230 | 10.7k | do { |
1231 | 10.7k | const __m128i data = |
1232 | 10.7k | convolve_x_2tap_4x2_ssse3(src_ptr, src_stride, coeffs_128); |
1233 | 10.7k | const __m128i reg = round_sr_x_ssse3(data); |
1234 | 10.7k | pack_store_u8_4x2_sse2(reg, dst, dst_stride); |
1235 | 10.7k | src_ptr += 2 * src_stride; |
1236 | 10.7k | dst += 2 * dst_stride; |
1237 | 10.7k | h -= 2; |
1238 | 10.7k | } while (h); |
1239 | 3.61k | } else { |
1240 | 3.35k | assert(w == 8); |
1241 | | |
1242 | 10.8k | do { |
1243 | 10.8k | __m128i data[2] = { 0 }; |
1244 | | |
1245 | 10.8k | convolve_x_2tap_8x2_ssse3(src_ptr, src_stride, coeffs_128, data); |
1246 | 10.8k | data[0] = round_sr_x_ssse3(data[0]); |
1247 | 10.8k | data[1] = round_sr_x_ssse3(data[1]); |
1248 | 10.8k | const __m128i reg = _mm_packus_epi16(data[0], data[1]); |
1249 | 10.8k | _mm_storel_epi64((__m128i *)dst, reg); |
1250 | 10.8k | _mm_storeh_epi64((__m128i *)(dst + dst_stride), reg); |
1251 | | |
1252 | 10.8k | src_ptr += 2 * src_stride; |
1253 | 10.8k | dst += 2 * dst_stride; |
1254 | 10.8k | h -= 2; |
1255 | 10.8k | } while (h); |
1256 | 3.35k | } |
1257 | 8.17k | } else { |
1258 | 3.09k | prepare_coeffs_2t_lowbd(filter_params_x, subpel_x_qn, coeffs); |
1259 | | |
1260 | 3.09k | if (w == 16) { |
1261 | 9.00k | do { |
1262 | 9.00k | __m256i data[2] = { 0 }; |
1263 | | |
1264 | 9.00k | convolve_x_2tap_16x2_avx2(src_ptr, src_stride, coeffs, data); |
1265 | 9.00k | round_pack_store_16x2_avx2(data, dst, dst_stride); |
1266 | 9.00k | src_ptr += 2 * src_stride; |
1267 | 9.00k | dst += 2 * dst_stride; |
1268 | 9.00k | h -= 2; |
1269 | 9.00k | } while (h); |
1270 | 1.76k | } else if (w == 32) { |
1271 | 15.5k | do { |
1272 | 15.5k | convolve_round_2tap_32_avx2(src_ptr, coeffs, dst); |
1273 | 15.5k | src_ptr += src_stride; |
1274 | 15.5k | dst += dst_stride; |
1275 | 15.5k | } while ((--h) > 0); |
1276 | 699 | } else if (w == 64) { |
1277 | 23.5k | do { |
1278 | 23.5k | convolve_round_2tap_32_avx2(src_ptr, coeffs, dst); |
1279 | 23.5k | convolve_round_2tap_32_avx2(src_ptr + SECOND_32_BLK, coeffs, |
1280 | 23.5k | dst + SECOND_32_BLK); |
1281 | 23.5k | src_ptr += src_stride; |
1282 | 23.5k | dst += dst_stride; |
1283 | 23.5k | } while ((--h) > 0); |
1284 | 566 | } else { |
1285 | 133 | assert(w == 128); |
1286 | | |
1287 | 10.6k | do { |
1288 | 10.6k | convolve_round_2tap_32_avx2(src_ptr, coeffs, dst); |
1289 | 10.6k | convolve_round_2tap_32_avx2(src_ptr + (SECOND_32_BLK), coeffs, |
1290 | 10.6k | dst + (SECOND_32_BLK)); |
1291 | 10.6k | convolve_round_2tap_32_avx2(src_ptr + (THIRD_32_BLK), coeffs, |
1292 | 10.6k | dst + (THIRD_32_BLK)); |
1293 | 10.6k | convolve_round_2tap_32_avx2(src_ptr + (FOURTH_32_BLK), coeffs, |
1294 | 10.6k | dst + (FOURTH_32_BLK)); |
1295 | 10.6k | src_ptr += src_stride; |
1296 | 10.6k | dst += dst_stride; |
1297 | 10.6k | } while ((--h) > 0); |
1298 | 133 | } |
1299 | 3.09k | } |
1300 | 29.9k | } else { |
1301 | 29.9k | if (w == 2) { |
1302 | 8.86k | do { |
1303 | 8.86k | __m128i data = load_x_u8_4x2_sse4(src_ptr, src_stride); |
1304 | 8.86k | const __m128i reg1 = _mm_srli_si128(data, 1); |
1305 | 8.86k | const __m128i reg2 = _mm_avg_epu8(data, reg1); |
1306 | 8.86k | xx_storel_16(dst, reg2); |
1307 | 8.86k | { |
1308 | 8.86k | uint16_t val = (uint16_t)_mm_extract_epi16(reg2, 2); |
1309 | 8.86k | memcpy(dst + dst_stride, &val, sizeof(val)); |
1310 | 8.86k | } |
1311 | 8.86k | src_ptr += 2 * src_stride; |
1312 | 8.86k | dst += 2 * dst_stride; |
1313 | 8.86k | h -= 2; |
1314 | 8.86k | } while (h); |
1315 | 25.6k | } else if (w == 4) { |
1316 | 33.8k | do { |
1317 | 33.8k | __m128i data = load_8bit_8x2_to_1_reg_sse2( |
1318 | 33.8k | src_ptr, (int)(sizeof(*src_ptr) * src_stride)); |
1319 | 33.8k | const __m128i reg1 = _mm_srli_si128(data, 1); |
1320 | 33.8k | const __m128i reg2 = _mm_avg_epu8(data, reg1); |
1321 | 33.8k | xx_storel_32(dst, reg2); |
1322 | 33.8k | { |
1323 | 33.8k | int32_t val = _mm_extract_epi32(reg2, 2); |
1324 | 33.8k | memcpy(dst + dst_stride, &val, sizeof(val)); |
1325 | 33.8k | } |
1326 | | |
1327 | 33.8k | src_ptr += 2 * src_stride; |
1328 | 33.8k | dst += 2 * dst_stride; |
1329 | 33.8k | h -= 2; |
1330 | 33.8k | } while (h); |
1331 | 13.0k | } else if (w == 8) { |
1332 | 25.9k | do { |
1333 | 25.9k | const __m128i data00 = _mm_loadu_si128((__m128i *)src_ptr); |
1334 | 25.9k | const __m128i data10 = |
1335 | 25.9k | _mm_loadu_si128((__m128i *)(src_ptr + src_stride)); |
1336 | 25.9k | const __m128i data01 = _mm_srli_si128(data00, 1); |
1337 | 25.9k | const __m128i data11 = _mm_srli_si128(data10, 1); |
1338 | 25.9k | const __m128i reg0 = _mm_avg_epu8(data00, data01); |
1339 | 25.9k | const __m128i reg1 = _mm_avg_epu8(data10, data11); |
1340 | 25.9k | _mm_storel_epi64((__m128i *)dst, reg0); |
1341 | 25.9k | _mm_storel_epi64((__m128i *)(dst + dst_stride), reg1); |
1342 | | |
1343 | 25.9k | src_ptr += 2 * src_stride; |
1344 | 25.9k | dst += 2 * dst_stride; |
1345 | 25.9k | h -= 2; |
1346 | 25.9k | } while (h); |
1347 | 8.53k | } else if (w == 16) { |
1348 | 15.7k | do { |
1349 | 15.7k | const __m128i data00 = _mm_loadu_si128((__m128i *)src_ptr); |
1350 | 15.7k | const __m128i data01 = _mm_loadu_si128((__m128i *)(src_ptr + 1)); |
1351 | 15.7k | const __m128i data10 = |
1352 | 15.7k | _mm_loadu_si128((__m128i *)(src_ptr + src_stride)); |
1353 | 15.7k | const __m128i data11 = |
1354 | 15.7k | _mm_loadu_si128((__m128i *)(src_ptr + src_stride + 1)); |
1355 | 15.7k | const __m128i reg0 = _mm_avg_epu8(data00, data01); |
1356 | 15.7k | const __m128i reg1 = _mm_avg_epu8(data10, data11); |
1357 | 15.7k | _mm_storeu_si128((__m128i *)dst, reg0); |
1358 | 15.7k | _mm_storeu_si128((__m128i *)(dst + dst_stride), reg1); |
1359 | | |
1360 | 15.7k | src_ptr += 2 * src_stride; |
1361 | 15.7k | dst += 2 * dst_stride; |
1362 | 15.7k | h -= 2; |
1363 | 15.7k | } while (h); |
1364 | 3.08k | } else if (w == 32) { |
1365 | 22.6k | do { |
1366 | 22.6k | load_avg_store_2tap_32_avx2(src_ptr, dst); |
1367 | 22.6k | src_ptr += src_stride; |
1368 | 22.6k | dst += dst_stride; |
1369 | 22.6k | } while ((--h) > 0); |
1370 | 950 | } else if (w == 64) { |
1371 | 15.8k | do { |
1372 | 15.8k | load_avg_store_2tap_32_avx2(src_ptr, dst); |
1373 | 15.8k | load_avg_store_2tap_32_avx2(src_ptr + (SECOND_32_BLK), |
1374 | 15.8k | dst + (SECOND_32_BLK)); |
1375 | 15.8k | src_ptr += src_stride; |
1376 | 15.8k | dst += dst_stride; |
1377 | 15.8k | } while ((--h) > 0); |
1378 | 313 | } else { |
1379 | 172 | assert(w == 128); |
1380 | | |
1381 | 15.4k | do { |
1382 | 15.4k | load_avg_store_2tap_32_avx2(src_ptr, dst); |
1383 | 15.4k | load_avg_store_2tap_32_avx2(src_ptr + (SECOND_32_BLK), |
1384 | 15.4k | dst + (SECOND_32_BLK)); |
1385 | 15.4k | load_avg_store_2tap_32_avx2(src_ptr + (THIRD_32_BLK), |
1386 | 15.4k | dst + (THIRD_32_BLK)); |
1387 | 15.4k | load_avg_store_2tap_32_avx2(src_ptr + (FOURTH_32_BLK), |
1388 | 15.4k | dst + (FOURTH_32_BLK)); |
1389 | 15.4k | src_ptr += src_stride; |
1390 | 15.4k | dst += dst_stride; |
1391 | 15.4k | } while ((--h) > 0); |
1392 | 172 | } |
1393 | 29.9k | } |
1394 | 41.1k | } |
1395 | 501k | } |
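For reference, the rounding applied by round_sr_x_avx2 and spelled out in the 12-tap path above is a two-stage round-to-nearest right shift: first by conv_params->round_0 (adding round_0_const == (1 << round_0) >> 1), then by bits == FILTER_BITS - round_0 (adding (1 << bits) >> 1), with the result saturated to uint8 the way _mm256_packs_epi32 followed by _mm256_packus_epi16 saturates. A scalar model of one output pixel under those assumptions (the helper name is illustrative, not a library function):

    #include <stdint.h>

    #define FILTER_BITS 7  /* AV1 interpolation filter taps sum to 1 << 7 */

    /* Two-stage rounding of a 32-bit horizontal filter sum, mirroring the
     * round_0_const/round_0_shift and round_const/round_shift pair above. */
    static uint8_t round_sr_x_scalar(int32_t acc, int round_0) {
      const int bits = FILTER_BITS - round_0;
      acc = (acc + ((1 << round_0) >> 1)) >> round_0;  /* stage 1 */
      acc = (acc + ((1 << bits) >> 1)) >> bits;        /* stage 2 */
      if (acc < 0) acc = 0;    /* packs/packus saturate the same way */
      if (acc > 255) acc = 255;
      return (uint8_t)acc;
    }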