/src/aom/av1/common/x86/convolve_avx2.c
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) 2017, Alliance for Open Media. All rights reserved. |
3 | | * |
4 | | * This source code is subject to the terms of the BSD 2 Clause License and |
5 | | * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
6 | | * was not distributed with this source code in the LICENSE file, you can |
7 | | * obtain it at www.aomedia.org/license/software. If the Alliance for Open |
8 | | * Media Patent License 1.0 was not distributed with this source code in the |
9 | | * PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
10 | | */ |
11 | | |
12 | | #include <immintrin.h> |
13 | | |
14 | | #include "config/av1_rtcd.h" |
15 | | |
16 | | #include "aom_dsp/aom_dsp_common.h" |
17 | | #include "aom_dsp/x86/convolve_avx2.h" |
18 | | #include "aom_dsp/x86/convolve_common_intrin.h" |
19 | | #include "aom_dsp/x86/synonyms.h" |
20 | | |
21 | | void av1_convolve_y_sr_avx2(const uint8_t *src, int32_t src_stride, |
22 | | uint8_t *dst, int32_t dst_stride, int32_t w, |
23 | | int32_t h, |
24 | | const InterpFilterParams *filter_params_y, |
25 | 503k | const int32_t subpel_y_qn) { |
26 | 503k | __m128i coeffs_128[4]; |
27 | 503k | __m256i coeffs[6]; |
28 | 503k | int x = 0, y = h; |
29 | | |
30 | 503k | int i, vert_tap = get_filter_tap(filter_params_y, subpel_y_qn); |
31 | 503k | assert(vert_tap == 2 || vert_tap == 4 || vert_tap == 6 || vert_tap == 8 || |
32 | 503k | vert_tap == 12); |
33 | 503k | assert(!(w % 2)); |
34 | 503k | assert(!(h % 2)); |
35 | | |
36 | 503k | const int fo_vert = vert_tap / 2 - 1; |
37 | 503k | const uint8_t *const src_ptr = src - fo_vert * src_stride; |
38 | 503k | const uint8_t *data = src_ptr; |
39 | 503k | uint8_t *dst_ptr = dst; |
40 | | |
41 | 503k | if (vert_tap == 2) { |
42 | 31.8k | if (subpel_y_qn != 8) { |
43 | 13.6k | if (w <= 4) { |
44 | 6.73k | prepare_coeffs_2t_ssse3(filter_params_y, subpel_y_qn, coeffs_128); |
45 | 6.73k | __m128i d[2], res; |
46 | 6.73k | if (w == 2) { |
47 | 1.75k | d[0] = _mm_cvtsi32_si128(loadu_int16(data)); |
48 | | |
49 | 3.55k | do { |
50 | 3.55k | convolve_y_2tap_2x2_ssse3(data, src_stride, coeffs_128, d, &res); |
51 | 3.55k | res = round_sr_y_ssse3(res); |
52 | 3.55k | pack_store_u8_2x2_sse2(res, dst_ptr, dst_stride); |
53 | | |
54 | 3.55k | dst_ptr += 2 * dst_stride; |
55 | 3.55k | data += 2 * src_stride; |
56 | 3.55k | y -= 2; |
57 | 3.55k | } while (y > 0); |
58 | 4.98k | } else { |
59 | 4.98k | assert(w == 4); |
60 | 4.98k | d[0] = _mm_cvtsi32_si128(loadu_int32(data)); |
61 | | |
62 | 15.3k | do { |
63 | 15.3k | convolve_y_2tap_4x2_ssse3(data, src_stride, coeffs_128, d, &res); |
64 | 15.3k | res = round_sr_y_ssse3(res); |
65 | 15.3k | pack_store_u8_4x2_sse2(res, dst_ptr, dst_stride); |
66 | | |
67 | 15.3k | dst_ptr += 2 * dst_stride; |
68 | 15.3k | data += 2 * src_stride; |
69 | 15.3k | y -= 2; |
70 | 15.3k | } while (y > 0); |
71 | 4.98k | } |
72 | 6.91k | } else { |
73 | 6.91k | prepare_coeffs_2t_lowbd(filter_params_y, subpel_y_qn, coeffs); |
74 | | |
75 | 6.91k | if (w == 8) { |
76 | 3.69k | __m128i d[2]; |
77 | 3.69k | d[0] = _mm_loadl_epi64((__m128i *)data); |
78 | | |
79 | 12.7k | do { |
80 | 12.7k | __m256i res; |
81 | 12.7k | convolve_y_2tap_8x2_avx2(data, src_stride, coeffs, d, &res); |
82 | 12.7k | round_pack_store_y_8x2_avx2(res, dst_ptr, dst_stride); |
83 | | |
84 | 12.7k | dst_ptr += 2 * dst_stride; |
85 | 12.7k | data += 2 * src_stride; |
86 | 12.7k | y -= 2; |
87 | | |
88 | 12.7k | } while (y > 0); |
89 | | |
90 | 3.69k | } else if (w == 16) { |
91 | 1.99k | __m128i d[2]; |
92 | 1.99k | d[0] = _mm_loadu_si128((__m128i *)data); |
93 | | |
94 | 13.9k | do { |
95 | 13.9k | __m256i res[2]; |
96 | 13.9k | convolve_y_2tap_16x2_avx2(data, src_stride, coeffs, d, res); |
97 | 13.9k | round_pack_store_y_16x2_avx2(res, dst_ptr, dst_stride); |
98 | | |
99 | 13.9k | dst_ptr += 2 * dst_stride; |
100 | 13.9k | data += 2 * src_stride; |
101 | 13.9k | y -= 2; |
102 | 13.9k | } while (y > 0); |
103 | | |
104 | 1.99k | } else { |
105 | 1.23k | assert(!(w % 32)); |
106 | | |
107 | 1.23k | __m256i d[2]; |
108 | 1.79k | do { |
109 | 1.79k | data = src_ptr + x; |
110 | 1.79k | dst_ptr = dst + x; |
111 | 1.79k | y = h; |
112 | | |
113 | 1.79k | d[0] = _mm256_loadu_si256((__m256i *)data); |
114 | | |
115 | 41.0k | do { |
116 | 41.0k | __m256i res[4]; |
117 | 41.0k | convolve_y_2tap_32x2_avx2(data, src_stride, coeffs, d, res); |
118 | 41.0k | round_pack_store_y_32x2_avx2(res, dst_ptr, dst_stride); |
119 | | |
120 | 41.0k | dst_ptr += 2 * dst_stride; |
121 | 41.0k | data += 2 * src_stride; |
122 | 41.0k | y -= 2; |
123 | 41.0k | } while (y > 0); |
124 | | |
125 | 1.79k | x += 32; |
126 | 1.79k | } while (x < w); |
127 | 1.23k | } |
128 | 6.91k | } |
129 | 18.1k | } else { |
130 | 18.1k | if (w <= 16) { |
131 | 17.2k | __m128i s[2], res; |
132 | | |
133 | 17.2k | if (w == 2) { |
134 | 3.93k | s[0] = _mm_cvtsi32_si128(loadu_int16(data)); |
135 | | |
136 | 7.46k | do { |
137 | 7.46k | s[1] = _mm_cvtsi32_si128(loadu_int16(data + src_stride)); |
138 | 7.46k | res = _mm_avg_epu8(s[0], s[1]); |
139 | 7.46k | xx_storel_16(dst_ptr, res); |
140 | 7.46k | s[0] = _mm_cvtsi32_si128(loadu_int16(data + 2 * src_stride)); |
141 | 7.46k | res = _mm_avg_epu8(s[1], s[0]); |
142 | 7.46k | xx_storel_16(dst_ptr + dst_stride, res); |
143 | | |
144 | 7.46k | data += 2 * src_stride; |
145 | 7.46k | dst_ptr += 2 * dst_stride; |
146 | 7.46k | y -= 2; |
147 | 7.46k | } while (y > 0); |
148 | 13.3k | } else if (w == 4) { |
149 | 7.09k | s[0] = _mm_cvtsi32_si128(loadu_int32(data)); |
150 | | |
151 | 20.1k | do { |
152 | 20.1k | s[1] = _mm_cvtsi32_si128(loadu_int32(data + src_stride)); |
153 | 20.1k | res = _mm_avg_epu8(s[0], s[1]); |
154 | 20.1k | xx_storel_32(dst_ptr, res); |
155 | 20.1k | s[0] = _mm_cvtsi32_si128(loadu_int32(data + 2 * src_stride)); |
156 | 20.1k | res = _mm_avg_epu8(s[1], s[0]); |
157 | 20.1k | xx_storel_32(dst_ptr + dst_stride, res); |
158 | | |
159 | 20.1k | data += 2 * src_stride; |
160 | 20.1k | dst_ptr += 2 * dst_stride; |
161 | 20.1k | y -= 2; |
162 | 20.1k | } while (y > 0); |
163 | 7.09k | } else if (w == 8) { |
164 | 4.69k | s[0] = _mm_loadl_epi64((__m128i *)data); |
165 | | |
166 | 16.1k | do { |
167 | 16.1k | s[1] = _mm_loadl_epi64((__m128i *)(data + src_stride)); |
168 | 16.1k | res = _mm_avg_epu8(s[0], s[1]); |
169 | 16.1k | _mm_storel_epi64((__m128i *)dst_ptr, res); |
170 | 16.1k | s[0] = _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)); |
171 | 16.1k | res = _mm_avg_epu8(s[1], s[0]); |
172 | 16.1k | _mm_storel_epi64((__m128i *)(dst_ptr + dst_stride), res); |
173 | | |
174 | 16.1k | data += 2 * src_stride; |
175 | 16.1k | dst_ptr += 2 * dst_stride; |
176 | 16.1k | y -= 2; |
177 | 16.1k | } while (y > 0); |
178 | 4.69k | } else { |
179 | 1.51k | assert(w == 16); |
180 | | |
181 | 1.51k | s[0] = _mm_loadu_si128((__m128i *)data); |
182 | | |
183 | 8.59k | do { |
184 | 8.59k | s[1] = _mm_loadu_si128((__m128i *)(data + src_stride)); |
185 | 8.59k | res = _mm_avg_epu8(s[0], s[1]); |
186 | 8.59k | _mm_storeu_si128((__m128i *)dst_ptr, res); |
187 | 8.59k | s[0] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride)); |
188 | 8.59k | res = _mm_avg_epu8(s[1], s[0]); |
189 | 8.59k | _mm_storeu_si128((__m128i *)(dst_ptr + dst_stride), res); |
190 | | |
191 | 8.59k | data += 2 * src_stride; |
192 | 8.59k | dst_ptr += 2 * dst_stride; |
193 | 8.59k | y -= 2; |
194 | 8.59k | } while (y > 0); |
195 | 1.51k | } |
196 | 17.2k | } else { |
197 | 944 | assert(!(w % 32)); |
198 | | |
199 | 944 | __m256i s[2], res; |
200 | 1.34k | do { |
201 | 1.34k | data = src_ptr + x; |
202 | 1.34k | dst_ptr = dst + x; |
203 | 1.34k | y = h; |
204 | | |
205 | 1.34k | s[0] = _mm256_loadu_si256((__m256i *)data); |
206 | | |
207 | 33.0k | do { |
208 | 33.0k | s[1] = _mm256_loadu_si256((__m256i *)(data + src_stride)); |
209 | 33.0k | res = _mm256_avg_epu8(s[0], s[1]); |
210 | 33.0k | _mm256_storeu_si256((__m256i *)dst_ptr, res); |
211 | 33.0k | s[0] = _mm256_loadu_si256((__m256i *)(data + 2 * src_stride)); |
212 | 33.0k | res = _mm256_avg_epu8(s[1], s[0]); |
213 | 33.0k | _mm256_storeu_si256((__m256i *)(dst_ptr + dst_stride), res); |
214 | | |
215 | 33.0k | data += 2 * src_stride; |
216 | 33.0k | dst_ptr += 2 * dst_stride; |
217 | 33.0k | y -= 2; |
218 | 33.0k | } while (y > 0); |
219 | | |
220 | 1.34k | x += 32; |
221 | 1.34k | } while (x < w); |
222 | 944 | } |
223 | 18.1k | } |
224 | 471k | } else if (vert_tap == 4) { |
225 | 251k | if (w <= 4) { |
226 | 118k | prepare_coeffs_4t_ssse3(filter_params_y, subpel_y_qn, coeffs_128); |
227 | 118k | __m128i d[4], s[2]; |
228 | | |
229 | 118k | if (w == 2) { |
230 | 20.0k | d[0] = _mm_cvtsi32_si128(loadu_int16(data + 0 * src_stride)); |
231 | 20.0k | d[1] = _mm_cvtsi32_si128(loadu_int16(data + 1 * src_stride)); |
232 | 20.0k | d[2] = _mm_cvtsi32_si128(loadu_int16(data + 2 * src_stride)); |
233 | | |
234 | 20.0k | const __m128i src_01a = _mm_unpacklo_epi16(d[0], d[1]); |
235 | 20.0k | const __m128i src_12a = _mm_unpacklo_epi16(d[1], d[2]); |
236 | | |
237 | 20.0k | s[0] = _mm_unpacklo_epi8(src_01a, src_12a); |
238 | 34.1k | do { |
239 | 34.1k | __m128i res; |
240 | 34.1k | convolve_y_4tap_2x2_ssse3(data, src_stride, coeffs_128, d, s, &res); |
241 | 34.1k | res = round_sr_y_ssse3(res); |
242 | 34.1k | pack_store_u8_2x2_sse2(res, dst_ptr, dst_stride); |
243 | | |
244 | 34.1k | dst_ptr += 2 * dst_stride; |
245 | 34.1k | data += 2 * src_stride; |
246 | 34.1k | y -= 2; |
247 | | |
248 | 34.1k | s[0] = s[1]; |
249 | 34.1k | } while (y > 0); |
250 | | |
251 | 98.3k | } else { |
252 | 98.3k | assert(w == 4); |
253 | | |
254 | 98.3k | d[0] = _mm_cvtsi32_si128(loadu_int32(data + 0 * src_stride)); |
255 | 98.3k | d[1] = _mm_cvtsi32_si128(loadu_int32(data + 1 * src_stride)); |
256 | 98.3k | d[2] = _mm_cvtsi32_si128(loadu_int32(data + 2 * src_stride)); |
257 | | |
258 | 98.3k | const __m128i src_01a = _mm_unpacklo_epi32(d[0], d[1]); |
259 | 98.3k | const __m128i src_12a = _mm_unpacklo_epi32(d[1], d[2]); |
260 | | |
261 | 98.3k | s[0] = _mm_unpacklo_epi8(src_01a, src_12a); |
262 | 194k | do { |
263 | 194k | __m128i res; |
264 | 194k | convolve_y_4tap_4x2_ssse3(data, src_stride, coeffs_128, d, s, &res); |
265 | 194k | res = round_sr_y_ssse3(res); |
266 | 194k | pack_store_u8_4x2_sse2(res, dst_ptr, dst_stride); |
267 | | |
268 | 194k | dst_ptr += 2 * dst_stride; |
269 | 194k | data += 2 * src_stride; |
270 | 194k | y -= 2; |
271 | | |
272 | 194k | s[0] = s[1]; |
273 | 194k | } while (y > 0); |
274 | 98.3k | } |
275 | 133k | } else { |
276 | 133k | prepare_coeffs_4t_lowbd(filter_params_y, subpel_y_qn, coeffs); |
277 | | |
278 | 133k | if (w == 8) { |
279 | 89.0k | __m128i d[4]; |
280 | 89.0k | __m256i s[2]; |
281 | | |
282 | 89.0k | d[0] = _mm_loadl_epi64((__m128i *)(data + 0 * src_stride)); |
283 | 89.0k | d[1] = _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)); |
284 | 89.0k | d[2] = _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)); |
285 | | |
286 | 89.0k | const __m256i src_01a = _mm256_setr_m128i(d[0], d[1]); |
287 | 89.0k | const __m256i src_12a = _mm256_setr_m128i(d[1], d[2]); |
288 | | |
289 | 89.0k | s[0] = _mm256_unpacklo_epi8(src_01a, src_12a); |
290 | 169k | do { |
291 | 169k | __m256i res; |
292 | 169k | convolve_y_4tap_8x2_avx2(data, src_stride, coeffs, d, s, &res); |
293 | 169k | round_pack_store_y_8x2_avx2(res, dst_ptr, dst_stride); |
294 | | |
295 | 169k | dst_ptr += 2 * dst_stride; |
296 | 169k | data += 2 * src_stride; |
297 | 169k | y -= 2; |
298 | | |
299 | 169k | s[0] = s[1]; |
300 | 169k | } while (y > 0); |
301 | 89.0k | } else if (w == 16) { |
302 | 41.2k | __m128i d[4]; |
303 | 41.2k | __m256i s[4]; |
304 | | |
305 | 41.2k | d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride)); |
306 | 41.2k | d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride)); |
307 | 41.2k | d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride)); |
308 | | |
309 | 41.2k | const __m256i src_01a = _mm256_setr_m128i(d[0], d[1]); |
310 | 41.2k | const __m256i src_12a = _mm256_setr_m128i(d[1], d[2]); |
311 | | |
312 | 41.2k | s[0] = _mm256_unpacklo_epi8(src_01a, src_12a); |
313 | 41.2k | s[2] = _mm256_unpackhi_epi8(src_01a, src_12a); |
314 | | |
315 | 97.3k | do { |
316 | 97.3k | __m256i res[2]; |
317 | 97.3k | convolve_y_4tap_16x2_avx2(data, src_stride, coeffs, d, s, res); |
318 | 97.3k | round_pack_store_y_16x2_avx2(res, dst_ptr, dst_stride); |
319 | | |
320 | 97.3k | dst_ptr += 2 * dst_stride; |
321 | 97.3k | data += 2 * src_stride; |
322 | 97.3k | y -= 2; |
323 | | |
324 | 97.3k | s[0] = s[1]; |
325 | 97.3k | s[2] = s[3]; |
326 | 97.3k | } while (y > 0); |
327 | 41.2k | } else { |
328 | 3.30k | assert(!(w % 32)); |
329 | | |
330 | 3.30k | __m256i d[4], s1[4], s2[4]; |
331 | 4.27k | do { |
332 | 4.27k | data = src_ptr + x; |
333 | 4.27k | dst_ptr = dst + x; |
334 | 4.27k | y = h; |
335 | | |
336 | 4.27k | d[0] = _mm256_loadu_si256((__m256i *)(data + 0 * src_stride)); |
337 | 4.27k | d[1] = _mm256_loadu_si256((__m256i *)(data + 1 * src_stride)); |
338 | 4.27k | d[2] = _mm256_loadu_si256((__m256i *)(data + 2 * src_stride)); |
339 | | |
340 | 4.27k | s1[0] = _mm256_unpacklo_epi8(d[0], d[1]); |
341 | 4.27k | s1[2] = _mm256_unpackhi_epi8(d[0], d[1]); |
342 | | |
343 | 4.27k | s2[0] = _mm256_unpacklo_epi8(d[1], d[2]); |
344 | 4.27k | s2[2] = _mm256_unpackhi_epi8(d[1], d[2]); |
345 | | |
346 | 87.2k | do { |
347 | 87.2k | __m256i res[4]; |
348 | 87.2k | convolve_y_4tap_32x2_avx2(data, src_stride, coeffs, d, s1, s2, res); |
349 | 87.2k | round_pack_store_y_32x2_avx2(res, dst_ptr, dst_stride); |
350 | | |
351 | 87.2k | dst_ptr += 2 * dst_stride; |
352 | 87.2k | data += 2 * src_stride; |
353 | 87.2k | y -= 2; |
354 | | |
355 | 87.2k | s1[0] = s1[1]; |
356 | 87.2k | s1[2] = s1[3]; |
357 | | |
358 | 87.2k | s2[0] = s2[1]; |
359 | 87.2k | s2[2] = s2[3]; |
360 | 87.2k | } while (y > 0); |
361 | | |
362 | 4.27k | x += 32; |
363 | 4.27k | } while (x < w); |
364 | 3.30k | } |
365 | 133k | } |
366 | 251k | } else if (vert_tap == 6) { |
367 | 204k | if (w <= 4) { |
368 | 63.2k | prepare_coeffs_6t_ssse3(filter_params_y, subpel_y_qn, coeffs_128); |
369 | | |
370 | 63.2k | __m128i d[6], s[3]; |
371 | 63.2k | if (w == 2) { |
372 | 12.1k | d[0] = _mm_cvtsi32_si128(loadu_int16(data + 0 * src_stride)); |
373 | 12.1k | d[1] = _mm_cvtsi32_si128(loadu_int16(data + 1 * src_stride)); |
374 | 12.1k | d[2] = _mm_cvtsi32_si128(loadu_int16(data + 2 * src_stride)); |
375 | 12.1k | d[3] = _mm_cvtsi32_si128(loadu_int16(data + 3 * src_stride)); |
376 | 12.1k | d[4] = _mm_cvtsi32_si128(loadu_int16(data + 4 * src_stride)); |
377 | | |
378 | 12.1k | const __m128i src_01a = _mm_unpacklo_epi16(d[0], d[1]); |
379 | 12.1k | const __m128i src_12a = _mm_unpacklo_epi16(d[1], d[2]); |
380 | 12.1k | const __m128i src_23a = _mm_unpacklo_epi16(d[2], d[3]); |
381 | 12.1k | const __m128i src_34a = _mm_unpacklo_epi16(d[3], d[4]); |
382 | | |
383 | 12.1k | s[0] = _mm_unpacklo_epi8(src_01a, src_12a); |
384 | 12.1k | s[1] = _mm_unpacklo_epi8(src_23a, src_34a); |
385 | | |
386 | 48.7k | do { |
387 | 48.7k | __m128i res; |
388 | 48.7k | convolve_y_6tap_2x2_ssse3(data, src_stride, coeffs_128, d, s, &res); |
389 | 48.7k | res = round_sr_y_ssse3(res); |
390 | 48.7k | pack_store_u8_2x2_sse2(res, dst_ptr, dst_stride); |
391 | | |
392 | 48.7k | dst_ptr += 2 * dst_stride; |
393 | 48.7k | data += 2 * src_stride; |
394 | 48.7k | y -= 2; |
395 | | |
396 | 48.7k | s[0] = s[1]; |
397 | 48.7k | s[1] = s[2]; |
398 | 48.7k | } while (y > 0); |
399 | | |
400 | 51.0k | } else { |
401 | 51.0k | assert(w == 4); |
402 | 51.0k | d[0] = _mm_cvtsi32_si128(loadu_int32(data + 0 * src_stride)); |
403 | 51.0k | d[1] = _mm_cvtsi32_si128(loadu_int32(data + 1 * src_stride)); |
404 | 51.0k | d[2] = _mm_cvtsi32_si128(loadu_int32(data + 2 * src_stride)); |
405 | 51.0k | d[3] = _mm_cvtsi32_si128(loadu_int32(data + 3 * src_stride)); |
406 | 51.0k | d[4] = _mm_cvtsi32_si128(loadu_int32(data + 4 * src_stride)); |
407 | | |
408 | 51.0k | const __m128i src_01a = _mm_unpacklo_epi32(d[0], d[1]); |
409 | 51.0k | const __m128i src_12a = _mm_unpacklo_epi32(d[1], d[2]); |
410 | 51.0k | const __m128i src_23a = _mm_unpacklo_epi32(d[2], d[3]); |
411 | 51.0k | const __m128i src_34a = _mm_unpacklo_epi32(d[3], d[4]); |
412 | | |
413 | 51.0k | s[0] = _mm_unpacklo_epi8(src_01a, src_12a); |
414 | 51.0k | s[1] = _mm_unpacklo_epi8(src_23a, src_34a); |
415 | | |
416 | 279k | do { |
417 | 279k | __m128i res; |
418 | 279k | convolve_y_6tap_4x2_ssse3(data, src_stride, coeffs_128, d, s, &res); |
419 | 279k | res = round_sr_y_ssse3(res); |
420 | 279k | pack_store_u8_4x2_sse2(res, dst_ptr, dst_stride); |
421 | | |
422 | 279k | dst_ptr += 2 * dst_stride; |
423 | 279k | data += 2 * src_stride; |
424 | 279k | y -= 2; |
425 | | |
426 | 279k | s[0] = s[1]; |
427 | 279k | s[1] = s[2]; |
428 | 279k | } while (y > 0); |
429 | 51.0k | } |
430 | 141k | } else { |
431 | 141k | prepare_coeffs_6t_lowbd(filter_params_y, subpel_y_qn, coeffs); |
432 | | |
433 | 141k | if (w == 8) { |
434 | 65.7k | __m128i d[6]; |
435 | 65.7k | __m256i s[3]; |
436 | | |
437 | 65.7k | d[0] = _mm_loadl_epi64((__m128i *)(data + 0 * src_stride)); |
438 | 65.7k | d[1] = _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)); |
439 | 65.7k | d[2] = _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)); |
440 | 65.7k | d[3] = _mm_loadl_epi64((__m128i *)(data + 3 * src_stride)); |
441 | 65.7k | d[4] = _mm_loadl_epi64((__m128i *)(data + 4 * src_stride)); |
442 | | |
443 | 65.7k | const __m256i src_01a = _mm256_setr_m128i(d[0], d[1]); |
444 | 65.7k | const __m256i src_12a = _mm256_setr_m128i(d[1], d[2]); |
445 | 65.7k | const __m256i src_23a = _mm256_setr_m128i(d[2], d[3]); |
446 | 65.7k | const __m256i src_34a = _mm256_setr_m128i(d[3], d[4]); |
447 | | |
448 | 65.7k | s[0] = _mm256_unpacklo_epi8(src_01a, src_12a); |
449 | 65.7k | s[1] = _mm256_unpacklo_epi8(src_23a, src_34a); |
450 | | |
451 | 370k | do { |
452 | 370k | __m256i res; |
453 | 370k | convolve_y_6tap_8x2_avx2(data, src_stride, coeffs, d, s, &res); |
454 | 370k | round_pack_store_y_8x2_avx2(res, dst_ptr, dst_stride); |
455 | | |
456 | 370k | dst_ptr += 2 * dst_stride; |
457 | 370k | data += 2 * src_stride; |
458 | 370k | y -= 2; |
459 | | |
460 | 370k | s[0] = s[1]; |
461 | 370k | s[1] = s[2]; |
462 | 370k | } while (y > 0); |
463 | | |
464 | 75.6k | } else { |
465 | 75.6k | assert(!(w % 16)); |
466 | | |
467 | 75.6k | __m128i d[6]; |
468 | 75.6k | __m256i s[6]; |
469 | 111k | do { |
470 | 111k | data = src_ptr + x; |
471 | 111k | dst_ptr = dst + x; |
472 | 111k | y = h; |
473 | | |
474 | 111k | d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride)); |
475 | 111k | d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride)); |
476 | 111k | d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride)); |
477 | 111k | d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride)); |
478 | 111k | d[4] = _mm_loadu_si128((__m128i *)(data + 4 * src_stride)); |
479 | | |
480 | 111k | const __m256i src_01a = _mm256_setr_m128i(d[0], d[1]); |
481 | 111k | const __m256i src_12a = _mm256_setr_m128i(d[1], d[2]); |
482 | 111k | const __m256i src_23a = _mm256_setr_m128i(d[2], d[3]); |
483 | 111k | const __m256i src_34a = _mm256_setr_m128i(d[3], d[4]); |
484 | | |
485 | 111k | s[0] = _mm256_unpacklo_epi8(src_01a, src_12a); |
486 | 111k | s[1] = _mm256_unpacklo_epi8(src_23a, src_34a); |
487 | | |
488 | 111k | s[3] = _mm256_unpackhi_epi8(src_01a, src_12a); |
489 | 111k | s[4] = _mm256_unpackhi_epi8(src_23a, src_34a); |
490 | | |
491 | 1.42M | do { |
492 | 1.42M | __m256i res[2]; |
493 | 1.42M | convolve_y_6tap_16x2_avx2(data, src_stride, coeffs, d, s, res); |
494 | 1.42M | round_pack_store_y_16x2_avx2(res, dst_ptr, dst_stride); |
495 | | |
496 | 1.42M | dst_ptr += 2 * dst_stride; |
497 | 1.42M | data += 2 * src_stride; |
498 | 1.42M | y -= 2; |
499 | | |
500 | 1.42M | s[0] = s[1]; |
501 | 1.42M | s[1] = s[2]; |
502 | | |
503 | 1.42M | s[3] = s[4]; |
504 | 1.42M | s[4] = s[5]; |
505 | 1.42M | } while (y > 0); |
506 | | |
507 | 111k | x += 16; |
508 | 111k | } while (x < w); |
509 | 75.6k | } |
510 | 141k | } |
511 | 204k | } else if (vert_tap == 12) { // vert_tap == 12 |
512 | 0 | __m128i d[12]; |
513 | 0 | __m256i s[12]; |
514 | 0 | prepare_coeffs_12taps(filter_params_y, subpel_y_qn, coeffs); |
515 | 0 | const __m256i v_zero = _mm256_setzero_si256(); |
516 | 0 | __m128i right_shift = _mm_cvtsi32_si128(FILTER_BITS); |
517 | 0 | __m256i right_shift_const = _mm256_set1_epi32((1 << FILTER_BITS) >> 1); |
518 | |
|
519 | 0 | for (int j = 0; j < w; j += 8) { |
520 | 0 | data = &src_ptr[j]; |
521 | 0 | __m256i src10; |
522 | |
|
523 | 0 | d[0] = _mm_loadl_epi64((__m128i *)(data + 0 * src_stride)); |
524 | 0 | d[1] = _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)); |
525 | 0 | d[2] = _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)); |
526 | 0 | d[3] = _mm_loadl_epi64((__m128i *)(data + 3 * src_stride)); |
527 | 0 | d[4] = _mm_loadl_epi64((__m128i *)(data + 4 * src_stride)); |
528 | 0 | d[5] = _mm_loadl_epi64((__m128i *)(data + 5 * src_stride)); |
529 | 0 | d[6] = _mm_loadl_epi64((__m128i *)(data + 6 * src_stride)); |
530 | 0 | d[7] = _mm_loadl_epi64((__m128i *)(data + 7 * src_stride)); |
531 | 0 | d[8] = _mm_loadl_epi64((__m128i *)(data + 8 * src_stride)); |
532 | 0 | d[9] = _mm_loadl_epi64((__m128i *)(data + 9 * src_stride)); |
533 | | // Load lines a and b. Line a to lower 128, line b to upper 128 |
534 | 0 | const __m256i src_01a = _mm256_permute2x128_si256( |
535 | 0 | _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20); |
536 | |
|
537 | 0 | const __m256i src_12a = _mm256_permute2x128_si256( |
538 | 0 | _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20); |
539 | |
|
540 | 0 | const __m256i src_23a = _mm256_permute2x128_si256( |
541 | 0 | _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20); |
542 | |
|
543 | 0 | const __m256i src_34a = _mm256_permute2x128_si256( |
544 | 0 | _mm256_castsi128_si256(d[3]), _mm256_castsi128_si256(d[4]), 0x20); |
545 | |
|
546 | 0 | const __m256i src_45a = _mm256_permute2x128_si256( |
547 | 0 | _mm256_castsi128_si256(d[4]), _mm256_castsi128_si256(d[5]), 0x20); |
548 | |
|
549 | 0 | const __m256i src_56a = _mm256_permute2x128_si256( |
550 | 0 | _mm256_castsi128_si256(d[5]), _mm256_castsi128_si256(d[6]), 0x20); |
551 | |
|
552 | 0 | const __m256i src_67a = _mm256_permute2x128_si256( |
553 | 0 | _mm256_castsi128_si256(d[6]), _mm256_castsi128_si256(d[7]), 0x20); |
554 | |
|
555 | 0 | const __m256i src_78a = _mm256_permute2x128_si256( |
556 | 0 | _mm256_castsi128_si256(d[7]), _mm256_castsi128_si256(d[8]), 0x20); |
557 | |
|
558 | 0 | const __m256i src_89a = _mm256_permute2x128_si256( |
559 | 0 | _mm256_castsi128_si256(d[8]), _mm256_castsi128_si256(d[9]), 0x20); |
560 | |
|
561 | 0 | src10 = _mm256_castsi128_si256( |
562 | 0 | _mm_loadl_epi64((__m128i *)(data + 10 * src_stride))); |
563 | 0 | const __m256i src_910a = |
564 | 0 | _mm256_permute2x128_si256(_mm256_castsi128_si256(d[9]), src10, 0x20); |
565 | |
|
566 | 0 | const __m256i src_01 = _mm256_unpacklo_epi8(src_01a, v_zero); |
567 | 0 | const __m256i src_12 = _mm256_unpacklo_epi8(src_12a, v_zero); |
568 | 0 | const __m256i src_23 = _mm256_unpacklo_epi8(src_23a, v_zero); |
569 | 0 | const __m256i src_34 = _mm256_unpacklo_epi8(src_34a, v_zero); |
570 | 0 | const __m256i src_45 = _mm256_unpacklo_epi8(src_45a, v_zero); |
571 | 0 | const __m256i src_56 = _mm256_unpacklo_epi8(src_56a, v_zero); |
572 | 0 | const __m256i src_67 = _mm256_unpacklo_epi8(src_67a, v_zero); |
573 | 0 | const __m256i src_78 = _mm256_unpacklo_epi8(src_78a, v_zero); |
574 | 0 | const __m256i src_89 = _mm256_unpacklo_epi8(src_89a, v_zero); |
575 | 0 | const __m256i src_910 = _mm256_unpacklo_epi8(src_910a, v_zero); |
576 | |
|
577 | 0 | s[0] = _mm256_unpacklo_epi16(src_01, src_12); |
578 | 0 | s[1] = _mm256_unpacklo_epi16(src_23, src_34); |
579 | 0 | s[2] = _mm256_unpacklo_epi16(src_45, src_56); |
580 | 0 | s[3] = _mm256_unpacklo_epi16(src_67, src_78); |
581 | 0 | s[4] = _mm256_unpacklo_epi16(src_89, src_910); |
582 | |
|
583 | 0 | s[6] = _mm256_unpackhi_epi16(src_01, src_12); |
584 | 0 | s[7] = _mm256_unpackhi_epi16(src_23, src_34); |
585 | 0 | s[8] = _mm256_unpackhi_epi16(src_45, src_56); |
586 | 0 | s[9] = _mm256_unpackhi_epi16(src_67, src_78); |
587 | 0 | s[10] = _mm256_unpackhi_epi16(src_89, src_910); |
588 | |
|
589 | 0 | for (i = 0; i < h; i += 2) { |
590 | 0 | data = &src_ptr[i * src_stride + j]; |
591 | 0 | const __m256i src_1011a = _mm256_permute2x128_si256( |
592 | 0 | src10, |
593 | 0 | _mm256_castsi128_si256( |
594 | 0 | _mm_loadl_epi64((__m128i *)(data + 11 * src_stride))), |
595 | 0 | 0x20); |
596 | |
|
597 | 0 | src10 = _mm256_castsi128_si256( |
598 | 0 | _mm_loadl_epi64((__m128i *)(data + 12 * src_stride))); |
599 | |
|
600 | 0 | const __m256i src_1112a = _mm256_permute2x128_si256( |
601 | 0 | _mm256_castsi128_si256( |
602 | 0 | _mm_loadl_epi64((__m128i *)(data + 11 * src_stride))), |
603 | 0 | src10, 0x20); |
604 | |
|
605 | 0 | const __m256i src_1011 = _mm256_unpacklo_epi8(src_1011a, v_zero); |
606 | 0 | const __m256i src_1112 = _mm256_unpacklo_epi8(src_1112a, v_zero); |
607 | |
|
608 | 0 | s[5] = _mm256_unpacklo_epi16(src_1011, src_1112); |
609 | 0 | s[11] = _mm256_unpackhi_epi16(src_1011, src_1112); |
610 | |
|
611 | 0 | const __m256i res_lo = convolve_12taps(s, coeffs); |
612 | |
|
613 | 0 | const __m256i res_32b_lo = _mm256_sra_epi32( |
614 | 0 | _mm256_add_epi32(res_lo, right_shift_const), right_shift); |
615 | | // 8 bit conversion and saturation to uint8 |
616 | 0 | __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo); |
617 | 0 | __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo); |
618 | |
|
619 | 0 | if (w - j > 4) { |
620 | 0 | const __m256i res_hi = convolve_12taps(s + 6, coeffs); |
621 | |
|
622 | 0 | const __m256i res_32b_hi = _mm256_sra_epi32( |
623 | 0 | _mm256_add_epi32(res_hi, right_shift_const), right_shift); |
624 | 0 | __m256i res_16b_hi = _mm256_packs_epi32(res_32b_hi, res_32b_hi); |
625 | | // 8 bit conversion and saturation to uint8 |
626 | 0 | __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi); |
627 | |
|
628 | 0 | __m256i res_a = _mm256_unpacklo_epi32(res_8b_lo, res_8b_hi); |
629 | |
|
630 | 0 | const __m128i res_0 = _mm256_extracti128_si256(res_a, 0); |
631 | 0 | const __m128i res_1 = _mm256_extracti128_si256(res_a, 1); |
632 | |
|
633 | 0 | _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0); |
634 | 0 | _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride], |
635 | 0 | res_1); |
636 | 0 | } else { |
637 | 0 | const __m128i res_0 = _mm256_extracti128_si256(res_8b_lo, 0); |
638 | 0 | const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1); |
639 | 0 | if (w - j > 2) { |
640 | 0 | xx_storel_32(&dst[i * dst_stride + j], res_0); |
641 | 0 | xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1); |
642 | 0 | } else { |
643 | 0 | xx_storel_16(&dst[i * dst_stride + j], res_0); |
644 | 0 | xx_storel_16(&dst[i * dst_stride + j + dst_stride], res_1); |
645 | 0 | } |
646 | 0 | } |
647 | 0 | s[0] = s[1]; |
648 | 0 | s[1] = s[2]; |
649 | 0 | s[2] = s[3]; |
650 | 0 | s[3] = s[4]; |
651 | 0 | s[4] = s[5]; |
652 | |
|
653 | 0 | s[6] = s[7]; |
654 | 0 | s[7] = s[8]; |
655 | 0 | s[8] = s[9]; |
656 | 0 | s[9] = s[10]; |
657 | 0 | s[10] = s[11]; |
658 | 0 | } |
659 | 0 | } |
660 | 15.3k | } else { |
661 | 15.3k | assert(vert_tap == 8); |
662 | | |
663 | 15.3k | if (w <= 4) { |
664 | 6.74k | prepare_coeffs_ssse3(filter_params_y, subpel_y_qn, coeffs_128); |
665 | | |
666 | 6.74k | __m128i d[8], s[4], res; |
667 | 6.74k | if (w == 2) { |
668 | 1.48k | d[0] = _mm_cvtsi32_si128(loadu_int16(data + 0 * src_stride)); |
669 | 1.48k | d[1] = _mm_cvtsi32_si128(loadu_int16(data + 1 * src_stride)); |
670 | 1.48k | d[2] = _mm_cvtsi32_si128(loadu_int16(data + 2 * src_stride)); |
671 | 1.48k | d[3] = _mm_cvtsi32_si128(loadu_int16(data + 3 * src_stride)); |
672 | 1.48k | d[4] = _mm_cvtsi32_si128(loadu_int16(data + 4 * src_stride)); |
673 | 1.48k | d[5] = _mm_cvtsi32_si128(loadu_int16(data + 5 * src_stride)); |
674 | 1.48k | d[6] = _mm_cvtsi32_si128(loadu_int16(data + 6 * src_stride)); |
675 | | |
676 | 1.48k | const __m128i src_01a = _mm_unpacklo_epi16(d[0], d[1]); |
677 | 1.48k | const __m128i src_12a = _mm_unpacklo_epi16(d[1], d[2]); |
678 | 1.48k | const __m128i src_23a = _mm_unpacklo_epi16(d[2], d[3]); |
679 | 1.48k | const __m128i src_34a = _mm_unpacklo_epi16(d[3], d[4]); |
680 | 1.48k | const __m128i src_45a = _mm_unpacklo_epi16(d[4], d[5]); |
681 | 1.48k | const __m128i src_56a = _mm_unpacklo_epi16(d[5], d[6]); |
682 | | |
683 | 1.48k | s[0] = _mm_unpacklo_epi8(src_01a, src_12a); |
684 | 1.48k | s[1] = _mm_unpacklo_epi8(src_23a, src_34a); |
685 | 1.48k | s[2] = _mm_unpacklo_epi8(src_45a, src_56a); |
686 | | |
687 | 5.93k | do { |
688 | 5.93k | convolve_y_8tap_2x2_ssse3(data, src_stride, coeffs_128, d, s, &res); |
689 | 5.93k | res = round_sr_y_ssse3(res); |
690 | 5.93k | pack_store_u8_2x2_sse2(res, dst_ptr, dst_stride); |
691 | | |
692 | 5.93k | dst_ptr += 2 * dst_stride; |
693 | 5.93k | data += 2 * src_stride; |
694 | 5.93k | y -= 2; |
695 | | |
696 | 5.93k | s[0] = s[1]; |
697 | 5.93k | s[1] = s[2]; |
698 | 5.93k | s[2] = s[3]; |
699 | 5.93k | } while (y > 0); |
700 | | |
701 | 5.26k | } else { |
702 | 5.26k | assert(w == 4); |
703 | | |
704 | 5.26k | d[0] = _mm_cvtsi32_si128(loadu_int32(data + 0 * src_stride)); |
705 | 5.26k | d[1] = _mm_cvtsi32_si128(loadu_int32(data + 1 * src_stride)); |
706 | 5.26k | d[2] = _mm_cvtsi32_si128(loadu_int32(data + 2 * src_stride)); |
707 | 5.26k | d[3] = _mm_cvtsi32_si128(loadu_int32(data + 3 * src_stride)); |
708 | 5.26k | d[4] = _mm_cvtsi32_si128(loadu_int32(data + 4 * src_stride)); |
709 | 5.26k | d[5] = _mm_cvtsi32_si128(loadu_int32(data + 5 * src_stride)); |
710 | 5.26k | d[6] = _mm_cvtsi32_si128(loadu_int32(data + 6 * src_stride)); |
711 | | |
712 | 5.26k | const __m128i src_01a = _mm_unpacklo_epi32(d[0], d[1]); |
713 | 5.26k | const __m128i src_12a = _mm_unpacklo_epi32(d[1], d[2]); |
714 | 5.26k | const __m128i src_23a = _mm_unpacklo_epi32(d[2], d[3]); |
715 | 5.26k | const __m128i src_34a = _mm_unpacklo_epi32(d[3], d[4]); |
716 | 5.26k | const __m128i src_45a = _mm_unpacklo_epi32(d[4], d[5]); |
717 | 5.26k | const __m128i src_56a = _mm_unpacklo_epi32(d[5], d[6]); |
718 | | |
719 | 5.26k | s[0] = _mm_unpacklo_epi8(src_01a, src_12a); |
720 | 5.26k | s[1] = _mm_unpacklo_epi8(src_23a, src_34a); |
721 | 5.26k | s[2] = _mm_unpacklo_epi8(src_45a, src_56a); |
722 | | |
723 | 30.0k | do { |
724 | 30.0k | convolve_y_8tap_4x2_ssse3(data, src_stride, coeffs_128, d, s, &res); |
725 | 30.0k | res = round_sr_y_ssse3(res); |
726 | 30.0k | pack_store_u8_4x2_sse2(res, dst_ptr, dst_stride); |
727 | | |
728 | 30.0k | dst_ptr += 2 * dst_stride; |
729 | 30.0k | data += 2 * src_stride; |
730 | 30.0k | y -= 2; |
731 | | |
732 | 30.0k | s[0] = s[1]; |
733 | 30.0k | s[1] = s[2]; |
734 | 30.0k | s[2] = s[3]; |
735 | 30.0k | } while (y > 0); |
736 | 5.26k | } |
737 | 8.64k | } else { |
738 | 8.64k | prepare_coeffs_lowbd(filter_params_y, subpel_y_qn, coeffs); |
739 | | |
740 | 8.64k | if (w == 8) { |
741 | 4.40k | __m128i d[8]; |
742 | 4.40k | __m256i s[4]; |
743 | | |
744 | 4.40k | d[0] = _mm_loadl_epi64((__m128i *)(data + 0 * src_stride)); |
745 | 4.40k | d[1] = _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)); |
746 | 4.40k | d[2] = _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)); |
747 | 4.40k | d[3] = _mm_loadl_epi64((__m128i *)(data + 3 * src_stride)); |
748 | 4.40k | d[4] = _mm_loadl_epi64((__m128i *)(data + 4 * src_stride)); |
749 | 4.40k | d[5] = _mm_loadl_epi64((__m128i *)(data + 5 * src_stride)); |
750 | 4.40k | d[6] = _mm_loadl_epi64((__m128i *)(data + 6 * src_stride)); |
751 | | |
752 | 4.40k | const __m256i src_01a = _mm256_setr_m128i(d[0], d[1]); |
753 | 4.40k | const __m256i src_12a = _mm256_setr_m128i(d[1], d[2]); |
754 | 4.40k | const __m256i src_23a = _mm256_setr_m128i(d[2], d[3]); |
755 | 4.40k | const __m256i src_34a = _mm256_setr_m128i(d[3], d[4]); |
756 | 4.40k | const __m256i src_45a = _mm256_setr_m128i(d[4], d[5]); |
757 | 4.40k | const __m256i src_56a = _mm256_setr_m128i(d[5], d[6]); |
758 | | |
759 | 4.40k | s[0] = _mm256_unpacklo_epi8(src_01a, src_12a); |
760 | 4.40k | s[1] = _mm256_unpacklo_epi8(src_23a, src_34a); |
761 | 4.40k | s[2] = _mm256_unpacklo_epi8(src_45a, src_56a); |
762 | | |
763 | 30.5k | do { |
764 | 30.5k | __m256i res; |
765 | 30.5k | convolve_y_8tap_8x2_avx2(data, src_stride, coeffs, d, s, &res); |
766 | 30.5k | round_pack_store_y_8x2_avx2(res, dst_ptr, dst_stride); |
767 | | |
768 | 30.5k | dst_ptr += 2 * dst_stride; |
769 | 30.5k | data += 2 * src_stride; |
770 | 30.5k | y -= 2; |
771 | | |
772 | 30.5k | s[0] = s[1]; |
773 | 30.5k | s[1] = s[2]; |
774 | 30.5k | s[2] = s[3]; |
775 | 30.5k | } while (y > 0); |
776 | | |
777 | 4.40k | } else { |
778 | 4.24k | assert(!(w % 16)); |
779 | | |
780 | 4.24k | __m128i d[8]; |
781 | 4.24k | __m256i s[8]; |
782 | 6.62k | do { |
783 | 6.62k | data = src_ptr + x; |
784 | 6.62k | dst_ptr = dst + x; |
785 | 6.62k | y = h; |
786 | | |
787 | 6.62k | d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride)); |
788 | 6.62k | d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride)); |
789 | 6.62k | d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride)); |
790 | 6.62k | d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride)); |
791 | 6.62k | d[4] = _mm_loadu_si128((__m128i *)(data + 4 * src_stride)); |
792 | 6.62k | d[5] = _mm_loadu_si128((__m128i *)(data + 5 * src_stride)); |
793 | 6.62k | d[6] = _mm_loadu_si128((__m128i *)(data + 6 * src_stride)); |
794 | | |
795 | 6.62k | const __m256i src_01a = _mm256_setr_m128i(d[0], d[1]); |
796 | 6.62k | const __m256i src_12a = _mm256_setr_m128i(d[1], d[2]); |
797 | 6.62k | const __m256i src_23a = _mm256_setr_m128i(d[2], d[3]); |
798 | 6.62k | const __m256i src_34a = _mm256_setr_m128i(d[3], d[4]); |
799 | 6.62k | const __m256i src_45a = _mm256_setr_m128i(d[4], d[5]); |
800 | 6.62k | const __m256i src_56a = _mm256_setr_m128i(d[5], d[6]); |
801 | | |
802 | 6.62k | s[0] = _mm256_unpacklo_epi8(src_01a, src_12a); |
803 | 6.62k | s[1] = _mm256_unpacklo_epi8(src_23a, src_34a); |
804 | 6.62k | s[2] = _mm256_unpacklo_epi8(src_45a, src_56a); |
805 | | |
806 | 6.62k | s[4] = _mm256_unpackhi_epi8(src_01a, src_12a); |
807 | 6.62k | s[5] = _mm256_unpackhi_epi8(src_23a, src_34a); |
808 | 6.62k | s[6] = _mm256_unpackhi_epi8(src_45a, src_56a); |
809 | | |
810 | 94.4k | do { |
811 | 94.4k | __m256i res[2]; |
812 | 94.4k | convolve_y_8tap_16x2_avx2(data, src_stride, coeffs, d, s, res); |
813 | 94.4k | round_pack_store_y_16x2_avx2(res, dst_ptr, dst_stride); |
814 | | |
815 | 94.4k | dst_ptr += 2 * dst_stride; |
816 | 94.4k | data += 2 * src_stride; |
817 | 94.4k | y -= 2; |
818 | | |
819 | 94.4k | s[0] = s[1]; |
820 | 94.4k | s[1] = s[2]; |
821 | 94.4k | s[2] = s[3]; |
822 | | |
823 | 94.4k | s[4] = s[5]; |
824 | 94.4k | s[5] = s[6]; |
825 | 94.4k | s[6] = s[7]; |
826 | 94.4k | } while (y > 0); |
827 | | |
828 | 6.62k | x += 16; |
829 | 6.62k | } while (x < w); |
830 | 4.24k | } |
831 | 8.64k | } |
832 | 15.3k | } |
833 | 503k | } |
834 | | |
/*
 * Horizontal (x-direction) convolution of 8-bit pixels built from AVX2 and
 * SSSE3 helper routines; the filtered rows are stored to dst as 8-bit values.
 *
 * The tap count reported by get_filter_tap() selects one of five code paths
 * (4, 6, 8, 12 or 2 taps); each path then specialises on the block width w.
 *
 * src, src_stride   source pixels and row pitch. Reads start at src_ptr, which
 *                   is src moved back by 1, 2, 3 or 0 pixels for 4/6/8/2 taps
 *                   and by filter_params_x->taps / 2 - 1 for 12 taps.
 * dst, dst_stride   destination pixels and row pitch.
 * w, h              block width and height; h is asserted to be even.
 * filter_params_x,
 * subpel_x_qn       forwarded to get_filter_tap() and the prepare_coeffs_*()
 *                   helpers; with a 2-tap filter, subpel_x_qn == 8 takes a
 *                   dedicated averaging path.
 * conv_params       only round_0 and round_1 are read here, and round_1 only
 *                   inside an assertion.
 *
 * Some width branches walk the rows with the local index i, others advance
 * src_ptr/dst and count the parameter h down to zero.
 */
835 | | void av1_convolve_x_sr_avx2(const uint8_t *src, int32_t src_stride,
836 | | uint8_t *dst, int32_t dst_stride, int32_t w,
837 | | int32_t h,
838 | | const InterpFilterParams *filter_params_x,
839 | | const int32_t subpel_x_qn,
840 | 409k | ConvolveParams *conv_params) {
841 | 409k | const int bits = FILTER_BITS - conv_params->round_0;  // outside the asserts, used only by the 12-tap path
842 | 409k | int i, j, horiz_tap = get_filter_tap(filter_params_x, subpel_x_qn);
843 | | // Debug-only sanity checks on the rounding configuration.
844 | 409k | assert(bits >= 0);
845 | 409k | assert((FILTER_BITS - conv_params->round_1) >= 0 ||
846 | 409k | ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
847 | 409k | assert(conv_params->round_0 > 0);
848 | | // Tap counts handled below, plus expectations on the block dimensions.
849 | 409k | assert(horiz_tap == 2 || horiz_tap == 4 || horiz_tap == 6 || horiz_tap == 8 ||
850 | 409k | horiz_tap == 12);
851 | 409k | assert((!(w % 2)) || (w <= 128));
852 | 409k | assert((h % 2) == 0);
853 | | // NOTE(review): the width assert uses ||, so any w <= 128 passes even when odd; && may have been intended - confirm.
854 | 409k | __m256i coeffs[6] = { 0 }, filt[4] = { 0 };
855 | 409k | __m128i coeffs_128[4] = { 0 };
856 | | // i is the row index for the branches that loop with `while (i < h)`.
857 | 409k | i = 0;
858 | | // 4-tap horizontal filter
859 | 409k | if (horiz_tap == 4) {
860 | | // since fo_horiz = 1
861 | 154k | const uint8_t *src_ptr = src - 1;
862 | 154k | if (w == 2) {
863 | 24.7k | prepare_coeffs_4t_ssse3(filter_params_x, subpel_x_qn, coeffs_128);
864 | 64.2k | do {
865 | 64.2k | const __m128i res =
866 | 64.2k | convolve_x_4tap_2x2_ssse3(src_ptr, src_stride, coeffs_128);
867 | 64.2k | const __m128i reg = round_sr_x_ssse3(res);
868 | 64.2k | pack_store_u8_2x2_sse2(reg, dst, dst_stride);
869 | 64.2k | src_ptr += 2 * src_stride;
870 | 64.2k | dst += 2 * dst_stride;
871 | 64.2k | h -= 2;
872 | 64.2k | } while (h);
873 | 129k | } else if (w == 4) {
874 | 116k | prepare_coeffs_4t_ssse3(filter_params_x, subpel_x_qn, coeffs_128);
875 | 383k | do {
876 | 383k | const __m128i reg =
877 | 383k | convolve_x_4tap_4x2_ssse3(src_ptr, src_stride, coeffs_128);
878 | 383k | const __m128i res = round_sr_x_ssse3(reg);
879 | 383k | pack_store_u8_4x2_sse2(res, dst, dst_stride);
880 | 383k | src_ptr += 2 * src_stride;
881 | 383k | dst += 2 * dst_stride;
882 | 383k | h -= 2;
883 | 383k | } while (h);
884 | 116k | } else if (w == 8) {
885 | 6.99k | prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs);
886 | 6.99k | filt[0] = _mm256_load_si256((__m256i const *)(filt_global_avx2));
887 | 6.99k | filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
888 | 25.9k | do {
889 | 25.9k | const __m256i data = _mm256_setr_m128i(  // low lane: row i, high lane: row i + 1
890 | 25.9k | _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride])),
891 | 25.9k | _mm_loadu_si128(
892 | 25.9k | (__m128i *)(&src_ptr[i * src_stride + src_stride])));
893 | | // The 4-tap helper receives the coefficient array from its second entry (presumably the centre taps of the 8-tap layout - confirm).
894 | 25.9k | __m256i res_16b = convolve_lowbd_x_4tap(data, coeffs + 1, filt);
895 | | 
896 | 25.9k | res_16b = round_sr_x_avx2(res_16b);
897 | | 
898 | | /* rounding code */
899 | | // 8 bit conversion and saturation to uint8
900 | 25.9k | __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
901 | | // Low lane carries row i, high lane row i + 1; 8 bytes of each are stored.
902 | 25.9k | const __m128i res_0 = _mm256_castsi256_si128(res_8b);
903 | 25.9k | const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);
904 | | 
905 | 25.9k | _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0);
906 | 25.9k | _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1);
907 | 25.9k | i += 2;
908 | 25.9k | } while (i < h);
909 | 6.99k | } else {
910 | 5.53k | assert(!(w % 16));
911 | 5.53k | prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs);
912 | 5.53k | filt[0] = _mm256_load_si256((__m256i const *)(filt_global_avx2));
913 | 5.53k | filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
914 | 121k | do {
915 | 121k | j = 0;
916 | 399k | do {
917 | | // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17
918 | | // 18 19 20 21 22 23
919 | 399k | const __m256i data = _mm256_inserti128_si256(
920 | 399k | _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]),
921 | 399k | _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]),
922 | 399k | 1);
923 | | 
924 | 399k | __m256i res_16b = convolve_lowbd_x_4tap(data, coeffs + 1, filt);
925 | | 
926 | 399k | res_16b = round_sr_x_avx2(res_16b);
927 | | 
928 | | /* rounding code */
929 | | // 8 bit conversion and saturation to uint8
930 | 399k | __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
931 | | // packus works per 128-bit lane, so the 16 distinct bytes sit in qwords 0 and 2; permute 216 (0xD8, order 0,2,1,3) moves them into the low 128 bits.
932 | | // Store values into the destination buffer
933 | | // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
934 | 399k | res_8b = _mm256_permute4x64_epi64(res_8b, 216);
935 | 399k | __m128i res = _mm256_castsi256_si128(res_8b);
936 | 399k | _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
937 | 399k | j += 16;
938 | 399k | } while (j < w);
939 | 121k | i++;
940 | 121k | } while (i < h);
941 | 5.53k | }
942 | 255k | } else if (horiz_tap == 6) {
943 | | // since (horiz_tap/2 - 1 == 2)
944 | 202k | const uint8_t *src_ptr = src - 2;
945 | 202k | prepare_coeffs_6t_lowbd(filter_params_x, subpel_x_qn, coeffs);
946 | 202k | filt[0] = _mm256_load_si256((__m256i const *)(filt_global_avx2));
947 | 202k | filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
948 | 202k | filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
949 | 202k | if (w == 8) {
950 | 396k | do {
951 | 396k | const __m256i data = _mm256_setr_m128i(  // low lane: row i, high lane: row i + 1
952 | 396k | _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride])),
953 | 396k | _mm_loadu_si128(
954 | 396k | (__m128i *)(&src_ptr[i * src_stride + src_stride])));
955 | | 
956 | 396k | __m256i res_16b = convolve_lowbd_x_6tap(data, coeffs, filt);
957 | | 
958 | 396k | res_16b = round_sr_x_avx2(res_16b);
959 | | 
960 | | /* rounding code */
961 | | // 8 bit conversion and saturation to uint8
962 | 396k | __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
963 | | 
964 | 396k | const __m128i res_0 = _mm256_castsi256_si128(res_8b);
965 | 396k | const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);
966 | 396k | _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0);
967 | 396k | _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1);
968 | 396k | i += 2;
969 | 396k | } while (i < h);
970 | 109k | } else if (w == 16) {
971 | 345k | do {
972 | 345k | __m256i data[2] = { 0 };
973 | | 
974 | 345k | load_convolve_6tap_16x2_avx2(src_ptr, src_stride, coeffs, filt, data);
975 | 345k | round_pack_store_16x2_avx2(data, dst, dst_stride);
976 | 345k | src_ptr += 2 * src_stride;
977 | 345k | dst += 2 * dst_stride;
978 | 345k | h -= 2;
979 | 345k | } while (h);
980 | 68.9k | } else if (w == 32) {
981 | 366k | do {  // one row per pass from here on (h counts single rows)
982 | 366k | convolve_sr_store_6tap_32_avx2(src_ptr, coeffs, filt, dst);
983 | 366k | src_ptr += src_stride;
984 | 366k | dst += dst_stride;
985 | 366k | } while ((--h) > 0);
986 | 18.6k | } else if (w == 64) {
987 | 210k | do {
988 | 210k | convolve_sr_store_6tap_32_avx2(src_ptr, coeffs, filt, dst);
989 | 210k | convolve_sr_store_6tap_32_avx2(src_ptr + 32, coeffs, filt, dst + 32);
990 | 210k | src_ptr += src_stride;
991 | 210k | dst += dst_stride;
992 | 210k | } while ((--h) > 0);
993 | 4.17k | } else {
994 | 1.02k | assert(w == 128);
995 | | // Four helper calls per row, one per *_32_BLK column offset.
996 | 118k | do {
997 | 118k | convolve_sr_store_6tap_32_avx2(src_ptr, coeffs, filt, dst);
998 | 118k | convolve_sr_store_6tap_32_avx2(src_ptr + SECOND_32_BLK, coeffs, filt,
999 | 118k | dst + SECOND_32_BLK);
1000 | 118k | convolve_sr_store_6tap_32_avx2(src_ptr + THIRD_32_BLK, coeffs, filt,
1001 | 118k | dst + THIRD_32_BLK);
1002 | 118k | convolve_sr_store_6tap_32_avx2(src_ptr + FOURTH_32_BLK, coeffs, filt,
1003 | 118k | dst + FOURTH_32_BLK);
1004 | 118k | src_ptr += src_stride;
1005 | 118k | dst += dst_stride;
1006 | 118k | } while ((--h) > 0);
1007 | 1.03k | }
1008 | 202k | } else if (horiz_tap == 8) {
1009 | | // since (horiz_tap / 2 - 1) == 3
1010 | 14.9k | const uint8_t *src_ptr = src - 3;
1011 | 14.9k | prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs);
1012 | 14.9k | filt[0] = _mm256_load_si256((__m256i const *)(filt_global_avx2));
1013 | 14.9k | filt[1] =
1014 | 14.9k | _mm256_load_si256((__m256i const *)(filt_global_avx2 + SECOND_32_BLK));
1015 | 14.9k | filt[2] =
1016 | 14.9k | _mm256_load_si256((__m256i const *)(filt_global_avx2 + THIRD_32_BLK));
1017 | 14.9k | filt[3] =
1018 | 14.9k | _mm256_load_si256((__m256i const *)(filt_global_avx2 + FOURTH_32_BLK));
1019 | | // Same width dispatch as the 6-tap path, using the 8-tap helpers.
1020 | 14.9k | if (w == 8) {
1021 | 30.3k | do {
1022 | 30.3k | const __m256i data = _mm256_setr_m128i(  // low lane: row i, high lane: row i + 1
1023 | 30.3k | _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride])),
1024 | 30.3k | _mm_loadu_si128(
1025 | 30.3k | (__m128i *)(&src_ptr[i * src_stride + src_stride])));
1026 | | 
1027 | 30.3k | __m256i res_16b = convolve_lowbd_x(data, coeffs, filt);
1028 | | 
1029 | 30.3k | res_16b = round_sr_x_avx2(res_16b);
1030 | | 
1031 | | /* rounding code */
1032 | | // 8 bit conversion and saturation to uint8
1033 | 30.3k | __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
1034 | | 
1035 | 30.3k | const __m128i res_0 = _mm256_castsi256_si128(res_8b);
1036 | 30.3k | const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);
1037 | 30.3k | _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0);
1038 | 30.3k | _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1);
1039 | 30.3k | i += 2;
1040 | 30.3k | } while (i < h);
1041 | 7.70k | } else if (w == 16) {
1042 | 28.6k | do {
1043 | 28.6k | __m256i data[2] = { 0 };
1044 | | 
1045 | 28.6k | load_convolve_8tap_16x2_avx2(src_ptr, src_stride, coeffs, filt, data);
1046 | 28.6k | round_pack_store_16x2_avx2(data, dst, dst_stride);
1047 | 28.6k | src_ptr += 2 * src_stride;
1048 | 28.6k | dst += 2 * dst_stride;
1049 | 28.6k | h -= 2;
1050 | 28.6k | } while (h);
1051 | 4.93k | } else if (w == 32) {
1052 | 37.6k | do {
1053 | 37.6k | load_convolve_round_8tap_32_avx2(src_ptr, coeffs, filt, dst);
1054 | 37.6k | src_ptr += src_stride;
1055 | 37.6k | dst += dst_stride;
1056 | 37.6k | } while ((--h) > 0);
1057 | 1.56k | } else if (w == 64) {
1058 | 27.8k | do {
1059 | 27.8k | load_convolve_round_8tap_32_avx2(src_ptr, coeffs, filt, dst);
1060 | 27.8k | load_convolve_round_8tap_32_avx2(src_ptr + 32, coeffs, filt, dst + 32);
1061 | 27.8k | src_ptr += src_stride;
1062 | 27.8k | dst += dst_stride;
1063 | 27.8k | } while ((--h) > 0);
1064 | 557 | } else {
1065 | 157 | assert(w == 128);
1066 | 16.0k | do {
1067 | 16.0k | load_convolve_round_8tap_32_avx2(src_ptr, coeffs, filt, dst);
1068 | 16.0k | load_convolve_round_8tap_32_avx2(src_ptr + SECOND_32_BLK, coeffs, filt,
1069 | 16.0k | dst + SECOND_32_BLK);
1070 | 16.0k | load_convolve_round_8tap_32_avx2(src_ptr + THIRD_32_BLK, coeffs, filt,
1071 | 16.0k | dst + THIRD_32_BLK);
1072 | 16.0k | load_convolve_round_8tap_32_avx2(src_ptr + FOURTH_32_BLK, coeffs, filt,
1073 | 16.0k | dst + FOURTH_32_BLK);
1074 | 16.0k | src_ptr += src_stride;
1075 | 16.0k | dst += dst_stride;
1076 | 16.0k | } while ((--h) > 0);
1077 | 158 | }
1078 | 38.4k | } else if (horiz_tap == 12) { // horiz_tap == 12
1079 | 0 | const int fo_horiz = filter_params_x->taps / 2 - 1;
1080 | 0 | prepare_coeffs_12taps(filter_params_x, subpel_x_qn, coeffs);
1081 | 0 | const __m128i round_shift = _mm_cvtsi32_si128(bits);
1082 | 0 | const uint8_t *const src_ptr = src - fo_horiz;
1083 | 0 | const __m256i v_zero = _mm256_setzero_si256();
1084 | 0 | __m256i round_0_const =
1085 | 0 | _mm256_set1_epi32((1 << (conv_params->round_0)) >> 1);
1086 | 0 | __m256i round_const = _mm256_set1_epi32((1 << bits) >> 1);
1087 | 0 | __m128i round_0_shift = _mm_cvtsi32_si128(conv_params->round_0);
1088 | 0 | __m256i s[6] = { 0 };
1089 |

1090 | 0 | if (w <= 4) {
1091 | 0 | do {  // two rows per pass: row i in the low lane, row i + 1 in the high lane
1092 | 0 | const __m256i data = _mm256_permute2x128_si256(
1093 | 0 | _mm256_castsi128_si256(
1094 | 0 | _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))),
1095 | 0 | _mm256_castsi128_si256(_mm_loadu_si128(
1096 | 0 | (__m128i *)(&src_ptr[i * src_stride + src_stride]))),
1097 | 0 | 0x20);
1098 | | // row0 0..7 row1 0..7
1099 | 0 | const __m256i s_16lo = _mm256_unpacklo_epi8(data, v_zero);
1100 | | // row0 8..F row1 8..F
1101 | 0 | const __m256i s_16hi = _mm256_unpackhi_epi8(data, v_zero);
1102 | | 
1103 | | // row0 00 00 01 01 .. 03 03 row1 00 00 01 01 .. 03 03
1104 | 0 | const __m256i s_lolo = _mm256_unpacklo_epi16(s_16lo, s_16lo);
1105 | | // row0 04 04 .. 07 07 row1 04 04 .. 07 07
1106 | 0 | const __m256i s_lohi = _mm256_unpackhi_epi16(s_16lo, s_16lo);
1107 | | 
1108 | | // row0 08 08 09 09 .. 0B 0B row1 08 08 09 09 .. 0B 0B
1109 | 0 | const __m256i s_hilo = _mm256_unpacklo_epi16(s_16hi, s_16hi);
1110 | | // row0 0C 0C .. 0F 0F row1 0C 0C .. 0F 0F
1111 | 0 | const __m256i s_hihi = _mm256_unpackhi_epi16(s_16hi, s_16hi);
1112 | | 
1113 | | // 00 01 01 02 02 03 03 04 10 11 11 12 12 13 13 14
1114 | 0 | s[0] = _mm256_alignr_epi8(s_lohi, s_lolo, 2);
1115 | | // 02 03 03 04 04 05 05 06 12 13 13 14 14 15 15 16
1116 | 0 | s[1] = _mm256_alignr_epi8(s_lohi, s_lolo, 10);
1117 | | // 04 05 05 06 06 07 07 08 14 15 15 16 16 17 17 18
1118 | 0 | s[2] = _mm256_alignr_epi8(s_hilo, s_lohi, 2);
1119 | | // 06 07 07 08 08 09 09 0A 16 17 17 18 18 19 19 1A
1120 | 0 | s[3] = _mm256_alignr_epi8(s_hilo, s_lohi, 10);
1121 | | // 08 09 09 0A 0A 0B 0B 0C 18 19 19 1A 1A 1B 1B 1C
1122 | 0 | s[4] = _mm256_alignr_epi8(s_hihi, s_hilo, 2);
1123 | | // 0A 0B 0B 0C 0C 0D 0D 0E 1A 1B 1B 1C 1C 1D 1D 1E
1124 | 0 | s[5] = _mm256_alignr_epi8(s_hihi, s_hilo, 10);
1125 |

1126 | 0 | const __m256i res_lo = convolve_12taps(s, coeffs);
1127 |

1128 | 0 | __m256i res_32b_lo = _mm256_sra_epi32(  // first rounding stage: add half, shift right by round_0
1129 | 0 | _mm256_add_epi32(res_lo, round_0_const), round_0_shift);
1130 | | // Second rounding stage: add half, shift right by bits.
1131 | | // 00 01 02 03 10 11 12 13
1132 | 0 | res_32b_lo = _mm256_sra_epi32(_mm256_add_epi32(res_32b_lo, round_const),
1133 | 0 | round_shift);
1134 | | // 8 bit conversion and saturation to uint8
1135 | | // 00 01 02 03 00 01 02 03 10 11 12 13 10 11 12 13
1136 | 0 | __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo);
1137 | | // 00 01 02 03 00 01 02 03 00 01 02 03 00 01 02 03
1138 | | // 10 11 12 13 10 11 12 13 10 11 12 13 10 11 12 13
1139 | 0 | __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);
1140 | | 
1141 | | // 00 01 02 03 00 01 02 03 00 01 02 03 00 01 02 03
1142 | 0 | const __m128i res_0 = _mm256_extracti128_si256(res_8b_lo, 0);
1143 | | // 10 11 12 13 10 11 12 13 10 11 12 13 10 11 12 13
1144 | 0 | const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
1145 | 0 | if (w > 2) {
1146 | | // 00 01 02 03
1147 | 0 | xx_storel_32(&dst[i * dst_stride], res_0);
1148 | | // 10 11 12 13
1149 | 0 | xx_storel_32(&dst[i * dst_stride + dst_stride], res_1);
1150 | 0 | } else {
1151 | | // 00 01
1152 | 0 | xx_storel_16(&dst[i * dst_stride], res_0);
1153 | | // 10 11
1154 | 0 | xx_storel_16(&dst[i * dst_stride + dst_stride], res_1);
1155 | 0 | }
1156 | 0 | i += 2;
1157 | 0 | } while (i < h);
1158 | 0 | } else {
1159 | 0 | assert(!(w % 8));
1160 | 0 | do {
1161 | 0 | j = 0;
1162 | 0 | do {  // 8 pixels of one row per pass: low lane loads from column j, high lane from column j + 4
1163 | 0 | const __m256i data = _mm256_permute2x128_si256(
1164 | 0 | _mm256_castsi128_si256(
1165 | 0 | _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride + j]))),
1166 | 0 | _mm256_castsi128_si256(_mm_loadu_si128(
1167 | 0 | (__m128i *)(&src_ptr[i * src_stride + j + 4]))),
1168 | 0 | 0x20);
1169 | | // row0 0..7 4..B
1170 | 0 | const __m256i s_16lo = _mm256_unpacklo_epi8(data, v_zero);
1171 | | // row0 8..F C..13
1172 | 0 | const __m256i s_16hi = _mm256_unpackhi_epi8(data, v_zero);
1173 | | 
1174 | | // row0 00 00 01 01 .. 03 03 04 04 05 05 .. 07 07
1175 | 0 | const __m256i s_lolo = _mm256_unpacklo_epi16(s_16lo, s_16lo);
1176 | | // row0 04 04 .. 07 07 08 08 .. 0B 0B
1177 | 0 | const __m256i s_lohi = _mm256_unpackhi_epi16(s_16lo, s_16lo);
1178 | | 
1179 | | // row0 08 08 09 09 .. 0B 0B 0C 0C 0D 0D .. 0F 0F
1180 | 0 | const __m256i s_hilo = _mm256_unpacklo_epi16(s_16hi, s_16hi);
1181 | | // row0 0C 0C 0D 0D .. 0F 0F 10 10 11 11 .. 13 13
1182 | 0 | const __m256i s_hihi = _mm256_unpackhi_epi16(s_16hi, s_16hi);
1183 |

1184 | 0 | s[0] = _mm256_alignr_epi8(s_lohi, s_lolo, 2);
1185 | 0 | s[1] = _mm256_alignr_epi8(s_lohi, s_lolo, 10);
1186 | 0 | s[2] = _mm256_alignr_epi8(s_hilo, s_lohi, 2);
1187 | 0 | s[3] = _mm256_alignr_epi8(s_hilo, s_lohi, 10);
1188 | 0 | s[4] = _mm256_alignr_epi8(s_hihi, s_hilo, 2);
1189 | 0 | s[5] = _mm256_alignr_epi8(s_hihi, s_hilo, 10);
1190 |

1191 | 0 | const __m256i res_lo = convolve_12taps(s, coeffs);
1192 |

1193 | 0 | __m256i res_32b_lo = _mm256_sra_epi32(
1194 | 0 | _mm256_add_epi32(res_lo, round_0_const), round_0_shift);
1195 |

1196 | 0 | res_32b_lo = _mm256_sra_epi32(
1197 | 0 | _mm256_add_epi32(res_32b_lo, round_const), round_shift);
1198 | | // 8 bit conversion and saturation to uint8
1199 | 0 | __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo);
1200 | 0 | __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);
1201 | 0 | const __m128i res_0 = _mm256_extracti128_si256(res_8b_lo, 0);
1202 | 0 | const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
1203 | 0 | xx_storel_32(&dst[i * dst_stride + j], res_0);
1204 | 0 | xx_storel_32(&dst[i * dst_stride + j + 4], res_1);
1205 |

1206 | 0 | j += 8;
1207 | 0 | } while (j < w);
1208 | 0 | i++;
1209 | 0 | } while (i < h);
1210 | 0 | }
1211 | 38.4k | } else {
1212 | 38.4k | assert(horiz_tap == 2);
1213 | | // since (filter_params_x->taps / 2 - 1) == 0
1214 | 38.4k | const uint8_t *src_ptr = src;
1215 | 38.4k | if (subpel_x_qn != 8) {
1216 | 14.9k | if (w <= 8) {
1217 | 11.1k | prepare_coeffs_2t_ssse3(filter_params_x, subpel_x_qn, coeffs_128);
1218 | | // Widths up to 8 go through the 128-bit SSSE3 helpers, two rows per pass.
1219 | 11.1k | if (w == 2) {
1220 | 3.11k | do {
1221 | 3.11k | const __m128i data =
1222 | 3.11k | convolve_x_2tap_2x2_ssse3(src_ptr, src_stride, coeffs_128);
1223 | 3.11k | const __m128i reg = round_sr_x_ssse3(data);
1224 | 3.11k | pack_store_u8_2x2_sse2(reg, dst, dst_stride);
1225 | 3.11k | src_ptr += 2 * src_stride;
1226 | 3.11k | dst += 2 * dst_stride;
1227 | 3.11k | h -= 2;
1228 | 3.11k | } while (h);
1229 | 9.79k | } else if (w == 4) {
1230 | 15.1k | do {
1231 | 15.1k | const __m128i data =
1232 | 15.1k | convolve_x_2tap_4x2_ssse3(src_ptr, src_stride, coeffs_128);
1233 | 15.1k | const __m128i reg = round_sr_x_ssse3(data);
1234 | 15.1k | pack_store_u8_4x2_sse2(reg, dst, dst_stride);
1235 | 15.1k | src_ptr += 2 * src_stride;
1236 | 15.1k | dst += 2 * dst_stride;
1237 | 15.1k | h -= 2;
1238 | 15.1k | } while (h);
1239 | 5.05k | } else {
1240 | 4.73k | assert(w == 8);
1241 | | 
1242 | 17.6k | do {
1243 | 17.6k | __m128i data[2] = { 0 };
1244 | | 
1245 | 17.6k | convolve_x_2tap_8x2_ssse3(src_ptr, src_stride, coeffs_128, data);
1246 | 17.6k | data[0] = round_sr_x_ssse3(data[0]);
1247 | 17.6k | data[1] = round_sr_x_ssse3(data[1]);
1248 | 17.6k | const __m128i reg = _mm_packus_epi16(data[0], data[1]);  // low 8 bytes from data[0], high 8 bytes from data[1]
1249 | 17.6k | _mm_storel_epi64((__m128i *)dst, reg);
1250 | 17.6k | _mm_storeh_epi64((__m128i *)(dst + dst_stride), reg);
1251 | | 
1252 | 17.6k | src_ptr += 2 * src_stride;
1253 | 17.6k | dst += 2 * dst_stride;
1254 | 17.6k | h -= 2;
1255 | 17.6k | } while (h);
1256 | 4.73k | }
1257 | 11.1k | } else {
1258 | 3.77k | prepare_coeffs_2t_lowbd(filter_params_x, subpel_x_qn, coeffs);
1259 | | // Wider blocks use the 256-bit coefficient array and the AVX2 helpers.
1260 | 3.77k | if (w == 16) {
1261 | 13.6k | do {
1262 | 13.6k | __m256i data[2] = { 0 };
1263 | | 
1264 | 13.6k | convolve_x_2tap_16x2_avx2(src_ptr, src_stride, coeffs, data);
1265 | 13.6k | round_pack_store_16x2_avx2(data, dst, dst_stride);
1266 | 13.6k | src_ptr += 2 * src_stride;
1267 | 13.6k | dst += 2 * dst_stride;
1268 | 13.6k | h -= 2;
1269 | 13.6k | } while (h);
1270 | 2.21k | } else if (w == 32) {
1271 | 22.0k | do {
1272 | 22.0k | convolve_round_2tap_32_avx2(src_ptr, coeffs, dst);
1273 | 22.0k | src_ptr += src_stride;
1274 | 22.0k | dst += dst_stride;
1275 | 22.0k | } while ((--h) > 0);
1276 | 829 | } else if (w == 64) {
1277 | 24.9k | do {
1278 | 24.9k | convolve_round_2tap_32_avx2(src_ptr, coeffs, dst);
1279 | 24.9k | convolve_round_2tap_32_avx2(src_ptr + SECOND_32_BLK, coeffs,
1280 | 24.9k | dst + SECOND_32_BLK);
1281 | 24.9k | src_ptr += src_stride;
1282 | 24.9k | dst += dst_stride;
1283 | 24.9k | } while ((--h) > 0);
1284 | 592 | } else {
1285 | 140 | assert(w == 128);
1286 | | 
1287 | 10.5k | do {
1288 | 10.5k | convolve_round_2tap_32_avx2(src_ptr, coeffs, dst);
1289 | 10.5k | convolve_round_2tap_32_avx2(src_ptr + (SECOND_32_BLK), coeffs,
1290 | 10.5k | dst + (SECOND_32_BLK));
1291 | 10.5k | convolve_round_2tap_32_avx2(src_ptr + (THIRD_32_BLK), coeffs,
1292 | 10.5k | dst + (THIRD_32_BLK));
1293 | 10.5k | convolve_round_2tap_32_avx2(src_ptr + (FOURTH_32_BLK), coeffs,
1294 | 10.5k | dst + (FOURTH_32_BLK));
1295 | 10.5k | src_ptr += src_stride;
1296 | 10.5k | dst += dst_stride;
1297 | 10.5k | } while ((--h) > 0);
1298 | 140 | }
1299 | 3.77k | }
1300 | 23.5k | } else {  // subpel_x_qn == 8: no coefficients are prepared; the visible branches average each byte with its right-hand neighbour via _mm_avg_epu8
1301 | 23.5k | if (w == 2) {
1302 | 7.73k | do {
1303 | 7.73k | __m128i data = load_x_u8_4x2_sse4(src_ptr, src_stride);
1304 | 7.73k | const __m128i reg1 = _mm_srli_si128(data, 1);
1305 | 7.73k | const __m128i reg2 = _mm_avg_epu8(data, reg1);
1306 | 7.73k | xx_storel_16(dst, reg2);
1307 | 7.73k | {
1308 | 7.73k | uint16_t val = (uint16_t)_mm_extract_epi16(reg2, 2);  // bytes 4..5; presumably the second row if the loader packs 4 bytes per row - confirm
1309 | 7.73k | memcpy(dst + dst_stride, &val, sizeof(val));
1310 | 7.73k | }
1311 | 7.73k | src_ptr += 2 * src_stride;
1312 | 7.73k | dst += 2 * dst_stride;
1313 | 7.73k | h -= 2;
1314 | 7.73k | } while (h);
1315 | 20.0k | } else if (w == 4) {
1316 | 23.5k | do {
1317 | 23.5k | __m128i data = load_8bit_8x2_to_1_reg_sse2(
1318 | 23.5k | src_ptr, (int)(sizeof(*src_ptr) * src_stride));
1319 | 23.5k | const __m128i reg1 = _mm_srli_si128(data, 1);
1320 | 23.5k | const __m128i reg2 = _mm_avg_epu8(data, reg1);
1321 | 23.5k | xx_storel_32(dst, reg2);
1322 | 23.5k | {
1323 | 23.5k | int32_t val = _mm_extract_epi32(reg2, 2);  // bytes 8..11; presumably the second row if the loader packs 8 bytes per row - confirm
1324 | 23.5k | memcpy(dst + dst_stride, &val, sizeof(val));
1325 | 23.5k | }
1326 | | 
1327 | 23.5k | src_ptr += 2 * src_stride;
1328 | 23.5k | dst += 2 * dst_stride;
1329 | 23.5k | h -= 2;
1330 | 23.5k | } while (h);
1331 | 11.6k | } else if (w == 8) {
1332 | 26.7k | do {
1333 | 26.7k | const __m128i data00 = _mm_loadu_si128((__m128i *)src_ptr);
1334 | 26.7k | const __m128i data10 =
1335 | 26.7k | _mm_loadu_si128((__m128i *)(src_ptr + src_stride));
1336 | 26.7k | const __m128i data01 = _mm_srli_si128(data00, 1);
1337 | 26.7k | const __m128i data11 = _mm_srli_si128(data10, 1);
1338 | 26.7k | const __m128i reg0 = _mm_avg_epu8(data00, data01);
1339 | 26.7k | const __m128i reg1 = _mm_avg_epu8(data10, data11);
1340 | 26.7k | _mm_storel_epi64((__m128i *)dst, reg0);  // only the low 8 averaged bytes of each row are written
1341 | 26.7k | _mm_storel_epi64((__m128i *)(dst + dst_stride), reg1);
1342 | | 
1343 | 26.7k | src_ptr += 2 * src_stride;
1344 | 26.7k | dst += 2 * dst_stride;
1345 | 26.7k | h -= 2;
1346 | 26.7k | } while (h);
1347 | 6.95k | } else if (w == 16) {
1348 | 20.2k | do {
1349 | 20.2k | const __m128i data00 = _mm_loadu_si128((__m128i *)src_ptr);
1350 | 20.2k | const __m128i data01 = _mm_loadu_si128((__m128i *)(src_ptr + 1));  // neighbour comes from a second load at +1 rather than a byte shift
1351 | 20.2k | const __m128i data10 =
1352 | 20.2k | _mm_loadu_si128((__m128i *)(src_ptr + src_stride));
1353 | 20.2k | const __m128i data11 =
1354 | 20.2k | _mm_loadu_si128((__m128i *)(src_ptr + src_stride + 1));
1355 | 20.2k | const __m128i reg0 = _mm_avg_epu8(data00, data01);
1356 | 20.2k | const __m128i reg1 = _mm_avg_epu8(data10, data11);
1357 | 20.2k | _mm_storeu_si128((__m128i *)dst, reg0);
1358 | 20.2k | _mm_storeu_si128((__m128i *)(dst + dst_stride), reg1);
1359 | | 
1360 | 20.2k | src_ptr += 2 * src_stride;
1361 | 20.2k | dst += 2 * dst_stride;
1362 | 20.2k | h -= 2;
1363 | 20.2k | } while (h);
1364 | 3.25k | } else if (w == 32) {
1365 | 21.5k | do {
1366 | 21.5k | load_avg_store_2tap_32_avx2(src_ptr, dst);
1367 | 21.5k | src_ptr += src_stride;
1368 | 21.5k | dst += dst_stride;
1369 | 21.5k | } while ((--h) > 0);
1370 | 1.00k | } else if (w == 64) {
1371 | 13.5k | do {
1372 | 13.5k | load_avg_store_2tap_32_avx2(src_ptr, dst);
1373 | 13.5k | load_avg_store_2tap_32_avx2(src_ptr + (SECOND_32_BLK),
1374 | 13.5k | dst + (SECOND_32_BLK));
1375 | 13.5k | src_ptr += src_stride;
1376 | 13.5k | dst += dst_stride;
1377 | 13.5k | } while ((--h) > 0);
1378 | 266 | } else {
1379 | 144 | assert(w == 128);
1380 | | 
1381 | 12.8k | do {
1382 | 12.8k | load_avg_store_2tap_32_avx2(src_ptr, dst);
1383 | 12.8k | load_avg_store_2tap_32_avx2(src_ptr + (SECOND_32_BLK),
1384 | 12.8k | dst + (SECOND_32_BLK));
1385 | 12.8k | load_avg_store_2tap_32_avx2(src_ptr + (THIRD_32_BLK),
1386 | 12.8k | dst + (THIRD_32_BLK));
1387 | 12.8k | load_avg_store_2tap_32_avx2(src_ptr + (FOURTH_32_BLK),
1388 | 12.8k | dst + (FOURTH_32_BLK));
1389 | 12.8k | src_ptr += src_stride;
1390 | 12.8k | dst += dst_stride;
1391 | 12.8k | } while ((--h) > 0);
1392 | 144 | }
1393 | 23.5k | }
1394 | 38.4k | }
1395 | 409k | }