/src/aom/aom_dsp/x86/highbd_convolve_avx2.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright (c) 2017, Alliance for Open Media. All rights reserved. |
3 | | * |
4 | | * This source code is subject to the terms of the BSD 2 Clause License and |
5 | | * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
6 | | * was not distributed with this source code in the LICENSE file, you can |
7 | | * obtain it at www.aomedia.org/license/software. If the Alliance for Open |
8 | | * Media Patent License 1.0 was not distributed with this source code in the |
9 | | * PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
10 | | */ |
11 | | #include <immintrin.h> |
12 | | #include <string.h> |
13 | | |
14 | | #include "config/av1_rtcd.h" |
15 | | |
16 | | #include "aom_dsp/x86/convolve.h" |
17 | | #include "aom_dsp/x86/convolve_avx2.h" |
18 | | #include "aom_dsp/x86/synonyms.h" |
19 | | |
20 | | // ----------------------------------------------------------------------------- |
21 | | // Copy and average |
22 | | |
// Byte-shuffle masks for the 4-tap interpolation path.  Each mask is the
// same in both 128-bit lanes and gathers overlapping 16-bit pixel pairs:
// ip_shuffle_f2f3 selects word pairs (0,1),(1,2),(2,3),(3,4);
// ip_shuffle_f4f5 selects word pairs (2,3),(3,4),(4,5),(5,6).
static const uint8_t ip_shuffle_f2f3[32] = { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6,
                                             7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3,
                                             4, 5, 4, 5, 6, 7, 6, 7, 8, 9 };
static const uint8_t ip_shuffle_f4f5[32] = { 4, 5, 6, 7, 6, 7, 8, 9,
                                             8, 9, 10, 11, 10, 11, 12, 13,
                                             4, 5, 6, 7, 6, 7, 8, 9,
                                             8, 9, 10, 11, 10, 11, 12, 13 };
30 | | |
// SSSE3 fallbacks, used below when the filter has 12 taps (the AVX2
// kernels in this file only handle up-to-8-tap filters).
void av1_highbd_convolve_x_sr_ssse3(const uint16_t *src, int src_stride,
                                    uint16_t *dst, int dst_stride, int w, int h,
                                    const InterpFilterParams *filter_params_x,
                                    const int subpel_x_qn,
                                    ConvolveParams *conv_params, int bd);
void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride,
                                    uint16_t *dst, int dst_stride, int w, int h,
                                    const InterpFilterParams *filter_params_y,
                                    const int subpel_y_qn, int bd);
40 | | |
// Vertical-only high-bitdepth sub-pixel convolution (single-reference path).
// Filters the block in 8-pixel-wide columns, producing two output rows per
// inner-loop iteration; results are rounded by FILTER_BITS and clamped to
// the [0, (1 << bd) - 1] pixel range.
void av1_highbd_convolve_y_sr_avx2(const uint16_t *src, int src_stride,
                                   uint16_t *dst, int dst_stride, int w, int h,
                                   const InterpFilterParams *filter_params_y,
                                   const int subpel_y_qn, int bd) {
  // 12-tap filters are not supported here; defer to the SSSE3 kernel.
  if (filter_params_y->taps == 12) {
    av1_highbd_convolve_y_sr_ssse3(src, src_stride, dst, dst_stride, w, h,
                                   filter_params_y, subpel_y_qn, bd);
    return;
  }
  int i, j;
  // Back the source pointer up so the tap window is centered on the output.
  const int fo_vert = filter_params_y->taps / 2 - 1;
  const uint16_t *const src_ptr = src - fo_vert * src_stride;

  // s[0..3]: interleaved row pairs for columns 0..3 (low halves),
  // s[4..7]: the same row pairs for columns 4..7 (high halves).
  __m256i s[8], coeffs_y[4];

  const int bits = FILTER_BITS;

  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
  const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1);
  // Saturation limit: 255 / 1023 / 4095 for bd = 8 / 10 / 12.
  const __m256i clip_pixel =
      _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
  const __m256i zero = _mm256_setzero_si256();

  prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y);

  for (j = 0; j < w; j += 8) {
    const uint16_t *data = &src_ptr[j];
    /* Vertical filter */
    {
      __m256i src6;
      // Each sNM register holds source row N in its low 128-bit lane and
      // row M in its high lane, so one register feeds two output rows.
      __m256i s01 = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(
              _mm_loadu_si128((__m128i *)(data + 0 * src_stride))),
          _mm256_castsi128_si256(
              _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
          0x20);
      __m256i s12 = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(
              _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
          _mm256_castsi128_si256(
              _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
          0x20);
      __m256i s23 = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(
              _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
          _mm256_castsi128_si256(
              _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
          0x20);
      __m256i s34 = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(
              _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
          _mm256_castsi128_si256(
              _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
          0x20);
      __m256i s45 = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(
              _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
          _mm256_castsi128_si256(
              _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
          0x20);
      // src6 is kept across iterations: it becomes the low lane of s67.
      src6 = _mm256_castsi128_si256(
          _mm_loadu_si128((__m128i *)(data + 6 * src_stride)));
      __m256i s56 = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(
              _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
          src6, 0x20);

      // Interleave adjacent row pairs into 16-bit (rowN, rowN+1) pairs so
      // convolve() can use madd on pairs of taps.
      s[0] = _mm256_unpacklo_epi16(s01, s12);
      s[1] = _mm256_unpacklo_epi16(s23, s34);
      s[2] = _mm256_unpacklo_epi16(s45, s56);

      s[4] = _mm256_unpackhi_epi16(s01, s12);
      s[5] = _mm256_unpackhi_epi16(s23, s34);
      s[6] = _mm256_unpackhi_epi16(s45, s56);

      for (i = 0; i < h; i += 2) {
        data = &src_ptr[i * src_stride + j];

        // Rows 7 and 8 complete the tap windows for the two output rows.
        const __m256i s67 = _mm256_permute2x128_si256(
            src6,
            _mm256_castsi128_si256(
                _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
            0x20);

        src6 = _mm256_castsi128_si256(
            _mm_loadu_si128((__m128i *)(data + 8 * src_stride)));

        const __m256i s78 = _mm256_permute2x128_si256(
            _mm256_castsi128_si256(
                _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
            src6, 0x20);

        s[3] = _mm256_unpacklo_epi16(s67, s78);
        s[7] = _mm256_unpackhi_epi16(s67, s78);

        // Columns 0..3 of both output rows.
        const __m256i res_a = convolve(s, coeffs_y);

        __m256i res_a_round = _mm256_sra_epi32(
            _mm256_add_epi32(res_a, round_const_bits), round_shift_bits);

        if (w - j > 4) {
          // Full 8-wide column: also filter columns 4..7, then pack and
          // clamp the 8 results per row.
          const __m256i res_b = convolve(s + 4, coeffs_y);
          __m256i res_b_round = _mm256_sra_epi32(
              _mm256_add_epi32(res_b, round_const_bits), round_shift_bits);

          __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round);
          res_16bit = _mm256_min_epi16(res_16bit, clip_pixel);
          res_16bit = _mm256_max_epi16(res_16bit, zero);

          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j],
                           _mm256_castsi256_si128(res_16bit));
          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
                           _mm256_extracti128_si256(res_16bit, 1));
        } else if (w == 4) {
          // 4-wide block: store four pixels per row.
          res_a_round = _mm256_packs_epi32(res_a_round, res_a_round);
          res_a_round = _mm256_min_epi16(res_a_round, clip_pixel);
          res_a_round = _mm256_max_epi16(res_a_round, zero);

          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j],
                           _mm256_castsi256_si128(res_a_round));
          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
                           _mm256_extracti128_si256(res_a_round, 1));
        } else {
          // 2-wide block: store two pixels (32 bits) per row.
          res_a_round = _mm256_packs_epi32(res_a_round, res_a_round);
          res_a_round = _mm256_min_epi16(res_a_round, clip_pixel);
          res_a_round = _mm256_max_epi16(res_a_round, zero);

          xx_storel_32(&dst[i * dst_stride + j],
                       _mm256_castsi256_si128(res_a_round));
          xx_storel_32(&dst[i * dst_stride + j + dst_stride],
                       _mm256_extracti128_si256(res_a_round, 1));
        }

        // Slide the tap window down two rows for the next iteration.
        s[0] = s[1];
        s[1] = s[2];
        s[2] = s[3];

        s[4] = s[5];
        s[5] = s[6];
        s[6] = s[7];
      }
    }
  }
}
185 | | |
// Horizontal-only high-bitdepth sub-pixel convolution (single-reference
// path).  Two rows are processed per iteration; even and odd output columns
// are filtered separately and re-interleaved.  Rounding is applied in two
// stages (round_0, then FILTER_BITS - round_0) and the result is clamped to
// the [0, (1 << bd) - 1] pixel range.
void av1_highbd_convolve_x_sr_avx2(const uint16_t *src, int src_stride,
                                   uint16_t *dst, int dst_stride, int w, int h,
                                   const InterpFilterParams *filter_params_x,
                                   const int subpel_x_qn,
                                   ConvolveParams *conv_params, int bd) {
  // 12-tap filters are not supported here; defer to the SSSE3 kernel.
  if (filter_params_x->taps == 12) {
    av1_highbd_convolve_x_sr_ssse3(src, src_stride, dst, dst_stride, w, h,
                                   filter_params_x, subpel_x_qn, conv_params,
                                   bd);
    return;
  }
  int i, j;
  // Back the source pointer up so the tap window is centered on the output.
  const int fo_horiz = filter_params_x->taps / 2 - 1;
  const uint16_t *const src_ptr = src - fo_horiz;

  // Check that, even with 12-bit input, the intermediate values will fit
  // into an unsigned 16-bit intermediate array.
  assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);

  __m256i s[4], coeffs_x[4];

  // First-stage rounding (round_0).
  const __m256i round_const_x =
      _mm256_set1_epi32(((1 << conv_params->round_0) >> 1));
  const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0);

  // Second-stage rounding brings the total shift to FILTER_BITS.
  const int bits = FILTER_BITS - conv_params->round_0;
  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
  const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1);
  // Saturation limit: 255 / 1023 / 4095 for bd = 8 / 10 / 12.
  const __m256i clip_pixel =
      _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
  const __m256i zero = _mm256_setzero_si256();

  assert(bits >= 0);
  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));

  prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x);

  for (j = 0; j < w; j += 8) {
    /* Horizontal filter */
    for (i = 0; i < h; i += 2) {
      const __m256i row0 =
          _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]);
      __m256i row1 =
          _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]);

      // r0: low halves of both rows; r1: high halves of both rows.
      const __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20);
      const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31);

      // even pixels
      s[0] = _mm256_alignr_epi8(r1, r0, 0);
      s[1] = _mm256_alignr_epi8(r1, r0, 4);
      s[2] = _mm256_alignr_epi8(r1, r0, 8);
      s[3] = _mm256_alignr_epi8(r1, r0, 12);

      __m256i res_even = convolve(s, coeffs_x);
      res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_x),
                                  round_shift_x);

      // odd pixels
      s[0] = _mm256_alignr_epi8(r1, r0, 2);
      s[1] = _mm256_alignr_epi8(r1, r0, 6);
      s[2] = _mm256_alignr_epi8(r1, r0, 10);
      s[3] = _mm256_alignr_epi8(r1, r0, 14);

      __m256i res_odd = convolve(s, coeffs_x);
      res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_x),
                                 round_shift_x);

      res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_bits),
                                  round_shift_bits);
      res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_bits),
                                 round_shift_bits);

      // Re-interleave even/odd results back into pixel order, then clamp.
      __m256i res_even1 = _mm256_packs_epi32(res_even, res_even);
      __m256i res_odd1 = _mm256_packs_epi32(res_odd, res_odd);

      __m256i res = _mm256_unpacklo_epi16(res_even1, res_odd1);
      res = _mm256_min_epi16(res, clip_pixel);
      res = _mm256_max_epi16(res, zero);

      if (w - j > 4) {
        // Full 8-wide column: one row per 128-bit lane.
        _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j],
                         _mm256_castsi256_si128(res));
        _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
                         _mm256_extracti128_si256(res, 1));
      } else if (w == 4) {
        // 4-wide block: four pixels per row.
        _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j],
                         _mm256_castsi256_si128(res));
        _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
                         _mm256_extracti128_si256(res, 1));
      } else {
        // 2-wide block: two pixels (32 bits) per row.
        xx_storel_32(&dst[i * dst_stride + j], _mm256_castsi256_si128(res));
        xx_storel_32(&dst[i * dst_stride + j + dst_stride],
                     _mm256_extracti128_si256(res, 1));
      }
    }
  }
}
285 | | |
286 | 0 | #define CONV8_ROUNDING_BITS (7) |
287 | | |
288 | | // ----------------------------------------------------------------------------- |
289 | | // Horizontal and vertical filtering |
290 | | |
// Byte-shuffle masks (identical in both 128-bit lanes) that gather
// overlapping 16-bit pixel pairs for the 8-tap horizontal filter:
// signal_pattern_0 -> word pairs (0,1),(1,2),(2,3),(3,4)
static const uint8_t signal_pattern_0[32] = { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6,
                                              7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3,
                                              4, 5, 4, 5, 6, 7, 6, 7, 8, 9 };

// signal_pattern_1 -> word pairs (2,3),(3,4),(4,5),(5,6)
static const uint8_t signal_pattern_1[32] = { 4, 5, 6, 7, 6, 7, 8, 9,
                                              8, 9, 10, 11, 10, 11, 12, 13,
                                              4, 5, 6, 7, 6, 7, 8, 9,
                                              8, 9, 10, 11, 10, 11, 12, 13 };

// signal_pattern_2 -> word pairs (3,4),(4,5),(5,6),(6,7)
static const uint8_t signal_pattern_2[32] = { 6, 7, 8, 9, 8, 9, 10, 11,
                                              10, 11, 12, 13, 12, 13, 14, 15,
                                              6, 7, 8, 9, 8, 9, 10, 11,
                                              10, 11, 12, 13, 12, 13, 14, 15 };

// permutevar8x32 index that rotates dwords 2..5 into both lanes before the
// byte shuffles above are applied.
static const uint32_t signal_index[8] = { 2, 3, 4, 5, 2, 3, 4, 5 };
306 | | |
307 | | // ----------------------------------------------------------------------------- |
308 | | // Horizontal Filtering |
309 | | |
// Expand one register of 16 input pixels into the four pair-interleaved
// signal registers consumed by filter_8x1_pixels().  The existing x0x6-style
// comments name which overlapping pixel-pair groups each output holds.
static inline void pack_pixels(const __m256i *s, __m256i *p /*p[4]*/) {
  const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index);
  const __m256i sf0 = _mm256_loadu_si256((const __m256i *)signal_pattern_0);
  const __m256i sf1 = _mm256_loadu_si256((const __m256i *)signal_pattern_1);
  // c: dwords 2..5 of *s broadcast to both lanes (a 4-pixel rotation).
  const __m256i c = _mm256_permutevar8x32_epi32(*s, idx);

  p[0] = _mm256_shuffle_epi8(*s, sf0);  // x0x6
  p[1] = _mm256_shuffle_epi8(*s, sf1);  // x1x7
  p[2] = _mm256_shuffle_epi8(c, sf0);   // x2x4
  p[3] = _mm256_shuffle_epi8(c, sf1);   // x3x5
}
321 | | |
322 | | // Note: |
323 | | // Shared by 8x2 and 16x1 block |
// Note:
// Shared by 8x2 and 16x1 block
// Packs two pixel registers (two rows, or two 8-pixel halves of one row)
// into the eight signal registers used by two filter_8x1_pixels() calls.
static inline void pack_16_pixels(const __m256i *s0, const __m256i *s1,
                                  __m256i *x /*x[8]*/) {
  __m256i pp[8];
  pack_pixels(s0, pp);
  pack_pixels(s1, &pp[4]);
  // Low lanes of each source pair go to x[0..3], high lanes to x[6..7];
  // x[4]/x[5] reuse x[2]/x[3] (the overlapping middle pair groups).
  x[0] = _mm256_permute2x128_si256(pp[0], pp[4], 0x20);
  x[1] = _mm256_permute2x128_si256(pp[1], pp[5], 0x20);
  x[2] = _mm256_permute2x128_si256(pp[2], pp[6], 0x20);
  x[3] = _mm256_permute2x128_si256(pp[3], pp[7], 0x20);
  x[4] = x[2];
  x[5] = x[3];
  x[6] = _mm256_permute2x128_si256(pp[0], pp[4], 0x31);
  x[7] = _mm256_permute2x128_si256(pp[1], pp[5], 0x31);
}
338 | | |
// Pack a single 8-pixel row into the four signal registers for one
// filter_8x1_pixels() call (used for the odd trailing row of 8x2 blocks).
static inline void pack_8x1_pixels(const uint16_t *src, __m256i *x) {
  __m256i pp[8];
  __m256i s0;
  s0 = _mm256_loadu_si256((const __m256i *)src);
  pack_pixels(&s0, pp);
  // 0x30 selects (low lane of arg0, high lane of arg1) to pair up the
  // complementary pixel groups produced by pack_pixels().
  x[0] = _mm256_permute2x128_si256(pp[0], pp[2], 0x30);
  x[1] = _mm256_permute2x128_si256(pp[1], pp[3], 0x30);
  x[2] = _mm256_permute2x128_si256(pp[2], pp[0], 0x30);
  x[3] = _mm256_permute2x128_si256(pp[3], pp[1], 0x30);
}
349 | | |
// Load two consecutive rows and pack them into the eight signal registers
// used by the 8-tap horizontal filter (one row per 128-bit lane).
static inline void pack_8x2_pixels(const uint16_t *src, ptrdiff_t stride,
                                   __m256i *x) {
  const __m256i row0 = _mm256_loadu_si256((const __m256i *)src);
  const __m256i row1 = _mm256_loadu_si256((const __m256i *)(src + stride));
  pack_16_pixels(&row0, &row1, x);
}
357 | | |
// Load one 16-pixel row as two 8-pixel halves and pack them into the eight
// signal registers used by the 8-tap horizontal filter.
static inline void pack_16x1_pixels(const uint16_t *src, __m256i *x) {
  const __m256i lo_half = _mm256_loadu_si256((const __m256i *)src);
  const __m256i hi_half = _mm256_loadu_si256((const __m256i *)(src + 8));
  pack_16_pixels(&lo_half, &hi_half, x);
}
364 | | |
365 | | // Note: |
366 | | // Shared by horizontal and vertical filtering |
// Note:
// Shared by horizontal and vertical filtering
// Broadcast the eight 16-bit filter taps as four registers of coefficient
// pairs: f[0]={0,1}, f[1]={2,3}, f[2]={4,5}, f[3]={6,7} in every dword.
static inline void pack_filters(const int16_t *filter, __m256i *f /*f[4]*/) {
  const __m128i coeffs = _mm_loadu_si128((const __m128i *)filter);
  const __m256i both_lanes =
      _mm256_insertf128_si256(_mm256_castsi128_si256(coeffs), coeffs, 1);
  f[0] = _mm256_shuffle_epi8(both_lanes, _mm256_set1_epi32(0x03020100));
  f[1] = _mm256_shuffle_epi8(both_lanes, _mm256_set1_epi32(0x07060504));
  f[2] = _mm256_shuffle_epi8(both_lanes, _mm256_set1_epi32(0x0b0a0908));
  f[3] = _mm256_shuffle_epi8(both_lanes, _mm256_set1_epi32(0x0f0e0d0c));
}
379 | | |
// Broadcast the middle taps of an 8-entry filter for the 4-tap path:
// f[0] holds coefficient pair {2,3} and f[1] pair {4,5} in every dword.
static inline void pack_filters_4tap(const int16_t *filter,
                                     __m256i *f /*f[4]*/) {
  const __m128i taps = _mm_loadu_si128((const __m128i *)filter);
  const __m256i all_taps = _mm256_broadcastsi128_si256(taps);

  f[0] = _mm256_shuffle_epi32(all_taps, 0x55);  // coeffs 2 3 2 3 2 3 2 3
  f[1] = _mm256_shuffle_epi32(all_taps, 0xaa);  // coeffs 4 5 4 5 4 5 4 5
}
390 | | |
// Apply the 8-tap filter to one packed signal set: four madd products are
// summed, rounded by CONV8_ROUNDING_BITS, and returned as eight 32-bit
// results in *y.
static inline void filter_8x1_pixels(const __m256i *sig /*sig[4]*/,
                                     const __m256i *fil /*fil[4]*/,
                                     __m256i *y) {
  __m256i a, a0, a1;

  a0 = _mm256_madd_epi16(fil[0], sig[0]);
  a1 = _mm256_madd_epi16(fil[3], sig[3]);
  a = _mm256_add_epi32(a0, a1);

  a0 = _mm256_madd_epi16(fil[1], sig[1]);
  a1 = _mm256_madd_epi16(fil[2], sig[2]);

  // The middle two products are accumulated as min + max, which equals
  // a0 + a1.  NOTE(review): presumably ordered this way to control the
  // intermediate magnitude of the sum — confirm before restructuring.
  {
    const __m256i min = _mm256_min_epi32(a0, a1);
    a = _mm256_add_epi32(a, min);
  }
  {
    const __m256i max = _mm256_max_epi32(a0, a1);
    a = _mm256_add_epi32(a, max);
  }
  // Round to nearest and shift down by CONV8_ROUNDING_BITS.
  {
    const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
    a = _mm256_add_epi32(a, rounding);
    *y = _mm256_srai_epi32(a, CONV8_ROUNDING_BITS);
  }
}
417 | | |
// Pack eight 32-bit filter results to unsigned 16-bit, clamp against *mask,
// and store one 8-pixel row.
static inline void store_8x1_pixels(const __m256i *y, const __m256i *mask,
                                    uint16_t *dst) {
  const __m128i lo = _mm256_castsi256_si128(*y);
  const __m128i hi = _mm256_extractf128_si256(*y, 1);
  __m128i pixels = _mm_packus_epi32(lo, hi);
  pixels = _mm_min_epi16(pixels, _mm256_castsi256_si128(*mask));
  _mm_storeu_si128((__m128i *)dst, pixels);
}
426 | | |
// Pack two rows of 32-bit filter results to unsigned 16-bit, clamp against
// *mask, and store them `pitch` pixels apart.
static inline void store_8x2_pixels(const __m256i *y0, const __m256i *y1,
                                    const __m256i *mask, uint16_t *dst,
                                    ptrdiff_t pitch) {
  __m256i rows = _mm256_packus_epi32(*y0, *y1);
  rows = _mm256_min_epi16(rows, *mask);
  const __m128i first_row = _mm256_castsi256_si128(rows);
  const __m128i second_row = _mm256_extractf128_si256(rows, 1);
  _mm_storeu_si128((__m128i *)dst, first_row);
  _mm_storeu_si128((__m128i *)(dst + pitch), second_row);
}
435 | | |
// Pack sixteen 32-bit filter results to unsigned 16-bit, clamp against
// *mask, and store one 16-pixel row.
static inline void store_16x1_pixels(const __m256i *y0, const __m256i *y1,
                                     const __m256i *mask, uint16_t *dst) {
  __m256i pixels = _mm256_packus_epi32(*y0, *y1);
  pixels = _mm256_min_epi16(pixels, *mask);
  _mm256_storeu_si256((__m256i *)dst, pixels);
}
442 | | |
// 8-wide, 8-tap horizontal filter.  Rows are processed in pairs; an odd
// trailing row is handled separately after the loop.
static void aom_highbd_filter_block1d8_h8_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[8], res0, res1;
  // Clamp limit for the bit depth: (1 << bd) - 1.
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);

  __m256i ff[4];
  pack_filters(filter, ff);

  // Center the 8-tap window on the output pixel.
  src_ptr -= 3;
  do {
    pack_8x2_pixels(src_ptr, src_pitch, signal);
    filter_8x1_pixels(signal, ff, &res0);
    filter_8x1_pixels(&signal[4], ff, &res1);
    store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
    height -= 2;
    src_ptr += src_pitch << 1;
    dst_ptr += dst_pitch << 1;
  } while (height > 1);

  // Odd height: filter the final single row.
  if (height > 0) {
    pack_8x1_pixels(src_ptr, signal);
    filter_8x1_pixels(signal, ff, &res0);
    store_8x1_pixels(&res0, &max, dst_ptr);
  }
}
469 | | |
// 16-wide, 8-tap horizontal filter.  One row per iteration: the 16 outputs
// are produced as two 8-wide filter passes over a 16+8 pixel window.
static void aom_highbd_filter_block1d16_h8_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[8], res0, res1;
  // Clamp limit for the bit depth: (1 << bd) - 1.
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);

  __m256i ff[4];
  pack_filters(filter, ff);

  // Center the 8-tap window on the output pixel.
  src_ptr -= 3;
  do {
    pack_16x1_pixels(src_ptr, signal);
    filter_8x1_pixels(signal, ff, &res0);
    filter_8x1_pixels(&signal[4], ff, &res1);
    store_16x1_pixels(&res0, &res1, &max, dst_ptr);
    height -= 1;
    src_ptr += src_pitch;
    dst_ptr += dst_pitch;
  } while (height > 0);
}
490 | | |
// 4-wide, 4-tap horizontal filter.  Rows are processed in pairs (one row
// per 128-bit lane); an odd trailing row is handled after the loop.  The
// 4-tap coefficients come from entries 2..5 of the 8-entry filter (see
// pack_filters_4tap), so src_ptr -= 3 followed by loads at +2 gives a net
// -1 pixel offset for the tap window.
static void aom_highbd_filter_block1d4_h4_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
  __m256i ff[2], s[2];
  uint32_t i;
  // Clamp limit: 255 / 1023 / 4095 for bd = 8 / 10 / 12.
  const __m256i clip_pixel =
      _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
  const __m256i zero = _mm256_setzero_si256();

  // Gathers overlapping 16-bit pixel pairs (0,1),(1,2),(2,3),(3,4) in each
  // lane (same layout as ip_shuffle_f2f3).
  static const uint8_t shuffle_mask[32] = { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6,
                                            7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3,
                                            4, 5, 4, 5, 6, 7, 6, 7, 8, 9 };

  __m256i mask = _mm256_loadu_si256((__m256i *)shuffle_mask);
  __m256i ip_mask_f2f3 = _mm256_loadu_si256((__m256i *)ip_shuffle_f2f3);
  __m256i ip_mask_f4f5 = _mm256_loadu_si256((__m256i *)ip_shuffle_f4f5);

  pack_filters_4tap(filter, ff);
  src_ptr -= 3;
  for (i = 0; i <= (height - 2); i += 2) {
    // One row per lane.
    __m256i row0 = _mm256_castsi128_si256(
        _mm_loadu_si128((__m128i *)&src_ptr[i * src_pitch + 2]));
    __m256i row1 = _mm256_castsi128_si256(
        _mm_loadu_si128((__m128i *)&src_ptr[(i + 1) * src_pitch + 2]));

    s[0] = _mm256_inserti128_si256(row0, _mm256_castsi256_si128(row1), 1);
    // s[1] is s[0] rotated by two pixels, giving the second coefficient
    // pair's input after the shuffle.
    s[1] = _mm256_alignr_epi8(s[0], s[0], 4);

    s[0] = _mm256_shuffle_epi8(s[0], mask);
    s[1] = _mm256_shuffle_epi8(s[1], mask);

    __m256i res = convolve_4tap(s, ff);
    res =
        _mm256_srai_epi32(_mm256_add_epi32(res, rounding), CONV8_ROUNDING_BITS);

    // Pack to 16 bits and clamp to [0, clip_pixel].
    res = _mm256_packs_epi32(res, res);
    res = _mm256_min_epi16(res, clip_pixel);
    res = _mm256_max_epi16(res, zero);

    _mm_storel_epi64((__m128i *)&dst_ptr[i * dst_pitch],
                     _mm256_castsi256_si128(res));
    _mm_storel_epi64((__m128i *)&dst_ptr[(i + 1) * dst_pitch],
                     _mm256_extracti128_si256(res, 1));
  }
  // Odd height: filter the final single row, splitting its pixels across
  // both lanes via the ip_shuffle masks.
  if (height % 2 != 0) {
    i = height - 1;
    const __m256i row0_0 = _mm256_castsi128_si256(
        _mm_loadu_si128((__m128i *)&src_ptr[i * src_pitch + 2]));
    const __m256i row0_1 = _mm256_castsi128_si256(
        _mm_loadu_si128((__m128i *)&src_ptr[i * src_pitch + 6]));

    const __m256i r0 =
        _mm256_inserti128_si256(row0_0, _mm256_castsi256_si128(row0_1), 1);

    s[0] = _mm256_shuffle_epi8(r0, ip_mask_f2f3);
    s[1] = _mm256_shuffle_epi8(r0, ip_mask_f4f5);

    __m256i res = convolve_4tap(s, ff);
    res =
        _mm256_srai_epi32(_mm256_add_epi32(res, rounding), CONV8_ROUNDING_BITS);

    res = _mm256_packs_epi32(res, res);
    res = _mm256_min_epi16(res, clip_pixel);
    res = _mm256_max_epi16(res, zero);

    _mm_storel_epi64((__m128i *)&dst_ptr[i * dst_pitch],
                     _mm256_castsi256_si128(res));
  }
}
561 | | |
// 8-wide, 4-tap horizontal filter.  Rows are processed in pairs; even and
// odd output columns are filtered separately and re-interleaved by a byte
// shuffle.  The 4-tap coefficients come from entries 2..5 of the 8-entry
// filter (see pack_filters_4tap), so src_ptr -= 3 followed by loads at +2
// gives a net -1 pixel offset for the tap window.
static void aom_highbd_filter_block1d8_h4_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
  __m256i ff[2], s[2];
  uint32_t i = 0;
  // Clamp limit: 255 / 1023 / 4095 for bd = 8 / 10 / 12.
  const __m256i clip_pixel =
      _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
  const __m256i zero = _mm256_setzero_si256();

  // Interleaves the packed even results (words 0..3) with the packed odd
  // results (words 4..7) back into pixel order.
  static const uint8_t shuffle_mask[32] = { 0, 1, 8, 9, 2, 3, 10, 11,
                                            4, 5, 12, 13, 6, 7, 14, 15,
                                            0, 1, 8, 9, 2, 3, 10, 11,
                                            4, 5, 12, 13, 6, 7, 14, 15 };

  __m256i mask = _mm256_loadu_si256((__m256i *)shuffle_mask);
  __m256i ip_mask_f2f3 = _mm256_loadu_si256((__m256i *)ip_shuffle_f2f3);
  __m256i ip_mask_f4f5 = _mm256_loadu_si256((__m256i *)ip_shuffle_f4f5);

  pack_filters_4tap(filter, ff);
  src_ptr -= 3;

  /* Horizontal filter */

  for (i = 0; i <= (height - 2); i += 2) {
    const __m256i row0 =
        _mm256_loadu_si256((__m256i *)&src_ptr[i * src_pitch + 2]);
    __m256i row1 =
        _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_pitch + 2]);

    // r0: low halves of both rows; r1: high halves of both rows.
    const __m256i r0 =
        _mm256_inserti128_si256(row0, _mm256_castsi256_si128(row1), 1);
    const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31);

    // even pixels
    s[0] = r0;
    s[1] = _mm256_alignr_epi8(r1, r0, 4);

    __m256i res_even = convolve_4tap(s, ff);
    res_even = _mm256_srai_epi32(_mm256_add_epi32(res_even, rounding),
                                 CONV8_ROUNDING_BITS);

    // odd pixels
    s[0] = _mm256_alignr_epi8(r1, r0, 2);
    s[1] = _mm256_alignr_epi8(r1, r0, 6);

    __m256i res_odd = convolve_4tap(s, ff);
    res_odd = _mm256_srai_epi32(_mm256_add_epi32(res_odd, rounding),
                                CONV8_ROUNDING_BITS);

    // Merge even/odd results back into pixel order, then clamp.
    __m256i res = _mm256_packs_epi32(res_even, res_odd);
    res = _mm256_shuffle_epi8(res, mask);

    res = _mm256_min_epi16(res, clip_pixel);
    res = _mm256_max_epi16(res, zero);

    _mm_storeu_si128((__m128i *)&dst_ptr[i * dst_pitch],
                     _mm256_castsi256_si128(res));
    _mm_storeu_si128((__m128i *)&dst_ptr[i * dst_pitch + dst_pitch],
                     _mm256_extracti128_si256(res, 1));
  }

  // Odd height: filter the final single row; its 8 outputs land as pixels
  // 0..3 in the low lane and 4..7 in the high lane.
  if (height % 2 != 0) {
    i = height - 1;
    const __m256i row0_0 =
        _mm256_loadu_si256((__m256i *)&src_ptr[i * src_pitch + 2]);
    const __m256i row0_1 =
        _mm256_loadu_si256((__m256i *)&src_ptr[i * src_pitch + 6]);

    const __m256i r0 =
        _mm256_inserti128_si256(row0_0, _mm256_castsi256_si128(row0_1), 1);

    s[0] = _mm256_shuffle_epi8(r0, ip_mask_f2f3);
    s[1] = _mm256_shuffle_epi8(r0, ip_mask_f4f5);

    __m256i res = convolve_4tap(s, ff);
    res =
        _mm256_srai_epi32(_mm256_add_epi32(res, rounding), CONV8_ROUNDING_BITS);

    res = _mm256_packs_epi32(res, res);
    res = _mm256_min_epi16(res, clip_pixel);
    res = _mm256_max_epi16(res, zero);

    _mm_storel_epi64((__m128i *)&dst_ptr[i * dst_pitch],
                     _mm256_castsi256_si128(res));
    _mm_storel_epi64((__m128i *)&dst_ptr[i * dst_pitch + 4],
                     _mm256_extracti128_si256(res, 1));
  }
}
651 | | |
// 16-wide, 4-tap horizontal filter: run the 8-wide kernel over each half of
// the block independently.
static void aom_highbd_filter_block1d16_h4_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  for (int offset = 0; offset <= 8; offset += 8) {
    aom_highbd_filter_block1d8_h4_avx2(src_ptr + offset, src_pitch,
                                       dst_ptr + offset, dst_pitch, height,
                                       filter, bd);
  }
}
660 | | |
661 | | // ----------------------------------------------------------------------------- |
662 | | // 2-tap horizontal filtering |
663 | | |
// Broadcast the bilinear coefficient pair (taps 3 and 4 of the 8-entry
// filter, bytes 6..9) into every dword of *f.
static inline void pack_2t_filter(const int16_t *filter, __m256i *f) {
  const __m128i taps = _mm_loadu_si128((const __m128i *)filter);
  const __m256i both_lanes =
      _mm256_insertf128_si256(_mm256_castsi128_si256(taps), taps, 1);
  f[0] = _mm256_shuffle_epi8(both_lanes, _mm256_set1_epi32(0x09080706));
}
670 | | |
671 | | // can be used by pack_8x2_2t_pixels() and pack_16x1_2t_pixels() |
672 | | // the difference is s0/s1 specifies first and second rows or, |
673 | | // first 16 samples and 8-sample shifted 16 samples |
// Pack two pixel registers into the two signal registers used by the 2-tap
// filter.  sig[0] holds pixel pairs (3,4),(4,5),(5,6),(6,7) of each lane;
// sig[1] holds the same pattern after a 4-pixel rotation (pairs
// (7,8)..(10,11) relative to the original row).
static inline void pack_16_2t_pixels(const __m256i *s0, const __m256i *s1,
                                     __m256i *sig) {
  const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index);
  const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2);
  __m256i x0 = _mm256_shuffle_epi8(*s0, sf2);
  __m256i x1 = _mm256_shuffle_epi8(*s1, sf2);
  // Rotate dwords 2..5 into both lanes before the second shuffle.
  __m256i r0 = _mm256_permutevar8x32_epi32(*s0, idx);
  __m256i r1 = _mm256_permutevar8x32_epi32(*s1, idx);
  r0 = _mm256_shuffle_epi8(r0, sf2);
  r1 = _mm256_shuffle_epi8(r1, sf2);
  sig[0] = _mm256_permute2x128_si256(x0, x1, 0x20);
  sig[1] = _mm256_permute2x128_si256(r0, r1, 0x20);
}
687 | | |
// Load two consecutive rows and pack them for the 2-tap filter (one row per
// 128-bit lane of each signal register).
static inline void pack_8x2_2t_pixels(const uint16_t *src,
                                      const ptrdiff_t pitch, __m256i *sig) {
  const __m256i row0 = _mm256_loadu_si256((const __m256i *)src);
  const __m256i row1 = _mm256_loadu_si256((const __m256i *)(src + pitch));
  pack_16_2t_pixels(&row0, &row1, sig);
}
694 | | |
// Load one 16-pixel row as two 8-pixel-shifted halves and pack them for the
// 2-tap filter.
static inline void pack_16x1_2t_pixels(const uint16_t *src,
                                       __m256i *sig /*sig[2]*/) {
  const __m256i lo_half = _mm256_loadu_si256((const __m256i *)src);
  const __m256i hi_half = _mm256_loadu_si256((const __m256i *)(src + 8));
  pack_16_2t_pixels(&lo_half, &hi_half, sig);
}
701 | | |
// Pack a single 8-pixel row for the 2-tap filter (used for the odd trailing
// row): the direct and 4-pixel-rotated pair patterns are combined into one
// register's two lanes.
static inline void pack_8x1_2t_pixels(const uint16_t *src,
                                      __m256i *sig /*sig[2]*/) {
  const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index);
  const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2);
  __m256i r0 = _mm256_loadu_si256((const __m256i *)src);
  __m256i x0 = _mm256_shuffle_epi8(r0, sf2);
  r0 = _mm256_permutevar8x32_epi32(r0, idx);
  r0 = _mm256_shuffle_epi8(r0, sf2);
  sig[0] = _mm256_permute2x128_si256(x0, r0, 0x20);
}
712 | | |
713 | | // can be used by filter_8x2_2t_pixels() and filter_16x1_2t_pixels() |
714 | | static inline void filter_16_2t_pixels(const __m256i *sig, const __m256i *f, |
715 | 0 | __m256i *y0, __m256i *y1) { |
716 | 0 | const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1)); |
717 | 0 | __m256i x0 = _mm256_madd_epi16(sig[0], *f); |
718 | 0 | __m256i x1 = _mm256_madd_epi16(sig[1], *f); |
719 | 0 | x0 = _mm256_add_epi32(x0, rounding); |
720 | 0 | x1 = _mm256_add_epi32(x1, rounding); |
721 | 0 | *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS); |
722 | 0 | *y1 = _mm256_srai_epi32(x1, CONV8_ROUNDING_BITS); |
723 | 0 | } |
724 | | |
725 | | static inline void filter_8x1_2t_pixels(const __m256i *sig, const __m256i *f, |
726 | 0 | __m256i *y0) { |
727 | 0 | const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1)); |
728 | 0 | __m256i x0 = _mm256_madd_epi16(sig[0], *f); |
729 | 0 | x0 = _mm256_add_epi32(x0, rounding); |
730 | 0 | *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS); |
731 | 0 | } |
732 | | |
// 2-tap horizontal filter over an 8-pixel-wide column, high bit-depth.
// Processes two rows per iteration; a trailing single row covers odd
// heights. Output is clamped to [0, (1 << bd) - 1].
static void aom_highbd_filter_block1d8_h2_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[2], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);

  __m256i ff;
  pack_2t_filter(filter, &ff);

  // Back up 3 samples — presumably the live 2-tap pair sits at indices 3/4
  // of the 8-tap filter array (cf. pack_8x1_2t_filter below); confirm
  // against pack_2t_filter's definition.
  src_ptr -= 3;
  do {
    pack_8x2_2t_pixels(src_ptr, src_pitch, signal);
    filter_16_2t_pixels(signal, &ff, &res0, &res1);
    store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
    height -= 2;
    src_ptr += src_pitch << 1;
    dst_ptr += dst_pitch << 1;
  } while (height > 1);

  // Odd height: one final row remains.
  if (height > 0) {
    pack_8x1_2t_pixels(src_ptr, signal);
    filter_8x1_2t_pixels(signal, &ff, &res0);
    store_8x1_pixels(&res0, &max, dst_ptr);
  }
}
758 | | |
// 2-tap horizontal filter over a 16-pixel-wide column, high bit-depth.
// One row per iteration; output clamped to [0, (1 << bd) - 1].
static void aom_highbd_filter_block1d16_h2_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
  __m256i sig[2], lo, hi, ff;

  pack_2t_filter(filter, &ff);

  // Offset matches the 2-tap pair's position inside the 8-tap array.
  src_ptr -= 3;
  do {
    pack_16x1_2t_pixels(src_ptr, sig);
    filter_16_2t_pixels(sig, &ff, &lo, &hi);
    store_16x1_pixels(&lo, &hi, &max, dst_ptr);
    src_ptr += src_pitch;
    dst_ptr += dst_pitch;
    height -= 1;
  } while (height > 0);
}
778 | | |
779 | | // ----------------------------------------------------------------------------- |
780 | | // Vertical Filtering |
781 | | |
// Preload rows 0..6 of an 8-wide column and interleave consecutive row
// pairs for the 8-tap vertical filter. sig[0..3] hold unpacked-low pairs,
// sig[4..7] unpacked-high pairs; sig[8] caches the newest row so
// pack_8x9_pixels() can extend the window without reloading it.
static void pack_8x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) {
  __m256i s0 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)src));
  __m256i s1 =
      _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src + pitch)));
  __m256i s2 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 2 * pitch)));
  __m256i s3 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 3 * pitch)));
  __m256i s4 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 4 * pitch)));
  __m256i s5 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 5 * pitch)));
  __m256i s6 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 6 * pitch)));

  // Place each row's successor in the upper 128-bit lane so every unpack
  // below pairs row i with row i+1 in both lanes.
  s0 = _mm256_inserti128_si256(s0, _mm256_castsi256_si128(s1), 1);
  s1 = _mm256_inserti128_si256(s1, _mm256_castsi256_si128(s2), 1);
  s2 = _mm256_inserti128_si256(s2, _mm256_castsi256_si128(s3), 1);
  s3 = _mm256_inserti128_si256(s3, _mm256_castsi256_si128(s4), 1);
  s4 = _mm256_inserti128_si256(s4, _mm256_castsi256_si128(s5), 1);
  s5 = _mm256_inserti128_si256(s5, _mm256_castsi256_si128(s6), 1);

  sig[0] = _mm256_unpacklo_epi16(s0, s1);
  sig[4] = _mm256_unpackhi_epi16(s0, s1);
  sig[1] = _mm256_unpacklo_epi16(s2, s3);
  sig[5] = _mm256_unpackhi_epi16(s2, s3);
  sig[2] = _mm256_unpacklo_epi16(s4, s5);
  sig[6] = _mm256_unpackhi_epi16(s4, s5);
  sig[8] = s6;
}
812 | | |
// Extend the 8-wide sliding window with rows 7 and 8 (relative to src),
// reusing the row cached in sig[8] by the previous step.
static inline void pack_8x9_pixels(const uint16_t *src, ptrdiff_t pitch,
                                   __m256i *sig) {
  // base + 7th row
  __m256i s0 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 7 * pitch)));
  // base + 8th row
  __m256i s1 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 8 * pitch)));
  // Pair (row6, row7) and (row7, row8) across the two 128-bit lanes.
  __m256i s2 = _mm256_inserti128_si256(sig[8], _mm256_castsi256_si128(s0), 1);
  __m256i s3 = _mm256_inserti128_si256(s0, _mm256_castsi256_si128(s1), 1);
  sig[3] = _mm256_unpacklo_epi16(s2, s3);
  sig[7] = _mm256_unpackhi_epi16(s2, s3);
  sig[8] = s1;
}
827 | | |
// Run the 8-tap kernel on both halves of the packed window: sig[0..3]
// produce y0 (low pairs), sig[4..7] produce y1 (high pairs).
static inline void filter_8x9_pixels(const __m256i *sig, const __m256i *f,
                                     __m256i *y0, __m256i *y1) {
  filter_8x1_pixels(sig + 0, f, y0);
  filter_8x1_pixels(sig + 4, f, y1);
}
833 | | |
// Slide the packed-row window down by one interleaved pair so the next
// iteration only needs to append two fresh rows.
static inline void update_pixels(__m256i *sig) {
  for (int k = 1; k < 4; ++k) {
    sig[k - 1] = sig[k];
    sig[k + 3] = sig[k + 4];
  }
}
841 | | |
// 8-tap vertical filter over an 8-pixel-wide column, high bit-depth.
// Two output rows per iteration via a 9-row sliding window.
// NOTE(review): height is decremented by 2 with no odd-row tail, so
// callers presumably always pass an even height — confirm at call sites.
static void aom_highbd_filter_block1d8_v8_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[9], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);

  __m256i ff[4];
  pack_filters(filter, ff);

  // Preload rows 0..6 into the window.
  pack_8x9_init(src_ptr, src_pitch, signal);

  do {
    // Append rows 7 and 8 relative to the current src_ptr.
    pack_8x9_pixels(src_ptr, src_pitch, signal);

    filter_8x9_pixels(signal, ff, &res0, &res1);
    store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
    // Advance the window by two rows for the next iteration.
    update_pixels(signal);

    src_ptr += src_pitch << 1;
    dst_ptr += dst_pitch << 1;
    height -= 2;
  } while (height > 0);
}
865 | | |
// Preload rows 0..6 of a 16-wide column and interleave consecutive row
// pairs for the 8-tap vertical filter. sig[0..7] cover the low 8 lanes,
// sig[8..15] the high 8 lanes; sig[16] caches the newest full row for
// pack_16x9_pixels() to extend the window.
static void pack_16x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) {
  __m256i u0, u1, u2, u3;
  // load 0-6 rows
  const __m256i s0 = _mm256_loadu_si256((const __m256i *)src);
  const __m256i s1 = _mm256_loadu_si256((const __m256i *)(src + pitch));
  const __m256i s2 = _mm256_loadu_si256((const __m256i *)(src + 2 * pitch));
  const __m256i s3 = _mm256_loadu_si256((const __m256i *)(src + 3 * pitch));
  const __m256i s4 = _mm256_loadu_si256((const __m256i *)(src + 4 * pitch));
  const __m256i s5 = _mm256_loadu_si256((const __m256i *)(src + 5 * pitch));
  const __m256i s6 = _mm256_loadu_si256((const __m256i *)(src + 6 * pitch));

  u0 = _mm256_permute2x128_si256(s0, s1, 0x20);  // 0, 1 low
  u1 = _mm256_permute2x128_si256(s0, s1, 0x31);  // 0, 1 high

  u2 = _mm256_permute2x128_si256(s1, s2, 0x20);  // 1, 2 low
  u3 = _mm256_permute2x128_si256(s1, s2, 0x31);  // 1, 2 high

  // Interleave rows i and i+1 sample-by-sample for madd-style filtering.
  sig[0] = _mm256_unpacklo_epi16(u0, u2);
  sig[4] = _mm256_unpackhi_epi16(u0, u2);

  sig[8] = _mm256_unpacklo_epi16(u1, u3);
  sig[12] = _mm256_unpackhi_epi16(u1, u3);

  u0 = _mm256_permute2x128_si256(s2, s3, 0x20);
  u1 = _mm256_permute2x128_si256(s2, s3, 0x31);

  u2 = _mm256_permute2x128_si256(s3, s4, 0x20);
  u3 = _mm256_permute2x128_si256(s3, s4, 0x31);

  sig[1] = _mm256_unpacklo_epi16(u0, u2);
  sig[5] = _mm256_unpackhi_epi16(u0, u2);

  sig[9] = _mm256_unpacklo_epi16(u1, u3);
  sig[13] = _mm256_unpackhi_epi16(u1, u3);

  u0 = _mm256_permute2x128_si256(s4, s5, 0x20);
  u1 = _mm256_permute2x128_si256(s4, s5, 0x31);

  u2 = _mm256_permute2x128_si256(s5, s6, 0x20);
  u3 = _mm256_permute2x128_si256(s5, s6, 0x31);

  sig[2] = _mm256_unpacklo_epi16(u0, u2);
  sig[6] = _mm256_unpackhi_epi16(u0, u2);

  sig[10] = _mm256_unpacklo_epi16(u1, u3);
  sig[14] = _mm256_unpackhi_epi16(u1, u3);

  // Cache the last loaded row for the sliding-window step.
  sig[16] = s6;
}
915 | | |
// Extend the 16-wide sliding window with rows 7 and 8 (relative to src),
// reusing the cached previous row in sig[16].
static void pack_16x9_pixels(const uint16_t *src, ptrdiff_t pitch,
                             __m256i *sig) {
  // base + 7th row
  const __m256i s7 = _mm256_loadu_si256((const __m256i *)(src + 7 * pitch));
  // base + 8th row
  const __m256i s8 = _mm256_loadu_si256((const __m256i *)(src + 8 * pitch));

  __m256i u0, u1, u2, u3;
  // Pair (row6, row7) low/high halves...
  u0 = _mm256_permute2x128_si256(sig[16], s7, 0x20);
  u1 = _mm256_permute2x128_si256(sig[16], s7, 0x31);

  // ...and (row7, row8) low/high halves.
  u2 = _mm256_permute2x128_si256(s7, s8, 0x20);
  u3 = _mm256_permute2x128_si256(s7, s8, 0x31);

  sig[3] = _mm256_unpacklo_epi16(u0, u2);
  sig[7] = _mm256_unpackhi_epi16(u0, u2);

  sig[11] = _mm256_unpacklo_epi16(u1, u3);
  sig[15] = _mm256_unpackhi_epi16(u1, u3);

  sig[16] = s8;
}
938 | | |
// Filter the four 8-lane groups of the 16-wide window and repack the
// 32-bit results into two rows of unsigned 16-bit pixels (y0, y1).
static inline void filter_16x9_pixels(const __m256i *sig, const __m256i *f,
                                      __m256i *y0, __m256i *y1) {
  __m256i res[4];
  int i;
  for (i = 0; i < 4; ++i) {
    // Groups start at sig[0], sig[4], sig[8], sig[12].
    filter_8x1_pixels(&sig[i << 2], f, &res[i]);
  }

  {
    const __m256i l0l1 = _mm256_packus_epi32(res[0], res[1]);
    const __m256i h0h1 = _mm256_packus_epi32(res[2], res[3]);
    // Undo the lane interleaving introduced by the 256-bit pack.
    *y0 = _mm256_permute2x128_si256(l0l1, h0h1, 0x20);
    *y1 = _mm256_permute2x128_si256(l0l1, h0h1, 0x31);
  }
}
954 | | |
// Clamp two filtered 16-pixel rows to the bit-depth maximum and store them.
static inline void store_16x2_pixels(const __m256i *y0, const __m256i *y1,
                                     const __m256i *mask, uint16_t *dst,
                                     ptrdiff_t pitch) {
  const __m256i row0 = _mm256_min_epi16(*y0, *mask);
  const __m256i row1 = _mm256_min_epi16(*y1, *mask);
  _mm256_storeu_si256((__m256i *)dst, row0);
  _mm256_storeu_si256((__m256i *)(dst + pitch), row1);
}
963 | | |
// Advance both the low-lane and high-lane halves of the 16-wide window.
static void update_16x9_pixels(__m256i *sig) {
  update_pixels(sig);
  update_pixels(sig + 8);
}
968 | | |
// 8-tap vertical filter over a 16-pixel-wide column, high bit-depth.
// Two output rows per iteration via a 17-slot sliding window.
// NOTE(review): like the 8-wide variant, assumes an even height — confirm.
static void aom_highbd_filter_block1d16_v8_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[17], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);

  __m256i ff[4];
  pack_filters(filter, ff);

  // Preload rows 0..6.
  pack_16x9_init(src_ptr, src_pitch, signal);

  do {
    // Append rows 7 and 8, filter, store, then slide the window.
    pack_16x9_pixels(src_ptr, src_pitch, signal);
    filter_16x9_pixels(signal, ff, &res0, &res1);
    store_16x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
    update_16x9_pixels(signal);

    src_ptr += src_pitch << 1;
    dst_ptr += dst_pitch << 1;
    height -= 2;
  } while (height > 0);
}
991 | | |
// 4-tap vertical filter over a 4-pixel-wide column, high bit-depth.
// Two output rows per iteration; results are rounded by FILTER_BITS and
// clipped to the bit-depth range.
// NOTE(review): loads start at row 2 and taps come from pack_filters_4tap —
// presumably the caller offsets src_ptr so the 4 live taps of the 8-tap
// array align with rows 2..5; confirm against pack_filters_4tap.
static void aom_highbd_filter_block1d4_v4_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  const int bits = FILTER_BITS;

  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
  const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1);
  const __m256i clip_pixel =
      _mm256_set1_epi32(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
  const __m256i zero = _mm256_setzero_si256();
  uint32_t i;
  __m256i s[2], ff[2];

  pack_filters_4tap(filter, ff);

  const uint16_t *data = src_ptr;
  /* Vertical filter */
  {
    // Prime the pipeline with rows 2..4 interleaved as (row, next-row)
    // pairs; s[0] always holds the older pair going into the loop.
    __m128i s2 = _mm_loadl_epi64((__m128i *)(data + 2 * src_pitch));
    __m128i s3 = _mm_loadl_epi64((__m128i *)(data + 3 * src_pitch));

    __m256i s23 = _mm256_inserti128_si256(_mm256_castsi128_si256(s2), s3, 1);

    __m128i s4 = _mm_loadl_epi64((__m128i *)(data + 4 * src_pitch));

    __m256i s34 = _mm256_inserti128_si256(_mm256_castsi128_si256(s3), s4, 1);

    s[0] = _mm256_unpacklo_epi16(s23, s34);

    for (i = 0; i < height; i += 2) {
      data = &src_ptr[i * src_pitch];

      __m128i s5 = _mm_loadl_epi64((__m128i *)(data + 5 * src_pitch));
      __m128i s6 = _mm_loadl_epi64((__m128i *)(data + 6 * src_pitch));

      __m256i s45 = _mm256_inserti128_si256(_mm256_castsi128_si256(s4), s5, 1);
      __m256i s56 = _mm256_inserti128_si256(_mm256_castsi128_si256(s5), s6, 1);

      s[1] = _mm256_unpacklo_epi16(s45, s56);

      const __m256i res_a = convolve_4tap(s, ff);

      // Round and shift the 32-bit accumulators down by FILTER_BITS.
      __m256i res_a_round = _mm256_sra_epi32(
          _mm256_add_epi32(res_a, round_const_bits), round_shift_bits);

      // Clip to [0, max pixel] while still in 32-bit lanes, then narrow.
      __m256i res_16bit = _mm256_min_epi32(res_a_round, clip_pixel);
      res_16bit = _mm256_max_epi32(res_16bit, zero);
      res_16bit = _mm256_packs_epi32(res_16bit, res_16bit);

      // Low lane = first output row, high lane = second output row.
      _mm_storel_epi64((__m128i *)&dst_ptr[i * dst_pitch],
                       _mm256_castsi256_si128(res_16bit));
      _mm_storel_epi64((__m128i *)&dst_ptr[i * dst_pitch + dst_pitch],
                       _mm256_extracti128_si256(res_16bit, 1));

      // Slide the window: newest pair becomes oldest, s6 becomes next s4.
      s[0] = s[1];
      s4 = s6;
    }
  }
}
1051 | | |
// 4-tap vertical filter over an 8-pixel-wide column, high bit-depth.
// Same sliding-window scheme as the 4-wide variant, but keeps both the
// unpacklo (s[0]/s[1]) and unpackhi (s[2]/s[3]) halves of each row pair.
static void aom_highbd_filter_block1d8_v4_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  const int bits = FILTER_BITS;

  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
  const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1);
  const __m256i clip_pixel =
      _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
  const __m256i zero = _mm256_setzero_si256();
  __m256i s[4], ff[2];
  uint32_t i;
  pack_filters_4tap(filter, ff);

  const uint16_t *data = src_ptr;
  /* Vertical filter */
  {
    // Prime the pipeline with rows 2..4 as interleaved (row, next-row)
    // pairs, split into low and high 4-sample halves.
    __m128i s2 = _mm_loadu_si128((__m128i *)(data + 2 * src_pitch));
    __m128i s3 = _mm_loadu_si128((__m128i *)(data + 3 * src_pitch));

    __m256i s23 = _mm256_inserti128_si256(_mm256_castsi128_si256(s2), s3, 1);

    __m128i s4 = _mm_loadu_si128((__m128i *)(data + 4 * src_pitch));

    __m256i s34 = _mm256_inserti128_si256(_mm256_castsi128_si256(s3), s4, 1);

    s[0] = _mm256_unpacklo_epi16(s23, s34);
    s[2] = _mm256_unpackhi_epi16(s23, s34);

    for (i = 0; i < height; i += 2) {
      data = &src_ptr[i * src_pitch];

      __m128i s5 = _mm_loadu_si128((__m128i *)(data + 5 * src_pitch));
      __m128i s6 = _mm_loadu_si128((__m128i *)(data + 6 * src_pitch));

      __m256i s45 = _mm256_inserti128_si256(_mm256_castsi128_si256(s4), s5, 1);
      __m256i s56 = _mm256_inserti128_si256(_mm256_castsi128_si256(s5), s6, 1);

      s[1] = _mm256_unpacklo_epi16(s45, s56);
      s[3] = _mm256_unpackhi_epi16(s45, s56);

      // Low half of both output rows.
      const __m256i res_a = convolve_4tap(s, ff);

      __m256i res_a_round = _mm256_sra_epi32(
          _mm256_add_epi32(res_a, round_const_bits), round_shift_bits);

      // High half of both output rows.
      const __m256i res_b = convolve_4tap(s + 2, ff);
      __m256i res_b_round = _mm256_sra_epi32(
          _mm256_add_epi32(res_b, round_const_bits), round_shift_bits);

      // Narrow to 16 bits, then clip to [0, max pixel].
      __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round);
      res_16bit = _mm256_min_epi16(res_16bit, clip_pixel);
      res_16bit = _mm256_max_epi16(res_16bit, zero);

      // Low lane = first output row, high lane = second output row.
      _mm_storeu_si128((__m128i *)&dst_ptr[i * dst_pitch],
                       _mm256_castsi256_si128(res_16bit));
      _mm_storeu_si128((__m128i *)&dst_ptr[i * dst_pitch + dst_pitch],
                       _mm256_extracti128_si256(res_16bit, 1));

      // Slide the window down two rows.
      s[0] = s[1];
      s[2] = s[3];
      s4 = s6;
    }
  }
}
1117 | | |
// 16-wide 4-tap vertical filter: run the 8-wide kernel on each half of
// the column (samples 0..7, then samples 8..15).
static void aom_highbd_filter_block1d16_v4_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  for (int half = 0; half < 2; ++half) {
    const int off = half * 8;
    aom_highbd_filter_block1d8_v4_avx2(src_ptr + off, src_pitch, dst_ptr + off,
                                       dst_pitch, height, filter, bd);
  }
}
1127 | | |
1128 | | // ----------------------------------------------------------------------------- |
1129 | | // 2-tap vertical filtering |
1130 | | |
// Prime the 16-wide vertical 2-tap pipeline: cache the first row in sig[2].
static void pack_16x2_init(const uint16_t *src, __m256i *sig) {
  sig[2] = _mm256_loadu_si256((const __m256i *)src);
}
1134 | | |
// Interleave the cached row (sig[2]) with the next source row for the
// 2-tap vertical filter, then cache the new row for the next call.
static inline void pack_16x2_2t_pixels(const uint16_t *src, ptrdiff_t pitch,
                                       __m256i *sig) {
  // load the next row
  const __m256i next = _mm256_loadu_si256((const __m256i *)(src + pitch));
  const __m256i prev = sig[2];
  sig[0] = _mm256_unpacklo_epi16(prev, next);
  sig[1] = _mm256_unpackhi_epi16(prev, next);
  sig[2] = next;
}
1143 | | |
// The 16x2 vertical case reuses the generic 2-tap MAC/round kernel.
static inline void filter_16x2_2t_pixels(const __m256i *sig, const __m256i *f,
                                         __m256i *y0, __m256i *y1) {
  filter_16_2t_pixels(sig, f, y0, y1);
}
1148 | | |
// 2-tap vertical filter over a 16-pixel-wide column, high bit-depth.
// One output row per iteration; output clamped to [0, (1 << bd) - 1].
static void aom_highbd_filter_block1d16_v2_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[3], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
  __m256i ff;

  pack_2t_filter(filter, &ff);
  // Cache the first row; each loop iteration pairs it with the next row.
  pack_16x2_init(src_ptr, signal);

  do {
    pack_16x2_2t_pixels(src_ptr, src_pitch, signal);
    filter_16x2_2t_pixels(signal, &ff, &res0, &res1);
    store_16x1_pixels(&res0, &res1, &max, dst_ptr);

    src_ptr += src_pitch;
    dst_ptr += dst_pitch;
    height -= 1;
  } while (height > 0);
}
1169 | | |
// Broadcast the 2-tap pair of the 8-tap filter array to every 32-bit lane.
static inline void pack_8x1_2t_filter(const int16_t *filter, __m128i *f) {
  const __m128i taps = _mm_loadu_si128((const __m128i *)filter);
  // 0x09080706 selects bytes 6..9, i.e. the 16-bit taps at indices 3 and 4.
  f[0] = _mm_shuffle_epi8(taps, _mm_set1_epi32(0x09080706));
}
1175 | | |
// Prime the 8-wide vertical 2-tap pipeline: cache the first row in sig[2].
static void pack_8x2_init(const uint16_t *src, __m128i *sig) {
  sig[2] = _mm_loadu_si128((const __m128i *)src);
}
1179 | | |
// Interleave the cached row (sig[2]) with the next source row for the
// 2-tap vertical filter (SSE width), then cache the new row.
static inline void pack_8x2_2t_pixels_ver(const uint16_t *src, ptrdiff_t pitch,
                                          __m128i *sig) {
  // load the next row
  const __m128i next = _mm_loadu_si128((const __m128i *)(src + pitch));
  const __m128i prev = sig[2];
  sig[0] = _mm_unpacklo_epi16(prev, next);
  sig[1] = _mm_unpackhi_epi16(prev, next);
  sig[2] = next;
}
1188 | | |
1189 | | static inline void filter_8_2t_pixels(const __m128i *sig, const __m128i *f, |
1190 | 0 | __m128i *y0, __m128i *y1) { |
1191 | 0 | const __m128i rounding = _mm_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1)); |
1192 | 0 | __m128i x0 = _mm_madd_epi16(sig[0], *f); |
1193 | 0 | __m128i x1 = _mm_madd_epi16(sig[1], *f); |
1194 | 0 | x0 = _mm_add_epi32(x0, rounding); |
1195 | 0 | x1 = _mm_add_epi32(x1, rounding); |
1196 | 0 | *y0 = _mm_srai_epi32(x0, CONV8_ROUNDING_BITS); |
1197 | 0 | *y1 = _mm_srai_epi32(x1, CONV8_ROUNDING_BITS); |
1198 | 0 | } |
1199 | | |
// Pack two 32-bit result registers to unsigned 16-bit, clamp to the
// bit-depth maximum, and store one 8-pixel row.
static inline void store_8x1_2t_pixels_ver(const __m128i *y0, const __m128i *y1,
                                           const __m128i *mask, uint16_t *dst) {
  const __m128i packed = _mm_packus_epi32(*y0, *y1);
  _mm_storeu_si128((__m128i *)dst, _mm_min_epi16(packed, *mask));
}
1206 | | |
// 2-tap vertical filter over an 8-pixel-wide column, high bit-depth.
// SSE-width implementation; one output row per iteration, output clamped
// to [0, (1 << bd) - 1].
static void aom_highbd_filter_block1d8_v2_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m128i signal[3], res0, res1;
  const __m128i max = _mm_set1_epi16((1 << bd) - 1);
  __m128i ff;

  pack_8x1_2t_filter(filter, &ff);
  // Cache the first row; each loop iteration pairs it with the next row.
  pack_8x2_init(src_ptr, signal);

  do {
    pack_8x2_2t_pixels_ver(src_ptr, src_pitch, signal);
    filter_8_2t_pixels(signal, &ff, &res0, &res1);
    store_8x1_2t_pixels_ver(&res0, &res1, &max, dst_ptr);

    src_ptr += src_pitch;
    dst_ptr += dst_pitch;
    height -= 1;
  } while (height > 0);
}
1227 | | |
1228 | | void aom_highbd_filter_block1d4_h8_sse2(const uint16_t *, ptrdiff_t, uint16_t *, |
1229 | | ptrdiff_t, uint32_t, const int16_t *, |
1230 | | int); |
1231 | | void aom_highbd_filter_block1d4_h2_sse2(const uint16_t *, ptrdiff_t, uint16_t *, |
1232 | | ptrdiff_t, uint32_t, const int16_t *, |
1233 | | int); |
1234 | | void aom_highbd_filter_block1d4_v8_sse2(const uint16_t *, ptrdiff_t, uint16_t *, |
1235 | | ptrdiff_t, uint32_t, const int16_t *, |
1236 | | int); |
1237 | | void aom_highbd_filter_block1d4_v2_sse2(const uint16_t *, ptrdiff_t, uint16_t *, |
1238 | | ptrdiff_t, uint32_t, const int16_t *, |
1239 | | int); |
1240 | 0 | #define aom_highbd_filter_block1d4_h8_avx2 aom_highbd_filter_block1d4_h8_sse2 |
1241 | 0 | #define aom_highbd_filter_block1d4_h2_avx2 aom_highbd_filter_block1d4_h2_sse2 |
1242 | 0 | #define aom_highbd_filter_block1d4_v8_avx2 aom_highbd_filter_block1d4_v8_sse2 |
1243 | 0 | #define aom_highbd_filter_block1d4_v2_avx2 aom_highbd_filter_block1d4_v2_sse2 |
1244 | | |
1245 | | HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2) |
1246 | | HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2) |
1247 | | |
1248 | | #undef HIGHBD_FUNC |