/src/libvpx/vpx_dsp/x86/highbd_convolve_avx2.c
Line | Count | Source
1 | | /* |
2 | | * Copyright (c) 2017 The WebM project authors. All Rights Reserved. |
3 | | * |
4 | | * Use of this source code is governed by a BSD-style license |
5 | | * that can be found in the LICENSE file in the root of the source |
6 | | * tree. An additional intellectual property rights grant can be found |
7 | | * in the file PATENTS. All contributing project authors may |
8 | | * be found in the AUTHORS file in the root of the source tree. |
9 | | */ |
10 | | |
11 | | #include <immintrin.h> |
12 | | #include "./vpx_dsp_rtcd.h" |
13 | | #include "vpx_dsp/x86/convolve.h" |
14 | | #include "vpx_dsp/x86/convolve_avx2.h" |
15 | | |
16 | | // ----------------------------------------------------------------------------- |
17 | | // Copy and average |
18 | | |
19 | | void vpx_highbd_convolve_copy_avx2(const uint16_t *src, ptrdiff_t src_stride, |
20 | | uint16_t *dst, ptrdiff_t dst_stride, |
21 | | const InterpKernel *filter, int x0_q4, |
22 | | int x_step_q4, int y0_q4, int y_step_q4, |
23 | 0 | int w, int h, int bd) { |
24 | 0 | (void)filter; |
25 | 0 | (void)x0_q4; |
26 | 0 | (void)x_step_q4; |
27 | 0 | (void)y0_q4; |
28 | 0 | (void)y_step_q4; |
29 | 0 | (void)bd; |
30 | |
31 | 0 | assert(w % 4 == 0); |
32 | 0 | if (w > 32) { // w = 64 |
33 | 0 | do { |
34 | 0 | const __m256i p0 = _mm256_loadu_si256((const __m256i *)src); |
35 | 0 | const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16)); |
36 | 0 | const __m256i p2 = _mm256_loadu_si256((const __m256i *)(src + 32)); |
37 | 0 | const __m256i p3 = _mm256_loadu_si256((const __m256i *)(src + 48)); |
38 | 0 | src += src_stride; |
39 | 0 | _mm256_storeu_si256((__m256i *)dst, p0); |
40 | 0 | _mm256_storeu_si256((__m256i *)(dst + 16), p1); |
41 | 0 | _mm256_storeu_si256((__m256i *)(dst + 32), p2); |
42 | 0 | _mm256_storeu_si256((__m256i *)(dst + 48), p3); |
43 | 0 | dst += dst_stride; |
44 | 0 | h--; |
45 | 0 | } while (h > 0); |
46 | 0 | } else if (w > 16) { // w = 32 |
47 | 0 | do { |
48 | 0 | const __m256i p0 = _mm256_loadu_si256((const __m256i *)src); |
49 | 0 | const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16)); |
50 | 0 | src += src_stride; |
51 | 0 | _mm256_storeu_si256((__m256i *)dst, p0); |
52 | 0 | _mm256_storeu_si256((__m256i *)(dst + 16), p1); |
53 | 0 | dst += dst_stride; |
54 | 0 | h--; |
55 | 0 | } while (h > 0); |
56 | 0 | } else if (w > 8) { // w = 16 |
57 | 0 | __m256i p0, p1; |
58 | 0 | do { |
59 | 0 | p0 = _mm256_loadu_si256((const __m256i *)src); |
60 | 0 | src += src_stride; |
61 | 0 | p1 = _mm256_loadu_si256((const __m256i *)src); |
62 | 0 | src += src_stride; |
63 | |
64 | 0 | _mm256_storeu_si256((__m256i *)dst, p0); |
65 | 0 | dst += dst_stride; |
66 | 0 | _mm256_storeu_si256((__m256i *)dst, p1); |
67 | 0 | dst += dst_stride; |
68 | 0 | h -= 2; |
69 | 0 | } while (h > 0); |
70 | 0 | } else if (w > 4) { // w = 8 |
71 | 0 | __m128i p0, p1; |
72 | 0 | do { |
73 | 0 | p0 = _mm_loadu_si128((const __m128i *)src); |
74 | 0 | src += src_stride; |
75 | 0 | p1 = _mm_loadu_si128((const __m128i *)src); |
76 | 0 | src += src_stride; |
77 | |
78 | 0 | _mm_storeu_si128((__m128i *)dst, p0); |
79 | 0 | dst += dst_stride; |
80 | 0 | _mm_storeu_si128((__m128i *)dst, p1); |
81 | 0 | dst += dst_stride; |
82 | 0 | h -= 2; |
83 | 0 | } while (h > 0); |
84 | 0 | } else { // w = 4 |
85 | 0 | __m128i p0, p1; |
86 | 0 | do { |
87 | 0 | p0 = _mm_loadl_epi64((const __m128i *)src); |
88 | 0 | src += src_stride; |
89 | 0 | p1 = _mm_loadl_epi64((const __m128i *)src); |
90 | 0 | src += src_stride; |
91 | |
92 | 0 | _mm_storel_epi64((__m128i *)dst, p0); |
93 | 0 | dst += dst_stride; |
94 | 0 | _mm_storel_epi64((__m128i *)dst, p1); |
95 | 0 | dst += dst_stride; |
96 | 0 | h -= 2; |
97 | 0 | } while (h > 0); |
98 | 0 | } |
99 | 0 | } |
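
The width branches above only choose the widest unaligned load/store that fits the row; every path degenerates to a plain row copy. A minimal scalar sketch of the same behavior (function name is illustrative, not part of libvpx; assumes the file's existing includes for uint16_t/ptrdiff_t):

    #include <string.h> /* memcpy */

    static void highbd_copy_rows_c(const uint16_t *src, ptrdiff_t src_stride,
                                   uint16_t *dst, ptrdiff_t dst_stride,
                                   int w, int h) {
      int r;
      for (r = 0; r < h; ++r) {
        memcpy(dst, src, w * sizeof(*src)); /* copy one row of w 16-bit pixels */
        src += src_stride;
        dst += dst_stride;
      }
    }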
100 | | |
101 | | void vpx_highbd_convolve_avg_avx2(const uint16_t *src, ptrdiff_t src_stride, |
102 | | uint16_t *dst, ptrdiff_t dst_stride, |
103 | | const InterpKernel *filter, int x0_q4, |
104 | | int x_step_q4, int y0_q4, int y_step_q4, |
105 | 0 | int w, int h, int bd) { |
106 | 0 | (void)filter; |
107 | 0 | (void)x0_q4; |
108 | 0 | (void)x_step_q4; |
109 | 0 | (void)y0_q4; |
110 | 0 | (void)y_step_q4; |
111 | 0 | (void)bd; |
112 | |
113 | 0 | assert(w % 4 == 0); |
114 | 0 | if (w > 32) { // w = 64 |
115 | 0 | __m256i p0, p1, p2, p3, u0, u1, u2, u3; |
116 | 0 | do { |
117 | 0 | p0 = _mm256_loadu_si256((const __m256i *)src); |
118 | 0 | p1 = _mm256_loadu_si256((const __m256i *)(src + 16)); |
119 | 0 | p2 = _mm256_loadu_si256((const __m256i *)(src + 32)); |
120 | 0 | p3 = _mm256_loadu_si256((const __m256i *)(src + 48)); |
121 | 0 | src += src_stride; |
122 | 0 | u0 = _mm256_loadu_si256((const __m256i *)dst); |
123 | 0 | u1 = _mm256_loadu_si256((const __m256i *)(dst + 16)); |
124 | 0 | u2 = _mm256_loadu_si256((const __m256i *)(dst + 32)); |
125 | 0 | u3 = _mm256_loadu_si256((const __m256i *)(dst + 48)); |
126 | 0 | _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0)); |
127 | 0 | _mm256_storeu_si256((__m256i *)(dst + 16), _mm256_avg_epu16(p1, u1)); |
128 | 0 | _mm256_storeu_si256((__m256i *)(dst + 32), _mm256_avg_epu16(p2, u2)); |
129 | 0 | _mm256_storeu_si256((__m256i *)(dst + 48), _mm256_avg_epu16(p3, u3)); |
130 | 0 | dst += dst_stride; |
131 | 0 | h--; |
132 | 0 | } while (h > 0); |
133 | 0 | } else if (w > 16) { // w = 32 |
134 | 0 | __m256i p0, p1, u0, u1; |
135 | 0 | do { |
136 | 0 | p0 = _mm256_loadu_si256((const __m256i *)src); |
137 | 0 | p1 = _mm256_loadu_si256((const __m256i *)(src + 16)); |
138 | 0 | src += src_stride; |
139 | 0 | u0 = _mm256_loadu_si256((const __m256i *)dst); |
140 | 0 | u1 = _mm256_loadu_si256((const __m256i *)(dst + 16)); |
141 | 0 | _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0)); |
142 | 0 | _mm256_storeu_si256((__m256i *)(dst + 16), _mm256_avg_epu16(p1, u1)); |
143 | 0 | dst += dst_stride; |
144 | 0 | h--; |
145 | 0 | } while (h > 0); |
146 | 0 | } else if (w > 8) { // w = 16 |
147 | 0 | __m256i p0, p1, u0, u1; |
148 | 0 | do { |
149 | 0 | p0 = _mm256_loadu_si256((const __m256i *)src); |
150 | 0 | p1 = _mm256_loadu_si256((const __m256i *)(src + src_stride)); |
151 | 0 | src += src_stride << 1; |
152 | 0 | u0 = _mm256_loadu_si256((const __m256i *)dst); |
153 | 0 | u1 = _mm256_loadu_si256((const __m256i *)(dst + dst_stride)); |
154 | |
155 | 0 | _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0)); |
156 | 0 | _mm256_storeu_si256((__m256i *)(dst + dst_stride), |
157 | 0 | _mm256_avg_epu16(p1, u1)); |
158 | 0 | dst += dst_stride << 1; |
159 | 0 | h -= 2; |
160 | 0 | } while (h > 0); |
161 | 0 | } else if (w > 4) { // w = 8 |
162 | 0 | __m128i p0, p1, u0, u1; |
163 | 0 | do { |
164 | 0 | p0 = _mm_loadu_si128((const __m128i *)src); |
165 | 0 | p1 = _mm_loadu_si128((const __m128i *)(src + src_stride)); |
166 | 0 | src += src_stride << 1; |
167 | 0 | u0 = _mm_loadu_si128((const __m128i *)dst); |
168 | 0 | u1 = _mm_loadu_si128((const __m128i *)(dst + dst_stride)); |
169 | |
170 | 0 | _mm_storeu_si128((__m128i *)dst, _mm_avg_epu16(p0, u0)); |
171 | 0 | _mm_storeu_si128((__m128i *)(dst + dst_stride), _mm_avg_epu16(p1, u1)); |
172 | 0 | dst += dst_stride << 1; |
173 | 0 | h -= 2; |
174 | 0 | } while (h > 0); |
175 | 0 | } else { // w = 4 |
176 | 0 | __m128i p0, p1, u0, u1; |
177 | 0 | do { |
178 | 0 | p0 = _mm_loadl_epi64((const __m128i *)src); |
179 | 0 | p1 = _mm_loadl_epi64((const __m128i *)(src + src_stride)); |
180 | 0 | src += src_stride << 1; |
181 | 0 | u0 = _mm_loadl_epi64((const __m128i *)dst); |
182 | 0 | u1 = _mm_loadl_epi64((const __m128i *)(dst + dst_stride)); |
183 | |
184 | 0 | _mm_storel_epi64((__m128i *)dst, _mm_avg_epu16(u0, p0)); |
185 | 0 | _mm_storel_epi64((__m128i *)(dst + dst_stride), _mm_avg_epu16(u1, p1)); |
186 | 0 | dst += dst_stride << 1; |
187 | 0 | h -= 2; |
188 | 0 | } while (h > 0); |
189 | 0 | } |
190 | 0 | } |
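
_mm256_avg_epu16/_mm_avg_epu16 compute the per-lane rounded average (a + b + 1) >> 1 without intermediate overflow, so the whole function reduces to this scalar sketch (illustrative name):

    static void highbd_avg_rows_c(const uint16_t *src, ptrdiff_t src_stride,
                                  uint16_t *dst, ptrdiff_t dst_stride,
                                  int w, int h) {
      int r, c;
      for (r = 0; r < h; ++r) {
        for (c = 0; c < w; ++c)
          dst[c] = (uint16_t)((dst[c] + src[c] + 1) >> 1); /* rounded average */
        src += src_stride;
        dst += dst_stride;
      }
    }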
191 | | |
192 | | // ----------------------------------------------------------------------------- |
193 | | // Horizontal and vertical filtering |
194 | | |
195 | | static const uint8_t signal_pattern_0[32] = { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, |
196 | | 7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3, |
197 | | 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 }; |
198 | | |
199 | | static const uint8_t signal_pattern_1[32] = { 4, 5, 6, 7, 6, 7, 8, 9, |
200 | | 8, 9, 10, 11, 10, 11, 12, 13, |
201 | | 4, 5, 6, 7, 6, 7, 8, 9, |
202 | | 8, 9, 10, 11, 10, 11, 12, 13 }; |
203 | | |
204 | | static const uint8_t signal_pattern_2[32] = { 6, 7, 8, 9, 8, 9, 10, 11, |
205 | | 10, 11, 12, 13, 12, 13, 14, 15, |
206 | | 6, 7, 8, 9, 8, 9, 10, 11, |
207 | | 10, 11, 12, 13, 12, 13, 14, 15 }; |
208 | | |
209 | | static const uint32_t signal_index[8] = { 2, 3, 4, 5, 2, 3, 4, 5 }; |
210 | | |
211 | 0 | #define CONV8_ROUNDING_BITS (7) |
212 | 0 | #define CONV8_ROUNDING_NUM (1 << (CONV8_ROUNDING_BITS - 1)) |
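
The 8-tap kernels sum to 128 (1 << 7), so a filtered sample is the tap sum divided by 128 with rounding: add half the divisor, then shift. A worked example of the idiom used throughout this file:

    static int conv8_round(int sum) {
      /* round-half-up division by 128 via add-and-shift */
      return (sum + CONV8_ROUNDING_NUM) >> CONV8_ROUNDING_BITS;
    }
    /* conv8_round(12345) == (12345 + 64) >> 7 == 96 */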
213 | | |
214 | | // ----------------------------------------------------------------------------- |
215 | | // Horizontal Filtering |
216 | | |
217 | 0 | static INLINE void pack_pixels(const __m256i *s, __m256i *p /*p[4]*/) { |
218 | 0 | const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index); |
219 | 0 | const __m256i sf0 = _mm256_loadu_si256((const __m256i *)signal_pattern_0); |
220 | 0 | const __m256i sf1 = _mm256_loadu_si256((const __m256i *)signal_pattern_1); |
221 | 0 | const __m256i c = _mm256_permutevar8x32_epi32(*s, idx); |
222 | |
223 | 0 | p[0] = _mm256_shuffle_epi8(*s, sf0); // x0x6 |
224 | 0 | p[1] = _mm256_shuffle_epi8(*s, sf1); // x1x7 |
225 | 0 | p[2] = _mm256_shuffle_epi8(c, sf0); // x2x4 |
226 | 0 | p[3] = _mm256_shuffle_epi8(c, sf1); // x3x5 |
227 | 0 | } |
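
Because the source is 16-bit, the byte patterns above move pixel pairs: each output word pairs a pixel with its right-hand neighbor, so a single _mm256_madd_epi16 against a duplicated tap pair yields x[i]*k[2j] + x[i+1]*k[2j+1] in every 32-bit lane. What one lane computes, as a scalar sketch:

    static int32_t madd_pair_c(uint16_t x0, uint16_t x1, int16_t k0,
                               int16_t k1) {
      /* one 32-bit lane of _mm256_madd_epi16 on a (pixel, pixel) x (tap, tap)
       * pairing */
      return (int32_t)x0 * k0 + (int32_t)x1 * k1;
    }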
228 | | |
229 | | // Note: |
230 | | // Shared by 8x2 and 16x1 blocks |
231 | | static INLINE void pack_16_pixels(const __m256i *s0, const __m256i *s1, |
232 | 0 | __m256i *x /*x[8]*/) { |
233 | 0 | __m256i pp[8]; |
234 | 0 | pack_pixels(s0, pp); |
235 | 0 | pack_pixels(s1, &pp[4]); |
236 | 0 | x[0] = _mm256_permute2x128_si256(pp[0], pp[4], 0x20); |
237 | 0 | x[1] = _mm256_permute2x128_si256(pp[1], pp[5], 0x20); |
238 | 0 | x[2] = _mm256_permute2x128_si256(pp[2], pp[6], 0x20); |
239 | 0 | x[3] = _mm256_permute2x128_si256(pp[3], pp[7], 0x20); |
240 | 0 | x[4] = x[2]; |
241 | 0 | x[5] = x[3]; |
242 | 0 | x[6] = _mm256_permute2x128_si256(pp[0], pp[4], 0x31); |
243 | 0 | x[7] = _mm256_permute2x128_si256(pp[1], pp[5], 0x31); |
244 | 0 | } |
245 | | |
246 | 0 | static INLINE void pack_8x1_pixels(const uint16_t *src, __m256i *x) { |
247 | 0 | __m256i pp[8]; |
248 | 0 | __m256i s0; |
249 | 0 | s0 = _mm256_loadu_si256((const __m256i *)src); |
250 | 0 | pack_pixels(&s0, pp); |
251 | 0 | x[0] = _mm256_permute2x128_si256(pp[0], pp[2], 0x30); |
252 | 0 | x[1] = _mm256_permute2x128_si256(pp[1], pp[3], 0x30); |
253 | 0 | x[2] = _mm256_permute2x128_si256(pp[2], pp[0], 0x30); |
254 | 0 | x[3] = _mm256_permute2x128_si256(pp[3], pp[1], 0x30); |
255 | 0 | } |
256 | | |
257 | | static INLINE void pack_8x2_pixels(const uint16_t *src, ptrdiff_t stride, |
258 | 0 | __m256i *x) { |
259 | 0 | __m256i s0, s1; |
260 | 0 | s0 = _mm256_loadu_si256((const __m256i *)src); |
261 | 0 | s1 = _mm256_loadu_si256((const __m256i *)(src + stride)); |
262 | 0 | pack_16_pixels(&s0, &s1, x); |
263 | 0 | } |
264 | | |
265 | 0 | static INLINE void pack_16x1_pixels(const uint16_t *src, __m256i *x) { |
266 | 0 | __m256i s0, s1; |
267 | 0 | s0 = _mm256_loadu_si256((const __m256i *)src); |
268 | 0 | s1 = _mm256_loadu_si256((const __m256i *)(src + 8)); |
269 | 0 | pack_16_pixels(&s0, &s1, x); |
270 | 0 | } |
271 | | |
272 | | // Note: |
273 | | // Shared by horizontal and vertical filtering |
274 | 0 | static INLINE void pack_filters(const int16_t *filter, __m256i *f /*f[4]*/) { |
275 | 0 | const __m128i h = _mm_loadu_si128((const __m128i *)filter); |
276 | 0 | const __m256i hh = _mm256_insertf128_si256(_mm256_castsi128_si256(h), h, 1); |
277 | 0 | const __m256i p0 = _mm256_set1_epi32(0x03020100); |
278 | 0 | const __m256i p1 = _mm256_set1_epi32(0x07060504); |
279 | 0 | const __m256i p2 = _mm256_set1_epi32(0x0b0a0908); |
280 | 0 | const __m256i p3 = _mm256_set1_epi32(0x0f0e0d0c); |
281 | 0 | f[0] = _mm256_shuffle_epi8(hh, p0); |
282 | 0 | f[1] = _mm256_shuffle_epi8(hh, p1); |
283 | 0 | f[2] = _mm256_shuffle_epi8(hh, p2); |
284 | 0 | f[3] = _mm256_shuffle_epi8(hh, p3); |
285 | 0 | } |
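
pack_filters() broadcasts each adjacent pair of 16-bit taps into every 32-bit lane: the constants p0..p3 replicate bytes 0-3, 4-7, 8-11 and 12-15 of the tap array. A scalar model of the resulting layout (illustration only):

    static void pack_filters_c(const int16_t *filter, int16_t f[4][2]) {
      int j;
      for (j = 0; j < 4; ++j) {
        f[j][0] = filter[2 * j];     /* even tap of pair j */
        f[j][1] = filter[2 * j + 1]; /* odd tap of pair j */
      }
    }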
286 | | |
287 | | static INLINE void filter_8x1_pixels(const __m256i *sig /*sig[4]*/, |
288 | | const __m256i *fil /*fil[4]*/, |
289 | 0 | __m256i *y) { |
290 | 0 | __m256i a, a0, a1; |
291 | |
292 | 0 | a0 = _mm256_madd_epi16(fil[0], sig[0]); |
293 | 0 | a1 = _mm256_madd_epi16(fil[3], sig[3]); |
294 | 0 | a = _mm256_add_epi32(a0, a1); |
295 | |
296 | 0 | a0 = _mm256_madd_epi16(fil[1], sig[1]); |
297 | 0 | a1 = _mm256_madd_epi16(fil[2], sig[2]); |
298 | |
299 | 0 | { |
300 | 0 | const __m256i min = _mm256_min_epi32(a0, a1); |
301 | 0 | a = _mm256_add_epi32(a, min); |
302 | 0 | } |
303 | 0 | { |
304 | 0 | const __m256i max = _mm256_max_epi32(a0, a1); |
305 | 0 | a = _mm256_add_epi32(a, max); |
306 | 0 | } |
307 | 0 | { |
308 | 0 | const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1)); |
309 | 0 | a = _mm256_add_epi32(a, rounding); |
310 | 0 | *y = _mm256_srai_epi32(a, CONV8_ROUNDING_BITS); |
311 | 0 | } |
312 | 0 | } |
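
Since min(a0, a1) + max(a0, a1) == a0 + a1, the min/max blocks simply add both middle partial sums; the split is an ordering detail and does not change the result. Per output pixel the routine is a rounded 8-tap dot product, as in this scalar sketch:

    static int32_t filter8_c(const uint16_t *s, const int16_t *k) {
      int32_t sum = 0;
      int i;
      for (i = 0; i < 8; ++i) sum += (int32_t)s[i] * k[i]; /* 8-tap dot product */
      return (sum + CONV8_ROUNDING_NUM) >> CONV8_ROUNDING_BITS;
    }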
313 | | |
314 | | static INLINE void store_8x1_pixels(const __m256i *y, const __m256i *mask, |
315 | 0 | uint16_t *dst) { |
316 | 0 | const __m128i a0 = _mm256_castsi256_si128(*y); |
317 | 0 | const __m128i a1 = _mm256_extractf128_si256(*y, 1); |
318 | 0 | __m128i res = _mm_packus_epi32(a0, a1); |
319 | 0 | res = _mm_min_epi16(res, _mm256_castsi256_si128(*mask)); |
320 | 0 | _mm_storeu_si128((__m128i *)dst, res); |
321 | 0 | } |
322 | | |
323 | | static INLINE void store_8x2_pixels(const __m256i *y0, const __m256i *y1, |
324 | | const __m256i *mask, uint16_t *dst, |
325 | 0 | ptrdiff_t pitch) { |
326 | 0 | __m256i a = _mm256_packus_epi32(*y0, *y1); |
327 | 0 | a = _mm256_min_epi16(a, *mask); |
328 | 0 | _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(a)); |
329 | 0 | _mm_storeu_si128((__m128i *)(dst + pitch), _mm256_extractf128_si256(a, 1)); |
330 | 0 | } |
331 | | |
332 | | static INLINE void store_16x1_pixels(const __m256i *y0, const __m256i *y1, |
333 | 0 | const __m256i *mask, uint16_t *dst) { |
334 | 0 | __m256i a = _mm256_packus_epi32(*y0, *y1); |
335 | 0 | a = _mm256_min_epi16(a, *mask); |
336 | 0 | _mm256_storeu_si256((__m256i *)dst, a); |
337 | 0 | } |
338 | | |
339 | | static void vpx_highbd_filter_block1d8_h8_avx2( |
340 | | const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, |
341 | 0 | ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { |
342 | 0 | __m256i signal[8], res0, res1; |
343 | 0 | const __m256i max = _mm256_set1_epi16((1 << bd) - 1); |
344 | |
345 | 0 | __m256i ff[4]; |
346 | 0 | pack_filters(filter, ff); |
347 | |
348 | 0 | src_ptr -= 3; |
349 | 0 | do { |
350 | 0 | pack_8x2_pixels(src_ptr, src_pitch, signal); |
351 | 0 | filter_8x1_pixels(signal, ff, &res0); |
352 | 0 | filter_8x1_pixels(&signal[4], ff, &res1); |
353 | 0 | store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); |
354 | 0 | height -= 2; |
355 | 0 | src_ptr += src_pitch << 1; |
356 | 0 | dst_ptr += dst_pitch << 1; |
357 | 0 | } while (height > 1); |
358 | |
359 | 0 | if (height > 0) { |
360 | 0 | pack_8x1_pixels(src_ptr, signal); |
361 | 0 | filter_8x1_pixels(signal, ff, &res0); |
362 | 0 | store_8x1_pixels(&res0, &max, dst_ptr); |
363 | 0 | } |
364 | 0 | } |
365 | | |
366 | | static void vpx_highbd_filter_block1d16_h8_avx2( |
367 | | const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, |
368 | 0 | ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { |
369 | 0 | __m256i signal[8], res0, res1; |
370 | 0 | const __m256i max = _mm256_set1_epi16((1 << bd) - 1); |
371 | |
372 | 0 | __m256i ff[4]; |
373 | 0 | pack_filters(filter, ff); |
374 | |
375 | 0 | src_ptr -= 3; |
376 | 0 | do { |
377 | 0 | pack_16x1_pixels(src_ptr, signal); |
378 | 0 | filter_8x1_pixels(signal, ff, &res0); |
379 | 0 | filter_8x1_pixels(&signal[4], ff, &res1); |
380 | 0 | store_16x1_pixels(&res0, &res1, &max, dst_ptr); |
381 | 0 | height -= 1; |
382 | 0 | src_ptr += src_pitch; |
383 | 0 | dst_ptr += dst_pitch; |
384 | 0 | } while (height > 0); |
385 | 0 | } |
386 | | |
387 | | // ----------------------------------------------------------------------------- |
388 | | // 2-tap horizontal filtering |
389 | | |
390 | 0 | static INLINE void pack_2t_filter(const int16_t *filter, __m256i *f) { |
391 | 0 | const __m128i h = _mm_loadu_si128((const __m128i *)filter); |
392 | 0 | const __m256i hh = _mm256_insertf128_si256(_mm256_castsi128_si256(h), h, 1); |
393 | 0 | const __m256i p = _mm256_set1_epi32(0x09080706); |
394 | 0 | f[0] = _mm256_shuffle_epi8(hh, p); |
395 | 0 | } |
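
The shuffle constant 0x09080706 selects bytes 6-9 of the 8-tap array, i.e. the 16-bit taps filter[3] and filter[4], the only nonzero taps of a bilinear ("2t") kernel. A scalar model:

    static int32_t filter_2t_c(uint16_t s0, uint16_t s1,
                               const int16_t *filter) {
      /* only the two middle taps of the 8-tap array are nonzero */
      const int32_t sum = (int32_t)s0 * filter[3] + (int32_t)s1 * filter[4];
      return (sum + CONV8_ROUNDING_NUM) >> CONV8_ROUNDING_BITS;
    }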
396 | | |
397 | | // Can be used by pack_8x2_2t_pixels() and pack_16x1_2t_pixels(); the |
398 | | // difference is whether s0/s1 hold the first and second rows, or the |
399 | | // first 16 samples and the same 16 samples shifted by 8. |
400 | | static INLINE void pack_16_2t_pixels(const __m256i *s0, const __m256i *s1, |
401 | 0 | __m256i *sig) { |
402 | 0 | const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index); |
403 | 0 | const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2); |
404 | 0 | __m256i x0 = _mm256_shuffle_epi8(*s0, sf2); |
405 | 0 | __m256i x1 = _mm256_shuffle_epi8(*s1, sf2); |
406 | 0 | __m256i r0 = _mm256_permutevar8x32_epi32(*s0, idx); |
407 | 0 | __m256i r1 = _mm256_permutevar8x32_epi32(*s1, idx); |
408 | 0 | r0 = _mm256_shuffle_epi8(r0, sf2); |
409 | 0 | r1 = _mm256_shuffle_epi8(r1, sf2); |
410 | 0 | sig[0] = _mm256_permute2x128_si256(x0, x1, 0x20); |
411 | 0 | sig[1] = _mm256_permute2x128_si256(r0, r1, 0x20); |
412 | 0 | } |
413 | | |
414 | | static INLINE void pack_8x2_2t_pixels(const uint16_t *src, |
415 | 0 | const ptrdiff_t pitch, __m256i *sig) { |
416 | 0 | const __m256i r0 = _mm256_loadu_si256((const __m256i *)src); |
417 | 0 | const __m256i r1 = _mm256_loadu_si256((const __m256i *)(src + pitch)); |
418 | 0 | pack_16_2t_pixels(&r0, &r1, sig); |
419 | 0 | } |
420 | | |
421 | | static INLINE void pack_16x1_2t_pixels(const uint16_t *src, |
422 | 0 | __m256i *sig /*sig[2]*/) { |
423 | 0 | const __m256i r0 = _mm256_loadu_si256((const __m256i *)src); |
424 | 0 | const __m256i r1 = _mm256_loadu_si256((const __m256i *)(src + 8)); |
425 | 0 | pack_16_2t_pixels(&r0, &r1, sig); |
426 | 0 | } |
427 | | |
428 | | static INLINE void pack_8x1_2t_pixels(const uint16_t *src, |
429 | 0 | __m256i *sig /*sig[2]*/) { |
430 | 0 | const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index); |
431 | 0 | const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2); |
432 | 0 | __m256i r0 = _mm256_loadu_si256((const __m256i *)src); |
433 | 0 | __m256i x0 = _mm256_shuffle_epi8(r0, sf2); |
434 | 0 | r0 = _mm256_permutevar8x32_epi32(r0, idx); |
435 | 0 | r0 = _mm256_shuffle_epi8(r0, sf2); |
436 | 0 | sig[0] = _mm256_permute2x128_si256(x0, r0, 0x20); |
437 | 0 | } |
438 | | |
439 | | // can be used by filter_8x2_2t_pixels() and filter_16x1_2t_pixels() |
440 | | static INLINE void filter_16_2t_pixels(const __m256i *sig, const __m256i *f, |
441 | 0 | __m256i *y0, __m256i *y1) { |
442 | 0 | const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1)); |
443 | 0 | __m256i x0 = _mm256_madd_epi16(sig[0], *f); |
444 | 0 | __m256i x1 = _mm256_madd_epi16(sig[1], *f); |
445 | 0 | x0 = _mm256_add_epi32(x0, rounding); |
446 | 0 | x1 = _mm256_add_epi32(x1, rounding); |
447 | 0 | *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS); |
448 | 0 | *y1 = _mm256_srai_epi32(x1, CONV8_ROUNDING_BITS); |
449 | 0 | } |
450 | | |
451 | | static INLINE void filter_8x1_2t_pixels(const __m256i *sig, const __m256i *f, |
452 | 0 | __m256i *y0) { |
453 | 0 | const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1)); |
454 | 0 | __m256i x0 = _mm256_madd_epi16(sig[0], *f); |
455 | 0 | x0 = _mm256_add_epi32(x0, rounding); |
456 | 0 | *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS); |
457 | 0 | } |
458 | | |
459 | | static void vpx_highbd_filter_block1d8_h2_avx2( |
460 | | const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, |
461 | 0 | ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { |
462 | 0 | __m256i signal[2], res0, res1; |
463 | 0 | const __m256i max = _mm256_set1_epi16((1 << bd) - 1); |
464 | |
465 | 0 | __m256i ff; |
466 | 0 | pack_2t_filter(filter, &ff); |
467 | |
468 | 0 | src_ptr -= 3; |
469 | 0 | do { |
470 | 0 | pack_8x2_2t_pixels(src_ptr, src_pitch, signal); |
471 | 0 | filter_16_2t_pixels(signal, &ff, &res0, &res1); |
472 | 0 | store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); |
473 | 0 | height -= 2; |
474 | 0 | src_ptr += src_pitch << 1; |
475 | 0 | dst_ptr += dst_pitch << 1; |
476 | 0 | } while (height > 1); |
477 | |
478 | 0 | if (height > 0) { |
479 | 0 | pack_8x1_2t_pixels(src_ptr, signal); |
480 | 0 | filter_8x1_2t_pixels(signal, &ff, &res0); |
481 | 0 | store_8x1_pixels(&res0, &max, dst_ptr); |
482 | 0 | } |
483 | 0 | } |
484 | | |
485 | | static void vpx_highbd_filter_block1d16_h2_avx2( |
486 | | const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, |
487 | 0 | ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { |
488 | 0 | __m256i signal[2], res0, res1; |
489 | 0 | const __m256i max = _mm256_set1_epi16((1 << bd) - 1); |
490 | |
491 | 0 | __m256i ff; |
492 | 0 | pack_2t_filter(filter, &ff); |
493 | |
494 | 0 | src_ptr -= 3; |
495 | 0 | do { |
496 | 0 | pack_16x1_2t_pixels(src_ptr, signal); |
497 | 0 | filter_16_2t_pixels(signal, &ff, &res0, &res1); |
498 | 0 | store_16x1_pixels(&res0, &res1, &max, dst_ptr); |
499 | 0 | height -= 1; |
500 | 0 | src_ptr += src_pitch; |
501 | 0 | dst_ptr += dst_pitch; |
502 | 0 | } while (height > 0); |
503 | 0 | } |
504 | | |
505 | | // ----------------------------------------------------------------------------- |
506 | | // Vertical Filtering |
507 | | |
508 | 0 | static void pack_8x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) { |
509 | 0 | __m256i s0 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)src)); |
510 | 0 | __m256i s1 = |
511 | 0 | _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src + pitch))); |
512 | 0 | __m256i s2 = _mm256_castsi128_si256( |
513 | 0 | _mm_loadu_si128((const __m128i *)(src + 2 * pitch))); |
514 | 0 | __m256i s3 = _mm256_castsi128_si256( |
515 | 0 | _mm_loadu_si128((const __m128i *)(src + 3 * pitch))); |
516 | 0 | __m256i s4 = _mm256_castsi128_si256( |
517 | 0 | _mm_loadu_si128((const __m128i *)(src + 4 * pitch))); |
518 | 0 | __m256i s5 = _mm256_castsi128_si256( |
519 | 0 | _mm_loadu_si128((const __m128i *)(src + 5 * pitch))); |
520 | 0 | __m256i s6 = _mm256_castsi128_si256( |
521 | 0 | _mm_loadu_si128((const __m128i *)(src + 6 * pitch))); |
522 | |
523 | 0 | s0 = _mm256_inserti128_si256(s0, _mm256_castsi256_si128(s1), 1); |
524 | 0 | s1 = _mm256_inserti128_si256(s1, _mm256_castsi256_si128(s2), 1); |
525 | 0 | s2 = _mm256_inserti128_si256(s2, _mm256_castsi256_si128(s3), 1); |
526 | 0 | s3 = _mm256_inserti128_si256(s3, _mm256_castsi256_si128(s4), 1); |
527 | 0 | s4 = _mm256_inserti128_si256(s4, _mm256_castsi256_si128(s5), 1); |
528 | 0 | s5 = _mm256_inserti128_si256(s5, _mm256_castsi256_si128(s6), 1); |
529 | |
530 | 0 | sig[0] = _mm256_unpacklo_epi16(s0, s1); |
531 | 0 | sig[4] = _mm256_unpackhi_epi16(s0, s1); |
532 | 0 | sig[1] = _mm256_unpacklo_epi16(s2, s3); |
533 | 0 | sig[5] = _mm256_unpackhi_epi16(s2, s3); |
534 | 0 | sig[2] = _mm256_unpacklo_epi16(s4, s5); |
535 | 0 | sig[6] = _mm256_unpackhi_epi16(s4, s5); |
536 | 0 | sig[8] = s6; |
537 | 0 | } |
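
The unpacklo/unpackhi pairs interleave vertically adjacent rows, so each _mm256_madd_epi16 consumes one (row, next row) pair against one tap pair; the two 128-bit lanes carry the windows for two consecutive output rows. The arithmetic per column is still a plain 8-tap vertical dot product, sketched here in scalar form:

    static int32_t vfilter8_c(const uint16_t *src, ptrdiff_t pitch,
                              const int16_t *k) {
      /* src points at the topmost of the 8 tap rows for this column */
      int32_t sum = 0;
      int i;
      for (i = 0; i < 8; ++i) sum += (int32_t)src[i * pitch] * k[i];
      return (sum + CONV8_ROUNDING_NUM) >> CONV8_ROUNDING_BITS;
    }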
538 | | |
539 | | static INLINE void pack_8x9_pixels(const uint16_t *src, ptrdiff_t pitch, |
540 | 0 | __m256i *sig) { |
541 | | // base + 7th row |
542 | 0 | __m256i s0 = _mm256_castsi128_si256( |
543 | 0 | _mm_loadu_si128((const __m128i *)(src + 7 * pitch))); |
544 | | // base + 8th row |
545 | 0 | __m256i s1 = _mm256_castsi128_si256( |
546 | 0 | _mm_loadu_si128((const __m128i *)(src + 8 * pitch))); |
547 | 0 | __m256i s2 = _mm256_inserti128_si256(sig[8], _mm256_castsi256_si128(s0), 1); |
548 | 0 | __m256i s3 = _mm256_inserti128_si256(s0, _mm256_castsi256_si128(s1), 1); |
549 | 0 | sig[3] = _mm256_unpacklo_epi16(s2, s3); |
550 | 0 | sig[7] = _mm256_unpackhi_epi16(s2, s3); |
551 | 0 | sig[8] = s1; |
552 | 0 | } |
553 | | |
554 | | static INLINE void filter_8x9_pixels(const __m256i *sig, const __m256i *f, |
555 | 0 | __m256i *y0, __m256i *y1) { |
556 | 0 | filter_8x1_pixels(sig, f, y0); |
557 | 0 | filter_8x1_pixels(&sig[4], f, y1); |
558 | 0 | } |
559 | | |
560 | 0 | static INLINE void update_pixels(__m256i *sig) { |
561 | 0 | int i; |
562 | 0 | for (i = 0; i < 3; ++i) { |
563 | 0 | sig[i] = sig[i + 1]; |
564 | 0 | sig[i + 4] = sig[i + 5]; |
565 | 0 | } |
566 | 0 | } |
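
update_pixels() slides the vertical window: after each two output rows, the three newest row-pair registers shift down one slot, so the loop only packs the next two source rows instead of re-packing all nine. The same idea as a scalar sketch (hypothetical helper):

    static void slide_window_c(const uint16_t *rows[8]) {
      /* drop the two oldest rows of an 8-row vertical window */
      int i;
      for (i = 0; i < 6; ++i) rows[i] = rows[i + 2];
    }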
567 | | |
568 | | static void vpx_highbd_filter_block1d8_v8_avx2( |
569 | | const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, |
570 | 0 | ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { |
571 | 0 | __m256i signal[9], res0, res1; |
572 | 0 | const __m256i max = _mm256_set1_epi16((1 << bd) - 1); |
573 | |
574 | 0 | __m256i ff[4]; |
575 | 0 | pack_filters(filter, ff); |
576 | |
577 | 0 | pack_8x9_init(src_ptr, src_pitch, signal); |
578 | |
579 | 0 | do { |
580 | 0 | pack_8x9_pixels(src_ptr, src_pitch, signal); |
581 | |
582 | 0 | filter_8x9_pixels(signal, ff, &res0, &res1); |
583 | 0 | store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); |
584 | 0 | update_pixels(signal); |
585 | |
586 | 0 | src_ptr += src_pitch << 1; |
587 | 0 | dst_ptr += dst_pitch << 1; |
588 | 0 | height -= 2; |
589 | 0 | } while (height > 0); |
590 | 0 | } |
591 | | |
592 | 0 | static void pack_16x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) { |
593 | 0 | __m256i u0, u1, u2, u3; |
594 | | // load 0-6 rows |
595 | 0 | const __m256i s0 = _mm256_loadu_si256((const __m256i *)src); |
596 | 0 | const __m256i s1 = _mm256_loadu_si256((const __m256i *)(src + pitch)); |
597 | 0 | const __m256i s2 = _mm256_loadu_si256((const __m256i *)(src + 2 * pitch)); |
598 | 0 | const __m256i s3 = _mm256_loadu_si256((const __m256i *)(src + 3 * pitch)); |
599 | 0 | const __m256i s4 = _mm256_loadu_si256((const __m256i *)(src + 4 * pitch)); |
600 | 0 | const __m256i s5 = _mm256_loadu_si256((const __m256i *)(src + 5 * pitch)); |
601 | 0 | const __m256i s6 = _mm256_loadu_si256((const __m256i *)(src + 6 * pitch)); |
602 | |
603 | 0 | u0 = _mm256_permute2x128_si256(s0, s1, 0x20); // 0, 1 low |
604 | 0 | u1 = _mm256_permute2x128_si256(s0, s1, 0x31); // 0, 1 high |
605 | |
606 | 0 | u2 = _mm256_permute2x128_si256(s1, s2, 0x20); // 1, 2 low |
607 | 0 | u3 = _mm256_permute2x128_si256(s1, s2, 0x31); // 1, 2 high |
608 | |
609 | 0 | sig[0] = _mm256_unpacklo_epi16(u0, u2); |
610 | 0 | sig[4] = _mm256_unpackhi_epi16(u0, u2); |
611 | |
612 | 0 | sig[8] = _mm256_unpacklo_epi16(u1, u3); |
613 | 0 | sig[12] = _mm256_unpackhi_epi16(u1, u3); |
614 | |
615 | 0 | u0 = _mm256_permute2x128_si256(s2, s3, 0x20); |
616 | 0 | u1 = _mm256_permute2x128_si256(s2, s3, 0x31); |
617 | |
618 | 0 | u2 = _mm256_permute2x128_si256(s3, s4, 0x20); |
619 | 0 | u3 = _mm256_permute2x128_si256(s3, s4, 0x31); |
620 | |
621 | 0 | sig[1] = _mm256_unpacklo_epi16(u0, u2); |
622 | 0 | sig[5] = _mm256_unpackhi_epi16(u0, u2); |
623 | |
624 | 0 | sig[9] = _mm256_unpacklo_epi16(u1, u3); |
625 | 0 | sig[13] = _mm256_unpackhi_epi16(u1, u3); |
626 | |
627 | 0 | u0 = _mm256_permute2x128_si256(s4, s5, 0x20); |
628 | 0 | u1 = _mm256_permute2x128_si256(s4, s5, 0x31); |
629 | |
630 | 0 | u2 = _mm256_permute2x128_si256(s5, s6, 0x20); |
631 | 0 | u3 = _mm256_permute2x128_si256(s5, s6, 0x31); |
632 | |
633 | 0 | sig[2] = _mm256_unpacklo_epi16(u0, u2); |
634 | 0 | sig[6] = _mm256_unpackhi_epi16(u0, u2); |
635 | |
636 | 0 | sig[10] = _mm256_unpacklo_epi16(u1, u3); |
637 | 0 | sig[14] = _mm256_unpackhi_epi16(u1, u3); |
638 | |
639 | 0 | sig[16] = s6; |
640 | 0 | } |
641 | | |
642 | | static void pack_16x9_pixels(const uint16_t *src, ptrdiff_t pitch, |
643 | 0 | __m256i *sig) { |
644 | | // base + 7th row |
645 | 0 | const __m256i s7 = _mm256_loadu_si256((const __m256i *)(src + 7 * pitch)); |
646 | | // base + 8th row |
647 | 0 | const __m256i s8 = _mm256_loadu_si256((const __m256i *)(src + 8 * pitch)); |
648 | |
649 | 0 | __m256i u0, u1, u2, u3; |
650 | 0 | u0 = _mm256_permute2x128_si256(sig[16], s7, 0x20); |
651 | 0 | u1 = _mm256_permute2x128_si256(sig[16], s7, 0x31); |
652 | |
653 | 0 | u2 = _mm256_permute2x128_si256(s7, s8, 0x20); |
654 | 0 | u3 = _mm256_permute2x128_si256(s7, s8, 0x31); |
655 | |
656 | 0 | sig[3] = _mm256_unpacklo_epi16(u0, u2); |
657 | 0 | sig[7] = _mm256_unpackhi_epi16(u0, u2); |
658 | |
659 | 0 | sig[11] = _mm256_unpacklo_epi16(u1, u3); |
660 | 0 | sig[15] = _mm256_unpackhi_epi16(u1, u3); |
661 | |
662 | 0 | sig[16] = s8; |
663 | 0 | } |
664 | | |
665 | | static INLINE void filter_16x9_pixels(const __m256i *sig, const __m256i *f, |
666 | 0 | __m256i *y0, __m256i *y1) { |
667 | 0 | __m256i res[4]; |
668 | 0 | int i; |
669 | 0 | for (i = 0; i < 4; ++i) { |
670 | 0 | filter_8x1_pixels(&sig[i << 2], f, &res[i]); |
671 | 0 | } |
672 | |
673 | 0 | { |
674 | 0 | const __m256i l0l1 = _mm256_packus_epi32(res[0], res[1]); |
675 | 0 | const __m256i h0h1 = _mm256_packus_epi32(res[2], res[3]); |
676 | 0 | *y0 = _mm256_permute2x128_si256(l0l1, h0h1, 0x20); |
677 | 0 | *y1 = _mm256_permute2x128_si256(l0l1, h0h1, 0x31); |
678 | 0 | } |
679 | 0 | } |
680 | | |
681 | | static INLINE void store_16x2_pixels(const __m256i *y0, const __m256i *y1, |
682 | | const __m256i *mask, uint16_t *dst, |
683 | 0 | ptrdiff_t pitch) { |
684 | 0 | __m256i p = _mm256_min_epi16(*y0, *mask); |
685 | 0 | _mm256_storeu_si256((__m256i *)dst, p); |
686 | 0 | p = _mm256_min_epi16(*y1, *mask); |
687 | 0 | _mm256_storeu_si256((__m256i *)(dst + pitch), p); |
688 | 0 | } |
689 | | |
690 | 0 | static void update_16x9_pixels(__m256i *sig) { |
691 | 0 | update_pixels(&sig[0]); |
692 | 0 | update_pixels(&sig[8]); |
693 | 0 | } |
694 | | |
695 | | static void vpx_highbd_filter_block1d16_v8_avx2( |
696 | | const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, |
697 | 0 | ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { |
698 | 0 | __m256i signal[17], res0, res1; |
699 | 0 | const __m256i max = _mm256_set1_epi16((1 << bd) - 1); |
700 | |
701 | 0 | __m256i ff[4]; |
702 | 0 | pack_filters(filter, ff); |
703 | |
704 | 0 | pack_16x9_init(src_ptr, src_pitch, signal); |
705 | |
706 | 0 | do { |
707 | 0 | pack_16x9_pixels(src_ptr, src_pitch, signal); |
708 | 0 | filter_16x9_pixels(signal, ff, &res0, &res1); |
709 | 0 | store_16x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); |
710 | 0 | update_16x9_pixels(signal); |
711 | |
712 | 0 | src_ptr += src_pitch << 1; |
713 | 0 | dst_ptr += dst_pitch << 1; |
714 | 0 | height -= 2; |
715 | 0 | } while (height > 0); |
716 | 0 | } |
717 | | |
718 | | // ----------------------------------------------------------------------------- |
719 | | // 2-tap vertical filtering |
720 | | |
721 | 0 | static void pack_16x2_init(const uint16_t *src, __m256i *sig) { |
722 | 0 | sig[2] = _mm256_loadu_si256((const __m256i *)src); |
723 | 0 | } |
724 | | |
725 | | static INLINE void pack_16x2_2t_pixels(const uint16_t *src, ptrdiff_t pitch, |
726 | 0 | __m256i *sig) { |
727 | | // load the next row |
728 | 0 | const __m256i u = _mm256_loadu_si256((const __m256i *)(src + pitch)); |
729 | 0 | sig[0] = _mm256_unpacklo_epi16(sig[2], u); |
730 | 0 | sig[1] = _mm256_unpackhi_epi16(sig[2], u); |
731 | 0 | sig[2] = u; |
732 | 0 | } |
733 | | |
734 | | static INLINE void filter_16x2_2t_pixels(const __m256i *sig, const __m256i *f, |
735 | 0 | __m256i *y0, __m256i *y1) { |
736 | 0 | filter_16_2t_pixels(sig, f, y0, y1); |
737 | 0 | } |
738 | | |
739 | | static void vpx_highbd_filter_block1d16_v2_avx2( |
740 | | const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, |
741 | 0 | ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { |
742 | 0 | __m256i signal[3], res0, res1; |
743 | 0 | const __m256i max = _mm256_set1_epi16((1 << bd) - 1); |
744 | 0 | __m256i ff; |
745 | |
746 | 0 | pack_2t_filter(filter, &ff); |
747 | 0 | pack_16x2_init(src_ptr, signal); |
748 | |
749 | 0 | do { |
750 | 0 | pack_16x2_2t_pixels(src_ptr, src_pitch, signal); |
751 | 0 | filter_16x2_2t_pixels(signal, &ff, &res0, &res1); |
752 | 0 | store_16x1_pixels(&res0, &res1, &max, dst_ptr); |
753 | |
754 | 0 | src_ptr += src_pitch; |
755 | 0 | dst_ptr += dst_pitch; |
756 | 0 | height -= 1; |
757 | 0 | } while (height > 0); |
758 | 0 | } |
759 | | |
760 | 0 | static INLINE void pack_8x1_2t_filter(const int16_t *filter, __m128i *f) { |
761 | 0 | const __m128i h = _mm_loadu_si128((const __m128i *)filter); |
762 | 0 | const __m128i p = _mm_set1_epi32(0x09080706); |
763 | 0 | f[0] = _mm_shuffle_epi8(h, p); |
764 | 0 | } |
765 | | |
766 | 0 | static void pack_8x2_init(const uint16_t *src, __m128i *sig) { |
767 | 0 | sig[2] = _mm_loadu_si128((const __m128i *)src); |
768 | 0 | } |
769 | | |
770 | | static INLINE void pack_8x2_2t_pixels_ver(const uint16_t *src, ptrdiff_t pitch, |
771 | 0 | __m128i *sig) { |
772 | | // load the next row |
773 | 0 | const __m128i u = _mm_loadu_si128((const __m128i *)(src + pitch)); |
774 | 0 | sig[0] = _mm_unpacklo_epi16(sig[2], u); |
775 | 0 | sig[1] = _mm_unpackhi_epi16(sig[2], u); |
776 | 0 | sig[2] = u; |
777 | 0 | } |
778 | | |
779 | | static INLINE void filter_8_2t_pixels(const __m128i *sig, const __m128i *f, |
780 | 0 | __m128i *y0, __m128i *y1) { |
781 | 0 | const __m128i rounding = _mm_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1)); |
782 | 0 | __m128i x0 = _mm_madd_epi16(sig[0], *f); |
783 | 0 | __m128i x1 = _mm_madd_epi16(sig[1], *f); |
784 | 0 | x0 = _mm_add_epi32(x0, rounding); |
785 | 0 | x1 = _mm_add_epi32(x1, rounding); |
786 | 0 | *y0 = _mm_srai_epi32(x0, CONV8_ROUNDING_BITS); |
787 | 0 | *y1 = _mm_srai_epi32(x1, CONV8_ROUNDING_BITS); |
788 | 0 | } |
789 | | |
790 | | static INLINE void store_8x1_2t_pixels_ver(const __m128i *y0, const __m128i *y1, |
791 | 0 | const __m128i *mask, uint16_t *dst) { |
792 | 0 | __m128i res = _mm_packus_epi32(*y0, *y1); |
793 | 0 | res = _mm_min_epi16(res, *mask); |
794 | 0 | _mm_storeu_si128((__m128i *)dst, res); |
795 | 0 | } |
796 | | |
797 | | static void vpx_highbd_filter_block1d8_v2_avx2( |
798 | | const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, |
799 | 0 | ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { |
800 | 0 | __m128i signal[3], res0, res1; |
801 | 0 | const __m128i max = _mm_set1_epi16((1 << bd) - 1); |
802 | 0 | __m128i ff; |
803 | |
804 | 0 | pack_8x1_2t_filter(filter, &ff); |
805 | 0 | pack_8x2_init(src_ptr, signal); |
806 | |
807 | 0 | do { |
808 | 0 | pack_8x2_2t_pixels_ver(src_ptr, src_pitch, signal); |
809 | 0 | filter_8_2t_pixels(signal, &ff, &res0, &res1); |
810 | 0 | store_8x1_2t_pixels_ver(&res0, &res1, &max, dst_ptr); |
811 | |
812 | 0 | src_ptr += src_pitch; |
813 | 0 | dst_ptr += dst_pitch; |
814 | 0 | height -= 1; |
815 | 0 | } while (height > 0); |
816 | 0 | } |
817 | | |
818 | | // Calculation with averaging the input pixels |
819 | | |
820 | | static INLINE void store_8x1_avg_pixels(const __m256i *y0, const __m256i *mask, |
821 | 0 | uint16_t *dst) { |
822 | 0 | const __m128i a0 = _mm256_castsi256_si128(*y0); |
823 | 0 | const __m128i a1 = _mm256_extractf128_si256(*y0, 1); |
824 | 0 | __m128i res = _mm_packus_epi32(a0, a1); |
825 | 0 | const __m128i pix = _mm_loadu_si128((const __m128i *)dst); |
826 | 0 | res = _mm_min_epi16(res, _mm256_castsi256_si128(*mask)); |
827 | 0 | res = _mm_avg_epu16(res, pix); |
828 | 0 | _mm_storeu_si128((__m128i *)dst, res); |
829 | 0 | } |
830 | | |
831 | | static INLINE void store_8x2_avg_pixels(const __m256i *y0, const __m256i *y1, |
832 | | const __m256i *mask, uint16_t *dst, |
833 | 0 | ptrdiff_t pitch) { |
834 | 0 | __m256i a = _mm256_packus_epi32(*y0, *y1); |
835 | 0 | const __m128i pix0 = _mm_loadu_si128((const __m128i *)dst); |
836 | 0 | const __m128i pix1 = _mm_loadu_si128((const __m128i *)(dst + pitch)); |
837 | 0 | const __m256i pix = |
838 | 0 | _mm256_insertf128_si256(_mm256_castsi128_si256(pix0), pix1, 1); |
839 | 0 | a = _mm256_min_epi16(a, *mask); |
840 | 0 | a = _mm256_avg_epu16(a, pix); |
841 | 0 | _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(a)); |
842 | 0 | _mm_storeu_si128((__m128i *)(dst + pitch), _mm256_extractf128_si256(a, 1)); |
843 | 0 | } |
844 | | |
845 | | static INLINE void store_16x1_avg_pixels(const __m256i *y0, const __m256i *y1, |
846 | 0 | const __m256i *mask, uint16_t *dst) { |
847 | 0 | __m256i a = _mm256_packus_epi32(*y0, *y1); |
848 | 0 | const __m256i pix = _mm256_loadu_si256((const __m256i *)dst); |
849 | 0 | a = _mm256_min_epi16(a, *mask); |
850 | 0 | a = _mm256_avg_epu16(a, pix); |
851 | 0 | _mm256_storeu_si256((__m256i *)dst, a); |
852 | 0 | } |
853 | | |
854 | | static INLINE void store_16x2_avg_pixels(const __m256i *y0, const __m256i *y1, |
855 | | const __m256i *mask, uint16_t *dst, |
856 | 0 | ptrdiff_t pitch) { |
857 | 0 | const __m256i pix0 = _mm256_loadu_si256((const __m256i *)dst); |
858 | 0 | const __m256i pix1 = _mm256_loadu_si256((const __m256i *)(dst + pitch)); |
859 | 0 | __m256i p = _mm256_min_epi16(*y0, *mask); |
860 | 0 | p = _mm256_avg_epu16(p, pix0); |
861 | 0 | _mm256_storeu_si256((__m256i *)dst, p); |
862 | |
863 | 0 | p = _mm256_min_epi16(*y1, *mask); |
864 | 0 | p = _mm256_avg_epu16(p, pix1); |
865 | 0 | _mm256_storeu_si256((__m256i *)(dst + pitch), p); |
866 | 0 | } |
867 | | |
868 | | static INLINE void store_8x1_2t_avg_pixels_ver(const __m128i *y0, |
869 | | const __m128i *y1, |
870 | | const __m128i *mask, |
871 | 0 | uint16_t *dst) { |
872 | 0 | __m128i res = _mm_packus_epi32(*y0, *y1); |
873 | 0 | const __m128i pix = _mm_loadu_si128((const __m128i *)dst); |
874 | 0 | res = _mm_min_epi16(res, *mask); |
875 | 0 | res = _mm_avg_epu16(res, pix); |
876 | 0 | _mm_storeu_si128((__m128i *)dst, res); |
877 | 0 | } |
878 | | |
879 | | static void vpx_highbd_filter_block1d8_h8_avg_avx2( |
880 | | const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, |
881 | 0 | ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { |
882 | 0 | __m256i signal[8], res0, res1; |
883 | 0 | const __m256i max = _mm256_set1_epi16((1 << bd) - 1); |
884 | |
885 | 0 | __m256i ff[4]; |
886 | 0 | pack_filters(filter, ff); |
887 | |
888 | 0 | src_ptr -= 3; |
889 | 0 | do { |
890 | 0 | pack_8x2_pixels(src_ptr, src_pitch, signal); |
891 | 0 | filter_8x1_pixels(signal, ff, &res0); |
892 | 0 | filter_8x1_pixels(&signal[4], ff, &res1); |
893 | 0 | store_8x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); |
894 | 0 | height -= 2; |
895 | 0 | src_ptr += src_pitch << 1; |
896 | 0 | dst_ptr += dst_pitch << 1; |
897 | 0 | } while (height > 1); |
898 | |
899 | 0 | if (height > 0) { |
900 | 0 | pack_8x1_pixels(src_ptr, signal); |
901 | 0 | filter_8x1_pixels(signal, ff, &res0); |
902 | 0 | store_8x1_avg_pixels(&res0, &max, dst_ptr); |
903 | 0 | } |
904 | 0 | } |
905 | | |
906 | | static void vpx_highbd_filter_block1d16_h8_avg_avx2( |
907 | | const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, |
908 | 0 | ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { |
909 | 0 | __m256i signal[8], res0, res1; |
910 | 0 | const __m256i max = _mm256_set1_epi16((1 << bd) - 1); |
911 | |
912 | 0 | __m256i ff[4]; |
913 | 0 | pack_filters(filter, ff); |
914 | |
915 | 0 | src_ptr -= 3; |
916 | 0 | do { |
917 | 0 | pack_16x1_pixels(src_ptr, signal); |
918 | 0 | filter_8x1_pixels(signal, ff, &res0); |
919 | 0 | filter_8x1_pixels(&signal[4], ff, &res1); |
920 | 0 | store_16x1_avg_pixels(&res0, &res1, &max, dst_ptr); |
921 | 0 | height -= 1; |
922 | 0 | src_ptr += src_pitch; |
923 | 0 | dst_ptr += dst_pitch; |
924 | 0 | } while (height > 0); |
925 | 0 | } |
926 | | |
927 | | static void vpx_highbd_filter_block1d4_h4_avx2( |
928 | | const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, |
929 | 0 | ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) { |
930 | | // We extract the middle four elements of the kernel into two registers in |
931 | | // the form |
932 | | // ... k[3] k[2] k[3] k[2] |
933 | | // ... k[5] k[4] k[5] k[4] |
934 | | // Then we shuffle the source into |
935 | | // ... s[1] s[0] s[0] s[-1] |
936 | | // ... s[3] s[2] s[2] s[1] |
937 | | // Calling multiply and add gives us half of the sum. Calling add on the two |
938 | | // halves gives us the output. Since AVX2 gives us 256-bit registers, we |
939 | | // can do this two rows at a time. |
940 | |
941 | 0 | __m256i src_reg, src_reg_shift_0, src_reg_shift_2; |
942 | 0 | __m256i res_reg; |
943 | 0 | __m256i idx_shift_0 = |
944 | 0 | _mm256_setr_epi8(0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 0, 1, 2, |
945 | 0 | 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9); |
946 | 0 | __m256i idx_shift_2 = |
947 | 0 | _mm256_setr_epi8(4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 4, |
948 | 0 | 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13); |
949 | |
950 | 0 | __m128i kernel_reg_128; // Kernel |
951 | 0 | __m256i kernel_reg, kernel_reg_23, |
952 | 0 | kernel_reg_45; // Segments of the kernel used |
953 | 0 | const __m256i reg_round = |
954 | 0 | _mm256_set1_epi32(CONV8_ROUNDING_NUM); // Used for rounding |
955 | 0 | const __m256i reg_max = _mm256_set1_epi16((1 << bd) - 1); |
956 | 0 | const ptrdiff_t unrolled_src_stride = src_stride << 1; |
957 | 0 | const ptrdiff_t unrolled_dst_stride = dst_stride << 1; |
958 | 0 | int h; |
959 | | |
960 | | // Start one pixel before as we need tap/2 - 1 = 1 sample from the past |
961 | 0 | src_ptr -= 1; |
962 | | |
963 | | // Load Kernel |
964 | 0 | kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel); |
965 | 0 | kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128); |
966 | 0 | kernel_reg_23 = _mm256_shuffle_epi32(kernel_reg, 0x55); |
967 | 0 | kernel_reg_45 = _mm256_shuffle_epi32(kernel_reg, 0xaa); |
968 | |
969 | 0 | for (h = height; h >= 2; h -= 2) { |
970 | | // Load the source |
971 | 0 | src_reg = mm256_loadu2_si128(src_ptr, src_ptr + src_stride); |
972 | 0 | src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0); |
973 | 0 | src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2); |
974 | | |
975 | | // Get the output |
976 | 0 | res_reg = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2, |
977 | 0 | &kernel_reg_23, &kernel_reg_45); |
978 | | |
979 | | // Round the result |
980 | 0 | res_reg = mm256_round_epi32(&res_reg, ®_round, CONV8_ROUNDING_BITS); |
981 | | |
982 | | // Finally combine to get the final dst |
983 | 0 | res_reg = _mm256_packus_epi32(res_reg, res_reg); |
984 | 0 | res_reg = _mm256_min_epi16(res_reg, reg_max); |
985 | 0 | mm256_storeu2_epi64((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride), |
986 | 0 | &res_reg); |
987 | |
988 | 0 | src_ptr += unrolled_src_stride; |
989 | 0 | dst_ptr += unrolled_dst_stride; |
990 | 0 | } |
991 | | |
992 | | // Repeat for the last row if needed |
993 | 0 | if (h > 0) { |
994 | | // Load the source |
995 | 0 | src_reg = mm256_loadu2_si128(src_ptr, src_ptr + 4); |
996 | 0 | src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0); |
997 | 0 | src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2); |
998 | | |
999 | | // Get the output |
1000 | 0 | res_reg = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2, |
1001 | 0 | &kernel_reg_23, &kernel_reg_45); |
1002 | | |
1003 | | // Round the result |
1004 | 0 | res_reg = mm256_round_epi32(&res_reg, ®_round, CONV8_ROUNDING_BITS); |
1005 | | |
1006 | | // Finally combine to get the final dst |
1007 | 0 | res_reg = _mm256_packus_epi32(res_reg, res_reg); |
1008 | 0 | res_reg = _mm256_min_epi16(res_reg, reg_max); |
1009 | 0 | _mm_storel_epi64((__m128i *)dst_ptr, _mm256_castsi256_si128(res_reg)); |
1010 | 0 | } |
1011 | 0 | } |
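
Stripped of the SIMD packing, each 4-tap output pixel uses kernel[2..5] over a window that starts one pixel back, rounded and clamped to [0, (1 << bd) - 1]. A scalar sketch (illustrative name):

    static uint16_t hfilter4_c(const uint16_t *s, const int16_t *kernel,
                               int bd) {
      const int32_t sum =
          (int32_t)s[-1] * kernel[2] + (int32_t)s[0] * kernel[3] +
          (int32_t)s[1] * kernel[4] + (int32_t)s[2] * kernel[5];
      int32_t out = (sum + CONV8_ROUNDING_NUM) >> CONV8_ROUNDING_BITS;
      if (out < 0) out = 0;                         /* packus clamps below at 0 */
      if (out > (1 << bd) - 1) out = (1 << bd) - 1; /* min_epi16 caps at bd max */
      return (uint16_t)out;
    }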
1012 | | |
1013 | | static void vpx_highbd_filter_block1d8_h4_avx2( |
1014 | | const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, |
1015 | 0 | ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) { |
1016 | | // We will extract the middle four elements of the kernel into two registers |
1017 | | // in the form |
1018 | | // ... k[3] k[2] k[3] k[2] |
1019 | | // ... k[5] k[4] k[5] k[4] |
1020 | | // Then we shuffle the source into |
1021 | | // ... s[1] s[0] s[0] s[-1] |
1022 | | // ... s[3] s[2] s[2] s[1] |
1023 | | // Calling multiply and add gives us half of the sum of the first half. |
1024 | | // Calling add gives us the first half of the output. Repeat to get the |
1025 | | // second half. Since AVX2 gives us 256-bit registers, we can do this two rows |
1026 | | // at a time. |
1027 | |
1028 | 0 | __m256i src_reg, src_reg_shift_0, src_reg_shift_2; |
1029 | 0 | __m256i res_reg, res_first, res_last; |
1030 | 0 | __m256i idx_shift_0 = |
1031 | 0 | _mm256_setr_epi8(0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 0, 1, 2, |
1032 | 0 | 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9); |
1033 | 0 | __m256i idx_shift_2 = |
1034 | 0 | _mm256_setr_epi8(4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 4, |
1035 | 0 | 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13); |
1036 | |
1037 | 0 | __m128i kernel_reg_128; // Kernel |
1038 | 0 | __m256i kernel_reg, kernel_reg_23, |
1039 | 0 | kernel_reg_45; // Segments of the kernel used |
1040 | 0 | const __m256i reg_round = |
1041 | 0 | _mm256_set1_epi32(CONV8_ROUNDING_NUM); // Used for rounding |
1042 | 0 | const __m256i reg_max = _mm256_set1_epi16((1 << bd) - 1); |
1043 | 0 | const ptrdiff_t unrolled_src_stride = src_stride << 1; |
1044 | 0 | const ptrdiff_t unrolled_dst_stride = dst_stride << 1; |
1045 | 0 | int h; |
1046 | | |
1047 | | // Start one pixel before as we need tap/2 - 1 = 1 sample from the past |
1048 | 0 | src_ptr -= 1; |
1049 | | |
1050 | | // Load Kernel |
1051 | 0 | kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel); |
1052 | 0 | kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128); |
1053 | 0 | kernel_reg_23 = _mm256_shuffle_epi32(kernel_reg, 0x55); |
1054 | 0 | kernel_reg_45 = _mm256_shuffle_epi32(kernel_reg, 0xaa); |
1055 | |
1056 | 0 | for (h = height; h >= 2; h -= 2) { |
1057 | | // Load the source |
1058 | 0 | src_reg = mm256_loadu2_si128(src_ptr, src_ptr + src_stride); |
1059 | 0 | src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0); |
1060 | 0 | src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2); |
1061 | | |
1062 | | // Result for first half |
1063 | 0 | res_first = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2, |
1064 | 0 | &kernel_reg_23, &kernel_reg_45); |
1065 | | |
1066 | | // Do again to get the second half of dst |
1067 | | // Load the source |
1068 | 0 | src_reg = mm256_loadu2_si128(src_ptr + 4, src_ptr + src_stride + 4); |
1069 | 0 | src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0); |
1070 | 0 | src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2); |
1071 | | |
1072 | | // Result for second half |
1073 | 0 | res_last = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2, |
1074 | 0 | &kernel_reg_23, &kernel_reg_45); |
1075 | | |
1076 | | // Round each result |
1077 | 0 | res_first = mm256_round_epi32(&res_first, ®_round, CONV8_ROUNDING_BITS); |
1078 | 0 | res_last = mm256_round_epi32(&res_last, ®_round, CONV8_ROUNDING_BITS); |
1079 | | |
1080 | | // Finally combine to get the final dst |
1081 | 0 | res_reg = _mm256_packus_epi32(res_first, res_last); |
1082 | 0 | res_reg = _mm256_min_epi16(res_reg, reg_max); |
1083 | 0 | mm256_store2_si128((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride), |
1084 | 0 | &res_reg); |
1085 | |
1086 | 0 | src_ptr += unrolled_src_stride; |
1087 | 0 | dst_ptr += unrolled_dst_stride; |
1088 | 0 | } |
1089 | | |
1090 | | // Repeat for the last row if needed |
1091 | 0 | if (h > 0) { |
1092 | 0 | src_reg = mm256_loadu2_si128(src_ptr, src_ptr + 4); |
1093 | 0 | src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0); |
1094 | 0 | src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2); |
1095 | |
1096 | 0 | res_reg = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2, |
1097 | 0 | &kernel_reg_23, &kernel_reg_45); |
1098 | |
1099 | 0 | res_reg = mm256_round_epi32(&res_reg, ®_round, CONV8_ROUNDING_BITS); |
1100 | |
1101 | 0 | res_reg = _mm256_packus_epi32(res_reg, res_reg); |
1102 | 0 | res_reg = _mm256_min_epi16(res_reg, reg_max); |
1103 | |
1104 | 0 | mm256_storeu2_epi64((__m128i *)dst_ptr, (__m128i *)(dst_ptr + 4), &res_reg); |
1105 | 0 | } |
1106 | 0 | } |
1107 | | |
1108 | | static void vpx_highbd_filter_block1d16_h4_avx2( |
1109 | | const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, |
1110 | 0 | ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) { |
1111 | 0 | vpx_highbd_filter_block1d8_h4_avx2(src_ptr, src_stride, dst_ptr, dst_stride, |
1112 | 0 | height, kernel, bd); |
1113 | 0 | vpx_highbd_filter_block1d8_h4_avx2(src_ptr + 8, src_stride, dst_ptr + 8, |
1114 | 0 | dst_stride, height, kernel, bd); |
1115 | 0 | } |
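
The 16-wide 4-tap case simply runs the 8-wide kernel on each half; the second call starts at src_ptr + 8, so its one-pixel left context overlaps data the first call already read, trading a little redundant loading for simplicity.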
1116 | | |
1117 | | static void vpx_highbd_filter_block1d8_v8_avg_avx2( |
1118 | | const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, |
1119 | 0 | ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { |
1120 | 0 | __m256i signal[9], res0, res1; |
1121 | 0 | const __m256i max = _mm256_set1_epi16((1 << bd) - 1); |
1122 | |
1123 | 0 | __m256i ff[4]; |
1124 | 0 | pack_filters(filter, ff); |
1125 | |
1126 | 0 | pack_8x9_init(src_ptr, src_pitch, signal); |
1127 | |
1128 | 0 | do { |
1129 | 0 | pack_8x9_pixels(src_ptr, src_pitch, signal); |
1130 | |
1131 | 0 | filter_8x9_pixels(signal, ff, &res0, &res1); |
1132 | 0 | store_8x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); |
1133 | 0 | update_pixels(signal); |
1134 | |
1135 | 0 | src_ptr += src_pitch << 1; |
1136 | 0 | dst_ptr += dst_pitch << 1; |
1137 | 0 | height -= 2; |
1138 | 0 | } while (height > 0); |
1139 | 0 | } |
1140 | | |
1141 | | static void vpx_highbd_filter_block1d16_v8_avg_avx2( |
1142 | | const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, |
1143 | 0 | ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { |
1144 | 0 | __m256i signal[17], res0, res1; |
1145 | 0 | const __m256i max = _mm256_set1_epi16((1 << bd) - 1); |
1146 | |
1147 | 0 | __m256i ff[4]; |
1148 | 0 | pack_filters(filter, ff); |
1149 | |
1150 | 0 | pack_16x9_init(src_ptr, src_pitch, signal); |
1151 | |
1152 | 0 | do { |
1153 | 0 | pack_16x9_pixels(src_ptr, src_pitch, signal); |
1154 | 0 | filter_16x9_pixels(signal, ff, &res0, &res1); |
1155 | 0 | store_16x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); |
1156 | 0 | update_16x9_pixels(signal); |
1157 | |
1158 | 0 | src_ptr += src_pitch << 1; |
1159 | 0 | dst_ptr += dst_pitch << 1; |
1160 | 0 | height -= 2; |
1161 | 0 | } while (height > 0); |
1162 | 0 | } |
1163 | | |
1164 | | static void vpx_highbd_filter_block1d8_h2_avg_avx2( |
1165 | | const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, |
1166 | 0 | ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { |
1167 | 0 | __m256i signal[2], res0, res1; |
1168 | 0 | const __m256i max = _mm256_set1_epi16((1 << bd) - 1); |
1169 | |
1170 | 0 | __m256i ff; |
1171 | 0 | pack_2t_filter(filter, &ff); |
1172 | |
1173 | 0 | src_ptr -= 3; |
1174 | 0 | do { |
1175 | 0 | pack_8x2_2t_pixels(src_ptr, src_pitch, signal); |
1176 | 0 | filter_16_2t_pixels(signal, &ff, &res0, &res1); |
1177 | 0 | store_8x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); |
1178 | 0 | height -= 2; |
1179 | 0 | src_ptr += src_pitch << 1; |
1180 | 0 | dst_ptr += dst_pitch << 1; |
1181 | 0 | } while (height > 1); |
1182 | |
1183 | 0 | if (height > 0) { |
1184 | 0 | pack_8x1_2t_pixels(src_ptr, signal); |
1185 | 0 | filter_8x1_2t_pixels(signal, &ff, &res0); |
1186 | 0 | store_8x1_avg_pixels(&res0, &max, dst_ptr); |
1187 | 0 | } |
1188 | 0 | } |
1189 | | |
1190 | | static void vpx_highbd_filter_block1d16_h2_avg_avx2( |
1191 | | const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, |
1192 | 0 | ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { |
1193 | 0 | __m256i signal[2], res0, res1; |
1194 | 0 | const __m256i max = _mm256_set1_epi16((1 << bd) - 1); |
1195 | |
1196 | 0 | __m256i ff; |
1197 | 0 | pack_2t_filter(filter, &ff); |
1198 | |
1199 | 0 | src_ptr -= 3; |
1200 | 0 | do { |
1201 | 0 | pack_16x1_2t_pixels(src_ptr, signal); |
1202 | 0 | filter_16_2t_pixels(signal, &ff, &res0, &res1); |
1203 | 0 | store_16x1_avg_pixels(&res0, &res1, &max, dst_ptr); |
1204 | 0 | height -= 1; |
1205 | 0 | src_ptr += src_pitch; |
1206 | 0 | dst_ptr += dst_pitch; |
1207 | 0 | } while (height > 0); |
1208 | 0 | } |
1209 | | |
1210 | | static void vpx_highbd_filter_block1d16_v2_avg_avx2( |
1211 | | const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, |
1212 | 0 | ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { |
1213 | 0 | __m256i signal[3], res0, res1; |
1214 | 0 | const __m256i max = _mm256_set1_epi16((1 << bd) - 1); |
1215 | 0 | __m256i ff; |
1216 | |
1217 | 0 | pack_2t_filter(filter, &ff); |
1218 | 0 | pack_16x2_init(src_ptr, signal); |
1219 | |
1220 | 0 | do { |
1221 | 0 | pack_16x2_2t_pixels(src_ptr, src_pitch, signal); |
1222 | 0 | filter_16x2_2t_pixels(signal, &ff, &res0, &res1); |
1223 | 0 | store_16x1_avg_pixels(&res0, &res1, &max, dst_ptr); |
1224 | |
1225 | 0 | src_ptr += src_pitch; |
1226 | 0 | dst_ptr += dst_pitch; |
1227 | 0 | height -= 1; |
1228 | 0 | } while (height > 0); |
1229 | 0 | } |
1230 | | |
1231 | | static void vpx_highbd_filter_block1d8_v2_avg_avx2( |
1232 | | const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, |
1233 | 0 | ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { |
1234 | 0 | __m128i signal[3], res0, res1; |
1235 | 0 | const __m128i max = _mm_set1_epi16((1 << bd) - 1); |
1236 | 0 | __m128i ff; |
1237 | |
1238 | 0 | pack_8x1_2t_filter(filter, &ff); |
1239 | 0 | pack_8x2_init(src_ptr, signal); |
1240 | |
1241 | 0 | do { |
1242 | 0 | pack_8x2_2t_pixels_ver(src_ptr, src_pitch, signal); |
1243 | 0 | filter_8_2t_pixels(signal, &ff, &res0, &res1); |
1244 | 0 | store_8x1_2t_avg_pixels_ver(&res0, &res1, &max, dst_ptr); |
1245 | |
1246 | 0 | src_ptr += src_pitch; |
1247 | 0 | dst_ptr += dst_pitch; |
1248 | 0 | height -= 1; |
1249 | 0 | } while (height > 0); |
1250 | 0 | } |
1251 | | |
1252 | | static void vpx_highbd_filter_block1d4_v4_avx2( |
1253 | | const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, |
1254 | 0 | ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) { |
1255 | | // We will load two rows of pixels and rearrange them into the form |
1256 | | // ... s[1,0] s[0,0] s[0,0] s[-1,0] |
1257 | | // so that we can call multiply and add with the kernel to get the partial |
1258 | | // output. Then we can call add with another row to get the output. |
1259 | | |
1260 | | // Register for source s[-1:3, :] |
1261 | 0 | __m256i src_reg_1, src_reg_2, src_reg_3; |
1262 | | // Interleaved rows of the source. lo is first half, hi second |
1263 | 0 | __m256i src_reg_m10, src_reg_01, src_reg_12, src_reg_23; |
1264 | 0 | __m256i src_reg_m1001, src_reg_1223; |
1265 | | |
1266 | | // Result after multiply and add |
1267 | 0 | __m256i res_reg; |
1268 | |
1269 | 0 | __m128i kernel_reg_128; // Kernel |
1270 | 0 | __m256i kernel_reg, kernel_reg_23, kernel_reg_45; // Segments of kernel used |
1271 | |
1272 | 0 | const __m256i reg_round = |
1273 | 0 | _mm256_set1_epi32(CONV8_ROUNDING_NUM); // Used for rounding |
1274 | 0 | const __m256i reg_max = _mm256_set1_epi16((1 << bd) - 1); |
1275 | 0 | const ptrdiff_t src_stride_unrolled = src_stride << 1; |
1276 | 0 | const ptrdiff_t dst_stride_unrolled = dst_stride << 1; |
1277 | 0 | int h; |
1278 | | |
1279 | | // Load Kernel |
1280 | 0 | kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel); |
1281 | 0 | kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128); |
1282 | 0 | kernel_reg_23 = _mm256_shuffle_epi32(kernel_reg, 0x55); |
1283 | 0 | kernel_reg_45 = _mm256_shuffle_epi32(kernel_reg, 0xaa); |
1284 | | |
1285 | | // Row -1 to row 0 |
1286 | 0 | src_reg_m10 = mm256_loadu2_epi64((const __m128i *)src_ptr, |
1287 | 0 | (const __m128i *)(src_ptr + src_stride)); |
1288 | | |
1289 | | // Row 0 to row 1 |
1290 | 0 | src_reg_1 = _mm256_castsi128_si256( |
1291 | 0 | _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2))); |
1292 | 0 | src_reg_01 = _mm256_permute2x128_si256(src_reg_m10, src_reg_1, 0x21); |
1293 | | |
1294 | | // First three rows |
1295 | 0 | src_reg_m1001 = _mm256_unpacklo_epi16(src_reg_m10, src_reg_01); |
1296 | |
|
1297 | 0 | for (h = height; h > 1; h -= 2) { |
1298 | 0 | src_reg_2 = _mm256_castsi128_si256( |
1299 | 0 | _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 3))); |
1300 | |
|
1301 | 0 | src_reg_12 = _mm256_inserti128_si256(src_reg_1, |
1302 | 0 | _mm256_castsi256_si128(src_reg_2), 1); |
1303 | |
|
1304 | 0 | src_reg_3 = _mm256_castsi128_si256( |
1305 | 0 | _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 4))); |
1306 | |
|
1307 | 0 | src_reg_23 = _mm256_inserti128_si256(src_reg_2, |
1308 | 0 | _mm256_castsi256_si128(src_reg_3), 1); |
1309 | | |
1310 | | // Last three rows |
1311 | 0 | src_reg_1223 = _mm256_unpacklo_epi16(src_reg_12, src_reg_23); |
1312 | | |
1313 | | // Output |
1314 | 0 | res_reg = mm256_madd_add_epi32(&src_reg_m1001, &src_reg_1223, |
1315 | 0 | &kernel_reg_23, &kernel_reg_45); |
1316 | | |
1317 | | // Round the words |
1318 | 0 | res_reg = mm256_round_epi32(&res_reg, ®_round, CONV8_ROUNDING_BITS); |
1319 | | |
1320 | | // Combine to get the result |
1321 | 0 | res_reg = _mm256_packus_epi32(res_reg, res_reg); |
1322 | 0 | res_reg = _mm256_min_epi16(res_reg, reg_max); |
1323 | | |
1324 | | // Save the result |
1325 | 0 | mm256_storeu2_epi64((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride), |
1326 | 0 | &res_reg); |
1327 | | |
1328 | | // Update the source by two rows |
1329 | 0 | src_ptr += src_stride_unrolled; |
1330 | 0 | dst_ptr += dst_stride_unrolled; |
1331 | |
|
1332 | 0 | src_reg_m1001 = src_reg_1223; |
1333 | 0 | src_reg_1 = src_reg_3; |
1334 | 0 | } |
1335 | 0 | } |
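
In the v4 kernels, _mm256_shuffle_epi32 with immediates 0x55 and 0xaa broadcasts the second and third 32-bit elements of the kernel register, i.e. the tap pairs (2,3) and (4,5): the 4-tap paths use only the middle four taps of the 8-tap kernel. A hedged scalar model of the whole function follows; the name is hypothetical, and src is assumed to point one row above the first output row, as the HIGH_FUN_CONV_1D callers arrange.

#include <stddef.h>
#include <stdint.h>

/* Hypothetical scalar reference for the 4-tap vertical paths. */
static void highbd_v4_ref(const uint16_t *src, ptrdiff_t src_stride,
                          uint16_t *dst, ptrdiff_t dst_stride, uint32_t width,
                          uint32_t height, const int16_t *kernel, int bd) {
  const int max = (1 << bd) - 1;
  uint32_t r, c;
  for (r = 0; r < height; ++r) {
    for (c = 0; c < width; ++c) {
      int sum = 0, t;
      /* Rows r-1..r+2 weighted by kernel[2..5]; src already points at the
         row above the first output row. */
      for (t = 0; t < 4; ++t)
        sum += src[(r + t) * src_stride + c] * kernel[t + 2];
      sum = (sum + 64) >> 7;    /* mm256_round_epi32, CONV8_ROUNDING_BITS */
      if (sum < 0) sum = 0;     /* _mm256_packus_epi32 saturates below 0 */
      if (sum > max) sum = max; /* _mm256_min_epi16 against reg_max */
      dst[r * dst_stride + c] = (uint16_t)sum;
    }
  }
}
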
1336 | | |
1337 | | static void vpx_highbd_filter_block1d8_v4_avx2( |
1338 | | const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, |
1339 | 0 | ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) { |
1340 | | // We will load two rows of pixels and rearrange them into the form |
1341 | | // ... s[1,0] s[0,0] s[0,0] s[-1,0] |
1342 | | // so that we can multiply and add with the kernel to get a partial output.
1343 | | // Then we can add another interleaved row pair to get the final output.
1344 | | |
1345 | | // Register for source s[-1:3, :] |
1346 | 0 | __m256i src_reg_1, src_reg_2, src_reg_3; |
1347 | | // Interleaved rows of the source. lo holds columns 0..3 of each lane, hi columns 4..7
1348 | 0 | __m256i src_reg_m10, src_reg_01, src_reg_12, src_reg_23; |
1349 | 0 | __m256i src_reg_m1001_lo, src_reg_m1001_hi, src_reg_1223_lo, src_reg_1223_hi; |
1350 | |
|
1351 | 0 | __m128i kernel_reg_128; // Kernel |
1352 | 0 | __m256i kernel_reg, kernel_reg_23, kernel_reg_45; // Segments of kernel |
1353 | | |
1354 | | // Result after multiply and add |
1355 | 0 | __m256i res_reg, res_reg_lo, res_reg_hi; |
1356 | |
|
1357 | 0 | const __m256i reg_round = |
1358 | 0 | _mm256_set1_epi32(CONV8_ROUNDING_NUM); // Used for rounding |
1359 | 0 | const __m256i reg_max = _mm256_set1_epi16((1 << bd) - 1); |
1360 | 0 | const ptrdiff_t src_stride_unrolled = src_stride << 1; |
1361 | 0 | const ptrdiff_t dst_stride_unrolled = dst_stride << 1; |
1362 | 0 | int h; |
1363 | | |
1364 | | // Load Kernel |
1365 | 0 | kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel); |
1366 | 0 | kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128); |
1367 | 0 | kernel_reg_23 = _mm256_shuffle_epi32(kernel_reg, 0x55); |
1368 | 0 | kernel_reg_45 = _mm256_shuffle_epi32(kernel_reg, 0xaa); |
1369 | | |
1370 | | // Row -1 to row 0 |
1371 | 0 | src_reg_m10 = mm256_loadu2_si128((const __m128i *)src_ptr, |
1372 | 0 | (const __m128i *)(src_ptr + src_stride)); |
1373 | | |
1374 | | // Row 0 to row 1 |
1375 | 0 | src_reg_1 = _mm256_castsi128_si256( |
1376 | 0 | _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2))); |
1377 | 0 | src_reg_01 = _mm256_permute2x128_si256(src_reg_m10, src_reg_1, 0x21); |
1378 | | |
1379 | | // First three rows |
1380 | 0 | src_reg_m1001_lo = _mm256_unpacklo_epi16(src_reg_m10, src_reg_01); |
1381 | 0 | src_reg_m1001_hi = _mm256_unpackhi_epi16(src_reg_m10, src_reg_01); |
1382 | |
|
1383 | 0 | for (h = height; h > 1; h -= 2) { |
1384 | 0 | src_reg_2 = _mm256_castsi128_si256( |
1385 | 0 | _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 3))); |
1386 | |
|
1387 | 0 | src_reg_12 = _mm256_inserti128_si256(src_reg_1, |
1388 | 0 | _mm256_castsi256_si128(src_reg_2), 1); |
1389 | |
|
1390 | 0 | src_reg_3 = _mm256_castsi128_si256( |
1391 | 0 | _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 4))); |
1392 | |
|
1393 | 0 | src_reg_23 = _mm256_inserti128_si256(src_reg_2, |
1394 | 0 | _mm256_castsi256_si128(src_reg_3), 1); |
1395 | | |
1396 | | // Last three rows |
1397 | 0 | src_reg_1223_lo = _mm256_unpacklo_epi16(src_reg_12, src_reg_23); |
1398 | 0 | src_reg_1223_hi = _mm256_unpackhi_epi16(src_reg_12, src_reg_23); |
1399 | | |
1400 | | // Output from first half |
1401 | 0 | res_reg_lo = mm256_madd_add_epi32(&src_reg_m1001_lo, &src_reg_1223_lo, |
1402 | 0 | &kernel_reg_23, &kernel_reg_45); |
1403 | | |
1404 | | // Output from second half |
1405 | 0 | res_reg_hi = mm256_madd_add_epi32(&src_reg_m1001_hi, &src_reg_1223_hi, |
1406 | 0 | &kernel_reg_23, &kernel_reg_45); |
1407 | | |
1408 | | // Round the words |
1409 | 0 | res_reg_lo = |
1410 | 0 | mm256_round_epi32(&res_reg_lo, ®_round, CONV8_ROUNDING_BITS); |
1411 | 0 | res_reg_hi = |
1412 | 0 | mm256_round_epi32(&res_reg_hi, ®_round, CONV8_ROUNDING_BITS); |
1413 | | |
1414 | | // Combine to get the result |
1415 | 0 | res_reg = _mm256_packus_epi32(res_reg_lo, res_reg_hi); |
1416 | 0 | res_reg = _mm256_min_epi16(res_reg, reg_max); |
1417 | | |
1418 | | // Save the result |
1419 | 0 | mm256_store2_si128((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride), |
1420 | 0 | &res_reg); |
1421 | | |
1422 | | // Update the source by two rows |
1423 | 0 | src_ptr += src_stride_unrolled; |
1424 | 0 | dst_ptr += dst_stride_unrolled; |
1425 | |
|
1426 | 0 | src_reg_m1001_lo = src_reg_1223_lo; |
1427 | 0 | src_reg_m1001_hi = src_reg_1223_hi; |
1428 | 0 | src_reg_1 = src_reg_3; |
1429 | 0 | } |
1430 | 0 | } |
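
The _lo/_hi split in the 8-wide kernel exists because AVX2 unpacks operate per 128-bit lane: _mm256_unpacklo_epi16 interleaves only the first four 16-bit columns of each lane, _mm256_unpackhi_epi16 the last four, so an 8-column row pair needs both. A small self-contained demo (not libvpx code, build with -mavx2 or equivalent) that prints the interleaving:

#include <stdio.h>
#include <immintrin.h>

int main(void) {
  short a[16], b[16], lo[16], hi[16];
  int i;
  for (i = 0; i < 16; ++i) {
    a[i] = (short)i;          /* "row 0":   0 .. 15  */
    b[i] = (short)(100 + i);  /* "row 1": 100 .. 115 */
  }
  const __m256i ra = _mm256_loadu_si256((const __m256i *)a);
  const __m256i rb = _mm256_loadu_si256((const __m256i *)b);
  _mm256_storeu_si256((__m256i *)lo, _mm256_unpacklo_epi16(ra, rb));
  _mm256_storeu_si256((__m256i *)hi, _mm256_unpackhi_epi16(ra, rb));
  /* lo: 0 100 1 101 2 102 3 103  8 108 9 109 10 110 11 111 */
  /* hi: 4 104 5 105 6 106 7 107  12 112 13 113 14 114 15 115 */
  for (i = 0; i < 16; ++i) printf("%d ", lo[i]);
  printf("\n");
  for (i = 0; i < 16; ++i) printf("%d ", hi[i]);
  printf("\n");
  return 0;
}
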
1431 | | |
1432 | | static void vpx_highbd_filter_block1d16_v4_avx2( |
1433 | | const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, |
1434 | 0 | ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) { |
1435 | 0 | vpx_highbd_filter_block1d8_v4_avx2(src_ptr, src_stride, dst_ptr, dst_stride, |
1436 | 0 | height, kernel, bd); |
1437 | 0 | vpx_highbd_filter_block1d8_v4_avx2(src_ptr + 8, src_stride, dst_ptr + 8, |
1438 | 0 | dst_stride, height, kernel, bd); |
1439 | 0 | } |
1440 | | |
1441 | | // From vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm. |
1442 | | highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_sse2; |
1443 | | highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_sse2; |
1444 | | |
1445 | | // From vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm. |
1446 | | highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_sse2; |
1447 | | highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_sse2; |
1448 | | |
1449 | 0 | #define vpx_highbd_filter_block1d4_h8_avx2 vpx_highbd_filter_block1d4_h8_sse2 |
1450 | 0 | #define vpx_highbd_filter_block1d4_h2_avx2 vpx_highbd_filter_block1d4_h2_sse2 |
1451 | 0 | #define vpx_highbd_filter_block1d4_v8_avx2 vpx_highbd_filter_block1d4_v8_sse2 |
1452 | 0 | #define vpx_highbd_filter_block1d4_v2_avx2 vpx_highbd_filter_block1d4_v2_sse2 |
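
// A likely rationale for these aliases, stated as an observation rather than
// from the source: a 4-pixel high-bitdepth row is only 64 bits wide, so a
// dedicated 256-bit AVX2 path for the 8-tap and 2-tap 4-wide cases would have
// little data per vector to work on; reusing the 128-bit SSE2 kernels is the
// pragmatic choice.
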
1453 | | |
1454 | | // Use the [vh]8 avg version because there is no [vh]4 avg implementation.
1455 | | #define vpx_highbd_filter_block1d16_v4_avg_avx2 \ |
1456 | 0 | vpx_highbd_filter_block1d16_v8_avg_avx2 |
1457 | | #define vpx_highbd_filter_block1d16_h4_avg_avx2 \ |
1458 | 0 | vpx_highbd_filter_block1d16_h8_avg_avx2 |
1459 | | #define vpx_highbd_filter_block1d8_v4_avg_avx2 \ |
1460 | 0 | vpx_highbd_filter_block1d8_v8_avg_avx2 |
1461 | | #define vpx_highbd_filter_block1d8_h4_avg_avx2 \ |
1462 | 0 | vpx_highbd_filter_block1d8_h8_avg_avx2 |
1463 | | #define vpx_highbd_filter_block1d4_v4_avg_avx2 \ |
1464 | 0 | vpx_highbd_filter_block1d4_v8_avg_avx2 |
1465 | | #define vpx_highbd_filter_block1d4_h4_avg_avx2 \ |
1466 | 0 | vpx_highbd_filter_block1d4_h8_avg_avx2 |
1467 | | |
1468 | | HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2, 0) |
1469 | | HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v, |
1470 | | src - src_stride * (num_taps / 2 - 1), , avx2, 0) |
1471 | | HIGH_FUN_CONV_2D(, avx2, 0) |
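
HIGH_FUN_CONV_1D and HIGH_FUN_CONV_2D come from vpx_dsp/x86/convolve.h and expand into the public vpx_highbd_convolve8_*_avx2 entry points. Loosely, the generated 1D function steps the source back by num_taps / 2 - 1 rows for the vertical case (as the third argument above shows) and dispatches on block width to the block1d kernels, falling back to the C version when the subpel step is not a full pixel. The sketch below paraphrases that shape for a 4-tap vertical filter; it is not the actual macro expansion, and the function name is illustrative.

/* Paraphrase of the dispatcher shape; see convolve.h for the real thing. */
static void convolve8_vert_sketch(const uint16_t *src, ptrdiff_t src_stride,
                                  uint16_t *dst, ptrdiff_t dst_stride,
                                  const int16_t *kernel, int w, int h, int bd) {
  src -= src_stride; /* num_taps / 2 - 1 == 1 for a 4-tap kernel */
  while (w >= 16) {
    vpx_highbd_filter_block1d16_v4_avx2(src, src_stride, dst, dst_stride, h,
                                        kernel, bd);
    src += 16;
    dst += 16;
    w -= 16;
  }
  if (w == 8) {
    vpx_highbd_filter_block1d8_v4_avx2(src, src_stride, dst, dst_stride, h,
                                       kernel, bd);
  } else if (w == 4) {
    vpx_highbd_filter_block1d4_v4_avx2(src, src_stride, dst, dst_stride, h,
                                       kernel, bd);
  }
}
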
1472 | | |
1473 | | // From vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm. |
1474 | | highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_avg_sse2; |
1475 | | highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_avg_sse2; |
1476 | | |
1477 | | // From vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm. |
1478 | | highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_avg_sse2; |
1479 | | highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_avg_sse2; |
1480 | | |
1481 | | #define vpx_highbd_filter_block1d4_h8_avg_avx2 \ |
1482 | 0 | vpx_highbd_filter_block1d4_h8_avg_sse2 |
1483 | | #define vpx_highbd_filter_block1d4_h2_avg_avx2 \ |
1484 | 0 | vpx_highbd_filter_block1d4_h2_avg_sse2 |
1485 | | #define vpx_highbd_filter_block1d4_v8_avg_avx2 \ |
1486 | 0 | vpx_highbd_filter_block1d4_v8_avg_sse2 |
1487 | | #define vpx_highbd_filter_block1d4_v2_avg_avx2 \ |
1488 | 0 | vpx_highbd_filter_block1d4_v2_avg_sse2 |
1489 | | |
1490 | | HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2, 1) |
1491 | | HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, |
1492 | | src - src_stride * (num_taps / 2 - 1), avg_, avx2, 1) |
1493 | | HIGH_FUN_CONV_2D(avg_, avx2, 1) |
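
The avg variants reuse the same filters but blend into the destination, and HIGH_FUN_CONV_2D builds the 2D case from the two 1D passes: a plain horizontal pass into an intermediate buffer tall enough for the vertical taps, then a vertical pass, with the averaging applied only in that second pass. A hedged outline follows, assuming the 8-tap path and the usual 64x64 maximum block; the real expansion in convolve.h differs in detail, and the outer function name is illustrative.

#include "vpx_ports/mem.h"

/* Outline only; the two called functions are the real generated entry
   points, the wrapper itself is a sketch. */
static void convolve8_avg_2d_sketch(const uint16_t *src, ptrdiff_t src_stride,
                                    uint16_t *dst, ptrdiff_t dst_stride,
                                    const InterpKernel *filter, int x0_q4,
                                    int x_step_q4, int y0_q4, int y_step_q4,
                                    int w, int h, int bd) {
  DECLARE_ALIGNED(32, uint16_t, temp[64 * (64 + 7)]);
  const int num_taps = 8; /* assume the 8-tap path for this sketch */
  const int intermediate_height = h + num_taps - 1;

  /* Pass 1: horizontal filter into temp, starting num_taps / 2 - 1 rows
     above the block so the vertical taps have their context. */
  vpx_highbd_convolve8_horiz_avx2(src - src_stride * (num_taps / 2 - 1),
                                  src_stride, temp, 64, filter, x0_q4,
                                  x_step_q4, y0_q4, y_step_q4, w,
                                  intermediate_height, bd);
  /* Pass 2: vertical filter plus destination averaging. */
  vpx_highbd_convolve8_avg_vert_avx2(temp + 64 * (num_taps / 2 - 1), 64, dst,
                                     dst_stride, filter, x0_q4, x_step_q4,
                                     y0_q4, y_step_q4, w, h, bd);
}
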
1494 | | |
1495 | | #undef HIGHBD_FUNC |