/src/libvpx/vpx_dsp/x86/highbd_sad_avx2.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright (c) 2022 The WebM project authors. All Rights Reserved. |
3 | | * |
4 | | * Use of this source code is governed by a BSD-style license |
5 | | * that can be found in the LICENSE file in the root of the source |
6 | | * tree. An additional intellectual property rights grant can be found |
7 | | * in the file PATENTS. All contributing project authors may |
8 | | * be found in the AUTHORS file in the root of the source tree. |
9 | | */ |
10 | | #include <immintrin.h> |
11 | | #include "./vpx_dsp_rtcd.h" |
12 | | #include "vpx/vpx_integer.h" |
13 | | |
14 | 0 | static VPX_FORCE_INLINE unsigned int calc_final(const __m256i sums_32) { |
15 | 0 | const __m256i t0 = _mm256_add_epi32(sums_32, _mm256_srli_si256(sums_32, 8)); |
16 | 0 | const __m256i t1 = _mm256_add_epi32(t0, _mm256_srli_si256(t0, 4)); |
17 | 0 | const __m128i sum = _mm_add_epi32(_mm256_castsi256_si128(t1), |
18 | 0 | _mm256_extractf128_si256(t1, 1)); |
19 | 0 | return (unsigned int)_mm_cvtsi128_si32(sum); |
20 | 0 | } |
21 | | |
22 | | static VPX_FORCE_INLINE void highbd_sad64xH(__m256i *sums_16, |
23 | | const uint16_t *src, int src_stride, |
24 | | uint16_t *ref, int ref_stride, |
25 | 0 | int height) { |
26 | 0 | int i; |
27 | 0 | for (i = 0; i < height; ++i) { |
28 | | // load src and all ref[] |
29 | 0 | const __m256i s0 = _mm256_load_si256((const __m256i *)src); |
30 | 0 | const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16)); |
31 | 0 | const __m256i s2 = _mm256_load_si256((const __m256i *)(src + 32)); |
32 | 0 | const __m256i s3 = _mm256_load_si256((const __m256i *)(src + 48)); |
33 | 0 | const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref); |
34 | 0 | const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + 16)); |
35 | 0 | const __m256i r2 = _mm256_loadu_si256((const __m256i *)(ref + 32)); |
36 | 0 | const __m256i r3 = _mm256_loadu_si256((const __m256i *)(ref + 48)); |
37 | | // absolute differences between every ref[] to src |
38 | 0 | const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(r0, s0)); |
39 | 0 | const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(r1, s1)); |
40 | 0 | const __m256i abs_diff2 = _mm256_abs_epi16(_mm256_sub_epi16(r2, s2)); |
41 | 0 | const __m256i abs_diff3 = _mm256_abs_epi16(_mm256_sub_epi16(r3, s3)); |
42 | | // sum every abs diff |
43 | 0 | *sums_16 = |
44 | 0 | _mm256_add_epi16(*sums_16, _mm256_add_epi16(abs_diff0, abs_diff1)); |
45 | 0 | *sums_16 = |
46 | 0 | _mm256_add_epi16(*sums_16, _mm256_add_epi16(abs_diff2, abs_diff3)); |
47 | |
|
48 | 0 | src += src_stride; |
49 | 0 | ref += ref_stride; |
50 | 0 | } |
51 | 0 | } |
52 | | |
53 | | static VPX_FORCE_INLINE unsigned int highbd_sad64xN_avx2(const uint8_t *src_ptr, |
54 | | int src_stride, |
55 | | const uint8_t *ref_ptr, |
56 | | int ref_stride, |
57 | 0 | int n) { |
58 | 0 | const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); |
59 | 0 | uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); |
60 | 0 | __m256i sums_32 = _mm256_setzero_si256(); |
61 | 0 | int i; |
62 | |
|
63 | 0 | for (i = 0; i < (n / 2); ++i) { |
64 | 0 | __m256i sums_16 = _mm256_setzero_si256(); |
65 | |
|
66 | 0 | highbd_sad64xH(&sums_16, src, src_stride, ref, ref_stride, 2); |
67 | | |
68 | | /* sums_16 will outrange after 2 rows, so add current sums_16 to |
69 | | * sums_32*/ |
70 | 0 | sums_32 = _mm256_add_epi32( |
71 | 0 | sums_32, |
72 | 0 | _mm256_add_epi32( |
73 | 0 | _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), |
74 | 0 | _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); |
75 | |
|
76 | 0 | src += src_stride << 1; |
77 | 0 | ref += ref_stride << 1; |
78 | 0 | } |
79 | 0 | return calc_final(sums_32); |
80 | 0 | } |
81 | | |
// Defines vpx_highbd_sad64x<n>_avx2(): SAD of a 64x<n> high-bitdepth block.
#define HIGHBD_SAD64XN(n)                                                     \
  unsigned int vpx_highbd_sad64x##n##_avx2(const uint8_t *src, int src_stride, \
                                           const uint8_t *ref,                \
                                           int ref_stride) {                  \
    return highbd_sad64xN_avx2(src, src_stride, ref, ref_stride, n);          \
  }
88 | | |
// Defines vpx_highbd_sad_skip_64x<n>_avx2(): SAD over every other row
// (strides doubled, n/2 rows), scaled by 2 to approximate the full SAD.
#define HIGHBD_SADSKIP64xN(n)                                              \
  unsigned int vpx_highbd_sad_skip_64x##n##_avx2(                          \
      const uint8_t *src, int src_stride, const uint8_t *ref,              \
      int ref_stride) {                                                    \
    return 2 * highbd_sad64xN_avx2(src, 2 * src_stride, ref, 2 * ref_stride, \
                                   n / 2);                                 \
  }
96 | | |
97 | | static VPX_FORCE_INLINE void highbd_sad32xH(__m256i *sums_16, |
98 | | const uint16_t *src, int src_stride, |
99 | | uint16_t *ref, int ref_stride, |
100 | 0 | int height) { |
101 | 0 | int i; |
102 | 0 | for (i = 0; i < height; ++i) { |
103 | | // load src and all ref[] |
104 | 0 | const __m256i s0 = _mm256_load_si256((const __m256i *)src); |
105 | 0 | const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16)); |
106 | 0 | const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref); |
107 | 0 | const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + 16)); |
108 | | // absolute differences between every ref[] to src |
109 | 0 | const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(r0, s0)); |
110 | 0 | const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(r1, s1)); |
111 | | // sum every abs diff |
112 | 0 | *sums_16 = _mm256_add_epi16(*sums_16, abs_diff0); |
113 | 0 | *sums_16 = _mm256_add_epi16(*sums_16, abs_diff1); |
114 | |
|
115 | 0 | src += src_stride; |
116 | 0 | ref += ref_stride; |
117 | 0 | } |
118 | 0 | } |
119 | | |
120 | | static VPX_FORCE_INLINE unsigned int highbd_sad32xN_avx2(const uint8_t *src_ptr, |
121 | | int src_stride, |
122 | | const uint8_t *ref_ptr, |
123 | | int ref_stride, |
124 | 0 | int n) { |
125 | 0 | const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); |
126 | 0 | uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); |
127 | 0 | __m256i sums_32 = _mm256_setzero_si256(); |
128 | 0 | int i; |
129 | |
|
130 | 0 | for (i = 0; i < (n / 8); ++i) { |
131 | 0 | __m256i sums_16 = _mm256_setzero_si256(); |
132 | |
|
133 | 0 | highbd_sad32xH(&sums_16, src, src_stride, ref, ref_stride, 8); |
134 | | |
135 | | /* sums_16 will outrange after 8 rows, so add current sums_16 to |
136 | | * sums_32*/ |
137 | 0 | sums_32 = _mm256_add_epi32( |
138 | 0 | sums_32, |
139 | 0 | _mm256_add_epi32( |
140 | 0 | _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), |
141 | 0 | _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); |
142 | |
|
143 | 0 | src += src_stride << 3; |
144 | 0 | ref += ref_stride << 3; |
145 | 0 | } |
146 | 0 | return calc_final(sums_32); |
147 | 0 | } |
148 | | |
// Defines vpx_highbd_sad32x<n>_avx2(): SAD of a 32x<n> high-bitdepth block.
#define HIGHBD_SAD32XN(n)                                                     \
  unsigned int vpx_highbd_sad32x##n##_avx2(const uint8_t *src, int src_stride, \
                                           const uint8_t *ref,                \
                                           int ref_stride) {                  \
    return highbd_sad32xN_avx2(src, src_stride, ref, ref_stride, n);          \
  }
155 | | |
// Defines vpx_highbd_sad_skip_32x<n>_avx2(): SAD over every other row
// (strides doubled, n/2 rows), scaled by 2 to approximate the full SAD.
#define HIGHBD_SADSKIP32xN(n)                                              \
  unsigned int vpx_highbd_sad_skip_32x##n##_avx2(                          \
      const uint8_t *src, int src_stride, const uint8_t *ref,              \
      int ref_stride) {                                                    \
    return 2 * highbd_sad32xN_avx2(src, 2 * src_stride, ref, 2 * ref_stride, \
                                   n / 2);                                 \
  }
163 | | |
164 | | static VPX_FORCE_INLINE void highbd_sad16xH(__m256i *sums_16, |
165 | | const uint16_t *src, int src_stride, |
166 | | uint16_t *ref, int ref_stride, |
167 | 0 | int height) { |
168 | 0 | int i; |
169 | 0 | for (i = 0; i < height; i += 2) { |
170 | | // load src and all ref[] |
171 | 0 | const __m256i s0 = _mm256_load_si256((const __m256i *)src); |
172 | 0 | const __m256i s1 = _mm256_load_si256((const __m256i *)(src + src_stride)); |
173 | 0 | const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref); |
174 | 0 | const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + ref_stride)); |
175 | | // absolute differences between every ref[] to src |
176 | 0 | const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(r0, s0)); |
177 | 0 | const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(r1, s1)); |
178 | | // sum every abs diff |
179 | 0 | *sums_16 = _mm256_add_epi16(*sums_16, abs_diff0); |
180 | 0 | *sums_16 = _mm256_add_epi16(*sums_16, abs_diff1); |
181 | |
|
182 | 0 | src += src_stride << 1; |
183 | 0 | ref += ref_stride << 1; |
184 | 0 | } |
185 | 0 | } |
186 | | |
187 | | static VPX_FORCE_INLINE unsigned int highbd_sad16xN_avx2(const uint8_t *src_ptr, |
188 | | int src_stride, |
189 | | const uint8_t *ref_ptr, |
190 | | int ref_stride, |
191 | 0 | int n) { |
192 | 0 | const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); |
193 | 0 | uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); |
194 | 0 | __m256i sums_32 = _mm256_setzero_si256(); |
195 | 0 | const int height = VPXMIN(16, n); |
196 | 0 | const int num_iters = n / height; |
197 | 0 | int i; |
198 | |
|
199 | 0 | for (i = 0; i < num_iters; ++i) { |
200 | 0 | __m256i sums_16 = _mm256_setzero_si256(); |
201 | |
|
202 | 0 | highbd_sad16xH(&sums_16, src, src_stride, ref, ref_stride, height); |
203 | | |
204 | | // sums_16 will outrange after 16 rows, so add current sums_16 to sums_32 |
205 | 0 | sums_32 = _mm256_add_epi32( |
206 | 0 | sums_32, |
207 | 0 | _mm256_add_epi32( |
208 | 0 | _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), |
209 | 0 | _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); |
210 | |
|
211 | 0 | src += src_stride << 4; |
212 | 0 | ref += ref_stride << 4; |
213 | 0 | } |
214 | 0 | return calc_final(sums_32); |
215 | 0 | } |
216 | | |
// Defines vpx_highbd_sad16x<n>_avx2(): SAD of a 16x<n> high-bitdepth block.
#define HIGHBD_SAD16XN(n)                                                     \
  unsigned int vpx_highbd_sad16x##n##_avx2(const uint8_t *src, int src_stride, \
                                           const uint8_t *ref,                \
                                           int ref_stride) {                  \
    return highbd_sad16xN_avx2(src, src_stride, ref, ref_stride, n);          \
  }
223 | | |
// Defines vpx_highbd_sad_skip_16x<n>_avx2(): SAD over every other row
// (strides doubled, n/2 rows), scaled by 2 to approximate the full SAD.
#define HIGHBD_SADSKIP16xN(n)                                              \
  unsigned int vpx_highbd_sad_skip_16x##n##_avx2(                          \
      const uint8_t *src, int src_stride, const uint8_t *ref,              \
      int ref_stride) {                                                    \
    return 2 * highbd_sad16xN_avx2(src, 2 * src_stride, ref, 2 * ref_stride, \
                                   n / 2);                                 \
  }
231 | | |
// SAD of a 16x16 high-bitdepth block. 16 rows fit in the 16-bit
// accumulator, so the widening to 32 bits happens exactly once.
unsigned int vpx_highbd_sad16x16_avx2(const uint8_t *src_ptr, int src_stride,
                                      const uint8_t *ref_ptr, int ref_stride) {
  const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
  __m256i sums_16 = _mm256_setzero_si256();

  highbd_sad16xH(&sums_16, src, src_stride, ref, ref_stride, 16);

  {
    const __m256i lo = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16));
    const __m256i hi =
        _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1));
    return calc_final(_mm256_add_epi32(lo, hi));
  }
}
247 | | |
// SAD of a 16x8 high-bitdepth block. 8 rows fit in the 16-bit
// accumulator, so the widening to 32 bits happens exactly once.
unsigned int vpx_highbd_sad16x8_avx2(const uint8_t *src_ptr, int src_stride,
                                     const uint8_t *ref_ptr, int ref_stride) {
  const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
  __m256i sums_16 = _mm256_setzero_si256();

  highbd_sad16xH(&sums_16, src, src_stride, ref, ref_stride, 8);

  {
    const __m256i lo = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16));
    const __m256i hi =
        _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1));
    return calc_final(_mm256_add_epi32(lo, hi));
  }
}
263 | | |
// Instantiate the plain and skip SAD entry points. 16x16 and 16x8 have
// bespoke implementations below instead of the HIGHBD_SAD16XN macro.
// clang-format off
HIGHBD_SAD64XN(64)
HIGHBD_SADSKIP64xN(64)
HIGHBD_SAD64XN(32)
HIGHBD_SADSKIP64xN(32)
HIGHBD_SAD32XN(64)
HIGHBD_SADSKIP32xN(64)
HIGHBD_SAD32XN(32)
HIGHBD_SADSKIP32xN(32)
HIGHBD_SAD32XN(16)
HIGHBD_SADSKIP32xN(16)
HIGHBD_SAD16XN(32)
HIGHBD_SADSKIP16xN(32)
HIGHBD_SADSKIP16xN(16)
HIGHBD_SADSKIP16xN(8)
// clang-format on
280 | | |
281 | | // AVG ------------------------------------------------------------------------- |
282 | | static VPX_FORCE_INLINE void highbd_sad64xH_avg(__m256i *sums_16, |
283 | | const uint16_t *src, |
284 | | int src_stride, uint16_t *ref, |
285 | | int ref_stride, uint16_t *sec, |
286 | 0 | int height) { |
287 | 0 | int i; |
288 | 0 | for (i = 0; i < height; ++i) { |
289 | | // load src and all ref[] |
290 | 0 | const __m256i s0 = _mm256_load_si256((const __m256i *)src); |
291 | 0 | const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16)); |
292 | 0 | const __m256i s2 = _mm256_load_si256((const __m256i *)(src + 32)); |
293 | 0 | const __m256i s3 = _mm256_load_si256((const __m256i *)(src + 48)); |
294 | 0 | const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref); |
295 | 0 | const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + 16)); |
296 | 0 | const __m256i r2 = _mm256_loadu_si256((const __m256i *)(ref + 32)); |
297 | 0 | const __m256i r3 = _mm256_loadu_si256((const __m256i *)(ref + 48)); |
298 | 0 | const __m256i x0 = _mm256_loadu_si256((const __m256i *)sec); |
299 | 0 | const __m256i x1 = _mm256_loadu_si256((const __m256i *)(sec + 16)); |
300 | 0 | const __m256i x2 = _mm256_loadu_si256((const __m256i *)(sec + 32)); |
301 | 0 | const __m256i x3 = _mm256_loadu_si256((const __m256i *)(sec + 48)); |
302 | 0 | const __m256i avg0 = _mm256_avg_epu16(r0, x0); |
303 | 0 | const __m256i avg1 = _mm256_avg_epu16(r1, x1); |
304 | 0 | const __m256i avg2 = _mm256_avg_epu16(r2, x2); |
305 | 0 | const __m256i avg3 = _mm256_avg_epu16(r3, x3); |
306 | | // absolute differences between every ref/pred avg to src |
307 | 0 | const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(avg0, s0)); |
308 | 0 | const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(avg1, s1)); |
309 | 0 | const __m256i abs_diff2 = _mm256_abs_epi16(_mm256_sub_epi16(avg2, s2)); |
310 | 0 | const __m256i abs_diff3 = _mm256_abs_epi16(_mm256_sub_epi16(avg3, s3)); |
311 | | // sum every abs diff |
312 | 0 | *sums_16 = |
313 | 0 | _mm256_add_epi16(*sums_16, _mm256_add_epi16(abs_diff0, abs_diff1)); |
314 | 0 | *sums_16 = |
315 | 0 | _mm256_add_epi16(*sums_16, _mm256_add_epi16(abs_diff2, abs_diff3)); |
316 | |
|
317 | 0 | src += src_stride; |
318 | 0 | ref += ref_stride; |
319 | 0 | sec += 64; |
320 | 0 | } |
321 | 0 | } |
322 | | |
// Defines vpx_highbd_sad64x<n>_avg_avx2(): SAD of a 64x<n> high-bitdepth
// block measured against the average of ref and second_pred. Works in
// 2-row slices, widening the 16-bit sums into sums_32 after each slice.
#define HIGHBD_SAD64XN_AVG(n)                                                 \
  unsigned int vpx_highbd_sad64x##n##_avg_avx2(                               \
      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,         \
      int ref_stride, const uint8_t *second_pred) {                           \
    const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);                       \
    uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);                             \
    uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred);                         \
    __m256i sums_32 = _mm256_setzero_si256();                                 \
    int i;                                                                    \
                                                                              \
    for (i = 0; i < (n / 2); ++i) {                                           \
      __m256i sums_16 = _mm256_setzero_si256();                               \
                                                                              \
      highbd_sad64xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 2); \
                                                                              \
      /* sums_16 will outrange after 2 rows, so add current sums_16 to        \
       * sums_32*/                                                            \
      sums_32 = _mm256_add_epi32(                                             \
          sums_32,                                                            \
          _mm256_add_epi32(                                                   \
              _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),         \
              _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1))));  \
                                                                              \
      src += src_stride << 1;                                                 \
      ref += ref_stride << 1;                                                 \
      sec += 64 << 1;                                                         \
    }                                                                         \
    return calc_final(sums_32);                                               \
  }

// 64x64
HIGHBD_SAD64XN_AVG(64)

// 64x32
HIGHBD_SAD64XN_AVG(32)
358 | | |
359 | | static VPX_FORCE_INLINE void highbd_sad32xH_avg(__m256i *sums_16, |
360 | | const uint16_t *src, |
361 | | int src_stride, uint16_t *ref, |
362 | | int ref_stride, uint16_t *sec, |
363 | 0 | int height) { |
364 | 0 | int i; |
365 | 0 | for (i = 0; i < height; ++i) { |
366 | | // load src and all ref[] |
367 | 0 | const __m256i s0 = _mm256_load_si256((const __m256i *)src); |
368 | 0 | const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16)); |
369 | 0 | const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref); |
370 | 0 | const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + 16)); |
371 | 0 | const __m256i x0 = _mm256_loadu_si256((const __m256i *)sec); |
372 | 0 | const __m256i x1 = _mm256_loadu_si256((const __m256i *)(sec + 16)); |
373 | 0 | const __m256i avg0 = _mm256_avg_epu16(r0, x0); |
374 | 0 | const __m256i avg1 = _mm256_avg_epu16(r1, x1); |
375 | | // absolute differences between every ref/pred avg to src |
376 | 0 | const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(avg0, s0)); |
377 | 0 | const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(avg1, s1)); |
378 | | // sum every abs diff |
379 | 0 | *sums_16 = _mm256_add_epi16(*sums_16, abs_diff0); |
380 | 0 | *sums_16 = _mm256_add_epi16(*sums_16, abs_diff1); |
381 | |
|
382 | 0 | src += src_stride; |
383 | 0 | ref += ref_stride; |
384 | 0 | sec += 32; |
385 | 0 | } |
386 | 0 | } |
387 | | |
// Defines vpx_highbd_sad32x<n>_avg_avx2(): SAD of a 32x<n> high-bitdepth
// block measured against the average of ref and second_pred. Works in
// 8-row slices, widening the 16-bit sums into sums_32 after each slice.
#define HIGHBD_SAD32XN_AVG(n)                                                 \
  unsigned int vpx_highbd_sad32x##n##_avg_avx2(                               \
      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,         \
      int ref_stride, const uint8_t *second_pred) {                           \
    const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);                       \
    uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);                             \
    uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred);                         \
    __m256i sums_32 = _mm256_setzero_si256();                                 \
    int i;                                                                    \
                                                                              \
    for (i = 0; i < (n / 8); ++i) {                                           \
      __m256i sums_16 = _mm256_setzero_si256();                               \
                                                                              \
      highbd_sad32xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 8); \
                                                                              \
      /* sums_16 will outrange after 8 rows, so add current sums_16 to        \
       * sums_32*/                                                            \
      sums_32 = _mm256_add_epi32(                                             \
          sums_32,                                                            \
          _mm256_add_epi32(                                                   \
              _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),         \
              _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1))));  \
                                                                              \
      src += src_stride << 3;                                                 \
      ref += ref_stride << 3;                                                 \
      sec += 32 << 3;                                                         \
    }                                                                         \
    return calc_final(sums_32);                                               \
  }

// 32x64
HIGHBD_SAD32XN_AVG(64)

// 32x32
HIGHBD_SAD32XN_AVG(32)

// 32x16
HIGHBD_SAD32XN_AVG(16)
426 | | |
427 | | static VPX_FORCE_INLINE void highbd_sad16xH_avg(__m256i *sums_16, |
428 | | const uint16_t *src, |
429 | | int src_stride, uint16_t *ref, |
430 | | int ref_stride, uint16_t *sec, |
431 | 0 | int height) { |
432 | 0 | int i; |
433 | 0 | for (i = 0; i < height; i += 2) { |
434 | | // load src and all ref[] |
435 | 0 | const __m256i s0 = _mm256_load_si256((const __m256i *)src); |
436 | 0 | const __m256i s1 = _mm256_load_si256((const __m256i *)(src + src_stride)); |
437 | 0 | const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref); |
438 | 0 | const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + ref_stride)); |
439 | 0 | const __m256i x0 = _mm256_loadu_si256((const __m256i *)sec); |
440 | 0 | const __m256i x1 = _mm256_loadu_si256((const __m256i *)(sec + 16)); |
441 | 0 | const __m256i avg0 = _mm256_avg_epu16(r0, x0); |
442 | 0 | const __m256i avg1 = _mm256_avg_epu16(r1, x1); |
443 | | // absolute differences between every ref[] to src |
444 | 0 | const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(avg0, s0)); |
445 | 0 | const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(avg1, s1)); |
446 | | // sum every abs diff |
447 | 0 | *sums_16 = _mm256_add_epi16(*sums_16, abs_diff0); |
448 | 0 | *sums_16 = _mm256_add_epi16(*sums_16, abs_diff1); |
449 | |
|
450 | 0 | src += src_stride << 1; |
451 | 0 | ref += ref_stride << 1; |
452 | 0 | sec += 32; |
453 | 0 | } |
454 | 0 | } |
455 | | |
// SAD of a 16x32 high-bitdepth block against the average of ref and
// second_pred. Processed as two 16-row halves; the 16-bit accumulator
// is widened into sums_32 after each half.
unsigned int vpx_highbd_sad16x32_avg_avx2(const uint8_t *src_ptr,
                                          int src_stride,
                                          const uint8_t *ref_ptr,
                                          int ref_stride,
                                          const uint8_t *second_pred) {
  const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
  uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred);
  __m256i sums_32 = _mm256_setzero_si256();
  int half;

  for (half = 0; half < 2; ++half) {
    __m256i sums_16 = _mm256_setzero_si256();

    highbd_sad16xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 16);

    // sums_16 would outrange after 16 rows; widen into sums_32 here.
    {
      const __m256i lo =
          _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16));
      const __m256i hi =
          _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1));
      sums_32 = _mm256_add_epi32(sums_32, _mm256_add_epi32(lo, hi));
    }

    src += src_stride << 4;
    ref += ref_stride << 4;
    sec += 16 << 4;  // 16 rows of 16 contiguous predictor pixels
  }
  return calc_final(sums_32);
}
485 | | |
// SAD of a 16x16 high-bitdepth block against the average of ref and
// second_pred. 16 rows fit in the 16-bit accumulator, so the widening
// to 32 bits happens exactly once.
unsigned int vpx_highbd_sad16x16_avg_avx2(const uint8_t *src_ptr,
                                          int src_stride,
                                          const uint8_t *ref_ptr,
                                          int ref_stride,
                                          const uint8_t *second_pred) {
  const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
  uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred);
  __m256i sums_16 = _mm256_setzero_si256();

  highbd_sad16xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 16);

  {
    const __m256i lo = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16));
    const __m256i hi =
        _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1));
    return calc_final(_mm256_add_epi32(lo, hi));
  }
}
505 | | |
// SAD of a 16x8 high-bitdepth block against the average of ref and
// second_pred. 8 rows fit in the 16-bit accumulator, so the widening
// to 32 bits happens exactly once.
unsigned int vpx_highbd_sad16x8_avg_avx2(const uint8_t *src_ptr, int src_stride,
                                         const uint8_t *ref_ptr, int ref_stride,
                                         const uint8_t *second_pred) {
  const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
  uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred);
  __m256i sums_16 = _mm256_setzero_si256();

  highbd_sad16xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 8);

  {
    const __m256i lo = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16));
    const __m256i hi =
        _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1));
    return calc_final(_mm256_add_epi32(lo, hi));
  }
}