/src/libvpx/vpx_dsp/x86/highbd_sad4d_avx2.c
Line | Count | Source
1 | | /* |
2 | | * Copyright (c) 2022 The WebM project authors. All Rights Reserved. |
3 | | * |
4 | | * Use of this source code is governed by a BSD-style license |
5 | | * that can be found in the LICENSE file in the root of the source |
6 | | * tree. An additional intellectual property rights grant can be found |
7 | | * in the file PATENTS. All contributing project authors may |
8 | | * be found in the AUTHORS file in the root of the source tree. |
9 | | */ |
10 | | #include <immintrin.h> // AVX2 |
11 | | #include "./vpx_dsp_rtcd.h" |
12 | | #include "vpx/vpx_integer.h" |
13 | | |
14 | | static VPX_FORCE_INLINE void calc_final_4(const __m256i *const sums /*[4]*/, |
15 | 0 | uint32_t sad_array[4]) { |
16 | 0 | const __m256i t0 = _mm256_hadd_epi32(sums[0], sums[1]); |
17 | 0 | const __m256i t1 = _mm256_hadd_epi32(sums[2], sums[3]); |
18 | 0 | const __m256i t2 = _mm256_hadd_epi32(t0, t1); |
19 | 0 | const __m128i sum = _mm_add_epi32(_mm256_castsi256_si128(t2), |
20 | 0 | _mm256_extractf128_si256(t2, 1)); |
21 | 0 | _mm_storeu_si128((__m128i *)sad_array, sum); |
22 | 0 | } |
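The function above collapses each of the four 8-lane 32-bit accumulators into one SAD: the two hadd steps interleave the four inputs so that, after the low and high 128-bit halves are added, element i of the store is the horizontal sum of sums[i]. A minimal scalar sketch of the equivalent reduction (the helper name is hypothetical, not part of this file):

/* Scalar equivalent of calc_final_4: each output is the horizontal sum of the
 * eight 32-bit lanes of the corresponding sums[] vector. */
static void calc_final_4_scalar(const uint32_t sums[4][8],
                                uint32_t sad_array[4]) {
  int i, lane;
  for (i = 0; i < 4; ++i) {
    uint32_t total = 0;
    for (lane = 0; lane < 8; ++lane) total += sums[i][lane];
    sad_array[i] = total;
  }
}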
23 | | |
24 | | static VPX_FORCE_INLINE void highbd_sad64xHx4d(__m256i *sums_16 /*[4]*/, |
25 | | const uint16_t *src, |
26 | | int src_stride, |
27 | | uint16_t *refs[4], |
28 | 0 | int ref_stride, int height) { |
29 | 0 | int i; |
30 | 0 | for (i = 0; i < height; ++i) { |
31 | | // load src and all ref[] |
32 | 0 | const __m256i s0 = _mm256_load_si256((const __m256i *)src); |
33 | 0 | const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16)); |
34 | 0 | const __m256i s2 = _mm256_load_si256((const __m256i *)(src + 32)); |
35 | 0 | const __m256i s3 = _mm256_load_si256((const __m256i *)(src + 48)); |
36 | 0 | int x; |
37 | |
38 | 0 | for (x = 0; x < 4; ++x) { |
39 | 0 | __m256i r[4]; |
40 | 0 | r[0] = _mm256_loadu_si256((const __m256i *)refs[x]); |
41 | 0 | r[1] = _mm256_loadu_si256((const __m256i *)(refs[x] + 16)); |
42 | 0 | r[2] = _mm256_loadu_si256((const __m256i *)(refs[x] + 32)); |
43 | 0 | r[3] = _mm256_loadu_si256((const __m256i *)(refs[x] + 48)); |
44 | | |
45 | | // absolute differences between every ref[] and src
46 | 0 | r[0] = _mm256_abs_epi16(_mm256_sub_epi16(r[0], s0)); |
47 | 0 | r[1] = _mm256_abs_epi16(_mm256_sub_epi16(r[1], s1)); |
48 | 0 | r[2] = _mm256_abs_epi16(_mm256_sub_epi16(r[2], s2)); |
49 | 0 | r[3] = _mm256_abs_epi16(_mm256_sub_epi16(r[3], s3)); |
50 | | |
51 | | // sum every abs diff |
52 | 0 | sums_16[x] = _mm256_add_epi16(sums_16[x], _mm256_add_epi16(r[0], r[1])); |
53 | 0 | sums_16[x] = _mm256_add_epi16(sums_16[x], _mm256_add_epi16(r[2], r[3])); |
54 | 0 | } |
55 | |
56 | 0 | src += src_stride; |
57 | 0 | refs[0] += ref_stride; |
58 | 0 | refs[1] += ref_stride; |
59 | 0 | refs[2] += ref_stride; |
60 | 0 | refs[3] += ref_stride; |
61 | 0 | } |
62 | 0 | } |
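Each iteration of the loop above reads one 64-pixel source row as four 256-bit vectors and, for each of the four references, folds the lane-wise absolute differences into a 16-bit accumulator. A scalar sketch of one row's contribution to sums_16[x] (hypothetical reference code, assuming the same column-to-lane mapping as the four contiguous loads):

/* Lane j of the 16-lane accumulator gathers the abs diffs of columns
 * j, j+16, j+32 and j+48, mirroring r[0]..r[3] above. */
static void add_row_64_scalar(uint16_t lanes[16], const uint16_t *src,
                              const uint16_t *ref) {
  int j, k;
  for (j = 0; j < 16; ++j) {
    for (k = 0; k < 4; ++k) {
      const int d = ref[16 * k + j] - src[16 * k + j];
      lanes[j] += (uint16_t)(d < 0 ? -d : d); /* wraps at 16 bits, like the epi16 adds */
    }
  }
}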
63 | | |
64 | | static VPX_FORCE_INLINE void highbd_sad64xNx4d_avx2( |
65 | | const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], |
66 | 0 | int ref_stride, uint32_t sad_array[4], int n) { |
67 | 0 | const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); |
68 | 0 | uint16_t *refs[4]; |
69 | 0 | __m256i sums_16[4]; |
70 | 0 | __m256i sums_32[4]; |
71 | 0 | int i; |
72 | |
73 | 0 | refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]); |
74 | 0 | refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]); |
75 | 0 | refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]); |
76 | 0 | refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]); |
77 | 0 | sums_32[0] = _mm256_setzero_si256(); |
78 | 0 | sums_32[1] = _mm256_setzero_si256(); |
79 | 0 | sums_32[2] = _mm256_setzero_si256(); |
80 | 0 | sums_32[3] = _mm256_setzero_si256(); |
81 | |
82 | 0 | for (i = 0; i < (n / 2); ++i) { |
83 | 0 | sums_16[0] = _mm256_setzero_si256(); |
84 | 0 | sums_16[1] = _mm256_setzero_si256(); |
85 | 0 | sums_16[2] = _mm256_setzero_si256(); |
86 | 0 | sums_16[3] = _mm256_setzero_si256(); |
87 | |
88 | 0 | highbd_sad64xHx4d(sums_16, src, src_stride, refs, ref_stride, 2); |
89 | | |
90 | | /* sums_16 may overflow after 2 rows, so add the current sums_16 to
91 | | * sums_32 */
92 | 0 | sums_32[0] = _mm256_add_epi32( |
93 | 0 | sums_32[0], |
94 | 0 | _mm256_add_epi32( |
95 | 0 | _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])), |
96 | 0 | _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[0], 1)))); |
97 | 0 | sums_32[1] = _mm256_add_epi32( |
98 | 0 | sums_32[1], |
99 | 0 | _mm256_add_epi32( |
100 | 0 | _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])), |
101 | 0 | _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[1], 1)))); |
102 | 0 | sums_32[2] = _mm256_add_epi32( |
103 | 0 | sums_32[2], |
104 | 0 | _mm256_add_epi32( |
105 | 0 | _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])), |
106 | 0 | _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[2], 1)))); |
107 | 0 | sums_32[3] = _mm256_add_epi32( |
108 | 0 | sums_32[3], |
109 | 0 | _mm256_add_epi32( |
110 | 0 | _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])), |
111 | 0 | _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[3], 1)))); |
112 | |
113 | 0 | src += src_stride << 1; |
114 | 0 | } |
115 | 0 | calc_final_4(sums_32, sad_array); |
116 | 0 | } |
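The widening step above is needed because sums_16 holds only 16 bits per lane. In this 64-wide path each lane picks up 4 absolute differences per row, so one 2-row pass adds at most 2 * 4 * 4095 = 32760 per lane for 12-bit input, which fits comfortably in 16 bits; the pass totals are then folded into the 32-bit sums_32 accumulators. The 32-wide and 16-wide helpers below follow the same scheme with 8-row and 16-row passes, keeping the per-lane worst case at 16 * 4095 = 65520 (a back-of-the-envelope bound assuming 12-bit samples, the deepest bit depth libvpx supports).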
117 | | |
118 | | #define HIGHBD_SAD64XNX4D(n) \ |
119 | | void vpx_highbd_sad64x##n##x4d_avx2(const uint8_t *src, int src_stride, \ |
120 | | const uint8_t *const ref_array[4], \ |
121 | 0 | int ref_stride, uint32_t sad_array[4]) { \ |
122 | 0 | highbd_sad64xNx4d_avx2(src, src_stride, ref_array, ref_stride, sad_array, \ |
123 | 0 | n); \ |
124 | 0 | }
Unexecuted instantiation: vpx_highbd_sad64x64x4d_avx2
Unexecuted instantiation: vpx_highbd_sad64x32x4d_avx2
125 | | |
126 | | #define HIGHBD_SADSKIP64XNx4D(n) \ |
127 | | void vpx_highbd_sad_skip_64x##n##x4d_avx2( \ |
128 | | const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \ |
129 | 0 | int ref_stride, uint32_t sad_array[4]) { \ |
130 | 0 | highbd_sad64xNx4d_avx2(src, 2 * src_stride, ref_array, 2 * ref_stride, \ |
131 | 0 | sad_array, n / 2); \ |
132 | 0 | sad_array[0] <<= 1; \ |
133 | 0 | sad_array[1] <<= 1; \ |
134 | 0 | sad_array[2] <<= 1; \ |
135 | 0 | sad_array[3] <<= 1; \ |
136 | 0 | }
Unexecuted instantiation: vpx_highbd_sad_skip_64x64x4d_avx2
Unexecuted instantiation: vpx_highbd_sad_skip_64x32x4d_avx2
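The skip wrappers trade accuracy for speed: doubling both strides and halving n measures the SAD over every other row only, and the final left-shift doubles the four results to approximate the full-height SAD. A scalar sketch of the same idea for a single reference (hypothetical helper, assuming uint16_t pixel planes):

/* Row-skipping SAD approximation: sample every other row, then scale by 2. */
static uint32_t sad_skip_rows_scalar(const uint16_t *src, int src_stride,
                                     const uint16_t *ref, int ref_stride,
                                     int width, int height) {
  uint32_t sad = 0;
  int r, c;
  for (r = 0; r < height; r += 2) {
    for (c = 0; c < width; ++c) {
      const int d = src[r * src_stride + c] - ref[r * ref_stride + c];
      sad += (uint32_t)(d < 0 ? -d : d);
    }
  }
  return sad * 2; /* compensate for the skipped rows */
}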
137 | | |
138 | | static VPX_FORCE_INLINE void highbd_sad32xHx4d(__m256i *sums_16 /*[4]*/, |
139 | | const uint16_t *src, |
140 | | int src_stride, |
141 | | uint16_t *refs[4], |
142 | 0 | int ref_stride, int height) { |
143 | 0 | int i; |
144 | 0 | for (i = 0; i < height; i++) { |
145 | 0 | __m256i r[8]; |
146 | | |
147 | | // load src and all ref[] |
148 | 0 | const __m256i s = _mm256_load_si256((const __m256i *)src); |
149 | 0 | const __m256i s2 = _mm256_load_si256((const __m256i *)(src + 16)); |
150 | 0 | r[0] = _mm256_loadu_si256((const __m256i *)refs[0]); |
151 | 0 | r[1] = _mm256_loadu_si256((const __m256i *)(refs[0] + 16)); |
152 | 0 | r[2] = _mm256_loadu_si256((const __m256i *)refs[1]); |
153 | 0 | r[3] = _mm256_loadu_si256((const __m256i *)(refs[1] + 16)); |
154 | 0 | r[4] = _mm256_loadu_si256((const __m256i *)refs[2]); |
155 | 0 | r[5] = _mm256_loadu_si256((const __m256i *)(refs[2] + 16)); |
156 | 0 | r[6] = _mm256_loadu_si256((const __m256i *)refs[3]); |
157 | 0 | r[7] = _mm256_loadu_si256((const __m256i *)(refs[3] + 16)); |
158 | | |
159 | | // absolute differences between every ref[] and src
160 | 0 | r[0] = _mm256_abs_epi16(_mm256_sub_epi16(r[0], s)); |
161 | 0 | r[1] = _mm256_abs_epi16(_mm256_sub_epi16(r[1], s2)); |
162 | 0 | r[2] = _mm256_abs_epi16(_mm256_sub_epi16(r[2], s)); |
163 | 0 | r[3] = _mm256_abs_epi16(_mm256_sub_epi16(r[3], s2)); |
164 | 0 | r[4] = _mm256_abs_epi16(_mm256_sub_epi16(r[4], s)); |
165 | 0 | r[5] = _mm256_abs_epi16(_mm256_sub_epi16(r[5], s2)); |
166 | 0 | r[6] = _mm256_abs_epi16(_mm256_sub_epi16(r[6], s)); |
167 | 0 | r[7] = _mm256_abs_epi16(_mm256_sub_epi16(r[7], s2)); |
168 | | |
169 | | // sum every abs diff |
170 | 0 | sums_16[0] = _mm256_add_epi16(sums_16[0], _mm256_add_epi16(r[0], r[1])); |
171 | 0 | sums_16[1] = _mm256_add_epi16(sums_16[1], _mm256_add_epi16(r[2], r[3])); |
172 | 0 | sums_16[2] = _mm256_add_epi16(sums_16[2], _mm256_add_epi16(r[4], r[5])); |
173 | 0 | sums_16[3] = _mm256_add_epi16(sums_16[3], _mm256_add_epi16(r[6], r[7])); |
174 | |
175 | 0 | src += src_stride; |
176 | 0 | refs[0] += ref_stride; |
177 | 0 | refs[1] += ref_stride; |
178 | 0 | refs[2] += ref_stride; |
179 | 0 | refs[3] += ref_stride; |
180 | 0 | } |
181 | 0 | } |
182 | | |
183 | | static VPX_FORCE_INLINE void highbd_sad32xNx4d_avx2( |
184 | | const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], |
185 | 0 | int ref_stride, uint32_t sad_array[4], int n) { |
186 | 0 | const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); |
187 | 0 | uint16_t *refs[4]; |
188 | 0 | __m256i sums_16[4]; |
189 | 0 | __m256i sums_32[4]; |
190 | 0 | int i; |
191 | |
192 | 0 | refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]); |
193 | 0 | refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]); |
194 | 0 | refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]); |
195 | 0 | refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]); |
196 | 0 | sums_32[0] = _mm256_setzero_si256(); |
197 | 0 | sums_32[1] = _mm256_setzero_si256(); |
198 | 0 | sums_32[2] = _mm256_setzero_si256(); |
199 | 0 | sums_32[3] = _mm256_setzero_si256(); |
200 | |
201 | 0 | for (i = 0; i < (n / 8); ++i) { |
202 | 0 | sums_16[0] = _mm256_setzero_si256(); |
203 | 0 | sums_16[1] = _mm256_setzero_si256(); |
204 | 0 | sums_16[2] = _mm256_setzero_si256(); |
205 | 0 | sums_16[3] = _mm256_setzero_si256(); |
206 | |
207 | 0 | highbd_sad32xHx4d(sums_16, src, src_stride, refs, ref_stride, 8); |
208 | | |
209 | | /* sums_16 may overflow after 8 rows, so add the current sums_16 to
210 | | * sums_32 */
211 | 0 | sums_32[0] = _mm256_add_epi32( |
212 | 0 | sums_32[0], |
213 | 0 | _mm256_add_epi32( |
214 | 0 | _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])), |
215 | 0 | _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[0], 1)))); |
216 | 0 | sums_32[1] = _mm256_add_epi32( |
217 | 0 | sums_32[1], |
218 | 0 | _mm256_add_epi32( |
219 | 0 | _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])), |
220 | 0 | _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[1], 1)))); |
221 | 0 | sums_32[2] = _mm256_add_epi32( |
222 | 0 | sums_32[2], |
223 | 0 | _mm256_add_epi32( |
224 | 0 | _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])), |
225 | 0 | _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[2], 1)))); |
226 | 0 | sums_32[3] = _mm256_add_epi32( |
227 | 0 | sums_32[3], |
228 | 0 | _mm256_add_epi32( |
229 | 0 | _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])), |
230 | 0 | _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[3], 1)))); |
231 | |
232 | 0 | src += src_stride << 3; |
233 | 0 | } |
234 | 0 | calc_final_4(sums_32, sad_array); |
235 | 0 | } |
236 | | |
237 | | #define HIGHBD_SAD32XNX4D(n) \ |
238 | | void vpx_highbd_sad32x##n##x4d_avx2(const uint8_t *src, int src_stride, \ |
239 | | const uint8_t *const ref_array[4], \ |
240 | 0 | int ref_stride, uint32_t sad_array[4]) { \ |
241 | 0 | highbd_sad32xNx4d_avx2(src, src_stride, ref_array, ref_stride, sad_array, \ |
242 | 0 | n); \ |
243 | 0 | }
Unexecuted instantiation: vpx_highbd_sad32x64x4d_avx2
Unexecuted instantiation: vpx_highbd_sad32x32x4d_avx2
Unexecuted instantiation: vpx_highbd_sad32x16x4d_avx2
244 | | |
245 | | #define HIGHBD_SADSKIP32XNx4D(n) \ |
246 | | void vpx_highbd_sad_skip_32x##n##x4d_avx2( \ |
247 | | const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \ |
248 | 0 | int ref_stride, uint32_t sad_array[4]) { \ |
249 | 0 | highbd_sad32xNx4d_avx2(src, 2 * src_stride, ref_array, 2 * ref_stride, \ |
250 | 0 | sad_array, n / 2); \ |
251 | 0 | sad_array[0] <<= 1; \ |
252 | 0 | sad_array[1] <<= 1; \ |
253 | 0 | sad_array[2] <<= 1; \ |
254 | 0 | sad_array[3] <<= 1; \ |
255 | 0 | }
Unexecuted instantiation: vpx_highbd_sad_skip_32x64x4d_avx2
Unexecuted instantiation: vpx_highbd_sad_skip_32x32x4d_avx2
Unexecuted instantiation: vpx_highbd_sad_skip_32x16x4d_avx2
256 | | |
257 | | static VPX_FORCE_INLINE void highbd_sad16xHx4d(__m256i *sums_16 /*[4]*/, |
258 | | const uint16_t *src, |
259 | | int src_stride, |
260 | | uint16_t *refs[4], |
261 | 0 | int ref_stride, int height) { |
262 | 0 | int i; |
263 | 0 | for (i = 0; i < height; i++) { |
264 | 0 | __m256i r[4]; |
265 | | |
266 | | // load src and all ref[] |
267 | 0 | const __m256i s = _mm256_load_si256((const __m256i *)src); |
268 | 0 | r[0] = _mm256_loadu_si256((const __m256i *)refs[0]); |
269 | 0 | r[1] = _mm256_loadu_si256((const __m256i *)refs[1]); |
270 | 0 | r[2] = _mm256_loadu_si256((const __m256i *)refs[2]); |
271 | 0 | r[3] = _mm256_loadu_si256((const __m256i *)refs[3]); |
272 | | |
273 | | // absolute differences between every ref[] and src
274 | 0 | r[0] = _mm256_abs_epi16(_mm256_sub_epi16(r[0], s)); |
275 | 0 | r[1] = _mm256_abs_epi16(_mm256_sub_epi16(r[1], s)); |
276 | 0 | r[2] = _mm256_abs_epi16(_mm256_sub_epi16(r[2], s)); |
277 | 0 | r[3] = _mm256_abs_epi16(_mm256_sub_epi16(r[3], s)); |
278 | | |
279 | | // sum every abs diff |
280 | 0 | sums_16[0] = _mm256_add_epi16(sums_16[0], r[0]); |
281 | 0 | sums_16[1] = _mm256_add_epi16(sums_16[1], r[1]); |
282 | 0 | sums_16[2] = _mm256_add_epi16(sums_16[2], r[2]); |
283 | 0 | sums_16[3] = _mm256_add_epi16(sums_16[3], r[3]); |
284 | |
285 | 0 | src += src_stride; |
286 | 0 | refs[0] += ref_stride; |
287 | 0 | refs[1] += ref_stride; |
288 | 0 | refs[2] += ref_stride; |
289 | 0 | refs[3] += ref_stride; |
290 | 0 | } |
291 | 0 | } |
292 | | |
293 | | static VPX_FORCE_INLINE void highbd_sad16xNx4d_avx2( |
294 | | const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], |
295 | 0 | int ref_stride, uint32_t sad_array[4], int n) { |
296 | 0 | const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); |
297 | 0 | uint16_t *refs[4]; |
298 | 0 | __m256i sums_16[4]; |
299 | 0 | __m256i sums_32[4]; |
300 | 0 | const int height = VPXMIN(16, n); |
301 | 0 | const int num_iters = n / height; |
302 | 0 | int i; |
303 | |
304 | 0 | refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]); |
305 | 0 | refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]); |
306 | 0 | refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]); |
307 | 0 | refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]); |
308 | 0 | sums_32[0] = _mm256_setzero_si256(); |
309 | 0 | sums_32[1] = _mm256_setzero_si256(); |
310 | 0 | sums_32[2] = _mm256_setzero_si256(); |
311 | 0 | sums_32[3] = _mm256_setzero_si256(); |
312 | |
313 | 0 | for (i = 0; i < num_iters; ++i) { |
314 | 0 | sums_16[0] = _mm256_setzero_si256(); |
315 | 0 | sums_16[1] = _mm256_setzero_si256(); |
316 | 0 | sums_16[2] = _mm256_setzero_si256(); |
317 | 0 | sums_16[3] = _mm256_setzero_si256(); |
318 | |
319 | 0 | highbd_sad16xHx4d(sums_16, src, src_stride, refs, ref_stride, height); |
320 | | |
321 | | // sums_16 may overflow after 16 rows, so add the current sums_16 to sums_32
322 | 0 | sums_32[0] = _mm256_add_epi32( |
323 | 0 | sums_32[0], |
324 | 0 | _mm256_add_epi32( |
325 | 0 | _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])), |
326 | 0 | _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[0], 1)))); |
327 | 0 | sums_32[1] = _mm256_add_epi32( |
328 | 0 | sums_32[1], |
329 | 0 | _mm256_add_epi32( |
330 | 0 | _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])), |
331 | 0 | _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[1], 1)))); |
332 | 0 | sums_32[2] = _mm256_add_epi32( |
333 | 0 | sums_32[2], |
334 | 0 | _mm256_add_epi32( |
335 | 0 | _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])), |
336 | 0 | _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[2], 1)))); |
337 | 0 | sums_32[3] = _mm256_add_epi32( |
338 | 0 | sums_32[3], |
339 | 0 | _mm256_add_epi32( |
340 | 0 | _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])), |
341 | 0 | _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[3], 1)))); |
342 | |
343 | 0 | src += src_stride << 4; |
344 | 0 | } |
345 | 0 | calc_final_4(sums_32, sad_array); |
346 | 0 | } |
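For the 16-wide path the rows per 16-bit accumulation pass are capped at 16: height = VPXMIN(16, n) and num_iters = n / height. For example, n = 32 runs two 16-row passes with a widening step after each, while n = 8 (the value the skip_16x16 wrapper passes down) runs a single 8-row pass.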
347 | | |
348 | | #define HIGHBD_SAD16XNX4D(n) \ |
349 | | void vpx_highbd_sad16x##n##x4d_avx2(const uint8_t *src, int src_stride, \ |
350 | | const uint8_t *const ref_array[4], \ |
351 | 0 | int ref_stride, uint32_t sad_array[4]) { \ |
352 | 0 | highbd_sad16xNx4d_avx2(src, src_stride, ref_array, ref_stride, sad_array, \ |
353 | 0 | n); \ |
354 | 0 | } |
355 | | |
356 | | #define HIGHBD_SADSKIP16XNx4D(n) \ |
357 | | void vpx_highbd_sad_skip_16x##n##x4d_avx2( \ |
358 | | const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \ |
359 | 0 | int ref_stride, uint32_t sad_array[4]) { \ |
360 | 0 | highbd_sad16xNx4d_avx2(src, 2 * src_stride, ref_array, 2 * ref_stride, \ |
361 | 0 | sad_array, n / 2); \ |
362 | 0 | sad_array[0] <<= 1; \ |
363 | 0 | sad_array[1] <<= 1; \ |
364 | 0 | sad_array[2] <<= 1; \ |
365 | 0 | sad_array[3] <<= 1; \ |
366 | 0 | }
Unexecuted instantiation: vpx_highbd_sad_skip_16x32x4d_avx2
Unexecuted instantiation: vpx_highbd_sad_skip_16x16x4d_avx2
Unexecuted instantiation: vpx_highbd_sad_skip_16x8x4d_avx2
367 | | |
368 | | void vpx_highbd_sad16x16x4d_avx2(const uint8_t *src_ptr, int src_stride, |
369 | | const uint8_t *const ref_array[4], |
370 | 0 | int ref_stride, uint32_t sad_array[4]) { |
371 | 0 | const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); |
372 | 0 | uint16_t *refs[4]; |
373 | 0 | __m256i sums_16[4]; |
374 | |
375 | 0 | refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]); |
376 | 0 | refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]); |
377 | 0 | refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]); |
378 | 0 | refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]); |
379 | 0 | sums_16[0] = _mm256_setzero_si256(); |
380 | 0 | sums_16[1] = _mm256_setzero_si256(); |
381 | 0 | sums_16[2] = _mm256_setzero_si256(); |
382 | 0 | sums_16[3] = _mm256_setzero_si256(); |
383 | |
384 | 0 | highbd_sad16xHx4d(sums_16, src, src_stride, refs, ref_stride, 16); |
385 | |
386 | 0 | { |
387 | 0 | __m256i sums_32[4]; |
388 | 0 | sums_32[0] = _mm256_add_epi32( |
389 | 0 | _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])), |
390 | 0 | _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[0], 1))); |
391 | 0 | sums_32[1] = _mm256_add_epi32( |
392 | 0 | _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])), |
393 | 0 | _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[1], 1))); |
394 | 0 | sums_32[2] = _mm256_add_epi32( |
395 | 0 | _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])), |
396 | 0 | _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[2], 1))); |
397 | 0 | sums_32[3] = _mm256_add_epi32( |
398 | 0 | _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])), |
399 | 0 | _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[3], 1))); |
400 | 0 | calc_final_4(sums_32, sad_array); |
401 | 0 | } |
402 | 0 | } |
403 | | |
404 | | void vpx_highbd_sad16x8x4d_avx2(const uint8_t *src_ptr, int src_stride, |
405 | | const uint8_t *const ref_array[4], |
406 | 0 | int ref_stride, uint32_t sad_array[4]) { |
407 | 0 | const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); |
408 | 0 | uint16_t *refs[4]; |
409 | 0 | __m256i sums_16[4]; |
410 | |
411 | 0 | refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]); |
412 | 0 | refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]); |
413 | 0 | refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]); |
414 | 0 | refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]); |
415 | 0 | sums_16[0] = _mm256_setzero_si256(); |
416 | 0 | sums_16[1] = _mm256_setzero_si256(); |
417 | 0 | sums_16[2] = _mm256_setzero_si256(); |
418 | 0 | sums_16[3] = _mm256_setzero_si256(); |
419 | |
420 | 0 | highbd_sad16xHx4d(sums_16, src, src_stride, refs, ref_stride, 8); |
421 | |
422 | 0 | { |
423 | 0 | __m256i sums_32[4]; |
424 | 0 | sums_32[0] = _mm256_add_epi32( |
425 | 0 | _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])), |
426 | 0 | _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[0], 1))); |
427 | 0 | sums_32[1] = _mm256_add_epi32( |
428 | 0 | _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])), |
429 | 0 | _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[1], 1))); |
430 | 0 | sums_32[2] = _mm256_add_epi32( |
431 | 0 | _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])), |
432 | 0 | _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[2], 1))); |
433 | 0 | sums_32[3] = _mm256_add_epi32( |
434 | 0 | _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])), |
435 | 0 | _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[3], 1))); |
436 | 0 | calc_final_4(sums_32, sad_array); |
437 | 0 | } |
438 | 0 | } |
439 | | |
440 | | // clang-format off |
441 | | HIGHBD_SAD64XNX4D(64) |
442 | | HIGHBD_SADSKIP64XNx4D(64) |
443 | | |
444 | | HIGHBD_SAD64XNX4D(32) |
445 | | HIGHBD_SADSKIP64XNx4D(32) |
446 | | |
447 | | HIGHBD_SAD32XNX4D(64) |
448 | | HIGHBD_SADSKIP32XNx4D(64) |
449 | | |
450 | | HIGHBD_SAD32XNX4D(32) |
451 | | HIGHBD_SADSKIP32XNx4D(32) |
452 | | |
453 | | HIGHBD_SAD32XNX4D(16) |
454 | | HIGHBD_SADSKIP32XNx4D(16) |
455 | | |
456 | | HIGHBD_SAD16XNX4D(32) |
457 | | HIGHBD_SADSKIP16XNx4D(32) |
458 | | |
459 | | HIGHBD_SADSKIP16XNx4D(16) |
460 | | |
461 | | HIGHBD_SADSKIP16XNx4D(8) |
462 | | // clang-format on |
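A minimal usage sketch of the x4d interface defined above (hypothetical caller, not part of the file): one source block is scored against four candidate references in a single call, with the pixel planes wrapped through libvpx's high-bit-depth CONVERT_TO_BYTEPTR convention.

/* score_four_candidates and the uint16_t planes are assumptions for
 * illustration only. */
static void score_four_candidates(const uint16_t *src16, int src_stride,
                                  const uint16_t *ref16[4], int ref_stride,
                                  uint32_t sads[4]) {
  const uint8_t *const refs[4] = {
    CONVERT_TO_BYTEPTR(ref16[0]), CONVERT_TO_BYTEPTR(ref16[1]),
    CONVERT_TO_BYTEPTR(ref16[2]), CONVERT_TO_BYTEPTR(ref16[3])
  };
  vpx_highbd_sad32x32x4d_avx2(CONVERT_TO_BYTEPTR(src16), src_stride, refs,
                              ref_stride, sads);
}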