/src/libvpx/vpx_dsp/x86/variance_avx2.c
Line | Count | Source
1 | | /* |
2 | | * Copyright (c) 2012 The WebM project authors. All Rights Reserved. |
3 | | * |
4 | | * Use of this source code is governed by a BSD-style license |
5 | | * that can be found in the LICENSE file in the root of the source |
6 | | * tree. An additional intellectual property rights grant can be found |
7 | | * in the file PATENTS. All contributing project authors may |
8 | | * be found in the AUTHORS file in the root of the source tree. |
9 | | */ |
10 | | |
11 | | #include <immintrin.h> // AVX2 |
12 | | |
13 | | #include "./vpx_dsp_rtcd.h" |
14 | | |
15 | | /* clang-format off */ |
16 | | DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = { |
17 | | 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, |
18 | | 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, |
19 | | 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, |
20 | | 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, |
21 | | 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, |
22 | | 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, |
23 | | 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, |
24 | | 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, |
25 | | 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, |
26 | | 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, |
27 | | 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, |
28 | | 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, |
29 | | 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, |
30 | | 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, |
31 | | 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, |
32 | | 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, |
33 | | }; |
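Each 32-byte row of bilinear_filters_avx2 repeats a single weight pair (16 - 2*k, 2*k), where k is the sub-pel offset in 1/8-pel units (0..7); the FILTER_SRC macro further down applies the pair with _mm256_maddubs_epi16 and then rounds. A minimal scalar sketch of the same tap, using a hypothetical helper name for illustration:

    // Scalar equivalent of one bilinear tap as applied by FILTER_SRC (illustration only).
    static unsigned char bilinear_tap(unsigned char a, unsigned char b, int offset) {
      const int w0 = 16 - 2 * offset;  // first-pixel weight, e.g. 14 for offset 1
      const int w1 = 2 * offset;       // second-pixel weight, e.g. 2 for offset 1
      return (unsigned char)((a * w0 + b * w1 + 8) >> 4);  // add 8, divide by 16 (round)
    }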
34 | | |
35 | | DECLARE_ALIGNED(32, static const int8_t, adjacent_sub_avx2[32]) = { |
36 | | 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, |
37 | | 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1 |
38 | | }; |
39 | | /* clang-format on */ |
40 | | |
41 | | static INLINE void variance_kernel_avx2(const __m256i src, const __m256i ref, |
42 | | __m256i *const sse, |
43 | 817M | __m256i *const sum) { |
44 | 817M | const __m256i adj_sub = _mm256_load_si256((__m256i const *)adjacent_sub_avx2); |
45 | | |
46 | | // unpack into pairs of source and reference values |
47 | 817M | const __m256i src_ref0 = _mm256_unpacklo_epi8(src, ref); |
48 | 817M | const __m256i src_ref1 = _mm256_unpackhi_epi8(src, ref); |
49 | | |
50 | | // subtract adjacent elements using src*1 + ref*-1 |
51 | 817M | const __m256i diff0 = _mm256_maddubs_epi16(src_ref0, adj_sub); |
52 | 817M | const __m256i diff1 = _mm256_maddubs_epi16(src_ref1, adj_sub); |
53 | 817M | const __m256i madd0 = _mm256_madd_epi16(diff0, diff0); |
54 | 817M | const __m256i madd1 = _mm256_madd_epi16(diff1, diff1); |
55 | | |
56 | | // add to the running totals |
57 | 817M | *sum = _mm256_add_epi16(*sum, _mm256_add_epi16(diff0, diff1)); |
58 | 817M | *sse = _mm256_add_epi32(*sse, _mm256_add_epi32(madd0, madd1)); |
59 | 817M | } |
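The kernel above relies on _mm256_maddubs_epi16 with the alternating (+1, -1) weights from adjacent_sub_avx2: after interleaving src and ref bytes, each multiply-add produces the signed 16-bit difference src - ref. A scalar sketch of what one call accumulates (illustrative comments only):

    // For each of the 32 byte positions i covered by one call:
    //   diff = src[i] - ref[i];   // maddubs computes src*1 + ref*(-1)
    //   sum += diff;              // accumulated in 16-bit lanes
    //   sse += diff * diff;       // squared and summed into 32-bit lanes via madd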
60 | | |
61 | | static INLINE void variance_final_from_32bit_sum_avx2(__m256i vsse, |
62 | | __m128i vsum, |
63 | | unsigned int *const sse, |
64 | 216M | int *const sum) { |
 66 | | // add the high 128-bit lane to the low lane |
66 | 216M | const __m128i sse_reg_128 = _mm_add_epi32(_mm256_castsi256_si128(vsse), |
67 | 216M | _mm256_extractf128_si256(vsse, 1)); |
68 | | |
69 | | // unpack sse and sum registers and add |
70 | 216M | const __m128i sse_sum_lo = _mm_unpacklo_epi32(sse_reg_128, vsum); |
71 | 216M | const __m128i sse_sum_hi = _mm_unpackhi_epi32(sse_reg_128, vsum); |
72 | 216M | const __m128i sse_sum = _mm_add_epi32(sse_sum_lo, sse_sum_hi); |
73 | | |
74 | | // perform the final summation and extract the results |
75 | 216M | const __m128i res = _mm_add_epi32(sse_sum, _mm_srli_si128(sse_sum, 8)); |
76 | 216M | *((int *)sse) = _mm_cvtsi128_si32(res); |
77 | 216M | *((int *)sum) = _mm_extract_epi32(res, 1); |
78 | 216M | } |
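The interleave-and-add reduction above folds the sse and sum totals in one pass; a sketch of the 32-bit lane contents as it proceeds (lane labels are illustrative):

    /* sse_reg_128 = [sse0 sse1 sse2 sse3]          vsum       = [sum0 sum1 sum2 sum3]
       sse_sum_lo  = [sse0 sum0 sse1 sum1]          sse_sum_hi = [sse2 sum2 sse3 sum3]
       sse_sum     = [sse0+sse2  sum0+sum2  sse1+sse3  sum1+sum3]
       res         = [SSE total  SUM total  (ignored) (ignored)] */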
79 | | |
80 | | static INLINE void variance_final_from_16bit_sum_avx2(__m256i vsse, |
81 | | __m256i vsum, |
82 | | unsigned int *const sse, |
83 | 207M | int *const sum) { |
 85 | | // add the high 128-bit lane to the low lane |
85 | 207M | const __m128i sum_reg_128 = _mm_add_epi16(_mm256_castsi256_si128(vsum), |
86 | 207M | _mm256_extractf128_si256(vsum, 1)); |
87 | 207M | const __m128i sum_reg_64 = |
88 | 207M | _mm_add_epi16(sum_reg_128, _mm_srli_si128(sum_reg_128, 8)); |
89 | 207M | const __m128i sum_int32 = _mm_cvtepi16_epi32(sum_reg_64); |
90 | | |
91 | 207M | variance_final_from_32bit_sum_avx2(vsse, sum_int32, sse, sum); |
92 | 207M | } |
93 | | |
94 | 2.00M | static INLINE __m256i sum_to_32bit_avx2(const __m256i sum) { |
95 | 2.00M | const __m256i sum_lo = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(sum)); |
96 | 2.00M | const __m256i sum_hi = |
97 | 2.00M | _mm256_cvtepi16_epi32(_mm256_extractf128_si256(sum, 1)); |
98 | 2.00M | return _mm256_add_epi32(sum_lo, sum_hi); |
99 | 2.00M | } |
100 | | |
101 | | static INLINE void variance8_kernel_avx2( |
102 | | const uint8_t *const src, const int src_stride, const uint8_t *const ref, |
103 | 605M | const int ref_stride, __m256i *const sse, __m256i *const sum) { |
104 | 605M | __m128i src0, src1, ref0, ref1; |
105 | 605M | __m256i ss, rr, diff; |
106 | | |
107 | | // 0 0 0.... 0 s07 s06 s05 s04 s03 s02 s01 s00 |
108 | 605M | src0 = _mm_loadl_epi64((const __m128i *)(src + 0 * src_stride)); |
109 | | |
110 | | // 0 0 0.... 0 s17 s16 s15 s14 s13 s12 s11 s10 |
111 | 605M | src1 = _mm_loadl_epi64((const __m128i *)(src + 1 * src_stride)); |
112 | | |
113 | | // s17 s16...s11 s10 s07 s06...s01 s00 (8bit) |
114 | 605M | src0 = _mm_unpacklo_epi64(src0, src1); |
115 | | |
116 | | // s17 s16...s11 s10 s07 s06...s01 s00 (16 bit) |
117 | 605M | ss = _mm256_cvtepu8_epi16(src0); |
118 | | |
119 | | // 0 0 0.... 0 r07 r06 r05 r04 r03 r02 r01 r00 |
120 | 605M | ref0 = _mm_loadl_epi64((const __m128i *)(ref + 0 * ref_stride)); |
121 | | |
 122 | | // 0 0 0.... 0 r17 r16 r15 r14 r13 r12 r11 r10 |
123 | 605M | ref1 = _mm_loadl_epi64((const __m128i *)(ref + 1 * ref_stride)); |
124 | | |
125 | | // r17 r16...r11 r10 r07 r06...r01 r00 (8 bit) |
126 | 605M | ref0 = _mm_unpacklo_epi64(ref0, ref1); |
127 | | |
128 | | // r17 r16...r11 r10 r07 r06...r01 r00 (16 bit) |
129 | 605M | rr = _mm256_cvtepu8_epi16(ref0); |
130 | | |
131 | 605M | diff = _mm256_sub_epi16(ss, rr); |
132 | 605M | *sse = _mm256_add_epi32(*sse, _mm256_madd_epi16(diff, diff)); |
133 | 605M | *sum = _mm256_add_epi16(*sum, diff); |
134 | 605M | } |
135 | | |
136 | | static INLINE void variance16_kernel_avx2( |
137 | | const uint8_t *const src, const int src_stride, const uint8_t *const ref, |
138 | 444M | const int ref_stride, __m256i *const sse, __m256i *const sum) { |
139 | 444M | const __m128i s0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride)); |
140 | 444M | const __m128i s1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride)); |
141 | 444M | const __m128i r0 = _mm_loadu_si128((__m128i const *)(ref + 0 * ref_stride)); |
142 | 444M | const __m128i r1 = _mm_loadu_si128((__m128i const *)(ref + 1 * ref_stride)); |
143 | 444M | const __m256i s = _mm256_inserti128_si256(_mm256_castsi128_si256(s0), s1, 1); |
144 | 444M | const __m256i r = _mm256_inserti128_si256(_mm256_castsi128_si256(r0), r1, 1); |
145 | 444M | variance_kernel_avx2(s, r, sse, sum); |
146 | 444M | } |
147 | | |
148 | | static INLINE void variance32_kernel_avx2(const uint8_t *const src, |
149 | | const uint8_t *const ref, |
150 | | __m256i *const sse, |
151 | 372M | __m256i *const sum) { |
152 | 372M | const __m256i s = _mm256_loadu_si256((__m256i const *)(src)); |
153 | 372M | const __m256i r = _mm256_loadu_si256((__m256i const *)(ref)); |
154 | 372M | variance_kernel_avx2(s, r, sse, sum); |
155 | 372M | } |
156 | | |
157 | | static INLINE void variance8_avx2(const uint8_t *src, const int src_stride, |
158 | | const uint8_t *ref, const int ref_stride, |
159 | | const int h, __m256i *const vsse, |
160 | 149M | __m256i *const vsum) { |
161 | 149M | int i; |
162 | 149M | *vsum = _mm256_setzero_si256(); |
163 | 149M | *vsse = _mm256_setzero_si256(); |
164 | | |
165 | 754M | for (i = 0; i < h; i += 2) { |
166 | 605M | variance8_kernel_avx2(src, src_stride, ref, ref_stride, vsse, vsum); |
167 | 605M | src += 2 * src_stride; |
168 | 605M | ref += 2 * ref_stride; |
169 | 605M | } |
170 | 149M | } |
171 | | |
172 | | static INLINE void variance16_avx2(const uint8_t *src, const int src_stride, |
173 | | const uint8_t *ref, const int ref_stride, |
174 | | const int h, __m256i *const vsse, |
175 | 57.2M | __m256i *const vsum) { |
176 | 57.2M | int i; |
177 | 57.2M | *vsum = _mm256_setzero_si256(); |
178 | 57.2M | *vsse = _mm256_setzero_si256(); |
179 | | |
180 | 501M | for (i = 0; i < h; i += 2) { |
181 | 444M | variance16_kernel_avx2(src, src_stride, ref, ref_stride, vsse, vsum); |
182 | 444M | src += 2 * src_stride; |
183 | 444M | ref += 2 * ref_stride; |
184 | 444M | } |
185 | 57.2M | } |
186 | | |
187 | | static INLINE void variance32_avx2(const uint8_t *src, const int src_stride, |
188 | | const uint8_t *ref, const int ref_stride, |
189 | | const int h, __m256i *const vsse, |
190 | 8.78M | __m256i *const vsum) { |
191 | 8.78M | int i; |
192 | 8.78M | *vsum = _mm256_setzero_si256(); |
193 | 8.78M | *vsse = _mm256_setzero_si256(); |
194 | | |
195 | 272M | for (i = 0; i < h; i++) { |
196 | 263M | variance32_kernel_avx2(src, ref, vsse, vsum); |
197 | 263M | src += src_stride; |
198 | 263M | ref += ref_stride; |
199 | 263M | } |
200 | 8.78M | } |
201 | | |
202 | | static INLINE void variance64_avx2(const uint8_t *src, const int src_stride, |
203 | | const uint8_t *ref, const int ref_stride, |
204 | | const int h, __m256i *const vsse, |
205 | 1.70M | __m256i *const vsum) { |
206 | 1.70M | int i; |
207 | 1.70M | *vsum = _mm256_setzero_si256(); |
208 | | |
209 | 56.3M | for (i = 0; i < h; i++) { |
210 | 54.6M | variance32_kernel_avx2(src + 0, ref + 0, vsse, vsum); |
211 | 54.6M | variance32_kernel_avx2(src + 32, ref + 32, vsse, vsum); |
212 | 54.6M | src += src_stride; |
213 | 54.6M | ref += ref_stride; |
214 | 54.6M | } |
215 | 1.70M | } |
216 | | |
217 | | void vpx_get16x16var_avx2(const uint8_t *src_ptr, int src_stride, |
218 | | const uint8_t *ref_ptr, int ref_stride, |
219 | 0 | unsigned int *sse, int *sum) { |
220 | 0 | __m256i vsse, vsum; |
221 | 0 | variance16_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum); |
222 | 0 | variance_final_from_16bit_sum_avx2(vsse, vsum, sse, sum); |
223 | 0 | } |
224 | | |
225 | | #define FILTER_SRC(filter) \ |
226 | | /* filter the source */ \ |
227 | 68.9M | exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); \ |
228 | 68.9M | exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); \ |
229 | 68.9M | \ |
230 | 68.9M | /* add 8 to source */ \ |
231 | 68.9M | exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8); \ |
232 | 68.9M | exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8); \ |
233 | 68.9M | \ |
234 | 68.9M | /* divide source by 16 */ \ |
235 | 68.9M | exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4); \ |
236 | 68.9M | exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4); |
237 | | |
238 | | #define CALC_SUM_SSE_INSIDE_LOOP \ |
239 | | /* expand each byte to 2 bytes */ \ |
240 | 100M | exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg); \ |
241 | 100M | exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg); \ |
242 | 100M | /* source - dest */ \ |
243 | 100M | exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo); \ |
244 | 100M | exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi); \ |
 245 | 100M | /* calculate sum */ \ |
246 | 100M | *sum_reg = _mm256_add_epi16(*sum_reg, exp_src_lo); \ |
247 | 100M | exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); \ |
248 | 100M | *sum_reg = _mm256_add_epi16(*sum_reg, exp_src_hi); \ |
249 | 100M | exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); \ |
250 | 100M | /* calculate sse */ \ |
251 | 100M | *sse_reg = _mm256_add_epi32(*sse_reg, exp_src_lo); \ |
252 | 100M | *sse_reg = _mm256_add_epi32(*sse_reg, exp_src_hi); |
253 | | |
254 | | // final calculation to sum and sse |
255 | | #define CALC_SUM_AND_SSE \ |
256 | 2.29M | res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg); \ |
257 | 2.29M | sse_reg_hi = _mm256_srli_si256(sse_reg, 8); \ |
258 | 2.29M | sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp); \ |
259 | 2.29M | sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp); \ |
260 | 2.29M | sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \ |
261 | 2.29M | sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi); \ |
262 | 2.29M | \ |
263 | 2.29M | sse_reg_hi = _mm256_srli_si256(sse_reg, 4); \ |
264 | 2.29M | sum_reg_hi = _mm256_srli_si256(sum_reg, 8); \ |
265 | 2.29M | \ |
266 | 2.29M | sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \ |
267 | 2.29M | sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \ |
268 | 2.29M | *((int *)sse) = _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) + \ |
269 | 2.29M | _mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1)); \ |
270 | 2.29M | sum_reg_hi = _mm256_srli_si256(sum_reg, 4); \ |
271 | 2.29M | sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \ |
272 | 2.29M | sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) + \ |
273 | 2.29M | _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1)); |
274 | | |
275 | | static INLINE void spv32_x0_y0(const uint8_t *src, int src_stride, |
276 | | const uint8_t *dst, int dst_stride, |
277 | | const uint8_t *second_pred, int second_stride, |
278 | | int do_sec, int height, __m256i *sum_reg, |
279 | 69.5k | __m256i *sse_reg) { |
280 | 69.5k | const __m256i zero_reg = _mm256_setzero_si256(); |
281 | 69.5k | __m256i exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; |
282 | 69.5k | int i; |
283 | 3.17M | for (i = 0; i < height; i++) { |
284 | 3.10M | const __m256i dst_reg = _mm256_loadu_si256((__m256i const *)dst); |
285 | 3.10M | const __m256i src_reg = _mm256_loadu_si256((__m256i const *)src); |
286 | 3.10M | if (do_sec) { |
287 | 0 | const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)second_pred); |
288 | 0 | const __m256i avg_reg = _mm256_avg_epu8(src_reg, sec_reg); |
289 | 0 | exp_src_lo = _mm256_unpacklo_epi8(avg_reg, zero_reg); |
290 | 0 | exp_src_hi = _mm256_unpackhi_epi8(avg_reg, zero_reg); |
291 | 0 | second_pred += second_stride; |
292 | 3.10M | } else { |
293 | 3.10M | exp_src_lo = _mm256_unpacklo_epi8(src_reg, zero_reg); |
294 | 3.10M | exp_src_hi = _mm256_unpackhi_epi8(src_reg, zero_reg); |
295 | 3.10M | } |
296 | 3.10M | CALC_SUM_SSE_INSIDE_LOOP |
297 | 3.10M | src += src_stride; |
298 | 3.10M | dst += dst_stride; |
299 | 3.10M | } |
300 | 69.5k | } |
301 | | |
302 | | // (x == 0, y == 4) or (x == 4, y == 0). sstep determines the direction. |
303 | | static INLINE void spv32_half_zero(const uint8_t *src, int src_stride, |
304 | | const uint8_t *dst, int dst_stride, |
305 | | const uint8_t *second_pred, |
306 | | int second_stride, int do_sec, int height, |
307 | | __m256i *sum_reg, __m256i *sse_reg, |
308 | 755k | int sstep) { |
309 | 755k | const __m256i zero_reg = _mm256_setzero_si256(); |
310 | 755k | __m256i exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; |
311 | 755k | int i; |
312 | 33.7M | for (i = 0; i < height; i++) { |
313 | 33.0M | const __m256i dst_reg = _mm256_loadu_si256((__m256i const *)dst); |
314 | 33.0M | const __m256i src_0 = _mm256_loadu_si256((__m256i const *)src); |
315 | 33.0M | const __m256i src_1 = _mm256_loadu_si256((__m256i const *)(src + sstep)); |
316 | 33.0M | const __m256i src_avg = _mm256_avg_epu8(src_0, src_1); |
317 | 33.0M | if (do_sec) { |
318 | 0 | const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)second_pred); |
319 | 0 | const __m256i avg_reg = _mm256_avg_epu8(src_avg, sec_reg); |
320 | 0 | exp_src_lo = _mm256_unpacklo_epi8(avg_reg, zero_reg); |
321 | 0 | exp_src_hi = _mm256_unpackhi_epi8(avg_reg, zero_reg); |
322 | 0 | second_pred += second_stride; |
323 | 33.0M | } else { |
324 | 33.0M | exp_src_lo = _mm256_unpacklo_epi8(src_avg, zero_reg); |
325 | 33.0M | exp_src_hi = _mm256_unpackhi_epi8(src_avg, zero_reg); |
326 | 33.0M | } |
327 | 33.0M | CALC_SUM_SSE_INSIDE_LOOP |
328 | 33.0M | src += src_stride; |
329 | 33.0M | dst += dst_stride; |
330 | 33.0M | } |
331 | 755k | } |
332 | | |
333 | | static INLINE void spv32_x0_y4(const uint8_t *src, int src_stride, |
334 | | const uint8_t *dst, int dst_stride, |
335 | | const uint8_t *second_pred, int second_stride, |
336 | | int do_sec, int height, __m256i *sum_reg, |
337 | 378k | __m256i *sse_reg) { |
338 | 378k | spv32_half_zero(src, src_stride, dst, dst_stride, second_pred, second_stride, |
339 | 378k | do_sec, height, sum_reg, sse_reg, src_stride); |
340 | 378k | } |
341 | | |
342 | | static INLINE void spv32_x4_y0(const uint8_t *src, int src_stride, |
343 | | const uint8_t *dst, int dst_stride, |
344 | | const uint8_t *second_pred, int second_stride, |
345 | | int do_sec, int height, __m256i *sum_reg, |
346 | 376k | __m256i *sse_reg) { |
347 | 376k | spv32_half_zero(src, src_stride, dst, dst_stride, second_pred, second_stride, |
348 | 376k | do_sec, height, sum_reg, sse_reg, 1); |
349 | 376k | } |
350 | | |
351 | | static INLINE void spv32_x4_y4(const uint8_t *src, int src_stride, |
352 | | const uint8_t *dst, int dst_stride, |
353 | | const uint8_t *second_pred, int second_stride, |
354 | | int do_sec, int height, __m256i *sum_reg, |
355 | 220k | __m256i *sse_reg) { |
356 | 220k | const __m256i zero_reg = _mm256_setzero_si256(); |
357 | 220k | const __m256i src_a = _mm256_loadu_si256((__m256i const *)src); |
358 | 220k | const __m256i src_b = _mm256_loadu_si256((__m256i const *)(src + 1)); |
359 | 220k | __m256i prev_src_avg = _mm256_avg_epu8(src_a, src_b); |
360 | 220k | __m256i exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; |
361 | 220k | int i; |
362 | 220k | src += src_stride; |
363 | 9.77M | for (i = 0; i < height; i++) { |
364 | 9.55M | const __m256i dst_reg = _mm256_loadu_si256((__m256i const *)dst); |
365 | 9.55M | const __m256i src_0 = _mm256_loadu_si256((__m256i const *)(src)); |
366 | 9.55M | const __m256i src_1 = _mm256_loadu_si256((__m256i const *)(src + 1)); |
367 | 9.55M | const __m256i src_avg = _mm256_avg_epu8(src_0, src_1); |
368 | 9.55M | const __m256i current_avg = _mm256_avg_epu8(prev_src_avg, src_avg); |
369 | 9.55M | prev_src_avg = src_avg; |
370 | | |
371 | 9.55M | if (do_sec) { |
372 | 0 | const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)second_pred); |
373 | 0 | const __m256i avg_reg = _mm256_avg_epu8(current_avg, sec_reg); |
374 | 0 | exp_src_lo = _mm256_unpacklo_epi8(avg_reg, zero_reg); |
375 | 0 | exp_src_hi = _mm256_unpackhi_epi8(avg_reg, zero_reg); |
376 | 0 | second_pred += second_stride; |
377 | 9.55M | } else { |
378 | 9.55M | exp_src_lo = _mm256_unpacklo_epi8(current_avg, zero_reg); |
379 | 9.55M | exp_src_hi = _mm256_unpackhi_epi8(current_avg, zero_reg); |
380 | 9.55M | } |
381 | | // save current source average |
382 | 9.55M | CALC_SUM_SSE_INSIDE_LOOP |
383 | 9.55M | dst += dst_stride; |
384 | 9.55M | src += src_stride; |
385 | 9.55M | } |
386 | 220k | } |
387 | | |
 388 | | // (x == 0, y == bil) or (x == bil, y == 0). sstep determines the direction. |
389 | | static INLINE void spv32_bilin_zero(const uint8_t *src, int src_stride, |
390 | | const uint8_t *dst, int dst_stride, |
391 | | const uint8_t *second_pred, |
392 | | int second_stride, int do_sec, int height, |
393 | | __m256i *sum_reg, __m256i *sse_reg, |
394 | 692k | int offset, int sstep) { |
395 | 692k | const __m256i zero_reg = _mm256_setzero_si256(); |
396 | 692k | const __m256i pw8 = _mm256_set1_epi16(8); |
397 | 692k | const __m256i filter = _mm256_load_si256( |
398 | 692k | (__m256i const *)(bilinear_filters_avx2 + (offset << 5))); |
399 | 692k | __m256i exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; |
400 | 692k | int i; |
401 | 30.4M | for (i = 0; i < height; i++) { |
402 | 29.7M | const __m256i dst_reg = _mm256_loadu_si256((__m256i const *)dst); |
403 | 29.7M | const __m256i src_0 = _mm256_loadu_si256((__m256i const *)src); |
404 | 29.7M | const __m256i src_1 = _mm256_loadu_si256((__m256i const *)(src + sstep)); |
405 | 29.7M | exp_src_lo = _mm256_unpacklo_epi8(src_0, src_1); |
406 | 29.7M | exp_src_hi = _mm256_unpackhi_epi8(src_0, src_1); |
407 | | |
408 | 29.7M | FILTER_SRC(filter) |
409 | 29.7M | if (do_sec) { |
410 | 0 | const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)second_pred); |
411 | 0 | const __m256i exp_src = _mm256_packus_epi16(exp_src_lo, exp_src_hi); |
412 | 0 | const __m256i avg_reg = _mm256_avg_epu8(exp_src, sec_reg); |
413 | 0 | second_pred += second_stride; |
414 | 0 | exp_src_lo = _mm256_unpacklo_epi8(avg_reg, zero_reg); |
415 | 0 | exp_src_hi = _mm256_unpackhi_epi8(avg_reg, zero_reg); |
416 | 0 | } |
417 | 29.7M | CALC_SUM_SSE_INSIDE_LOOP |
418 | 29.7M | src += src_stride; |
419 | 29.7M | dst += dst_stride; |
420 | 29.7M | } |
421 | 692k | } |
422 | | |
423 | | static INLINE void spv32_x0_yb(const uint8_t *src, int src_stride, |
424 | | const uint8_t *dst, int dst_stride, |
425 | | const uint8_t *second_pred, int second_stride, |
426 | | int do_sec, int height, __m256i *sum_reg, |
427 | 348k | __m256i *sse_reg, int y_offset) { |
428 | 348k | spv32_bilin_zero(src, src_stride, dst, dst_stride, second_pred, second_stride, |
429 | 348k | do_sec, height, sum_reg, sse_reg, y_offset, src_stride); |
430 | 348k | } |
431 | | |
432 | | static INLINE void spv32_xb_y0(const uint8_t *src, int src_stride, |
433 | | const uint8_t *dst, int dst_stride, |
434 | | const uint8_t *second_pred, int second_stride, |
435 | | int do_sec, int height, __m256i *sum_reg, |
436 | 344k | __m256i *sse_reg, int x_offset) { |
437 | 344k | spv32_bilin_zero(src, src_stride, dst, dst_stride, second_pred, second_stride, |
438 | 344k | do_sec, height, sum_reg, sse_reg, x_offset, 1); |
439 | 344k | } |
440 | | |
441 | | static INLINE void spv32_x4_yb(const uint8_t *src, int src_stride, |
442 | | const uint8_t *dst, int dst_stride, |
443 | | const uint8_t *second_pred, int second_stride, |
444 | | int do_sec, int height, __m256i *sum_reg, |
445 | 119k | __m256i *sse_reg, int y_offset) { |
446 | 119k | const __m256i zero_reg = _mm256_setzero_si256(); |
447 | 119k | const __m256i pw8 = _mm256_set1_epi16(8); |
448 | 119k | const __m256i filter = _mm256_load_si256( |
449 | 119k | (__m256i const *)(bilinear_filters_avx2 + (y_offset << 5))); |
450 | 119k | const __m256i src_a = _mm256_loadu_si256((__m256i const *)src); |
451 | 119k | const __m256i src_b = _mm256_loadu_si256((__m256i const *)(src + 1)); |
452 | 119k | __m256i prev_src_avg = _mm256_avg_epu8(src_a, src_b); |
453 | 119k | __m256i exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; |
454 | 119k | int i; |
455 | 119k | src += src_stride; |
456 | 5.35M | for (i = 0; i < height; i++) { |
457 | 5.23M | const __m256i dst_reg = _mm256_loadu_si256((__m256i const *)dst); |
458 | 5.23M | const __m256i src_0 = _mm256_loadu_si256((__m256i const *)src); |
459 | 5.23M | const __m256i src_1 = _mm256_loadu_si256((__m256i const *)(src + 1)); |
460 | 5.23M | const __m256i src_avg = _mm256_avg_epu8(src_0, src_1); |
461 | 5.23M | exp_src_lo = _mm256_unpacklo_epi8(prev_src_avg, src_avg); |
462 | 5.23M | exp_src_hi = _mm256_unpackhi_epi8(prev_src_avg, src_avg); |
463 | 5.23M | prev_src_avg = src_avg; |
464 | | |
465 | 5.23M | FILTER_SRC(filter) |
466 | 5.23M | if (do_sec) { |
467 | 0 | const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)second_pred); |
468 | 0 | const __m256i exp_src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); |
469 | 0 | const __m256i avg_reg = _mm256_avg_epu8(exp_src_avg, sec_reg); |
470 | 0 | exp_src_lo = _mm256_unpacklo_epi8(avg_reg, zero_reg); |
471 | 0 | exp_src_hi = _mm256_unpackhi_epi8(avg_reg, zero_reg); |
472 | 0 | second_pred += second_stride; |
473 | 0 | } |
474 | 5.23M | CALC_SUM_SSE_INSIDE_LOOP |
475 | 5.23M | dst += dst_stride; |
476 | 5.23M | src += src_stride; |
477 | 5.23M | } |
478 | 119k | } |
479 | | |
480 | | static INLINE void spv32_xb_y4(const uint8_t *src, int src_stride, |
481 | | const uint8_t *dst, int dst_stride, |
482 | | const uint8_t *second_pred, int second_stride, |
483 | | int do_sec, int height, __m256i *sum_reg, |
484 | 116k | __m256i *sse_reg, int x_offset) { |
485 | 116k | const __m256i zero_reg = _mm256_setzero_si256(); |
486 | 116k | const __m256i pw8 = _mm256_set1_epi16(8); |
487 | 116k | const __m256i filter = _mm256_load_si256( |
488 | 116k | (__m256i const *)(bilinear_filters_avx2 + (x_offset << 5))); |
489 | 116k | const __m256i src_a = _mm256_loadu_si256((__m256i const *)src); |
490 | 116k | const __m256i src_b = _mm256_loadu_si256((__m256i const *)(src + 1)); |
491 | 116k | __m256i exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; |
492 | 116k | __m256i src_reg, src_pack; |
493 | 116k | int i; |
494 | 116k | exp_src_lo = _mm256_unpacklo_epi8(src_a, src_b); |
495 | 116k | exp_src_hi = _mm256_unpackhi_epi8(src_a, src_b); |
496 | 116k | FILTER_SRC(filter) |
 497 | | // pack each 16-bit result back to 8 bits within each 128-bit lane |
498 | 116k | src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); |
499 | | |
500 | 116k | src += src_stride; |
501 | 5.37M | for (i = 0; i < height; i++) { |
502 | 5.26M | const __m256i dst_reg = _mm256_loadu_si256((__m256i const *)dst); |
503 | 5.26M | const __m256i src_0 = _mm256_loadu_si256((__m256i const *)src); |
504 | 5.26M | const __m256i src_1 = _mm256_loadu_si256((__m256i const *)(src + 1)); |
505 | 5.26M | exp_src_lo = _mm256_unpacklo_epi8(src_0, src_1); |
506 | 5.26M | exp_src_hi = _mm256_unpackhi_epi8(src_0, src_1); |
507 | | |
508 | 5.26M | FILTER_SRC(filter) |
509 | | |
510 | 5.26M | src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); |
 511 | | // average the previous packed row with the current one |
512 | 5.26M | src_pack = _mm256_avg_epu8(src_pack, src_reg); |
513 | | |
514 | 5.26M | if (do_sec) { |
515 | 0 | const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)second_pred); |
516 | 0 | const __m256i avg_pack = _mm256_avg_epu8(src_pack, sec_reg); |
517 | 0 | exp_src_lo = _mm256_unpacklo_epi8(avg_pack, zero_reg); |
518 | 0 | exp_src_hi = _mm256_unpackhi_epi8(avg_pack, zero_reg); |
519 | 0 | second_pred += second_stride; |
520 | 5.26M | } else { |
521 | 5.26M | exp_src_lo = _mm256_unpacklo_epi8(src_pack, zero_reg); |
522 | 5.26M | exp_src_hi = _mm256_unpackhi_epi8(src_pack, zero_reg); |
523 | 5.26M | } |
524 | 5.26M | CALC_SUM_SSE_INSIDE_LOOP |
525 | 5.26M | src_pack = src_reg; |
526 | 5.26M | dst += dst_stride; |
527 | 5.26M | src += src_stride; |
528 | 5.26M | } |
529 | 116k | } |
530 | | |
531 | | static INLINE void spv32_xb_yb(const uint8_t *src, int src_stride, |
532 | | const uint8_t *dst, int dst_stride, |
533 | | const uint8_t *second_pred, int second_stride, |
534 | | int do_sec, int height, __m256i *sum_reg, |
535 | 325k | __m256i *sse_reg, int x_offset, int y_offset) { |
536 | 325k | const __m256i zero_reg = _mm256_setzero_si256(); |
537 | 325k | const __m256i pw8 = _mm256_set1_epi16(8); |
538 | 325k | const __m256i xfilter = _mm256_load_si256( |
539 | 325k | (__m256i const *)(bilinear_filters_avx2 + (x_offset << 5))); |
540 | 325k | const __m256i yfilter = _mm256_load_si256( |
541 | 325k | (__m256i const *)(bilinear_filters_avx2 + (y_offset << 5))); |
542 | 325k | const __m256i src_a = _mm256_loadu_si256((__m256i const *)src); |
543 | 325k | const __m256i src_b = _mm256_loadu_si256((__m256i const *)(src + 1)); |
544 | 325k | __m256i exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; |
545 | 325k | __m256i prev_src_pack, src_pack; |
546 | 325k | int i; |
547 | 325k | exp_src_lo = _mm256_unpacklo_epi8(src_a, src_b); |
548 | 325k | exp_src_hi = _mm256_unpackhi_epi8(src_a, src_b); |
549 | 325k | FILTER_SRC(xfilter) |
 550 | | // pack each 16-bit result back to 8 bits within each 128-bit lane |
551 | 325k | prev_src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); |
552 | 325k | src += src_stride; |
553 | | |
554 | 14.4M | for (i = 0; i < height; i++) { |
555 | 14.1M | const __m256i dst_reg = _mm256_loadu_si256((__m256i const *)dst); |
556 | 14.1M | const __m256i src_0 = _mm256_loadu_si256((__m256i const *)src); |
557 | 14.1M | const __m256i src_1 = _mm256_loadu_si256((__m256i const *)(src + 1)); |
558 | 14.1M | exp_src_lo = _mm256_unpacklo_epi8(src_0, src_1); |
559 | 14.1M | exp_src_hi = _mm256_unpackhi_epi8(src_0, src_1); |
560 | | |
561 | 14.1M | FILTER_SRC(xfilter) |
562 | 14.1M | src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); |
563 | | |
 564 | | // interleave the previous packed row with the current one |
565 | 14.1M | exp_src_lo = _mm256_unpacklo_epi8(prev_src_pack, src_pack); |
566 | 14.1M | exp_src_hi = _mm256_unpackhi_epi8(prev_src_pack, src_pack); |
567 | | |
568 | 14.1M | FILTER_SRC(yfilter) |
569 | 14.1M | if (do_sec) { |
570 | 0 | const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)second_pred); |
571 | 0 | const __m256i exp_src = _mm256_packus_epi16(exp_src_lo, exp_src_hi); |
572 | 0 | const __m256i avg_reg = _mm256_avg_epu8(exp_src, sec_reg); |
573 | 0 | exp_src_lo = _mm256_unpacklo_epi8(avg_reg, zero_reg); |
574 | 0 | exp_src_hi = _mm256_unpackhi_epi8(avg_reg, zero_reg); |
575 | 0 | second_pred += second_stride; |
576 | 0 | } |
577 | | |
578 | 14.1M | prev_src_pack = src_pack; |
579 | | |
580 | 14.1M | CALC_SUM_SSE_INSIDE_LOOP |
581 | 14.1M | dst += dst_stride; |
582 | 14.1M | src += src_stride; |
583 | 14.1M | } |
584 | 325k | } |
585 | | |
586 | | static INLINE int sub_pix_var32xh(const uint8_t *src, int src_stride, |
587 | | int x_offset, int y_offset, |
588 | | const uint8_t *dst, int dst_stride, |
589 | | const uint8_t *second_pred, int second_stride, |
590 | 2.29M | int do_sec, int height, unsigned int *sse) { |
591 | 2.29M | const __m256i zero_reg = _mm256_setzero_si256(); |
592 | 2.29M | __m256i sum_reg = _mm256_setzero_si256(); |
593 | 2.29M | __m256i sse_reg = _mm256_setzero_si256(); |
594 | 2.29M | __m256i sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi; |
595 | 2.29M | int sum; |
596 | | // x_offset = 0 and y_offset = 0 |
597 | 2.29M | if (x_offset == 0) { |
598 | 796k | if (y_offset == 0) { |
599 | 69.5k | spv32_x0_y0(src, src_stride, dst, dst_stride, second_pred, second_stride, |
600 | 69.5k | do_sec, height, &sum_reg, &sse_reg); |
601 | | // x_offset = 0 and y_offset = 4 |
602 | 726k | } else if (y_offset == 4) { |
603 | 378k | spv32_x0_y4(src, src_stride, dst, dst_stride, second_pred, second_stride, |
604 | 378k | do_sec, height, &sum_reg, &sse_reg); |
605 | | // x_offset = 0 and y_offset = bilin interpolation |
606 | 378k | } else { |
607 | 348k | spv32_x0_yb(src, src_stride, dst, dst_stride, second_pred, second_stride, |
608 | 348k | do_sec, height, &sum_reg, &sse_reg, y_offset); |
609 | 348k | } |
610 | | // x_offset = 4 and y_offset = 0 |
611 | 1.50M | } else if (x_offset == 4) { |
612 | 716k | if (y_offset == 0) { |
613 | 376k | spv32_x4_y0(src, src_stride, dst, dst_stride, second_pred, second_stride, |
614 | 376k | do_sec, height, &sum_reg, &sse_reg); |
615 | | // x_offset = 4 and y_offset = 4 |
616 | 376k | } else if (y_offset == 4) { |
617 | 220k | spv32_x4_y4(src, src_stride, dst, dst_stride, second_pred, second_stride, |
618 | 220k | do_sec, height, &sum_reg, &sse_reg); |
619 | | // x_offset = 4 and y_offset = bilin interpolation |
620 | 220k | } else { |
621 | 119k | spv32_x4_yb(src, src_stride, dst, dst_stride, second_pred, second_stride, |
622 | 119k | do_sec, height, &sum_reg, &sse_reg, y_offset); |
623 | 119k | } |
624 | | // x_offset = bilin interpolation and y_offset = 0 |
625 | 786k | } else { |
626 | 786k | if (y_offset == 0) { |
627 | 344k | spv32_xb_y0(src, src_stride, dst, dst_stride, second_pred, second_stride, |
628 | 344k | do_sec, height, &sum_reg, &sse_reg, x_offset); |
629 | | // x_offset = bilin interpolation and y_offset = 4 |
630 | 441k | } else if (y_offset == 4) { |
631 | 116k | spv32_xb_y4(src, src_stride, dst, dst_stride, second_pred, second_stride, |
632 | 116k | do_sec, height, &sum_reg, &sse_reg, x_offset); |
633 | | // x_offset = bilin interpolation and y_offset = bilin interpolation |
634 | 325k | } else { |
635 | 325k | spv32_xb_yb(src, src_stride, dst, dst_stride, second_pred, second_stride, |
636 | 325k | do_sec, height, &sum_reg, &sse_reg, x_offset, y_offset); |
637 | 325k | } |
638 | 786k | } |
639 | 2.29M | CALC_SUM_AND_SSE |
640 | 2.29M | return sum; |
641 | 2.29M | } |
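The dispatch above distinguishes three sub-pel cases per axis: offset 0 needs no filtering, offset 4 is the half-pel case served by _mm256_avg_epu8, and any other 1/8-pel offset goes through the bilinear filter table. A scalar sketch of the per-axis selection (comments only):

    // Per-axis filtering chosen by sub_pix_var32xh (offset in 1/8-pel units):
    //   offset == 0 : use the pixel as-is
    //   offset == 4 : (a + b + 1) >> 1                           // half-pel average
    //   otherwise   : (a*(16 - 2*offset) + b*2*offset + 8) >> 4  // bilinear filter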
642 | | |
643 | | static int sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride, |
644 | | int x_offset, int y_offset, |
645 | | const uint8_t *dst, int dst_stride, |
646 | 2.29M | int height, unsigned int *sse) { |
647 | 2.29M | return sub_pix_var32xh(src, src_stride, x_offset, y_offset, dst, dst_stride, |
648 | 2.29M | NULL, 0, 0, height, sse); |
649 | 2.29M | } |
650 | | |
651 | | static int sub_pixel_avg_variance32xh_avx2(const uint8_t *src, int src_stride, |
652 | | int x_offset, int y_offset, |
653 | | const uint8_t *dst, int dst_stride, |
654 | | const uint8_t *second_pred, |
655 | | int second_stride, int height, |
656 | 0 | unsigned int *sse) { |
657 | 0 | return sub_pix_var32xh(src, src_stride, x_offset, y_offset, dst, dst_stride, |
658 | 0 | second_pred, second_stride, 1, height, sse); |
659 | 0 | } |
660 | | |
661 | | typedef void (*get_var_avx2)(const uint8_t *src_ptr, int src_stride, |
662 | | const uint8_t *ref_ptr, int ref_stride, |
663 | | unsigned int *sse, int *sum); |
664 | | |
665 | | unsigned int vpx_variance8x4_avx2(const uint8_t *src_ptr, int src_stride, |
666 | | const uint8_t *ref_ptr, int ref_stride, |
667 | 8.62M | unsigned int *sse) { |
668 | 8.62M | __m256i vsse, vsum; |
669 | 8.62M | int sum; |
670 | 8.62M | variance8_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 4, &vsse, &vsum); |
671 | 8.62M | variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum); |
672 | 8.62M | return *sse - ((sum * sum) >> 5); |
673 | 8.62M | } |
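The variance wrappers in this file all use the identity var = sse - sum*sum / N, where N = W*H is the pixel count of the block; the division appears as a right shift by log2(N). For the 8x4 case above, for example:

    // N = 8 * 4 = 32, so log2(N) = 5:
    //   variance = sse - (sum * sum) / 32 = *sse - ((sum * sum) >> 5)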
674 | | |
675 | | unsigned int vpx_variance8x8_avx2(const uint8_t *src_ptr, int src_stride, |
676 | | const uint8_t *ref_ptr, int ref_stride, |
677 | 133M | unsigned int *sse) { |
678 | 133M | __m256i vsse, vsum; |
679 | 133M | int sum; |
680 | 133M | variance8_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 8, &vsse, &vsum); |
681 | 133M | variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum); |
682 | 133M | return *sse - ((sum * sum) >> 6); |
683 | 133M | } |
684 | | |
685 | | unsigned int vpx_variance8x16_avx2(const uint8_t *src_ptr, int src_stride, |
686 | | const uint8_t *ref_ptr, int ref_stride, |
687 | 6.54M | unsigned int *sse) { |
688 | 6.54M | __m256i vsse, vsum; |
689 | 6.54M | int sum; |
690 | 6.54M | variance8_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum); |
691 | 6.54M | variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum); |
692 | 6.54M | return *sse - ((sum * sum) >> 7); |
693 | 6.54M | } |
694 | | |
695 | | unsigned int vpx_variance16x8_avx2(const uint8_t *src_ptr, int src_stride, |
696 | | const uint8_t *ref_ptr, int ref_stride, |
697 | 6.47M | unsigned int *sse) { |
698 | 6.47M | int sum; |
699 | 6.47M | __m256i vsse, vsum; |
700 | 6.47M | variance16_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 8, &vsse, &vsum); |
701 | 6.47M | variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum); |
702 | 6.47M | return *sse - (uint32_t)(((int64_t)sum * sum) >> 7); |
703 | 6.47M | } |
704 | | |
705 | | unsigned int vpx_variance16x16_avx2(const uint8_t *src_ptr, int src_stride, |
706 | | const uint8_t *ref_ptr, int ref_stride, |
707 | 39.3M | unsigned int *sse) { |
708 | 39.3M | int sum; |
709 | 39.3M | __m256i vsse, vsum; |
710 | 39.3M | variance16_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum); |
711 | 39.3M | variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum); |
712 | 39.3M | return *sse - (uint32_t)(((int64_t)sum * sum) >> 8); |
713 | 39.3M | } |
714 | | |
715 | | unsigned int vpx_variance16x32_avx2(const uint8_t *src_ptr, int src_stride, |
716 | | const uint8_t *ref_ptr, int ref_stride, |
717 | 1.55M | unsigned int *sse) { |
718 | 1.55M | int sum; |
719 | 1.55M | __m256i vsse, vsum; |
720 | 1.55M | variance16_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 32, &vsse, &vsum); |
721 | 1.55M | variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum); |
722 | 1.55M | return *sse - (uint32_t)(((int64_t)sum * sum) >> 9); |
723 | 1.55M | } |
724 | | |
725 | | unsigned int vpx_variance32x16_avx2(const uint8_t *src_ptr, int src_stride, |
726 | | const uint8_t *ref_ptr, int ref_stride, |
727 | 1.70M | unsigned int *sse) { |
728 | 1.70M | int sum; |
729 | 1.70M | __m256i vsse, vsum; |
730 | 1.70M | variance32_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum); |
731 | 1.70M | variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum); |
732 | 1.70M | return *sse - (uint32_t)(((int64_t)sum * sum) >> 9); |
733 | 1.70M | } |
734 | | |
735 | | unsigned int vpx_variance32x32_avx2(const uint8_t *src_ptr, int src_stride, |
736 | | const uint8_t *ref_ptr, int ref_stride, |
737 | 6.77M | unsigned int *sse) { |
738 | 6.77M | int sum; |
739 | 6.77M | __m256i vsse, vsum; |
740 | 6.77M | __m128i vsum_128; |
741 | 6.77M | variance32_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 32, &vsse, &vsum); |
742 | 6.77M | vsum_128 = _mm_add_epi16(_mm256_castsi256_si128(vsum), |
743 | 6.77M | _mm256_extractf128_si256(vsum, 1)); |
744 | 6.77M | vsum_128 = _mm_add_epi32(_mm_cvtepi16_epi32(vsum_128), |
745 | 6.77M | _mm_cvtepi16_epi32(_mm_srli_si128(vsum_128, 8))); |
746 | 6.77M | variance_final_from_32bit_sum_avx2(vsse, vsum_128, sse, &sum); |
747 | 6.77M | return *sse - (uint32_t)(((int64_t)sum * sum) >> 10); |
748 | 6.77M | } |
749 | | |
750 | | unsigned int vpx_variance32x64_avx2(const uint8_t *src_ptr, int src_stride, |
751 | | const uint8_t *ref_ptr, int ref_stride, |
752 | 302k | unsigned int *sse) { |
753 | 302k | int sum; |
754 | 302k | __m256i vsse, vsum; |
755 | 302k | __m128i vsum_128; |
756 | 302k | variance32_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 64, &vsse, &vsum); |
757 | 302k | vsum = sum_to_32bit_avx2(vsum); |
758 | 302k | vsum_128 = _mm_add_epi32(_mm256_castsi256_si128(vsum), |
759 | 302k | _mm256_extractf128_si256(vsum, 1)); |
760 | 302k | variance_final_from_32bit_sum_avx2(vsse, vsum_128, sse, &sum); |
761 | 302k | return *sse - (uint32_t)(((int64_t)sum * sum) >> 11); |
762 | 302k | } |
763 | | |
764 | | unsigned int vpx_variance64x32_avx2(const uint8_t *src_ptr, int src_stride, |
765 | | const uint8_t *ref_ptr, int ref_stride, |
766 | 413k | unsigned int *sse) { |
767 | 413k | __m256i vsse = _mm256_setzero_si256(); |
768 | 413k | __m256i vsum = _mm256_setzero_si256(); |
769 | 413k | __m128i vsum_128; |
770 | 413k | int sum; |
771 | 413k | variance64_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 32, &vsse, &vsum); |
772 | 413k | vsum = sum_to_32bit_avx2(vsum); |
773 | 413k | vsum_128 = _mm_add_epi32(_mm256_castsi256_si128(vsum), |
774 | 413k | _mm256_extractf128_si256(vsum, 1)); |
775 | 413k | variance_final_from_32bit_sum_avx2(vsse, vsum_128, sse, &sum); |
776 | 413k | return *sse - (uint32_t)(((int64_t)sum * sum) >> 11); |
777 | 413k | } |
778 | | |
779 | | unsigned int vpx_variance64x64_avx2(const uint8_t *src_ptr, int src_stride, |
780 | | const uint8_t *ref_ptr, int ref_stride, |
781 | 646k | unsigned int *sse) { |
782 | 646k | __m256i vsse = _mm256_setzero_si256(); |
783 | 646k | __m256i vsum = _mm256_setzero_si256(); |
784 | 646k | __m128i vsum_128; |
785 | 646k | int sum; |
786 | 646k | int i = 0; |
787 | | |
788 | 1.94M | for (i = 0; i < 2; i++) { |
789 | 1.29M | __m256i vsum16; |
790 | 1.29M | variance64_avx2(src_ptr + 32 * i * src_stride, src_stride, |
791 | 1.29M | ref_ptr + 32 * i * ref_stride, ref_stride, 32, &vsse, |
792 | 1.29M | &vsum16); |
793 | 1.29M | vsum = _mm256_add_epi32(vsum, sum_to_32bit_avx2(vsum16)); |
794 | 1.29M | } |
795 | 646k | vsum_128 = _mm_add_epi32(_mm256_castsi256_si128(vsum), |
796 | 646k | _mm256_extractf128_si256(vsum, 1)); |
797 | 646k | variance_final_from_32bit_sum_avx2(vsse, vsum_128, sse, &sum); |
798 | 646k | return *sse - (unsigned int)(((int64_t)sum * sum) >> 12); |
799 | 646k | } |
800 | | |
801 | | unsigned int vpx_mse16x8_avx2(const uint8_t *src_ptr, int src_stride, |
802 | | const uint8_t *ref_ptr, int ref_stride, |
803 | 0 | unsigned int *sse) { |
804 | 0 | int sum; |
805 | 0 | __m256i vsse, vsum; |
806 | 0 | variance16_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 8, &vsse, &vsum); |
807 | 0 | variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum); |
808 | 0 | return *sse; |
809 | 0 | } |
810 | | |
811 | | unsigned int vpx_mse16x16_avx2(const uint8_t *src_ptr, int src_stride, |
812 | | const uint8_t *ref_ptr, int ref_stride, |
813 | 9.86M | unsigned int *sse) { |
814 | 9.86M | int sum; |
815 | 9.86M | __m256i vsse, vsum; |
816 | 9.86M | variance16_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum); |
817 | 9.86M | variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum); |
818 | 9.86M | return *sse; |
819 | 9.86M | } |
820 | | |
821 | | unsigned int vpx_sub_pixel_variance64x64_avx2( |
822 | | const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, |
823 | 414k | const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { |
824 | 414k | unsigned int sse1; |
825 | 414k | const int se1 = sub_pixel_variance32xh_avx2( |
826 | 414k | src_ptr, src_stride, x_offset, y_offset, ref_ptr, ref_stride, 64, &sse1); |
827 | 414k | unsigned int sse2; |
828 | 414k | const int se2 = |
829 | 414k | sub_pixel_variance32xh_avx2(src_ptr + 32, src_stride, x_offset, y_offset, |
830 | 414k | ref_ptr + 32, ref_stride, 64, &sse2); |
831 | 414k | const int se = se1 + se2; |
832 | 414k | *sse = sse1 + sse2; |
833 | 414k | return *sse - (uint32_t)(((int64_t)se * se) >> 12); |
834 | 414k | } |
835 | | |
836 | | unsigned int vpx_sub_pixel_variance32x32_avx2( |
837 | | const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, |
838 | 1.47M | const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { |
839 | 1.47M | const int se = sub_pixel_variance32xh_avx2( |
840 | 1.47M | src_ptr, src_stride, x_offset, y_offset, ref_ptr, ref_stride, 32, sse); |
841 | 1.47M | return *sse - (uint32_t)(((int64_t)se * se) >> 10); |
842 | 1.47M | } |
843 | | |
844 | | unsigned int vpx_sub_pixel_avg_variance64x64_avx2( |
845 | | const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, |
846 | | const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, |
847 | 0 | const uint8_t *second_pred) { |
848 | 0 | unsigned int sse1; |
849 | 0 | const int se1 = sub_pixel_avg_variance32xh_avx2(src_ptr, src_stride, x_offset, |
850 | 0 | y_offset, ref_ptr, ref_stride, |
851 | 0 | second_pred, 64, 64, &sse1); |
852 | 0 | unsigned int sse2; |
853 | 0 | const int se2 = sub_pixel_avg_variance32xh_avx2( |
854 | 0 | src_ptr + 32, src_stride, x_offset, y_offset, ref_ptr + 32, ref_stride, |
855 | 0 | second_pred + 32, 64, 64, &sse2); |
856 | 0 | const int se = se1 + se2; |
857 | |
858 | 0 | *sse = sse1 + sse2; |
859 | |
860 | 0 | return *sse - (uint32_t)(((int64_t)se * se) >> 12); |
861 | 0 | } |
862 | | |
863 | | unsigned int vpx_sub_pixel_avg_variance32x32_avx2( |
864 | | const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, |
865 | | const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, |
866 | 0 | const uint8_t *second_pred) { |
867 | | // Process 32 elements in parallel. |
868 | 0 | const int se = sub_pixel_avg_variance32xh_avx2(src_ptr, src_stride, x_offset, |
869 | 0 | y_offset, ref_ptr, ref_stride, |
870 | 0 | second_pred, 32, 32, sse); |
871 | 0 | return *sse - (uint32_t)(((int64_t)se * se) >> 10); |
872 | 0 | } |