/src/libvpx/vpx_dsp/x86/sad_avx2.c
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) 2012 The WebM project authors. All Rights Reserved. |
3 | | * |
4 | | * Use of this source code is governed by a BSD-style license |
5 | | * that can be found in the LICENSE file in the root of the source |
6 | | * tree. An additional intellectual property rights grant can be found |
7 | | * in the file PATENTS. All contributing project authors may |
8 | | * be found in the AUTHORS file in the root of the source tree. |
9 | | */ |
10 | | #include <immintrin.h> |
11 | | #include "./vpx_dsp_rtcd.h" |
12 | | #include "vpx_ports/mem.h" |
13 | | |
14 | | static INLINE unsigned int sad64xh_avx2(const uint8_t *src_ptr, int src_stride, |
15 | | const uint8_t *ref_ptr, int ref_stride, |
16 | 5.44M | int h) { |
17 | 5.44M | int i, res; |
18 | 5.44M | __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; |
19 | 5.44M | __m256i sum_sad = _mm256_setzero_si256(); |
20 | 5.44M | __m256i sum_sad_h; |
21 | 5.44M | __m128i sum_sad128; |
22 | 195M | for (i = 0; i < h; i++) { |
23 | 190M | ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); |
24 | 190M | ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32)); |
25 | 190M | sad1_reg = |
26 | 190M | _mm256_sad_epu8(ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); |
27 | 190M | sad2_reg = _mm256_sad_epu8( |
28 | 190M | ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32))); |
29 | 190M | sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); |
30 | 190M | ref_ptr += ref_stride; |
31 | 190M | src_ptr += src_stride; |
32 | 190M | } |
33 | 5.44M | sum_sad_h = _mm256_srli_si256(sum_sad, 8); |
34 | 5.44M | sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); |
35 | 5.44M | sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); |
36 | 5.44M | sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); |
37 | 5.44M | res = _mm_cvtsi128_si32(sum_sad128); |
38 | 5.44M | return res; |
39 | 5.44M | } |
40 | | |
41 | | static INLINE unsigned int sad32xh_avx2(const uint8_t *src_ptr, int src_stride, |
42 | | const uint8_t *ref_ptr, int ref_stride, |
43 | 17.2M | int h) { |
44 | 17.2M | int i, res; |
45 | 17.2M | __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; |
46 | 17.2M | __m256i sum_sad = _mm256_setzero_si256(); |
47 | 17.2M | __m256i sum_sad_h; |
48 | 17.2M | __m128i sum_sad128; |
49 | 17.2M | const int ref2_stride = ref_stride << 1; |
50 | 17.2M | const int src2_stride = src_stride << 1; |
51 | 17.2M | const int max = h >> 1; |
52 | 208M | for (i = 0; i < max; i++) { |
53 | 191M | ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); |
54 | 191M | ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); |
55 | 191M | sad1_reg = |
56 | 191M | _mm256_sad_epu8(ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); |
57 | 191M | sad2_reg = _mm256_sad_epu8( |
58 | 191M | ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride))); |
59 | 191M | sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); |
60 | 191M | ref_ptr += ref2_stride; |
61 | 191M | src_ptr += src2_stride; |
62 | 191M | } |
63 | 17.2M | sum_sad_h = _mm256_srli_si256(sum_sad, 8); |
64 | 17.2M | sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); |
65 | 17.2M | sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); |
66 | 17.2M | sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); |
67 | 17.2M | res = _mm_cvtsi128_si32(sum_sad128); |
68 | 17.2M | return res; |
69 | 17.2M | } |
70 | | |
// Defines vpx_sad64x<h>_avx2(): the SAD of a 64-byte-wide, <h>-row block,
// computed by forwarding to sad64xh_avx2() with the row count fixed to `h`.
#define FSAD64_H(h)                                                  \
  unsigned int vpx_sad64x##h##_avx2(const uint8_t *src_ptr,          \
                                    int src_stride,                  \
                                    const uint8_t *ref_ptr,          \
                                    int ref_stride) {                \
    const unsigned int sad =                                         \
        sad64xh_avx2(src_ptr, src_stride, ref_ptr, ref_stride, h);   \
    return sad;                                                      \
  }
|
76 | | |
// Defines vpx_sad_skip_64x<h>_avx2(): an approximate SAD for a 64-byte-wide,
// <h>-row block. Both strides are doubled so only every other row is visited
// (h / 2 rows in total), and the partial sum is multiplied by 2.
#define FSADS64_H(h)                                                   \
  unsigned int vpx_sad_skip_64x##h##_avx2(const uint8_t *src_ptr,      \
                                          int src_stride,              \
                                          const uint8_t *ref_ptr,      \
                                          int ref_stride) {            \
    const unsigned int half_sad = sad64xh_avx2(                        \
        src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, h / 2);      \
    return 2 * half_sad;                                               \
  }
|
84 | | |
// Defines vpx_sad32x<h>_avx2(): the SAD of a 32-byte-wide, <h>-row block,
// computed by forwarding to sad32xh_avx2() with the row count fixed to `h`.
#define FSAD32_H(h)                                                  \
  unsigned int vpx_sad32x##h##_avx2(const uint8_t *src_ptr,          \
                                    int src_stride,                  \
                                    const uint8_t *ref_ptr,          \
                                    int ref_stride) {                \
    const unsigned int sad =                                         \
        sad32xh_avx2(src_ptr, src_stride, ref_ptr, ref_stride, h);   \
    return sad;                                                      \
  }
|
90 | | |
// Defines vpx_sad_skip_32x<h>_avx2(): an approximate SAD for a 32-byte-wide,
// <h>-row block. Both strides are doubled so only every other row is visited
// (h / 2 rows in total), and the partial sum is multiplied by 2.
#define FSADS32_H(h)                                                   \
  unsigned int vpx_sad_skip_32x##h##_avx2(const uint8_t *src_ptr,      \
                                          int src_stride,              \
                                          const uint8_t *ref_ptr,      \
                                          int ref_stride) {            \
    const unsigned int half_sad = sad32xh_avx2(                        \
        src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, h / 2);      \
    return 2 * half_sad;                                               \
  }
|
98 | | |
99 | | #if CONFIG_ENCODERS |
100 | | #define FSAD64 \ |
101 | | FSAD64_H(64) \ |
102 | | FSAD64_H(32) \ |
103 | | FSADS64_H(64) \ |
104 | | FSADS64_H(32) |
105 | | |
106 | | #define FSAD32 \ |
107 | | FSAD32_H(64) \ |
108 | | FSAD32_H(32) \ |
109 | | FSAD32_H(16) \ |
110 | | FSADS32_H(64) \ |
111 | | FSADS32_H(32) \ |
112 | | FSADS32_H(16) |
113 | | #else // !CONFIG_ENCODERS |
114 | | #define FSAD64 FSAD64_H(64) |
115 | | #define FSAD32 FSAD32_H(32) |
116 | | #endif // CONFIG_ENCODERS |
117 | | |
118 | | FSAD64 |
119 | | FSAD32 |
120 | | |
121 | | #undef FSAD64 |
122 | | #undef FSAD32 |
123 | | #undef FSAD64_H |
124 | | #undef FSAD32_H |
125 | | #undef FSADS64_H |
126 | | #undef FSADS32_H |
127 | | |
128 | | #if CONFIG_ENCODERS |
// Defines vpx_sad64x<h>_avg_avx2(): for each of the <h> rows, the 64 reference
// bytes are combined with 64 bytes of `second_pred` using _mm256_avg_epu8, and
// the SAD of that averaged row against the source row is accumulated.
// `second_pred` is read contiguously: it advances by 64 bytes per row
// regardless of the source/reference strides.
#define FSADAVG64_H(h)                                                        \
  unsigned int vpx_sad64x##h##_avg_avx2(                                      \
      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,         \
      int ref_stride, const uint8_t *second_pred) {                           \
    __m256i acc = _mm256_setzero_si256();                                     \
    __m128i total;                                                            \
    int row;                                                                  \
    for (row = 0; row < h; ++row) {                                           \
      const __m256i pred_lo =                                                 \
          _mm256_loadu_si256((const __m256i *)second_pred);                   \
      const __m256i pred_hi =                                                 \
          _mm256_loadu_si256((const __m256i *)(second_pred + 32));            \
      const __m256i avg_lo = _mm256_avg_epu8(                                 \
          _mm256_loadu_si256((const __m256i *)ref_ptr), pred_lo);             \
      const __m256i avg_hi = _mm256_avg_epu8(                                 \
          _mm256_loadu_si256((const __m256i *)(ref_ptr + 32)), pred_hi);      \
      const __m256i sad_lo = _mm256_sad_epu8(                                 \
          avg_lo, _mm256_loadu_si256((const __m256i *)src_ptr));              \
      const __m256i sad_hi = _mm256_sad_epu8(                                 \
          avg_hi, _mm256_loadu_si256((const __m256i *)(src_ptr + 32)));       \
      acc = _mm256_add_epi32(acc, _mm256_add_epi32(sad_lo, sad_hi));          \
      ref_ptr += ref_stride;                                                  \
      src_ptr += src_stride;                                                  \
      second_pred += 64;                                                      \
    }                                                                         \
    acc = _mm256_add_epi32(acc, _mm256_srli_si256(acc, 8));                   \
    total = _mm_add_epi32(_mm256_castsi256_si128(acc),                        \
                          _mm256_extracti128_si256(acc, 1));                  \
    return (unsigned int)_mm_cvtsi128_si32(total);                            \
  }
161 | | |
// Defines vpx_sad32x<h>_avg_avx2(): the reference rows are combined with
// `second_pred` using _mm256_avg_epu8 and the SAD of the averaged rows against
// the source rows is accumulated. Two 32-byte rows are handled per iteration
// (h >> 1 iterations); `second_pred` is read contiguously, 64 bytes (two rows)
// per iteration, regardless of the source/reference strides.
//
// Doubled strides are computed with `* 2` rather than `<< 1`, because
// left-shifting a negative signed value is undefined behavior in C.
#define FSADAVG32_H(h)                                                        \
  unsigned int vpx_sad32x##h##_avg_avx2(                                      \
      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,         \
      int ref_stride, const uint8_t *second_pred) {                           \
    const int ref2_stride = ref_stride * 2;                                   \
    const int src2_stride = src_stride * 2;                                   \
    const int row_pairs = (h) >> 1;                                           \
    __m256i acc = _mm256_setzero_si256();                                     \
    __m128i total;                                                            \
    int i;                                                                    \
    for (i = 0; i < row_pairs; ++i) {                                         \
      const __m256i pred_row0 =                                               \
          _mm256_loadu_si256((const __m256i *)second_pred);                   \
      const __m256i pred_row1 =                                               \
          _mm256_loadu_si256((const __m256i *)(second_pred + 32));            \
      const __m256i avg_row0 = _mm256_avg_epu8(                               \
          _mm256_loadu_si256((const __m256i *)ref_ptr), pred_row0);           \
      const __m256i avg_row1 = _mm256_avg_epu8(                               \
          _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride)),        \
          pred_row1);                                                         \
      const __m256i sad_row0 = _mm256_sad_epu8(                               \
          avg_row0, _mm256_loadu_si256((const __m256i *)src_ptr));            \
      const __m256i sad_row1 = _mm256_sad_epu8(                               \
          avg_row1,                                                           \
          _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride)));       \
      acc = _mm256_add_epi32(acc, _mm256_add_epi32(sad_row0, sad_row1));      \
      ref_ptr += ref2_stride;                                                 \
      src_ptr += src2_stride;                                                 \
      second_pred += 64;                                                      \
    }                                                                         \
    acc = _mm256_add_epi32(acc, _mm256_srli_si256(acc, 8));                   \
    total = _mm_add_epi32(_mm256_castsi256_si128(acc),                        \
                          _mm256_extracti128_si256(acc, 1));                  \
    return (unsigned int)_mm_cvtsi128_si32(total);                            \
  }
198 | | |
199 | | #define FSADAVG64 \ |
200 | | FSADAVG64_H(64) \ |
201 | | FSADAVG64_H(32) |
202 | | |
203 | | #define FSADAVG32 \ |
204 | | FSADAVG32_H(64) \ |
205 | | FSADAVG32_H(32) \ |
206 | | FSADAVG32_H(16) |
207 | | |
208 | 0 | FSADAVG64 Unexecuted instantiation: vpx_sad64x64_avg_avx2 Unexecuted instantiation: vpx_sad64x32_avg_avx2 |
209 | | FSADAVG32 Unexecuted instantiation: vpx_sad32x64_avg_avx2 Unexecuted instantiation: vpx_sad32x32_avg_avx2 Unexecuted instantiation: vpx_sad32x16_avg_avx2 |
210 | | |
211 | | #undef FSADAVG64 |
212 | | #undef FSADAVG32 |
213 | | #undef FSADAVG64_H |
214 | | #undef FSADAVG32_H |
215 | | |
216 | | #endif // CONFIG_ENCODERS |