/src/libvpx/vpx_dsp/x86/avg_intrin_avx2.c
/*
 *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <immintrin.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/x86/bitdepth_conversion_avx2.h"
#include "vpx_ports/mem.h"

#if CONFIG_VP9_HIGHBITDEPTH
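// One butterfly stage of an 8x8 Hadamard transform over eight vectors of
// 32-bit coefficients.  When iter == 0 the result is also transposed so the
// second pass (iter == 1) operates along the other dimension.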
static void highbd_hadamard_col8_avx2(__m256i *in, int iter) {
  __m256i a0 = in[0];
  __m256i a1 = in[1];
  __m256i a2 = in[2];
  __m256i a3 = in[3];
  __m256i a4 = in[4];
  __m256i a5 = in[5];
  __m256i a6 = in[6];
  __m256i a7 = in[7];

  __m256i b0 = _mm256_add_epi32(a0, a1);
  __m256i b1 = _mm256_sub_epi32(a0, a1);
  __m256i b2 = _mm256_add_epi32(a2, a3);
  __m256i b3 = _mm256_sub_epi32(a2, a3);
  __m256i b4 = _mm256_add_epi32(a4, a5);
  __m256i b5 = _mm256_sub_epi32(a4, a5);
  __m256i b6 = _mm256_add_epi32(a6, a7);
  __m256i b7 = _mm256_sub_epi32(a6, a7);

  a0 = _mm256_add_epi32(b0, b2);
  a1 = _mm256_add_epi32(b1, b3);
  a2 = _mm256_sub_epi32(b0, b2);
  a3 = _mm256_sub_epi32(b1, b3);
  a4 = _mm256_add_epi32(b4, b6);
  a5 = _mm256_add_epi32(b5, b7);
  a6 = _mm256_sub_epi32(b4, b6);
  a7 = _mm256_sub_epi32(b5, b7);

  if (iter == 0) {
    b0 = _mm256_add_epi32(a0, a4);
    b7 = _mm256_add_epi32(a1, a5);
    b3 = _mm256_add_epi32(a2, a6);
    b4 = _mm256_add_epi32(a3, a7);
    b2 = _mm256_sub_epi32(a0, a4);
    b6 = _mm256_sub_epi32(a1, a5);
    b1 = _mm256_sub_epi32(a2, a6);
    b5 = _mm256_sub_epi32(a3, a7);

    a0 = _mm256_unpacklo_epi32(b0, b1);
    a1 = _mm256_unpacklo_epi32(b2, b3);
    a2 = _mm256_unpackhi_epi32(b0, b1);
    a3 = _mm256_unpackhi_epi32(b2, b3);
    a4 = _mm256_unpacklo_epi32(b4, b5);
    a5 = _mm256_unpacklo_epi32(b6, b7);
    a6 = _mm256_unpackhi_epi32(b4, b5);
    a7 = _mm256_unpackhi_epi32(b6, b7);

    b0 = _mm256_unpacklo_epi64(a0, a1);
    b1 = _mm256_unpacklo_epi64(a4, a5);
    b2 = _mm256_unpackhi_epi64(a0, a1);
    b3 = _mm256_unpackhi_epi64(a4, a5);
    b4 = _mm256_unpacklo_epi64(a2, a3);
    b5 = _mm256_unpacklo_epi64(a6, a7);
    b6 = _mm256_unpackhi_epi64(a2, a3);
    b7 = _mm256_unpackhi_epi64(a6, a7);

    in[0] = _mm256_permute2x128_si256(b0, b1, 0x20);
    in[1] = _mm256_permute2x128_si256(b0, b1, 0x31);
    in[2] = _mm256_permute2x128_si256(b2, b3, 0x20);
    in[3] = _mm256_permute2x128_si256(b2, b3, 0x31);
    in[4] = _mm256_permute2x128_si256(b4, b5, 0x20);
    in[5] = _mm256_permute2x128_si256(b4, b5, 0x31);
    in[6] = _mm256_permute2x128_si256(b6, b7, 0x20);
    in[7] = _mm256_permute2x128_si256(b6, b7, 0x31);
  } else {
    in[0] = _mm256_add_epi32(a0, a4);
    in[7] = _mm256_add_epi32(a1, a5);
    in[3] = _mm256_add_epi32(a2, a6);
    in[4] = _mm256_add_epi32(a3, a7);
    in[2] = _mm256_sub_epi32(a0, a4);
    in[6] = _mm256_sub_epi32(a1, a5);
    in[1] = _mm256_sub_epi32(a2, a6);
    in[5] = _mm256_sub_epi32(a3, a7);
  }
}

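// 8x8 Hadamard transform of a high-bitdepth residual block: the 16-bit
// src_diff rows are widened to 32 bits, transformed in two passes, and
// written out as 32-bit tran_low_t coefficients.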
void vpx_highbd_hadamard_8x8_avx2(const int16_t *src_diff, ptrdiff_t src_stride,
                                  tran_low_t *coeff) {
  __m128i src16[8];
  __m256i src32[8];

  src16[0] = _mm_loadu_si128((const __m128i *)src_diff);
  src16[1] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
  src16[2] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
  src16[3] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
  src16[4] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
  src16[5] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
  src16[6] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
  src16[7] = _mm_loadu_si128((const __m128i *)(src_diff + src_stride));

  src32[0] = _mm256_cvtepi16_epi32(src16[0]);
  src32[1] = _mm256_cvtepi16_epi32(src16[1]);
  src32[2] = _mm256_cvtepi16_epi32(src16[2]);
  src32[3] = _mm256_cvtepi16_epi32(src16[3]);
  src32[4] = _mm256_cvtepi16_epi32(src16[4]);
  src32[5] = _mm256_cvtepi16_epi32(src16[5]);
  src32[6] = _mm256_cvtepi16_epi32(src16[6]);
  src32[7] = _mm256_cvtepi16_epi32(src16[7]);

  highbd_hadamard_col8_avx2(src32, 0);
  highbd_hadamard_col8_avx2(src32, 1);

  _mm256_storeu_si256((__m256i *)coeff, src32[0]);
  coeff += 8;
  _mm256_storeu_si256((__m256i *)coeff, src32[1]);
  coeff += 8;
  _mm256_storeu_si256((__m256i *)coeff, src32[2]);
  coeff += 8;
  _mm256_storeu_si256((__m256i *)coeff, src32[3]);
  coeff += 8;
  _mm256_storeu_si256((__m256i *)coeff, src32[4]);
  coeff += 8;
  _mm256_storeu_si256((__m256i *)coeff, src32[5]);
  coeff += 8;
  _mm256_storeu_si256((__m256i *)coeff, src32[6]);
  coeff += 8;
  _mm256_storeu_si256((__m256i *)coeff, src32[7]);
}

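// 16x16 high-bitdepth Hadamard transform built from four 8x8 transforms,
// followed by a combining butterfly stage with a >> 1 normalization.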
void vpx_highbd_hadamard_16x16_avx2(const int16_t *src_diff,
                                    ptrdiff_t src_stride, tran_low_t *coeff) {
  int idx;
  tran_low_t *t_coeff = coeff;
  for (idx = 0; idx < 4; ++idx) {
    const int16_t *src_ptr =
        src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8;
    vpx_highbd_hadamard_8x8_avx2(src_ptr, src_stride, t_coeff + idx * 64);
  }

  for (idx = 0; idx < 64; idx += 8) {
    __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff);
    __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 64));
    __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 128));
    __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 192));

    __m256i b0 = _mm256_add_epi32(coeff0, coeff1);
    __m256i b1 = _mm256_sub_epi32(coeff0, coeff1);
    __m256i b2 = _mm256_add_epi32(coeff2, coeff3);
    __m256i b3 = _mm256_sub_epi32(coeff2, coeff3);

    b0 = _mm256_srai_epi32(b0, 1);
    b1 = _mm256_srai_epi32(b1, 1);
    b2 = _mm256_srai_epi32(b2, 1);
    b3 = _mm256_srai_epi32(b3, 1);

    coeff0 = _mm256_add_epi32(b0, b2);
    coeff1 = _mm256_add_epi32(b1, b3);
    coeff2 = _mm256_sub_epi32(b0, b2);
    coeff3 = _mm256_sub_epi32(b1, b3);

    _mm256_storeu_si256((__m256i *)coeff, coeff0);
    _mm256_storeu_si256((__m256i *)(coeff + 64), coeff1);
    _mm256_storeu_si256((__m256i *)(coeff + 128), coeff2);
    _mm256_storeu_si256((__m256i *)(coeff + 192), coeff3);

    coeff += 8;
    t_coeff += 8;
  }
}

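// 32x32 high-bitdepth Hadamard transform built from four 16x16 transforms,
// with a >> 2 normalization in the final combining stage.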
void vpx_highbd_hadamard_32x32_avx2(const int16_t *src_diff,
                                    ptrdiff_t src_stride, tran_low_t *coeff) {
  int idx;
  tran_low_t *t_coeff = coeff;
  for (idx = 0; idx < 4; ++idx) {
    const int16_t *src_ptr =
        src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16;
    vpx_highbd_hadamard_16x16_avx2(src_ptr, src_stride, t_coeff + idx * 256);
  }

  for (idx = 0; idx < 256; idx += 8) {
    __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff);
    __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 256));
    __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 512));
    __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 768));

    __m256i b0 = _mm256_add_epi32(coeff0, coeff1);
    __m256i b1 = _mm256_sub_epi32(coeff0, coeff1);
    __m256i b2 = _mm256_add_epi32(coeff2, coeff3);
    __m256i b3 = _mm256_sub_epi32(coeff2, coeff3);

    b0 = _mm256_srai_epi32(b0, 2);
    b1 = _mm256_srai_epi32(b1, 2);
    b2 = _mm256_srai_epi32(b2, 2);
    b3 = _mm256_srai_epi32(b3, 2);

    coeff0 = _mm256_add_epi32(b0, b2);
    coeff1 = _mm256_add_epi32(b1, b3);
    coeff2 = _mm256_sub_epi32(b0, b2);
    coeff3 = _mm256_sub_epi32(b1, b3);

    _mm256_storeu_si256((__m256i *)coeff, coeff0);
    _mm256_storeu_si256((__m256i *)(coeff + 256), coeff1);
    _mm256_storeu_si256((__m256i *)(coeff + 512), coeff2);
    _mm256_storeu_si256((__m256i *)(coeff + 768), coeff3);

    coeff += 8;
    t_coeff += 8;
  }
}
#endif  // CONFIG_VP9_HIGHBITDEPTH

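// Sign-extend the 16-bit lanes of `in` to 32 bits, producing the unpacked
// low and high halves within each 128-bit lane.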
static INLINE void sign_extend_16bit_to_32bit_avx2(__m256i in, __m256i zero,
                                                   __m256i *out_lo,
                                                   __m256i *out_hi) {
  const __m256i sign_bits = _mm256_cmpgt_epi16(zero, in);
  *out_lo = _mm256_unpacklo_epi16(in, sign_bits);
  *out_hi = _mm256_unpackhi_epi16(in, sign_bits);
}

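// 16-bit counterpart of the column transform above: each __m256i carries one
// row from each of two 8x8 blocks (one per 128-bit lane), so a single pass
// transforms both blocks at once.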
static void hadamard_col8x2_avx2(__m256i *in, int iter) {
  __m256i a0 = in[0];
  __m256i a1 = in[1];
  __m256i a2 = in[2];
  __m256i a3 = in[3];
  __m256i a4 = in[4];
  __m256i a5 = in[5];
  __m256i a6 = in[6];
  __m256i a7 = in[7];

  __m256i b0 = _mm256_add_epi16(a0, a1);
  __m256i b1 = _mm256_sub_epi16(a0, a1);
  __m256i b2 = _mm256_add_epi16(a2, a3);
  __m256i b3 = _mm256_sub_epi16(a2, a3);
  __m256i b4 = _mm256_add_epi16(a4, a5);
  __m256i b5 = _mm256_sub_epi16(a4, a5);
  __m256i b6 = _mm256_add_epi16(a6, a7);
  __m256i b7 = _mm256_sub_epi16(a6, a7);

  a0 = _mm256_add_epi16(b0, b2);
  a1 = _mm256_add_epi16(b1, b3);
  a2 = _mm256_sub_epi16(b0, b2);
  a3 = _mm256_sub_epi16(b1, b3);
  a4 = _mm256_add_epi16(b4, b6);
  a5 = _mm256_add_epi16(b5, b7);
  a6 = _mm256_sub_epi16(b4, b6);
  a7 = _mm256_sub_epi16(b5, b7);

  if (iter == 0) {
    b0 = _mm256_add_epi16(a0, a4);
    b7 = _mm256_add_epi16(a1, a5);
    b3 = _mm256_add_epi16(a2, a6);
    b4 = _mm256_add_epi16(a3, a7);
    b2 = _mm256_sub_epi16(a0, a4);
    b6 = _mm256_sub_epi16(a1, a5);
    b1 = _mm256_sub_epi16(a2, a6);
    b5 = _mm256_sub_epi16(a3, a7);

    a0 = _mm256_unpacklo_epi16(b0, b1);
    a1 = _mm256_unpacklo_epi16(b2, b3);
    a2 = _mm256_unpackhi_epi16(b0, b1);
    a3 = _mm256_unpackhi_epi16(b2, b3);
    a4 = _mm256_unpacklo_epi16(b4, b5);
    a5 = _mm256_unpacklo_epi16(b6, b7);
    a6 = _mm256_unpackhi_epi16(b4, b5);
    a7 = _mm256_unpackhi_epi16(b6, b7);

    b0 = _mm256_unpacklo_epi32(a0, a1);
    b1 = _mm256_unpacklo_epi32(a4, a5);
    b2 = _mm256_unpackhi_epi32(a0, a1);
    b3 = _mm256_unpackhi_epi32(a4, a5);
    b4 = _mm256_unpacklo_epi32(a2, a3);
    b5 = _mm256_unpacklo_epi32(a6, a7);
    b6 = _mm256_unpackhi_epi32(a2, a3);
    b7 = _mm256_unpackhi_epi32(a6, a7);

    in[0] = _mm256_unpacklo_epi64(b0, b1);
    in[1] = _mm256_unpackhi_epi64(b0, b1);
    in[2] = _mm256_unpacklo_epi64(b2, b3);
    in[3] = _mm256_unpackhi_epi64(b2, b3);
    in[4] = _mm256_unpacklo_epi64(b4, b5);
    in[5] = _mm256_unpackhi_epi64(b4, b5);
    in[6] = _mm256_unpacklo_epi64(b6, b7);
    in[7] = _mm256_unpackhi_epi64(b6, b7);
  } else {
    in[0] = _mm256_add_epi16(a0, a4);
    in[7] = _mm256_add_epi16(a1, a5);
    in[3] = _mm256_add_epi16(a2, a6);
    in[4] = _mm256_add_epi16(a3, a7);
    in[2] = _mm256_sub_epi16(a0, a4);
    in[6] = _mm256_sub_epi16(a1, a5);
    in[1] = _mm256_sub_epi16(a2, a6);
    in[5] = _mm256_sub_epi16(a3, a7);
  }
}

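// Two side-by-side 8x8 Hadamard transforms on 16-bit residuals.  The final
// permutes separate the two blocks: block 0 lands in coeff[0..63] and
// block 1 in coeff[64..127].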
static void hadamard_8x8x2_avx2(const int16_t *src_diff, ptrdiff_t src_stride,
                                int16_t *coeff) {
  __m256i src[8];
  src[0] = _mm256_loadu_si256((const __m256i *)src_diff);
  src[1] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
  src[2] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
  src[3] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
  src[4] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
  src[5] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
  src[6] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
  src[7] = _mm256_loadu_si256((const __m256i *)(src_diff + src_stride));

  hadamard_col8x2_avx2(src, 0);
  hadamard_col8x2_avx2(src, 1);

  _mm256_storeu_si256((__m256i *)coeff,
                      _mm256_permute2x128_si256(src[0], src[1], 0x20));
  coeff += 16;
  _mm256_storeu_si256((__m256i *)coeff,
                      _mm256_permute2x128_si256(src[2], src[3], 0x20));
  coeff += 16;
  _mm256_storeu_si256((__m256i *)coeff,
                      _mm256_permute2x128_si256(src[4], src[5], 0x20));
  coeff += 16;
  _mm256_storeu_si256((__m256i *)coeff,
                      _mm256_permute2x128_si256(src[6], src[7], 0x20));
  coeff += 16;
  _mm256_storeu_si256((__m256i *)coeff,
                      _mm256_permute2x128_si256(src[0], src[1], 0x31));
  coeff += 16;
  _mm256_storeu_si256((__m256i *)coeff,
                      _mm256_permute2x128_si256(src[2], src[3], 0x31));
  coeff += 16;
  _mm256_storeu_si256((__m256i *)coeff,
                      _mm256_permute2x128_si256(src[4], src[5], 0x31));
  coeff += 16;
  _mm256_storeu_si256((__m256i *)coeff,
                      _mm256_permute2x128_si256(src[6], src[7], 0x31));
}

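// Shared 16x16 Hadamard core.  When is_final is zero the output stays in
// 16-bit form (reused by the 32x32 transform below); when non-zero it is
// written to coeff through store_tran_low().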
static INLINE void hadamard_16x16_avx2(const int16_t *src_diff,
                                       ptrdiff_t src_stride, tran_low_t *coeff,
                                       int is_final) {
#if CONFIG_VP9_HIGHBITDEPTH
  DECLARE_ALIGNED(32, int16_t, temp_coeff[16 * 16]);
  int16_t *t_coeff = temp_coeff;
#else
  int16_t *t_coeff = coeff;
#endif
  int16_t *coeff16 = (int16_t *)coeff;
  int idx;
  for (idx = 0; idx < 2; ++idx) {
    const int16_t *src_ptr = src_diff + idx * 8 * src_stride;
    hadamard_8x8x2_avx2(src_ptr, src_stride, t_coeff + (idx * 64 * 2));
  }

  for (idx = 0; idx < 64; idx += 16) {
    const __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff);
    const __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 64));
    const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 128));
    const __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 192));

    __m256i b0 = _mm256_add_epi16(coeff0, coeff1);
    __m256i b1 = _mm256_sub_epi16(coeff0, coeff1);
    __m256i b2 = _mm256_add_epi16(coeff2, coeff3);
    __m256i b3 = _mm256_sub_epi16(coeff2, coeff3);

    b0 = _mm256_srai_epi16(b0, 1);
    b1 = _mm256_srai_epi16(b1, 1);
    b2 = _mm256_srai_epi16(b2, 1);
    b3 = _mm256_srai_epi16(b3, 1);
    if (is_final) {
      store_tran_low(_mm256_add_epi16(b0, b2), coeff);
      store_tran_low(_mm256_add_epi16(b1, b3), coeff + 64);
      store_tran_low(_mm256_sub_epi16(b0, b2), coeff + 128);
      store_tran_low(_mm256_sub_epi16(b1, b3), coeff + 192);
      coeff += 16;
    } else {
      _mm256_storeu_si256((__m256i *)coeff16, _mm256_add_epi16(b0, b2));
      _mm256_storeu_si256((__m256i *)(coeff16 + 64), _mm256_add_epi16(b1, b3));
      _mm256_storeu_si256((__m256i *)(coeff16 + 128), _mm256_sub_epi16(b0, b2));
      _mm256_storeu_si256((__m256i *)(coeff16 + 192), _mm256_sub_epi16(b1, b3));
      coeff16 += 16;
    }
    t_coeff += 16;
  }
}

void vpx_hadamard_16x16_avx2(const int16_t *src_diff, ptrdiff_t src_stride,
                             tran_low_t *coeff) {
  hadamard_16x16_avx2(src_diff, src_stride, coeff, 1);
}

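// 32x32 Hadamard transform assembled from four 16x16 transforms kept in
// 16-bit precision; the partial results are widened to 32 bits for the
// combining butterflies and the >> 2 normalization, then packed back to
// 16 bits for the final stage.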
void vpx_hadamard_32x32_avx2(const int16_t *src_diff, ptrdiff_t src_stride,
                             tran_low_t *coeff) {
#if CONFIG_VP9_HIGHBITDEPTH
  // For high bitdepths, it is unnecessary to store_tran_low
  // (mult/unpack/store), then load_tran_low (load/pack) the same memory in the
  // next stage.  Output to an intermediate buffer first, then store_tran_low()
  // in the final stage.
  DECLARE_ALIGNED(32, int16_t, temp_coeff[32 * 32]);
  int16_t *t_coeff = temp_coeff;
#else
  int16_t *t_coeff = coeff;
#endif
  int idx;
  __m256i coeff0_lo, coeff1_lo, coeff2_lo, coeff3_lo, b0_lo, b1_lo, b2_lo,
      b3_lo;
  __m256i coeff0_hi, coeff1_hi, coeff2_hi, coeff3_hi, b0_hi, b1_hi, b2_hi,
      b3_hi;
  __m256i b0, b1, b2, b3;
  const __m256i zero = _mm256_setzero_si256();
  for (idx = 0; idx < 4; ++idx) {
    // src_diff: 9 bit, dynamic range [-255, 255]
    const int16_t *src_ptr =
        src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16;
    hadamard_16x16_avx2(src_ptr, src_stride,
                        (tran_low_t *)(t_coeff + idx * 256), 0);
  }

  for (idx = 0; idx < 256; idx += 16) {
    const __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff);
    const __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 256));
    const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 512));
    const __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 768));

    // Sign extend 16 bit to 32 bit.
    sign_extend_16bit_to_32bit_avx2(coeff0, zero, &coeff0_lo, &coeff0_hi);
    sign_extend_16bit_to_32bit_avx2(coeff1, zero, &coeff1_lo, &coeff1_hi);
    sign_extend_16bit_to_32bit_avx2(coeff2, zero, &coeff2_lo, &coeff2_hi);
    sign_extend_16bit_to_32bit_avx2(coeff3, zero, &coeff3_lo, &coeff3_hi);

    b0_lo = _mm256_add_epi32(coeff0_lo, coeff1_lo);
    b0_hi = _mm256_add_epi32(coeff0_hi, coeff1_hi);

    b1_lo = _mm256_sub_epi32(coeff0_lo, coeff1_lo);
    b1_hi = _mm256_sub_epi32(coeff0_hi, coeff1_hi);

    b2_lo = _mm256_add_epi32(coeff2_lo, coeff3_lo);
    b2_hi = _mm256_add_epi32(coeff2_hi, coeff3_hi);

    b3_lo = _mm256_sub_epi32(coeff2_lo, coeff3_lo);
    b3_hi = _mm256_sub_epi32(coeff2_hi, coeff3_hi);

    b0_lo = _mm256_srai_epi32(b0_lo, 2);
    b1_lo = _mm256_srai_epi32(b1_lo, 2);
    b2_lo = _mm256_srai_epi32(b2_lo, 2);
    b3_lo = _mm256_srai_epi32(b3_lo, 2);

    b0_hi = _mm256_srai_epi32(b0_hi, 2);
    b1_hi = _mm256_srai_epi32(b1_hi, 2);
    b2_hi = _mm256_srai_epi32(b2_hi, 2);
    b3_hi = _mm256_srai_epi32(b3_hi, 2);

    b0 = _mm256_packs_epi32(b0_lo, b0_hi);
    b1 = _mm256_packs_epi32(b1_lo, b1_hi);
    b2 = _mm256_packs_epi32(b2_lo, b2_hi);
    b3 = _mm256_packs_epi32(b3_lo, b3_hi);

    store_tran_low(_mm256_add_epi16(b0, b2), coeff);
    store_tran_low(_mm256_add_epi16(b1, b3), coeff + 256);
    store_tran_low(_mm256_sub_epi16(b0, b2), coeff + 512);
    store_tran_low(_mm256_sub_epi16(b1, b3), coeff + 768);

    coeff += 16;
    t_coeff += 16;
  }
}

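// Sum of absolute transformed differences (SATD) over `length` coefficients.
// Processes 16 coefficients per iteration, so length is expected to be a
// multiple of 16.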
int vpx_satd_avx2(const tran_low_t *coeff, int length) {
  const __m256i one = _mm256_set1_epi16(1);
  __m256i accum = _mm256_setzero_si256();
  int i;

  for (i = 0; i < length; i += 16) {
    const __m256i src_line = load_tran_low(coeff);
    const __m256i abs = _mm256_abs_epi16(src_line);
    const __m256i sum = _mm256_madd_epi16(abs, one);
    accum = _mm256_add_epi32(accum, sum);
    coeff += 16;
  }

  {  // 32 bit horizontal add
    const __m256i a = _mm256_srli_si256(accum, 8);
    const __m256i b = _mm256_add_epi32(accum, a);
    const __m256i c = _mm256_srli_epi64(b, 32);
    const __m256i d = _mm256_add_epi32(b, c);
    const __m128i accum_128 = _mm_add_epi32(_mm256_castsi256_si128(d),
                                            _mm256_extractf128_si256(d, 1));
    return _mm_cvtsi128_si32(accum_128);
  }
}

#if CONFIG_VP9_HIGHBITDEPTH
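// High-bitdepth SATD: coefficients are already 32-bit, so absolute values are
// accumulated directly.  Processes 8 coefficients per iteration, so length is
// expected to be a multiple of 8.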
int vpx_highbd_satd_avx2(const tran_low_t *coeff, int length) {
  __m256i accum = _mm256_setzero_si256();
  int i;

  for (i = 0; i < length; i += 8, coeff += 8) {
    const __m256i src_line = _mm256_loadu_si256((const __m256i *)coeff);
    const __m256i abs = _mm256_abs_epi32(src_line);
    accum = _mm256_add_epi32(accum, abs);
  }

  {  // 32 bit horizontal add
    const __m256i a = _mm256_srli_si256(accum, 8);
    const __m256i b = _mm256_add_epi32(accum, a);
    const __m256i c = _mm256_srli_epi64(b, 32);
    const __m256i d = _mm256_add_epi32(b, c);
    const __m128i accum_128 = _mm_add_epi32(_mm256_castsi256_si128(d),
                                            _mm256_extractf128_si256(d, 1));
    return _mm_cvtsi128_si32(accum_128);
  }
}
#endif  // CONFIG_VP9_HIGHBITDEPTH