/src/libvpx/vpx_dsp/x86/sse_avx2.c
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) 2023 The WebM project authors. All Rights Reserved. |
3 | | * |
4 | | * Use of this source code is governed by a BSD-style license |
5 | | * that can be found in the LICENSE file in the root of the source |
6 | | * tree. An additional intellectual property rights grant can be found |
7 | | * in the file PATENTS. All contributing project authors may |
8 | | * be found in the AUTHORS file in the root of the source tree. |
9 | | */ |
10 | | |
11 | | #include <immintrin.h> |
12 | | #include <smmintrin.h> |
13 | | #include <stdint.h> |
14 | | |
15 | | #include "./vpx_config.h" |
16 | | #include "./vpx_dsp_rtcd.h" |
17 | | |
18 | | #include "vpx_ports/mem.h" |
19 | | #include "vpx_dsp/x86/mem_sse2.h" |
20 | | |
21 | | static INLINE void sse_w32_avx2(__m256i *sum, const uint8_t *a, |
22 | 0 | const uint8_t *b) { |
23 | 0 | const __m256i v_a0 = _mm256_loadu_si256((const __m256i *)a); |
24 | 0 | const __m256i v_b0 = _mm256_loadu_si256((const __m256i *)b); |
25 | 0 | const __m256i zero = _mm256_setzero_si256(); |
26 | 0 | const __m256i v_a00_w = _mm256_unpacklo_epi8(v_a0, zero); |
27 | 0 | const __m256i v_a01_w = _mm256_unpackhi_epi8(v_a0, zero); |
28 | 0 | const __m256i v_b00_w = _mm256_unpacklo_epi8(v_b0, zero); |
29 | 0 | const __m256i v_b01_w = _mm256_unpackhi_epi8(v_b0, zero); |
30 | 0 | const __m256i v_d00_w = _mm256_sub_epi16(v_a00_w, v_b00_w); |
31 | 0 | const __m256i v_d01_w = _mm256_sub_epi16(v_a01_w, v_b01_w); |
32 | 0 | *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d00_w, v_d00_w)); |
33 | 0 | *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d01_w, v_d01_w)); |
34 | 0 | } |
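/* Reviewer sketch, not part of the original source: a scalar equivalent of
 * one sse_w32_avx2() call.  The AVX2 helper widens each byte to 16 bits by
 * interleaving with zero, takes the difference, and lets _mm256_madd_epi16
 * fold two squared differences into each 32-bit lane of *sum. */
static int64_t sse_w32_scalar_sketch(const uint8_t *a, const uint8_t *b) {
  int64_t sum = 0;
  int i;
  for (i = 0; i < 32; ++i) {
    const int d = a[i] - b[i]; /* |d| <= 255, so d * d fits easily in int */
    sum += d * d;
  }
  return sum;
}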
35 | | |
36 | 4.77M | static INLINE int64_t summary_all_avx2(const __m256i *sum_all) { |
37 | 4.77M | int64_t sum; |
38 | 4.77M | __m256i zero = _mm256_setzero_si256(); |
39 | 4.77M | const __m256i sum0_4x64 = _mm256_unpacklo_epi32(*sum_all, zero); |
40 | 4.77M | const __m256i sum1_4x64 = _mm256_unpackhi_epi32(*sum_all, zero); |
41 | 4.77M | const __m256i sum_4x64 = _mm256_add_epi64(sum0_4x64, sum1_4x64); |
42 | 4.77M | const __m128i sum_2x64 = _mm_add_epi64(_mm256_castsi256_si128(sum_4x64), |
43 | 4.77M | _mm256_extracti128_si256(sum_4x64, 1)); |
44 | 4.77M | const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8)); |
45 | 4.77M | _mm_storel_epi64((__m128i *)&sum, sum_1x64); |
46 | 4.77M | return sum; |
47 | 4.77M | } |
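/* Reviewer sketch, not part of the original source: summary_all_avx2() above
 * zero-extends the eight 32-bit lanes to 64 bits (the lanes hold sums of
 * squares and are therefore non-negative) and folds 256 -> 128 -> 64 bits.
 * A scalar equivalent, with the lanes spilled to memory first: */
static int64_t summary_all_scalar_sketch(const __m256i *sum_all) {
  uint32_t lanes[8];
  int64_t sum = 0;
  int i;
  _mm256_storeu_si256((__m256i *)lanes, *sum_all);
  for (i = 0; i < 8; ++i) sum += lanes[i];
  return sum;
}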
48 | | |
49 | | #if CONFIG_VP9_HIGHBITDEPTH |
50 | 0 | static INLINE void summary_32_avx2(const __m256i *sum32, __m256i *sum) { |
51 | 0 | const __m256i sum0_4x64 = |
52 | 0 | _mm256_cvtepu32_epi64(_mm256_castsi256_si128(*sum32)); |
53 | 0 | const __m256i sum1_4x64 = |
54 | 0 | _mm256_cvtepu32_epi64(_mm256_extracti128_si256(*sum32, 1)); |
55 | 0 | const __m256i sum_4x64 = _mm256_add_epi64(sum0_4x64, sum1_4x64); |
56 | 0 | *sum = _mm256_add_epi64(*sum, sum_4x64); |
57 | 0 | } |
58 | | |
59 | 0 | static INLINE int64_t summary_4x64_avx2(const __m256i sum_4x64) { |
60 | 0 | int64_t sum; |
61 | 0 | const __m128i sum_2x64 = _mm_add_epi64(_mm256_castsi256_si128(sum_4x64), |
62 | 0 | _mm256_extracti128_si256(sum_4x64, 1)); |
63 | 0 | const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8)); |
64 | |
65 | 0 | _mm_storel_epi64((__m128i *)&sum, sum_1x64); |
66 | 0 | return sum; |
67 | 0 | } |
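/* Reviewer note, not part of the original source: the two helpers above exist
 * because high-bit-depth input leaves little headroom in 32-bit lanes.  With
 * 12-bit samples |a - b| <= 4095, so one _mm256_madd_epi16 pair can reach
 * 2 * 4095 * 4095 = 33,538,050.  The high-bit-depth paths below therefore
 * keep a short-lived 32-bit accumulator and flush it into 64-bit lanes with
 * summary_32_avx2(), finishing with summary_4x64_avx2().  The flush intervals
 * (every 64 rows at width 32, every 32 rows at width 64, i.e. at most 128
 * worst-case pairs per lane, about 4.29e9) appear chosen to stay within the
 * unsigned 32-bit range that summary_32_avx2() zero-extends. */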
68 | | #endif |
69 | | |
70 | | static INLINE void sse_w4x4_avx2(const uint8_t *a, int a_stride, |
71 | 0 | const uint8_t *b, int b_stride, __m256i *sum) { |
72 | 0 | const __m128i v_a0 = load_unaligned_u32(a); |
73 | 0 | const __m128i v_a1 = load_unaligned_u32(a + a_stride); |
74 | 0 | const __m128i v_a2 = load_unaligned_u32(a + a_stride * 2); |
75 | 0 | const __m128i v_a3 = load_unaligned_u32(a + a_stride * 3); |
76 | 0 | const __m128i v_b0 = load_unaligned_u32(b); |
77 | 0 | const __m128i v_b1 = load_unaligned_u32(b + b_stride); |
78 | 0 | const __m128i v_b2 = load_unaligned_u32(b + b_stride * 2); |
79 | 0 | const __m128i v_b3 = load_unaligned_u32(b + b_stride * 3); |
80 | 0 | const __m128i v_a0123 = _mm_unpacklo_epi64(_mm_unpacklo_epi32(v_a0, v_a1), |
81 | 0 | _mm_unpacklo_epi32(v_a2, v_a3)); |
82 | 0 | const __m128i v_b0123 = _mm_unpacklo_epi64(_mm_unpacklo_epi32(v_b0, v_b1), |
83 | 0 | _mm_unpacklo_epi32(v_b2, v_b3)); |
84 | 0 | const __m256i v_a_w = _mm256_cvtepu8_epi16(v_a0123); |
85 | 0 | const __m256i v_b_w = _mm256_cvtepu8_epi16(v_b0123); |
86 | 0 | const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); |
87 | 0 | *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w)); |
88 | 0 | } |
89 | | |
90 | | static INLINE void sse_w8x2_avx2(const uint8_t *a, int a_stride, |
91 | 0 | const uint8_t *b, int b_stride, __m256i *sum) { |
92 | 0 | const __m128i v_a0 = _mm_loadl_epi64((const __m128i *)a); |
93 | 0 | const __m128i v_a1 = _mm_loadl_epi64((const __m128i *)(a + a_stride)); |
94 | 0 | const __m128i v_b0 = _mm_loadl_epi64((const __m128i *)b); |
95 | 0 | const __m128i v_b1 = _mm_loadl_epi64((const __m128i *)(b + b_stride)); |
96 | 0 | const __m256i v_a_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(v_a0, v_a1)); |
97 | 0 | const __m256i v_b_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(v_b0, v_b1)); |
98 | 0 | const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); |
99 | 0 | *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w)); |
100 | 0 | } |
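/* Reviewer note, not part of the original source: sse_w4x4_avx2() and
 * sse_w8x2_avx2() both gather exactly 16 pixels per call (4x4 or 8x2),
 * widen them with _mm256_cvtepu8_epi16 and feed a single _mm256_madd_epi16,
 * so the per-call arithmetic is identical; only the gathering pattern
 * differs with block shape. */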
101 | | |
102 | | int64_t vpx_sse_avx2(const uint8_t *a, int a_stride, const uint8_t *b, |
103 | 4.77M | int b_stride, int width, int height) { |
104 | 4.77M | int32_t y = 0; |
105 | 4.77M | int64_t sse = 0; |
106 | 4.77M | __m256i sum = _mm256_setzero_si256(); |
107 | 4.77M | __m256i zero = _mm256_setzero_si256(); |
108 | 4.77M | switch (width) { |
109 | 0 | case 4: |
110 | 0 | do { |
111 | 0 | sse_w4x4_avx2(a, a_stride, b, b_stride, &sum); |
112 | 0 | a += a_stride << 2; |
113 | 0 | b += b_stride << 2; |
114 | 0 | y += 4; |
115 | 0 | } while (y < height); |
116 | 0 | sse = summary_all_avx2(&sum); |
117 | 0 | break; |
118 | 0 | case 8: |
119 | 0 | do { |
120 | 0 | sse_w8x2_avx2(a, a_stride, b, b_stride, &sum); |
121 | 0 | a += a_stride << 1; |
122 | 0 | b += b_stride << 1; |
123 | 0 | y += 2; |
124 | 0 | } while (y < height); |
125 | 0 | sse = summary_all_avx2(&sum); |
126 | 0 | break; |
127 | 4.77M | case 16: |
128 | 38.1M | do { |
129 | 38.1M | const __m128i v_a0 = _mm_loadu_si128((const __m128i *)a); |
130 | 38.1M | const __m128i v_a1 = _mm_loadu_si128((const __m128i *)(a + a_stride)); |
131 | 38.1M | const __m128i v_b0 = _mm_loadu_si128((const __m128i *)b); |
132 | 38.1M | const __m128i v_b1 = _mm_loadu_si128((const __m128i *)(b + b_stride)); |
133 | 38.1M | const __m256i v_a = |
134 | 38.1M | _mm256_insertf128_si256(_mm256_castsi128_si256(v_a0), v_a1, 0x01); |
135 | 38.1M | const __m256i v_b = |
136 | 38.1M | _mm256_insertf128_si256(_mm256_castsi128_si256(v_b0), v_b1, 0x01); |
137 | 38.1M | const __m256i v_al = _mm256_unpacklo_epi8(v_a, zero); |
138 | 38.1M | const __m256i v_au = _mm256_unpackhi_epi8(v_a, zero); |
139 | 38.1M | const __m256i v_bl = _mm256_unpacklo_epi8(v_b, zero); |
140 | 38.1M | const __m256i v_bu = _mm256_unpackhi_epi8(v_b, zero); |
141 | 38.1M | const __m256i v_asub = _mm256_sub_epi16(v_al, v_bl); |
142 | 38.1M | const __m256i v_bsub = _mm256_sub_epi16(v_au, v_bu); |
143 | 38.1M | const __m256i temp = |
144 | 38.1M | _mm256_add_epi32(_mm256_madd_epi16(v_asub, v_asub), |
145 | 38.1M | _mm256_madd_epi16(v_bsub, v_bsub)); |
146 | 38.1M | sum = _mm256_add_epi32(sum, temp); |
147 | 38.1M | a += a_stride << 1; |
148 | 38.1M | b += b_stride << 1; |
149 | 38.1M | y += 2; |
150 | 38.1M | } while (y < height); |
151 | 4.77M | sse = summary_all_avx2(&sum); |
152 | 4.77M | break; |
153 | 0 | case 32: |
154 | 0 | do { |
155 | 0 | sse_w32_avx2(&sum, a, b); |
156 | 0 | a += a_stride; |
157 | 0 | b += b_stride; |
158 | 0 | y += 1; |
159 | 0 | } while (y < height); |
160 | 0 | sse = summary_all_avx2(&sum); |
161 | 0 | break; |
162 | 0 | case 64: |
163 | 0 | do { |
164 | 0 | sse_w32_avx2(&sum, a, b); |
165 | 0 | sse_w32_avx2(&sum, a + 32, b + 32); |
166 | 0 | a += a_stride; |
167 | 0 | b += b_stride; |
168 | 0 | y += 1; |
169 | 0 | } while (y < height); |
170 | 0 | sse = summary_all_avx2(&sum); |
171 | 0 | break; |
172 | 0 | default: |
173 | 0 | if ((width & 0x07) == 0) { |
174 | 0 | do { |
175 | 0 | int i = 0; |
176 | 0 | do { |
177 | 0 | sse_w8x2_avx2(a + i, a_stride, b + i, b_stride, &sum); |
178 | 0 | i += 8; |
179 | 0 | } while (i < width); |
180 | 0 | a += a_stride << 1; |
181 | 0 | b += b_stride << 1; |
182 | 0 | y += 2; |
183 | 0 | } while (y < height); |
184 | 0 | } else { |
185 | 0 | do { |
186 | 0 | int i = 0; |
187 | 0 | do { |
188 | 0 | const uint8_t *a2; |
189 | 0 | const uint8_t *b2; |
190 | 0 | sse_w8x2_avx2(a + i, a_stride, b + i, b_stride, &sum); |
191 | 0 | a2 = a + i + (a_stride << 1); |
192 | 0 | b2 = b + i + (b_stride << 1); |
193 | 0 | sse_w8x2_avx2(a2, a_stride, b2, b_stride, &sum); |
194 | 0 | i += 8; |
195 | 0 | } while (i + 4 < width); |
196 | 0 | sse_w4x4_avx2(a + i, a_stride, b + i, b_stride, &sum); |
197 | 0 | a += a_stride << 2; |
198 | 0 | b += b_stride << 2; |
199 | 0 | y += 4; |
200 | 0 | } while (y < height); |
201 | 0 | } |
202 | 0 | sse = summary_all_avx2(&sum); |
203 | 0 | break; |
204 | 4.77M | } |
205 | | |
206 | 4.77M | return sse; |
207 | 4.77M | } |
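/* Reviewer sketch, not part of the original source: a plain-C reference for
 * vpx_sse_avx2(), useful e.g. as an oracle when unit-testing the AVX2 path.
 * The function name is hypothetical. */
static int64_t vpx_sse_scalar_sketch(const uint8_t *a, int a_stride,
                                     const uint8_t *b, int b_stride,
                                     int width, int height) {
  int64_t sse = 0;
  int x, y;
  for (y = 0; y < height; ++y) {
    for (x = 0; x < width; ++x) {
      const int d = a[x] - b[x];
      sse += d * d;
    }
    a += a_stride;
    b += b_stride;
  }
  return sse;
}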
208 | | |
209 | | #if CONFIG_VP9_HIGHBITDEPTH |
210 | | static INLINE void highbd_sse_w16_avx2(__m256i *sum, const uint16_t *a, |
211 | 0 | const uint16_t *b) { |
212 | 0 | const __m256i v_a_w = _mm256_loadu_si256((const __m256i *)a); |
213 | 0 | const __m256i v_b_w = _mm256_loadu_si256((const __m256i *)b); |
214 | 0 | const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); |
215 | 0 | *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w)); |
216 | 0 | } |
217 | | |
218 | | static INLINE void highbd_sse_w4x4_avx2(__m256i *sum, const uint16_t *a, |
219 | | int a_stride, const uint16_t *b, |
220 | 0 | int b_stride) { |
221 | 0 | const __m128i v_a0 = _mm_loadl_epi64((const __m128i *)a); |
222 | 0 | const __m128i v_a1 = _mm_loadl_epi64((const __m128i *)(a + a_stride)); |
223 | 0 | const __m128i v_a2 = _mm_loadl_epi64((const __m128i *)(a + a_stride * 2)); |
224 | 0 | const __m128i v_a3 = _mm_loadl_epi64((const __m128i *)(a + a_stride * 3)); |
225 | 0 | const __m128i v_b0 = _mm_loadl_epi64((const __m128i *)b); |
226 | 0 | const __m128i v_b1 = _mm_loadl_epi64((const __m128i *)(b + b_stride)); |
227 | 0 | const __m128i v_b2 = _mm_loadl_epi64((const __m128i *)(b + b_stride * 2)); |
228 | 0 | const __m128i v_b3 = _mm_loadl_epi64((const __m128i *)(b + b_stride * 3)); |
229 | 0 | const __m128i v_a_hi = _mm_unpacklo_epi64(v_a0, v_a1); |
230 | 0 | const __m128i v_a_lo = _mm_unpacklo_epi64(v_a2, v_a3); |
231 | 0 | const __m256i v_a_w = |
232 | 0 | _mm256_insertf128_si256(_mm256_castsi128_si256(v_a_lo), v_a_hi, 1); |
233 | 0 | const __m128i v_b_hi = _mm_unpacklo_epi64(v_b0, v_b1); |
234 | 0 | const __m128i v_b_lo = _mm_unpacklo_epi64(v_b2, v_b3); |
235 | 0 | const __m256i v_b_w = |
236 | 0 | _mm256_insertf128_si256(_mm256_castsi128_si256(v_b_lo), v_b_hi, 1); |
237 | 0 | const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); |
238 | 0 | *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w)); |
239 | 0 | } |
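/* Reviewer note, not part of the original source: in highbd_sse_w4x4_avx2()
 * the names are slightly counter-intuitive -- v_a_hi/v_b_hi hold rows 0-1 and
 * are inserted into the upper 128-bit lane, while rows 2-3 land in the lower
 * lane.  The reversed row order is harmless because every lane is squared and
 * summed, so lane placement does not affect the result. */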
240 | | |
241 | | static INLINE void highbd_sse_w8x2_avx2(__m256i *sum, const uint16_t *a, |
242 | | int a_stride, const uint16_t *b, |
243 | 0 | int b_stride) { |
244 | 0 | const __m128i v_a_hi = _mm_loadu_si128((const __m128i *)(a + a_stride)); |
245 | 0 | const __m128i v_a_lo = _mm_loadu_si128((const __m128i *)a); |
246 | 0 | const __m256i v_a_w = |
247 | 0 | _mm256_insertf128_si256(_mm256_castsi128_si256(v_a_lo), v_a_hi, 1); |
248 | 0 | const __m128i v_b_hi = _mm_loadu_si128((const __m128i *)(b + b_stride)); |
249 | 0 | const __m128i v_b_lo = _mm_loadu_si128((const __m128i *)b); |
250 | 0 | const __m256i v_b_w = |
251 | 0 | _mm256_insertf128_si256(_mm256_castsi128_si256(v_b_lo), v_b_hi, 1); |
252 | 0 | const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); |
253 | 0 | *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w)); |
254 | 0 | } |
255 | | |
256 | | int64_t vpx_highbd_sse_avx2(const uint8_t *a8, int a_stride, const uint8_t *b8, |
257 | 0 | int b_stride, int width, int height) { |
258 | 0 | int32_t y = 0; |
259 | 0 | int64_t sse = 0; |
260 | 0 | uint16_t *a = CONVERT_TO_SHORTPTR(a8); |
261 | 0 | uint16_t *b = CONVERT_TO_SHORTPTR(b8); |
262 | 0 | __m256i sum = _mm256_setzero_si256(); |
263 | 0 | switch (width) { |
264 | 0 | case 4: |
265 | 0 | do { |
266 | 0 | highbd_sse_w4x4_avx2(&sum, a, a_stride, b, b_stride); |
267 | 0 | a += a_stride << 2; |
268 | 0 | b += b_stride << 2; |
269 | 0 | y += 4; |
270 | 0 | } while (y < height); |
271 | 0 | sse = summary_all_avx2(&sum); |
272 | 0 | break; |
273 | 0 | case 8: |
274 | 0 | do { |
275 | 0 | highbd_sse_w8x2_avx2(&sum, a, a_stride, b, b_stride); |
276 | 0 | a += a_stride << 1; |
277 | 0 | b += b_stride << 1; |
278 | 0 | y += 2; |
279 | 0 | } while (y < height); |
280 | 0 | sse = summary_all_avx2(&sum); |
281 | 0 | break; |
282 | 0 | case 16: |
283 | 0 | do { |
284 | 0 | highbd_sse_w16_avx2(&sum, a, b); |
285 | 0 | a += a_stride; |
286 | 0 | b += b_stride; |
287 | 0 | y += 1; |
288 | 0 | } while (y < height); |
289 | 0 | sse = summary_all_avx2(&sum); |
290 | 0 | break; |
291 | 0 | case 32: |
292 | 0 | do { |
293 | 0 | int l = 0; |
294 | 0 | __m256i sum32 = _mm256_setzero_si256(); |
295 | 0 | do { |
296 | 0 | highbd_sse_w16_avx2(&sum32, a, b); |
297 | 0 | highbd_sse_w16_avx2(&sum32, a + 16, b + 16); |
298 | 0 | a += a_stride; |
299 | 0 | b += b_stride; |
300 | 0 | l += 1; |
301 | 0 | } while (l < 64 && l < (height - y)); |
302 | 0 | summary_32_avx2(&sum32, &sum); |
303 | 0 | y += 64; |
304 | 0 | } while (y < height); |
305 | 0 | sse = summary_4x64_avx2(sum); |
306 | 0 | break; |
307 | 0 | case 64: |
308 | 0 | do { |
309 | 0 | int l = 0; |
310 | 0 | __m256i sum32 = _mm256_setzero_si256(); |
311 | 0 | do { |
312 | 0 | highbd_sse_w16_avx2(&sum32, a, b); |
313 | 0 | highbd_sse_w16_avx2(&sum32, a + 16 * 1, b + 16 * 1); |
314 | 0 | highbd_sse_w16_avx2(&sum32, a + 16 * 2, b + 16 * 2); |
315 | 0 | highbd_sse_w16_avx2(&sum32, a + 16 * 3, b + 16 * 3); |
316 | 0 | a += a_stride; |
317 | 0 | b += b_stride; |
318 | 0 | l += 1; |
319 | 0 | } while (l < 32 && l < (height - y)); |
320 | 0 | summary_32_avx2(&sum32, &sum); |
321 | 0 | y += 32; |
322 | 0 | } while (y < height); |
323 | 0 | sse = summary_4x64_avx2(sum); |
324 | 0 | break; |
325 | 0 | default: |
326 | 0 | if (width & 0x7) { |
327 | 0 | do { |
328 | 0 | int i = 0; |
329 | 0 | __m256i sum32 = _mm256_setzero_si256(); |
330 | 0 | do { |
331 | 0 | const uint16_t *a2; |
332 | 0 | const uint16_t *b2; |
333 | 0 | highbd_sse_w8x2_avx2(&sum32, a + i, a_stride, b + i, b_stride); |
334 | 0 | a2 = a + i + (a_stride << 1); |
335 | 0 | b2 = b + i + (b_stride << 1); |
336 | 0 | highbd_sse_w8x2_avx2(&sum32, a2, a_stride, b2, b_stride); |
337 | 0 | i += 8; |
338 | 0 | } while (i + 4 < width); |
339 | 0 | highbd_sse_w4x4_avx2(&sum32, a + i, a_stride, b + i, b_stride); |
340 | 0 | summary_32_avx2(&sum32, &sum); |
341 | 0 | a += a_stride << 2; |
342 | 0 | b += b_stride << 2; |
343 | 0 | y += 4; |
344 | 0 | } while (y < height); |
345 | 0 | } else { |
346 | 0 | do { |
347 | 0 | int l = 0; |
348 | 0 | __m256i sum32 = _mm256_setzero_si256(); |
349 | 0 | do { |
350 | 0 | int i = 0; |
351 | 0 | do { |
352 | 0 | highbd_sse_w8x2_avx2(&sum32, a + i, a_stride, b + i, b_stride); |
353 | 0 | i += 8; |
354 | 0 | } while (i < width); |
355 | 0 | a += a_stride << 1; |
356 | 0 | b += b_stride << 1; |
357 | 0 | l += 2; |
358 | 0 | } while (l < 8 && l < (height - y)); |
359 | 0 | summary_32_avx2(&sum32, &sum); |
360 | 0 | y += 8; |
361 | 0 | } while (y < height); |
362 | 0 | } |
363 | 0 | sse = summary_4x64_avx2(sum); |
364 | 0 | break; |
365 | 0 | } |
366 | 0 | return sse; |
367 | 0 | } |
368 | | #endif // CONFIG_VP9_HIGHBITDEPTH |