/src/libvpx/vpx_dsp/x86/sad4d_avx2.c
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) 2014 The WebM project authors. All Rights Reserved. |
3 | | * |
4 | | * Use of this source code is governed by a BSD-style license |
5 | | * that can be found in the LICENSE file in the root of the source |
6 | | * tree. An additional intellectual property rights grant can be found |
7 | | * in the file PATENTS. All contributing project authors may |
8 | | * be found in the AUTHORS file in the root of the source tree. |
9 | | */ |
10 | | #include <immintrin.h> // AVX2 |
11 | | #include "./vpx_dsp_rtcd.h" |
12 | | #include "vpx/vpx_integer.h" |
13 | | |
// Note with sums[4] some versions of Visual Studio may fail due to parameter
// alignment, though the functions should be equivalent:
// error C2719: 'sums': formal parameter with requested alignment of 32 won't be
// aligned
//
// Reduces the four per-reference SAD accumulators to four 32-bit totals and
// stores them to sad_array[0..3] (sad_array[i] = total for ref i).
static INLINE void calc_final_4(const __m256i *const sums /*[4]*/,
                                uint32_t sad_array[4]) {
  // Each sums[i] was built from _mm256_sad_epu8 results, so the partial sums
  // live in the low halves of the four 64-bit lanes with zeroed upper bits;
  // 32-bit horizontal adds therefore cannot lose data here.
  const __m256i t0 = _mm256_hadd_epi32(sums[0], sums[1]);
  const __m256i t1 = _mm256_hadd_epi32(sums[2], sums[3]);
  const __m256i t2 = _mm256_hadd_epi32(t0, t1);
  // Fold the upper 128-bit half onto the lower one; 32-bit lane i of |sum|
  // then holds the complete total for reference i.
  const __m128i sum = _mm_add_epi32(_mm256_castsi256_si128(t2),
                                    _mm256_extractf128_si256(t2, 1));
  // Unaligned store: sad_array has no alignment guarantee from callers.
  _mm_storeu_si128((__m128i *)sad_array, sum);
}
27 | | |
28 | | static INLINE void sad32xhx4d_avx2(const uint8_t *src_ptr, int src_stride, |
29 | | const uint8_t *const ref_array[4], |
30 | | int ref_stride, int h, |
31 | 8.03M | uint32_t sad_array[4]) { |
32 | 8.03M | int i; |
33 | 8.03M | const uint8_t *refs[4]; |
34 | 8.03M | __m256i sums[4]; |
35 | | |
36 | 8.03M | refs[0] = ref_array[0]; |
37 | 8.03M | refs[1] = ref_array[1]; |
38 | 8.03M | refs[2] = ref_array[2]; |
39 | 8.03M | refs[3] = ref_array[3]; |
40 | 8.03M | sums[0] = _mm256_setzero_si256(); |
41 | 8.03M | sums[1] = _mm256_setzero_si256(); |
42 | 8.03M | sums[2] = _mm256_setzero_si256(); |
43 | 8.03M | sums[3] = _mm256_setzero_si256(); |
44 | | |
45 | 202M | for (i = 0; i < h; i++) { |
46 | 194M | __m256i r[4]; |
47 | | |
48 | | // load src and all ref[] |
49 | 194M | const __m256i s = _mm256_load_si256((const __m256i *)src_ptr); |
50 | 194M | r[0] = _mm256_loadu_si256((const __m256i *)refs[0]); |
51 | 194M | r[1] = _mm256_loadu_si256((const __m256i *)refs[1]); |
52 | 194M | r[2] = _mm256_loadu_si256((const __m256i *)refs[2]); |
53 | 194M | r[3] = _mm256_loadu_si256((const __m256i *)refs[3]); |
54 | | |
55 | | // sum of the absolute differences between every ref[] to src |
56 | 194M | r[0] = _mm256_sad_epu8(r[0], s); |
57 | 194M | r[1] = _mm256_sad_epu8(r[1], s); |
58 | 194M | r[2] = _mm256_sad_epu8(r[2], s); |
59 | 194M | r[3] = _mm256_sad_epu8(r[3], s); |
60 | | |
61 | | // sum every ref[] |
62 | 194M | sums[0] = _mm256_add_epi32(sums[0], r[0]); |
63 | 194M | sums[1] = _mm256_add_epi32(sums[1], r[1]); |
64 | 194M | sums[2] = _mm256_add_epi32(sums[2], r[2]); |
65 | 194M | sums[3] = _mm256_add_epi32(sums[3], r[3]); |
66 | | |
67 | 194M | src_ptr += src_stride; |
68 | 194M | refs[0] += ref_stride; |
69 | 194M | refs[1] += ref_stride; |
70 | 194M | refs[2] += ref_stride; |
71 | 194M | refs[3] += ref_stride; |
72 | 194M | } |
73 | | |
74 | 8.03M | calc_final_4(sums, sad_array); |
75 | 8.03M | } |
76 | | |
77 | | static INLINE void sad64xhx4d_avx2(const uint8_t *src_ptr, int src_stride, |
78 | | const uint8_t *const ref_array[4], |
79 | | int ref_stride, int h, |
80 | 2.64M | uint32_t sad_array[4]) { |
81 | 2.64M | __m256i sums[4]; |
82 | 2.64M | int i; |
83 | 2.64M | const uint8_t *refs[4]; |
84 | | |
85 | 2.64M | refs[0] = ref_array[0]; |
86 | 2.64M | refs[1] = ref_array[1]; |
87 | 2.64M | refs[2] = ref_array[2]; |
88 | 2.64M | refs[3] = ref_array[3]; |
89 | 2.64M | sums[0] = _mm256_setzero_si256(); |
90 | 2.64M | sums[1] = _mm256_setzero_si256(); |
91 | 2.64M | sums[2] = _mm256_setzero_si256(); |
92 | 2.64M | sums[3] = _mm256_setzero_si256(); |
93 | | |
94 | 131M | for (i = 0; i < h; i++) { |
95 | 129M | __m256i r_lo[4], r_hi[4]; |
96 | | // load 64 bytes from src and all ref[] |
97 | 129M | const __m256i s_lo = _mm256_load_si256((const __m256i *)src_ptr); |
98 | 129M | const __m256i s_hi = _mm256_load_si256((const __m256i *)(src_ptr + 32)); |
99 | 129M | r_lo[0] = _mm256_loadu_si256((const __m256i *)refs[0]); |
100 | 129M | r_hi[0] = _mm256_loadu_si256((const __m256i *)(refs[0] + 32)); |
101 | 129M | r_lo[1] = _mm256_loadu_si256((const __m256i *)refs[1]); |
102 | 129M | r_hi[1] = _mm256_loadu_si256((const __m256i *)(refs[1] + 32)); |
103 | 129M | r_lo[2] = _mm256_loadu_si256((const __m256i *)refs[2]); |
104 | 129M | r_hi[2] = _mm256_loadu_si256((const __m256i *)(refs[2] + 32)); |
105 | 129M | r_lo[3] = _mm256_loadu_si256((const __m256i *)refs[3]); |
106 | 129M | r_hi[3] = _mm256_loadu_si256((const __m256i *)(refs[3] + 32)); |
107 | | |
108 | | // sum of the absolute differences between every ref[] to src |
109 | 129M | r_lo[0] = _mm256_sad_epu8(r_lo[0], s_lo); |
110 | 129M | r_lo[1] = _mm256_sad_epu8(r_lo[1], s_lo); |
111 | 129M | r_lo[2] = _mm256_sad_epu8(r_lo[2], s_lo); |
112 | 129M | r_lo[3] = _mm256_sad_epu8(r_lo[3], s_lo); |
113 | 129M | r_hi[0] = _mm256_sad_epu8(r_hi[0], s_hi); |
114 | 129M | r_hi[1] = _mm256_sad_epu8(r_hi[1], s_hi); |
115 | 129M | r_hi[2] = _mm256_sad_epu8(r_hi[2], s_hi); |
116 | 129M | r_hi[3] = _mm256_sad_epu8(r_hi[3], s_hi); |
117 | | |
118 | | // sum every ref[] |
119 | 129M | sums[0] = _mm256_add_epi32(sums[0], r_lo[0]); |
120 | 129M | sums[1] = _mm256_add_epi32(sums[1], r_lo[1]); |
121 | 129M | sums[2] = _mm256_add_epi32(sums[2], r_lo[2]); |
122 | 129M | sums[3] = _mm256_add_epi32(sums[3], r_lo[3]); |
123 | 129M | sums[0] = _mm256_add_epi32(sums[0], r_hi[0]); |
124 | 129M | sums[1] = _mm256_add_epi32(sums[1], r_hi[1]); |
125 | 129M | sums[2] = _mm256_add_epi32(sums[2], r_hi[2]); |
126 | 129M | sums[3] = _mm256_add_epi32(sums[3], r_hi[3]); |
127 | | |
128 | 129M | src_ptr += src_stride; |
129 | 129M | refs[0] += ref_stride; |
130 | 129M | refs[1] += ref_stride; |
131 | 129M | refs[2] += ref_stride; |
132 | 129M | refs[3] += ref_stride; |
133 | 129M | } |
134 | | |
135 | 2.64M | calc_final_4(sums, sad_array); |
136 | 2.64M | } |
137 | | |
// Instantiates vpx_sad64x<h>x4d_avx2(): the public 4-way SAD entry point for
// a 64-wide, <h>-tall block.
#define SAD64_H(h)                                                         \
  void vpx_sad64x##h##x4d_avx2(const uint8_t *src, int src_stride,         \
                               const uint8_t *const ref_array[4],          \
                               int ref_stride, uint32_t sad_array[4]) {    \
    sad64xhx4d_avx2(src, src_stride, ref_array, ref_stride, h, sad_array); \
  }

// Instantiates vpx_sad32x<h>x4d_avx2(): the public 4-way SAD entry point for
// a 32-wide, <h>-tall block.
#define SAD32_H(h)                                                         \
  void vpx_sad32x##h##x4d_avx2(const uint8_t *src, int src_stride,         \
                               const uint8_t *const ref_array[4],          \
                               int ref_stride, uint32_t sad_array[4]) {    \
    sad32xhx4d_avx2(src, src_stride, ref_array, ref_stride, h, sad_array); \
  }

SAD64_H(64)
SAD32_H(32)
154 | | |
// Instantiates vpx_sad_skip_64x<h>x4d_avx2(): a subsampled 4-way SAD that
// visits only every other row (doubled strides, half the height) and then
// doubles each total to approximate the full-height SAD.
#define SADS64_H(h)                                                      \
  void vpx_sad_skip_64x##h##x4d_avx2(const uint8_t *src, int src_stride, \
                                     const uint8_t *const ref_array[4],  \
                                     int ref_stride,                     \
                                     uint32_t sad_array[4]) {            \
    sad64xhx4d_avx2(src, 2 * src_stride, ref_array, 2 * ref_stride,      \
                    ((h) >> 1), sad_array);                              \
    sad_array[0] <<= 1;                                                  \
    sad_array[1] <<= 1;                                                  \
    sad_array[2] <<= 1;                                                  \
    sad_array[3] <<= 1;                                                  \
  }
166 | | |
// Instantiates vpx_sad_skip_32x<h>x4d_avx2(): the 32-wide counterpart of
// SADS64_H — skips alternate rows via doubled strides and halved height,
// then doubles each total to approximate the full-height SAD.
#define SADS32_H(h)                                                      \
  void vpx_sad_skip_32x##h##x4d_avx2(const uint8_t *src, int src_stride, \
                                     const uint8_t *const ref_array[4],  \
                                     int ref_stride,                     \
                                     uint32_t sad_array[4]) {            \
    sad32xhx4d_avx2(src, 2 * src_stride, ref_array, 2 * ref_stride,      \
                    ((h) >> 1), sad_array);                              \
    sad_array[0] <<= 1;                                                  \
    sad_array[1] <<= 1;                                                  \
    sad_array[2] <<= 1;                                                  \
    sad_array[3] <<= 1;                                                  \
  }
178 | | |
// Row-skipping ("speed") 4-way SAD entry points for the supported 64- and
// 32-wide block sizes.
SADS64_H(64)
SADS64_H(32)

SADS32_H(64)
SADS32_H(32)
SADS32_H(16)