/src/libvpx/vpx_dsp/x86/sad_avx2.c
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) 2012 The WebM project authors. All Rights Reserved. |
3 | | * |
4 | | * Use of this source code is governed by a BSD-style license |
5 | | * that can be found in the LICENSE file in the root of the source |
6 | | * tree. An additional intellectual property rights grant can be found |
7 | | * in the file PATENTS. All contributing project authors may |
8 | | * be found in the AUTHORS file in the root of the source tree. |
9 | | */ |
10 | | #include <immintrin.h> |
11 | | #include "./vpx_dsp_rtcd.h" |
12 | | #include "vpx_ports/mem.h" |
13 | | |
14 | | static INLINE unsigned int sad64xh_avx2(const uint8_t *src_ptr, int src_stride, |
15 | | const uint8_t *ref_ptr, int ref_stride, |
16 | 2.78M | int h) { |
17 | 2.78M | int i, res; |
18 | 2.78M | __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; |
19 | 2.78M | __m256i sum_sad = _mm256_setzero_si256(); |
20 | 2.78M | __m256i sum_sad_h; |
21 | 2.78M | __m128i sum_sad128; |
22 | 108M | for (i = 0; i < h; i++) { |
23 | 106M | ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); |
24 | 106M | ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32)); |
25 | 106M | sad1_reg = |
26 | 106M | _mm256_sad_epu8(ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); |
27 | 106M | sad2_reg = _mm256_sad_epu8( |
28 | 106M | ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32))); |
29 | 106M | sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); |
30 | 106M | ref_ptr += ref_stride; |
31 | 106M | src_ptr += src_stride; |
32 | 106M | } |
33 | 2.78M | sum_sad_h = _mm256_srli_si256(sum_sad, 8); |
34 | 2.78M | sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); |
35 | 2.78M | sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); |
36 | 2.78M | sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); |
37 | 2.78M | res = _mm_cvtsi128_si32(sum_sad128); |
38 | 2.78M | return res; |
39 | 2.78M | } |
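For reference, the helper above computes a plain sum of absolute differences over a 64-pixel-wide block: each loop iteration loads one 64-byte row of src and ref in two 32-byte halves, _mm256_sad_epu8 produces a 64-bit partial sum per 8-byte group, and the tail collapses those lanes into a single 32-bit total. A scalar sketch of the same computation (hypothetical helper sad64xh_c, not part of this file; assumes <stdint.h>) is:

/* Illustrative scalar reference (hypothetical, not part of libvpx):
 * sums |src - ref| over a 64-pixel-wide, h-row-tall block. */
static unsigned int sad64xh_c(const uint8_t *src_ptr, int src_stride,
                              const uint8_t *ref_ptr, int ref_stride, int h) {
  unsigned int sad = 0;
  int i, j;
  for (i = 0; i < h; i++) {
    for (j = 0; j < 64; j++) {
      const int d = src_ptr[j] - ref_ptr[j];
      sad += (unsigned int)(d < 0 ? -d : d);
    }
    src_ptr += src_stride;
    ref_ptr += ref_stride;
  }
  return sad;
}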
40 | | |
41 | | static INLINE unsigned int sad32xh_avx2(const uint8_t *src_ptr, int src_stride, |
42 | | const uint8_t *ref_ptr, int ref_stride, |
43 | 14.7M | int h) { |
44 | 14.7M | int i, res; |
45 | 14.7M | __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; |
46 | 14.7M | __m256i sum_sad = _mm256_setzero_si256(); |
47 | 14.7M | __m256i sum_sad_h; |
48 | 14.7M | __m128i sum_sad128; |
49 | 14.7M | const int ref2_stride = ref_stride << 1; |
50 | 14.7M | const int src2_stride = src_stride << 1; |
51 | 14.7M | const int max = h >> 1; |
52 | 197M | for (i = 0; i < max; i++) { |
53 | 183M | ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); |
54 | 183M | ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); |
55 | 183M | sad1_reg = |
56 | 183M | _mm256_sad_epu8(ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); |
57 | 183M | sad2_reg = _mm256_sad_epu8( |
58 | 183M | ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride))); |
59 | 183M | sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); |
60 | 183M | ref_ptr += ref2_stride; |
61 | 183M | src_ptr += src2_stride; |
62 | 183M | } |
63 | 14.7M | sum_sad_h = _mm256_srli_si256(sum_sad, 8); |
64 | 14.7M | sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); |
65 | 14.7M | sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); |
66 | 14.7M | sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); |
67 | 14.7M | res = _mm_cvtsi128_si32(sum_sad128); |
68 | 14.7M | return res; |
69 | 14.7M | } |
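sad32xh_avx2 differs only in that a single 256-bit load covers a full 32-pixel row, so the loop handles two rows per iteration (h is even for every block size instantiated below). Both helpers end with the same lane reduction over the _mm256_sad_epu8 output: each 64-bit lane holds the SAD of one 8-byte group, the byte shift adds the two qwords within each 128-bit half, and the extract/add folds the two halves together. An equivalent reduction written with explicit lane extraction (illustrative only, assumes AVX2) is:

/* Equivalent horizontal reduction spelled out with lane extracts
 * (illustrative only; same result as the shift-then-extract order above). */
static unsigned int reduce_sad_epu8(__m256i sum_sad) {
  const __m128i lo = _mm256_castsi256_si128(sum_sad);
  const __m128i hi = _mm256_extracti128_si256(sum_sad, 1);
  const __m128i sum = _mm_add_epi32(lo, hi);      /* q0+q2 and q1+q3 */
  const __m128i top = _mm_srli_si128(sum, 8);     /* move high qword down */
  return (unsigned int)_mm_cvtsi128_si32(_mm_add_epi32(sum, top));
}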
70 | | |
71 | | #define FSAD64_H(h) \ |
72 | | unsigned int vpx_sad64x##h##_avx2(const uint8_t *src_ptr, int src_stride, \ |
73 | 2.05M | const uint8_t *ref_ptr, int ref_stride) { \ |
74 | 2.05M | return sad64xh_avx2(src_ptr, src_stride, ref_ptr, ref_stride, h); \ |
75 | 2.05M | }
76 | | |
77 | | #define FSADS64_H(h) \ |
78 | | unsigned int vpx_sad_skip_64x##h##_avx2( \ |
79 | | const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ |
80 | 729k | int ref_stride) { \ |
81 | 729k | return 2 * sad64xh_avx2(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \ |
82 | 729k | h / 2); \ |
83 | 729k | }
84 | | |
85 | | #define FSAD32_H(h) \ |
86 | | unsigned int vpx_sad32x##h##_avx2(const uint8_t *src_ptr, int src_stride, \ |
87 | 10.4M | const uint8_t *ref_ptr, int ref_stride) { \ |
88 | 10.4M | return sad32xh_avx2(src_ptr, src_stride, ref_ptr, ref_stride, h); \ |
89 | 10.4M | }
90 | | |
91 | | #define FSADS32_H(h) \ |
92 | | unsigned int vpx_sad_skip_32x##h##_avx2( \ |
93 | | const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ |
94 | 4.26M | int ref_stride) { \ |
95 | 4.26M | return 2 * sad32xh_avx2(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \ |
96 | 4.26M | h / 2); \ |
97 | 4.26M | }
98 | | |
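The skip wrappers above trade accuracy for speed during motion search: they double both strides so only every other row is read, halve h, and multiply the result by 2, giving an estimate of the full-block SAD rather than its exact value. In terms of the scalar sketch shown earlier (illustrative only, reusing the hypothetical sad64xh_c):

/* Scalar model of vpx_sad_skip_64x64_avx2 (illustrative only): visit every
 * other row of the 64x64 block and scale the sum back up by 2. */
static unsigned int sad_skip_64x64_model(const uint8_t *src_ptr, int src_stride,
                                         const uint8_t *ref_ptr, int ref_stride) {
  return 2 * sad64xh_c(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, 64 / 2);
}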
99 | | #define FSAD64 \ |
100 | | FSAD64_H(64) \ |
101 | | FSAD64_H(32) \ |
102 | | FSADS64_H(64) \ |
103 | | FSADS64_H(32) |
104 | | |
105 | | #define FSAD32 \ |
106 | | FSAD32_H(64) \ |
107 | | FSAD32_H(32) \ |
108 | | FSAD32_H(16) \ |
109 | | FSADS32_H(64) \ |
110 | | FSADS32_H(32) \ |
111 | | FSADS32_H(16) |
112 | | |
113 | | FSAD64 |
114 | | FSAD32 |
115 | | |
116 | | #undef FSAD64 |
117 | | #undef FSAD32 |
118 | | #undef FSAD64_H |
119 | | #undef FSAD32_H |
120 | | #undef FSADS64_H |
121 | | #undef FSADS32_H |
122 | | |
123 | | #define FSADAVG64_H(h) \ |
124 | | unsigned int vpx_sad64x##h##_avg_avx2( \ |
125 | | const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ |
126 | 0 | int ref_stride, const uint8_t *second_pred) { \ |
127 | 0 | int i; \ |
128 | 0 | __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \ |
129 | 0 | __m256i sum_sad = _mm256_setzero_si256(); \ |
130 | 0 | __m256i sum_sad_h; \ |
131 | 0 | __m128i sum_sad128; \ |
132 | 0 | for (i = 0; i < h; i++) { \ |
133 | 0 | ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \ |
134 | 0 | ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32)); \ |
135 | 0 | ref1_reg = _mm256_avg_epu8( \ |
136 | 0 | ref1_reg, _mm256_loadu_si256((__m256i const *)second_pred)); \ |
137 | 0 | ref2_reg = _mm256_avg_epu8( \ |
138 | 0 | ref2_reg, _mm256_loadu_si256((__m256i const *)(second_pred + 32))); \ |
139 | 0 | sad1_reg = _mm256_sad_epu8( \ |
140 | 0 | ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); \ |
141 | 0 | sad2_reg = _mm256_sad_epu8( \ |
142 | 0 | ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32))); \ |
143 | 0 | sum_sad = \ |
144 | 0 | _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \ |
145 | 0 | ref_ptr += ref_stride; \ |
146 | 0 | src_ptr += src_stride; \ |
147 | 0 | second_pred += 64; \ |
148 | 0 | } \ |
149 | 0 | sum_sad_h = _mm256_srli_si256(sum_sad, 8); \ |
150 | 0 | sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ |
151 | 0 | sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ |
152 | 0 | sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ |
153 | 0 | return (unsigned int)_mm_cvtsi128_si32(sum_sad128); \ |
154 | 0 | }
Unexecuted instantiation: vpx_sad64x64_avg_avx2
Unexecuted instantiation: vpx_sad64x32_avg_avx2
155 | | |
156 | | #define FSADAVG32_H(h) \ |
157 | | unsigned int vpx_sad32x##h##_avg_avx2( \ |
158 | | const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ |
159 | 0 | int ref_stride, const uint8_t *second_pred) { \ |
160 | 0 | int i; \ |
161 | 0 | __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \ |
162 | 0 | __m256i sum_sad = _mm256_setzero_si256(); \ |
163 | 0 | __m256i sum_sad_h; \ |
164 | 0 | __m128i sum_sad128; \ |
165 | 0 | int ref2_stride = ref_stride << 1; \ |
166 | 0 | int src2_stride = src_stride << 1; \ |
167 | 0 | int max = h >> 1; \ |
168 | 0 | for (i = 0; i < max; i++) { \ |
169 | 0 | ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \ |
170 | 0 | ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); \ |
171 | 0 | ref1_reg = _mm256_avg_epu8( \ |
172 | 0 | ref1_reg, _mm256_loadu_si256((__m256i const *)second_pred)); \ |
173 | 0 | ref2_reg = _mm256_avg_epu8( \ |
174 | 0 | ref2_reg, _mm256_loadu_si256((__m256i const *)(second_pred + 32))); \ |
175 | 0 | sad1_reg = _mm256_sad_epu8( \ |
176 | 0 | ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); \ |
177 | 0 | sad2_reg = _mm256_sad_epu8( \ |
178 | 0 | ref2_reg, \ |
179 | 0 | _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride))); \ |
180 | 0 | sum_sad = \ |
181 | 0 | _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \ |
182 | 0 | ref_ptr += ref2_stride; \ |
183 | 0 | src_ptr += src2_stride; \ |
184 | 0 | second_pred += 64; \ |
185 | 0 | } \ |
186 | 0 | sum_sad_h = _mm256_srli_si256(sum_sad, 8); \ |
187 | 0 | sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ |
188 | 0 | sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ |
189 | 0 | sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ |
190 | 0 | return (unsigned int)_mm_cvtsi128_si32(sum_sad128); \ |
191 | 0 | }
Unexecuted instantiation: vpx_sad32x64_avg_avx2
Unexecuted instantiation: vpx_sad32x32_avg_avx2
Unexecuted instantiation: vpx_sad32x16_avg_avx2
192 | | |
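The avg macros above implement the SAD against a compound prediction: each reference row is first averaged, with rounding, against a row of second_pred (stored contiguously at the block width) using _mm256_avg_epu8, and only then compared to the source. A scalar model of one row (hypothetical helper, not part of libvpx) is:

/* Scalar model of one row of the _avg_ SAD variants (hypothetical helper,
 * not part of libvpx). _mm256_avg_epu8 rounds up: (a + b + 1) >> 1. */
static unsigned int sad_avg_row_c(const uint8_t *src, const uint8_t *ref,
                                  const uint8_t *second_pred, int width) {
  unsigned int sad = 0;
  int j;
  for (j = 0; j < width; j++) {
    const int pred = (ref[j] + second_pred[j] + 1) >> 1;
    sad += (unsigned int)(pred > src[j] ? pred - src[j] : src[j] - pred);
  }
  return sad;
}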
193 | | #define FSADAVG64 \ |
194 | | FSADAVG64_H(64) \ |
195 | | FSADAVG64_H(32) |
196 | | |
197 | | #define FSADAVG32 \ |
198 | | FSADAVG32_H(64) \ |
199 | | FSADAVG32_H(32) \ |
200 | | FSADAVG32_H(16) |
201 | | |
202 | | FSADAVG64 |
203 | | FSADAVG32 |
204 | | |
205 | | #undef FSADAVG64 |
206 | | #undef FSADAVG32 |
207 | | #undef FSADAVG64_H |
208 | | #undef FSADAVG32_H |