/src/libvpx/vpx_dsp/x86/sse_sse4.c
/*
 *  Copyright (c) 2023 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include <smmintrin.h>

#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"

#include "vpx_ports/mem.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/x86/mem_sse2.h"

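/* Editor's note: summary_all_sse4 below reduces the four unsigned 32-bit
 * lanes of the accumulator to one 64-bit total.  A worked sketch for lanes
 * {a, b, c, d}: after zero-extension, sum0 = {a, b} and sum1 = {c, d}, so
 * sum_2x64 = {a + c, b + d} and the low half of sum_1x64 is a + b + c + d.
 * Widening before the adds means the total cannot wrap even when each lane
 * is close to 2^32 - 1. */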
static INLINE int64_t summary_all_sse4(const __m128i *sum_all) {
  int64_t sum;
  const __m128i sum0 = _mm_cvtepu32_epi64(*sum_all);
  const __m128i sum1 = _mm_cvtepu32_epi64(_mm_srli_si128(*sum_all, 8));
  const __m128i sum_2x64 = _mm_add_epi64(sum0, sum1);
  const __m128i sum_1x64 =
      _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8));
  _mm_storel_epi64((__m128i *)&sum, sum_1x64);
  return sum;
}

#if CONFIG_VP9_HIGHBITDEPTH
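/* Editor's note: summary_32_sse4 folds a 32-bit partial accumulator into a
 * running 64-bit one.  The high-bit-depth loops below call it periodically
 * so that per-lane totals larger than 2^32 stay representable. */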
static INLINE void summary_32_sse4(const __m128i *sum32, __m128i *sum64) {
  const __m128i sum0 = _mm_cvtepu32_epi64(*sum32);
  const __m128i sum1 = _mm_cvtepu32_epi64(_mm_srli_si128(*sum32, 8));
  *sum64 = _mm_add_epi64(sum0, *sum64);
  *sum64 = _mm_add_epi64(sum1, *sum64);
}
#endif

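/* Editor's note on the madd trick shared by all the kernels below: after
 * the u8 -> s16 widening, each difference d lies in [-255, 255], and
 * _mm_madd_epi16 multiplies adjacent 16-bit pairs and adds them, so each
 * 32-bit lane receives d0 * d0 + d1 * d1 (at most 2 * 255^2 = 130050),
 * i.e. two squared errors per lane per call. */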
static INLINE void sse_w16_sse4_1(__m128i *sum, const uint8_t *a,
                                  const uint8_t *b) {
  const __m128i v_a0 = _mm_loadu_si128((const __m128i *)a);
  const __m128i v_b0 = _mm_loadu_si128((const __m128i *)b);
  const __m128i v_a00_w = _mm_cvtepu8_epi16(v_a0);
  const __m128i v_a01_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_a0, 8));
  const __m128i v_b00_w = _mm_cvtepu8_epi16(v_b0);
  const __m128i v_b01_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_b0, 8));
  const __m128i v_d00_w = _mm_sub_epi16(v_a00_w, v_b00_w);
  const __m128i v_d01_w = _mm_sub_epi16(v_a01_w, v_b01_w);
  *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d00_w, v_d00_w));
  *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d01_w, v_d01_w));
}

static INLINE void sse4x2_sse4_1(const uint8_t *a, int a_stride,
                                 const uint8_t *b, int b_stride,
                                 __m128i *sum) {
  const __m128i v_a0 = load_unaligned_u32(a);
  const __m128i v_a1 = load_unaligned_u32(a + a_stride);
  const __m128i v_b0 = load_unaligned_u32(b);
  const __m128i v_b1 = load_unaligned_u32(b + b_stride);
  const __m128i v_a_w = _mm_cvtepu8_epi16(_mm_unpacklo_epi32(v_a0, v_a1));
  const __m128i v_b_w = _mm_cvtepu8_epi16(_mm_unpacklo_epi32(v_b0, v_b1));
  const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
  *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w));
}

static INLINE void sse8_sse4_1(const uint8_t *a, const uint8_t *b,
                               __m128i *sum) {
  const __m128i v_a0 = _mm_loadl_epi64((const __m128i *)a);
  const __m128i v_b0 = _mm_loadl_epi64((const __m128i *)b);
  const __m128i v_a_w = _mm_cvtepu8_epi16(v_a0);
  const __m128i v_b_w = _mm_cvtepu8_epi16(v_b0);
  const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
  *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w));
}

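/* Editor's note: a single 32-bit accumulator is safe for the whole 8-bit
 * block.  In the worst case libvpx uses (64x64), each lane gathers
 * 64 * 64 / 4 = 1024 squared differences of at most 255^2, roughly 2^26 in
 * total, far below the 2^32 lane capacity, so the 64-bit reduction can wait
 * until the end of the block. */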
int64_t vpx_sse_sse4_1(const uint8_t *a, int a_stride, const uint8_t *b,
                       int b_stride, int width, int height) {
  int y = 0;
  int64_t sse = 0;
  __m128i sum = _mm_setzero_si128();
  switch (width) {
    case 4:
      do {
        sse4x2_sse4_1(a, a_stride, b, b_stride, &sum);
        a += a_stride << 1;
        b += b_stride << 1;
        y += 2;
      } while (y < height);
      sse = summary_all_sse4(&sum);
      break;
    case 8:
      do {
        sse8_sse4_1(a, b, &sum);
        a += a_stride;
        b += b_stride;
        y += 1;
      } while (y < height);
      sse = summary_all_sse4(&sum);
      break;
    case 16:
      do {
        sse_w16_sse4_1(&sum, a, b);
        a += a_stride;
        b += b_stride;
        y += 1;
      } while (y < height);
      sse = summary_all_sse4(&sum);
      break;
    case 32:
      do {
        sse_w16_sse4_1(&sum, a, b);
        sse_w16_sse4_1(&sum, a + 16, b + 16);
        a += a_stride;
        b += b_stride;
        y += 1;
      } while (y < height);
      sse = summary_all_sse4(&sum);
      break;
    case 64:
      do {
        sse_w16_sse4_1(&sum, a, b);
        sse_w16_sse4_1(&sum, a + 16 * 1, b + 16 * 1);
        sse_w16_sse4_1(&sum, a + 16 * 2, b + 16 * 2);
        sse_w16_sse4_1(&sum, a + 16 * 3, b + 16 * 3);
        a += a_stride;
        b += b_stride;
        y += 1;
      } while (y < height);
      sse = summary_all_sse4(&sum);
      break;
    default:
      if (width & 0x07) {
        do {
          int i = 0;
          do {
            sse8_sse4_1(a + i, b + i, &sum);
            sse8_sse4_1(a + i + a_stride, b + i + b_stride, &sum);
            i += 8;
          } while (i + 4 < width);
          sse4x2_sse4_1(a + i, a_stride, b + i, b_stride, &sum);
          a += (a_stride << 1);
          b += (b_stride << 1);
          y += 2;
        } while (y < height);
      } else {
        do {
          int i = 0;
          do {
            sse8_sse4_1(a + i, b + i, &sum);
            i += 8;
          } while (i < width);
          a += a_stride;
          b += b_stride;
          y += 1;
        } while (y < height);
      }
      sse = summary_all_sse4(&sum);
      break;
  }

  return sse;
}
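
/* Editor's note, a hypothetical usage sketch (callers normally reach this
 * function through the vpx_sse RTCD dispatch rather than calling it
 * directly):
 *
 *   DECLARE_ALIGNED(16, uint8_t, src[64 * 64]);
 *   DECLARE_ALIGNED(16, uint8_t, ref[64 * 64]);
 *   // ... fill src and ref ...
 *   int64_t sse = vpx_sse_sse4_1(src, 64, ref, 64, 64, 64);
 *
 * All loads above are unaligned-safe (_mm_loadu_si128 and friends), so the
 * alignment in this sketch is a convenience, not a requirement. */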

#if CONFIG_VP9_HIGHBITDEPTH
static INLINE void highbd_sse_w4x2_sse4_1(__m128i *sum, const uint16_t *a,
                                          int a_stride, const uint16_t *b,
                                          int b_stride) {
  const __m128i v_a0 = _mm_loadl_epi64((const __m128i *)a);
  const __m128i v_a1 = _mm_loadl_epi64((const __m128i *)(a + a_stride));
  const __m128i v_b0 = _mm_loadl_epi64((const __m128i *)b);
  const __m128i v_b1 = _mm_loadl_epi64((const __m128i *)(b + b_stride));
  const __m128i v_a_w = _mm_unpacklo_epi64(v_a0, v_a1);
  const __m128i v_b_w = _mm_unpacklo_epi64(v_b0, v_b1);
  const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
  *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w));
}

static INLINE void highbd_sse_w8_sse4_1(__m128i *sum, const uint16_t *a,
                                        const uint16_t *b) {
  const __m128i v_a_w = _mm_loadu_si128((const __m128i *)a);
  const __m128i v_b_w = _mm_loadu_si128((const __m128i *)b);
  const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
  *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w));
}

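/* Editor's note on the row caps below (64 rows at width 16, 32 at width 32,
 * 16 at width 64, 8 in the generic even-width case, every 2 rows on the
 * odd-width path): with 12-bit input a squared difference can reach 4095^2.
 * For the widths vpx uses, each cap works out to at most 256 squared
 * differences per 32-bit lane of sum32, and 256 * 4095^2 = 4292870400 fits
 * just under 2^32, so summary_32_sse4 widens each partial sum before it can
 * wrap. */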
int64_t vpx_highbd_sse_sse4_1(const uint8_t *a8, int a_stride,
                              const uint8_t *b8, int b_stride, int width,
                              int height) {
  int32_t y = 0;
  int64_t sse = 0;
  uint16_t *a = CONVERT_TO_SHORTPTR(a8);
  uint16_t *b = CONVERT_TO_SHORTPTR(b8);
  __m128i sum = _mm_setzero_si128();
  switch (width) {
    case 4:
      do {
        highbd_sse_w4x2_sse4_1(&sum, a, a_stride, b, b_stride);
        a += a_stride << 1;
        b += b_stride << 1;
        y += 2;
      } while (y < height);
      sse = summary_all_sse4(&sum);
      break;
    case 8:
      do {
        highbd_sse_w8_sse4_1(&sum, a, b);
        a += a_stride;
        b += b_stride;
        y += 1;
      } while (y < height);
      sse = summary_all_sse4(&sum);
      break;
    case 16:
      do {
        int l = 0;
        __m128i sum32 = _mm_setzero_si128();
        do {
          highbd_sse_w8_sse4_1(&sum32, a, b);
          highbd_sse_w8_sse4_1(&sum32, a + 8, b + 8);
          a += a_stride;
          b += b_stride;
          l += 1;
        } while (l < 64 && l < (height - y));
        summary_32_sse4(&sum32, &sum);
        y += 64;
      } while (y < height);
      _mm_storel_epi64((__m128i *)&sse,
                       _mm_add_epi64(sum, _mm_srli_si128(sum, 8)));
      break;
    case 32:
      do {
        int l = 0;
        __m128i sum32 = _mm_setzero_si128();
        do {
          highbd_sse_w8_sse4_1(&sum32, a, b);
          highbd_sse_w8_sse4_1(&sum32, a + 8 * 1, b + 8 * 1);
          highbd_sse_w8_sse4_1(&sum32, a + 8 * 2, b + 8 * 2);
          highbd_sse_w8_sse4_1(&sum32, a + 8 * 3, b + 8 * 3);
          a += a_stride;
          b += b_stride;
          l += 1;
        } while (l < 32 && l < (height - y));
        summary_32_sse4(&sum32, &sum);
        y += 32;
      } while (y < height);
      _mm_storel_epi64((__m128i *)&sse,
                       _mm_add_epi64(sum, _mm_srli_si128(sum, 8)));
      break;
    case 64:
      do {
        int l = 0;
        __m128i sum32 = _mm_setzero_si128();
        do {
          highbd_sse_w8_sse4_1(&sum32, a, b);
          highbd_sse_w8_sse4_1(&sum32, a + 8 * 1, b + 8 * 1);
          highbd_sse_w8_sse4_1(&sum32, a + 8 * 2, b + 8 * 2);
          highbd_sse_w8_sse4_1(&sum32, a + 8 * 3, b + 8 * 3);
          highbd_sse_w8_sse4_1(&sum32, a + 8 * 4, b + 8 * 4);
          highbd_sse_w8_sse4_1(&sum32, a + 8 * 5, b + 8 * 5);
          highbd_sse_w8_sse4_1(&sum32, a + 8 * 6, b + 8 * 6);
          highbd_sse_w8_sse4_1(&sum32, a + 8 * 7, b + 8 * 7);
          a += a_stride;
          b += b_stride;
          l += 1;
        } while (l < 16 && l < (height - y));
        summary_32_sse4(&sum32, &sum);
        y += 16;
      } while (y < height);
      _mm_storel_epi64((__m128i *)&sse,
                       _mm_add_epi64(sum, _mm_srli_si128(sum, 8)));
      break;
    default:
      if (width & 0x7) {
        do {
          __m128i sum32 = _mm_setzero_si128();
          int i = 0;
          do {
            highbd_sse_w8_sse4_1(&sum32, a + i, b + i);
            highbd_sse_w8_sse4_1(&sum32, a + i + a_stride, b + i + b_stride);
            i += 8;
          } while (i + 4 < width);
          highbd_sse_w4x2_sse4_1(&sum32, a + i, a_stride, b + i, b_stride);
          a += (a_stride << 1);
          b += (b_stride << 1);
          y += 2;
          summary_32_sse4(&sum32, &sum);
        } while (y < height);
      } else {
        do {
          int l = 0;
          __m128i sum32 = _mm_setzero_si128();
          do {
            int i = 0;
            do {
              highbd_sse_w8_sse4_1(&sum32, a + i, b + i);
              i += 8;
            } while (i < width);
            a += a_stride;
            b += b_stride;
            l += 1;
          } while (l < 8 && l < (height - y));
          summary_32_sse4(&sum32, &sum);
          y += 8;
        } while (y < height);
      }
      _mm_storel_epi64((__m128i *)&sse,
                       _mm_add_epi64(sum, _mm_srli_si128(sum, 8)));
      break;
  }
  return sse;
}
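
/* Editor's note: despite the uint8_t * prototype, a8 and b8 carry
 * high-bit-depth samples; CONVERT_TO_SHORTPTR (vpx_ports/mem.h) recovers
 * the underlying uint16_t pointers that callers pass through the shared
 * function signature. */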
#endif  // CONFIG_VP9_HIGHBITDEPTH