/src/libvpx/vpx_dsp/x86/sse_sse4.c
Line  | Count  | Source (jump to first uncovered line)  | 
1  |  | /*  | 
2  |  |  *  Copyright (c) 2023 The WebM project authors. All Rights Reserved.  | 
3  |  |  *  | 
4  |  |  *  Use of this source code is governed by a BSD-style license  | 
5  |  |  *  that can be found in the LICENSE file in the root of the source  | 
6  |  |  *  tree. An additional intellectual property rights grant can be found  | 
7  |  |  *  in the file PATENTS.  All contributing project authors may  | 
8  |  |  *  be found in the AUTHORS file in the root of the source tree.  | 
9  |  |  */  | 
10  |  |  | 
11  |  | #include <assert.h>  | 
12  |  | #include <smmintrin.h>  | 
13  |  |  | 
14  |  | #include "./vpx_config.h"  | 
15  |  | #include "./vpx_dsp_rtcd.h"  | 
16  |  |  | 
17  |  | #include "vpx_ports/mem.h"  | 
18  |  | #include "vpx/vpx_integer.h"  | 
19  |  | #include "vpx_dsp/x86/mem_sse2.h"  | 
20  |  |  | 
21  | 0  | static INLINE int64_t summary_all_sse4(const __m128i *sum_all) { | 
22  | 0  |   int64_t sum;  | 
23  | 0  |   const __m128i sum0 = _mm_cvtepu32_epi64(*sum_all);  | 
24  | 0  |   const __m128i sum1 = _mm_cvtepu32_epi64(_mm_srli_si128(*sum_all, 8));  | 
25  | 0  |   const __m128i sum_2x64 = _mm_add_epi64(sum0, sum1);  | 
26  | 0  |   const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8));  | 
27  | 0  |   _mm_storel_epi64((__m128i *)&sum, sum_1x64);  | 
28  | 0  |   return sum;  | 
29  | 0  | }  | 
30  |  |  | 
31  |  | #if CONFIG_VP9_HIGHBITDEPTH  | 
32  | 0  | static INLINE void summary_32_sse4(const __m128i *sum32, __m128i *sum64) { | 
33  | 0  |   const __m128i sum0 = _mm_cvtepu32_epi64(*sum32);  | 
34  | 0  |   const __m128i sum1 = _mm_cvtepu32_epi64(_mm_srli_si128(*sum32, 8));  | 
35  | 0  |   *sum64 = _mm_add_epi64(sum0, *sum64);  | 
36  | 0  |   *sum64 = _mm_add_epi64(sum1, *sum64);  | 
37  | 0  | }  | 
38  |  | #endif  | 
39  |  |  | 
40  |  | static INLINE void sse_w16_sse4_1(__m128i *sum, const uint8_t *a,  | 
41  | 0  |                                   const uint8_t *b) { | 
42  | 0  |   const __m128i v_a0 = _mm_loadu_si128((const __m128i *)a);  | 
43  | 0  |   const __m128i v_b0 = _mm_loadu_si128((const __m128i *)b);  | 
44  | 0  |   const __m128i v_a00_w = _mm_cvtepu8_epi16(v_a0);  | 
45  | 0  |   const __m128i v_a01_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_a0, 8));  | 
46  | 0  |   const __m128i v_b00_w = _mm_cvtepu8_epi16(v_b0);  | 
47  | 0  |   const __m128i v_b01_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_b0, 8));  | 
48  | 0  |   const __m128i v_d00_w = _mm_sub_epi16(v_a00_w, v_b00_w);  | 
49  | 0  |   const __m128i v_d01_w = _mm_sub_epi16(v_a01_w, v_b01_w);  | 
50  | 0  |   *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d00_w, v_d00_w));  | 
51  | 0  |   *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d01_w, v_d01_w));  | 
52  | 0  | }  | 
53  |  |  | 
54  |  | static INLINE void sse4x2_sse4_1(const uint8_t *a, int a_stride,  | 
55  | 0  |                                  const uint8_t *b, int b_stride, __m128i *sum) { | 
56  | 0  |   const __m128i v_a0 = load_unaligned_u32(a);  | 
57  | 0  |   const __m128i v_a1 = load_unaligned_u32(a + a_stride);  | 
58  | 0  |   const __m128i v_b0 = load_unaligned_u32(b);  | 
59  | 0  |   const __m128i v_b1 = load_unaligned_u32(b + b_stride);  | 
60  | 0  |   const __m128i v_a_w = _mm_cvtepu8_epi16(_mm_unpacklo_epi32(v_a0, v_a1));  | 
61  | 0  |   const __m128i v_b_w = _mm_cvtepu8_epi16(_mm_unpacklo_epi32(v_b0, v_b1));  | 
62  | 0  |   const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);  | 
63  | 0  |   *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w));  | 
64  | 0  | }  | 
65  |  |  | 
66  |  | static INLINE void sse8_sse4_1(const uint8_t *a, const uint8_t *b,  | 
67  | 0  |                                __m128i *sum) { | 
68  | 0  |   const __m128i v_a0 = _mm_loadl_epi64((const __m128i *)a);  | 
69  | 0  |   const __m128i v_b0 = _mm_loadl_epi64((const __m128i *)b);  | 
70  | 0  |   const __m128i v_a_w = _mm_cvtepu8_epi16(v_a0);  | 
71  | 0  |   const __m128i v_b_w = _mm_cvtepu8_epi16(v_b0);  | 
72  | 0  |   const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);  | 
73  | 0  |   *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w));  | 
74  | 0  | }  | 
75  |  |  | 
/*
 * Returns the sum of squared differences (SSE) between two 8-bit pixel
 * blocks of size width x height.
 *
 * a, b               - base pointers of the two blocks
 * a_stride, b_stride - row strides, in bytes
 * width, height      - block dimensions; assumes width >= 4 and height >= 1
 *
 * Squared differences are gathered in four 32-bit lanes and widened to
 * 64 bits once at the end of the block.
 */
int64_t vpx_sse_sse4_1(const uint8_t *a, int a_stride, const uint8_t *b,
                       int b_stride, int width, int height) {
  int row = 0;
  int64_t sse = 0;
  __m128i acc = _mm_setzero_si128();
  switch (width) {
    case 4:
      /* Two 4-pixel rows per iteration. */
      do {
        sse4x2_sse4_1(a, a_stride, b, b_stride, &acc);
        a += 2 * a_stride;
        b += 2 * b_stride;
        row += 2;
      } while (row < height);
      sse = summary_all_sse4(&acc);
      break;
    case 8:
      do {
        sse8_sse4_1(a, b, &acc);
        a += a_stride;
        b += b_stride;
        ++row;
      } while (row < height);
      sse = summary_all_sse4(&acc);
      break;
    case 16:
      do {
        sse_w16_sse4_1(&acc, a, b);
        a += a_stride;
        b += b_stride;
        ++row;
      } while (row < height);
      sse = summary_all_sse4(&acc);
      break;
    case 32:
      do {
        sse_w16_sse4_1(&acc, a, b);
        sse_w16_sse4_1(&acc, a + 16, b + 16);
        a += a_stride;
        b += b_stride;
        ++row;
      } while (row < height);
      sse = summary_all_sse4(&acc);
      break;
    case 64:
      do {
        sse_w16_sse4_1(&acc, a, b);
        sse_w16_sse4_1(&acc, a + 16, b + 16);
        sse_w16_sse4_1(&acc, a + 32, b + 32);
        sse_w16_sse4_1(&acc, a + 48, b + 48);
        a += a_stride;
        b += b_stride;
        ++row;
      } while (row < height);
      sse = summary_all_sse4(&acc);
      break;
    default:
      if (width & 0x07) {
        /* Width is not a multiple of 8: cover as many 8-pixel chunks as
         * possible across two rows at a time, then finish the pair of
         * rows with one 4x2 chunk. */
        do {
          int x = 0;
          do {
            sse8_sse4_1(a + x, b + x, &acc);
            sse8_sse4_1(a + x + a_stride, b + x + b_stride, &acc);
            x += 8;
          } while (x + 4 < width);
          sse4x2_sse4_1(a + x, a_stride, b + x, b_stride, &acc);
          a += 2 * a_stride;
          b += 2 * b_stride;
          row += 2;
        } while (row < height);
      } else {
        /* Width is a multiple of 8: walk each row in 8-pixel steps. */
        do {
          int x = 0;
          do {
            sse8_sse4_1(a + x, b + x, &acc);
            x += 8;
          } while (x < width);
          a += a_stride;
          b += b_stride;
          ++row;
        } while (row < height);
      }
      sse = summary_all_sse4(&acc);
      break;
  }

  return sse;
}
163  |  |  | 
164  |  | #if CONFIG_VP9_HIGHBITDEPTH  | 
165  |  | static INLINE void highbd_sse_w4x2_sse4_1(__m128i *sum, const uint16_t *a,  | 
166  |  |                                           int a_stride, const uint16_t *b,  | 
167  | 0  |                                           int b_stride) { | 
168  | 0  |   const __m128i v_a0 = _mm_loadl_epi64((const __m128i *)a);  | 
169  | 0  |   const __m128i v_a1 = _mm_loadl_epi64((const __m128i *)(a + a_stride));  | 
170  | 0  |   const __m128i v_b0 = _mm_loadl_epi64((const __m128i *)b);  | 
171  | 0  |   const __m128i v_b1 = _mm_loadl_epi64((const __m128i *)(b + b_stride));  | 
172  | 0  |   const __m128i v_a_w = _mm_unpacklo_epi64(v_a0, v_a1);  | 
173  | 0  |   const __m128i v_b_w = _mm_unpacklo_epi64(v_b0, v_b1);  | 
174  | 0  |   const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);  | 
175  | 0  |   *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w));  | 
176  | 0  | }  | 
177  |  |  | 
178  |  | static INLINE void highbd_sse_w8_sse4_1(__m128i *sum, const uint16_t *a,  | 
179  | 0  |                                         const uint16_t *b) { | 
180  | 0  |   const __m128i v_a_w = _mm_loadu_si128((const __m128i *)a);  | 
181  | 0  |   const __m128i v_b_w = _mm_loadu_si128((const __m128i *)b);  | 
182  | 0  |   const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);  | 
183  | 0  |   *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w));  | 
184  | 0  | }  | 
185  |  |  | 
/*
 * Returns the sum of squared differences (SSE) between two high-bitdepth
 * pixel blocks of size width x height.
 *
 * a8, b8             - CONVERT_TO_SHORTPTR-encoded pointers to uint16_t data
 * a_stride, b_stride - row strides, in uint16_t units
 * width, height      - block dimensions
 *
 * Because samples may be up to 12 bits wide, a single squared difference
 * can approach 2^24, so the 32-bit lane accumulator can only absorb a
 * bounded number of rows before risking overflow. The wider cases below
 * therefore accumulate into a per-chunk 32-bit vector (sum32) and flush
 * it into a 64-bit accumulator (sum) every 64/32/16/8 rows depending on
 * width — chunk limits shrink as per-row work grows.
 */
int64_t vpx_highbd_sse_sse4_1(const uint8_t *a8, int a_stride,
                              const uint8_t *b8, int b_stride, int width,
                              int height) {
  int32_t y = 0;
  int64_t sse = 0;
  uint16_t *a = CONVERT_TO_SHORTPTR(a8);
  uint16_t *b = CONVERT_TO_SHORTPTR(b8);
  __m128i sum = _mm_setzero_si128();
  switch (width) {
    case 4:
      /* Two rows per iteration; narrow blocks accumulate few enough
       * products that the 32-bit lanes are flushed only once at the end.
       * NOTE(review): assumes 4-wide blocks are short enough not to
       * overflow 32-bit lanes — TODO confirm against codec block sizes. */
      do {
        highbd_sse_w4x2_sse4_1(&sum, a, a_stride, b, b_stride);
        a += a_stride << 1;
        b += b_stride << 1;
        y += 2;
      } while (y < height);
      sse = summary_all_sse4(&sum);
      break;
    case 8:
      do {
        highbd_sse_w8_sse4_1(&sum, a, b);
        a += a_stride;
        b += b_stride;
        y += 1;
      } while (y < height);
      sse = summary_all_sse4(&sum);
      break;
    case 16:
      /* Process up to 64 rows per chunk, then widen sum32 into sum. */
      do {
        int l = 0;
        __m128i sum32 = _mm_setzero_si128();
        do {
          highbd_sse_w8_sse4_1(&sum32, a, b);
          highbd_sse_w8_sse4_1(&sum32, a + 8, b + 8);
          a += a_stride;
          b += b_stride;
          l += 1;
        } while (l < 64 && l < (height - y));
        summary_32_sse4(&sum32, &sum);
        y += 64;
      } while (y < height);
      /* Fold the two 64-bit lanes of sum into the scalar result. */
      _mm_storel_epi64((__m128i *)&sse,
                       _mm_add_epi64(sum, _mm_srli_si128(sum, 8)));
      break;
    case 32:
      /* Twice the per-row work of the 16-wide case, so flush every 32 rows. */
      do {
        int l = 0;
        __m128i sum32 = _mm_setzero_si128();
        do {
          highbd_sse_w8_sse4_1(&sum32, a, b);
          highbd_sse_w8_sse4_1(&sum32, a + 8 * 1, b + 8 * 1);
          highbd_sse_w8_sse4_1(&sum32, a + 8 * 2, b + 8 * 2);
          highbd_sse_w8_sse4_1(&sum32, a + 8 * 3, b + 8 * 3);
          a += a_stride;
          b += b_stride;
          l += 1;
        } while (l < 32 && l < (height - y));
        summary_32_sse4(&sum32, &sum);
        y += 32;
      } while (y < height);
      _mm_storel_epi64((__m128i *)&sse,
                       _mm_add_epi64(sum, _mm_srli_si128(sum, 8)));
      break;
    case 64:
      /* Eight 8-sample kernels per row; flush every 16 rows. */
      do {
        int l = 0;
        __m128i sum32 = _mm_setzero_si128();
        do {
          highbd_sse_w8_sse4_1(&sum32, a, b);
          highbd_sse_w8_sse4_1(&sum32, a + 8 * 1, b + 8 * 1);
          highbd_sse_w8_sse4_1(&sum32, a + 8 * 2, b + 8 * 2);
          highbd_sse_w8_sse4_1(&sum32, a + 8 * 3, b + 8 * 3);
          highbd_sse_w8_sse4_1(&sum32, a + 8 * 4, b + 8 * 4);
          highbd_sse_w8_sse4_1(&sum32, a + 8 * 5, b + 8 * 5);
          highbd_sse_w8_sse4_1(&sum32, a + 8 * 6, b + 8 * 6);
          highbd_sse_w8_sse4_1(&sum32, a + 8 * 7, b + 8 * 7);
          a += a_stride;
          b += b_stride;
          l += 1;
        } while (l < 16 && l < (height - y));
        summary_32_sse4(&sum32, &sum);
        y += 16;
      } while (y < height);
      _mm_storel_epi64((__m128i *)&sse,
                       _mm_add_epi64(sum, _mm_srli_si128(sum, 8)));
      break;
    default:
      if (width & 0x7) {
        /* Width not a multiple of 8: cover 8-sample chunks over two rows,
         * finish each row pair with a 4x2 chunk, and flush sum32 into the
         * 64-bit accumulator every two rows. */
        do {
          __m128i sum32 = _mm_setzero_si128();
          int i = 0;
          do {
            highbd_sse_w8_sse4_1(&sum32, a + i, b + i);
            highbd_sse_w8_sse4_1(&sum32, a + i + a_stride, b + i + b_stride);
            i += 8;
          } while (i + 4 < width);
          highbd_sse_w4x2_sse4_1(&sum32, a + i, a_stride, b + i, b_stride);
          a += (a_stride << 1);
          b += (b_stride << 1);
          y += 2;
          summary_32_sse4(&sum32, &sum);
        } while (y < height);
      } else {
        /* Width a multiple of 8: walk each row in 8-sample steps and
         * flush every 8 rows (arbitrary width, hence the tight limit). */
        do {
          int l = 0;
          __m128i sum32 = _mm_setzero_si128();
          do {
            int i = 0;
            do {
              highbd_sse_w8_sse4_1(&sum32, a + i, b + i);
              i += 8;
            } while (i < width);
            a += a_stride;
            b += b_stride;
            l += 1;
          } while (l < 8 && l < (height - y));
          summary_32_sse4(&sum32, &sum);
          y += 8;
        } while (y < height);
      }
      _mm_storel_epi64((__m128i *)&sse,
                       _mm_add_epi64(sum, _mm_srli_si128(sum, 8)));
      break;
  }
  return sse;
}
312  |  | #endif  // CONFIG_VP9_HIGHBITDEPTH  |