/src/libvpx/vpx_dsp/x86/subtract_avx2.c
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) 2022 The WebM project authors. All Rights Reserved. |
3 | | * |
4 | | * Use of this source code is governed by a BSD-style license |
5 | | * that can be found in the LICENSE file in the root of the source |
6 | | * tree. An additional intellectual property rights grant can be found |
7 | | * in the file PATENTS. All contributing project authors may |
8 | | * be found in the AUTHORS file in the root of the source tree. |
9 | | */ |
10 | | |
11 | | #include <assert.h> |
12 | | #include <immintrin.h> |
13 | | |
14 | | #include "./vpx_dsp_rtcd.h" |
15 | | #include "vpx/vpx_integer.h" |
16 | | |
17 | | static VPX_FORCE_INLINE void subtract32_avx2(int16_t *diff_ptr, |
18 | | const uint8_t *src_ptr, |
19 | 127M | const uint8_t *pred_ptr) { |
20 | 127M | const __m256i s = _mm256_lddqu_si256((const __m256i *)src_ptr); |
21 | 127M | const __m256i p = _mm256_lddqu_si256((const __m256i *)pred_ptr); |
22 | 127M | const __m256i s_0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s)); |
23 | 127M | const __m256i s_1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(s, 1)); |
24 | 127M | const __m256i p_0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(p)); |
25 | 127M | const __m256i p_1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(p, 1)); |
26 | 127M | const __m256i d_0 = _mm256_sub_epi16(s_0, p_0); |
27 | 127M | const __m256i d_1 = _mm256_sub_epi16(s_1, p_1); |
28 | 127M | _mm256_storeu_si256((__m256i *)diff_ptr, d_0); |
29 | 127M | _mm256_storeu_si256((__m256i *)(diff_ptr + 16), d_1); |
30 | 127M | } |
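
subtract32_avx2() produces one 32-pixel row of residuals: it loads 32 unsigned 8-bit source and prediction samples, widens each 16-byte half to 16-bit lanes with _mm256_cvtepu8_epi16, subtracts, and stores 32 signed 16-bit differences. A minimal scalar sketch of the same per-row computation (the helper name is illustrative, not part of libvpx):

    /* Scalar equivalent of one subtract32_avx2 call (illustrative sketch). */
    static void subtract32_scalar_sketch(int16_t *diff_ptr, const uint8_t *src_ptr,
                                         const uint8_t *pred_ptr) {
      int i;
      for (i = 0; i < 32; ++i) {
        /* Widen before subtracting: residuals can be negative, so they do not
         * fit in the 8-bit input lanes. */
        diff_ptr[i] = (int16_t)((int)src_ptr[i] - (int)pred_ptr[i]);
      }
    }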
31 | | |
32 | | static VPX_FORCE_INLINE void subtract_block_16xn_avx2( |
33 | | int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, |
34 | 22.9M | ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) { |
35 | 22.9M | int j; |
36 | 380M | for (j = 0; j < rows; ++j) { |
37 | 357M | const __m128i s = _mm_lddqu_si128((const __m128i *)src_ptr); |
38 | 357M | const __m128i p = _mm_lddqu_si128((const __m128i *)pred_ptr); |
39 | 357M | const __m256i s_0 = _mm256_cvtepu8_epi16(s); |
40 | 357M | const __m256i p_0 = _mm256_cvtepu8_epi16(p); |
41 | 357M | const __m256i d_0 = _mm256_sub_epi16(s_0, p_0); |
42 | 357M | _mm256_storeu_si256((__m256i *)diff_ptr, d_0); |
43 | 357M | src_ptr += src_stride; |
44 | 357M | pred_ptr += pred_stride; |
45 | 357M | diff_ptr += diff_stride; |
46 | 357M | } |
47 | 22.9M | } |
48 | | |
49 | | static VPX_FORCE_INLINE void subtract_block_32xn_avx2( |
50 | | int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, |
51 | 3.68M | ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) { |
52 | 3.68M | int j; |
53 | 116M | for (j = 0; j < rows; ++j) { |
54 | 112M | subtract32_avx2(diff_ptr, src_ptr, pred_ptr); |
55 | 112M | src_ptr += src_stride; |
56 | 112M | pred_ptr += pred_stride; |
57 | 112M | diff_ptr += diff_stride; |
58 | 112M | } |
59 | 3.68M | } |
60 | | |
61 | | static VPX_FORCE_INLINE void subtract_block_64xn_avx2( |
62 | | int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, |
63 | 146k | ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) { |
64 | 146k | int j; |
65 | 7.40M | for (j = 0; j < rows; ++j) { |
66 | 7.25M | subtract32_avx2(diff_ptr, src_ptr, pred_ptr); |
67 | 7.25M | subtract32_avx2(diff_ptr + 32, src_ptr + 32, pred_ptr + 32); |
68 | 7.25M | src_ptr += src_stride; |
69 | 7.25M | pred_ptr += pred_stride; |
70 | 7.25M | diff_ptr += diff_stride; |
71 | 7.25M | } |
72 | 146k | } |
73 | | |
74 | | void vpx_subtract_block_avx2(int rows, int cols, int16_t *diff_ptr, |
75 | | ptrdiff_t diff_stride, const uint8_t *src_ptr, |
76 | | ptrdiff_t src_stride, const uint8_t *pred_ptr, |
77 | 734M | ptrdiff_t pred_stride) { |
78 | 734M | switch (cols) { |
79 | 22.9M | case 16: |
80 | 22.9M | subtract_block_16xn_avx2(rows, diff_ptr, diff_stride, src_ptr, src_stride, |
81 | 22.9M | pred_ptr, pred_stride); |
82 | 22.9M | break; |
83 | 3.68M | case 32: |
84 | 3.68M | subtract_block_32xn_avx2(rows, diff_ptr, diff_stride, src_ptr, src_stride, |
85 | 3.68M | pred_ptr, pred_stride); |
86 | 3.68M | break; |
87 | 146k | case 64: |
88 | 146k | subtract_block_64xn_avx2(rows, diff_ptr, diff_stride, src_ptr, src_stride, |
89 | 146k | pred_ptr, pred_stride); |
90 | 146k | break; |
91 | 708M | default: |
92 | 708M | vpx_subtract_block_sse2(rows, cols, diff_ptr, diff_stride, src_ptr, |
93 | 708M | src_stride, pred_ptr, pred_stride); |
94 | 708M | break; |
95 | 734M | } |
96 | 734M | } |
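
The dispatcher keys on block width: 16-, 32- and 64-column blocks take the AVX2 row loops above, while every other width falls back to vpx_subtract_block_sse2 (that default case, which covers the narrower 4- and 8-wide blocks, dominates the call counts in this run). A hypothetical caller, shown only to illustrate the signature; in libvpx this code is normally reached through the vpx_subtract_block entry point generated by vpx_dsp_rtcd.h rather than called directly:

    /* Illustrative caller: residual for one 16x16 block with row-major buffers. */
    static void get_residual_16x16(int16_t diff[16 * 16], const uint8_t *src,
                                   ptrdiff_t src_stride, const uint8_t *pred,
                                   ptrdiff_t pred_stride) {
      vpx_subtract_block_avx2(16, 16, diff, /*diff_stride=*/16, src, src_stride,
                              pred, pred_stride);
    }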
97 | | |
98 | | #if CONFIG_VP9_HIGHBITDEPTH |
99 | | void vpx_highbd_subtract_block_avx2(int rows, int cols, int16_t *diff_ptr, |
100 | | ptrdiff_t diff_stride, |
101 | | const uint8_t *src8_ptr, |
102 | | ptrdiff_t src_stride, |
103 | | const uint8_t *pred8_ptr, |
104 | 0 | ptrdiff_t pred_stride, int bd) { |
105 | 0 | uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8_ptr); |
106 | 0 | uint16_t *pred_ptr = CONVERT_TO_SHORTPTR(pred8_ptr); |
107 | 0 | (void)bd; |
108 | 0 | if (cols == 64) { |
109 | 0 | int j = rows; |
110 | 0 | do { |
111 | 0 | const __m256i s0 = _mm256_lddqu_si256((const __m256i *)src_ptr); |
112 | 0 | const __m256i s1 = _mm256_lddqu_si256((const __m256i *)(src_ptr + 16)); |
113 | 0 | const __m256i s2 = _mm256_lddqu_si256((const __m256i *)(src_ptr + 32)); |
114 | 0 | const __m256i s3 = _mm256_lddqu_si256((const __m256i *)(src_ptr + 48)); |
115 | 0 | const __m256i p0 = _mm256_lddqu_si256((const __m256i *)pred_ptr); |
116 | 0 | const __m256i p1 = _mm256_lddqu_si256((const __m256i *)(pred_ptr + 16)); |
117 | 0 | const __m256i p2 = _mm256_lddqu_si256((const __m256i *)(pred_ptr + 32)); |
118 | 0 | const __m256i p3 = _mm256_lddqu_si256((const __m256i *)(pred_ptr + 48)); |
119 | 0 | const __m256i d0 = _mm256_sub_epi16(s0, p0); |
120 | 0 | const __m256i d1 = _mm256_sub_epi16(s1, p1); |
121 | 0 | const __m256i d2 = _mm256_sub_epi16(s2, p2); |
122 | 0 | const __m256i d3 = _mm256_sub_epi16(s3, p3); |
123 | 0 | _mm256_storeu_si256((__m256i *)diff_ptr, d0); |
124 | 0 | _mm256_storeu_si256((__m256i *)(diff_ptr + 16), d1); |
125 | 0 | _mm256_storeu_si256((__m256i *)(diff_ptr + 32), d2); |
126 | 0 | _mm256_storeu_si256((__m256i *)(diff_ptr + 48), d3); |
127 | 0 | src_ptr += src_stride; |
128 | 0 | pred_ptr += pred_stride; |
129 | 0 | diff_ptr += diff_stride; |
130 | 0 | } while (--j != 0); |
131 | 0 | } else if (cols == 32) { |
132 | 0 | int j = rows; |
133 | 0 | do { |
134 | 0 | const __m256i s0 = _mm256_lddqu_si256((const __m256i *)src_ptr); |
135 | 0 | const __m256i s1 = _mm256_lddqu_si256((const __m256i *)(src_ptr + 16)); |
136 | 0 | const __m256i p0 = _mm256_lddqu_si256((const __m256i *)pred_ptr); |
137 | 0 | const __m256i p1 = _mm256_lddqu_si256((const __m256i *)(pred_ptr + 16)); |
138 | 0 | const __m256i d0 = _mm256_sub_epi16(s0, p0); |
139 | 0 | const __m256i d1 = _mm256_sub_epi16(s1, p1); |
140 | 0 | _mm256_storeu_si256((__m256i *)diff_ptr, d0); |
141 | 0 | _mm256_storeu_si256((__m256i *)(diff_ptr + 16), d1); |
142 | 0 | src_ptr += src_stride; |
143 | 0 | pred_ptr += pred_stride; |
144 | 0 | diff_ptr += diff_stride; |
145 | 0 | } while (--j != 0); |
146 | 0 | } else if (cols == 16) { |
147 | 0 | int j = rows; |
148 | 0 | do { |
149 | 0 | const __m256i s0 = _mm256_lddqu_si256((const __m256i *)src_ptr); |
150 | 0 | const __m256i s1 = |
151 | 0 | _mm256_lddqu_si256((const __m256i *)(src_ptr + src_stride)); |
152 | 0 | const __m256i p0 = _mm256_lddqu_si256((const __m256i *)pred_ptr); |
153 | 0 | const __m256i p1 = |
154 | 0 | _mm256_lddqu_si256((const __m256i *)(pred_ptr + pred_stride)); |
155 | 0 | const __m256i d0 = _mm256_sub_epi16(s0, p0); |
156 | 0 | const __m256i d1 = _mm256_sub_epi16(s1, p1); |
157 | 0 | _mm256_storeu_si256((__m256i *)diff_ptr, d0); |
158 | 0 | _mm256_storeu_si256((__m256i *)(diff_ptr + diff_stride), d1); |
159 | 0 | src_ptr += src_stride << 1; |
160 | 0 | pred_ptr += pred_stride << 1; |
161 | 0 | diff_ptr += diff_stride << 1; |
162 | 0 | j -= 2; |
163 | 0 | } while (j != 0); |
164 | 0 | } else if (cols == 8) { |
165 | 0 | int j = rows; |
166 | 0 | do { |
167 | 0 | const __m128i s0 = _mm_lddqu_si128((const __m128i *)src_ptr); |
168 | 0 | const __m128i s1 = |
169 | 0 | _mm_lddqu_si128((const __m128i *)(src_ptr + src_stride)); |
170 | 0 | const __m128i p0 = _mm_lddqu_si128((const __m128i *)pred_ptr); |
171 | 0 | const __m128i p1 = |
172 | 0 | _mm_lddqu_si128((const __m128i *)(pred_ptr + pred_stride)); |
173 | 0 | const __m128i d0 = _mm_sub_epi16(s0, p0); |
174 | 0 | const __m128i d1 = _mm_sub_epi16(s1, p1); |
175 | 0 | _mm_storeu_si128((__m128i *)diff_ptr, d0); |
176 | 0 | _mm_storeu_si128((__m128i *)(diff_ptr + diff_stride), d1); |
177 | 0 | src_ptr += src_stride << 1; |
178 | 0 | pred_ptr += pred_stride << 1; |
179 | 0 | diff_ptr += diff_stride << 1; |
180 | 0 | j -= 2; |
181 | 0 | } while (j != 0); |
182 | 0 | } else { |
183 | 0 | int j = rows; |
184 | 0 | assert(cols == 4); |
185 | 0 | do { |
186 | 0 | const __m128i s0 = _mm_loadl_epi64((const __m128i *)src_ptr); |
187 | 0 | const __m128i s1 = |
188 | 0 | _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride)); |
189 | 0 | const __m128i p0 = _mm_loadl_epi64((const __m128i *)pred_ptr); |
190 | 0 | const __m128i p1 = |
191 | 0 | _mm_loadl_epi64((const __m128i *)(pred_ptr + pred_stride)); |
192 | 0 | const __m128i d0 = _mm_sub_epi16(s0, p0); |
193 | 0 | const __m128i d1 = _mm_sub_epi16(s1, p1); |
194 | 0 | _mm_storel_epi64((__m128i *)diff_ptr, d0); |
195 | 0 | _mm_storel_epi64((__m128i *)(diff_ptr + diff_stride), d1); |
196 | 0 | src_ptr += src_stride << 1; |
197 | 0 | pred_ptr += pred_stride << 1; |
198 | 0 | diff_ptr += diff_stride << 1; |
199 | 0 | j -= 2; |
200 | 0 | } while (j != 0); |
201 | 0 | } |
202 | 0 | } |
203 | | #endif // CONFIG_VP9_HIGHBITDEPTH |
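
The high-bit-depth variant recovers the real 16-bit sample pointers with CONVERT_TO_SHORTPTR and subtracts source and prediction directly, handling 64/32/16/8/4-column blocks with progressively narrower loads (two rows per iteration for the 16-, 8- and 4-wide cases). The bd argument is ignored ((void)bd) because the 16-bit difference of 10- or 12-bit samples already fits in int16_t. The zero execution counts show this path was not exercised in this coverage run. A scalar sketch of the computation it implements (names illustrative, mirroring the generic C behaviour rather than the shipped code):

    /* Scalar sketch of the high-bit-depth residual; assumes 10/12-bit input so
     * the int16_t difference cannot overflow. */
    static void highbd_subtract_sketch(int rows, int cols, int16_t *diff,
                                       ptrdiff_t diff_stride, const uint16_t *src,
                                       ptrdiff_t src_stride, const uint16_t *pred,
                                       ptrdiff_t pred_stride) {
      int r, c;
      for (r = 0; r < rows; ++r) {
        for (c = 0; c < cols; ++c) diff[c] = (int16_t)(src[c] - pred[c]);
        diff += diff_stride;
        src += src_stride;
        pred += pred_stride;
      }
    }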