/src/aom/aom_dsp/x86/convolve_avx2.h
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) 2018, Alliance for Open Media. All rights reserved. |
3 | | * |
4 | | * This source code is subject to the terms of the BSD 2 Clause License and |
5 | | * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
6 | | * was not distributed with this source code in the LICENSE file, you can |
7 | | * obtain it at www.aomedia.org/license/software. If the Alliance for Open |
8 | | * Media Patent License 1.0 was not distributed with this source code in the |
9 | | * PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
10 | | */ |
11 | | |
12 | | #ifndef AOM_AOM_DSP_X86_CONVOLVE_AVX2_H_ |
13 | | #define AOM_AOM_DSP_X86_CONVOLVE_AVX2_H_ |
14 | | |
15 | | #include <immintrin.h> |
16 | | |
17 | | #include "aom_ports/mem.h" |
18 | | |
19 | | #include "av1/common/convolve.h" |
20 | | #include "av1/common/filter.h" |
21 | | |
22 | | // Shuffle masks for the 16-pixel-wide horizontal filter paths. |
23 | | DECLARE_ALIGNED(32, static const uint8_t, filt_global_avx2[]) = { |
24 | | 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 0, 1, 1, |
25 | | 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 2, 3, 3, 4, 4, 5, |
26 | | 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 2, 3, 3, 4, 4, 5, 5, 6, 6, |
27 | | 7, 7, 8, 8, 9, 9, 10, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, |
28 | | 10, 11, 11, 12, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, |
29 | | 12, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 6, 7, |
30 | | 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 |
31 | | }; |
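| | // Each 32-byte block of filt_global_avx2 is one _mm256_shuffle_epi8 mask, |
| | // repeated in both 128-bit lanes. Mask k gathers the overlapping byte pairs |
| | // (2k, 2k+1), (2k+1, 2k+2), ..., (2k+7, 2k+8) of a lane, so that |
| | // _mm256_maddubs_epi16 can multiply each pair by one pair of filter taps. |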
32 | | |
33 | | DECLARE_ALIGNED(32, static const uint8_t, filt_d4_global_avx2[]) = { |
34 | | 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 0, 1, 2, 3, 1, 2, |
35 | | 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, |
36 | | 7, 8, 9, 10, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, |
37 | | }; |
38 | | |
39 | | DECLARE_ALIGNED(32, static const uint8_t, filt4_d4_global_avx2[]) = { |
40 | | 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, |
41 | | 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, |
42 | | }; |
43 | | |
44 | | DECLARE_ALIGNED(32, static const uint8_t, filt_center_global_avx2[32]) = { |
45 | | 3, 255, 4, 255, 5, 255, 6, 255, 7, 255, 8, 255, 9, 255, 10, 255, |
46 | | 3, 255, 4, 255, 5, 255, 6, 255, 7, 255, 8, 255, 9, 255, 10, 255 |
47 | | }; |
48 | | |
49 | | DECLARE_ALIGNED(32, static const uint8_t, |
50 | | filt1_global_avx2[32]) = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, |
51 | | 6, 6, 7, 7, 8, 0, 1, 1, 2, 2, 3, |
52 | | 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; |
53 | | |
54 | | DECLARE_ALIGNED(32, static const uint8_t, |
55 | | filt2_global_avx2[32]) = { 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, |
56 | | 8, 8, 9, 9, 10, 2, 3, 3, 4, 4, 5, |
57 | | 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 }; |
58 | | |
59 | | DECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = { |
60 | | 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, |
61 | | 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 |
62 | | }; |
63 | | |
64 | | DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = { |
65 | | 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, |
66 | | 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 |
67 | | }; |
68 | | |
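| | // The CONVOLVE_SR_* macros below expand inside the including AVX2 |
| | // convolve implementations and rely on locals declared at the expansion |
| | // site: i, j, w, h, im_h, src_ptr/src_stride, im_block/im_stride, |
| | // dst/dst_stride, the coeffs_h/coeffs_v arrays, the filt[] masks, and the |
| | // rounding constants (round_const_h/round_shift_h, sum_round_v/sum_shift_v, |
| | // round_const_v/round_shift_v). |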
69 | | #define CONVOLVE_SR_HORIZONTAL_FILTER_4TAP \ |
70 | 0 | for (i = 0; i < (im_h - 2); i += 2) { \ |
71 | 0 | __m256i data = _mm256_castsi128_si256( \ |
72 | 0 | _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j])); \ |
73 | 0 | data = _mm256_inserti128_si256( \ |
74 | 0 | data, \ |
75 | 0 | _mm_loadu_si128( \ |
76 | 0 | (__m128i *)&src_ptr[(i * src_stride) + j + src_stride]), \ |
77 | 0 | 1); \ |
78 | 0 | __m256i res = convolve_lowbd_x_4tap(data, coeffs_h + 1, filt); \ |
79 | 0 | res = \ |
80 | 0 | _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); \ |
81 | 0 | _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); \ |
82 | 0 | } \ |
83 | 0 | __m256i data_1 = _mm256_castsi128_si256( \ |
84 | 0 | _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j])); \ |
85 | 0 | __m256i res = convolve_lowbd_x_4tap(data_1, coeffs_h + 1, filt); \ |
86 | 0 | res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); \ |
87 | 0 | _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); |
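| | // The loop above filters two source rows per iteration, one per 128-bit |
| | // lane; the statements after it handle the final odd row of the im_h-row |
| | // intermediate block. The 6-tap and 8-tap horizontal macros below follow |
| | // the same structure. |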
88 | | |
89 | | #define CONVOLVE_SR_VERTICAL_FILTER_4TAP \ |
90 | 0 | __m256i s[6]; \ |
91 | 0 | __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); \ |
92 | 0 | __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); \ |
93 | 0 | __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); \ |
94 | 0 | __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); \ |
95 | 0 | \ |
96 | 0 | s[0] = _mm256_unpacklo_epi16(src_0, src_1); \ |
97 | 0 | s[1] = _mm256_unpacklo_epi16(src_2, src_3); \ |
98 | 0 | s[3] = _mm256_unpackhi_epi16(src_0, src_1); \ |
99 | 0 | s[4] = _mm256_unpackhi_epi16(src_2, src_3); \ |
100 | 0 | \ |
101 | 0 | for (i = 0; i < h; i += 2) { \ |
102 | 0 | const int16_t *data = &im_block[i * im_stride]; \ |
103 | 0 | const __m256i s4 = _mm256_loadu_si256((__m256i *)(data + 4 * im_stride)); \ |
104 | 0 | const __m256i s5 = _mm256_loadu_si256((__m256i *)(data + 5 * im_stride)); \ |
105 | 0 | s[2] = _mm256_unpacklo_epi16(s4, s5); \ |
106 | 0 | s[5] = _mm256_unpackhi_epi16(s4, s5); \ |
107 | 0 | \ |
108 | 0 | __m256i res_a = convolve_4tap(s, coeffs_v + 1); \ |
109 | 0 | __m256i res_b = convolve_4tap(s + 3, coeffs_v + 1); \ |
110 | 0 | \ |
111 | 0 | res_a = \ |
112 | 0 | _mm256_sra_epi32(_mm256_add_epi32(res_a, sum_round_v), sum_shift_v); \ |
113 | 0 | res_b = \ |
114 | 0 | _mm256_sra_epi32(_mm256_add_epi32(res_b, sum_round_v), sum_shift_v); \ |
115 | 0 | const __m256i res_a_round = _mm256_sra_epi32( \ |
116 | 0 | _mm256_add_epi32(res_a, round_const_v), round_shift_v); \ |
117 | 0 | const __m256i res_b_round = _mm256_sra_epi32( \ |
118 | 0 | _mm256_add_epi32(res_b, round_const_v), round_shift_v); \ |
119 | 0 | const __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round); \ |
120 | 0 | const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit); \ |
121 | 0 | const __m128i res_0 = _mm256_castsi256_si128(res_8b); \ |
122 | 0 | const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1); \ |
123 | 0 | \ |
124 | 0 | __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j]; \ |
125 | 0 | __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride]; \ |
126 | 0 | if (w - j > 4) { \ |
127 | 0 | _mm_storel_epi64(p_0, res_0); \ |
128 | 0 | _mm_storel_epi64(p_1, res_1); \ |
129 | 0 | } else if (w == 4) { \ |
130 | 0 | xx_storel_32(p_0, res_0); \ |
131 | 0 | xx_storel_32(p_1, res_1); \ |
132 | 0 | } else { \ |
133 | 0 | *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0); \ |
134 | 0 | *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1); \ |
135 | 0 | } \ |
136 | 0 | \ |
137 | 0 | s[0] = s[1]; \ |
138 | 0 | s[1] = s[2]; \ |
139 | 0 | s[3] = s[4]; \ |
140 | 0 | s[4] = s[5]; \ |
141 | 0 | } |
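| | // s[] is a sliding window of vertically interleaved row pairs (low halves |
| | // in s[0..2], high halves in s[3..5]). Each iteration pulls in two new |
| | // intermediate rows, emits two output rows (choosing an 8-, 4-, or 2-byte |
| | // store from the packed result based on the remaining width), and shifts |
| | // the window down by two rows. The wider vertical macros below work the |
| | // same way with longer windows. |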
142 | | |
143 | | #define CONVOLVE_SR_HORIZONTAL_FILTER_6TAP \ |
144 | 0 | for (i = 0; i < (im_h - 2); i += 2) { \ |
145 | 0 | __m256i data = _mm256_castsi128_si256( \ |
146 | 0 | _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j])); \ |
147 | 0 | data = _mm256_inserti128_si256( \ |
148 | 0 | data, \ |
149 | 0 | _mm_loadu_si128( \ |
150 | 0 | (__m128i *)&src_ptr[(i * src_stride) + j + src_stride]), \ |
151 | 0 | 1); \ |
152 | 0 | \ |
153 | 0 | __m256i res = convolve_lowbd_x_6tap(data, coeffs_h, filt); \ |
154 | 0 | res = \ |
155 | 0 | _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); \ |
156 | 0 | _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); \ |
157 | 0 | } \ |
158 | 0 | \ |
159 | 0 | __m256i data_1 = _mm256_castsi128_si256( \ |
160 | 0 | _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j])); \ |
161 | 0 | \ |
162 | 0 | __m256i res = convolve_lowbd_x_6tap(data_1, coeffs_h, filt); \ |
163 | 0 | \ |
164 | 0 | res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); \ |
165 | 0 | \ |
166 | 0 | _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); |
167 | | |
168 | | #define CONVOLVE_SR_VERTICAL_FILTER_6TAP \ |
169 | 0 | __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); \ |
170 | 0 | __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); \ |
171 | 0 | __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); \ |
172 | 0 | __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); \ |
173 | 0 | \ |
174 | 0 | __m256i s[8]; \ |
175 | 0 | s[0] = _mm256_unpacklo_epi16(src_0, src_1); \ |
176 | 0 | s[1] = _mm256_unpacklo_epi16(src_2, src_3); \ |
177 | 0 | \ |
178 | 0 | s[3] = _mm256_unpackhi_epi16(src_0, src_1); \ |
179 | 0 | s[4] = _mm256_unpackhi_epi16(src_2, src_3); \ |
180 | 0 | \ |
181 | 0 | for (i = 0; i < h; i += 2) { \ |
182 | 0 | const int16_t *data = &im_block[i * im_stride]; \ |
183 | 0 | \ |
184 | 0 | const __m256i s6 = _mm256_loadu_si256((__m256i *)(data + 4 * im_stride)); \ |
185 | 0 | const __m256i s7 = _mm256_loadu_si256((__m256i *)(data + 5 * im_stride)); \ |
186 | 0 | \ |
187 | 0 | s[2] = _mm256_unpacklo_epi16(s6, s7); \ |
188 | 0 | s[5] = _mm256_unpackhi_epi16(s6, s7); \ |
189 | 0 | \ |
190 | 0 | __m256i res_a = convolve_6tap(s, coeffs_v); \ |
191 | 0 | __m256i res_b = convolve_6tap(s + 3, coeffs_v); \ |
192 | 0 | \ |
193 | 0 | res_a = \ |
194 | 0 | _mm256_sra_epi32(_mm256_add_epi32(res_a, sum_round_v), sum_shift_v); \ |
195 | 0 | res_b = \ |
196 | 0 | _mm256_sra_epi32(_mm256_add_epi32(res_b, sum_round_v), sum_shift_v); \ |
197 | 0 | \ |
198 | 0 | const __m256i res_a_round = _mm256_sra_epi32( \ |
199 | 0 | _mm256_add_epi32(res_a, round_const_v), round_shift_v); \ |
200 | 0 | const __m256i res_b_round = _mm256_sra_epi32( \ |
201 | 0 | _mm256_add_epi32(res_b, round_const_v), round_shift_v); \ |
202 | 0 | \ |
203 | 0 | const __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round); \ |
204 | 0 | const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit); \ |
205 | 0 | \ |
206 | 0 | const __m128i res_0 = _mm256_castsi256_si128(res_8b); \ |
207 | 0 | const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1); \ |
208 | 0 | \ |
209 | 0 | __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j]; \ |
210 | 0 | __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride]; \ |
211 | 0 | if (w - j > 4) { \ |
212 | 0 | _mm_storel_epi64(p_0, res_0); \ |
213 | 0 | _mm_storel_epi64(p_1, res_1); \ |
214 | 0 | } else if (w == 4) { \ |
215 | 0 | xx_storel_32(p_0, res_0); \ |
216 | 0 | xx_storel_32(p_1, res_1); \ |
217 | 0 | } else { \ |
218 | 0 | *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0); \ |
219 | 0 | *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1); \ |
220 | 0 | } \ |
221 | 0 | \ |
222 | 0 | s[0] = s[1]; \ |
223 | 0 | s[1] = s[2]; \ |
224 | 0 | \ |
225 | 0 | s[3] = s[4]; \ |
226 | 0 | s[4] = s[5]; \ |
227 | 0 | } |
228 | | |
229 | | #define CONVOLVE_SR_HORIZONTAL_FILTER_8TAP \ |
230 | 0 | for (i = 0; i < (im_h - 2); i += 2) { \ |
231 | 0 | __m256i data = _mm256_castsi128_si256( \ |
232 | 0 | _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j])); \ |
233 | 0 | data = _mm256_inserti128_si256( \ |
234 | 0 | data, \ |
235 | 0 | _mm_loadu_si128( \ |
236 | 0 | (__m128i *)&src_ptr[(i * src_stride) + j + src_stride]), \ |
237 | 0 | 1); \ |
238 | 0 | \ |
239 | 0 | __m256i res = convolve_lowbd_x(data, coeffs_h, filt); \ |
240 | 0 | res = \ |
241 | 0 | _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); \ |
242 | 0 | _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); \ |
243 | 0 | } \ |
244 | 0 | \ |
245 | 0 | __m256i data_1 = _mm256_castsi128_si256( \ |
246 | 0 | _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j])); \ |
247 | 0 | \ |
248 | 0 | __m256i res = convolve_lowbd_x(data_1, coeffs_h, filt); \ |
249 | 0 | \ |
250 | 0 | res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); \ |
251 | 0 | \ |
252 | 0 | _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); |
253 | | |
254 | | #define CONVOLVE_SR_VERTICAL_FILTER_8TAP \ |
255 | 0 | __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); \ |
256 | 0 | __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); \ |
257 | 0 | __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); \ |
258 | 0 | __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); \ |
259 | 0 | __m256i src_4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride)); \ |
260 | 0 | __m256i src_5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride)); \ |
261 | 0 | \ |
262 | 0 | __m256i s[8]; \ |
263 | 0 | s[0] = _mm256_unpacklo_epi16(src_0, src_1); \ |
264 | 0 | s[1] = _mm256_unpacklo_epi16(src_2, src_3); \ |
265 | 0 | s[2] = _mm256_unpacklo_epi16(src_4, src_5); \ |
266 | 0 | \ |
267 | 0 | s[4] = _mm256_unpackhi_epi16(src_0, src_1); \ |
268 | 0 | s[5] = _mm256_unpackhi_epi16(src_2, src_3); \ |
269 | 0 | s[6] = _mm256_unpackhi_epi16(src_4, src_5); \ |
270 | 0 | \ |
271 | 0 | for (i = 0; i < h; i += 2) { \ |
272 | 0 | const int16_t *data = &im_block[i * im_stride]; \ |
273 | 0 | \ |
274 | 0 | const __m256i s6 = _mm256_loadu_si256((__m256i *)(data + 6 * im_stride)); \ |
275 | 0 | const __m256i s7 = _mm256_loadu_si256((__m256i *)(data + 7 * im_stride)); \ |
276 | 0 | \ |
277 | 0 | s[3] = _mm256_unpacklo_epi16(s6, s7); \ |
278 | 0 | s[7] = _mm256_unpackhi_epi16(s6, s7); \ |
279 | 0 | \ |
280 | 0 | __m256i res_a = convolve(s, coeffs_v); \ |
281 | 0 | __m256i res_b = convolve(s + 4, coeffs_v); \ |
282 | 0 | \ |
283 | 0 | res_a = \ |
284 | 0 | _mm256_sra_epi32(_mm256_add_epi32(res_a, sum_round_v), sum_shift_v); \ |
285 | 0 | res_b = \ |
286 | 0 | _mm256_sra_epi32(_mm256_add_epi32(res_b, sum_round_v), sum_shift_v); \ |
287 | 0 | \ |
288 | 0 | const __m256i res_a_round = _mm256_sra_epi32( \ |
289 | 0 | _mm256_add_epi32(res_a, round_const_v), round_shift_v); \ |
290 | 0 | const __m256i res_b_round = _mm256_sra_epi32( \ |
291 | 0 | _mm256_add_epi32(res_b, round_const_v), round_shift_v); \ |
292 | 0 | \ |
293 | 0 | const __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round); \ |
294 | 0 | const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit); \ |
295 | 0 | \ |
296 | 0 | const __m128i res_0 = _mm256_castsi256_si128(res_8b); \ |
297 | 0 | const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1); \ |
298 | 0 | \ |
299 | 0 | __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j]; \ |
300 | 0 | __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride]; \ |
301 | 0 | if (w - j > 4) { \ |
302 | 0 | _mm_storel_epi64(p_0, res_0); \ |
303 | 0 | _mm_storel_epi64(p_1, res_1); \ |
304 | 0 | } else if (w == 4) { \ |
305 | 0 | xx_storel_32(p_0, res_0); \ |
306 | 0 | xx_storel_32(p_1, res_1); \ |
307 | 0 | } else { \ |
308 | 0 | *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0); \ |
309 | 0 | *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1); \ |
310 | 0 | } \ |
311 | 0 | \ |
312 | 0 | s[0] = s[1]; \ |
313 | 0 | s[1] = s[2]; \ |
314 | 0 | s[2] = s[3]; \ |
315 | 0 | \ |
316 | 0 | s[4] = s[5]; \ |
317 | 0 | s[5] = s[6]; \ |
318 | 0 | s[6] = s[7]; \ |
319 | 0 | } |
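| | // The vertical pass rounds in two steps, sum_round_v/sum_shift_v and then |
| | // round_const_v/round_shift_v, mirroring the two rounding stages of the |
| | // av1 convolve pipeline, before the 32-bit sums are packed down to 16 and |
| | // then 8 bits. |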
320 | | |
321 | | #define CONVOLVE_SR_HORIZONTAL_FILTER_12TAP \ |
322 | 0 | const __m256i v_zero = _mm256_setzero_si256(); \ |
323 | 0 | __m256i s[12]; \ |
324 | 0 | if (w <= 4) { \ |
325 | 0 | for (i = 0; i < im_h; i += 2) { \ |
326 | 0 | const __m256i data = _mm256_permute2x128_si256( \ |
327 | 0 | _mm256_castsi128_si256( \ |
328 | 0 | _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride + j]))), \ |
329 | 0 | _mm256_castsi128_si256(_mm_loadu_si128( \ |
330 | 0 | (__m128i *)(&src_ptr[i * src_stride + src_stride + j]))), \ |
331 | 0 | 0x20); \ |
332 | 0 | const __m256i s_16lo = _mm256_unpacklo_epi8(data, v_zero); \ |
333 | 0 | const __m256i s_16hi = _mm256_unpackhi_epi8(data, v_zero); \ |
334 | 0 | const __m256i s_lolo = _mm256_unpacklo_epi16(s_16lo, s_16lo); \ |
335 | 0 | const __m256i s_lohi = _mm256_unpackhi_epi16(s_16lo, s_16lo); \ |
336 | 0 | \ |
337 | 0 | const __m256i s_hilo = _mm256_unpacklo_epi16(s_16hi, s_16hi); \ |
338 | 0 | const __m256i s_hihi = _mm256_unpackhi_epi16(s_16hi, s_16hi); \ |
339 | 0 | \ |
340 | 0 | s[0] = _mm256_alignr_epi8(s_lohi, s_lolo, 2); \ |
341 | 0 | s[1] = _mm256_alignr_epi8(s_lohi, s_lolo, 10); \ |
342 | 0 | s[2] = _mm256_alignr_epi8(s_hilo, s_lohi, 2); \ |
343 | 0 | s[3] = _mm256_alignr_epi8(s_hilo, s_lohi, 10); \ |
344 | 0 | s[4] = _mm256_alignr_epi8(s_hihi, s_hilo, 2); \ |
345 | 0 | s[5] = _mm256_alignr_epi8(s_hihi, s_hilo, 10); \ |
346 | 0 | \ |
347 | 0 | const __m256i res_lo = convolve_12taps(s, coeffs_h); \ |
348 | 0 | \ |
349 | 0 | __m256i res_32b_lo = _mm256_sra_epi32( \ |
350 | 0 | _mm256_add_epi32(res_lo, round_const_h12), round_shift_h12); \ |
351 | 0 | __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo); \ |
352 | 0 | const __m128i res_0 = _mm256_extracti128_si256(res_16b_lo, 0); \ |
353 | 0 | const __m128i res_1 = _mm256_extracti128_si256(res_16b_lo, 1); \ |
354 | 0 | if (w > 2) { \ |
355 | 0 | _mm_storel_epi64((__m128i *)&im_block[i * im_stride], res_0); \ |
356 | 0 | _mm_storel_epi64((__m128i *)&im_block[i * im_stride + im_stride], \ |
357 | 0 | res_1); \ |
358 | 0 | } else { \ |
359 | 0 | uint32_t horiz_2; \ |
360 | 0 | horiz_2 = (uint32_t)_mm_cvtsi128_si32(res_0); \ |
361 | 0 | im_block[i * im_stride] = (uint16_t)horiz_2; \ |
362 | 0 | im_block[i * im_stride + 1] = (uint16_t)(horiz_2 >> 16); \ |
363 | 0 | horiz_2 = (uint32_t)_mm_cvtsi128_si32(res_1); \ |
364 | 0 | im_block[i * im_stride + im_stride] = (uint16_t)horiz_2; \ |
365 | 0 | im_block[i * im_stride + im_stride + 1] = (uint16_t)(horiz_2 >> 16); \ |
366 | 0 | } \ |
367 | 0 | } \ |
368 | 0 | } else { \ |
369 | 0 | for (i = 0; i < im_h; i++) { \ |
370 | 0 | const __m256i data = _mm256_permute2x128_si256( \ |
371 | 0 | _mm256_castsi128_si256( \ |
372 | 0 | _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride + j]))), \ |
373 | 0 | _mm256_castsi128_si256( \ |
374 | 0 | _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride + j + 4]))), \ |
375 | 0 | 0x20); \ |
376 | 0 | const __m256i s_16lo = _mm256_unpacklo_epi8(data, v_zero); \ |
377 | 0 | const __m256i s_16hi = _mm256_unpackhi_epi8(data, v_zero); \ |
378 | 0 | \ |
379 | 0 | const __m256i s_lolo = _mm256_unpacklo_epi16(s_16lo, s_16lo); \ |
380 | 0 | const __m256i s_lohi = _mm256_unpackhi_epi16(s_16lo, s_16lo); \ |
381 | 0 | \ |
382 | 0 | const __m256i s_hilo = _mm256_unpacklo_epi16(s_16hi, s_16hi); \ |
383 | 0 | const __m256i s_hihi = _mm256_unpackhi_epi16(s_16hi, s_16hi); \ |
384 | 0 | \ |
385 | 0 | s[0] = _mm256_alignr_epi8(s_lohi, s_lolo, 2); \ |
386 | 0 | s[1] = _mm256_alignr_epi8(s_lohi, s_lolo, 10); \ |
387 | 0 | s[2] = _mm256_alignr_epi8(s_hilo, s_lohi, 2); \ |
388 | 0 | s[3] = _mm256_alignr_epi8(s_hilo, s_lohi, 10); \ |
389 | 0 | s[4] = _mm256_alignr_epi8(s_hihi, s_hilo, 2); \ |
390 | 0 | s[5] = _mm256_alignr_epi8(s_hihi, s_hilo, 10); \ |
391 | 0 | \ |
392 | 0 | const __m256i res_lo = convolve_12taps(s, coeffs_h); \ |
393 | 0 | \ |
394 | 0 | __m256i res_32b_lo = _mm256_sra_epi32( \ |
395 | 0 | _mm256_add_epi32(res_lo, round_const_h12), round_shift_h12); \ |
396 | 0 | \ |
397 | 0 | __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo); \ |
398 | 0 | _mm_store_si128((__m128i *)&im_block[i * im_stride], \ |
399 | 0 | _mm256_extracti128_si256( \ |
400 | 0 | _mm256_permute4x64_epi64(res_16b_lo, 0x88), 0)); \ |
401 | 0 | } \ |
402 | 0 | } |
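| | // The unpack-with-self + _mm256_alignr_epi8 sequence above widens the |
| | // pixels to 16 bits, duplicates each one, and slices out the six |
| | // overlapping pixel-pair windows s[0]..s[5] (one per coefficient pair of |
| | // the 12-tap filter) that convolve_12taps feeds to _mm256_madd_epi16. |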
403 | | |
404 | | #define CONVOLVE_SR_VERTICAL_FILTER_12TAP \ |
405 | 0 | __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); \ |
406 | 0 | __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); \ |
407 | 0 | __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); \ |
408 | 0 | __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); \ |
409 | 0 | __m256i src_4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride)); \ |
410 | 0 | __m256i src_5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride)); \ |
411 | 0 | __m256i src_6 = _mm256_loadu_si256((__m256i *)(im_block + 6 * im_stride)); \ |
412 | 0 | __m256i src_7 = _mm256_loadu_si256((__m256i *)(im_block + 7 * im_stride)); \ |
413 | 0 | __m256i src_8 = _mm256_loadu_si256((__m256i *)(im_block + 8 * im_stride)); \ |
414 | 0 | __m256i src_9 = _mm256_loadu_si256((__m256i *)(im_block + 9 * im_stride)); \ |
415 | 0 | \ |
416 | 0 | s[0] = _mm256_unpacklo_epi16(src_0, src_1); \ |
417 | 0 | s[1] = _mm256_unpacklo_epi16(src_2, src_3); \ |
418 | 0 | s[2] = _mm256_unpacklo_epi16(src_4, src_5); \ |
419 | 0 | s[3] = _mm256_unpacklo_epi16(src_6, src_7); \ |
420 | 0 | s[4] = _mm256_unpacklo_epi16(src_8, src_9); \ |
421 | 0 | \ |
422 | 0 | s[6] = _mm256_unpackhi_epi16(src_0, src_1); \ |
423 | 0 | s[7] = _mm256_unpackhi_epi16(src_2, src_3); \ |
424 | 0 | s[8] = _mm256_unpackhi_epi16(src_4, src_5); \ |
425 | 0 | s[9] = _mm256_unpackhi_epi16(src_6, src_7); \ |
426 | 0 | s[10] = _mm256_unpackhi_epi16(src_8, src_9); \ |
427 | 0 | \ |
428 | 0 | for (i = 0; i < h; i += 2) { \ |
429 | 0 | const int16_t *data = &im_block[i * im_stride]; \ |
430 | 0 | \ |
431 | 0 | const __m256i s6 = _mm256_loadu_si256((__m256i *)(data + 10 * im_stride)); \ |
432 | 0 | const __m256i s7 = _mm256_loadu_si256((__m256i *)(data + 11 * im_stride)); \ |
433 | 0 | \ |
434 | 0 | s[5] = _mm256_unpacklo_epi16(s6, s7); \ |
435 | 0 | s[11] = _mm256_unpackhi_epi16(s6, s7); \ |
436 | 0 | \ |
437 | 0 | __m256i res_a = convolve_12taps(s, coeffs_v); \ |
438 | 0 | __m256i res_b = convolve_12taps(s + 6, coeffs_v); \ |
439 | 0 | \ |
440 | 0 | res_a = \ |
441 | 0 | _mm256_sra_epi32(_mm256_add_epi32(res_a, sum_round_v), sum_shift_v); \ |
442 | 0 | res_b = \ |
443 | 0 | _mm256_sra_epi32(_mm256_add_epi32(res_b, sum_round_v), sum_shift_v); \ |
444 | 0 | \ |
445 | 0 | const __m256i res_a_round = _mm256_sra_epi32( \ |
446 | 0 | _mm256_add_epi32(res_a, round_const_v), round_shift_v); \ |
447 | 0 | const __m256i res_b_round = _mm256_sra_epi32( \ |
448 | 0 | _mm256_add_epi32(res_b, round_const_v), round_shift_v); \ |
449 | 0 | \ |
450 | 0 | const __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round); \ |
451 | 0 | const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit); \ |
452 | 0 | \ |
453 | 0 | const __m128i res_0 = _mm256_castsi256_si128(res_8b); \ |
454 | 0 | const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1); \ |
455 | 0 | \ |
456 | 0 | __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j]; \ |
457 | 0 | __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride]; \ |
458 | 0 | if (w - j > 4) { \ |
459 | 0 | _mm_storel_epi64(p_0, res_0); \ |
460 | 0 | _mm_storel_epi64(p_1, res_1); \ |
461 | 0 | } else if (w == 4) { \ |
462 | 0 | xx_storel_32(p_0, res_0); \ |
463 | 0 | xx_storel_32(p_1, res_1); \ |
464 | 0 | } else { \ |
465 | 0 | *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0); \ |
466 | 0 | *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1); \ |
467 | 0 | } \ |
468 | 0 | \ |
469 | 0 | s[0] = s[1]; \ |
470 | 0 | s[1] = s[2]; \ |
471 | 0 | s[2] = s[3]; \ |
472 | 0 | s[3] = s[4]; \ |
473 | 0 | s[4] = s[5]; \ |
474 | 0 | \ |
475 | 0 | s[6] = s[7]; \ |
476 | 0 | s[7] = s[8]; \ |
477 | 0 | s[8] = s[9]; \ |
478 | 0 | s[9] = s[10]; \ |
479 | 0 | s[10] = s[11]; \ |
480 | 0 | } |
481 | | |
482 | | #define DIST_WTD_CONVOLVE_HORIZONTAL_FILTER_8TAP \ |
483 | 281k | do { \ |
484 | 4.49M | for (i = 0; i < im_h; i += 2) { \ |
485 | 4.21M | __m256i data = \ |
486 | 4.21M | _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)src_h)); \ |
487 | 4.21M | if (i + 1 < im_h) \ |
488 | 4.21M | data = _mm256_inserti128_si256( \ |
489 | 4.21M | data, _mm_loadu_si128((__m128i *)(src_h + src_stride)), 1); \ |
490 | 4.21M | src_h += (src_stride << 1); \ |
491 | 4.21M | __m256i res = convolve_lowbd_x(data, coeffs_x, filt); \ |
492 | 4.21M | \ |
493 | 4.21M | res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), \ |
494 | 4.21M | round_shift_h); \ |
495 | 4.21M | \ |
496 | 4.21M | _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); \ |
497 | 4.21M | } \ |
498 | 281k | } while (0) |
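| | // Here the second-row load is guarded by (i + 1 < im_h), so an odd |
| | // trailing row never reads past the end of the source block. |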
499 | | |
500 | | #define DIST_WTD_CONVOLVE_VERTICAL_FILTER_8TAP \ |
501 | 367k | do { \ |
502 | 367k | __m256i s[8]; \ |
503 | 367k | __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); \ |
504 | 367k | __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); \ |
505 | 367k | __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); \ |
506 | 367k | __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); \ |
507 | 367k | __m256i s4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride)); \ |
508 | 367k | __m256i s5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride)); \ |
509 | 367k | \ |
510 | 367k | s[0] = _mm256_unpacklo_epi16(s0, s1); \ |
511 | 367k | s[1] = _mm256_unpacklo_epi16(s2, s3); \ |
512 | 367k | s[2] = _mm256_unpacklo_epi16(s4, s5); \ |
513 | 367k | \ |
514 | 367k | s[4] = _mm256_unpackhi_epi16(s0, s1); \ |
515 | 367k | s[5] = _mm256_unpackhi_epi16(s2, s3); \ |
516 | 367k | s[6] = _mm256_unpackhi_epi16(s4, s5); \ |
517 | 367k | \ |
518 | 3.86M | for (i = 0; i < h; i += 2) { \ |
519 | 3.49M | const int16_t *data = &im_block[i * im_stride]; \ |
520 | 3.49M | \ |
521 | 3.49M | const __m256i s6 = \ |
522 | 3.49M | _mm256_loadu_si256((__m256i *)(data + 6 * im_stride)); \ |
523 | 3.49M | const __m256i s7 = \ |
524 | 3.49M | _mm256_loadu_si256((__m256i *)(data + 7 * im_stride)); \ |
525 | 3.49M | \ |
526 | 3.49M | s[3] = _mm256_unpacklo_epi16(s6, s7); \ |
527 | 3.49M | s[7] = _mm256_unpackhi_epi16(s6, s7); \ |
528 | 3.49M | \ |
529 | 3.49M | const __m256i res_a = convolve(s, coeffs_y); \ |
530 | 3.49M | const __m256i res_a_round = _mm256_sra_epi32( \ |
531 | 3.49M | _mm256_add_epi32(res_a, round_const_v), round_shift_v); \ |
532 | 3.49M | \ |
533 | 3.49M | if (w - j > 4) { \ |
534 | 3.29M | const __m256i res_b = convolve(s + 4, coeffs_y); \ |
535 | 3.29M | const __m256i res_b_round = _mm256_sra_epi32( \ |
536 | 3.29M | _mm256_add_epi32(res_b, round_const_v), round_shift_v); \ |
537 | 3.29M | const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_b_round); \ |
538 | 3.29M | const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const); \ |
539 | 3.29M | \ |
540 | 3.29M | if (do_average) { \ |
541 | 1.51M | const __m256i data_ref_0 = \ |
542 | 1.51M | load_line2_avx2(&dst[i * dst_stride + j], \ |
543 | 1.51M | &dst[i * dst_stride + j + dst_stride]); \ |
544 | 1.51M | const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned, \ |
545 | 1.51M | &wt, use_dist_wtd_comp_avg); \ |
546 | 1.51M | \ |
547 | 1.51M | const __m256i round_result = convolve_rounding( \ |
548 | 1.51M | &comp_avg_res, &offset_const, &rounding_const, rounding_shift); \ |
549 | 1.51M | \ |
550 | 1.51M | const __m256i res_8 = \ |
551 | 1.51M | _mm256_packus_epi16(round_result, round_result); \ |
552 | 1.51M | const __m128i res_0 = _mm256_castsi256_si128(res_8); \ |
553 | 1.51M | const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); \ |
554 | 1.51M | \ |
555 | 1.51M | _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); \ |
556 | 1.51M | _mm_storel_epi64( \ |
557 | 1.51M | (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); \ |
558 | 1.77M | } else { \ |
559 | 1.77M | const __m128i res_0 = _mm256_castsi256_si128(res_unsigned); \ |
560 | 1.77M | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); \ |
561 | 1.77M | \ |
562 | 1.77M | const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1); \ |
563 | 1.77M | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), \ |
564 | 1.77M | res_1); \ |
565 | 1.77M | } \ |
566 | 3.29M | } else { \ |
567 | 205k | const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_a_round); \ |
568 | 205k | const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const); \ |
569 | 205k | \ |
570 | 205k | if (do_average) { \ |
571 | 107k | const __m256i data_ref_0 = \ |
572 | 107k | load_line2_avx2(&dst[i * dst_stride + j], \ |
573 | 107k | &dst[i * dst_stride + j + dst_stride]); \ |
574 | 107k | \ |
575 | 107k | const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned, \ |
576 | 107k | &wt, use_dist_wtd_comp_avg); \ |
577 | 107k | \ |
578 | 107k | const __m256i round_result = convolve_rounding( \ |
579 | 107k | &comp_avg_res, &offset_const, &rounding_const, rounding_shift); \ |
580 | 107k | \ |
581 | 107k | const __m256i res_8 = \ |
582 | 107k | _mm256_packus_epi16(round_result, round_result); \ |
583 | 107k | const __m128i res_0 = _mm256_castsi256_si128(res_8); \ |
584 | 107k | const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); \ |
585 | 107k | \ |
586 | 107k | *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0); \ |
587 | 107k | *(int *)(&dst0[i * dst_stride0 + j + dst_stride0]) = \ |
588 | 107k | _mm_cvtsi128_si32(res_1); \ |
589 | 107k | \ |
590 | 107k | } else { \ |
591 | 97.9k | const __m128i res_0 = _mm256_castsi256_si128(res_unsigned); \ |
592 | 97.9k | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); \ |
593 | 97.9k | \ |
594 | 97.9k | const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1); \ |
595 | 97.9k | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), \ |
596 | 97.9k | res_1); \ |
597 | 97.9k | } \ |
598 | 205k | } \ |
599 | 3.49M | \ |
600 | 3.49M | s[0] = s[1]; \ |
601 | 3.49M | s[1] = s[2]; \ |
602 | 3.49M | s[2] = s[3]; \ |
603 | 3.49M | \ |
604 | 3.49M | s[4] = s[5]; \ |
605 | 3.49M | s[5] = s[6]; \ |
606 | 3.49M | s[6] = s[7]; \ |
607 | 3.49M | } \ |
608 | 367k | } while (0) |
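| | // When do_average is set, the 16-bit values already in dst (the compound |
| | // buffer) are combined with this pass's result via comp_avg (optionally |
| | // distance-weighted), rounded, and emitted as 8-bit pixels to dst0; |
| | // otherwise the 16-bit results are stored to dst for a later averaging |
| | // pass. |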
609 | | |
610 | | static inline void prepare_coeffs_lowbd( |
611 | | const InterpFilterParams *const filter_params, const int subpel_q4, |
612 | 496k | __m256i *const coeffs /* [4] */) { |
613 | 496k | const int16_t *const filter = av1_get_interp_filter_subpel_kernel( |
614 | 496k | filter_params, subpel_q4 & SUBPEL_MASK); |
615 | 496k | const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); |
616 | 496k | const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8); |
617 | | |
618 | | // Right-shift all filter coefficients by 1 to reduce the bits required. |
619 | | // The extra right shift is compensated for at the end, when the result |
620 | | // is rounded. |
621 | | // Since all filter coefficients are even, the halving does not change |
622 | | // the final result. |
623 | 496k | assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), |
624 | 496k | _mm_set1_epi16((short)0xffff))); |
625 | | |
626 | 496k | const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1); |
627 | | |
628 | | // coeffs 0 1 0 1 0 1 0 1 |
629 | 496k | coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0200u)); |
630 | | // coeffs 2 3 2 3 2 3 2 3 |
631 | 496k | coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0604u)); |
632 | | // coeffs 4 5 4 5 4 5 4 5 |
633 | 496k | coeffs[2] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0a08u)); |
634 | | // coeffs 6 7 6 7 6 7 6 7 |
635 | 496k | coeffs[3] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0e0cu)); |
636 | 496k | }
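| | // Worked example of the halving (tap values illustrative): a half-pel |
| | // kernel such as { 0, 2, -14, 76, 76, -14, 2, 0 } (sum 128) becomes |
| | // { 0, 1, -7, 38, 38, -7, 1, 0 } (sum 64). The halving also keeps |
| | // _mm256_maddubs_epi16, which saturates to int16, in range: |
| | // 255 * 76 + 255 * 76 = 38760 would exceed 32767, while |
| | // 255 * 38 + 255 * 38 = 19380 does not. |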
637 | | |
638 | | static inline void prepare_coeffs_6t_lowbd( |
639 | | const InterpFilterParams *const filter_params, const int subpel_q4, |
640 | 0 | __m256i *const coeffs /* [4] */) { |
641 | 0 | const int16_t *const filter = av1_get_interp_filter_subpel_kernel( |
642 | 0 | filter_params, subpel_q4 & SUBPEL_MASK); |
643 | 0 | const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); |
644 | 0 | const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8); |
645 | | |
646 | | // Right-shift all filter coefficients by 1 to reduce the bits required. |
647 | | // The extra right shift is compensated for at the end, when the result |
648 | | // is rounded. |
649 | | // Since all filter coefficients are even, the halving does not change |
650 | | // the final result. |
651 | 0 | assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), |
652 | 0 | _mm_set1_epi16((int16_t)0xffff))); |
653 | | |
654 | 0 | const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1); |
655 | | |
656 | | // coeffs 1 2 1 2 1 2 1 2 |
657 | 0 | coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0402u)); |
658 | | // coeffs 3 4 3 4 3 4 3 4 |
659 | 0 | coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0806u)); |
660 | | // coeffs 5 6 5 6 5 6 5 6 |
661 | 0 | coeffs[2] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0c0au)); |
662 | 0 | }
663 | | |
664 | | static inline void prepare_coeffs_6t( |
665 | | const InterpFilterParams *const filter_params, const int subpel_q4, |
666 | 0 | __m256i *const coeffs /* [4] */) { |
667 | 0 | const int16_t *filter = av1_get_interp_filter_subpel_kernel( |
668 | 0 | filter_params, subpel_q4 & SUBPEL_MASK); |
669 | 0 |
670 | 0 | const __m128i coeff_8 = _mm_loadu_si128((__m128i *)(filter + 1)); |
671 | 0 | const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8); |
672 | | |
673 | | // coeffs 1 2 1 2 1 2 1 2 |
674 | 0 | coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00); |
675 | | // coeffs 3 4 3 4 3 4 3 4 |
676 | 0 | coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55); |
677 | | // coeffs 5 6 5 6 5 6 5 6 |
678 | 0 | coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa); |
679 | 0 | }
680 | | |
681 | | static inline void prepare_coeffs(const InterpFilterParams *const filter_params, |
682 | | const int subpel_q4, |
683 | 10.1M | __m256i *const coeffs /* [4] */) { |
684 | 10.1M | const int16_t *filter = av1_get_interp_filter_subpel_kernel( |
685 | 10.1M | filter_params, subpel_q4 & SUBPEL_MASK); |
686 | | |
687 | 10.1M | const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter); |
688 | 10.1M | const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8); |
689 | | |
690 | | // coeffs 0 1 0 1 0 1 0 1 |
691 | 10.1M | coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00); |
692 | | // coeffs 2 3 2 3 2 3 2 3 |
693 | 10.1M | coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55); |
694 | | // coeffs 4 5 4 5 4 5 4 5 |
695 | 10.1M | coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa); |
696 | | // coeffs 6 7 6 7 6 7 6 7 |
697 | 10.1M | coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff); |
698 | 10.1M | }
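| | // Unlike prepare_coeffs_lowbd, which packs the low bytes of the halved |
| | // taps for _mm256_maddubs_epi16, prepare_coeffs keeps full 16-bit |
| | // coefficient pairs (one pair per 32-bit element) for use with |
| | // _mm256_madd_epi16 and 32-bit accumulation. |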
699 | | |
700 | | static inline void prepare_coeffs_12taps( |
701 | | const InterpFilterParams *const filter_params, const int subpel_q4, |
702 | 0 | __m256i *const coeffs /* [4] */) { |
703 | 0 | const int16_t *filter = av1_get_interp_filter_subpel_kernel( |
704 | 0 | filter_params, subpel_q4 & SUBPEL_MASK); |
705 | 0 |
706 | 0 | __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter); |
707 | 0 | __m256i coeff = _mm256_broadcastsi128_si256(coeff_8); |
708 | | |
709 | | // coeffs 0 1 0 1 0 1 0 1 |
710 | 0 | coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00); |
711 | | // coeffs 2 3 2 3 2 3 2 3 |
712 | 0 | coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55); |
713 | | // coeffs 4 5 4 5 4 5 4 5 |
714 | 0 | coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa); |
715 | | // coeffs 6 7 6 7 6 7 6 7 |
716 | 0 | coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff); |
717 | | // coeffs 8 9 10 11 0 0 0 0 |
718 | 0 | coeff_8 = _mm_loadl_epi64((__m128i *)(filter + 8)); |
719 | 0 | coeff = _mm256_broadcastq_epi64(coeff_8); |
720 | 0 | coeffs[4] = _mm256_shuffle_epi32(coeff, 0x00); // coeffs 8 9 8 9 8 9 8 9 |
721 | 0 | coeffs[5] = _mm256_shuffle_epi32(coeff, 0x55); // coeffs 10 11 10 11 ... 10 11 |
722 | 0 | }
723 | | |
724 | | static inline __m256i convolve_lowbd(const __m256i *const s, |
725 | 24.1M | const __m256i *const coeffs) { |
726 | 24.1M | const __m256i res_01 = _mm256_maddubs_epi16(s[0], coeffs[0]); |
727 | 24.1M | const __m256i res_23 = _mm256_maddubs_epi16(s[1], coeffs[1]); |
728 | 24.1M | const __m256i res_45 = _mm256_maddubs_epi16(s[2], coeffs[2]); |
729 | 24.1M | const __m256i res_67 = _mm256_maddubs_epi16(s[3], coeffs[3]); |
730 | | |
731 | | // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 |
732 | 24.1M | const __m256i res = _mm256_add_epi16(_mm256_add_epi16(res_01, res_45), |
733 | 24.1M | _mm256_add_epi16(res_23, res_67)); |
734 | | |
735 | 24.1M | return res; |
736 | 24.1M | }
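| | // Each _mm256_maddubs_epi16 above multiplies unsigned pixel bytes by |
| | // signed coefficient bytes and sums adjacent products into int16 lanes; |
| | // the four partial sums together form the full 8-tap response at the |
| | // halved coefficient scale. |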
737 | | |
738 | | static inline __m256i convolve_lowbd_6tap(const __m256i *const s, |
739 | 0 | const __m256i *const coeffs) { |
740 | 0 | const __m256i res_01 = _mm256_maddubs_epi16(s[0], coeffs[0]); |
741 | 0 | const __m256i res_23 = _mm256_maddubs_epi16(s[1], coeffs[1]); |
742 | 0 | const __m256i res_45 = _mm256_maddubs_epi16(s[2], coeffs[2]); |
743 | | |
744 | | // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 |
745 | 0 | const __m256i res = |
746 | 0 | _mm256_add_epi16(_mm256_add_epi16(res_01, res_45), res_23); |
747 | 0 |
748 | 0 | return res; |
749 | 0 | }
750 | | |
751 | | static inline __m256i convolve_lowbd_4tap(const __m256i *const s, |
752 | 3.93M | const __m256i *const coeffs) { |
753 | 3.93M | const __m256i res_23 = _mm256_maddubs_epi16(s[0], coeffs[0]); |
754 | 3.93M | const __m256i res_45 = _mm256_maddubs_epi16(s[1], coeffs[1]); |
755 | | |
756 | | // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 |
757 | 3.93M | const __m256i res = _mm256_add_epi16(res_45, res_23); |
758 | | |
759 | 3.93M | return res; |
760 | 3.93M | }
761 | | |
762 | | static inline __m256i convolve_6tap(const __m256i *const s, |
763 | 0 | const __m256i *const coeffs) { |
764 | 0 | const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]); |
765 | 0 | const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]); |
766 | 0 | const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]); |
767 | 0 |
768 | 0 | const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1), res_2); |
769 | 0 |
770 | 0 | return res; |
771 | 0 | }
772 | | |
773 | | static inline __m256i convolve_12taps(const __m256i *const s, |
774 | 0 | const __m256i *const coeffs) { |
775 | 0 | const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]); |
776 | 0 | const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]); |
777 | 0 | const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]); |
778 | 0 | const __m256i res_3 = _mm256_madd_epi16(s[3], coeffs[3]); |
779 | 0 | const __m256i res_4 = _mm256_madd_epi16(s[4], coeffs[4]); |
780 | 0 | const __m256i res_5 = _mm256_madd_epi16(s[5], coeffs[5]); |
781 | 0 |
782 | 0 | const __m256i res1 = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1), |
783 | 0 | _mm256_add_epi32(res_2, res_3)); |
784 | 0 | const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_4, res_5), res1); |
785 | 0 |
786 | 0 | return res; |
787 | 0 | }
788 | | |
789 | | static inline __m256i convolve(const __m256i *const s, |
790 | 288M | const __m256i *const coeffs) { |
791 | 288M | const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]); |
792 | 288M | const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]); |
793 | 288M | const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]); |
794 | 288M | const __m256i res_3 = _mm256_madd_epi16(s[3], coeffs[3]); |
795 | | |
796 | 288M | const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1), |
797 | 288M | _mm256_add_epi32(res_2, res_3)); |
798 | | |
799 | 288M | return res; |
800 | 288M | }
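| | // _mm256_madd_epi16 multiplies the vertically interleaved 16-bit row |
| | // pairs by the matching coefficient pairs and accumulates to 32 bits, |
| | // the precision needed once the inputs are already-filtered 16-bit |
| | // intermediate rows. |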
801 | | |
802 | | static inline __m256i convolve_4tap(const __m256i *const s, |
803 | 288k | const __m256i *const coeffs) { |
804 | 288k | const __m256i res_1 = _mm256_madd_epi16(s[0], coeffs[0]); |
805 | 288k | const __m256i res_2 = _mm256_madd_epi16(s[1], coeffs[1]); |
806 | | |
807 | 288k | const __m256i res = _mm256_add_epi32(res_1, res_2); |
808 | 288k | return res; |
809 | 288k | }
810 | | |
811 | | static inline __m256i convolve_lowbd_x(const __m256i data, |
812 | | const __m256i *const coeffs, |
813 | 23.2M | const __m256i *const filt) { |
814 | 23.2M | __m256i s[4]; |
815 | | |
816 | 23.2M | s[0] = _mm256_shuffle_epi8(data, filt[0]); |
817 | 23.2M | s[1] = _mm256_shuffle_epi8(data, filt[1]); |
818 | 23.2M | s[2] = _mm256_shuffle_epi8(data, filt[2]); |
819 | 23.2M | s[3] = _mm256_shuffle_epi8(data, filt[3]); |
820 | | |
821 | 23.2M | return convolve_lowbd(s, coeffs); |
822 | 23.2M | }
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_lowbd_x
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_lowbd_x
Unexecuted instantiation: convolve_2d_avx2.c:convolve_lowbd_x
Unexecuted instantiation: convolve_avx2.c:convolve_lowbd_x
jnt_convolve_avx2.c:convolve_lowbd_x (executed 6.94M times, lines 813-822)
wiener_convolve_avx2.c:convolve_lowbd_x (executed 16.3M times, lines 813-822)
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_lowbd_x
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_lowbd_x
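Behind the shuffles, a scalar sketch of the per-pixel result of this horizontal low-bitdepth path: the four filt[] masks gather sliding byte pairs so that convolve_lowbd() (defined earlier in this header, not shown here) can reduce them. The name below is hypothetical, and any intermediate 16-bit saturation in the SIMD path is ignored:

#include <stdint.h>

/* Hypothetical scalar model of one output of the horizontal low-bitdepth
 * pass: an 8-tap dot product over a sliding window of unsigned pixels. */
static int32_t hconvolve_8tap_scalar(const uint8_t *src, const int16_t f[8]) {
  int32_t sum = 0;
  for (int k = 0; k < 8; ++k) sum += (int32_t)src[k] * f[k];
  return sum;
}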
823 | | |
824 | | static inline __m256i convolve_lowbd_x_6tap(const __m256i data, |
825 | | const __m256i *const coeffs, |
826 | 0 | const __m256i *const filt) { |
827 | 0 | __m256i s[4]; |
828 | |
829 | 0 | s[0] = _mm256_shuffle_epi8(data, filt[0]); |
830 | 0 | s[1] = _mm256_shuffle_epi8(data, filt[1]); |
831 | 0 | s[2] = _mm256_shuffle_epi8(data, filt[2]); |
832 | |
833 | 0 | return convolve_lowbd_6tap(s, coeffs); |
834 | 0 | }
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_lowbd_x_6tap
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_lowbd_x_6tap
Unexecuted instantiation: convolve_2d_avx2.c:convolve_lowbd_x_6tap
Unexecuted instantiation: convolve_avx2.c:convolve_lowbd_x_6tap
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_lowbd_x_6tap
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_lowbd_x_6tap
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_lowbd_x_6tap
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_lowbd_x_6tap
835 | | |
836 | | static inline __m256i convolve_lowbd_x_4tap(const __m256i data, |
837 | | const __m256i *const coeffs, |
838 | 3.61M | const __m256i *const filt) { |
839 | 3.61M | __m256i s[2]; |
840 | | |
841 | 3.61M | s[0] = _mm256_shuffle_epi8(data, filt[0]); |
842 | 3.61M | s[1] = _mm256_shuffle_epi8(data, filt[1]); |
843 | | |
844 | 3.61M | return convolve_lowbd_4tap(s, coeffs); |
845 | 3.61M | }
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_lowbd_x_4tap
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_lowbd_x_4tap
Unexecuted instantiation: convolve_2d_avx2.c:convolve_lowbd_x_4tap
convolve_avx2.c:convolve_lowbd_x_4tap (executed 613k times, lines 838-845)
jnt_convolve_avx2.c:convolve_lowbd_x_4tap (executed 3.00M times, lines 838-845)
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_lowbd_x_4tap
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_lowbd_x_4tap
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_lowbd_x_4tap
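A hypothetical usage sketch for convolve_lowbd_x_4tap(), mirroring the one-row-per-128-bit-lane load pattern its callers use; coeffs and filt are assumed to be prepared as those callers do, and the round/shift/pack steps that follow in real callers are elided:

#include <immintrin.h>
#include <stdint.h>

/* Hypothetical caller sketch (assumes this header is included): filter
 * pixels from two rows in one call by placing one row per 128-bit lane. */
static __m256i hfilter_4tap_two_rows(const uint8_t *row0, const uint8_t *row1,
                                     const __m256i coeffs[2],
                                     const __m256i filt[2]) {
  __m256i data =
      _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)row0));
  data = _mm256_inserti128_si256(data,
                                 _mm_loadu_si128((const __m128i *)row1), 1);
  return convolve_lowbd_x_4tap(data, coeffs, filt);
}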
846 | | |
847 | | static inline void add_store_aligned_256(CONV_BUF_TYPE *const dst, |
848 | | const __m256i *const res, |
849 | 0 | const int do_average) { |
850 | 0 | __m256i d; |
851 | 0 | if (do_average) { |
852 | 0 | d = _mm256_load_si256((__m256i *)dst); |
853 | 0 | d = _mm256_add_epi32(d, *res); |
854 | 0 | d = _mm256_srai_epi32(d, 1); |
855 | 0 | } else { |
856 | 0 | d = *res; |
857 | 0 | } |
858 | 0 | _mm256_store_si256((__m256i *)dst, d); |
859 | 0 | }
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:add_store_aligned_256
Unexecuted instantiation: highbd_convolve_avx2.c:add_store_aligned_256
Unexecuted instantiation: convolve_2d_avx2.c:add_store_aligned_256
Unexecuted instantiation: convolve_avx2.c:add_store_aligned_256
Unexecuted instantiation: jnt_convolve_avx2.c:add_store_aligned_256
Unexecuted instantiation: wiener_convolve_avx2.c:add_store_aligned_256
Unexecuted instantiation: highbd_convolve_2d_avx2.c:add_store_aligned_256
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:add_store_aligned_256
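A scalar sketch of add_store_aligned_256() with hypothetical naming; the element is treated as 32-bit here, matching the epi32 intrinsics above:

#include <stdint.h>

/* Hypothetical scalar equivalent for a single element: either overwrite the
 * destination, or average the new value with what is already stored (the
 * arithmetic shift rounds toward negative infinity). */
static void add_store_scalar(int32_t *dst, int32_t res, int do_average) {
  *dst = do_average ? (*dst + res) >> 1 : res;
}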
860 | | |
861 | | static inline __m256i comp_avg(const __m256i *const data_ref_0, |
862 | | const __m256i *const res_unsigned, |
863 | | const __m256i *const wt, |
864 | 130M | const int use_dist_wtd_comp_avg) { |
865 | 130M | __m256i res; |
866 | 130M | if (use_dist_wtd_comp_avg) { |
867 | 2.49M | const __m256i data_lo = _mm256_unpacklo_epi16(*data_ref_0, *res_unsigned); |
868 | 2.49M | const __m256i data_hi = _mm256_unpackhi_epi16(*data_ref_0, *res_unsigned); |
869 | | |
870 | 2.49M | const __m256i wt_res_lo = _mm256_madd_epi16(data_lo, *wt); |
871 | 2.49M | const __m256i wt_res_hi = _mm256_madd_epi16(data_hi, *wt); |
872 | | |
873 | 2.49M | const __m256i res_lo = _mm256_srai_epi32(wt_res_lo, DIST_PRECISION_BITS); |
874 | 2.49M | const __m256i res_hi = _mm256_srai_epi32(wt_res_hi, DIST_PRECISION_BITS); |
875 | | |
876 | 2.49M | res = _mm256_packs_epi32(res_lo, res_hi); |
877 | 128M | } else { |
878 | 128M | const __m256i wt_res = _mm256_add_epi16(*data_ref_0, *res_unsigned); |
879 | 128M | res = _mm256_srai_epi16(wt_res, 1); |
880 | 128M | } |
881 | 130M | return res; |
882 | 130M | }
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:comp_avg
Unexecuted instantiation: highbd_convolve_avx2.c:comp_avg
Unexecuted instantiation: convolve_2d_avx2.c:comp_avg
Unexecuted instantiation: convolve_avx2.c:comp_avg
jnt_convolve_avx2.c:comp_avg (executed 130M times, lines 864-882; per-line counts match the main listing above)
Unexecuted instantiation: wiener_convolve_avx2.c:comp_avg
Unexecuted instantiation: highbd_convolve_2d_avx2.c:comp_avg
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:comp_avg
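comp_avg() blends a stored reference intermediate with a new one. A scalar sketch, with a hypothetical helper name and an assumed DIST_PRECISION_BITS value as noted in the comment:

#include <stdint.h>

/* Assumed value for this sketch; the real DIST_PRECISION_BITS comes from
 * the av1 headers. */
#define SKETCH_DIST_PRECISION_BITS 4

/* Hypothetical scalar model of comp_avg(): a distance-weighted blend of two
 * intermediates, or a plain average when the weighting is off. */
static int32_t comp_avg_scalar(int32_t ref, int32_t cur, int32_t w0,
                               int32_t w1, int use_dist_wtd_comp_avg) {
  if (use_dist_wtd_comp_avg)
    return (ref * w0 + cur * w1) >> SKETCH_DIST_PRECISION_BITS;
  return (ref + cur) >> 1;
}

Interleaving ref/cur values against an interleaved (w0, w1) weight vector lets a single _mm256_madd_epi16 apply both weights and sum each pair in one step; the 32-bit highbd_comp_avg() further below computes the same blend with two _mm256_mullo_epi32 multiplies instead.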
883 | | |
884 | | static inline __m256i convolve_rounding(const __m256i *const res_unsigned, |
885 | | const __m256i *const offset_const, |
886 | | const __m256i *const round_const, |
887 | 130M | const int round_shift) { |
888 | 130M | const __m256i res_signed = _mm256_sub_epi16(*res_unsigned, *offset_const); |
889 | 130M | const __m256i res_round = _mm256_srai_epi16( |
890 | 130M | _mm256_add_epi16(res_signed, *round_const), round_shift); |
891 | 130M | return res_round; |
892 | 130M | }
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_rounding
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_rounding
Unexecuted instantiation: convolve_2d_avx2.c:convolve_rounding
Unexecuted instantiation: convolve_avx2.c:convolve_rounding
jnt_convolve_avx2.c:convolve_rounding (executed 130M times, lines 887-892)
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_rounding
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_rounding
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_rounding
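A scalar sketch of the rounding helper, hypothetical name; highbd_convolve_rounding() below applies the identical pattern on 32-bit lanes:

#include <stdint.h>

/* Hypothetical scalar model of convolve_rounding(): subtract the unsigned
 * offset, add the rounding constant, then arithmetic-shift right. */
static int32_t convolve_rounding_scalar(int32_t res_unsigned, int32_t offset,
                                        int32_t round_const, int round_shift) {
  return (res_unsigned - offset + round_const) >> round_shift;
}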
893 | | |
894 | | static inline __m256i highbd_comp_avg(const __m256i *const data_ref_0, |
895 | | const __m256i *const res_unsigned, |
896 | | const __m256i *const wt0, |
897 | | const __m256i *const wt1, |
898 | 24.1M | const int use_dist_wtd_comp_avg) { |
899 | 24.1M | __m256i res; |
900 | 24.1M | if (use_dist_wtd_comp_avg) { |
901 | 2.65M | const __m256i wt0_res = _mm256_mullo_epi32(*data_ref_0, *wt0); |
902 | 2.65M | const __m256i wt1_res = _mm256_mullo_epi32(*res_unsigned, *wt1); |
903 | 2.65M | const __m256i wt_res = _mm256_add_epi32(wt0_res, wt1_res); |
904 | 2.65M | res = _mm256_srai_epi32(wt_res, DIST_PRECISION_BITS); |
905 | 21.5M | } else { |
906 | 21.5M | const __m256i wt_res = _mm256_add_epi32(*data_ref_0, *res_unsigned); |
907 | 21.5M | res = _mm256_srai_epi32(wt_res, 1); |
908 | 21.5M | } |
909 | 24.1M | return res; |
910 | 24.1M | }
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:highbd_comp_avg
Unexecuted instantiation: highbd_convolve_avx2.c:highbd_comp_avg
Unexecuted instantiation: convolve_2d_avx2.c:highbd_comp_avg
Unexecuted instantiation: convolve_avx2.c:highbd_comp_avg
Unexecuted instantiation: jnt_convolve_avx2.c:highbd_comp_avg
Unexecuted instantiation: wiener_convolve_avx2.c:highbd_comp_avg
Unexecuted instantiation: highbd_convolve_2d_avx2.c:highbd_comp_avg
highbd_jnt_convolve_avx2.c:highbd_comp_avg (executed 24.1M times, lines 898-910; per-line counts match the main listing above)
911 | | |
912 | | static inline __m256i highbd_convolve_rounding( |
913 | | const __m256i *const res_unsigned, const __m256i *const offset_const, |
914 | 24.3M | const __m256i *const round_const, const int round_shift) { |
915 | 24.3M | const __m256i res_signed = _mm256_sub_epi32(*res_unsigned, *offset_const); |
916 | 24.3M | const __m256i res_round = _mm256_srai_epi32( |
917 | 24.3M | _mm256_add_epi32(res_signed, *round_const), round_shift); |
918 | | |
919 | 24.3M | return res_round; |
920 | 24.3M | }
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:highbd_convolve_rounding
Unexecuted instantiation: highbd_convolve_avx2.c:highbd_convolve_rounding
Unexecuted instantiation: convolve_2d_avx2.c:highbd_convolve_rounding
Unexecuted instantiation: convolve_avx2.c:highbd_convolve_rounding
Unexecuted instantiation: jnt_convolve_avx2.c:highbd_convolve_rounding
Unexecuted instantiation: wiener_convolve_avx2.c:highbd_convolve_rounding
Unexecuted instantiation: highbd_convolve_2d_avx2.c:highbd_convolve_rounding
highbd_jnt_convolve_avx2.c:highbd_convolve_rounding (executed 24.3M times, lines 914-920)
921 | | |
922 | | #endif // AOM_AOM_DSP_X86_CONVOLVE_AVX2_H_ |