/src/aom/aom_dsp/x86/convolve_avx2.h
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) 2018, Alliance for Open Media. All rights reserved. |
3 | | * |
4 | | * This source code is subject to the terms of the BSD 2 Clause License and |
5 | | * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
6 | | * was not distributed with this source code in the LICENSE file, you can |
7 | | * obtain it at www.aomedia.org/license/software. If the Alliance for Open |
8 | | * Media Patent License 1.0 was not distributed with this source code in the |
9 | | * PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
10 | | */ |
11 | | |
12 | | #ifndef AOM_AOM_DSP_X86_CONVOLVE_AVX2_H_ |
13 | | #define AOM_AOM_DSP_X86_CONVOLVE_AVX2_H_ |
14 | | |
15 | | #include <immintrin.h> |
16 | | |
17 | | #include "aom_ports/mem.h" |
18 | | |
19 | | #include "aom_dsp/x86/mem_sse2.h" |
20 | | #include "aom_dsp/x86/synonyms.h" |
21 | | |
22 | | #include "av1/common/convolve.h" |
23 | | #include "av1/common/filter.h" |
24 | | |
25 | 678k | #define SECOND_32_BLK (32) |
26 | 597k | #define THIRD_32_BLK (32 << 1) |
27 | 298k | #define FOURTH_32_BLK (SECOND_32_BLK + THIRD_32_BLK) |
28 | | |
29 | | // filters for 16 |
30 | | DECLARE_ALIGNED(32, static const uint8_t, filt_global_avx2[]) = { |
31 | | 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 0, 1, 1, |
32 | | 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 2, 3, 3, 4, 4, 5, |
33 | | 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 2, 3, 3, 4, 4, 5, 5, 6, 6, |
34 | | 7, 7, 8, 8, 9, 9, 10, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, |
35 | | 10, 11, 11, 12, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, |
36 | | 12, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 6, 7, |
37 | | 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 |
38 | | }; |
39 | | |
40 | | DECLARE_ALIGNED(32, static const uint8_t, filt_d4_global_avx2[]) = { |
41 | | 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 0, 1, 2, 3, 1, 2, |
42 | | 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, |
43 | | 7, 8, 9, 10, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, |
44 | | }; |
45 | | |
46 | | DECLARE_ALIGNED(32, static const uint8_t, filt4_d4_global_avx2[]) = { |
47 | | 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, |
48 | | 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, |
49 | | }; |
50 | | |
51 | | DECLARE_ALIGNED(32, static const uint8_t, filt_center_global_avx2[32]) = { |
52 | | 3, 255, 4, 255, 5, 255, 6, 255, 7, 255, 8, 255, 9, 255, 10, 255, |
53 | | 3, 255, 4, 255, 5, 255, 6, 255, 7, 255, 8, 255, 9, 255, 10, 255 |
54 | | }; |
55 | | |
56 | | DECLARE_ALIGNED(32, static const uint8_t, |
57 | | filt1_global_sse2[16]) = { 0, 1, 1, 2, 2, 3, 3, 4, |
58 | | 8, 9, 9, 10, 10, 11, 11, 12 }; |
59 | | |
60 | | DECLARE_ALIGNED(32, static const uint8_t, |
61 | | filt2_global_sse2[16]) = { 2, 3, 3, 4, 4, 5, 5, 6, |
62 | | 10, 11, 11, 12, 12, 13, 13, 14 }; |
63 | | |
64 | | DECLARE_ALIGNED(32, static const uint8_t, |
65 | | filt3_global_sse2[16]) = { 0, 1, 1, 2, 8, 9, 9, 10, |
66 | | 0, 0, 0, 0, 0, 0, 0, 0 }; |
67 | | |
68 | | DECLARE_ALIGNED(32, static const uint8_t, |
69 | | filt4_global_sse2[16]) = { 2, 3, 3, 4, 10, 11, 11, 12, |
70 | | 0, 0, 0, 0, 0, 0, 0, 0 }; |
71 | | |
72 | | DECLARE_ALIGNED(32, static const uint8_t, |
73 | | filt5_global_sse2[16]) = { 0, 1, 1, 2, 4, 5, 5, 6, |
74 | | 0, 0, 0, 0, 0, 0, 0, 0 }; |
75 | | |
76 | | DECLARE_ALIGNED(32, static const uint8_t, |
77 | | filt1_global_avx2[32]) = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, |
78 | | 6, 6, 7, 7, 8, 0, 1, 1, 2, 2, 3, |
79 | | 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; |
80 | | |
81 | | DECLARE_ALIGNED(32, static const uint8_t, |
82 | | filt2_global_avx2[32]) = { 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, |
83 | | 8, 8, 9, 9, 10, 2, 3, 3, 4, 4, 5, |
84 | | 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 }; |
85 | | |
86 | | DECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = { |
87 | | 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, |
88 | | 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 |
89 | | }; |
90 | | |
91 | | DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = { |
92 | | 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, |
93 | | 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 |
94 | | }; |
95 | | |
96 | | #define CONVOLVE_SR_HOR_FILTER_W4(CONVOLVE_LOWBD) \ |
97 | 4.43M | for (i = 0; i < (im_h - 2); i += 2) { \ |
98 | 3.60M | __m128i data = \ |
99 | 3.60M | load_8bit_8x2_to_1_reg_sse2(&src_ptr[(i * src_stride)], src_stride); \ |
100 | 3.60M | __m128i res = CONVOLVE_LOWBD(data, coeffs_h, filt); \ |
101 | 3.60M | res = _mm_srai_epi16(_mm_add_epi16(res, round_const_h), 2); \ |
102 | 3.60M | _mm_store_si128((__m128i *)&im_block[i * 4], res); \ |
103 | 3.60M | } \ |
104 | 831k | __m128i data_1 = _mm_loadl_epi64((__m128i *)&src_ptr[(i * src_stride)]); \ |
105 | 831k | __m128i res = CONVOLVE_LOWBD(data_1, coeffs_h, filt); \ |
106 | 831k | res = _mm_srai_epi16(_mm_add_epi16(res, round_const_h), 2); \ |
107 | 831k | _mm_storel_epi64((__m128i *)&im_block[i * 4], res); |
108 | | |
109 | | #define CONVOLVE_SR_HOR_FILTER_2TAP_W4 \ |
110 | 29.2k | CONVOLVE_SR_HOR_FILTER_W4(convolve_lowbd_x_2tap_ssse3) |
111 | | |
112 | | #define CONVOLVE_SR_HOR_FILTER_4TAP_W4 \ |
113 | 801k | CONVOLVE_SR_HOR_FILTER_W4(convolve_lowbd_x_4tap_ssse3) |
114 | | |
115 | | static inline void sr_2d_ver_round_and_store_w4(int w, __m256i res, |
116 | | uint8_t *dst, int dst_stride, |
117 | 2.50M | __m256i round_const_v) { |
118 | 2.50M | const __m256i res_round = |
119 | 2.50M | _mm256_srai_epi32(_mm256_add_epi32(res, round_const_v), 11); |
120 | | |
121 | 2.50M | const __m256i res_16bit = _mm256_packs_epi32(res_round, res_round); |
122 | 2.50M | const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit); |
123 | | |
124 | 2.50M | const __m128i r0 = _mm256_castsi256_si128(res_8b); |
125 | 2.50M | const __m128i r1 = _mm256_extracti128_si256(res_8b, 1); |
126 | | |
127 | 2.50M | __m128i *const p0 = (__m128i *)dst; |
128 | 2.50M | __m128i *const p1 = (__m128i *)(dst + dst_stride); |
129 | | |
130 | 2.50M | if (w == 4) { |
131 | 2.07M | xx_storel_32(p0, r0); |
132 | 2.07M | xx_storel_32(p1, r1); |
133 | 2.07M | } else { |
134 | 437k | assert(w == 2); |
135 | 437k | *(uint16_t *)p0 = (uint16_t)_mm_cvtsi128_si32(r0); |
136 | 437k | *(uint16_t *)p1 = (uint16_t)_mm_cvtsi128_si32(r1); |
137 | 437k | } |
138 | 2.50M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:sr_2d_ver_round_and_store_w4 Unexecuted instantiation: highbd_convolve_avx2.c:sr_2d_ver_round_and_store_w4 convolve_2d_avx2.c:sr_2d_ver_round_and_store_w4 Line | Count | Source | 117 | 2.50M | __m256i round_const_v) { | 118 | 2.50M | const __m256i res_round = | 119 | 2.50M | _mm256_srai_epi32(_mm256_add_epi32(res, round_const_v), 11); | 120 | | | 121 | 2.50M | const __m256i res_16bit = _mm256_packs_epi32(res_round, res_round); | 122 | 2.50M | const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit); | 123 | | | 124 | 2.50M | const __m128i r0 = _mm256_castsi256_si128(res_8b); | 125 | 2.50M | const __m128i r1 = _mm256_extracti128_si256(res_8b, 1); | 126 | | | 127 | 2.50M | __m128i *const p0 = (__m128i *)dst; | 128 | 2.50M | __m128i *const p1 = (__m128i *)(dst + dst_stride); | 129 | | | 130 | 2.50M | if (w == 4) { | 131 | 2.07M | xx_storel_32(p0, r0); | 132 | 2.07M | xx_storel_32(p1, r1); | 133 | 2.07M | } else { | 134 | 437k | assert(w == 2); | 135 | 437k | *(uint16_t *)p0 = (uint16_t)_mm_cvtsi128_si32(r0); | 136 | 437k | *(uint16_t *)p1 = (uint16_t)_mm_cvtsi128_si32(r1); | 137 | 437k | } | 138 | 2.50M | } |
Unexecuted instantiation: convolve_avx2.c:sr_2d_ver_round_and_store_w4 Unexecuted instantiation: jnt_convolve_avx2.c:sr_2d_ver_round_and_store_w4 Unexecuted instantiation: wiener_convolve_avx2.c:sr_2d_ver_round_and_store_w4 Unexecuted instantiation: highbd_convolve_2d_avx2.c:sr_2d_ver_round_and_store_w4 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:sr_2d_ver_round_and_store_w4 |
139 | | |
140 | | #define CONVOLVE_SR_VER_FILTER_2TAP_W4 \ |
141 | 29.2k | __m128i s[2]; \ |
142 | 29.2k | s[0] = _mm_loadl_epi64((__m128i *)(im_block + 0 * 4)); \ |
143 | 29.2k | \ |
144 | 110k | for (i = 0; i < h; i += 2) { \ |
145 | 81.2k | const int16_t *data = &im_block[i * 4]; \ |
146 | 81.2k | s[1] = _mm_loadl_epi64((__m128i *)(data + 1 * 4)); \ |
147 | 81.2k | const __m256i src_0 = _mm256_setr_m128i(s[0], s[1]); \ |
148 | 81.2k | s[0] = _mm_loadl_epi64((__m128i *)(data + 2 * 4)); \ |
149 | 81.2k | const __m256i src_1 = _mm256_setr_m128i(s[1], s[0]); \ |
150 | 81.2k | const __m256i ss = _mm256_unpacklo_epi16(src_0, src_1); \ |
151 | 81.2k | \ |
152 | 81.2k | const __m256i res = _mm256_madd_epi16(ss, coeffs_v[0]); \ |
153 | 81.2k | \ |
154 | 81.2k | sr_2d_ver_round_and_store_w4(w, res, dst_ptr, dst_stride, round_const_v); \ |
155 | 81.2k | dst_ptr += 2 * dst_stride; \ |
156 | 81.2k | } |
157 | | |
158 | | #define CONVOLVE_SR_VER_FILTER_4TAP_W4 \ |
159 | 520k | __m128i s[4]; \ |
160 | 520k | __m256i ss[2]; \ |
161 | 520k | s[0] = _mm_loadl_epi64((__m128i *)(im_block + 0 * 4)); \ |
162 | 520k | s[1] = _mm_loadl_epi64((__m128i *)(im_block + 1 * 4)); \ |
163 | 520k | s[2] = _mm_loadl_epi64((__m128i *)(im_block + 2 * 4)); \ |
164 | 520k | \ |
165 | 520k | const __m256i src_0 = _mm256_setr_m128i(s[0], s[1]); \ |
166 | 520k | const __m256i src_1 = _mm256_setr_m128i(s[1], s[2]); \ |
167 | 520k | \ |
168 | 520k | ss[0] = _mm256_unpacklo_epi16(src_0, src_1); \ |
169 | 520k | \ |
170 | 1.52M | for (i = 0; i < h; i += 2) { \ |
171 | 1.00M | const int16_t *data = &im_block[i * 4]; \ |
172 | 1.00M | s[3] = _mm_loadl_epi64((__m128i *)(data + 3 * 4)); \ |
173 | 1.00M | const __m256i src_2 = _mm256_setr_m128i(s[2], s[3]); \ |
174 | 1.00M | s[2] = _mm_loadl_epi64((__m128i *)(data + 4 * 4)); \ |
175 | 1.00M | const __m256i src_3 = _mm256_setr_m128i(s[3], s[2]); \ |
176 | 1.00M | ss[1] = _mm256_unpacklo_epi16(src_2, src_3); \ |
177 | 1.00M | \ |
178 | 1.00M | const __m256i res = convolve_4tap(ss, coeffs_v); \ |
179 | 1.00M | \ |
180 | 1.00M | sr_2d_ver_round_and_store_w4(w, res, dst_ptr, dst_stride, round_const_v); \ |
181 | 1.00M | dst_ptr += 2 * dst_stride; \ |
182 | 1.00M | \ |
183 | 1.00M | ss[0] = ss[1]; \ |
184 | 1.00M | } |
185 | | |
186 | | #define CONVOLVE_SR_VER_FILTER_6TAP_W4 \ |
187 | 268k | __m128i s[6]; \ |
188 | 268k | __m256i ss[3]; \ |
189 | 268k | s[0] = _mm_loadl_epi64((__m128i *)(im_block + 0 * 4)); \ |
190 | 268k | s[1] = _mm_loadl_epi64((__m128i *)(im_block + 1 * 4)); \ |
191 | 268k | s[2] = _mm_loadl_epi64((__m128i *)(im_block + 2 * 4)); \ |
192 | 268k | s[3] = _mm_loadl_epi64((__m128i *)(im_block + 3 * 4)); \ |
193 | 268k | s[4] = _mm_loadl_epi64((__m128i *)(im_block + 4 * 4)); \ |
194 | 268k | \ |
195 | 268k | const __m256i src_0 = _mm256_setr_m128i(s[0], s[1]); \ |
196 | 268k | const __m256i src_1 = _mm256_setr_m128i(s[1], s[2]); \ |
197 | 268k | const __m256i src_2 = _mm256_setr_m128i(s[2], s[3]); \ |
198 | 268k | const __m256i src_3 = _mm256_setr_m128i(s[3], s[4]); \ |
199 | 268k | \ |
200 | 268k | ss[0] = _mm256_unpacklo_epi16(src_0, src_1); \ |
201 | 268k | ss[1] = _mm256_unpacklo_epi16(src_2, src_3); \ |
202 | 268k | \ |
203 | 1.61M | for (i = 0; i < h; i += 2) { \ |
204 | 1.34M | const int16_t *data = &im_block[i * 4]; \ |
205 | 1.34M | s[5] = _mm_loadl_epi64((__m128i *)(data + 5 * 4)); \ |
206 | 1.34M | const __m256i src_4 = _mm256_setr_m128i(s[4], s[5]); \ |
207 | 1.34M | s[4] = _mm_loadl_epi64((__m128i *)(data + 6 * 4)); \ |
208 | 1.34M | const __m256i src_5 = _mm256_setr_m128i(s[5], s[4]); \ |
209 | 1.34M | ss[2] = _mm256_unpacklo_epi16(src_4, src_5); \ |
210 | 1.34M | \ |
211 | 1.34M | const __m256i res = convolve_6tap(ss, coeffs_v); \ |
212 | 1.34M | \ |
213 | 1.34M | sr_2d_ver_round_and_store_w4(w, res, dst_ptr, dst_stride, round_const_v); \ |
214 | 1.34M | dst_ptr += 2 * dst_stride; \ |
215 | 1.34M | \ |
216 | 1.34M | ss[0] = ss[1]; \ |
217 | 1.34M | ss[1] = ss[2]; \ |
218 | 1.34M | } |
219 | | |
220 | | #define CONVOLVE_SR_VER_FILTER_8TAP_W4 \ |
221 | 13.4k | __m128i s[8]; \ |
222 | 13.4k | __m256i ss[4]; \ |
223 | 13.4k | s[0] = _mm_loadl_epi64((__m128i *)(im_block + 0 * 4)); \ |
224 | 13.4k | s[1] = _mm_loadl_epi64((__m128i *)(im_block + 1 * 4)); \ |
225 | 13.4k | s[2] = _mm_loadl_epi64((__m128i *)(im_block + 2 * 4)); \ |
226 | 13.4k | s[3] = _mm_loadl_epi64((__m128i *)(im_block + 3 * 4)); \ |
227 | 13.4k | s[4] = _mm_loadl_epi64((__m128i *)(im_block + 4 * 4)); \ |
228 | 13.4k | s[5] = _mm_loadl_epi64((__m128i *)(im_block + 5 * 4)); \ |
229 | 13.4k | s[6] = _mm_loadl_epi64((__m128i *)(im_block + 6 * 4)); \ |
230 | 13.4k | \ |
231 | 13.4k | const __m256i src_0 = _mm256_setr_m128i(s[0], s[1]); \ |
232 | 13.4k | const __m256i src_1 = _mm256_setr_m128i(s[1], s[2]); \ |
233 | 13.4k | const __m256i src_2 = _mm256_setr_m128i(s[2], s[3]); \ |
234 | 13.4k | const __m256i src_3 = _mm256_setr_m128i(s[3], s[4]); \ |
235 | 13.4k | const __m256i src_4 = _mm256_setr_m128i(s[4], s[5]); \ |
236 | 13.4k | const __m256i src_5 = _mm256_setr_m128i(s[5], s[6]); \ |
237 | 13.4k | \ |
238 | 13.4k | ss[0] = _mm256_unpacklo_epi16(src_0, src_1); \ |
239 | 13.4k | ss[1] = _mm256_unpacklo_epi16(src_2, src_3); \ |
240 | 13.4k | ss[2] = _mm256_unpacklo_epi16(src_4, src_5); \ |
241 | 13.4k | \ |
242 | 82.7k | for (i = 0; i < h; i += 2) { \ |
243 | 69.3k | const int16_t *data = &im_block[i * 4]; \ |
244 | 69.3k | s[7] = _mm_loadl_epi64((__m128i *)(data + 7 * 4)); \ |
245 | 69.3k | const __m256i src_6 = _mm256_setr_m128i(s[6], s[7]); \ |
246 | 69.3k | s[6] = _mm_loadl_epi64((__m128i *)(data + 8 * 4)); \ |
247 | 69.3k | const __m256i src_7 = _mm256_setr_m128i(s[7], s[6]); \ |
248 | 69.3k | ss[3] = _mm256_unpacklo_epi16(src_6, src_7); \ |
249 | 69.3k | \ |
250 | 69.3k | const __m256i res = convolve(ss, coeffs_v); \ |
251 | 69.3k | \ |
252 | 69.3k | sr_2d_ver_round_and_store_w4(w, res, dst_ptr, dst_stride, round_const_v); \ |
253 | 69.3k | dst_ptr += 2 * dst_stride; \ |
254 | 69.3k | \ |
255 | 69.3k | ss[0] = ss[1]; \ |
256 | 69.3k | ss[1] = ss[2]; \ |
257 | 69.3k | ss[2] = ss[3]; \ |
258 | 69.3k | } |
259 | | |
260 | | #define CONVOLVE_SR_HORIZONTAL_FILTER(CONVOLVE_LOWBD) \ |
261 | 21.5M | for (i = 0; i < (im_h - 2); i += 2) { \ |
262 | 19.6M | __m256i data = _mm256_castsi128_si256( \ |
263 | 19.6M | _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j])); \ |
264 | 19.6M | data = _mm256_inserti128_si256( \ |
265 | 19.6M | data, \ |
266 | 19.6M | _mm_loadu_si128( \ |
267 | 19.6M | (__m128i *)&src_ptr[(i * src_stride) + j + src_stride]), \ |
268 | 19.6M | 1); \ |
269 | 19.6M | __m256i res = CONVOLVE_LOWBD(data, coeffs_h, filt); \ |
270 | 19.6M | res = _mm256_srai_epi16(_mm256_add_epi16(res, round_const_h), 2); \ |
271 | 19.6M | _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); \ |
272 | 19.6M | } \ |
273 | 1.88M | __m256i data_1 = _mm256_castsi128_si256( \ |
274 | 1.88M | _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j])); \ |
275 | 1.88M | __m256i res = CONVOLVE_LOWBD(data_1, coeffs_h, filt); \ |
276 | 1.88M | res = _mm256_srai_epi16(_mm256_add_epi16(res, round_const_h), 2); \ |
277 | 1.88M | _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); |
278 | | |
279 | | #define CONVOLVE_SR_HORIZONTAL_FILTER_2TAP \ |
280 | 55.5k | CONVOLVE_SR_HORIZONTAL_FILTER(convolve_lowbd_x_2tap) |
281 | | |
282 | | #define CONVOLVE_SR_HORIZONTAL_FILTER_4TAP \ |
283 | 135k | CONVOLVE_SR_HORIZONTAL_FILTER(convolve_lowbd_x_4tap) |
284 | | |
285 | | #define CONVOLVE_SR_HORIZONTAL_FILTER_6TAP \ |
286 | 1.45M | CONVOLVE_SR_HORIZONTAL_FILTER(convolve_lowbd_x_6tap) |
287 | | |
288 | | #define CONVOLVE_SR_HORIZONTAL_FILTER_8TAP \ |
289 | 238k | CONVOLVE_SR_HORIZONTAL_FILTER(convolve_lowbd_x) |
290 | | |
291 | | static inline void sr_2d_ver_round_and_store(__m256i res_a, __m256i res_b, |
292 | | uint8_t *dst, int dst_stride, |
293 | 16.3M | __m256i round_const_v) { |
294 | 16.3M | const __m256i res_a_round = |
295 | 16.3M | _mm256_srai_epi32(_mm256_add_epi32(res_a, round_const_v), 11); |
296 | 16.3M | const __m256i res_b_round = |
297 | 16.3M | _mm256_srai_epi32(_mm256_add_epi32(res_b, round_const_v), 11); |
298 | 16.3M | const __m256i r16 = _mm256_packs_epi32(res_a_round, res_b_round); |
299 | 16.3M | const __m256i r8 = _mm256_packus_epi16(r16, r16); |
300 | | |
301 | 16.3M | _mm_storel_epi64((__m128i *)dst, _mm256_castsi256_si128(r8)); |
302 | 16.3M | _mm_storel_epi64((__m128i *)(dst + dst_stride), |
303 | 16.3M | _mm256_extracti128_si256(r8, 1)); |
304 | 16.3M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:sr_2d_ver_round_and_store Unexecuted instantiation: highbd_convolve_avx2.c:sr_2d_ver_round_and_store convolve_2d_avx2.c:sr_2d_ver_round_and_store Line | Count | Source | 293 | 16.3M | __m256i round_const_v) { | 294 | 16.3M | const __m256i res_a_round = | 295 | 16.3M | _mm256_srai_epi32(_mm256_add_epi32(res_a, round_const_v), 11); | 296 | 16.3M | const __m256i res_b_round = | 297 | 16.3M | _mm256_srai_epi32(_mm256_add_epi32(res_b, round_const_v), 11); | 298 | 16.3M | const __m256i r16 = _mm256_packs_epi32(res_a_round, res_b_round); | 299 | 16.3M | const __m256i r8 = _mm256_packus_epi16(r16, r16); | 300 | | | 301 | 16.3M | _mm_storel_epi64((__m128i *)dst, _mm256_castsi256_si128(r8)); | 302 | 16.3M | _mm_storel_epi64((__m128i *)(dst + dst_stride), | 303 | | _mm256_extracti128_si256(r8, 1)); | 304 | 16.3M | } |
Unexecuted instantiation: convolve_avx2.c:sr_2d_ver_round_and_store Unexecuted instantiation: jnt_convolve_avx2.c:sr_2d_ver_round_and_store Unexecuted instantiation: wiener_convolve_avx2.c:sr_2d_ver_round_and_store Unexecuted instantiation: highbd_convolve_2d_avx2.c:sr_2d_ver_round_and_store Unexecuted instantiation: highbd_jnt_convolve_avx2.c:sr_2d_ver_round_and_store |
305 | | |
306 | | #define CONVOLVE_SR_VERTICAL_FILTER_2TAP \ |
307 | 766k | for (i = 0; i < h; i += 2) { \ |
308 | 711k | __m256i s[2]; \ |
309 | 711k | const int16_t *data = &im_block[i * im_stride]; \ |
310 | 711k | const __m256i s1 = _mm256_loadu_si256((__m256i *)(data + 0 * im_stride)); \ |
311 | 711k | const __m256i s2 = _mm256_loadu_si256((__m256i *)(data + 1 * im_stride)); \ |
312 | 711k | s[0] = _mm256_unpacklo_epi16(s1, s2); \ |
313 | 711k | s[1] = _mm256_unpackhi_epi16(s1, s2); \ |
314 | 711k | \ |
315 | 711k | __m256i res_a = _mm256_madd_epi16(s[0], coeffs_v[0]); \ |
316 | 711k | __m256i res_b = _mm256_madd_epi16(s[1], coeffs_v[0]); \ |
317 | 711k | \ |
318 | 711k | sr_2d_ver_round_and_store(res_a, res_b, dst_ptr, dst_stride, \ |
319 | 711k | round_const_v); \ |
320 | 711k | dst_ptr += 2 * dst_stride; \ |
321 | 711k | } |
322 | | |
323 | | #define CONVOLVE_SR_VERTICAL_FILTER_4TAP \ |
324 | 662k | __m256i s[6]; \ |
325 | 662k | __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); \ |
326 | 662k | __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); \ |
327 | 662k | \ |
328 | 662k | s[0] = _mm256_unpacklo_epi16(src_0, src_1); \ |
329 | 662k | s[2] = _mm256_unpackhi_epi16(src_0, src_1); \ |
330 | 662k | \ |
331 | 2.88M | for (i = 0; i < h; i += 2) { \ |
332 | 2.22M | const int16_t *data = &im_block[i * im_stride]; \ |
333 | 2.22M | const __m256i s4 = _mm256_loadu_si256((__m256i *)(data + 2 * im_stride)); \ |
334 | 2.22M | const __m256i s5 = _mm256_loadu_si256((__m256i *)(data + 3 * im_stride)); \ |
335 | 2.22M | s[1] = _mm256_unpacklo_epi16(s4, s5); \ |
336 | 2.22M | s[3] = _mm256_unpackhi_epi16(s4, s5); \ |
337 | 2.22M | \ |
338 | 2.22M | __m256i res_a = convolve_4tap(s, coeffs_v); \ |
339 | 2.22M | __m256i res_b = convolve_4tap(s + 2, coeffs_v); \ |
340 | 2.22M | \ |
341 | 2.22M | sr_2d_ver_round_and_store(res_a, res_b, dst_ptr, dst_stride, \ |
342 | 2.22M | round_const_v); \ |
343 | 2.22M | dst_ptr += 2 * dst_stride; \ |
344 | 2.22M | \ |
345 | 2.22M | s[0] = s[1]; \ |
346 | 2.22M | s[2] = s[3]; \ |
347 | 2.22M | } |
348 | | |
349 | | #define CONVOLVE_SR_VERTICAL_FILTER_6TAP \ |
350 | 953k | __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); \ |
351 | 953k | __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); \ |
352 | 953k | __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); \ |
353 | 953k | __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); \ |
354 | 953k | \ |
355 | 953k | __m256i s[8]; \ |
356 | 953k | s[0] = _mm256_unpacklo_epi16(src_0, src_1); \ |
357 | 953k | s[1] = _mm256_unpacklo_epi16(src_2, src_3); \ |
358 | 953k | \ |
359 | 953k | s[3] = _mm256_unpackhi_epi16(src_0, src_1); \ |
360 | 953k | s[4] = _mm256_unpackhi_epi16(src_2, src_3); \ |
361 | 953k | \ |
362 | 11.7M | for (i = 0; i < h; i += 2) { \ |
363 | 10.8M | const int16_t *data = &im_block[i * im_stride]; \ |
364 | 10.8M | \ |
365 | 10.8M | const __m256i s6 = _mm256_loadu_si256((__m256i *)(data + 4 * im_stride)); \ |
366 | 10.8M | const __m256i s7 = _mm256_loadu_si256((__m256i *)(data + 5 * im_stride)); \ |
367 | 10.8M | \ |
368 | 10.8M | s[2] = _mm256_unpacklo_epi16(s6, s7); \ |
369 | 10.8M | s[5] = _mm256_unpackhi_epi16(s6, s7); \ |
370 | 10.8M | \ |
371 | 10.8M | __m256i res_a = convolve_6tap(s, coeffs_v); \ |
372 | 10.8M | __m256i res_b = convolve_6tap(s + 3, coeffs_v); \ |
373 | 10.8M | \ |
374 | 10.8M | sr_2d_ver_round_and_store(res_a, res_b, dst_ptr, dst_stride, \ |
375 | 10.8M | round_const_v); \ |
376 | 10.8M | dst_ptr += 2 * dst_stride; \ |
377 | 10.8M | \ |
378 | 10.8M | s[0] = s[1]; \ |
379 | 10.8M | s[1] = s[2]; \ |
380 | 10.8M | \ |
381 | 10.8M | s[3] = s[4]; \ |
382 | 10.8M | s[4] = s[5]; \ |
383 | 10.8M | } |
384 | | |
385 | | #define CONVOLVE_SR_VERTICAL_FILTER_8TAP \ |
386 | 213k | __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); \ |
387 | 213k | __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); \ |
388 | 213k | __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); \ |
389 | 213k | __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); \ |
390 | 213k | __m256i src_4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride)); \ |
391 | 213k | __m256i src_5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride)); \ |
392 | 213k | \ |
393 | 213k | __m256i s[8]; \ |
394 | 213k | s[0] = _mm256_unpacklo_epi16(src_0, src_1); \ |
395 | 213k | s[1] = _mm256_unpacklo_epi16(src_2, src_3); \ |
396 | 213k | s[2] = _mm256_unpacklo_epi16(src_4, src_5); \ |
397 | 213k | \ |
398 | 213k | s[4] = _mm256_unpackhi_epi16(src_0, src_1); \ |
399 | 213k | s[5] = _mm256_unpackhi_epi16(src_2, src_3); \ |
400 | 213k | s[6] = _mm256_unpackhi_epi16(src_4, src_5); \ |
401 | 213k | \ |
402 | 2.85M | for (i = 0; i < h; i += 2) { \ |
403 | 2.64M | const int16_t *data = &im_block[i * im_stride]; \ |
404 | 2.64M | \ |
405 | 2.64M | const __m256i s6 = _mm256_loadu_si256((__m256i *)(data + 6 * im_stride)); \ |
406 | 2.64M | const __m256i s7 = _mm256_loadu_si256((__m256i *)(data + 7 * im_stride)); \ |
407 | 2.64M | \ |
408 | 2.64M | s[3] = _mm256_unpacklo_epi16(s6, s7); \ |
409 | 2.64M | s[7] = _mm256_unpackhi_epi16(s6, s7); \ |
410 | 2.64M | \ |
411 | 2.64M | __m256i res_a = convolve(s, coeffs_v); \ |
412 | 2.64M | __m256i res_b = convolve(s + 4, coeffs_v); \ |
413 | 2.64M | \ |
414 | 2.64M | sr_2d_ver_round_and_store(res_a, res_b, dst_ptr, dst_stride, \ |
415 | 2.64M | round_const_v); \ |
416 | 2.64M | dst_ptr += 2 * dst_stride; \ |
417 | 2.64M | \ |
418 | 2.64M | s[0] = s[1]; \ |
419 | 2.64M | s[1] = s[2]; \ |
420 | 2.64M | s[2] = s[3]; \ |
421 | 2.64M | \ |
422 | 2.64M | s[4] = s[5]; \ |
423 | 2.64M | s[5] = s[6]; \ |
424 | 2.64M | s[6] = s[7]; \ |
425 | 2.64M | } |
426 | | |
427 | | #define CONVOLVE_SR_HORIZONTAL_FILTER_12TAP \ |
428 | 0 | const __m256i v_zero = _mm256_setzero_si256(); \ |
429 | 0 | __m256i s[12]; \ |
430 | 0 | if (w <= 4) { \ |
431 | 0 | for (i = 0; i < im_h; i += 2) { \ |
432 | 0 | const __m256i data = _mm256_permute2x128_si256( \ |
433 | 0 | _mm256_castsi128_si256( \ |
434 | 0 | _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride + j]))), \ |
435 | 0 | _mm256_castsi128_si256(_mm_loadu_si128( \ |
436 | 0 | (__m128i *)(&src_ptr[i * src_stride + src_stride + j]))), \ |
437 | 0 | 0x20); \ |
438 | 0 | const __m256i s_16lo = _mm256_unpacklo_epi8(data, v_zero); \ |
439 | 0 | const __m256i s_16hi = _mm256_unpackhi_epi8(data, v_zero); \ |
440 | 0 | const __m256i s_lolo = _mm256_unpacklo_epi16(s_16lo, s_16lo); \ |
441 | 0 | const __m256i s_lohi = _mm256_unpackhi_epi16(s_16lo, s_16lo); \ |
442 | 0 | \ |
443 | 0 | const __m256i s_hilo = _mm256_unpacklo_epi16(s_16hi, s_16hi); \ |
444 | 0 | const __m256i s_hihi = _mm256_unpackhi_epi16(s_16hi, s_16hi); \ |
445 | 0 | \ |
446 | 0 | s[0] = _mm256_alignr_epi8(s_lohi, s_lolo, 2); \ |
447 | 0 | s[1] = _mm256_alignr_epi8(s_lohi, s_lolo, 10); \ |
448 | 0 | s[2] = _mm256_alignr_epi8(s_hilo, s_lohi, 2); \ |
449 | 0 | s[3] = _mm256_alignr_epi8(s_hilo, s_lohi, 10); \ |
450 | 0 | s[4] = _mm256_alignr_epi8(s_hihi, s_hilo, 2); \ |
451 | 0 | s[5] = _mm256_alignr_epi8(s_hihi, s_hilo, 10); \ |
452 | 0 | \ |
453 | 0 | const __m256i res_lo = convolve_12taps(s, coeffs_h); \ |
454 | 0 | \ |
455 | 0 | __m256i res_32b_lo = _mm256_sra_epi32( \ |
456 | 0 | _mm256_add_epi32(res_lo, round_const_h12), round_shift_h12); \ |
457 | 0 | __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo); \ |
458 | 0 | const __m128i res_0 = _mm256_extracti128_si256(res_16b_lo, 0); \ |
459 | 0 | const __m128i res_1 = _mm256_extracti128_si256(res_16b_lo, 1); \ |
460 | 0 | if (w > 2) { \ |
461 | 0 | _mm_storel_epi64((__m128i *)&im_block[i * im_stride], res_0); \ |
462 | 0 | _mm_storel_epi64((__m128i *)&im_block[i * im_stride + im_stride], \ |
463 | 0 | res_1); \ |
464 | 0 | } else { \ |
465 | 0 | uint32_t horiz_2; \ |
466 | 0 | horiz_2 = (uint32_t)_mm_cvtsi128_si32(res_0); \ |
467 | 0 | im_block[i * im_stride] = (uint16_t)horiz_2; \ |
468 | 0 | im_block[i * im_stride + 1] = (uint16_t)(horiz_2 >> 16); \ |
469 | 0 | horiz_2 = (uint32_t)_mm_cvtsi128_si32(res_1); \ |
470 | 0 | im_block[i * im_stride + im_stride] = (uint16_t)horiz_2; \ |
471 | 0 | im_block[i * im_stride + im_stride + 1] = (uint16_t)(horiz_2 >> 16); \ |
472 | 0 | } \ |
473 | 0 | } \ |
474 | 0 | } else { \ |
475 | 0 | for (i = 0; i < im_h; i++) { \ |
476 | 0 | const __m256i data = _mm256_permute2x128_si256( \ |
477 | 0 | _mm256_castsi128_si256( \ |
478 | 0 | _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride + j]))), \ |
479 | 0 | _mm256_castsi128_si256( \ |
480 | 0 | _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride + j + 4]))), \ |
481 | 0 | 0x20); \ |
482 | 0 | const __m256i s_16lo = _mm256_unpacklo_epi8(data, v_zero); \ |
483 | 0 | const __m256i s_16hi = _mm256_unpackhi_epi8(data, v_zero); \ |
484 | 0 | \ |
485 | 0 | const __m256i s_lolo = _mm256_unpacklo_epi16(s_16lo, s_16lo); \ |
486 | 0 | const __m256i s_lohi = _mm256_unpackhi_epi16(s_16lo, s_16lo); \ |
487 | 0 | \ |
488 | 0 | const __m256i s_hilo = _mm256_unpacklo_epi16(s_16hi, s_16hi); \ |
489 | 0 | const __m256i s_hihi = _mm256_unpackhi_epi16(s_16hi, s_16hi); \ |
490 | 0 | \ |
491 | 0 | s[0] = _mm256_alignr_epi8(s_lohi, s_lolo, 2); \ |
492 | 0 | s[1] = _mm256_alignr_epi8(s_lohi, s_lolo, 10); \ |
493 | 0 | s[2] = _mm256_alignr_epi8(s_hilo, s_lohi, 2); \ |
494 | 0 | s[3] = _mm256_alignr_epi8(s_hilo, s_lohi, 10); \ |
495 | 0 | s[4] = _mm256_alignr_epi8(s_hihi, s_hilo, 2); \ |
496 | 0 | s[5] = _mm256_alignr_epi8(s_hihi, s_hilo, 10); \ |
497 | 0 | \ |
498 | 0 | const __m256i res_lo = convolve_12taps(s, coeffs_h); \ |
499 | 0 | \ |
500 | 0 | __m256i res_32b_lo = _mm256_sra_epi32( \ |
501 | 0 | _mm256_add_epi32(res_lo, round_const_h12), round_shift_h12); \ |
502 | 0 | \ |
503 | 0 | __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo); \ |
504 | 0 | _mm_store_si128((__m128i *)&im_block[i * im_stride], \ |
505 | 0 | _mm256_extracti128_si256( \ |
506 | 0 | _mm256_permute4x64_epi64(res_16b_lo, 0x88), 0)); \ |
507 | 0 | } \ |
508 | 0 | } |
509 | | |
510 | | #define CONVOLVE_SR_VERTICAL_FILTER_12TAP \ |
511 | 0 | __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); \ |
512 | 0 | __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); \ |
513 | 0 | __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); \ |
514 | 0 | __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); \ |
515 | 0 | __m256i src_4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride)); \ |
516 | 0 | __m256i src_5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride)); \ |
517 | 0 | __m256i src_6 = _mm256_loadu_si256((__m256i *)(im_block + 6 * im_stride)); \ |
518 | 0 | __m256i src_7 = _mm256_loadu_si256((__m256i *)(im_block + 7 * im_stride)); \ |
519 | 0 | __m256i src_8 = _mm256_loadu_si256((__m256i *)(im_block + 8 * im_stride)); \ |
520 | 0 | __m256i src_9 = _mm256_loadu_si256((__m256i *)(im_block + 9 * im_stride)); \ |
521 | 0 | \ |
522 | 0 | s[0] = _mm256_unpacklo_epi16(src_0, src_1); \ |
523 | 0 | s[1] = _mm256_unpacklo_epi16(src_2, src_3); \ |
524 | 0 | s[2] = _mm256_unpacklo_epi16(src_4, src_5); \ |
525 | 0 | s[3] = _mm256_unpacklo_epi16(src_6, src_7); \ |
526 | 0 | s[4] = _mm256_unpacklo_epi16(src_8, src_9); \ |
527 | 0 | \ |
528 | 0 | s[6] = _mm256_unpackhi_epi16(src_0, src_1); \ |
529 | 0 | s[7] = _mm256_unpackhi_epi16(src_2, src_3); \ |
530 | 0 | s[8] = _mm256_unpackhi_epi16(src_4, src_5); \ |
531 | 0 | s[9] = _mm256_unpackhi_epi16(src_6, src_7); \ |
532 | 0 | s[10] = _mm256_unpackhi_epi16(src_8, src_9); \ |
533 | 0 | \ |
534 | 0 | for (i = 0; i < h; i += 2) { \ |
535 | 0 | const int16_t *data = &im_block[i * im_stride]; \ |
536 | 0 | \ |
537 | 0 | const __m256i s6 = _mm256_loadu_si256((__m256i *)(data + 10 * im_stride)); \ |
538 | 0 | const __m256i s7 = _mm256_loadu_si256((__m256i *)(data + 11 * im_stride)); \ |
539 | 0 | \ |
540 | 0 | s[5] = _mm256_unpacklo_epi16(s6, s7); \ |
541 | 0 | s[11] = _mm256_unpackhi_epi16(s6, s7); \ |
542 | 0 | \ |
543 | 0 | __m256i res_a = convolve_12taps(s, coeffs_v); \ |
544 | 0 | __m256i res_b = convolve_12taps(s + 6, coeffs_v); \ |
545 | 0 | \ |
546 | 0 | res_a = \ |
547 | 0 | _mm256_sra_epi32(_mm256_add_epi32(res_a, sum_round_v), sum_shift_v); \ |
548 | 0 | res_b = \ |
549 | 0 | _mm256_sra_epi32(_mm256_add_epi32(res_b, sum_round_v), sum_shift_v); \ |
550 | 0 | \ |
551 | 0 | const __m256i res_a_round = _mm256_sra_epi32( \ |
552 | 0 | _mm256_add_epi32(res_a, round_const_v), round_shift_v); \ |
553 | 0 | const __m256i res_b_round = _mm256_sra_epi32( \ |
554 | 0 | _mm256_add_epi32(res_b, round_const_v), round_shift_v); \ |
555 | 0 | \ |
556 | 0 | const __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round); \ |
557 | 0 | const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit); \ |
558 | 0 | \ |
559 | 0 | const __m128i res_0 = _mm256_castsi256_si128(res_8b); \ |
560 | 0 | const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1); \ |
561 | 0 | \ |
562 | 0 | __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j]; \ |
563 | 0 | __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride]; \ |
564 | 0 | if (w - j > 4) { \ |
565 | 0 | _mm_storel_epi64(p_0, res_0); \ |
566 | 0 | _mm_storel_epi64(p_1, res_1); \ |
567 | 0 | } else if (w == 4) { \ |
568 | 0 | xx_storel_32(p_0, res_0); \ |
569 | 0 | xx_storel_32(p_1, res_1); \ |
570 | 0 | } else { \ |
571 | 0 | *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0); \ |
572 | 0 | *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1); \ |
573 | 0 | } \ |
574 | 0 | \ |
575 | 0 | s[0] = s[1]; \ |
576 | 0 | s[1] = s[2]; \ |
577 | 0 | s[2] = s[3]; \ |
578 | 0 | s[3] = s[4]; \ |
579 | 0 | s[4] = s[5]; \ |
580 | 0 | \ |
581 | 0 | s[6] = s[7]; \ |
582 | 0 | s[7] = s[8]; \ |
583 | 0 | s[8] = s[9]; \ |
584 | 0 | s[9] = s[10]; \ |
585 | 0 | s[10] = s[11]; \ |
586 | 0 | } |
587 | | |
588 | | #define DIST_WTD_CONVOLVE_HORIZONTAL_FILTER_8TAP \ |
589 | 212k | do { \ |
590 | 3.40M | for (i = 0; i < im_h; i += 2) { \ |
591 | 3.19M | __m256i data = \ |
592 | 3.19M | _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)src_h)); \ |
593 | 3.19M | if (i + 1 < im_h) \ |
594 | 3.19M | data = _mm256_inserti128_si256( \ |
595 | 3.19M | data, _mm_loadu_si128((__m128i *)(src_h + src_stride)), 1); \ |
596 | 3.19M | src_h += (src_stride << 1); \ |
597 | 3.19M | __m256i res = convolve_lowbd_x(data, coeffs_x, filt); \ |
598 | 3.19M | \ |
599 | 3.19M | res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), \ |
600 | 3.19M | round_shift_h); \ |
601 | 3.19M | \ |
602 | 3.19M | _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); \ |
603 | 3.19M | } \ |
604 | 212k | } while (0) |
605 | | |
606 | | #define DIST_WTD_CONVOLVE_VERTICAL_FILTER_8TAP \ |
607 | 283k | do { \ |
608 | 283k | __m256i s[8]; \ |
609 | 283k | __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); \ |
610 | 283k | __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); \ |
611 | 283k | __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); \ |
612 | 283k | __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); \ |
613 | 283k | __m256i s4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride)); \ |
614 | 283k | __m256i s5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride)); \ |
615 | 283k | \ |
616 | 283k | s[0] = _mm256_unpacklo_epi16(s0, s1); \ |
617 | 283k | s[1] = _mm256_unpacklo_epi16(s2, s3); \ |
618 | 283k | s[2] = _mm256_unpacklo_epi16(s4, s5); \ |
619 | 283k | \ |
620 | 283k | s[4] = _mm256_unpackhi_epi16(s0, s1); \ |
621 | 283k | s[5] = _mm256_unpackhi_epi16(s2, s3); \ |
622 | 283k | s[6] = _mm256_unpackhi_epi16(s4, s5); \ |
623 | 283k | \ |
624 | 3.00M | for (i = 0; i < h; i += 2) { \ |
625 | 2.72M | const int16_t *data = &im_block[i * im_stride]; \ |
626 | 2.72M | \ |
627 | 2.72M | const __m256i s6 = \ |
628 | 2.72M | _mm256_loadu_si256((__m256i *)(data + 6 * im_stride)); \ |
629 | 2.72M | const __m256i s7 = \ |
630 | 2.72M | _mm256_loadu_si256((__m256i *)(data + 7 * im_stride)); \ |
631 | 2.72M | \ |
632 | 2.72M | s[3] = _mm256_unpacklo_epi16(s6, s7); \ |
633 | 2.72M | s[7] = _mm256_unpackhi_epi16(s6, s7); \ |
634 | 2.72M | \ |
635 | 2.72M | const __m256i res_a = convolve(s, coeffs_y); \ |
636 | 2.72M | const __m256i res_a_round = _mm256_sra_epi32( \ |
637 | 2.72M | _mm256_add_epi32(res_a, round_const_v), round_shift_v); \ |
638 | 2.72M | \ |
639 | 2.72M | if (w - j > 4) { \ |
640 | 2.53M | const __m256i res_b = convolve(s + 4, coeffs_y); \ |
641 | 2.53M | const __m256i res_b_round = _mm256_sra_epi32( \ |
642 | 2.53M | _mm256_add_epi32(res_b, round_const_v), round_shift_v); \ |
643 | 2.53M | const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_b_round); \ |
644 | 2.53M | const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const); \ |
645 | 2.53M | \ |
646 | 2.53M | if (do_average) { \ |
647 | 1.11M | const __m256i data_ref_0 = \ |
648 | 1.11M | load_line2_avx2(&dst[i * dst_stride + j], \ |
649 | 1.11M | &dst[i * dst_stride + j + dst_stride]); \ |
650 | 1.11M | const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned, \ |
651 | 1.11M | &wt, use_dist_wtd_comp_avg); \ |
652 | 1.11M | \ |
653 | 1.11M | const __m256i round_result = convolve_rounding( \ |
654 | 1.11M | &comp_avg_res, &offset_const, &rounding_const, rounding_shift); \ |
655 | 1.11M | \ |
656 | 1.11M | const __m256i res_8 = \ |
657 | 1.11M | _mm256_packus_epi16(round_result, round_result); \ |
658 | 1.11M | const __m128i res_0 = _mm256_castsi256_si128(res_8); \ |
659 | 1.11M | const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); \ |
660 | 1.11M | \ |
661 | 1.11M | _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); \ |
662 | 1.11M | _mm_storel_epi64( \ |
663 | 1.11M | (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); \ |
664 | 1.42M | } else { \ |
665 | 1.42M | const __m128i res_0 = _mm256_castsi256_si128(res_unsigned); \ |
666 | 1.42M | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); \ |
667 | 1.42M | \ |
668 | 1.42M | const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1); \ |
669 | 1.42M | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), \ |
670 | 1.42M | res_1); \ |
671 | 1.42M | } \ |
672 | 2.53M | } else { \ |
673 | 180k | const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_a_round); \ |
674 | 180k | const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const); \ |
675 | 180k | \ |
676 | 180k | if (do_average) { \ |
677 | 83.3k | const __m256i data_ref_0 = \ |
678 | 83.3k | load_line2_avx2(&dst[i * dst_stride + j], \ |
679 | 83.3k | &dst[i * dst_stride + j + dst_stride]); \ |
680 | 83.3k | \ |
681 | 83.3k | const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned, \ |
682 | 83.3k | &wt, use_dist_wtd_comp_avg); \ |
683 | 83.3k | \ |
684 | 83.3k | const __m256i round_result = convolve_rounding( \ |
685 | 83.3k | &comp_avg_res, &offset_const, &rounding_const, rounding_shift); \ |
686 | 83.3k | \ |
687 | 83.3k | const __m256i res_8 = \ |
688 | 83.3k | _mm256_packus_epi16(round_result, round_result); \ |
689 | 83.3k | const __m128i res_0 = _mm256_castsi256_si128(res_8); \ |
690 | 83.3k | const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); \ |
691 | 83.3k | \ |
692 | 83.3k | *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0); \ |
693 | 83.3k | *(int *)(&dst0[i * dst_stride0 + j + dst_stride0]) = \ |
694 | 83.3k | _mm_cvtsi128_si32(res_1); \ |
695 | 83.3k | \ |
696 | 97.4k | } else { \ |
697 | 97.4k | const __m128i res_0 = _mm256_castsi256_si128(res_unsigned); \ |
698 | 97.4k | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); \ |
699 | 97.4k | \ |
700 | 97.4k | const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1); \ |
701 | 97.4k | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), \ |
702 | 97.4k | res_1); \ |
703 | 97.4k | } \ |
704 | 180k | } \ |
705 | 2.72M | \ |
706 | 2.72M | s[0] = s[1]; \ |
707 | 2.72M | s[1] = s[2]; \ |
708 | 2.72M | s[2] = s[3]; \ |
709 | 2.72M | \ |
710 | 2.72M | s[4] = s[5]; \ |
711 | 2.72M | s[5] = s[6]; \ |
712 | 2.72M | s[6] = s[7]; \ |
713 | 2.72M | } \ |
714 | 283k | } while (0) |
715 | | |
716 | | static inline void prepare_coeffs_2t_ssse3( |
717 | | const InterpFilterParams *const filter_params, const int32_t subpel_q4, |
718 | 39.2k | __m128i *const coeffs /* [2] */) { |
719 | 39.2k | const int16_t *const filter = av1_get_interp_filter_subpel_kernel( |
720 | 39.2k | filter_params, subpel_q4 & SUBPEL_MASK); |
721 | 39.2k | const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); |
722 | | |
723 | | // right shift all filter co-efficients by 1 to reduce the bits required. |
724 | | // This extra right shift will be taken care of at the end while rounding |
725 | | // the result. |
726 | | // Since all filter co-efficients are even, this change will not affect the |
727 | | // end result |
728 | 39.2k | assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), |
729 | 39.2k | _mm_set1_epi16((short)0xffff))); |
730 | | |
731 | 39.2k | const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1); |
732 | | |
733 | | // coeffs 3 4 3 4 3 4 3 4 |
734 | 39.2k | coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0806u)); |
735 | 39.2k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:prepare_coeffs_2t_ssse3 Unexecuted instantiation: highbd_convolve_avx2.c:prepare_coeffs_2t_ssse3 convolve_2d_avx2.c:prepare_coeffs_2t_ssse3 Line | Count | Source | 718 | 29.2k | __m128i *const coeffs /* [2] */) { | 719 | 29.2k | const int16_t *const filter = av1_get_interp_filter_subpel_kernel( | 720 | 29.2k | filter_params, subpel_q4 & SUBPEL_MASK); | 721 | 29.2k | const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); | 722 | | | 723 | | // right shift all filter co-efficients by 1 to reduce the bits required. | 724 | | // This extra right shift will be taken care of at the end while rounding | 725 | | // the result. | 726 | | // Since all filter co-efficients are even, this change will not affect the | 727 | | // end result | 728 | 29.2k | assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), | 729 | 29.2k | _mm_set1_epi16((short)0xffff))); | 730 | | | 731 | 29.2k | const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1); | 732 | | | 733 | | // coeffs 3 4 3 4 3 4 3 4 | 734 | 29.2k | coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0806u)); | 735 | 29.2k | } |
convolve_avx2.c:prepare_coeffs_2t_ssse3 Line | Count | Source | 718 | 9.95k | __m128i *const coeffs /* [2] */) { | 719 | 9.95k | const int16_t *const filter = av1_get_interp_filter_subpel_kernel( | 720 | 9.95k | filter_params, subpel_q4 & SUBPEL_MASK); | 721 | 9.95k | const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); | 722 | | | 723 | | // right shift all filter co-efficients by 1 to reduce the bits required. | 724 | | // This extra right shift will be taken care of at the end while rounding | 725 | | // the result. | 726 | | // Since all filter co-efficients are even, this change will not affect the | 727 | | // end result | 728 | 9.95k | assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), | 729 | 9.95k | _mm_set1_epi16((short)0xffff))); | 730 | | | 731 | 9.95k | const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1); | 732 | | | 733 | | // coeffs 3 4 3 4 3 4 3 4 | 734 | 9.95k | coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0806u)); | 735 | 9.95k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:prepare_coeffs_2t_ssse3 Unexecuted instantiation: wiener_convolve_avx2.c:prepare_coeffs_2t_ssse3 Unexecuted instantiation: highbd_convolve_2d_avx2.c:prepare_coeffs_2t_ssse3 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:prepare_coeffs_2t_ssse3 |
736 | | |
737 | | static inline void prepare_coeffs_4t_ssse3( |
738 | | const InterpFilterParams *const filter_params, const int32_t subpel_q4, |
739 | 1.08M | __m128i *const coeffs /* [2] */) { |
740 | 1.08M | const int16_t *const filter = av1_get_interp_filter_subpel_kernel( |
741 | 1.08M | filter_params, subpel_q4 & SUBPEL_MASK); |
742 | 1.08M | const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); |
743 | | |
744 | | // right shift all filter co-efficients by 1 to reduce the bits required. |
745 | | // This extra right shift will be taken care of at the end while rounding |
746 | | // the result. |
747 | | // Since all filter co-efficients are even, this change will not affect the |
748 | | // end result |
749 | 1.08M | assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), |
750 | 1.08M | _mm_set1_epi16((short)0xffff))); |
751 | | |
752 | 1.08M | const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1); |
753 | | |
754 | | // coeffs 2 3 2 3 2 3 2 3 |
755 | 1.08M | coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0604u)); |
756 | | // coeffs 4 5 4 5 4 5 4 5 |
757 | 1.08M | coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0a08u)); |
758 | 1.08M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:prepare_coeffs_4t_ssse3 Unexecuted instantiation: highbd_convolve_avx2.c:prepare_coeffs_4t_ssse3 convolve_2d_avx2.c:prepare_coeffs_4t_ssse3 Line | Count | Source | 739 | 801k | __m128i *const coeffs /* [2] */) { | 740 | 801k | const int16_t *const filter = av1_get_interp_filter_subpel_kernel( | 741 | 801k | filter_params, subpel_q4 & SUBPEL_MASK); | 742 | 801k | const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); | 743 | | | 744 | | // right shift all filter co-efficients by 1 to reduce the bits required. | 745 | | // This extra right shift will be taken care of at the end while rounding | 746 | | // the result. | 747 | | // Since all filter co-efficients are even, this change will not affect the | 748 | | // end result | 749 | 801k | assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), | 750 | 801k | _mm_set1_epi16((short)0xffff))); | 751 | | | 752 | 801k | const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1); | 753 | | | 754 | | // coeffs 2 3 2 3 2 3 2 3 | 755 | 801k | coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0604u)); | 756 | | // coeffs 4 5 4 5 4 5 4 5 | 757 | 801k | coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0a08u)); | 758 | 801k | } |
convolve_avx2.c:prepare_coeffs_4t_ssse3 Line | Count | Source | 739 | 285k | __m128i *const coeffs /* [2] */) { | 740 | 285k | const int16_t *const filter = av1_get_interp_filter_subpel_kernel( | 741 | 285k | filter_params, subpel_q4 & SUBPEL_MASK); | 742 | 285k | const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); | 743 | | | 744 | | // right shift all filter co-efficients by 1 to reduce the bits required. | 745 | | // This extra right shift will be taken care of at the end while rounding | 746 | | // the result. | 747 | | // Since all filter co-efficients are even, this change will not affect the | 748 | | // end result | 749 | 285k | assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), | 750 | 285k | _mm_set1_epi16((short)0xffff))); | 751 | | | 752 | 285k | const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1); | 753 | | | 754 | | // coeffs 2 3 2 3 2 3 2 3 | 755 | 285k | coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0604u)); | 756 | | // coeffs 4 5 4 5 4 5 4 5 | 757 | 285k | coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0a08u)); | 758 | 285k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:prepare_coeffs_4t_ssse3 Unexecuted instantiation: wiener_convolve_avx2.c:prepare_coeffs_4t_ssse3 Unexecuted instantiation: highbd_convolve_2d_avx2.c:prepare_coeffs_4t_ssse3 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:prepare_coeffs_4t_ssse3 |
759 | | |
760 | | static inline void prepare_coeffs_2t_lowbd( |
761 | | const InterpFilterParams *const filter_params, const int subpel_q4, |
762 | 29.9k | __m256i *const coeffs /* [4] */) { |
763 | 29.9k | const int16_t *const filter = av1_get_interp_filter_subpel_kernel( |
764 | 29.9k | filter_params, subpel_q4 & SUBPEL_MASK); |
765 | 29.9k | const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); |
766 | 29.9k | const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8); |
767 | | |
768 | | // right shift all filter co-efficients by 1 to reduce the bits required. |
769 | | // This extra right shift will be taken care of at the end while rounding |
770 | | // the result. |
771 | | // Since all filter co-efficients are even, this change will not affect the |
772 | | // end result |
773 | 29.9k | assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), |
774 | 29.9k | _mm_set1_epi16((int16_t)0xffff))); |
775 | | |
776 | 29.9k | const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1); |
777 | | |
778 | | // coeffs 3 4 3 4 3 4 3 4 |
779 | 29.9k | coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0806u)); |
780 | 29.9k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:prepare_coeffs_2t_lowbd Unexecuted instantiation: highbd_convolve_avx2.c:prepare_coeffs_2t_lowbd convolve_2d_avx2.c:prepare_coeffs_2t_lowbd Line | Count | Source | 762 | 26.3k | __m256i *const coeffs /* [4] */) { | 763 | 26.3k | const int16_t *const filter = av1_get_interp_filter_subpel_kernel( | 764 | 26.3k | filter_params, subpel_q4 & SUBPEL_MASK); | 765 | 26.3k | const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); | 766 | 26.3k | const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8); | 767 | | | 768 | | // right shift all filter co-efficients by 1 to reduce the bits required. | 769 | | // This extra right shift will be taken care of at the end while rounding | 770 | | // the result. | 771 | | // Since all filter co-efficients are even, this change will not affect the | 772 | | // end result | 773 | 26.3k | assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), | 774 | 26.3k | _mm_set1_epi16((int16_t)0xffff))); | 775 | | | 776 | 26.3k | const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1); | 777 | | | 778 | | // coeffs 3 4 3 4 3 4 3 4 | 779 | 26.3k | coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0806u)); | 780 | 26.3k | } |
convolve_avx2.c:prepare_coeffs_2t_lowbd Line | Count | Source | 762 | 3.60k | __m256i *const coeffs /* [4] */) { | 763 | 3.60k | const int16_t *const filter = av1_get_interp_filter_subpel_kernel( | 764 | 3.60k | filter_params, subpel_q4 & SUBPEL_MASK); | 765 | 3.60k | const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); | 766 | 3.60k | const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8); | 767 | | | 768 | | // right shift all filter co-efficients by 1 to reduce the bits required. | 769 | | // This extra right shift will be taken care of at the end while rounding | 770 | | // the result. | 771 | | // Since all filter co-efficients are even, this change will not affect the | 772 | | // end result | 773 | 3.60k | assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), | 774 | 3.60k | _mm_set1_epi16((int16_t)0xffff))); | 775 | | | 776 | 3.60k | const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1); | 777 | | | 778 | | // coeffs 3 4 3 4 3 4 3 4 | 779 | 3.60k | coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0806u)); | 780 | 3.60k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:prepare_coeffs_2t_lowbd Unexecuted instantiation: wiener_convolve_avx2.c:prepare_coeffs_2t_lowbd Unexecuted instantiation: highbd_convolve_2d_avx2.c:prepare_coeffs_2t_lowbd Unexecuted instantiation: highbd_jnt_convolve_avx2.c:prepare_coeffs_2t_lowbd |
781 | | |
782 | | static inline void prepare_coeffs_4t_lowbd( |
783 | | const InterpFilterParams *const filter_params, const int subpel_q4, |
784 | 72.4k | __m256i *const coeffs /* [4] */) { |
785 | 72.4k | const int16_t *const filter = av1_get_interp_filter_subpel_kernel( |
786 | 72.4k | filter_params, subpel_q4 & SUBPEL_MASK); |
787 | 72.4k | const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); |
788 | 72.4k | const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8); |
789 | | |
790 | | // right shift all filter co-efficients by 1 to reduce the bits required. |
791 | | // This extra right shift will be taken care of at the end while rounding |
792 | | // the result. |
793 | | // Since all filter co-efficients are even, this change will not affect the |
794 | | // end result |
795 | 72.4k | assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), |
796 | 72.4k | _mm_set1_epi16((short)0xffff))); |
797 | | |
798 | 72.4k | const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1); |
799 | | |
800 | | // coeffs 2 3 2 3 2 3 2 3 |
801 | 72.4k | coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0604u)); |
802 | | // coeffs 4 5 4 5 4 5 4 5 |
803 | 72.4k | coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0a08u)); |
804 | 72.4k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:prepare_coeffs_4t_lowbd Unexecuted instantiation: highbd_convolve_avx2.c:prepare_coeffs_4t_lowbd convolve_2d_avx2.c:prepare_coeffs_4t_lowbd Line | Count | Source | 784 | 72.4k | __m256i *const coeffs /* [4] */) { | 785 | 72.4k | const int16_t *const filter = av1_get_interp_filter_subpel_kernel( | 786 | 72.4k | filter_params, subpel_q4 & SUBPEL_MASK); | 787 | 72.4k | const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); | 788 | 72.4k | const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8); | 789 | | | 790 | | // right shift all filter co-efficients by 1 to reduce the bits required. | 791 | | // This extra right shift will be taken care of at the end while rounding | 792 | | // the result. | 793 | | // Since all filter co-efficients are even, this change will not affect the | 794 | | // end result | 795 | 72.4k | assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), | 796 | 72.4k | _mm_set1_epi16((short)0xffff))); | 797 | | | 798 | 72.4k | const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1); | 799 | | | 800 | | // coeffs 2 3 2 3 2 3 2 3 | 801 | 72.4k | coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0604u)); | 802 | | // coeffs 4 5 4 5 4 5 4 5 | 803 | 72.4k | coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0a08u)); | 804 | 72.4k | } |
Unexecuted instantiation: convolve_avx2.c:prepare_coeffs_4t_lowbd Unexecuted instantiation: jnt_convolve_avx2.c:prepare_coeffs_4t_lowbd Unexecuted instantiation: wiener_convolve_avx2.c:prepare_coeffs_4t_lowbd Unexecuted instantiation: highbd_convolve_2d_avx2.c:prepare_coeffs_4t_lowbd Unexecuted instantiation: highbd_jnt_convolve_avx2.c:prepare_coeffs_4t_lowbd |
805 | | |
806 | | static inline void prepare_coeffs_6t_lowbd( |
807 | | const InterpFilterParams *const filter_params, const int subpel_q4, |
808 | 1.25M | __m256i *const coeffs /* [4] */) { |
809 | 1.25M | const int16_t *const filter = av1_get_interp_filter_subpel_kernel( |
810 | 1.25M | filter_params, subpel_q4 & SUBPEL_MASK); |
811 | 1.25M | const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); |
812 | 1.25M | const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8); |
813 | | |
814 | | // right shift all filter co-efficients by 1 to reduce the bits required. |
815 | | // This extra right shift will be taken care of at the end while rounding |
816 | | // the result. |
817 | | // Since all filter co-efficients are even, this change will not affect the |
818 | | // end result |
819 | 1.25M | assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), |
820 | 1.25M | _mm_set1_epi16((int16_t)0xffff))); |
821 | | |
822 | 1.25M | const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1); |
823 | | |
824 | | // coeffs 1 2 1 2 1 2 1 2 |
825 | 1.25M | coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0402u)); |
826 | | // coeffs 3 4 3 4 3 4 3 4 |
827 | 1.25M | coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0806u)); |
828 | | // coeffs 5 6 5 6 5 6 5 6 |
829 | 1.25M | coeffs[2] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0c0au)); |
830 | 1.25M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:prepare_coeffs_6t_lowbd Unexecuted instantiation: highbd_convolve_avx2.c:prepare_coeffs_6t_lowbd convolve_2d_avx2.c:prepare_coeffs_6t_lowbd Line | Count | Source | 808 | 884k | __m256i *const coeffs /* [4] */) { | 809 | 884k | const int16_t *const filter = av1_get_interp_filter_subpel_kernel( | 810 | 884k | filter_params, subpel_q4 & SUBPEL_MASK); | 811 | 884k | const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); | 812 | 884k | const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8); | 813 | | | 814 | | // right shift all filter co-efficients by 1 to reduce the bits required. | 815 | | // This extra right shift will be taken care of at the end while rounding | 816 | | // the result. | 817 | | // Since all filter co-efficients are even, this change will not affect the | 818 | | // end result | 819 | 884k | assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), | 820 | 884k | _mm_set1_epi16((int16_t)0xffff))); | 821 | | | 822 | 884k | const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1); | 823 | | | 824 | | // coeffs 1 2 1 2 1 2 1 2 | 825 | 884k | coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0402u)); | 826 | | // coeffs 3 4 3 4 3 4 3 4 | 827 | 884k | coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0806u)); | 828 | | // coeffs 5 6 5 6 5 6 5 6 | 829 | 884k | coeffs[2] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0c0au)); | 830 | 884k | } |
convolve_avx2.c:prepare_coeffs_6t_lowbd Line | Count | Source | 808 | 373k | __m256i *const coeffs /* [4] */) { | 809 | 373k | const int16_t *const filter = av1_get_interp_filter_subpel_kernel( | 810 | 373k | filter_params, subpel_q4 & SUBPEL_MASK); | 811 | 373k | const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); | 812 | 373k | const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8); | 813 | | | 814 | | // right shift all filter co-efficients by 1 to reduce the bits required. | 815 | | // This extra right shift will be taken care of at the end while rounding | 816 | | // the result. | 817 | | // Since all filter co-efficients are even, this change will not affect the | 818 | | // end result | 819 | 373k | assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), | 820 | 373k | _mm_set1_epi16((int16_t)0xffff))); | 821 | | | 822 | 373k | const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1); | 823 | | | 824 | | // coeffs 1 2 1 2 1 2 1 2 | 825 | 373k | coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0402u)); | 826 | | // coeffs 3 4 3 4 3 4 3 4 | 827 | 373k | coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0806u)); | 828 | | // coeffs 5 6 5 6 5 6 5 6 | 829 | 373k | coeffs[2] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0c0au)); | 830 | 373k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:prepare_coeffs_6t_lowbd Unexecuted instantiation: wiener_convolve_avx2.c:prepare_coeffs_6t_lowbd Unexecuted instantiation: highbd_convolve_2d_avx2.c:prepare_coeffs_6t_lowbd Unexecuted instantiation: highbd_jnt_convolve_avx2.c:prepare_coeffs_6t_lowbd |
831 | | |
832 | | static inline void prepare_coeffs_lowbd( |
833 | | const InterpFilterParams *const filter_params, const int subpel_q4, |
834 | 494k | __m256i *const coeffs /* [4] */) { |
835 | 494k | const int16_t *const filter = av1_get_interp_filter_subpel_kernel( |
836 | 494k | filter_params, subpel_q4 & SUBPEL_MASK); |
837 | 494k | const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); |
838 | 494k | const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8); |
839 | | |
840 | | // right shift all filter co-efficients by 1 to reduce the bits required. |
841 | | // This extra right shift will be taken care of at the end while rounding |
842 | | // the result. |
843 | | // Since all filter co-efficients are even, this change will not affect the |
844 | | // end result |
845 | 494k | assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), |
846 | 494k | _mm_set1_epi16((short)0xffff))); |
847 | | |
848 | 494k | const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1); |
849 | | |
850 | | // coeffs 0 1 0 1 0 1 0 1 |
851 | 494k | coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0200u)); |
852 | | // coeffs 2 3 2 3 2 3 2 3 |
853 | 494k | coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0604u)); |
854 | | // coeffs 4 5 4 5 4 5 4 5 |
855 | 494k | coeffs[2] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0a08u)); |
856 | | // coeffs 6 7 6 7 6 7 6 7 |
857 | 494k | coeffs[3] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0e0cu)); |
858 | 494k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:prepare_coeffs_lowbd Unexecuted instantiation: highbd_convolve_avx2.c:prepare_coeffs_lowbd convolve_2d_avx2.c:prepare_coeffs_lowbd Line | Count | Source | 834 | 74.0k | __m256i *const coeffs /* [4] */) { | 835 | 74.0k | const int16_t *const filter = av1_get_interp_filter_subpel_kernel( | 836 | 74.0k | filter_params, subpel_q4 & SUBPEL_MASK); | 837 | 74.0k | const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); | 838 | 74.0k | const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8); | 839 | | | 840 | | // right shift all filter co-efficients by 1 to reduce the bits required. | 841 | | // This extra right shift will be taken care of at the end while rounding | 842 | | // the result. | 843 | | // Since all filter co-efficients are even, this change will not affect the | 844 | | // end result | 845 | 74.0k | assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), | 846 | 74.0k | _mm_set1_epi16((short)0xffff))); | 847 | | | 848 | 74.0k | const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1); | 849 | | | 850 | | // coeffs 0 1 0 1 0 1 0 1 | 851 | 74.0k | coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0200u)); | 852 | | // coeffs 2 3 2 3 2 3 2 3 | 853 | 74.0k | coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0604u)); | 854 | | // coeffs 4 5 4 5 4 5 4 5 | 855 | 74.0k | coeffs[2] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0a08u)); | 856 | | // coeffs 6 7 6 7 6 7 6 7 | 857 | 74.0k | coeffs[3] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0e0cu)); | 858 | 74.0k | } |
convolve_avx2.c:prepare_coeffs_lowbd Line | Count | Source | 834 | 44.1k | __m256i *const coeffs /* [4] */) { | 835 | 44.1k | const int16_t *const filter = av1_get_interp_filter_subpel_kernel( | 836 | 44.1k | filter_params, subpel_q4 & SUBPEL_MASK); | 837 | 44.1k | const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); | 838 | 44.1k | const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8); | 839 | | | 840 | | // right shift all filter co-efficients by 1 to reduce the bits required. | 841 | | // This extra right shift will be taken care of at the end while rounding | 842 | | // the result. | 843 | | // Since all filter co-efficients are even, this change will not affect the | 844 | | // end result | 845 | 44.1k | assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), | 846 | 44.1k | _mm_set1_epi16((short)0xffff))); | 847 | | | 848 | 44.1k | const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1); | 849 | | | 850 | | // coeffs 0 1 0 1 0 1 0 1 | 851 | 44.1k | coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0200u)); | 852 | | // coeffs 2 3 2 3 2 3 2 3 | 853 | 44.1k | coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0604u)); | 854 | | // coeffs 4 5 4 5 4 5 4 5 | 855 | 44.1k | coeffs[2] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0a08u)); | 856 | | // coeffs 6 7 6 7 6 7 6 7 | 857 | 44.1k | coeffs[3] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0e0cu)); | 858 | 44.1k | } |
jnt_convolve_avx2.c:prepare_coeffs_lowbd Line | Count | Source | 834 | 375k | __m256i *const coeffs /* [4] */) { | 835 | 375k | const int16_t *const filter = av1_get_interp_filter_subpel_kernel( | 836 | 375k | filter_params, subpel_q4 & SUBPEL_MASK); | 837 | 375k | const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); | 838 | 375k | const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8); | 839 | | | 840 | | // right shift all filter co-efficients by 1 to reduce the bits required. | 841 | | // This extra right shift will be taken care of at the end while rounding | 842 | | // the result. | 843 | | // Since all filter co-efficients are even, this change will not affect the | 844 | | // end result | 845 | 375k | assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), | 846 | 375k | _mm_set1_epi16((short)0xffff))); | 847 | | | 848 | 375k | const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1); | 849 | | | 850 | | // coeffs 0 1 0 1 0 1 0 1 | 851 | 375k | coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0200u)); | 852 | | // coeffs 2 3 2 3 2 3 2 3 | 853 | 375k | coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0604u)); | 854 | | // coeffs 4 5 4 5 4 5 4 5 | 855 | 375k | coeffs[2] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0a08u)); | 856 | | // coeffs 6 7 6 7 6 7 6 7 | 857 | 375k | coeffs[3] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0e0cu)); | 858 | 375k | } |
Unexecuted instantiation: wiener_convolve_avx2.c:prepare_coeffs_lowbd Unexecuted instantiation: highbd_convolve_2d_avx2.c:prepare_coeffs_lowbd Unexecuted instantiation: highbd_jnt_convolve_avx2.c:prepare_coeffs_lowbd |
859 | | |
860 | | static inline void prepare_coeffs_2t( |
861 | | const InterpFilterParams *const filter_params, const int subpel_q4, |
862 | 55.6k | __m256i *const coeffs /* [4] */) { |
863 | 55.6k | const int16_t *filter = av1_get_interp_filter_subpel_kernel( |
864 | 55.6k | filter_params, subpel_q4 & SUBPEL_MASK); |
865 | | |
866 | 55.6k | const __m128i coeff_8 = _mm_loadu_si128((__m128i *)(filter + 1)); |
867 | 55.6k | const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8); |
868 | | |
869 | | // coeffs 3 4 3 4 3 4 3 4 |
870 | 55.6k | coeffs[0] = _mm256_shuffle_epi32(coeff, 0x55); |
871 | 55.6k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:prepare_coeffs_2t Unexecuted instantiation: highbd_convolve_avx2.c:prepare_coeffs_2t convolve_2d_avx2.c:prepare_coeffs_2t Line | Count | Source | 862 | 55.6k | __m256i *const coeffs /* [4] */) { | 863 | 55.6k | const int16_t *filter = av1_get_interp_filter_subpel_kernel( | 864 | 55.6k | filter_params, subpel_q4 & SUBPEL_MASK); | 865 | | | 866 | 55.6k | const __m128i coeff_8 = _mm_loadu_si128((__m128i *)(filter + 1)); | 867 | 55.6k | const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8); | 868 | | | 869 | | // coeffs 3 4 3 4 3 4 3 4 | 870 | | coeffs[0] = _mm256_shuffle_epi32(coeff, 0x55); | 871 | 55.6k | } |
Unexecuted instantiation: convolve_avx2.c:prepare_coeffs_2t Unexecuted instantiation: jnt_convolve_avx2.c:prepare_coeffs_2t Unexecuted instantiation: wiener_convolve_avx2.c:prepare_coeffs_2t Unexecuted instantiation: highbd_convolve_2d_avx2.c:prepare_coeffs_2t Unexecuted instantiation: highbd_jnt_convolve_avx2.c:prepare_coeffs_2t |
872 | | |
873 | | static inline void prepare_coeffs_4t( |
874 | | const InterpFilterParams *const filter_params, const int subpel_q4, |
875 | 1.00M | __m256i *const coeffs /* [4] */) { |
876 | 1.00M | const int16_t *filter = av1_get_interp_filter_subpel_kernel( |
877 | 1.00M | filter_params, subpel_q4 & SUBPEL_MASK); |
878 | | |
879 | 1.00M | const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter); |
880 | 1.00M | const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8); |
881 | | // coeffs 2 3 2 3 2 3 2 3 |
882 | 1.00M | coeffs[0] = _mm256_shuffle_epi32(coeff, 0x55); |
883 | | // coeffs 4 5 4 5 4 5 4 5 |
884 | 1.00M | coeffs[1] = _mm256_shuffle_epi32(coeff, 0xaa); |
885 | 1.00M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:prepare_coeffs_4t Unexecuted instantiation: highbd_convolve_avx2.c:prepare_coeffs_4t convolve_2d_avx2.c:prepare_coeffs_4t Line | Count | Source | 875 | 1.00M | __m256i *const coeffs /* [4] */) { | 876 | 1.00M | const int16_t *filter = av1_get_interp_filter_subpel_kernel( | 877 | 1.00M | filter_params, subpel_q4 & SUBPEL_MASK); | 878 | | | 879 | 1.00M | const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter); | 880 | 1.00M | const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8); | 881 | | // coeffs 2 3 2 3 2 3 2 3 | 882 | 1.00M | coeffs[0] = _mm256_shuffle_epi32(coeff, 0x55); | 883 | | // coeffs 4 5 4 5 4 5 4 5 | 884 | | coeffs[1] = _mm256_shuffle_epi32(coeff, 0xaa); | 885 | 1.00M | } |
Unexecuted instantiation: convolve_avx2.c:prepare_coeffs_4t Unexecuted instantiation: jnt_convolve_avx2.c:prepare_coeffs_4t Unexecuted instantiation: wiener_convolve_avx2.c:prepare_coeffs_4t Unexecuted instantiation: highbd_convolve_2d_avx2.c:prepare_coeffs_4t Unexecuted instantiation: highbd_jnt_convolve_avx2.c:prepare_coeffs_4t |
886 | | |
887 | | static inline void prepare_coeffs_6t( |
888 | | const InterpFilterParams *const filter_params, const int subpel_q4, |
889 | 760k | __m256i *const coeffs /* [4] */) { |
890 | 760k | const int16_t *filter = av1_get_interp_filter_subpel_kernel( |
891 | 760k | filter_params, subpel_q4 & SUBPEL_MASK); |
892 | | |
893 | 760k | const __m128i coeff_8 = _mm_loadu_si128((__m128i *)(filter + 1)); |
894 | 760k | const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8); |
895 | | |
896 | | // coeffs 1 2 1 2 1 2 1 2 |
897 | 760k | coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00); |
898 | | // coeffs 3 4 3 4 3 4 3 4 |
899 | 760k | coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55); |
900 | | // coeffs 5 6 5 6 5 6 5 6 |
901 | 760k | coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa); |
902 | 760k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:prepare_coeffs_6t Unexecuted instantiation: highbd_convolve_avx2.c:prepare_coeffs_6t convolve_2d_avx2.c:prepare_coeffs_6t Line | Count | Source | 889 | 760k | __m256i *const coeffs /* [4] */) { | 890 | 760k | const int16_t *filter = av1_get_interp_filter_subpel_kernel( | 891 | 760k | filter_params, subpel_q4 & SUBPEL_MASK); | 892 | | | 893 | 760k | const __m128i coeff_8 = _mm_loadu_si128((__m128i *)(filter + 1)); | 894 | 760k | const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8); | 895 | | | 896 | | // coeffs 1 2 1 2 1 2 1 2 | 897 | 760k | coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00); | 898 | | // coeffs 3 4 3 4 3 4 3 4 | 899 | 760k | coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55); | 900 | | // coeffs 5 6 5 6 5 6 5 6 | 901 | | coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa); | 902 | 760k | } |
Unexecuted instantiation: convolve_avx2.c:prepare_coeffs_6t Unexecuted instantiation: jnt_convolve_avx2.c:prepare_coeffs_6t Unexecuted instantiation: wiener_convolve_avx2.c:prepare_coeffs_6t Unexecuted instantiation: highbd_convolve_2d_avx2.c:prepare_coeffs_6t Unexecuted instantiation: highbd_jnt_convolve_avx2.c:prepare_coeffs_6t |
903 | | |
904 | | static inline void prepare_coeffs(const InterpFilterParams *const filter_params, |
905 | | const int subpel_q4, |
906 | 7.56M | __m256i *const coeffs /* [4] */) { |
907 | 7.56M | const int16_t *filter = av1_get_interp_filter_subpel_kernel( |
908 | 7.56M | filter_params, subpel_q4 & SUBPEL_MASK); |
909 | | |
910 | 7.56M | const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter); |
911 | 7.56M | const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8); |
912 | | |
913 | | // coeffs 0 1 0 1 0 1 0 1 |
914 | 7.56M | coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00); |
915 | | // coeffs 2 3 2 3 2 3 2 3 |
916 | 7.56M | coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55); |
917 | | // coeffs 4 5 4 5 4 5 4 5 |
918 | 7.56M | coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa); |
919 | | // coeffs 6 7 6 7 6 7 6 7 |
920 | 7.56M | coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff); |
921 | 7.56M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:prepare_coeffs highbd_convolve_avx2.c:prepare_coeffs Line | Count | Source | 906 | 1.49M | __m256i *const coeffs /* [4] */) { | 907 | 1.49M | const int16_t *filter = av1_get_interp_filter_subpel_kernel( | 908 | 1.49M | filter_params, subpel_q4 & SUBPEL_MASK); | 909 | | | 910 | 1.49M | const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter); | 911 | 1.49M | const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8); | 912 | | | 913 | | // coeffs 0 1 0 1 0 1 0 1 | 914 | 1.49M | coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00); | 915 | | // coeffs 2 3 2 3 2 3 2 3 | 916 | 1.49M | coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55); | 917 | | // coeffs 4 5 4 5 4 5 4 5 | 918 | 1.49M | coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa); | 919 | | // coeffs 6 7 6 7 6 7 6 7 | 920 | | coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff); | 921 | 1.49M | } |
convolve_2d_avx2.c:prepare_coeffs Line | Count | Source | 906 | 69.5k | __m256i *const coeffs /* [4] */) { | 907 | 69.5k | const int16_t *filter = av1_get_interp_filter_subpel_kernel( | 908 | 69.5k | filter_params, subpel_q4 & SUBPEL_MASK); | 909 | | | 910 | 69.5k | const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter); | 911 | 69.5k | const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8); | 912 | | | 913 | | // coeffs 0 1 0 1 0 1 0 1 | 914 | 69.5k | coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00); | 915 | | // coeffs 2 3 2 3 2 3 2 3 | 916 | 69.5k | coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55); | 917 | | // coeffs 4 5 4 5 4 5 4 5 | 918 | 69.5k | coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa); | 919 | | // coeffs 6 7 6 7 6 7 6 7 | 920 | | coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff); | 921 | 69.5k | } |
Unexecuted instantiation: convolve_avx2.c:prepare_coeffs jnt_convolve_avx2.c:prepare_coeffs Line | Count | Source | 906 | 194k | __m256i *const coeffs /* [4] */) { | 907 | 194k | const int16_t *filter = av1_get_interp_filter_subpel_kernel( | 908 | 194k | filter_params, subpel_q4 & SUBPEL_MASK); | 909 | | | 910 | 194k | const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter); | 911 | 194k | const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8); | 912 | | | 913 | | // coeffs 0 1 0 1 0 1 0 1 | 914 | 194k | coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00); | 915 | | // coeffs 2 3 2 3 2 3 2 3 | 916 | 194k | coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55); | 917 | | // coeffs 4 5 4 5 4 5 4 5 | 918 | 194k | coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa); | 919 | | // coeffs 6 7 6 7 6 7 6 7 | 920 | | coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff); | 921 | 194k | } |
Unexecuted instantiation: wiener_convolve_avx2.c:prepare_coeffs highbd_convolve_2d_avx2.c:prepare_coeffs Line | Count | Source | 906 | 4.73M | __m256i *const coeffs /* [4] */) { | 907 | 4.73M | const int16_t *filter = av1_get_interp_filter_subpel_kernel( | 908 | 4.73M | filter_params, subpel_q4 & SUBPEL_MASK); | 909 | | | 910 | 4.73M | const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter); | 911 | 4.73M | const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8); | 912 | | | 913 | | // coeffs 0 1 0 1 0 1 0 1 | 914 | 4.73M | coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00); | 915 | | // coeffs 2 3 2 3 2 3 2 3 | 916 | 4.73M | coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55); | 917 | | // coeffs 4 5 4 5 4 5 4 5 | 918 | 4.73M | coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa); | 919 | | // coeffs 6 7 6 7 6 7 6 7 | 920 | | coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff); | 921 | 4.73M | } |
highbd_jnt_convolve_avx2.c:prepare_coeffs Line | Count | Source | 906 | 1.06M | __m256i *const coeffs /* [4] */) { | 907 | 1.06M | const int16_t *filter = av1_get_interp_filter_subpel_kernel( | 908 | 1.06M | filter_params, subpel_q4 & SUBPEL_MASK); | 909 | | | 910 | 1.06M | const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter); | 911 | 1.06M | const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8); | 912 | | | 913 | | // coeffs 0 1 0 1 0 1 0 1 | 914 | 1.06M | coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00); | 915 | | // coeffs 2 3 2 3 2 3 2 3 | 916 | 1.06M | coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55); | 917 | | // coeffs 4 5 4 5 4 5 4 5 | 918 | 1.06M | coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa); | 919 | | // coeffs 6 7 6 7 6 7 6 7 | 920 | | coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff); | 921 | 1.06M | } |
|
922 | | |
923 | | static inline void prepare_coeffs_12taps( |
924 | | const InterpFilterParams *const filter_params, const int subpel_q4, |
925 | 0 | __m256i *const coeffs /* [4] */) { |
926 | 0 | const int16_t *filter = av1_get_interp_filter_subpel_kernel( |
927 | 0 | filter_params, subpel_q4 & SUBPEL_MASK); |
928 | |
|
929 | 0 | __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter); |
930 | 0 | __m256i coeff = _mm256_broadcastsi128_si256(coeff_8); |
931 | | |
932 | | // coeffs 0 1 0 1 0 1 0 1 |
933 | 0 | coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00); |
934 | | // coeffs 2 3 2 3 2 3 2 3 |
935 | 0 | coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55); |
936 | | // coeffs 4 5 4 5 4 5 4 5 |
937 | 0 | coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa); |
938 | | // coeffs 6 7 6 7 6 7 6 7 |
939 | 0 | coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff); |
940 | | // coeffs 8 9 10 11 0 0 0 0 |
941 | 0 | coeff_8 = _mm_loadl_epi64((__m128i *)(filter + 8)); |
942 | 0 | coeff = _mm256_broadcastq_epi64(coeff_8); |
943 | 0 | coeffs[4] = _mm256_shuffle_epi32(coeff, 0x00); // coeffs 8 9 8 9 8 9 8 9 |
944 | 0 | coeffs[5] = _mm256_shuffle_epi32(coeff, 0x55); // coeffs 10 11 10 11.. 10 11 |
945 | 0 | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:prepare_coeffs_12taps Unexecuted instantiation: highbd_convolve_avx2.c:prepare_coeffs_12taps Unexecuted instantiation: convolve_2d_avx2.c:prepare_coeffs_12taps Unexecuted instantiation: convolve_avx2.c:prepare_coeffs_12taps Unexecuted instantiation: jnt_convolve_avx2.c:prepare_coeffs_12taps Unexecuted instantiation: wiener_convolve_avx2.c:prepare_coeffs_12taps Unexecuted instantiation: highbd_convolve_2d_avx2.c:prepare_coeffs_12taps Unexecuted instantiation: highbd_jnt_convolve_avx2.c:prepare_coeffs_12taps |
946 | | |
947 | | static inline __m128i convolve_lowbd_4tap_ssse3(const __m128i ss[2], |
948 | 5.23M | const __m128i coeffs[2]) { |
949 | 5.23M | const __m128i res_01 = _mm_maddubs_epi16(ss[0], coeffs[0]); |
950 | 5.23M | const __m128i res_23 = _mm_maddubs_epi16(ss[1], coeffs[1]); |
951 | | |
952 | 5.23M | return _mm_add_epi16(res_01, res_23); |
953 | 5.23M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_lowbd_4tap_ssse3 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_lowbd_4tap_ssse3 convolve_2d_avx2.c:convolve_lowbd_4tap_ssse3 Line | Count | Source | 948 | 4.32M | const __m128i coeffs[2]) { | 949 | 4.32M | const __m128i res_01 = _mm_maddubs_epi16(ss[0], coeffs[0]); | 950 | 4.32M | const __m128i res_23 = _mm_maddubs_epi16(ss[1], coeffs[1]); | 951 | | | 952 | 4.32M | return _mm_add_epi16(res_01, res_23); | 953 | 4.32M | } |
convolve_avx2.c:convolve_lowbd_4tap_ssse3 Line | Count | Source | 948 | 914k | const __m128i coeffs[2]) { | 949 | 914k | const __m128i res_01 = _mm_maddubs_epi16(ss[0], coeffs[0]); | 950 | 914k | const __m128i res_23 = _mm_maddubs_epi16(ss[1], coeffs[1]); | 951 | | | 952 | 914k | return _mm_add_epi16(res_01, res_23); | 953 | 914k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_lowbd_4tap_ssse3 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_lowbd_4tap_ssse3 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_lowbd_4tap_ssse3 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_lowbd_4tap_ssse3 |
954 | | |
955 | | static inline __m256i convolve_lowbd(const __m256i *const s, |
956 | 22.8M | const __m256i *const coeffs) { |
957 | 22.8M | const __m256i res_01 = _mm256_maddubs_epi16(s[0], coeffs[0]); |
958 | 22.8M | const __m256i res_23 = _mm256_maddubs_epi16(s[1], coeffs[1]); |
959 | 22.8M | const __m256i res_45 = _mm256_maddubs_epi16(s[2], coeffs[2]); |
960 | 22.8M | const __m256i res_67 = _mm256_maddubs_epi16(s[3], coeffs[3]); |
961 | | |
962 | | // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 |
963 | 22.8M | const __m256i res = _mm256_add_epi16(_mm256_add_epi16(res_01, res_45), |
964 | 22.8M | _mm256_add_epi16(res_23, res_67)); |
965 | | |
966 | 22.8M | return res; |
967 | 22.8M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_lowbd Unexecuted instantiation: highbd_convolve_avx2.c:convolve_lowbd convolve_2d_avx2.c:convolve_lowbd Line | Count | Source | 956 | 3.59M | const __m256i *const coeffs) { | 957 | 3.59M | const __m256i res_01 = _mm256_maddubs_epi16(s[0], coeffs[0]); | 958 | 3.59M | const __m256i res_23 = _mm256_maddubs_epi16(s[1], coeffs[1]); | 959 | 3.59M | const __m256i res_45 = _mm256_maddubs_epi16(s[2], coeffs[2]); | 960 | 3.59M | const __m256i res_67 = _mm256_maddubs_epi16(s[3], coeffs[3]); | 961 | | | 962 | | // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | 963 | 3.59M | const __m256i res = _mm256_add_epi16(_mm256_add_epi16(res_01, res_45), | 964 | 3.59M | _mm256_add_epi16(res_23, res_67)); | 965 | | | 966 | 3.59M | return res; | 967 | 3.59M | } |
convolve_avx2.c:convolve_lowbd Line | Count | Source | 956 | 440k | const __m256i *const coeffs) { | 957 | 440k | const __m256i res_01 = _mm256_maddubs_epi16(s[0], coeffs[0]); | 958 | 440k | const __m256i res_23 = _mm256_maddubs_epi16(s[1], coeffs[1]); | 959 | 440k | const __m256i res_45 = _mm256_maddubs_epi16(s[2], coeffs[2]); | 960 | 440k | const __m256i res_67 = _mm256_maddubs_epi16(s[3], coeffs[3]); | 961 | | | 962 | | // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | 963 | 440k | const __m256i res = _mm256_add_epi16(_mm256_add_epi16(res_01, res_45), | 964 | 440k | _mm256_add_epi16(res_23, res_67)); | 965 | | | 966 | 440k | return res; | 967 | 440k | } |
jnt_convolve_avx2.c:convolve_lowbd Line | Count | Source | 956 | 5.40M | const __m256i *const coeffs) { | 957 | 5.40M | const __m256i res_01 = _mm256_maddubs_epi16(s[0], coeffs[0]); | 958 | 5.40M | const __m256i res_23 = _mm256_maddubs_epi16(s[1], coeffs[1]); | 959 | 5.40M | const __m256i res_45 = _mm256_maddubs_epi16(s[2], coeffs[2]); | 960 | 5.40M | const __m256i res_67 = _mm256_maddubs_epi16(s[3], coeffs[3]); | 961 | | | 962 | | // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | 963 | 5.40M | const __m256i res = _mm256_add_epi16(_mm256_add_epi16(res_01, res_45), | 964 | 5.40M | _mm256_add_epi16(res_23, res_67)); | 965 | | | 966 | 5.40M | return res; | 967 | 5.40M | } |
wiener_convolve_avx2.c:convolve_lowbd Line | Count | Source | 956 | 13.4M | const __m256i *const coeffs) { | 957 | 13.4M | const __m256i res_01 = _mm256_maddubs_epi16(s[0], coeffs[0]); | 958 | 13.4M | const __m256i res_23 = _mm256_maddubs_epi16(s[1], coeffs[1]); | 959 | 13.4M | const __m256i res_45 = _mm256_maddubs_epi16(s[2], coeffs[2]); | 960 | 13.4M | const __m256i res_67 = _mm256_maddubs_epi16(s[3], coeffs[3]); | 961 | | | 962 | | // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | 963 | 13.4M | const __m256i res = _mm256_add_epi16(_mm256_add_epi16(res_01, res_45), | 964 | 13.4M | _mm256_add_epi16(res_23, res_67)); | 965 | | | 966 | 13.4M | return res; | 967 | 13.4M | } |
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_lowbd Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_lowbd |
968 | | |
969 | | static inline __m256i convolve_lowbd_6tap(const __m256i *const s, |
970 | 20.2M | const __m256i *const coeffs) { |
971 | 20.2M | const __m256i res_01 = _mm256_maddubs_epi16(s[0], coeffs[0]); |
972 | 20.2M | const __m256i res_23 = _mm256_maddubs_epi16(s[1], coeffs[1]); |
973 | 20.2M | const __m256i res_45 = _mm256_maddubs_epi16(s[2], coeffs[2]); |
974 | | |
975 | | // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 |
976 | 20.2M | const __m256i res = |
977 | 20.2M | _mm256_add_epi16(_mm256_add_epi16(res_01, res_45), res_23); |
978 | | |
979 | 20.2M | return res; |
980 | 20.2M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_lowbd_6tap Unexecuted instantiation: highbd_convolve_avx2.c:convolve_lowbd_6tap convolve_2d_avx2.c:convolve_lowbd_6tap Line | Count | Source | 970 | 15.2M | const __m256i *const coeffs) { | 971 | 15.2M | const __m256i res_01 = _mm256_maddubs_epi16(s[0], coeffs[0]); | 972 | 15.2M | const __m256i res_23 = _mm256_maddubs_epi16(s[1], coeffs[1]); | 973 | 15.2M | const __m256i res_45 = _mm256_maddubs_epi16(s[2], coeffs[2]); | 974 | | | 975 | | // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | 976 | 15.2M | const __m256i res = | 977 | 15.2M | _mm256_add_epi16(_mm256_add_epi16(res_01, res_45), res_23); | 978 | | | 979 | 15.2M | return res; | 980 | 15.2M | } |
convolve_avx2.c:convolve_lowbd_6tap Line | Count | Source | 970 | 4.98M | const __m256i *const coeffs) { | 971 | 4.98M | const __m256i res_01 = _mm256_maddubs_epi16(s[0], coeffs[0]); | 972 | 4.98M | const __m256i res_23 = _mm256_maddubs_epi16(s[1], coeffs[1]); | 973 | 4.98M | const __m256i res_45 = _mm256_maddubs_epi16(s[2], coeffs[2]); | 974 | | | 975 | | // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | 976 | 4.98M | const __m256i res = | 977 | 4.98M | _mm256_add_epi16(_mm256_add_epi16(res_01, res_45), res_23); | 978 | | | 979 | 4.98M | return res; | 980 | 4.98M | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_lowbd_6tap Unexecuted instantiation: wiener_convolve_avx2.c:convolve_lowbd_6tap Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_lowbd_6tap Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_lowbd_6tap |
981 | | |
982 | | static inline __m256i convolve_lowbd_4tap(const __m256i *const s, |
983 | 5.29M | const __m256i *const coeffs) { |
984 | 5.29M | const __m256i res_23 = _mm256_maddubs_epi16(s[0], coeffs[0]); |
985 | 5.29M | const __m256i res_45 = _mm256_maddubs_epi16(s[1], coeffs[1]); |
986 | | |
987 | | // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 |
988 | 5.29M | const __m256i res = _mm256_add_epi16(res_45, res_23); |
989 | | |
990 | 5.29M | return res; |
991 | 5.29M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_lowbd_4tap Unexecuted instantiation: highbd_convolve_avx2.c:convolve_lowbd_4tap convolve_2d_avx2.c:convolve_lowbd_4tap Line | Count | Source | 983 | 1.90M | const __m256i *const coeffs) { | 984 | 1.90M | const __m256i res_23 = _mm256_maddubs_epi16(s[0], coeffs[0]); | 985 | 1.90M | const __m256i res_45 = _mm256_maddubs_epi16(s[1], coeffs[1]); | 986 | | | 987 | | // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | 988 | 1.90M | const __m256i res = _mm256_add_epi16(res_45, res_23); | 989 | | | 990 | 1.90M | return res; | 991 | 1.90M | } |
convolve_avx2.c:convolve_lowbd_4tap Line | Count | Source | 983 | 812k | const __m256i *const coeffs) { | 984 | 812k | const __m256i res_23 = _mm256_maddubs_epi16(s[0], coeffs[0]); | 985 | 812k | const __m256i res_45 = _mm256_maddubs_epi16(s[1], coeffs[1]); | 986 | | | 987 | | // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | 988 | 812k | const __m256i res = _mm256_add_epi16(res_45, res_23); | 989 | | | 990 | 812k | return res; | 991 | 812k | } |
jnt_convolve_avx2.c:convolve_lowbd_4tap Line | Count | Source | 983 | 2.58M | const __m256i *const coeffs) { | 984 | 2.58M | const __m256i res_23 = _mm256_maddubs_epi16(s[0], coeffs[0]); | 985 | 2.58M | const __m256i res_45 = _mm256_maddubs_epi16(s[1], coeffs[1]); | 986 | | | 987 | | // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | 988 | 2.58M | const __m256i res = _mm256_add_epi16(res_45, res_23); | 989 | | | 990 | 2.58M | return res; | 991 | 2.58M | } |
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_lowbd_4tap Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_lowbd_4tap Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_lowbd_4tap |
992 | | |
993 | | static inline __m256i convolve_6tap(const __m256i *const s, |
994 | 22.9M | const __m256i *const coeffs) { |
995 | 22.9M | const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]); |
996 | 22.9M | const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]); |
997 | 22.9M | const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]); |
998 | | |
999 | 22.9M | const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1), res_2); |
1000 | | |
1001 | 22.9M | return res; |
1002 | 22.9M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_6tap Unexecuted instantiation: highbd_convolve_avx2.c:convolve_6tap convolve_2d_avx2.c:convolve_6tap Line | Count | Source | 994 | 22.9M | const __m256i *const coeffs) { | 995 | 22.9M | const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]); | 996 | 22.9M | const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]); | 997 | 22.9M | const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]); | 998 | | | 999 | 22.9M | const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1), res_2); | 1000 | | | 1001 | 22.9M | return res; | 1002 | 22.9M | } |
Unexecuted instantiation: convolve_avx2.c:convolve_6tap Unexecuted instantiation: jnt_convolve_avx2.c:convolve_6tap Unexecuted instantiation: wiener_convolve_avx2.c:convolve_6tap Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_6tap Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_6tap |
1003 | | |
1004 | | static inline __m256i convolve_12taps(const __m256i *const s, |
1005 | 0 | const __m256i *const coeffs) { |
1006 | 0 | const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]); |
1007 | 0 | const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]); |
1008 | 0 | const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]); |
1009 | 0 | const __m256i res_3 = _mm256_madd_epi16(s[3], coeffs[3]); |
1010 | 0 | const __m256i res_4 = _mm256_madd_epi16(s[4], coeffs[4]); |
1011 | 0 | const __m256i res_5 = _mm256_madd_epi16(s[5], coeffs[5]); |
1012 | |
|
1013 | 0 | const __m256i res1 = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1), |
1014 | 0 | _mm256_add_epi32(res_2, res_3)); |
1015 | 0 | const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_4, res_5), res1); |
1016 | |
|
1017 | 0 | return res; |
1018 | 0 | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_12taps Unexecuted instantiation: highbd_convolve_avx2.c:convolve_12taps Unexecuted instantiation: convolve_2d_avx2.c:convolve_12taps Unexecuted instantiation: convolve_avx2.c:convolve_12taps Unexecuted instantiation: jnt_convolve_avx2.c:convolve_12taps Unexecuted instantiation: wiener_convolve_avx2.c:convolve_12taps Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_12taps Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_12taps |
1019 | | |
1020 | | static inline __m256i convolve(const __m256i *const s, |
1021 | 223M | const __m256i *const coeffs) { |
1022 | 223M | const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]); |
1023 | 223M | const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]); |
1024 | 223M | const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]); |
1025 | 223M | const __m256i res_3 = _mm256_madd_epi16(s[3], coeffs[3]); |
1026 | | |
1027 | 223M | const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1), |
1028 | 223M | _mm256_add_epi32(res_2, res_3)); |
1029 | | |
1030 | 223M | return res; |
1031 | 223M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve highbd_convolve_avx2.c:convolve Line | Count | Source | 1021 | 28.7M | const __m256i *const coeffs) { | 1022 | 28.7M | const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]); | 1023 | 28.7M | const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]); | 1024 | 28.7M | const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]); | 1025 | 28.7M | const __m256i res_3 = _mm256_madd_epi16(s[3], coeffs[3]); | 1026 | | | 1027 | 28.7M | const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1), | 1028 | 28.7M | _mm256_add_epi32(res_2, res_3)); | 1029 | | | 1030 | 28.7M | return res; | 1031 | 28.7M | } |
convolve_2d_avx2.c:convolve Line | Count | Source | 1021 | 5.35M | const __m256i *const coeffs) { | 1022 | 5.35M | const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]); | 1023 | 5.35M | const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]); | 1024 | 5.35M | const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]); | 1025 | 5.35M | const __m256i res_3 = _mm256_madd_epi16(s[3], coeffs[3]); | 1026 | | | 1027 | 5.35M | const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1), | 1028 | 5.35M | _mm256_add_epi32(res_2, res_3)); | 1029 | | | 1030 | 5.35M | return res; | 1031 | 5.35M | } |
Unexecuted instantiation: convolve_avx2.c:convolve jnt_convolve_avx2.c:convolve Line | Count | Source | 1021 | 5.26M | const __m256i *const coeffs) { | 1022 | 5.26M | const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]); | 1023 | 5.26M | const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]); | 1024 | 5.26M | const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]); | 1025 | 5.26M | const __m256i res_3 = _mm256_madd_epi16(s[3], coeffs[3]); | 1026 | | | 1027 | 5.26M | const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1), | 1028 | 5.26M | _mm256_add_epi32(res_2, res_3)); | 1029 | | | 1030 | 5.26M | return res; | 1031 | 5.26M | } |
wiener_convolve_avx2.c:convolve Line | Count | Source | 1021 | 25.2M | const __m256i *const coeffs) { | 1022 | 25.2M | const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]); | 1023 | 25.2M | const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]); | 1024 | 25.2M | const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]); | 1025 | 25.2M | const __m256i res_3 = _mm256_madd_epi16(s[3], coeffs[3]); | 1026 | | | 1027 | 25.2M | const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1), | 1028 | 25.2M | _mm256_add_epi32(res_2, res_3)); | 1029 | | | 1030 | 25.2M | return res; | 1031 | 25.2M | } |
highbd_convolve_2d_avx2.c:convolve Line | Count | Source | 1021 | 104M | const __m256i *const coeffs) { | 1022 | 104M | const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]); | 1023 | 104M | const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]); | 1024 | 104M | const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]); | 1025 | 104M | const __m256i res_3 = _mm256_madd_epi16(s[3], coeffs[3]); | 1026 | | | 1027 | 104M | const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1), | 1028 | 104M | _mm256_add_epi32(res_2, res_3)); | 1029 | | | 1030 | 104M | return res; | 1031 | 104M | } |
highbd_jnt_convolve_avx2.c:convolve Line | Count | Source | 1021 | 54.8M | const __m256i *const coeffs) { | 1022 | 54.8M | const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]); | 1023 | 54.8M | const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]); | 1024 | 54.8M | const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]); | 1025 | 54.8M | const __m256i res_3 = _mm256_madd_epi16(s[3], coeffs[3]); | 1026 | | | 1027 | 54.8M | const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1), | 1028 | 54.8M | _mm256_add_epi32(res_2, res_3)); | 1029 | | | 1030 | 54.8M | return res; | 1031 | 54.8M | } |
|
1032 | | |
1033 | | static inline __m256i convolve_4tap(const __m256i *const s, |
1034 | 5.69M | const __m256i *const coeffs) { |
1035 | 5.69M | const __m256i res_1 = _mm256_madd_epi16(s[0], coeffs[0]); |
1036 | 5.69M | const __m256i res_2 = _mm256_madd_epi16(s[1], coeffs[1]); |
1037 | | |
1038 | 5.69M | const __m256i res = _mm256_add_epi32(res_1, res_2); |
1039 | 5.69M | return res; |
1040 | 5.69M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_4tap Unexecuted instantiation: highbd_convolve_avx2.c:convolve_4tap convolve_2d_avx2.c:convolve_4tap Line | Count | Source | 1034 | 5.46M | const __m256i *const coeffs) { | 1035 | 5.46M | const __m256i res_1 = _mm256_madd_epi16(s[0], coeffs[0]); | 1036 | 5.46M | const __m256i res_2 = _mm256_madd_epi16(s[1], coeffs[1]); | 1037 | | | 1038 | 5.46M | const __m256i res = _mm256_add_epi32(res_1, res_2); | 1039 | 5.46M | return res; | 1040 | 5.46M | } |
Unexecuted instantiation: convolve_avx2.c:convolve_4tap jnt_convolve_avx2.c:convolve_4tap Line | Count | Source | 1034 | 235k | const __m256i *const coeffs) { | 1035 | 235k | const __m256i res_1 = _mm256_madd_epi16(s[0], coeffs[0]); | 1036 | 235k | const __m256i res_2 = _mm256_madd_epi16(s[1], coeffs[1]); | 1037 | | | 1038 | 235k | const __m256i res = _mm256_add_epi32(res_1, res_2); | 1039 | 235k | return res; | 1040 | 235k | } |
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_4tap Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_4tap Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_4tap |
1041 | | |
1042 | | static inline __m128i convolve_lowbd_x_2tap_ssse3(const __m128i data, |
1043 | | const __m128i *const coeffs, |
1044 | 110k | const __m128i *const filt) { |
1045 | 110k | __m128i s; |
1046 | 110k | s = _mm_shuffle_epi8(data, filt[0]); |
1047 | | |
1048 | 110k | return _mm_maddubs_epi16(s, coeffs[0]); |
1049 | 110k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_lowbd_x_2tap_ssse3 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_lowbd_x_2tap_ssse3 convolve_2d_avx2.c:convolve_lowbd_x_2tap_ssse3 Line | Count | Source | 1044 | 110k | const __m128i *const filt) { | 1045 | 110k | __m128i s; | 1046 | 110k | s = _mm_shuffle_epi8(data, filt[0]); | 1047 | | | 1048 | 110k | return _mm_maddubs_epi16(s, coeffs[0]); | 1049 | 110k | } |
Unexecuted instantiation: convolve_avx2.c:convolve_lowbd_x_2tap_ssse3 Unexecuted instantiation: jnt_convolve_avx2.c:convolve_lowbd_x_2tap_ssse3 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_lowbd_x_2tap_ssse3 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_lowbd_x_2tap_ssse3 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_lowbd_x_2tap_ssse3 |
1050 | | |
1051 | | static inline __m128i convolve_lowbd_x_4tap_ssse3(const __m128i data, |
1052 | | const __m128i *const coeffs, |
1053 | 4.32M | const __m128i *const filt) { |
1054 | 4.32M | __m128i s[2]; |
1055 | | |
1056 | 4.32M | s[0] = _mm_shuffle_epi8(data, filt[0]); |
1057 | 4.32M | s[1] = _mm_shuffle_epi8(data, filt[1]); |
1058 | | |
1059 | 4.32M | return convolve_lowbd_4tap_ssse3(s, coeffs); |
1060 | 4.32M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_lowbd_x_4tap_ssse3 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_lowbd_x_4tap_ssse3 convolve_2d_avx2.c:convolve_lowbd_x_4tap_ssse3 Line | Count | Source | 1053 | 4.32M | const __m128i *const filt) { | 1054 | 4.32M | __m128i s[2]; | 1055 | | | 1056 | 4.32M | s[0] = _mm_shuffle_epi8(data, filt[0]); | 1057 | 4.32M | s[1] = _mm_shuffle_epi8(data, filt[1]); | 1058 | | | 1059 | 4.32M | return convolve_lowbd_4tap_ssse3(s, coeffs); | 1060 | 4.32M | } |
Unexecuted instantiation: convolve_avx2.c:convolve_lowbd_x_4tap_ssse3 Unexecuted instantiation: jnt_convolve_avx2.c:convolve_lowbd_x_4tap_ssse3 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_lowbd_x_4tap_ssse3 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_lowbd_x_4tap_ssse3 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_lowbd_x_4tap_ssse3 |
1061 | | |
1062 | | static inline __m256i convolve_lowbd_x(const __m256i data, |
1063 | | const __m256i *const coeffs, |
1064 | 22.1M | const __m256i *const filt) { |
1065 | 22.1M | __m256i s[4]; |
1066 | | |
1067 | 22.1M | s[0] = _mm256_shuffle_epi8(data, filt[0]); |
1068 | 22.1M | s[1] = _mm256_shuffle_epi8(data, filt[1]); |
1069 | 22.1M | s[2] = _mm256_shuffle_epi8(data, filt[2]); |
1070 | 22.1M | s[3] = _mm256_shuffle_epi8(data, filt[3]); |
1071 | | |
1072 | 22.1M | return convolve_lowbd(s, coeffs); |
1073 | 22.1M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_lowbd_x Unexecuted instantiation: highbd_convolve_avx2.c:convolve_lowbd_x convolve_2d_avx2.c:convolve_lowbd_x Line | Count | Source | 1064 | 3.59M | const __m256i *const filt) { | 1065 | 3.59M | __m256i s[4]; | 1066 | | | 1067 | 3.59M | s[0] = _mm256_shuffle_epi8(data, filt[0]); | 1068 | 3.59M | s[1] = _mm256_shuffle_epi8(data, filt[1]); | 1069 | 3.59M | s[2] = _mm256_shuffle_epi8(data, filt[2]); | 1070 | 3.59M | s[3] = _mm256_shuffle_epi8(data, filt[3]); | 1071 | | | 1072 | 3.59M | return convolve_lowbd(s, coeffs); | 1073 | 3.59M | } |
convolve_avx2.c:convolve_lowbd_x Line | Count | Source | 1064 | 440k | const __m256i *const filt) { | 1065 | 440k | __m256i s[4]; | 1066 | | | 1067 | 440k | s[0] = _mm256_shuffle_epi8(data, filt[0]); | 1068 | 440k | s[1] = _mm256_shuffle_epi8(data, filt[1]); | 1069 | 440k | s[2] = _mm256_shuffle_epi8(data, filt[2]); | 1070 | 440k | s[3] = _mm256_shuffle_epi8(data, filt[3]); | 1071 | | | 1072 | 440k | return convolve_lowbd(s, coeffs); | 1073 | 440k | } |
jnt_convolve_avx2.c:convolve_lowbd_x Line | Count | Source | 1064 | 4.63M | const __m256i *const filt) { | 1065 | 4.63M | __m256i s[4]; | 1066 | | | 1067 | 4.63M | s[0] = _mm256_shuffle_epi8(data, filt[0]); | 1068 | 4.63M | s[1] = _mm256_shuffle_epi8(data, filt[1]); | 1069 | 4.63M | s[2] = _mm256_shuffle_epi8(data, filt[2]); | 1070 | 4.63M | s[3] = _mm256_shuffle_epi8(data, filt[3]); | 1071 | | | 1072 | 4.63M | return convolve_lowbd(s, coeffs); | 1073 | 4.63M | } |
wiener_convolve_avx2.c:convolve_lowbd_x Line | Count | Source | 1064 | 13.4M | const __m256i *const filt) { | 1065 | 13.4M | __m256i s[4]; | 1066 | | | 1067 | 13.4M | s[0] = _mm256_shuffle_epi8(data, filt[0]); | 1068 | 13.4M | s[1] = _mm256_shuffle_epi8(data, filt[1]); | 1069 | 13.4M | s[2] = _mm256_shuffle_epi8(data, filt[2]); | 1070 | 13.4M | s[3] = _mm256_shuffle_epi8(data, filt[3]); | 1071 | | | 1072 | 13.4M | return convolve_lowbd(s, coeffs); | 1073 | 13.4M | } |
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_lowbd_x Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_lowbd_x |
1074 | | |
1075 | | static inline __m256i convolve_lowbd_x_6tap(const __m256i data, |
1076 | | const __m256i *const coeffs, |
1077 | 20.2M | const __m256i *const filt) { |
1078 | 20.2M | __m256i s[4]; |
1079 | | |
1080 | 20.2M | s[0] = _mm256_shuffle_epi8(data, filt[0]); |
1081 | 20.2M | s[1] = _mm256_shuffle_epi8(data, filt[1]); |
1082 | 20.2M | s[2] = _mm256_shuffle_epi8(data, filt[2]); |
1083 | | |
1084 | 20.2M | return convolve_lowbd_6tap(s, coeffs); |
1085 | 20.2M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_lowbd_x_6tap Unexecuted instantiation: highbd_convolve_avx2.c:convolve_lowbd_x_6tap convolve_2d_avx2.c:convolve_lowbd_x_6tap Line | Count | Source | 1077 | 15.2M | const __m256i *const filt) { | 1078 | 15.2M | __m256i s[4]; | 1079 | | | 1080 | 15.2M | s[0] = _mm256_shuffle_epi8(data, filt[0]); | 1081 | 15.2M | s[1] = _mm256_shuffle_epi8(data, filt[1]); | 1082 | 15.2M | s[2] = _mm256_shuffle_epi8(data, filt[2]); | 1083 | | | 1084 | 15.2M | return convolve_lowbd_6tap(s, coeffs); | 1085 | 15.2M | } |
convolve_avx2.c:convolve_lowbd_x_6tap Line | Count | Source | 1077 | 4.98M | const __m256i *const filt) { | 1078 | 4.98M | __m256i s[4]; | 1079 | | | 1080 | 4.98M | s[0] = _mm256_shuffle_epi8(data, filt[0]); | 1081 | 4.98M | s[1] = _mm256_shuffle_epi8(data, filt[1]); | 1082 | 4.98M | s[2] = _mm256_shuffle_epi8(data, filt[2]); | 1083 | | | 1084 | 4.98M | return convolve_lowbd_6tap(s, coeffs); | 1085 | 4.98M | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_lowbd_x_6tap Unexecuted instantiation: wiener_convolve_avx2.c:convolve_lowbd_x_6tap Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_lowbd_x_6tap Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_lowbd_x_6tap |
1086 | | |
1087 | | static inline __m256i convolve_lowbd_x_4tap(const __m256i data, |
1088 | | const __m256i *const coeffs, |
1089 | 4.90M | const __m256i *const filt) { |
1090 | 4.90M | __m256i s[2]; |
1091 | | |
1092 | 4.90M | s[0] = _mm256_shuffle_epi8(data, filt[0]); |
1093 | 4.90M | s[1] = _mm256_shuffle_epi8(data, filt[1]); |
1094 | | |
1095 | 4.90M | return convolve_lowbd_4tap(s, coeffs); |
1096 | 4.90M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_lowbd_x_4tap Unexecuted instantiation: highbd_convolve_avx2.c:convolve_lowbd_x_4tap convolve_2d_avx2.c:convolve_lowbd_x_4tap Line | Count | Source | 1089 | 1.90M | const __m256i *const filt) { | 1090 | 1.90M | __m256i s[2]; | 1091 | | | 1092 | 1.90M | s[0] = _mm256_shuffle_epi8(data, filt[0]); | 1093 | 1.90M | s[1] = _mm256_shuffle_epi8(data, filt[1]); | 1094 | | | 1095 | 1.90M | return convolve_lowbd_4tap(s, coeffs); | 1096 | 1.90M | } |
convolve_avx2.c:convolve_lowbd_x_4tap Line | Count | Source | 1089 | 812k | const __m256i *const filt) { | 1090 | 812k | __m256i s[2]; | 1091 | | | 1092 | 812k | s[0] = _mm256_shuffle_epi8(data, filt[0]); | 1093 | 812k | s[1] = _mm256_shuffle_epi8(data, filt[1]); | 1094 | | | 1095 | 812k | return convolve_lowbd_4tap(s, coeffs); | 1096 | 812k | } |
jnt_convolve_avx2.c:convolve_lowbd_x_4tap Line | Count | Source | 1089 | 2.18M | const __m256i *const filt) { | 1090 | 2.18M | __m256i s[2]; | 1091 | | | 1092 | 2.18M | s[0] = _mm256_shuffle_epi8(data, filt[0]); | 1093 | 2.18M | s[1] = _mm256_shuffle_epi8(data, filt[1]); | 1094 | | | 1095 | 2.18M | return convolve_lowbd_4tap(s, coeffs); | 1096 | 2.18M | } |
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_lowbd_x_4tap Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_lowbd_x_4tap Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_lowbd_x_4tap |
1097 | | |
1098 | | static inline __m256i convolve_lowbd_x_2tap(const __m256i data, |
1099 | | const __m256i *const coeffs, |
1100 | 766k | const __m256i *const filt) { |
1101 | 766k | __m256i s; |
1102 | 766k | s = _mm256_shuffle_epi8(data, filt[0]); |
1103 | | |
1104 | 766k | return _mm256_maddubs_epi16(s, coeffs[0]); |
1105 | 766k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_lowbd_x_2tap Unexecuted instantiation: highbd_convolve_avx2.c:convolve_lowbd_x_2tap convolve_2d_avx2.c:convolve_lowbd_x_2tap Line | Count | Source | 1100 | 766k | const __m256i *const filt) { | 1101 | 766k | __m256i s; | 1102 | 766k | s = _mm256_shuffle_epi8(data, filt[0]); | 1103 | | | 1104 | 766k | return _mm256_maddubs_epi16(s, coeffs[0]); | 1105 | 766k | } |
Unexecuted instantiation: convolve_avx2.c:convolve_lowbd_x_2tap Unexecuted instantiation: jnt_convolve_avx2.c:convolve_lowbd_x_2tap Unexecuted instantiation: wiener_convolve_avx2.c:convolve_lowbd_x_2tap Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_lowbd_x_2tap Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_lowbd_x_2tap |
1106 | | |
1107 | | static inline void add_store_aligned_256(CONV_BUF_TYPE *const dst, |
1108 | | const __m256i *const res, |
1109 | 0 | const int do_average) { |
1110 | 0 | __m256i d; |
1111 | 0 | if (do_average) { |
1112 | 0 | d = _mm256_load_si256((__m256i *)dst); |
1113 | 0 | d = _mm256_add_epi32(d, *res); |
1114 | 0 | d = _mm256_srai_epi32(d, 1); |
1115 | 0 | } else { |
1116 | 0 | d = *res; |
1117 | 0 | } |
1118 | 0 | _mm256_store_si256((__m256i *)dst, d); |
1119 | 0 | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:add_store_aligned_256 Unexecuted instantiation: highbd_convolve_avx2.c:add_store_aligned_256 Unexecuted instantiation: convolve_2d_avx2.c:add_store_aligned_256 Unexecuted instantiation: convolve_avx2.c:add_store_aligned_256 Unexecuted instantiation: jnt_convolve_avx2.c:add_store_aligned_256 Unexecuted instantiation: wiener_convolve_avx2.c:add_store_aligned_256 Unexecuted instantiation: highbd_convolve_2d_avx2.c:add_store_aligned_256 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:add_store_aligned_256 |
1120 | | |
1121 | | static inline __m256i comp_avg(const __m256i *const data_ref_0, |
1122 | | const __m256i *const res_unsigned, |
1123 | | const __m256i *const wt, |
1124 | 187M | const int use_dist_wtd_comp_avg) { |
1125 | 187M | __m256i res; |
1126 | 187M | if (use_dist_wtd_comp_avg) { |
1127 | 2.06M | const __m256i data_lo = _mm256_unpacklo_epi16(*data_ref_0, *res_unsigned); |
1128 | 2.06M | const __m256i data_hi = _mm256_unpackhi_epi16(*data_ref_0, *res_unsigned); |
1129 | | |
1130 | 2.06M | const __m256i wt_res_lo = _mm256_madd_epi16(data_lo, *wt); |
1131 | 2.06M | const __m256i wt_res_hi = _mm256_madd_epi16(data_hi, *wt); |
1132 | | |
1133 | 2.06M | const __m256i res_lo = _mm256_srai_epi32(wt_res_lo, DIST_PRECISION_BITS); |
1134 | 2.06M | const __m256i res_hi = _mm256_srai_epi32(wt_res_hi, DIST_PRECISION_BITS); |
1135 | | |
1136 | 2.06M | res = _mm256_packs_epi32(res_lo, res_hi); |
1137 | 185M | } else { |
1138 | 185M | const __m256i wt_res = _mm256_add_epi16(*data_ref_0, *res_unsigned); |
1139 | 185M | res = _mm256_srai_epi16(wt_res, 1); |
1140 | 185M | } |
1141 | 187M | return res; |
1142 | 187M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:comp_avg Unexecuted instantiation: highbd_convolve_avx2.c:comp_avg Unexecuted instantiation: convolve_2d_avx2.c:comp_avg Unexecuted instantiation: convolve_avx2.c:comp_avg jnt_convolve_avx2.c:comp_avg Line | Count | Source | 1124 | 187M | const int use_dist_wtd_comp_avg) { | 1125 | 187M | __m256i res; | 1126 | 187M | if (use_dist_wtd_comp_avg) { | 1127 | 2.06M | const __m256i data_lo = _mm256_unpacklo_epi16(*data_ref_0, *res_unsigned); | 1128 | 2.06M | const __m256i data_hi = _mm256_unpackhi_epi16(*data_ref_0, *res_unsigned); | 1129 | | | 1130 | 2.06M | const __m256i wt_res_lo = _mm256_madd_epi16(data_lo, *wt); | 1131 | 2.06M | const __m256i wt_res_hi = _mm256_madd_epi16(data_hi, *wt); | 1132 | | | 1133 | 2.06M | const __m256i res_lo = _mm256_srai_epi32(wt_res_lo, DIST_PRECISION_BITS); | 1134 | 2.06M | const __m256i res_hi = _mm256_srai_epi32(wt_res_hi, DIST_PRECISION_BITS); | 1135 | | | 1136 | 2.06M | res = _mm256_packs_epi32(res_lo, res_hi); | 1137 | 185M | } else { | 1138 | 185M | const __m256i wt_res = _mm256_add_epi16(*data_ref_0, *res_unsigned); | 1139 | 185M | res = _mm256_srai_epi16(wt_res, 1); | 1140 | 185M | } | 1141 | 187M | return res; | 1142 | 187M | } |
Unexecuted instantiation: wiener_convolve_avx2.c:comp_avg Unexecuted instantiation: highbd_convolve_2d_avx2.c:comp_avg Unexecuted instantiation: highbd_jnt_convolve_avx2.c:comp_avg |
1143 | | |
1144 | | static inline __m256i convolve_rounding(const __m256i *const res_unsigned, |
1145 | | const __m256i *const offset_const, |
1146 | | const __m256i *const round_const, |
1147 | 187M | const int round_shift) { |
1148 | 187M | const __m256i res_signed = _mm256_sub_epi16(*res_unsigned, *offset_const); |
1149 | 187M | const __m256i res_round = _mm256_srai_epi16( |
1150 | 187M | _mm256_add_epi16(res_signed, *round_const), round_shift); |
1151 | 187M | return res_round; |
1152 | 187M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_rounding Unexecuted instantiation: highbd_convolve_avx2.c:convolve_rounding Unexecuted instantiation: convolve_2d_avx2.c:convolve_rounding Unexecuted instantiation: convolve_avx2.c:convolve_rounding jnt_convolve_avx2.c:convolve_rounding Line | Count | Source | 1147 | 187M | const int round_shift) { | 1148 | 187M | const __m256i res_signed = _mm256_sub_epi16(*res_unsigned, *offset_const); | 1149 | 187M | const __m256i res_round = _mm256_srai_epi16( | 1150 | 187M | _mm256_add_epi16(res_signed, *round_const), round_shift); | 1151 | 187M | return res_round; | 1152 | 187M | } |
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_rounding Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_rounding Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_rounding |
1153 | | |
1154 | | static inline __m256i highbd_comp_avg(const __m256i *const data_ref_0, |
1155 | | const __m256i *const res_unsigned, |
1156 | | const __m256i *const wt0, |
1157 | | const __m256i *const wt1, |
1158 | 17.5M | const int use_dist_wtd_comp_avg) { |
1159 | 17.5M | __m256i res; |
1160 | 17.5M | if (use_dist_wtd_comp_avg) { |
1161 | 2.07M | const __m256i wt0_res = _mm256_mullo_epi32(*data_ref_0, *wt0); |
1162 | 2.07M | const __m256i wt1_res = _mm256_mullo_epi32(*res_unsigned, *wt1); |
1163 | 2.07M | const __m256i wt_res = _mm256_add_epi32(wt0_res, wt1_res); |
1164 | 2.07M | res = _mm256_srai_epi32(wt_res, DIST_PRECISION_BITS); |
1165 | 15.4M | } else { |
1166 | 15.4M | const __m256i wt_res = _mm256_add_epi32(*data_ref_0, *res_unsigned); |
1167 | 15.4M | res = _mm256_srai_epi32(wt_res, 1); |
1168 | 15.4M | } |
1169 | 17.5M | return res; |
1170 | 17.5M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:highbd_comp_avg Unexecuted instantiation: highbd_convolve_avx2.c:highbd_comp_avg Unexecuted instantiation: convolve_2d_avx2.c:highbd_comp_avg Unexecuted instantiation: convolve_avx2.c:highbd_comp_avg Unexecuted instantiation: jnt_convolve_avx2.c:highbd_comp_avg Unexecuted instantiation: wiener_convolve_avx2.c:highbd_comp_avg Unexecuted instantiation: highbd_convolve_2d_avx2.c:highbd_comp_avg highbd_jnt_convolve_avx2.c:highbd_comp_avg Line | Count | Source | 1158 | 17.5M | const int use_dist_wtd_comp_avg) { | 1159 | 17.5M | __m256i res; | 1160 | 17.5M | if (use_dist_wtd_comp_avg) { | 1161 | 2.07M | const __m256i wt0_res = _mm256_mullo_epi32(*data_ref_0, *wt0); | 1162 | 2.07M | const __m256i wt1_res = _mm256_mullo_epi32(*res_unsigned, *wt1); | 1163 | 2.07M | const __m256i wt_res = _mm256_add_epi32(wt0_res, wt1_res); | 1164 | 2.07M | res = _mm256_srai_epi32(wt_res, DIST_PRECISION_BITS); | 1165 | 15.4M | } else { | 1166 | 15.4M | const __m256i wt_res = _mm256_add_epi32(*data_ref_0, *res_unsigned); | 1167 | 15.4M | res = _mm256_srai_epi32(wt_res, 1); | 1168 | 15.4M | } | 1169 | 17.5M | return res; | 1170 | 17.5M | } |
|
1171 | | |
1172 | | static inline __m256i highbd_convolve_rounding( |
1173 | | const __m256i *const res_unsigned, const __m256i *const offset_const, |
1174 | 17.5M | const __m256i *const round_const, const int round_shift) { |
1175 | 17.5M | const __m256i res_signed = _mm256_sub_epi32(*res_unsigned, *offset_const); |
1176 | 17.5M | const __m256i res_round = _mm256_srai_epi32( |
1177 | 17.5M | _mm256_add_epi32(res_signed, *round_const), round_shift); |
1178 | | |
1179 | 17.5M | return res_round; |
1180 | 17.5M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:highbd_convolve_rounding Unexecuted instantiation: highbd_convolve_avx2.c:highbd_convolve_rounding Unexecuted instantiation: convolve_2d_avx2.c:highbd_convolve_rounding Unexecuted instantiation: convolve_avx2.c:highbd_convolve_rounding Unexecuted instantiation: jnt_convolve_avx2.c:highbd_convolve_rounding Unexecuted instantiation: wiener_convolve_avx2.c:highbd_convolve_rounding Unexecuted instantiation: highbd_convolve_2d_avx2.c:highbd_convolve_rounding highbd_jnt_convolve_avx2.c:highbd_convolve_rounding Line | Count | Source | 1174 | 17.5M | const __m256i *const round_const, const int round_shift) { | 1175 | 17.5M | const __m256i res_signed = _mm256_sub_epi32(*res_unsigned, *offset_const); | 1176 | 17.5M | const __m256i res_round = _mm256_srai_epi32( | 1177 | 17.5M | _mm256_add_epi32(res_signed, *round_const), round_shift); | 1178 | | | 1179 | 17.5M | return res_round; | 1180 | 17.5M | } |
|
1181 | | |
1182 | 6.51M | static inline __m256i round_sr_x_avx2(const __m256i data) { |
1183 | | // we can perform the below steps: |
1184 | | // data = (data + 2) >> 2 |
1185 | | // data = (data + 8) >> 4, |
1186 | | // in the below form as well |
1187 | | // data = (data + 0x22) >> 6 |
1188 | 6.51M | const __m256i value = _mm256_set1_epi16(34); |
1189 | 6.51M | const __m256i reg = _mm256_add_epi16(data, value); |
1190 | 6.51M | return _mm256_srai_epi16(reg, 6); |
1191 | 6.51M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:round_sr_x_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:round_sr_x_avx2 Unexecuted instantiation: convolve_2d_avx2.c:round_sr_x_avx2 convolve_avx2.c:round_sr_x_avx2 Line | Count | Source | 1182 | 6.51M | static inline __m256i round_sr_x_avx2(const __m256i data) { | 1183 | | // we can perform the below steps: | 1184 | | // data = (data + 2) >> 2 | 1185 | | // data = (data + 8) >> 4, | 1186 | | // in the below form as well | 1187 | | // data = (data + 0x22) >> 6 | 1188 | 6.51M | const __m256i value = _mm256_set1_epi16(34); | 1189 | 6.51M | const __m256i reg = _mm256_add_epi16(data, value); | 1190 | 6.51M | return _mm256_srai_epi16(reg, 6); | 1191 | 6.51M | } |
Unexecuted instantiation: jnt_convolve_avx2.c:round_sr_x_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:round_sr_x_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:round_sr_x_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:round_sr_x_avx2 |
1192 | | |
1193 | | static inline __m128i convolve_x_4tap_4x2_ssse3(const uint8_t *const src, |
1194 | | const ptrdiff_t src_stride, |
1195 | 776k | __m128i *const coeffs) { |
1196 | 776k | __m128i data[2]; |
1197 | 776k | const __m128i f_l0 = _mm_load_si128((__m128i const *)filt1_global_sse2); |
1198 | 776k | const __m128i f_l1 = _mm_load_si128((__m128i const *)filt2_global_sse2); |
1199 | 776k | const __m128i src_1 = |
1200 | 776k | load_8bit_8x2_to_1_reg_sse2(src, (int)(sizeof(*src) * src_stride)); |
1201 | | |
1202 | 776k | data[0] = _mm_shuffle_epi8(src_1, f_l0); |
1203 | 776k | data[1] = _mm_shuffle_epi8(src_1, f_l1); |
1204 | 776k | return convolve_lowbd_4tap_ssse3(data, coeffs); |
1205 | 776k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_x_4tap_4x2_ssse3 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_x_4tap_4x2_ssse3 Unexecuted instantiation: convolve_2d_avx2.c:convolve_x_4tap_4x2_ssse3 convolve_avx2.c:convolve_x_4tap_4x2_ssse3 Line | Count | Source | 1195 | 776k | __m128i *const coeffs) { | 1196 | 776k | __m128i data[2]; | 1197 | 776k | const __m128i f_l0 = _mm_load_si128((__m128i const *)filt1_global_sse2); | 1198 | 776k | const __m128i f_l1 = _mm_load_si128((__m128i const *)filt2_global_sse2); | 1199 | 776k | const __m128i src_1 = | 1200 | 776k | load_8bit_8x2_to_1_reg_sse2(src, (int)(sizeof(*src) * src_stride)); | 1201 | | | 1202 | 776k | data[0] = _mm_shuffle_epi8(src_1, f_l0); | 1203 | 776k | data[1] = _mm_shuffle_epi8(src_1, f_l1); | 1204 | 776k | return convolve_lowbd_4tap_ssse3(data, coeffs); | 1205 | 776k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_x_4tap_4x2_ssse3 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_x_4tap_4x2_ssse3 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_x_4tap_4x2_ssse3 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_x_4tap_4x2_ssse3 |
1206 | | |
1207 | 958k | static inline __m128i round_sr_x_ssse3(const __m128i data) { |
1208 | 958k | const __m128i val = _mm_set1_epi16(34); |
1209 | 958k | const __m128i reg = _mm_add_epi16(data, val); |
1210 | 958k | return _mm_srai_epi16(reg, 6); |
1211 | 958k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:round_sr_x_ssse3 Unexecuted instantiation: highbd_convolve_avx2.c:round_sr_x_ssse3 Unexecuted instantiation: convolve_2d_avx2.c:round_sr_x_ssse3 convolve_avx2.c:round_sr_x_ssse3 Line | Count | Source | 1207 | 958k | static inline __m128i round_sr_x_ssse3(const __m128i data) { | 1208 | 958k | const __m128i val = _mm_set1_epi16(34); | 1209 | 958k | const __m128i reg = _mm_add_epi16(data, val); | 1210 | 958k | return _mm_srai_epi16(reg, 6); | 1211 | 958k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:round_sr_x_ssse3 Unexecuted instantiation: wiener_convolve_avx2.c:round_sr_x_ssse3 Unexecuted instantiation: highbd_convolve_2d_avx2.c:round_sr_x_ssse3 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:round_sr_x_ssse3 |
1212 | | |
1213 | | static inline void store_x_u8_4x2_sse2(const __m128i reg, uint8_t *const dst, |
1214 | 776k | const ptrdiff_t dst_stride) { |
1215 | 776k | xx_storel_32(dst, reg); |
1216 | 776k | *(uint32_t *)(dst + dst_stride) = |
1217 | 776k | ((uint32_t)_mm_extract_epi16(reg, 3) << 16) | _mm_extract_epi16(reg, 2); |
1218 | 776k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:store_x_u8_4x2_sse2 Unexecuted instantiation: highbd_convolve_avx2.c:store_x_u8_4x2_sse2 Unexecuted instantiation: convolve_2d_avx2.c:store_x_u8_4x2_sse2 convolve_avx2.c:store_x_u8_4x2_sse2 Line | Count | Source | 1214 | 776k | const ptrdiff_t dst_stride) { | 1215 | 776k | xx_storel_32(dst, reg); | 1216 | 776k | *(uint32_t *)(dst + dst_stride) = | 1217 | 776k | ((uint32_t)_mm_extract_epi16(reg, 3) << 16) | _mm_extract_epi16(reg, 2); | 1218 | 776k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:store_x_u8_4x2_sse2 Unexecuted instantiation: wiener_convolve_avx2.c:store_x_u8_4x2_sse2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:store_x_u8_4x2_sse2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:store_x_u8_4x2_sse2 |
1219 | | |
1220 | | static inline void pack_store_x_4x2_sse2(const __m128i reg, uint8_t *const dst, |
1221 | 776k | const ptrdiff_t dst_stride) { |
1222 | 776k | const __m128i reg_pack = _mm_packus_epi16(reg, reg); |
1223 | 776k | store_x_u8_4x2_sse2(reg_pack, dst, dst_stride); |
1224 | 776k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:pack_store_x_4x2_sse2 Unexecuted instantiation: highbd_convolve_avx2.c:pack_store_x_4x2_sse2 Unexecuted instantiation: convolve_2d_avx2.c:pack_store_x_4x2_sse2 convolve_avx2.c:pack_store_x_4x2_sse2 Line | Count | Source | 1221 | 776k | const ptrdiff_t dst_stride) { | 1222 | 776k | const __m128i reg_pack = _mm_packus_epi16(reg, reg); | 1223 | 776k | store_x_u8_4x2_sse2(reg_pack, dst, dst_stride); | 1224 | 776k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:pack_store_x_4x2_sse2 Unexecuted instantiation: wiener_convolve_avx2.c:pack_store_x_4x2_sse2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:pack_store_x_4x2_sse2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:pack_store_x_4x2_sse2 |
1225 | | |
1226 | | static inline __m128i convolve_x_4tap_2x2_ssse3(const uint8_t *const src, |
1227 | | const ptrdiff_t src_stride, |
1228 | 138k | __m128i *const coeffs) { |
1229 | 138k | __m128i data[2]; |
1230 | 138k | const __m128i f_0 = _mm_load_si128((__m128i const *)filt3_global_sse2); |
1231 | 138k | const __m128i f_1 = _mm_load_si128((__m128i const *)filt4_global_sse2); |
1232 | 138k | const __m128i reg = |
1233 | 138k | load_8bit_8x2_to_1_reg_sse2(src, (int)(sizeof(*src) * src_stride)); |
1234 | | |
1235 | 138k | data[0] = _mm_shuffle_epi8(reg, f_0); |
1236 | 138k | data[1] = _mm_shuffle_epi8(reg, f_1); |
1237 | 138k | return convolve_lowbd_4tap_ssse3(data, coeffs); |
1238 | 138k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_x_4tap_2x2_ssse3 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_x_4tap_2x2_ssse3 Unexecuted instantiation: convolve_2d_avx2.c:convolve_x_4tap_2x2_ssse3 convolve_avx2.c:convolve_x_4tap_2x2_ssse3 Line | Count | Source | 1228 | 138k | __m128i *const coeffs) { | 1229 | 138k | __m128i data[2]; | 1230 | 138k | const __m128i f_0 = _mm_load_si128((__m128i const *)filt3_global_sse2); | 1231 | 138k | const __m128i f_1 = _mm_load_si128((__m128i const *)filt4_global_sse2); | 1232 | 138k | const __m128i reg = | 1233 | 138k | load_8bit_8x2_to_1_reg_sse2(src, (int)(sizeof(*src) * src_stride)); | 1234 | | | 1235 | 138k | data[0] = _mm_shuffle_epi8(reg, f_0); | 1236 | 138k | data[1] = _mm_shuffle_epi8(reg, f_1); | 1237 | 138k | return convolve_lowbd_4tap_ssse3(data, coeffs); | 1238 | 138k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_x_4tap_2x2_ssse3 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_x_4tap_2x2_ssse3 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_x_4tap_2x2_ssse3 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_x_4tap_2x2_ssse3 |
1239 | | |
1240 | | static inline void pack_store_x_2x2_sse2(const __m128i reg, uint8_t *const dst, |
1241 | 141k | const ptrdiff_t dst_stride) { |
1242 | 141k | const __m128i data = _mm_packus_epi16(reg, reg); |
1243 | 141k | *(int16_t *)dst = (int16_t)_mm_cvtsi128_si32(data); |
1244 | 141k | *(int16_t *)(dst + dst_stride) = (int16_t)_mm_extract_epi16(data, 1); |
1245 | 141k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:pack_store_x_2x2_sse2 Unexecuted instantiation: highbd_convolve_avx2.c:pack_store_x_2x2_sse2 Unexecuted instantiation: convolve_2d_avx2.c:pack_store_x_2x2_sse2 convolve_avx2.c:pack_store_x_2x2_sse2 Line | Count | Source | 1241 | 141k | const ptrdiff_t dst_stride) { | 1242 | 141k | const __m128i data = _mm_packus_epi16(reg, reg); | 1243 | 141k | *(int16_t *)dst = (int16_t)_mm_cvtsi128_si32(data); | 1244 | | *(int16_t *)(dst + dst_stride) = (int16_t)_mm_extract_epi16(data, 1); | 1245 | 141k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:pack_store_x_2x2_sse2 Unexecuted instantiation: wiener_convolve_avx2.c:pack_store_x_2x2_sse2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:pack_store_x_2x2_sse2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:pack_store_x_2x2_sse2 |
1246 | | |
1247 | | static inline __m128i convolve_x_2tap_ssse3(const __m128i *data, |
1248 | 44.3k | const __m128i *coeff) { |
1249 | 44.3k | return _mm_maddubs_epi16(data[0], coeff[0]); |
1250 | 44.3k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_x_2tap_ssse3 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_x_2tap_ssse3 Unexecuted instantiation: convolve_2d_avx2.c:convolve_x_2tap_ssse3 convolve_avx2.c:convolve_x_2tap_ssse3 Line | Count | Source | 1248 | 44.3k | const __m128i *coeff) { | 1249 | 44.3k | return _mm_maddubs_epi16(data[0], coeff[0]); | 1250 | 44.3k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_x_2tap_ssse3 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_x_2tap_ssse3 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_x_2tap_ssse3 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_x_2tap_ssse3 |
1251 | | |
1252 | | static inline __m128i load8_x_4x2_sse4(const void *const src, |
1253 | 11.9k | const ptrdiff_t offset) { |
1254 | 11.9k | const __m128i s = _mm_cvtsi32_si128(loadu_int32(src)); |
1255 | 11.9k | return _mm_insert_epi32(s, loadu_int32((uint8_t *)src + offset), 1); |
1256 | 11.9k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:load8_x_4x2_sse4 Unexecuted instantiation: highbd_convolve_avx2.c:load8_x_4x2_sse4 Unexecuted instantiation: convolve_2d_avx2.c:load8_x_4x2_sse4 convolve_avx2.c:load8_x_4x2_sse4 Line | Count | Source | 1253 | 11.9k | const ptrdiff_t offset) { | 1254 | 11.9k | const __m128i s = _mm_cvtsi32_si128(loadu_int32(src)); | 1255 | | return _mm_insert_epi32(s, loadu_int32((uint8_t *)src + offset), 1); | 1256 | 11.9k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:load8_x_4x2_sse4 Unexecuted instantiation: wiener_convolve_avx2.c:load8_x_4x2_sse4 Unexecuted instantiation: highbd_convolve_2d_avx2.c:load8_x_4x2_sse4 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:load8_x_4x2_sse4 |
1257 | | |
1258 | | static inline __m128i load_x_u8_4x2_sse4(const uint8_t *const src, |
1259 | 11.9k | const ptrdiff_t stride) { |
1260 | 11.9k | return load8_x_4x2_sse4(src, sizeof(*src) * stride); |
1261 | 11.9k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:load_x_u8_4x2_sse4 Unexecuted instantiation: highbd_convolve_avx2.c:load_x_u8_4x2_sse4 Unexecuted instantiation: convolve_2d_avx2.c:load_x_u8_4x2_sse4 convolve_avx2.c:load_x_u8_4x2_sse4 Line | Count | Source | 1259 | 11.9k | const ptrdiff_t stride) { | 1260 | 11.9k | return load8_x_4x2_sse4(src, sizeof(*src) * stride); | 1261 | 11.9k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:load_x_u8_4x2_sse4 Unexecuted instantiation: wiener_convolve_avx2.c:load_x_u8_4x2_sse4 Unexecuted instantiation: highbd_convolve_2d_avx2.c:load_x_u8_4x2_sse4 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:load_x_u8_4x2_sse4 |
1262 | | |
1263 | | static inline __m128i convolve_x_2tap_2x2_ssse3(const uint8_t *const src, |
1264 | | const ptrdiff_t stride, |
1265 | 3.28k | const __m128i *coeffs) { |
1266 | 3.28k | const __m128i flt = _mm_load_si128((__m128i const *)filt5_global_sse2); |
1267 | 3.28k | const __m128i reg = load_x_u8_4x2_sse4(src, stride); |
1268 | 3.28k | const __m128i data = _mm_shuffle_epi8(reg, flt); |
1269 | 3.28k | return convolve_x_2tap_ssse3(&data, coeffs); |
1270 | 3.28k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_x_2tap_2x2_ssse3 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_x_2tap_2x2_ssse3 Unexecuted instantiation: convolve_2d_avx2.c:convolve_x_2tap_2x2_ssse3 convolve_avx2.c:convolve_x_2tap_2x2_ssse3 Line | Count | Source | 1265 | 3.28k | const __m128i *coeffs) { | 1266 | 3.28k | const __m128i flt = _mm_load_si128((__m128i const *)filt5_global_sse2); | 1267 | 3.28k | const __m128i reg = load_x_u8_4x2_sse4(src, stride); | 1268 | 3.28k | const __m128i data = _mm_shuffle_epi8(reg, flt); | 1269 | 3.28k | return convolve_x_2tap_ssse3(&data, coeffs); | 1270 | 3.28k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_x_2tap_2x2_ssse3 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_x_2tap_2x2_ssse3 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_x_2tap_2x2_ssse3 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_x_2tap_2x2_ssse3 |
1271 | | |
1272 | | static inline __m128i convolve_x_2tap_4x2_ssse3(const uint8_t *const src, |
1273 | | const ptrdiff_t stride, |
1274 | 13.9k | const __m128i *coeffs) { |
1275 | 13.9k | const __m128i flt = _mm_load_si128((__m128i const *)filt1_global_sse2); |
1276 | 13.9k | const __m128i data = |
1277 | 13.9k | load_8bit_8x2_to_1_reg_sse2(src, (int)(sizeof(*src) * stride)); |
1278 | 13.9k | const __m128i res = _mm_shuffle_epi8(data, flt); |
1279 | 13.9k | return convolve_x_2tap_ssse3(&res, coeffs); |
1280 | 13.9k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_x_2tap_4x2_ssse3 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_x_2tap_4x2_ssse3 Unexecuted instantiation: convolve_2d_avx2.c:convolve_x_2tap_4x2_ssse3 convolve_avx2.c:convolve_x_2tap_4x2_ssse3 Line | Count | Source | 1274 | 13.9k | const __m128i *coeffs) { | 1275 | 13.9k | const __m128i flt = _mm_load_si128((__m128i const *)filt1_global_sse2); | 1276 | 13.9k | const __m128i data = | 1277 | 13.9k | load_8bit_8x2_to_1_reg_sse2(src, (int)(sizeof(*src) * stride)); | 1278 | 13.9k | const __m128i res = _mm_shuffle_epi8(data, flt); | 1279 | 13.9k | return convolve_x_2tap_ssse3(&res, coeffs); | 1280 | 13.9k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_x_2tap_4x2_ssse3 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_x_2tap_4x2_ssse3 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_x_2tap_4x2_ssse3 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_x_2tap_4x2_ssse3 |
1281 | | |
1282 | | static inline void convolve_x_2tap_8x2_ssse3(const uint8_t *const src, |
1283 | | const ptrdiff_t stride, |
1284 | | const __m128i *coeffs, |
1285 | 13.5k | __m128i *data) { |
1286 | 13.5k | __m128i res[2]; |
1287 | 13.5k | const __m128i reg_00 = _mm_loadu_si128((__m128i *)src); |
1288 | 13.5k | const __m128i reg_10 = _mm_loadu_si128((__m128i *)(src + stride)); |
1289 | 13.5k | const __m128i reg_01 = _mm_srli_si128(reg_00, 1); |
1290 | 13.5k | const __m128i reg_11 = _mm_srli_si128(reg_10, 1); |
1291 | 13.5k | res[0] = _mm_unpacklo_epi8(reg_00, reg_01); |
1292 | 13.5k | res[1] = _mm_unpacklo_epi8(reg_10, reg_11); |
1293 | | |
1294 | 13.5k | data[0] = convolve_x_2tap_ssse3(&res[0], coeffs); |
1295 | 13.5k | data[1] = convolve_x_2tap_ssse3(&res[1], coeffs); |
1296 | 13.5k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_x_2tap_8x2_ssse3 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_x_2tap_8x2_ssse3 Unexecuted instantiation: convolve_2d_avx2.c:convolve_x_2tap_8x2_ssse3 convolve_avx2.c:convolve_x_2tap_8x2_ssse3 Line | Count | Source | 1285 | 13.5k | __m128i *data) { | 1286 | 13.5k | __m128i res[2]; | 1287 | 13.5k | const __m128i reg_00 = _mm_loadu_si128((__m128i *)src); | 1288 | 13.5k | const __m128i reg_10 = _mm_loadu_si128((__m128i *)(src + stride)); | 1289 | 13.5k | const __m128i reg_01 = _mm_srli_si128(reg_00, 1); | 1290 | 13.5k | const __m128i reg_11 = _mm_srli_si128(reg_10, 1); | 1291 | 13.5k | res[0] = _mm_unpacklo_epi8(reg_00, reg_01); | 1292 | 13.5k | res[1] = _mm_unpacklo_epi8(reg_10, reg_11); | 1293 | | | 1294 | 13.5k | data[0] = convolve_x_2tap_ssse3(&res[0], coeffs); | 1295 | 13.5k | data[1] = convolve_x_2tap_ssse3(&res[1], coeffs); | 1296 | 13.5k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_x_2tap_8x2_ssse3 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_x_2tap_8x2_ssse3 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_x_2tap_8x2_ssse3 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_x_2tap_8x2_ssse3 |
1297 | | |
1298 | | static inline __m256i loadu_x_8bit_16x2_avx2(const void *const src, |
1299 | 1.38M | const ptrdiff_t offset) { |
1300 | 1.38M | const __m128i reg0 = _mm_loadu_si128((__m128i *)src); |
1301 | 1.38M | const __m128i reg1 = _mm_loadu_si128((__m128i *)((uint8_t *)src + offset)); |
1302 | 1.38M | return _mm256_setr_m128i(reg0, reg1); |
1303 | 1.38M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:loadu_x_8bit_16x2_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:loadu_x_8bit_16x2_avx2 Unexecuted instantiation: convolve_2d_avx2.c:loadu_x_8bit_16x2_avx2 convolve_avx2.c:loadu_x_8bit_16x2_avx2 Line | Count | Source | 1299 | 1.38M | const ptrdiff_t offset) { | 1300 | 1.38M | const __m128i reg0 = _mm_loadu_si128((__m128i *)src); | 1301 | 1.38M | const __m128i reg1 = _mm_loadu_si128((__m128i *)((uint8_t *)src + offset)); | 1302 | | return _mm256_setr_m128i(reg0, reg1); | 1303 | 1.38M | } |
Unexecuted instantiation: jnt_convolve_avx2.c:loadu_x_8bit_16x2_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:loadu_x_8bit_16x2_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:loadu_x_8bit_16x2_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:loadu_x_8bit_16x2_avx2 |
1304 | | |
1305 | | static inline __m256i convolve_x_2tap_avx2(const __m256i *data, |
1306 | 279k | const __m256i *coeffs) { |
1307 | 279k | return _mm256_maddubs_epi16(data[0], coeffs[0]); |
1308 | 279k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_x_2tap_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_x_2tap_avx2 Unexecuted instantiation: convolve_2d_avx2.c:convolve_x_2tap_avx2 convolve_avx2.c:convolve_x_2tap_avx2 Line | Count | Source | 1306 | 279k | const __m256i *coeffs) { | 1307 | 279k | return _mm256_maddubs_epi16(data[0], coeffs[0]); | 1308 | 279k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_x_2tap_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_x_2tap_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_x_2tap_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_x_2tap_avx2 |
1309 | | |
1310 | | static inline void convolve_x_2tap_16x2_avx2(const uint8_t *const src, |
1311 | | const ptrdiff_t stride, |
1312 | | const __m256i *coeffs, |
1313 | 10.4k | __m256i *data) { |
1314 | 10.4k | const __m256i reg0 = loadu_x_8bit_16x2_avx2(src, stride); |
1315 | 10.4k | const __m256i reg1 = loadu_x_8bit_16x2_avx2(src + 1, stride); |
1316 | 10.4k | const __m256i res0 = _mm256_unpacklo_epi8(reg0, reg1); |
1317 | 10.4k | const __m256i res1 = _mm256_unpackhi_epi8(reg0, reg1); |
1318 | 10.4k | data[0] = convolve_x_2tap_avx2(&res0, coeffs); |
1319 | 10.4k | data[1] = convolve_x_2tap_avx2(&res1, coeffs); |
1320 | 10.4k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_x_2tap_16x2_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_x_2tap_16x2_avx2 Unexecuted instantiation: convolve_2d_avx2.c:convolve_x_2tap_16x2_avx2 convolve_avx2.c:convolve_x_2tap_16x2_avx2 Line | Count | Source | 1313 | 10.4k | __m256i *data) { | 1314 | 10.4k | const __m256i reg0 = loadu_x_8bit_16x2_avx2(src, stride); | 1315 | 10.4k | const __m256i reg1 = loadu_x_8bit_16x2_avx2(src + 1, stride); | 1316 | 10.4k | const __m256i res0 = _mm256_unpacklo_epi8(reg0, reg1); | 1317 | 10.4k | const __m256i res1 = _mm256_unpackhi_epi8(reg0, reg1); | 1318 | 10.4k | data[0] = convolve_x_2tap_avx2(&res0, coeffs); | 1319 | 10.4k | data[1] = convolve_x_2tap_avx2(&res1, coeffs); | 1320 | 10.4k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_x_2tap_16x2_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_x_2tap_16x2_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_x_2tap_16x2_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_x_2tap_16x2_avx2 |
1321 | | |
1322 | | static inline void storeu_x_8bit_16x2_ssse3(const __m256i src, void *const dst, |
1323 | 692k | const ptrdiff_t offset) { |
1324 | 692k | const __m128i reg0 = _mm256_castsi256_si128(src); |
1325 | 692k | const __m128i reg1 = _mm256_extracti128_si256(src, 1); |
1326 | 692k | _mm_storeu_si128((__m128i *)dst, reg0); |
1327 | 692k | _mm_storeu_si128((__m128i *)((uint8_t *)dst + offset), reg1); |
1328 | 692k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:storeu_x_8bit_16x2_ssse3 Unexecuted instantiation: highbd_convolve_avx2.c:storeu_x_8bit_16x2_ssse3 Unexecuted instantiation: convolve_2d_avx2.c:storeu_x_8bit_16x2_ssse3 convolve_avx2.c:storeu_x_8bit_16x2_ssse3 Line | Count | Source | 1323 | 692k | const ptrdiff_t offset) { | 1324 | 692k | const __m128i reg0 = _mm256_castsi256_si128(src); | 1325 | | const __m128i reg1 = _mm256_extracti128_si256(src, 1); | 1326 | 692k | _mm_storeu_si128((__m128i *)dst, reg0); | 1327 | 692k | _mm_storeu_si128((__m128i *)((uint8_t *)dst + offset), reg1); | 1328 | 692k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:storeu_x_8bit_16x2_ssse3 Unexecuted instantiation: wiener_convolve_avx2.c:storeu_x_8bit_16x2_ssse3 Unexecuted instantiation: highbd_convolve_2d_avx2.c:storeu_x_8bit_16x2_ssse3 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:storeu_x_8bit_16x2_ssse3 |
1329 | | |
1330 | | static inline void storeu_x_u8_16x2_ssse3(const __m256i src, uint8_t *const dst, |
1331 | 692k | const ptrdiff_t stride) { |
1332 | 692k | storeu_x_8bit_16x2_ssse3(src, dst, sizeof(*dst) * stride); |
1333 | 692k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:storeu_x_u8_16x2_ssse3 Unexecuted instantiation: highbd_convolve_avx2.c:storeu_x_u8_16x2_ssse3 Unexecuted instantiation: convolve_2d_avx2.c:storeu_x_u8_16x2_ssse3 convolve_avx2.c:storeu_x_u8_16x2_ssse3 Line | Count | Source | 1331 | 692k | const ptrdiff_t stride) { | 1332 | 692k | storeu_x_8bit_16x2_ssse3(src, dst, sizeof(*dst) * stride); | 1333 | 692k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:storeu_x_u8_16x2_ssse3 Unexecuted instantiation: wiener_convolve_avx2.c:storeu_x_u8_16x2_ssse3 Unexecuted instantiation: highbd_convolve_2d_avx2.c:storeu_x_u8_16x2_ssse3 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:storeu_x_u8_16x2_ssse3 |
1334 | | |
1335 | | static inline void pack_store_x_16x2_avx2(const __m256i data0, |
1336 | | const __m256i data1, |
1337 | | uint8_t *const dst, |
1338 | 692k | const ptrdiff_t stride) { |
1339 | 692k | const __m256i res = _mm256_packus_epi16(data0, data1); |
1340 | 692k | storeu_x_u8_16x2_ssse3(res, dst, stride); |
1341 | 692k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:pack_store_x_16x2_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:pack_store_x_16x2_avx2 Unexecuted instantiation: convolve_2d_avx2.c:pack_store_x_16x2_avx2 convolve_avx2.c:pack_store_x_16x2_avx2 Line | Count | Source | 1338 | 692k | const ptrdiff_t stride) { | 1339 | 692k | const __m256i res = _mm256_packus_epi16(data0, data1); | 1340 | 692k | storeu_x_u8_16x2_ssse3(res, dst, stride); | 1341 | 692k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:pack_store_x_16x2_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:pack_store_x_16x2_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:pack_store_x_16x2_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:pack_store_x_16x2_avx2 |
1342 | | |
1343 | | static inline void round_pack_store_16x2_avx2(const __m256i *data, |
1344 | | uint8_t *const dst, |
1345 | 692k | const ptrdiff_t dst_stride) { |
1346 | 692k | __m256i reg[2]; |
1347 | | |
1348 | 692k | reg[0] = round_sr_x_avx2(data[0]); |
1349 | 692k | reg[1] = round_sr_x_avx2(data[1]); |
1350 | 692k | pack_store_x_16x2_avx2(reg[0], reg[1], dst, dst_stride); |
1351 | 692k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:round_pack_store_16x2_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:round_pack_store_16x2_avx2 Unexecuted instantiation: convolve_2d_avx2.c:round_pack_store_16x2_avx2 convolve_avx2.c:round_pack_store_16x2_avx2 Line | Count | Source | 1345 | 692k | const ptrdiff_t dst_stride) { | 1346 | 692k | __m256i reg[2]; | 1347 | | | 1348 | 692k | reg[0] = round_sr_x_avx2(data[0]); | 1349 | 692k | reg[1] = round_sr_x_avx2(data[1]); | 1350 | 692k | pack_store_x_16x2_avx2(reg[0], reg[1], dst, dst_stride); | 1351 | 692k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:round_pack_store_16x2_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:round_pack_store_16x2_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:round_pack_store_16x2_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:round_pack_store_16x2_avx2 |
1352 | | |
1353 | | static inline void convolve_x_2tap_32_avx2(const uint8_t *const src, |
1354 | | const __m256i *coeffs, |
1355 | 129k | __m256i *data) { |
1356 | 129k | const __m256i res0 = _mm256_loadu_si256((__m256i *)src); |
1357 | 129k | const __m256i res1 = _mm256_loadu_si256((__m256i *)(src + 1)); |
1358 | 129k | const __m256i reg0 = _mm256_unpacklo_epi8(res0, res1); |
1359 | 129k | const __m256i reg1 = _mm256_unpackhi_epi8(res0, res1); |
1360 | | |
1361 | 129k | data[0] = convolve_x_2tap_avx2(®0, coeffs); |
1362 | 129k | data[1] = convolve_x_2tap_avx2(®1, coeffs); |
1363 | 129k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_x_2tap_32_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_x_2tap_32_avx2 Unexecuted instantiation: convolve_2d_avx2.c:convolve_x_2tap_32_avx2 convolve_avx2.c:convolve_x_2tap_32_avx2 Line | Count | Source | 1355 | 129k | __m256i *data) { | 1356 | 129k | const __m256i res0 = _mm256_loadu_si256((__m256i *)src); | 1357 | 129k | const __m256i res1 = _mm256_loadu_si256((__m256i *)(src + 1)); | 1358 | 129k | const __m256i reg0 = _mm256_unpacklo_epi8(res0, res1); | 1359 | 129k | const __m256i reg1 = _mm256_unpackhi_epi8(res0, res1); | 1360 | | | 1361 | 129k | data[0] = convolve_x_2tap_avx2(®0, coeffs); | 1362 | 129k | data[1] = convolve_x_2tap_avx2(®1, coeffs); | 1363 | 129k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_x_2tap_32_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_x_2tap_32_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_x_2tap_32_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_x_2tap_32_avx2 |
1364 | | |
1365 | | static inline void pack_store_x_avx2(const __m256i data0, const __m256i data1, |
1366 | 1.74M | uint8_t *const dst) { |
1367 | 1.74M | const __m256i reg = _mm256_packus_epi16(data0, data1); |
1368 | 1.74M | _mm256_storeu_si256((__m256i *)dst, reg); |
1369 | 1.74M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:pack_store_x_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:pack_store_x_avx2 Unexecuted instantiation: convolve_2d_avx2.c:pack_store_x_avx2 convolve_avx2.c:pack_store_x_avx2 Line | Count | Source | 1366 | 1.74M | uint8_t *const dst) { | 1367 | 1.74M | const __m256i reg = _mm256_packus_epi16(data0, data1); | 1368 | 1.74M | _mm256_storeu_si256((__m256i *)dst, reg); | 1369 | 1.74M | } |
Unexecuted instantiation: jnt_convolve_avx2.c:pack_store_x_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:pack_store_x_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:pack_store_x_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:pack_store_x_avx2 |
1370 | | |
1371 | | static inline void round_pack_store_32_avx2(const __m256i *data, |
1372 | 1.74M | uint8_t *const dst) { |
1373 | 1.74M | __m256i reg[2]; |
1374 | | |
1375 | 1.74M | reg[0] = round_sr_x_avx2(data[0]); |
1376 | 1.74M | reg[1] = round_sr_x_avx2(data[1]); |
1377 | 1.74M | pack_store_x_avx2(reg[0], reg[1], dst); |
1378 | 1.74M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:round_pack_store_32_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:round_pack_store_32_avx2 Unexecuted instantiation: convolve_2d_avx2.c:round_pack_store_32_avx2 convolve_avx2.c:round_pack_store_32_avx2 Line | Count | Source | 1372 | 1.74M | uint8_t *const dst) { | 1373 | 1.74M | __m256i reg[2]; | 1374 | | | 1375 | 1.74M | reg[0] = round_sr_x_avx2(data[0]); | 1376 | 1.74M | reg[1] = round_sr_x_avx2(data[1]); | 1377 | 1.74M | pack_store_x_avx2(reg[0], reg[1], dst); | 1378 | 1.74M | } |
Unexecuted instantiation: jnt_convolve_avx2.c:round_pack_store_32_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:round_pack_store_32_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:round_pack_store_32_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:round_pack_store_32_avx2 |
1379 | | |
1380 | | static inline void convolve_round_2tap_32_avx2(const uint8_t *const src, |
1381 | | const __m256i *coeffs, |
1382 | 129k | uint8_t *const dst) { |
1383 | 129k | __m256i data[2]; |
1384 | | |
1385 | 129k | convolve_x_2tap_32_avx2(src, coeffs, data); |
1386 | 129k | round_pack_store_32_avx2(data, dst); |
1387 | 129k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_round_2tap_32_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_round_2tap_32_avx2 Unexecuted instantiation: convolve_2d_avx2.c:convolve_round_2tap_32_avx2 convolve_avx2.c:convolve_round_2tap_32_avx2 Line | Count | Source | 1382 | 129k | uint8_t *const dst) { | 1383 | 129k | __m256i data[2]; | 1384 | | | 1385 | 129k | convolve_x_2tap_32_avx2(src, coeffs, data); | 1386 | 129k | round_pack_store_32_avx2(data, dst); | 1387 | 129k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_round_2tap_32_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_round_2tap_32_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_round_2tap_32_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_round_2tap_32_avx2 |
1388 | | |
1389 | | static inline void load_avg_store_2tap_32_avx2(const uint8_t *const src, |
1390 | 102k | uint8_t *const dst) { |
1391 | 102k | const __m256i res0 = _mm256_loadu_si256((__m256i *)src); |
1392 | 102k | const __m256i res1 = _mm256_loadu_si256((__m256i *)(src + 1)); |
1393 | 102k | const __m256i data = _mm256_avg_epu8(res0, res1); |
1394 | 102k | _mm256_storeu_si256((__m256i *)dst, data); |
1395 | 102k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:load_avg_store_2tap_32_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:load_avg_store_2tap_32_avx2 Unexecuted instantiation: convolve_2d_avx2.c:load_avg_store_2tap_32_avx2 convolve_avx2.c:load_avg_store_2tap_32_avx2 Line | Count | Source | 1390 | 102k | uint8_t *const dst) { | 1391 | 102k | const __m256i res0 = _mm256_loadu_si256((__m256i *)src); | 1392 | 102k | const __m256i res1 = _mm256_loadu_si256((__m256i *)(src + 1)); | 1393 | 102k | const __m256i data = _mm256_avg_epu8(res0, res1); | 1394 | 102k | _mm256_storeu_si256((__m256i *)dst, data); | 1395 | 102k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:load_avg_store_2tap_32_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:load_avg_store_2tap_32_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:load_avg_store_2tap_32_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:load_avg_store_2tap_32_avx2 |
1396 | | |
1397 | | static inline __m256i load_convolve_8tap_8x2_avx2(const uint8_t *const src, |
1398 | | const ptrdiff_t stride, |
1399 | | const __m256i *coeffs, |
1400 | 71.2k | const __m256i *flt) { |
1401 | 71.2k | const __m256i res = loadu_x_8bit_16x2_avx2(src, stride); |
1402 | 71.2k | return convolve_lowbd_x(res, coeffs, flt); |
1403 | 71.2k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:load_convolve_8tap_8x2_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:load_convolve_8tap_8x2_avx2 Unexecuted instantiation: convolve_2d_avx2.c:load_convolve_8tap_8x2_avx2 convolve_avx2.c:load_convolve_8tap_8x2_avx2 Line | Count | Source | 1400 | 71.2k | const __m256i *flt) { | 1401 | 71.2k | const __m256i res = loadu_x_8bit_16x2_avx2(src, stride); | 1402 | 71.2k | return convolve_lowbd_x(res, coeffs, flt); | 1403 | 71.2k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:load_convolve_8tap_8x2_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:load_convolve_8tap_8x2_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:load_convolve_8tap_8x2_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:load_convolve_8tap_8x2_avx2 |
1404 | | |
1405 | | static inline void load_convolve_8tap_16x2_avx2(const uint8_t *const src, |
1406 | | const int32_t src_stride, |
1407 | | const __m256i *coeffs, |
1408 | | const __m256i *flt, |
1409 | 35.6k | __m256i *reg) { |
1410 | 35.6k | reg[0] = load_convolve_8tap_8x2_avx2(src + 0, src_stride, coeffs, flt); |
1411 | 35.6k | reg[1] = load_convolve_8tap_8x2_avx2(src + 8, src_stride, coeffs, flt); |
1412 | 35.6k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:load_convolve_8tap_16x2_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:load_convolve_8tap_16x2_avx2 Unexecuted instantiation: convolve_2d_avx2.c:load_convolve_8tap_16x2_avx2 convolve_avx2.c:load_convolve_8tap_16x2_avx2 Line | Count | Source | 1409 | 35.6k | __m256i *reg) { | 1410 | 35.6k | reg[0] = load_convolve_8tap_8x2_avx2(src + 0, src_stride, coeffs, flt); | 1411 | 35.6k | reg[1] = load_convolve_8tap_8x2_avx2(src + 8, src_stride, coeffs, flt); | 1412 | 35.6k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:load_convolve_8tap_16x2_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:load_convolve_8tap_16x2_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:load_convolve_8tap_16x2_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:load_convolve_8tap_16x2_avx2 |
1413 | | |
1414 | | static inline void load_convolve_8tap_32_avx2(const uint8_t *const src, |
1415 | | const __m256i *coeffs, |
1416 | | const __m256i *filt, |
1417 | 164k | __m256i *data) { |
1418 | 164k | const __m256i reg_0 = _mm256_loadu_si256((__m256i *)src); |
1419 | 164k | const __m256i reg_8 = _mm256_loadu_si256((__m256i *)(src + 8)); |
1420 | | |
1421 | 164k | data[0] = convolve_lowbd_x(reg_0, coeffs, filt); |
1422 | 164k | data[1] = convolve_lowbd_x(reg_8, coeffs, filt); |
1423 | 164k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:load_convolve_8tap_32_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:load_convolve_8tap_32_avx2 Unexecuted instantiation: convolve_2d_avx2.c:load_convolve_8tap_32_avx2 convolve_avx2.c:load_convolve_8tap_32_avx2 Line | Count | Source | 1417 | 164k | __m256i *data) { | 1418 | 164k | const __m256i reg_0 = _mm256_loadu_si256((__m256i *)src); | 1419 | 164k | const __m256i reg_8 = _mm256_loadu_si256((__m256i *)(src + 8)); | 1420 | | | 1421 | 164k | data[0] = convolve_lowbd_x(reg_0, coeffs, filt); | 1422 | 164k | data[1] = convolve_lowbd_x(reg_8, coeffs, filt); | 1423 | 164k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:load_convolve_8tap_32_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:load_convolve_8tap_32_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:load_convolve_8tap_32_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:load_convolve_8tap_32_avx2 |
1424 | | |
1425 | | static inline void load_convolve_round_8tap_32_avx2(const uint8_t *const src, |
1426 | | const __m256i *coeffs, |
1427 | | const __m256i *filt, |
1428 | 164k | uint8_t *const dst) { |
1429 | 164k | __m256i data[2]; |
1430 | | |
1431 | 164k | load_convolve_8tap_32_avx2(src, coeffs, filt, data); |
1432 | 164k | round_pack_store_32_avx2(data, dst); |
1433 | 164k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:load_convolve_round_8tap_32_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:load_convolve_round_8tap_32_avx2 Unexecuted instantiation: convolve_2d_avx2.c:load_convolve_round_8tap_32_avx2 convolve_avx2.c:load_convolve_round_8tap_32_avx2 Line | Count | Source | 1428 | 164k | uint8_t *const dst) { | 1429 | 164k | __m256i data[2]; | 1430 | | | 1431 | 164k | load_convolve_8tap_32_avx2(src, coeffs, filt, data); | 1432 | 164k | round_pack_store_32_avx2(data, dst); | 1433 | 164k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:load_convolve_round_8tap_32_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:load_convolve_round_8tap_32_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:load_convolve_round_8tap_32_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:load_convolve_round_8tap_32_avx2 |
1434 | | |
1435 | | static inline void load_convolve_6tap_32_avx2(const uint8_t *const src, |
1436 | | const __m256i *coeffs, |
1437 | | const __m256i *filt, |
1438 | 1.44M | __m256i *data) { |
1439 | 1.44M | const __m256i reg0 = _mm256_loadu_si256((__m256i *)src); |
1440 | 1.44M | const __m256i reg1 = _mm256_loadu_si256((__m256i *)(src + 8)); |
1441 | | |
1442 | 1.44M | data[0] = convolve_lowbd_x_6tap(reg0, coeffs, filt); |
1443 | 1.44M | data[1] = convolve_lowbd_x_6tap(reg1, coeffs, filt); |
1444 | 1.44M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:load_convolve_6tap_32_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:load_convolve_6tap_32_avx2 Unexecuted instantiation: convolve_2d_avx2.c:load_convolve_6tap_32_avx2 convolve_avx2.c:load_convolve_6tap_32_avx2 Line | Count | Source | 1438 | 1.44M | __m256i *data) { | 1439 | 1.44M | const __m256i reg0 = _mm256_loadu_si256((__m256i *)src); | 1440 | 1.44M | const __m256i reg1 = _mm256_loadu_si256((__m256i *)(src + 8)); | 1441 | | | 1442 | 1.44M | data[0] = convolve_lowbd_x_6tap(reg0, coeffs, filt); | 1443 | 1.44M | data[1] = convolve_lowbd_x_6tap(reg1, coeffs, filt); | 1444 | 1.44M | } |
Unexecuted instantiation: jnt_convolve_avx2.c:load_convolve_6tap_32_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:load_convolve_6tap_32_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:load_convolve_6tap_32_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:load_convolve_6tap_32_avx2 |
1445 | | |
1446 | | static inline void convolve_sr_store_6tap_32_avx2(const uint8_t *const src, |
1447 | | const __m256i *coeffs, |
1448 | | const __m256i *filt, |
1449 | 1.44M | uint8_t *const dst) { |
1450 | 1.44M | __m256i data[2]; |
1451 | | |
1452 | 1.44M | load_convolve_6tap_32_avx2(src, coeffs, filt, data); |
1453 | 1.44M | round_pack_store_32_avx2(data, dst); |
1454 | 1.44M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_sr_store_6tap_32_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_sr_store_6tap_32_avx2 Unexecuted instantiation: convolve_2d_avx2.c:convolve_sr_store_6tap_32_avx2 convolve_avx2.c:convolve_sr_store_6tap_32_avx2 Line | Count | Source | 1449 | 1.44M | uint8_t *const dst) { | 1450 | 1.44M | __m256i data[2]; | 1451 | | | 1452 | 1.44M | load_convolve_6tap_32_avx2(src, coeffs, filt, data); | 1453 | 1.44M | round_pack_store_32_avx2(data, dst); | 1454 | 1.44M | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_sr_store_6tap_32_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_sr_store_6tap_32_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_sr_store_6tap_32_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_sr_store_6tap_32_avx2 |
1455 | | |
1456 | | static inline __m256i load_convolve_6tap_8x2_avx2(const uint8_t *const src, |
1457 | | const ptrdiff_t stride, |
1458 | | const __m256i *coeffs, |
1459 | 1.29M | const __m256i *filt) { |
1460 | 1.29M | const __m256i data = loadu_x_8bit_16x2_avx2(src, stride); |
1461 | 1.29M | return convolve_lowbd_x_6tap(data, coeffs, filt); |
1462 | 1.29M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:load_convolve_6tap_8x2_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:load_convolve_6tap_8x2_avx2 Unexecuted instantiation: convolve_2d_avx2.c:load_convolve_6tap_8x2_avx2 convolve_avx2.c:load_convolve_6tap_8x2_avx2 Line | Count | Source | 1459 | 1.29M | const __m256i *filt) { | 1460 | 1.29M | const __m256i data = loadu_x_8bit_16x2_avx2(src, stride); | 1461 | 1.29M | return convolve_lowbd_x_6tap(data, coeffs, filt); | 1462 | 1.29M | } |
Unexecuted instantiation: jnt_convolve_avx2.c:load_convolve_6tap_8x2_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:load_convolve_6tap_8x2_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:load_convolve_6tap_8x2_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:load_convolve_6tap_8x2_avx2 |
1463 | | |
1464 | | static inline void load_convolve_6tap_16x2_avx2(const uint8_t *const src, |
1465 | | const int32_t src_stride, |
1466 | | const __m256i *coeffs, |
1467 | | const __m256i *filt, |
1468 | 645k | __m256i *data) { |
1469 | 645k | data[0] = load_convolve_6tap_8x2_avx2(src + 0, src_stride, coeffs, filt); |
1470 | 645k | data[1] = load_convolve_6tap_8x2_avx2(src + 8, src_stride, coeffs, filt); |
1471 | 645k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:load_convolve_6tap_16x2_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:load_convolve_6tap_16x2_avx2 Unexecuted instantiation: convolve_2d_avx2.c:load_convolve_6tap_16x2_avx2 convolve_avx2.c:load_convolve_6tap_16x2_avx2 Line | Count | Source | 1468 | 645k | __m256i *data) { | 1469 | 645k | data[0] = load_convolve_6tap_8x2_avx2(src + 0, src_stride, coeffs, filt); | 1470 | 645k | data[1] = load_convolve_6tap_8x2_avx2(src + 8, src_stride, coeffs, filt); | 1471 | 645k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:load_convolve_6tap_16x2_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:load_convolve_6tap_16x2_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:load_convolve_6tap_16x2_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:load_convolve_6tap_16x2_avx2 |
1472 | | |
1473 | | #endif // AOM_AOM_DSP_X86_CONVOLVE_AVX2_H_ |