/src/aom/aom_dsp/x86/convolve_avx2.h
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) 2018, Alliance for Open Media. All rights reserved. |
3 | | * |
4 | | * This source code is subject to the terms of the BSD 2 Clause License and |
5 | | * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
6 | | * was not distributed with this source code in the LICENSE file, you can |
7 | | * obtain it at www.aomedia.org/license/software. If the Alliance for Open |
8 | | * Media Patent License 1.0 was not distributed with this source code in the |
9 | | * PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
10 | | */ |
11 | | |
12 | | #ifndef AOM_AOM_DSP_X86_CONVOLVE_AVX2_H_ |
13 | | #define AOM_AOM_DSP_X86_CONVOLVE_AVX2_H_ |
14 | | |
15 | | #include <immintrin.h> |
16 | | |
17 | | #include "aom_ports/mem.h" |
18 | | |
19 | | #include "aom_dsp/x86/mem_sse2.h" |
20 | | #include "aom_dsp/x86/synonyms.h" |
21 | | |
22 | | #include "av1/common/convolve.h" |
23 | | #include "av1/common/filter.h" |
24 | | |
// Constants evaluating to 32, 64 and 96.
// NOTE(review): filt_global_avx2 below is laid out as four consecutive
// 32-byte masks, so these look like the byte offsets of its 2nd, 3rd and 4th
// mask - confirm at the use sites, which are not in this header section.
25 | 737k | #define SECOND_32_BLK (32) |
26 | 660k | #define THIRD_32_BLK (32 << 1) |
27 | 330k | #define FOURTH_32_BLK (SECOND_32_BLK + THIRD_32_BLK) |
28 | | |
29 | | // filters for 16 |
// 128-byte, 32-byte-aligned table holding four 32-byte index masks back to
// back. Each mask lists adjacent byte pairs (k, k+1) for eight consecutive k,
// and repeats the same 16 indices in its second half. The masks start at
// k = 0, 2, 4 and 6 respectively.
// NOTE(review): presumably used as byte-shuffle controls that gather the
// pixel pairs for successive filter taps - confirm at the use sites.
30 | | DECLARE_ALIGNED(32, static const uint8_t, filt_global_avx2[]) = { |
31 | | 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 0, 1, 1, |
32 | | 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 2, 3, 3, 4, 4, 5, |
33 | | 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 2, 3, 3, 4, 4, 5, 5, 6, 6, |
34 | | 7, 7, 8, 8, 9, 9, 10, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, |
35 | | 10, 11, 11, 12, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, |
36 | | 12, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 6, 7, |
37 | | 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 |
38 | | }; |
39 | | |
// 64-byte, 32-byte-aligned table holding two 32-byte index masks. Each mask
// lists four overlapping 4-byte windows (k, k+1, k+2, k+3) and repeats them in
// its second half. The first mask uses k = 0..3, the second k = 4..7.
// NOTE(review): presumably byte-shuffle controls for a 4-tap path - confirm
// at the use sites.
40 | | DECLARE_ALIGNED(32, static const uint8_t, filt_d4_global_avx2[]) = { |
41 | | 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 0, 1, 2, 3, 1, 2, |
42 | | 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, |
43 | | 7, 8, 9, 10, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, |
44 | | }; |
45 | | |
// Single 32-byte, 32-byte-aligned index mask: four overlapping 4-byte windows
// (k, k+1, k+2, k+3) for k = 2..5, with the same 16 indices repeated in the
// second half.
46 | | DECLARE_ALIGNED(32, static const uint8_t, filt4_d4_global_avx2[]) = { |
47 | | 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, |
48 | | 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, |
49 | | }; |
50 | | |
// 32-byte, 32-byte-aligned index mask made of (index, 255) pairs for indices
// 3..10, with the same 16 bytes repeated in the second half.
// NOTE(review): if this is used as a byte-shuffle control, an entry with the
// high bit set (255) yields a zero byte, so each selected source byte would be
// widened to a 16-bit value - confirm at the use sites.
51 | | DECLARE_ALIGNED(32, static const uint8_t, filt_center_global_avx2[32]) = { |
52 | | 3, 255, 4, 255, 5, 255, 6, 255, 7, 255, 8, 255, 9, 255, 10, 255, |
53 | | 3, 255, 4, 255, 5, 255, 6, 255, 7, 255, 8, 255, 9, 255, 10, 255 |
54 | | }; |
55 | | |
// Five 16-byte index masks for 128-bit registers, each declared with 32-byte
// alignment.
//  - filt1: adjacent pairs (k, k+1) for k = 0..3 in the low 8 bytes and for
//    k = 8..11 in the high 8 bytes.
//  - filt2: the same layout shifted by two (k = 2..5 and k = 10..13).
//  - filt3: pairs for k = 0, 1, 8, 9 in the low 8 bytes; high 8 bytes are 0.
//  - filt4: pairs for k = 2, 3, 10, 11 in the low 8 bytes; high 8 bytes are 0.
//  - filt5: pairs for k = 0, 1, 4, 5 in the low 8 bytes; high 8 bytes are 0.
// NOTE(review): the 0/8 split in filt1-filt4 suggests the source register
// holds two 8-byte rows side by side - confirm against the loaders used by
// the callers.
56 | | DECLARE_ALIGNED(32, static const uint8_t, |
57 | | filt1_global_sse2[16]) = { 0, 1, 1, 2, 2, 3, 3, 4, |
58 | | 8, 9, 9, 10, 10, 11, 11, 12 }; |
59 | | |
60 | | DECLARE_ALIGNED(32, static const uint8_t, |
61 | | filt2_global_sse2[16]) = { 2, 3, 3, 4, 4, 5, 5, 6, |
62 | | 10, 11, 11, 12, 12, 13, 13, 14 }; |
63 | | |
64 | | DECLARE_ALIGNED(32, static const uint8_t, |
65 | | filt3_global_sse2[16]) = { 0, 1, 1, 2, 8, 9, 9, 10, |
66 | | 0, 0, 0, 0, 0, 0, 0, 0 }; |
67 | | |
68 | | DECLARE_ALIGNED(32, static const uint8_t, |
69 | | filt4_global_sse2[16]) = { 2, 3, 3, 4, 10, 11, 11, 12, |
70 | | 0, 0, 0, 0, 0, 0, 0, 0 }; |
71 | | |
72 | | DECLARE_ALIGNED(32, static const uint8_t, |
73 | | filt5_global_sse2[16]) = { 0, 1, 1, 2, 4, 5, 5, 6, |
74 | | 0, 0, 0, 0, 0, 0, 0, 0 }; |
75 | | |
// Four standalone 32-byte, 32-byte-aligned index masks. Each lists adjacent
// byte pairs (k, k+1) for eight consecutive k and repeats the same 16 indices
// in its second half; filt1..filt4 start at k = 0, 2, 4 and 6. Their contents
// are identical to the four consecutive 32-byte masks stored in
// filt_global_avx2.
76 | | DECLARE_ALIGNED(32, static const uint8_t, |
77 | | filt1_global_avx2[32]) = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, |
78 | | 6, 6, 7, 7, 8, 0, 1, 1, 2, 2, 3, |
79 | | 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; |
80 | | |
81 | | DECLARE_ALIGNED(32, static const uint8_t, |
82 | | filt2_global_avx2[32]) = { 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, |
83 | | 8, 8, 9, 9, 10, 2, 3, 3, 4, 4, 5, |
84 | | 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 }; |
85 | | |
86 | | DECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = { |
87 | | 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, |
88 | | 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 |
89 | | }; |
90 | | |
91 | | DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = { |
92 | | 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, |
93 | | 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 |
94 | | }; |
95 | | |
// Horizontal pass for the narrow path, writing the intermediate buffer with a
// fixed stride of 4 int16 per row (im_block[i * 4]).
//
// CONVOLVE_LOWBD is the 128-bit filter routine to apply; it is called as
// CONVOLVE_LOWBD(data, coeffs_h, filt).
//
// Names that must already exist in the expanding scope: i, im_h, src_ptr,
// src_stride, coeffs_h, filt, round_const_h, im_block.
//
// The loop handles rows in pairs while i < im_h - 2: it loads two rows through
// load_8bit_8x2_to_1_reg_sse2, filters them, adds round_const_h, shifts right
// arithmetically by 2 and writes 16 bytes with an aligned store (so
// &im_block[i * 4] must be 16-byte aligned for even i).
// After the loop exactly one more row (row i) is processed: 8 source bytes are
// loaded and 8 result bytes are stored.
// NOTE(review): this pattern covers every row only when im_h is odd - confirm
// that callers guarantee it.
//
// The macro is not wrapped in do { } while (0) and declares data_1 and res in
// the enclosing scope, so it can be expanded only once per scope.
96 | | #define CONVOLVE_SR_HOR_FILTER_W4(CONVOLVE_LOWBD) \ |
97 | 2.89M | for (i = 0; i < (im_h - 2); i += 2) { \ |
98 | 2.34M | __m128i data = \ |
99 | 2.34M | load_8bit_8x2_to_1_reg_sse2(&src_ptr[(i * src_stride)], src_stride); \ |
100 | 2.34M | __m128i res = CONVOLVE_LOWBD(data, coeffs_h, filt); \ |
101 | 2.34M | res = _mm_srai_epi16(_mm_add_epi16(res, round_const_h), 2); \ |
102 | 2.34M | _mm_store_si128((__m128i *)&im_block[i * 4], res); \ |
103 | 2.34M | } \ |
104 | 546k | __m128i data_1 = _mm_loadl_epi64((__m128i *)&src_ptr[(i * src_stride)]); \ |
105 | 546k | __m128i res = CONVOLVE_LOWBD(data_1, coeffs_h, filt); \ |
106 | 546k | res = _mm_srai_epi16(_mm_add_epi16(res, round_const_h), 2); \ |
107 | 546k | _mm_storel_epi64((__m128i *)&im_block[i * 4], res); |
108 | | |
// Instantiations of CONVOLVE_SR_HOR_FILTER_W4 with the 2-tap and 4-tap
// 128-bit filter routines. They inherit that macro's requirements on the
// expanding scope.
109 | | #define CONVOLVE_SR_HOR_FILTER_2TAP_W4 \ |
110 | 19.1k | CONVOLVE_SR_HOR_FILTER_W4(convolve_lowbd_x_2tap_ssse3) |
111 | | |
112 | | #define CONVOLVE_SR_HOR_FILTER_4TAP_W4 \ |
113 | 527k | CONVOLVE_SR_HOR_FILTER_W4(convolve_lowbd_x_4tap_ssse3) |
114 | | |
// Rounds one vector of 32-bit vertical-filter sums and writes two narrow
// output rows.
//
//   w             output width; 4 is handled explicitly, anything else is
//                 asserted to be 2.
//   res           32-bit sums; the low 128-bit lane feeds the row at dst, the
//                 high lane feeds the row at dst + dst_stride.
//   dst           first output row.
//   dst_stride    distance in bytes between the two output rows.
//   round_const_v value added to every 32-bit lane before the shift.
//
// Each lane is computed as (res + round_const_v) >> 11 (arithmetic shift),
// narrowed to 16 bits with signed saturation and then to 8 bits with unsigned
// saturation. For w == 4 each row is written through xx_storel_32; for w == 2
// the low 16 bits of each lane are written through a uint16_t pointer.
//
// The text following the closing brace below is the coverage tool's
// per-source-file instantiation listing, not part of the function; it shows
// the only executed instantiation is the one in convolve_2d_avx2.c.
115 | | static inline void sr_2d_ver_round_and_store_w4(int w, __m256i res, |
116 | | uint8_t *dst, int dst_stride, |
117 | 1.62M | __m256i round_const_v) { |
118 | 1.62M | const __m256i res_round = |
119 | 1.62M | _mm256_srai_epi32(_mm256_add_epi32(res, round_const_v), 11); |
120 | | |
121 | 1.62M | const __m256i res_16bit = _mm256_packs_epi32(res_round, res_round); |
122 | 1.62M | const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit); |
123 | | |
124 | 1.62M | const __m128i r0 = _mm256_castsi256_si128(res_8b); |
125 | 1.62M | const __m128i r1 = _mm256_extracti128_si256(res_8b, 1); |
126 | | |
127 | 1.62M | __m128i *const p0 = (__m128i *)dst; |
128 | 1.62M | __m128i *const p1 = (__m128i *)(dst + dst_stride); |
129 | | |
130 | 1.62M | if (w == 4) { |
131 | 1.35M | xx_storel_32(p0, r0); |
132 | 1.35M | xx_storel_32(p1, r1); |
133 | 1.35M | } else { |
134 | 274k | assert(w == 2); |
135 | 274k | *(uint16_t *)p0 = (uint16_t)_mm_cvtsi128_si32(r0); |
136 | 274k | *(uint16_t *)p1 = (uint16_t)_mm_cvtsi128_si32(r1); |
137 | 274k | } |
138 | 1.62M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:sr_2d_ver_round_and_store_w4 Unexecuted instantiation: highbd_convolve_avx2.c:sr_2d_ver_round_and_store_w4 convolve_2d_avx2.c:sr_2d_ver_round_and_store_w4 Line | Count | Source | 117 | 1.62M | __m256i round_const_v) { | 118 | 1.62M | const __m256i res_round = | 119 | 1.62M | _mm256_srai_epi32(_mm256_add_epi32(res, round_const_v), 11); | 120 | | | 121 | 1.62M | const __m256i res_16bit = _mm256_packs_epi32(res_round, res_round); | 122 | 1.62M | const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit); | 123 | | | 124 | 1.62M | const __m128i r0 = _mm256_castsi256_si128(res_8b); | 125 | 1.62M | const __m128i r1 = _mm256_extracti128_si256(res_8b, 1); | 126 | | | 127 | 1.62M | __m128i *const p0 = (__m128i *)dst; | 128 | 1.62M | __m128i *const p1 = (__m128i *)(dst + dst_stride); | 129 | | | 130 | 1.62M | if (w == 4) { | 131 | 1.35M | xx_storel_32(p0, r0); | 132 | 1.35M | xx_storel_32(p1, r1); | 133 | 1.35M | } else { | 134 | 274k | assert(w == 2); | 135 | 274k | *(uint16_t *)p0 = (uint16_t)_mm_cvtsi128_si32(r0); | 136 | 274k | *(uint16_t *)p1 = (uint16_t)_mm_cvtsi128_si32(r1); | 137 | 274k | } | 138 | 1.62M | } |
Unexecuted instantiation: convolve_avx2.c:sr_2d_ver_round_and_store_w4 Unexecuted instantiation: jnt_convolve_avx2.c:sr_2d_ver_round_and_store_w4 Unexecuted instantiation: wiener_convolve_avx2.c:sr_2d_ver_round_and_store_w4 Unexecuted instantiation: highbd_convolve_2d_avx2.c:sr_2d_ver_round_and_store_w4 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:sr_2d_ver_round_and_store_w4 |
139 | | |
// Vertical 2-tap pass for the narrow path, reading intermediate rows of
// 4 int16 (8 bytes) stored at a fixed stride of 4.
//
// Names that must already exist in the expanding scope: i, h, w, im_block,
// coeffs_v, dst_ptr, dst_stride, round_const_v.
//
// Each iteration produces two output rows. The low 128-bit lane interleaves
// rows i and i+1, the high lane interleaves rows i+1 and i+2, and each
// interleaved pair is multiplied and summed against coeffs_v[0]. The result is
// rounded and written by sr_2d_ver_round_and_store_w4, after which dst_ptr is
// advanced by two rows. The row loaded last (i+2) is kept in s[0] and reused
// as the first row of the next iteration.
//
// Every iteration reads through row i + 2 of im_block. The macro declares
// s[2] in the enclosing scope and modifies dst_ptr.
140 | | #define CONVOLVE_SR_VER_FILTER_2TAP_W4 \ |
141 | 19.1k | __m128i s[2]; \ |
142 | 19.1k | s[0] = _mm_loadl_epi64((__m128i *)(im_block + 0 * 4)); \ |
143 | 19.1k | \ |
144 | 76.0k | for (i = 0; i < h; i += 2) { \ |
145 | 56.9k | const int16_t *data = &im_block[i * 4]; \ |
146 | 56.9k | s[1] = _mm_loadl_epi64((__m128i *)(data + 1 * 4)); \ |
147 | 56.9k | const __m256i src_0 = _mm256_setr_m128i(s[0], s[1]); \ |
148 | 56.9k | s[0] = _mm_loadl_epi64((__m128i *)(data + 2 * 4)); \ |
149 | 56.9k | const __m256i src_1 = _mm256_setr_m128i(s[1], s[0]); \ |
150 | 56.9k | const __m256i ss = _mm256_unpacklo_epi16(src_0, src_1); \ |
151 | 56.9k | \ |
152 | 56.9k | const __m256i res = _mm256_madd_epi16(ss, coeffs_v[0]); \ |
153 | 56.9k | \ |
154 | 56.9k | sr_2d_ver_round_and_store_w4(w, res, dst_ptr, dst_stride, round_const_v); \ |
155 | 56.9k | dst_ptr += 2 * dst_stride; \ |
156 | 56.9k | } |
157 | | |
// Vertical 4-tap pass for the narrow path, reading intermediate rows of
// 4 int16 (8 bytes) stored at a fixed stride of 4.
//
// Names that must already exist in the expanding scope: i, h, w, im_block,
// coeffs_v, dst_ptr, dst_stride, round_const_v.
//
// Before the loop, rows 0..2 are loaded and ss[0] is built from the row pairs
// (0,1) in the low lane and (1,2) in the high lane. Each iteration loads rows
// i+3 and i+4, builds ss[1] from the pairs (i+2,i+3) and (i+3,i+4), calls
// convolve_4tap(ss, coeffs_v), then rounds and stores two output rows through
// sr_2d_ver_round_and_store_w4 and advances dst_ptr by two rows. ss[1] is then
// moved to ss[0] so the next iteration reuses it.
//
// Every iteration reads through row i + 4 of im_block. The macro declares s,
// ss, src_0 and src_1 in the enclosing scope and modifies dst_ptr.
158 | | #define CONVOLVE_SR_VER_FILTER_4TAP_W4 \ |
159 | 346k | __m128i s[4]; \ |
160 | 346k | __m256i ss[2]; \ |
161 | 346k | s[0] = _mm_loadl_epi64((__m128i *)(im_block + 0 * 4)); \ |
162 | 346k | s[1] = _mm_loadl_epi64((__m128i *)(im_block + 1 * 4)); \ |
163 | 346k | s[2] = _mm_loadl_epi64((__m128i *)(im_block + 2 * 4)); \ |
164 | 346k | \ |
165 | 346k | const __m256i src_0 = _mm256_setr_m128i(s[0], s[1]); \ |
166 | 346k | const __m256i src_1 = _mm256_setr_m128i(s[1], s[2]); \ |
167 | 346k | \ |
168 | 346k | ss[0] = _mm256_unpacklo_epi16(src_0, src_1); \ |
169 | 346k | \ |
170 | 1.00M | for (i = 0; i < h; i += 2) { \ |
171 | 658k | const int16_t *data = &im_block[i * 4]; \ |
172 | 658k | s[3] = _mm_loadl_epi64((__m128i *)(data + 3 * 4)); \ |
173 | 658k | const __m256i src_2 = _mm256_setr_m128i(s[2], s[3]); \ |
174 | 658k | s[2] = _mm_loadl_epi64((__m128i *)(data + 4 * 4)); \ |
175 | 658k | const __m256i src_3 = _mm256_setr_m128i(s[3], s[2]); \ |
176 | 658k | ss[1] = _mm256_unpacklo_epi16(src_2, src_3); \ |
177 | 658k | \ |
178 | 658k | const __m256i res = convolve_4tap(ss, coeffs_v); \ |
179 | 658k | \ |
180 | 658k | sr_2d_ver_round_and_store_w4(w, res, dst_ptr, dst_stride, round_const_v); \ |
181 | 658k | dst_ptr += 2 * dst_stride; \ |
182 | 658k | \ |
183 | 658k | ss[0] = ss[1]; \ |
184 | 658k | } |
185 | | |
// Vertical 6-tap pass for the narrow path, reading intermediate rows of
// 4 int16 (8 bytes) stored at a fixed stride of 4.
//
// Names that must already exist in the expanding scope: i, h, w, im_block,
// coeffs_v, dst_ptr, dst_stride, round_const_v.
//
// Before the loop, rows 0..4 are loaded; ss[0] and ss[1] are built from the
// row pairs (0,1)/(1,2) and (2,3)/(3,4), written as low lane / high lane.
// Each iteration loads rows i+5 and i+6, builds ss[2] from (i+4,i+5) and
// (i+5,i+6), calls convolve_6tap(ss, coeffs_v), then rounds and stores two
// output rows through sr_2d_ver_round_and_store_w4 and advances dst_ptr by two
// rows. ss[] is then shifted down by one entry for the next iteration.
//
// Every iteration reads through row i + 6 of im_block. The macro declares s,
// ss and src_0..src_3 in the enclosing scope and modifies dst_ptr.
186 | | #define CONVOLVE_SR_VER_FILTER_6TAP_W4 \ |
187 | 168k | __m128i s[6]; \ |
188 | 168k | __m256i ss[3]; \ |
189 | 168k | s[0] = _mm_loadl_epi64((__m128i *)(im_block + 0 * 4)); \ |
190 | 168k | s[1] = _mm_loadl_epi64((__m128i *)(im_block + 1 * 4)); \ |
191 | 168k | s[2] = _mm_loadl_epi64((__m128i *)(im_block + 2 * 4)); \ |
192 | 168k | s[3] = _mm_loadl_epi64((__m128i *)(im_block + 3 * 4)); \ |
193 | 168k | s[4] = _mm_loadl_epi64((__m128i *)(im_block + 4 * 4)); \ |
194 | 168k | \ |
195 | 168k | const __m256i src_0 = _mm256_setr_m128i(s[0], s[1]); \ |
196 | 168k | const __m256i src_1 = _mm256_setr_m128i(s[1], s[2]); \ |
197 | 168k | const __m256i src_2 = _mm256_setr_m128i(s[2], s[3]); \ |
198 | 168k | const __m256i src_3 = _mm256_setr_m128i(s[3], s[4]); \ |
199 | 168k | \ |
200 | 168k | ss[0] = _mm256_unpacklo_epi16(src_0, src_1); \ |
201 | 168k | ss[1] = _mm256_unpacklo_epi16(src_2, src_3); \ |
202 | 168k | \ |
203 | 1.02M | for (i = 0; i < h; i += 2) { \ |
204 | 855k | const int16_t *data = &im_block[i * 4]; \ |
205 | 855k | s[5] = _mm_loadl_epi64((__m128i *)(data + 5 * 4)); \ |
206 | 855k | const __m256i src_4 = _mm256_setr_m128i(s[4], s[5]); \ |
207 | 855k | s[4] = _mm_loadl_epi64((__m128i *)(data + 6 * 4)); \ |
208 | 855k | const __m256i src_5 = _mm256_setr_m128i(s[5], s[4]); \ |
209 | 855k | ss[2] = _mm256_unpacklo_epi16(src_4, src_5); \ |
210 | 855k | \ |
211 | 855k | const __m256i res = convolve_6tap(ss, coeffs_v); \ |
212 | 855k | \ |
213 | 855k | sr_2d_ver_round_and_store_w4(w, res, dst_ptr, dst_stride, round_const_v); \ |
214 | 855k | dst_ptr += 2 * dst_stride; \ |
215 | 855k | \ |
216 | 855k | ss[0] = ss[1]; \ |
217 | 855k | ss[1] = ss[2]; \ |
218 | 855k | } |
219 | | |
// Vertical 8-tap pass for the narrow path, reading intermediate rows of
// 4 int16 (8 bytes) stored at a fixed stride of 4.
//
// Names that must already exist in the expanding scope: i, h, w, im_block,
// coeffs_v, dst_ptr, dst_stride, round_const_v.
//
// Before the loop, rows 0..6 are loaded; ss[0..2] are built from the row pairs
// (0,1)/(1,2), (2,3)/(3,4) and (4,5)/(5,6), written as low lane / high lane.
// Each iteration loads rows i+7 and i+8, builds ss[3] from (i+6,i+7) and
// (i+7,i+8), calls convolve(ss, coeffs_v), then rounds and stores two output
// rows through sr_2d_ver_round_and_store_w4 and advances dst_ptr by two rows.
// ss[] is then shifted down by one entry for the next iteration.
//
// Every iteration reads through row i + 8 of im_block. The macro declares s,
// ss and src_0..src_5 in the enclosing scope and modifies dst_ptr.
220 | | #define CONVOLVE_SR_VER_FILTER_8TAP_W4 \ |
221 | 11.7k | __m128i s[8]; \ |
222 | 11.7k | __m256i ss[4]; \ |
223 | 11.7k | s[0] = _mm_loadl_epi64((__m128i *)(im_block + 0 * 4)); \ |
224 | 11.7k | s[1] = _mm_loadl_epi64((__m128i *)(im_block + 1 * 4)); \ |
225 | 11.7k | s[2] = _mm_loadl_epi64((__m128i *)(im_block + 2 * 4)); \ |
226 | 11.7k | s[3] = _mm_loadl_epi64((__m128i *)(im_block + 3 * 4)); \ |
227 | 11.7k | s[4] = _mm_loadl_epi64((__m128i *)(im_block + 4 * 4)); \ |
228 | 11.7k | s[5] = _mm_loadl_epi64((__m128i *)(im_block + 5 * 4)); \ |
229 | 11.7k | s[6] = _mm_loadl_epi64((__m128i *)(im_block + 6 * 4)); \ |
230 | 11.7k | \ |
231 | 11.7k | const __m256i src_0 = _mm256_setr_m128i(s[0], s[1]); \ |
232 | 11.7k | const __m256i src_1 = _mm256_setr_m128i(s[1], s[2]); \ |
233 | 11.7k | const __m256i src_2 = _mm256_setr_m128i(s[2], s[3]); \ |
234 | 11.7k | const __m256i src_3 = _mm256_setr_m128i(s[3], s[4]); \ |
235 | 11.7k | const __m256i src_4 = _mm256_setr_m128i(s[4], s[5]); \ |
236 | 11.7k | const __m256i src_5 = _mm256_setr_m128i(s[5], s[6]); \ |
237 | 11.7k | \ |
238 | 11.7k | ss[0] = _mm256_unpacklo_epi16(src_0, src_1); \ |
239 | 11.7k | ss[1] = _mm256_unpacklo_epi16(src_2, src_3); \ |
240 | 11.7k | ss[2] = _mm256_unpacklo_epi16(src_4, src_5); \ |
241 | 11.7k | \ |
242 | 70.8k | for (i = 0; i < h; i += 2) { \ |
243 | 59.1k | const int16_t *data = &im_block[i * 4]; \ |
244 | 59.1k | s[7] = _mm_loadl_epi64((__m128i *)(data + 7 * 4)); \ |
245 | 59.1k | const __m256i src_6 = _mm256_setr_m128i(s[6], s[7]); \ |
246 | 59.1k | s[6] = _mm_loadl_epi64((__m128i *)(data + 8 * 4)); \ |
247 | 59.1k | const __m256i src_7 = _mm256_setr_m128i(s[7], s[6]); \ |
248 | 59.1k | ss[3] = _mm256_unpacklo_epi16(src_6, src_7); \ |
249 | 59.1k | \ |
250 | 59.1k | const __m256i res = convolve(ss, coeffs_v); \ |
251 | 59.1k | \ |
252 | 59.1k | sr_2d_ver_round_and_store_w4(w, res, dst_ptr, dst_stride, round_const_v); \ |
253 | 59.1k | dst_ptr += 2 * dst_stride; \ |
254 | 59.1k | \ |
255 | 59.1k | ss[0] = ss[1]; \ |
256 | 59.1k | ss[1] = ss[2]; \ |
257 | 59.1k | ss[2] = ss[3]; \ |
258 | 59.1k | } |
259 | | |
// Horizontal pass for the wide path: filters the column strip starting at
// column j and writes the intermediate buffer at a stride of im_stride.
//
// CONVOLVE_LOWBD is the 256-bit filter routine to apply; it is called as
// CONVOLVE_LOWBD(data, coeffs_h, filt).
//
// Names that must already exist in the expanding scope: i, j, im_h, src_ptr,
// src_stride, coeffs_h, filt, round_const_h, im_block, im_stride.
//
// The loop handles rows in pairs while i < im_h - 2: 16 unaligned bytes of
// row i go to the low 128-bit lane and 16 unaligned bytes of row i+1 go to the
// high lane. The filtered result gets round_const_h added, is shifted right
// arithmetically by 2, and all 32 bytes are written with an aligned store at
// &im_block[i * im_stride] (which must therefore be 32-byte aligned).
// After the loop exactly one more row (row i) is processed. Only the low lane
// is loaded (the cast leaves the high lane unspecified), yet a full 32-byte
// aligned store is still performed.
// NOTE(review): this pattern covers every row only when im_h is odd - confirm
// that callers guarantee it.
//
// The macro is not wrapped in do { } while (0) and declares data_1 and res in
// the enclosing scope, so it can be expanded only once per scope.
260 | | #define CONVOLVE_SR_HORIZONTAL_FILTER(CONVOLVE_LOWBD) \ |
261 | | for (i = 0; i < (im_h - 2); i += 2) { \ |
262 | | __m256i data = _mm256_castsi128_si256( \ |
263 | | _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j])); \ |
264 | | data = _mm256_inserti128_si256( \ |
265 | | data, \ |
266 | | _mm_loadu_si128( \ |
267 | | (__m128i *)&src_ptr[(i * src_stride) + j + src_stride]), \ |
268 | | 1); \ |
269 | | __m256i res = CONVOLVE_LOWBD(data, coeffs_h, filt); \ |
270 | | res = _mm256_srai_epi16(_mm256_add_epi16(res, round_const_h), 2); \ |
271 | | _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); \ |
272 | | } \ |
273 | | __m256i data_1 = _mm256_castsi128_si256( \ |
274 | | _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j])); \ |
275 | | __m256i res = CONVOLVE_LOWBD(data_1, coeffs_h, filt); \ |
276 | | res = _mm256_srai_epi16(_mm256_add_epi16(res, round_const_h), 2); \ |
277 | | _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); |
278 | | |
// Instantiations of CONVOLVE_SR_HORIZONTAL_FILTER with the 2-, 4-, 6- and
// 8-tap 256-bit filter routines (the 8-tap one is plain convolve_lowbd_x).
// They inherit that macro's requirements on the expanding scope.
279 | | #define CONVOLVE_SR_HORIZONTAL_FILTER_2TAP \ |
280 | | CONVOLVE_SR_HORIZONTAL_FILTER(convolve_lowbd_x_2tap) |
281 | | |
282 | | #define CONVOLVE_SR_HORIZONTAL_FILTER_4TAP \ |
283 | | CONVOLVE_SR_HORIZONTAL_FILTER(convolve_lowbd_x_4tap) |
284 | | |
285 | | #define CONVOLVE_SR_HORIZONTAL_FILTER_6TAP \ |
286 | | CONVOLVE_SR_HORIZONTAL_FILTER(convolve_lowbd_x_6tap) |
287 | | |
288 | | #define CONVOLVE_SR_HORIZONTAL_FILTER_8TAP \ |
289 | | CONVOLVE_SR_HORIZONTAL_FILTER(convolve_lowbd_x) |
290 | | |
// Rounds two vectors of 32-bit vertical-filter sums and writes two output
// rows of 8 bytes each.
//
//   res_a, res_b  32-bit sums; the low 128-bit lanes feed the row at dst, the
//                 high lanes feed the row at dst + dst_stride.
//   dst           first output row.
//   dst_stride    distance in bytes between the two output rows.
//   round_const_v value added to every 32-bit lane before the shift.
//
// Each lane is computed as (res + round_const_v) >> 11 (arithmetic shift).
// res_a and res_b are then packed together to 16 bits with signed saturation
// and to 8 bits with unsigned saturation, so each row consists of four bytes
// derived from res_a followed by four derived from res_b.
//
// The text following the closing brace below is the coverage tool's
// per-source-file instantiation listing, not part of the function; it shows
// the only executed instantiation is the one in convolve_2d_avx2.c.
291 | | static inline void sr_2d_ver_round_and_store(__m256i res_a, __m256i res_b, |
292 | | uint8_t *dst, int dst_stride, |
293 | 11.1M | __m256i round_const_v) { |
294 | 11.1M | const __m256i res_a_round = |
295 | 11.1M | _mm256_srai_epi32(_mm256_add_epi32(res_a, round_const_v), 11); |
296 | 11.1M | const __m256i res_b_round = |
297 | 11.1M | _mm256_srai_epi32(_mm256_add_epi32(res_b, round_const_v), 11); |
298 | 11.1M | const __m256i r16 = _mm256_packs_epi32(res_a_round, res_b_round); |
299 | 11.1M | const __m256i r8 = _mm256_packus_epi16(r16, r16); |
300 | | |
301 | 11.1M | _mm_storel_epi64((__m128i *)dst, _mm256_castsi256_si128(r8)); |
302 | 11.1M | _mm_storel_epi64((__m128i *)(dst + dst_stride), |
303 | 11.1M | _mm256_extracti128_si256(r8, 1)); |
304 | 11.1M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:sr_2d_ver_round_and_store Unexecuted instantiation: highbd_convolve_avx2.c:sr_2d_ver_round_and_store convolve_2d_avx2.c:sr_2d_ver_round_and_store Line | Count | Source | 293 | 11.1M | __m256i round_const_v) { | 294 | 11.1M | const __m256i res_a_round = | 295 | 11.1M | _mm256_srai_epi32(_mm256_add_epi32(res_a, round_const_v), 11); | 296 | 11.1M | const __m256i res_b_round = | 297 | 11.1M | _mm256_srai_epi32(_mm256_add_epi32(res_b, round_const_v), 11); | 298 | 11.1M | const __m256i r16 = _mm256_packs_epi32(res_a_round, res_b_round); | 299 | 11.1M | const __m256i r8 = _mm256_packus_epi16(r16, r16); | 300 | | | 301 | 11.1M | _mm_storel_epi64((__m128i *)dst, _mm256_castsi256_si128(r8)); | 302 | 11.1M | _mm_storel_epi64((__m128i *)(dst + dst_stride), | 303 | | _mm256_extracti128_si256(r8, 1)); | 304 | 11.1M | } |
Unexecuted instantiation: convolve_avx2.c:sr_2d_ver_round_and_store Unexecuted instantiation: jnt_convolve_avx2.c:sr_2d_ver_round_and_store Unexecuted instantiation: wiener_convolve_avx2.c:sr_2d_ver_round_and_store Unexecuted instantiation: highbd_convolve_2d_avx2.c:sr_2d_ver_round_and_store Unexecuted instantiation: highbd_jnt_convolve_avx2.c:sr_2d_ver_round_and_store |
305 | | |
// Vertical 2-tap pass for the wide path, reading the intermediate buffer at a
// stride of im_stride.
//
// Names that must already exist in the expanding scope: i, h, im_block,
// im_stride, coeffs_v, dst_ptr, dst_stride, round_const_v.
//
// Each iteration loads rows i and i+1 with unaligned 256-bit loads, and
// interleaves their 16-bit elements into a low half (s[0]) and a high half
// (s[1]). Each half is multiplied and summed against coeffs_v[0], and the
// result is rounded and written as two output rows by
// sr_2d_ver_round_and_store. dst_ptr is then advanced by two rows.
// All temporaries are declared inside the loop body; only dst_ptr and i are
// modified in the enclosing scope.
306 | | #define CONVOLVE_SR_VERTICAL_FILTER_2TAP \ |
307 | 435k | for (i = 0; i < h; i += 2) { \ |
308 | 402k | __m256i s[2]; \ |
309 | 402k | const int16_t *data = &im_block[i * im_stride]; \ |
310 | 402k | const __m256i s1 = _mm256_loadu_si256((__m256i *)(data + 0 * im_stride)); \ |
311 | 402k | const __m256i s2 = _mm256_loadu_si256((__m256i *)(data + 1 * im_stride)); \ |
312 | 402k | s[0] = _mm256_unpacklo_epi16(s1, s2); \ |
313 | 402k | s[1] = _mm256_unpackhi_epi16(s1, s2); \ |
314 | 402k | \ |
315 | 402k | __m256i res_a = _mm256_madd_epi16(s[0], coeffs_v[0]); \ |
316 | 402k | __m256i res_b = _mm256_madd_epi16(s[1], coeffs_v[0]); \ |
317 | 402k | \ |
318 | 402k | sr_2d_ver_round_and_store(res_a, res_b, dst_ptr, dst_stride, \ |
319 | 402k | round_const_v); \ |
320 | 402k | dst_ptr += 2 * dst_stride; \ |
321 | 402k | } |
322 | | |
// Vertical 4-tap pass for the wide path, reading the intermediate buffer at a
// stride of im_stride.
//
// Names that must already exist in the expanding scope: i, h, im_block,
// im_stride, coeffs_v, dst_ptr, dst_stride, round_const_v.
//
// s[0..1] hold the low-half interleaves and s[2..3] the high-half interleaves
// of consecutive row pairs. Rows 0 and 1 are interleaved before the loop. Each
// iteration loads rows i+2 and i+3, fills s[1] and s[3], calls
// convolve_4tap on s and on s + 2, then rounds and stores two output rows
// through sr_2d_ver_round_and_store and advances dst_ptr by two rows. The
// newest pair is then moved down into s[0] and s[2].
//
// Every iteration reads through row i + 3 of im_block. s is declared with six
// elements although this macro itself only writes s[0]..s[3]. The macro
// declares s, src_0 and src_1 in the enclosing scope and modifies dst_ptr.
323 | | #define CONVOLVE_SR_VERTICAL_FILTER_4TAP \ |
324 | 469k | __m256i s[6]; \ |
325 | 469k | __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); \ |
326 | 469k | __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); \ |
327 | 469k | \ |
328 | 469k | s[0] = _mm256_unpacklo_epi16(src_0, src_1); \ |
329 | 469k | s[2] = _mm256_unpackhi_epi16(src_0, src_1); \ |
330 | 469k | \ |
331 | 1.81M | for (i = 0; i < h; i += 2) { \ |
332 | 1.34M | const int16_t *data = &im_block[i * im_stride]; \ |
333 | 1.34M | const __m256i s4 = _mm256_loadu_si256((__m256i *)(data + 2 * im_stride)); \ |
334 | 1.34M | const __m256i s5 = _mm256_loadu_si256((__m256i *)(data + 3 * im_stride)); \ |
335 | 1.34M | s[1] = _mm256_unpacklo_epi16(s4, s5); \ |
336 | 1.34M | s[3] = _mm256_unpackhi_epi16(s4, s5); \ |
337 | 1.34M | \ |
338 | 1.34M | __m256i res_a = convolve_4tap(s, coeffs_v); \ |
339 | 1.34M | __m256i res_b = convolve_4tap(s + 2, coeffs_v); \ |
340 | 1.34M | \ |
341 | 1.34M | sr_2d_ver_round_and_store(res_a, res_b, dst_ptr, dst_stride, \ |
342 | 1.34M | round_const_v); \ |
343 | 1.34M | dst_ptr += 2 * dst_stride; \ |
344 | 1.34M | \ |
345 | 1.34M | s[0] = s[1]; \ |
346 | 1.34M | s[2] = s[3]; \ |
347 | 1.34M | } |
348 | | |
// Vertical 6-tap pass for the wide path, reading the intermediate buffer at a
// stride of im_stride.
//
// Names that must already exist in the expanding scope: i, h, im_block,
// im_stride, coeffs_v, dst_ptr, dst_stride, round_const_v.
//
// s[0..2] hold the low-half interleaves and s[3..5] the high-half interleaves
// of consecutive row pairs. Rows 0..3 are interleaved before the loop. Each
// iteration loads rows i+4 and i+5, fills s[2] and s[5], calls convolve_6tap
// on s and on s + 3, then rounds and stores two output rows through
// sr_2d_ver_round_and_store and advances dst_ptr by two rows. Both groups are
// then shifted down by one entry for the next iteration.
//
// Every iteration reads through row i + 5 of im_block. s is declared with
// eight elements although this macro itself only writes s[0]..s[5]. The macro
// declares s and src_0..src_3 in the enclosing scope and modifies dst_ptr.
349 | | #define CONVOLVE_SR_VERTICAL_FILTER_6TAP \ |
350 | 679k | __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); \ |
351 | 679k | __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); \ |
352 | 679k | __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); \ |
353 | 679k | __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); \ |
354 | 679k | \ |
355 | 679k | __m256i s[8]; \ |
356 | 679k | s[0] = _mm256_unpacklo_epi16(src_0, src_1); \ |
357 | 679k | s[1] = _mm256_unpacklo_epi16(src_2, src_3); \ |
358 | 679k | \ |
359 | 679k | s[3] = _mm256_unpackhi_epi16(src_0, src_1); \ |
360 | 679k | s[4] = _mm256_unpackhi_epi16(src_2, src_3); \ |
361 | 679k | \ |
362 | 8.33M | for (i = 0; i < h; i += 2) { \ |
363 | 7.65M | const int16_t *data = &im_block[i * im_stride]; \ |
364 | 7.65M | \ |
365 | 7.65M | const __m256i s6 = _mm256_loadu_si256((__m256i *)(data + 4 * im_stride)); \ |
366 | 7.65M | const __m256i s7 = _mm256_loadu_si256((__m256i *)(data + 5 * im_stride)); \ |
367 | 7.65M | \ |
368 | 7.65M | s[2] = _mm256_unpacklo_epi16(s6, s7); \ |
369 | 7.65M | s[5] = _mm256_unpackhi_epi16(s6, s7); \ |
370 | 7.65M | \ |
371 | 7.65M | __m256i res_a = convolve_6tap(s, coeffs_v); \ |
372 | 7.65M | __m256i res_b = convolve_6tap(s + 3, coeffs_v); \ |
373 | 7.65M | \ |
374 | 7.65M | sr_2d_ver_round_and_store(res_a, res_b, dst_ptr, dst_stride, \ |
375 | 7.65M | round_const_v); \ |
376 | 7.65M | dst_ptr += 2 * dst_stride; \ |
377 | 7.65M | \ |
378 | 7.65M | s[0] = s[1]; \ |
379 | 7.65M | s[1] = s[2]; \ |
380 | 7.65M | \ |
381 | 7.65M | s[3] = s[4]; \ |
382 | 7.65M | s[4] = s[5]; \ |
383 | 7.65M | } |
384 | | |
// Vertical 8-tap pass for the wide path, reading the intermediate buffer at a
// stride of im_stride.
//
// Names that must already exist in the expanding scope: i, h, im_block,
// im_stride, coeffs_v, dst_ptr, dst_stride, round_const_v.
//
// s[0..3] hold the low-half interleaves and s[4..7] the high-half interleaves
// of consecutive row pairs. Rows 0..5 are interleaved before the loop. Each
// iteration loads rows i+6 and i+7, fills s[3] and s[7], calls convolve on s
// and on s + 4, then rounds and stores two output rows through
// sr_2d_ver_round_and_store and advances dst_ptr by two rows. Both groups are
// then shifted down by one entry for the next iteration.
//
// Every iteration reads through row i + 7 of im_block. The macro declares s
// and src_0..src_5 in the enclosing scope and modifies dst_ptr.
385 | | #define CONVOLVE_SR_VERTICAL_FILTER_8TAP \ |
386 | 146k | __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); \ |
387 | 146k | __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); \ |
388 | 146k | __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); \ |
389 | 146k | __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); \ |
390 | 146k | __m256i src_4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride)); \ |
391 | 146k | __m256i src_5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride)); \ |
392 | 146k | \ |
393 | 146k | __m256i s[8]; \ |
394 | 146k | s[0] = _mm256_unpacklo_epi16(src_0, src_1); \ |
395 | 146k | s[1] = _mm256_unpacklo_epi16(src_2, src_3); \ |
396 | 146k | s[2] = _mm256_unpacklo_epi16(src_4, src_5); \ |
397 | 146k | \ |
398 | 146k | s[4] = _mm256_unpackhi_epi16(src_0, src_1); \ |
399 | 146k | s[5] = _mm256_unpackhi_epi16(src_2, src_3); \ |
400 | 146k | s[6] = _mm256_unpackhi_epi16(src_4, src_5); \ |
401 | 146k | \ |
402 | 1.92M | for (i = 0; i < h; i += 2) { \ |
403 | 1.77M | const int16_t *data = &im_block[i * im_stride]; \ |
404 | 1.77M | \ |
405 | 1.77M | const __m256i s6 = _mm256_loadu_si256((__m256i *)(data + 6 * im_stride)); \ |
406 | 1.77M | const __m256i s7 = _mm256_loadu_si256((__m256i *)(data + 7 * im_stride)); \ |
407 | 1.77M | \ |
408 | 1.77M | s[3] = _mm256_unpacklo_epi16(s6, s7); \ |
409 | 1.77M | s[7] = _mm256_unpackhi_epi16(s6, s7); \ |
410 | 1.77M | \ |
411 | 1.77M | __m256i res_a = convolve(s, coeffs_v); \ |
412 | 1.77M | __m256i res_b = convolve(s + 4, coeffs_v); \ |
413 | 1.77M | \ |
414 | 1.77M | sr_2d_ver_round_and_store(res_a, res_b, dst_ptr, dst_stride, \ |
415 | 1.77M | round_const_v); \ |
416 | 1.77M | dst_ptr += 2 * dst_stride; \ |
417 | 1.77M | \ |
418 | 1.77M | s[0] = s[1]; \ |
419 | 1.77M | s[1] = s[2]; \ |
420 | 1.77M | s[2] = s[3]; \ |
421 | 1.77M | \ |
422 | 1.77M | s[4] = s[5]; \ |
423 | 1.77M | s[5] = s[6]; \ |
424 | 1.77M | s[6] = s[7]; \ |
425 | 1.77M | } |
426 | | |
// Vertical 12-tap pass, reading the intermediate buffer at a stride of
// im_stride and writing directly into dst at column j.
//
// Names that must already exist in the expanding scope: i, j, h, w, im_block,
// im_stride, coeffs_v, sum_round_v, sum_shift_v, round_const_v, round_shift_v,
// dst, dst_stride, and an array s. Unlike the other vertical macros this one
// does not declare s itself; it writes s[0]..s[11], so the caller's array
// needs at least 12 __m256i elements.
//
// s[0..5] hold the low-half interleaves and s[6..11] the high-half interleaves
// of consecutive row pairs. Rows 0..9 are interleaved before the loop. Each
// iteration loads rows i+10 and i+11, fills s[5] and s[11], and calls
// convolve_12taps on s and on s + 6. Two rounding stages follow, both using
// variable shift counts: first (x + sum_round_v) >> sum_shift_v, then
// (x + round_const_v) >> round_shift_v. The result is packed to 8 bits with
// saturation and stored as two rows:
//   - 8 bytes per row when w - j > 4,
//   - 4 bytes per row when w == 4,
//   - 2 bytes per row otherwise.
// Both groups are then shifted down by one entry for the next iteration.
//
// Every iteration reads through row i + 11 of im_block. The hit counts of 0
// show this macro was never executed in the recorded coverage run.
427 | | #define CONVOLVE_SR_VERTICAL_FILTER_12TAP \ |
428 | 0 | __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); \ |
429 | 0 | __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); \ |
430 | 0 | __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); \ |
431 | 0 | __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); \ |
432 | 0 | __m256i src_4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride)); \ |
433 | 0 | __m256i src_5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride)); \ |
434 | 0 | __m256i src_6 = _mm256_loadu_si256((__m256i *)(im_block + 6 * im_stride)); \ |
435 | 0 | __m256i src_7 = _mm256_loadu_si256((__m256i *)(im_block + 7 * im_stride)); \ |
436 | 0 | __m256i src_8 = _mm256_loadu_si256((__m256i *)(im_block + 8 * im_stride)); \ |
437 | 0 | __m256i src_9 = _mm256_loadu_si256((__m256i *)(im_block + 9 * im_stride)); \ |
438 | 0 | \ |
439 | 0 | s[0] = _mm256_unpacklo_epi16(src_0, src_1); \ |
440 | 0 | s[1] = _mm256_unpacklo_epi16(src_2, src_3); \ |
441 | 0 | s[2] = _mm256_unpacklo_epi16(src_4, src_5); \ |
442 | 0 | s[3] = _mm256_unpacklo_epi16(src_6, src_7); \ |
443 | 0 | s[4] = _mm256_unpacklo_epi16(src_8, src_9); \ |
444 | 0 | \ |
445 | 0 | s[6] = _mm256_unpackhi_epi16(src_0, src_1); \ |
446 | 0 | s[7] = _mm256_unpackhi_epi16(src_2, src_3); \ |
447 | 0 | s[8] = _mm256_unpackhi_epi16(src_4, src_5); \ |
448 | 0 | s[9] = _mm256_unpackhi_epi16(src_6, src_7); \ |
449 | 0 | s[10] = _mm256_unpackhi_epi16(src_8, src_9); \ |
450 | 0 | \ |
451 | 0 | for (i = 0; i < h; i += 2) { \ |
452 | 0 | const int16_t *data = &im_block[i * im_stride]; \ |
453 | 0 | \ |
454 | 0 | const __m256i s6 = _mm256_loadu_si256((__m256i *)(data + 10 * im_stride)); \ |
455 | 0 | const __m256i s7 = _mm256_loadu_si256((__m256i *)(data + 11 * im_stride)); \ |
456 | 0 | \ |
457 | 0 | s[5] = _mm256_unpacklo_epi16(s6, s7); \ |
458 | 0 | s[11] = _mm256_unpackhi_epi16(s6, s7); \ |
459 | 0 | \ |
460 | 0 | __m256i res_a = convolve_12taps(s, coeffs_v); \ |
461 | 0 | __m256i res_b = convolve_12taps(s + 6, coeffs_v); \ |
462 | 0 | \ |
463 | 0 | res_a = \ |
464 | 0 | _mm256_sra_epi32(_mm256_add_epi32(res_a, sum_round_v), sum_shift_v); \ |
465 | 0 | res_b = \ |
466 | 0 | _mm256_sra_epi32(_mm256_add_epi32(res_b, sum_round_v), sum_shift_v); \ |
467 | 0 | \ |
468 | 0 | const __m256i res_a_round = _mm256_sra_epi32( \ |
469 | 0 | _mm256_add_epi32(res_a, round_const_v), round_shift_v); \ |
470 | 0 | const __m256i res_b_round = _mm256_sra_epi32( \ |
471 | 0 | _mm256_add_epi32(res_b, round_const_v), round_shift_v); \ |
472 | 0 | \ |
473 | 0 | const __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round); \ |
474 | 0 | const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit); \ |
475 | 0 | \ |
476 | 0 | const __m128i res_0 = _mm256_castsi256_si128(res_8b); \ |
477 | 0 | const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1); \ |
478 | 0 | \ |
479 | 0 | __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j]; \ |
480 | 0 | __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride]; \ |
481 | 0 | if (w - j > 4) { \ |
482 | 0 | _mm_storel_epi64(p_0, res_0); \ |
483 | 0 | _mm_storel_epi64(p_1, res_1); \ |
484 | 0 | } else if (w == 4) { \ |
485 | 0 | xx_storel_32(p_0, res_0); \ |
486 | 0 | xx_storel_32(p_1, res_1); \ |
487 | 0 | } else { \ |
488 | 0 | *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0); \ |
489 | 0 | *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1); \ |
490 | 0 | } \ |
491 | 0 | \ |
492 | 0 | s[0] = s[1]; \ |
493 | 0 | s[1] = s[2]; \ |
494 | 0 | s[2] = s[3]; \ |
495 | 0 | s[3] = s[4]; \ |
496 | 0 | s[4] = s[5]; \ |
497 | 0 | \ |
498 | 0 | s[6] = s[7]; \ |
499 | 0 | s[7] = s[8]; \ |
500 | 0 | s[8] = s[9]; \ |
501 | 0 | s[9] = s[10]; \ |
502 | 0 | s[10] = s[11]; \ |
503 | 0 | } |
504 | | |
// Horizontal 8-tap pass for the dist-weighted compound path, writing the
// intermediate buffer at a stride of im_stride. Wrapped in do { } while (0),
// so it behaves as a single statement.
//
// Names that must already exist in the expanding scope: i, im_h, src_h,
// src_stride, coeffs_x, filt, round_const_h, round_shift_h, im_block,
// im_stride.
//
// Each iteration loads 16 unaligned bytes at src_h into the low 128-bit lane
// and, only when row i+1 exists (i + 1 < im_h), 16 bytes of the next row into
// the high lane. The data is filtered with convolve_lowbd_x, then
// round_const_h is added and the sum is shifted right arithmetically by the
// variable count round_shift_h. All 32 bytes are written with an aligned store
// at &im_block[i * im_stride], so for a final odd row the high half of the
// store comes from an unspecified lane.
//
// Side effect: src_h is advanced by two source rows per iteration.
505 | | #define DIST_WTD_CONVOLVE_HORIZONTAL_FILTER_8TAP \ |
506 | 198k | do { \ |
507 | 3.25M | for (i = 0; i < im_h; i += 2) { \ |
508 | 3.06M | __m256i data = \ |
509 | 3.06M | _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)src_h)); \ |
510 | 3.06M | if (i + 1 < im_h) \ |
511 | 3.06M | data = _mm256_inserti128_si256( \ |
512 | 3.06M | data, _mm_loadu_si128((__m128i *)(src_h + src_stride)), 1); \ |
513 | 3.06M | src_h += (src_stride << 1); \ |
514 | 3.06M | __m256i res = convolve_lowbd_x(data, coeffs_x, filt); \ |
515 | 3.06M | \ |
516 | 3.06M | res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), \ |
517 | 3.06M | round_shift_h); \ |
518 | 3.06M | \ |
519 | 3.06M | _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); \ |
520 | 3.06M | } \ |
521 | 198k | } while (0) |
522 | | |
// Vertical 8-tap pass for the dist-weighted compound path. Wrapped in
// do { } while (0), so it behaves as a single statement and its temporaries
// stay local.
//
// Names that must already exist in the expanding scope: i, j, h, w, im_block,
// im_stride, coeffs_y, round_const_v, round_shift_v, offset_const, do_average,
// dst, dst_stride, dst0, dst_stride0, wt, use_dist_wtd_comp_avg,
// rounding_const, rounding_shift.
//
// s[0..3] hold the low-half interleaves and s[4..7] the high-half interleaves
// of consecutive intermediate row pairs. Rows 0..5 are interleaved before the
// loop, and each iteration loads rows i+6 and i+7 (so it reads through row
// i + 7 of im_block). Two output rows are produced per iteration:
//   1. convolve(s, coeffs_y) is rounded as (x + round_const_v) >> round_shift_v.
//   2. When w - j > 4 the high half (s + 4) is processed the same way and
//      packed next to the low half; otherwise the low half is packed with
//      itself.
//   3. offset_const is added to the packed 16-bit values.
//   4. If do_average is set, the two rows already in dst are loaded with
//      load_line2_avx2 and combined through comp_avg and convolve_rounding.
//      The result is packed to 8 bits and written to dst0: 8 bytes per row
//      in the wide case, 4 bytes per row (via an int store) in the narrow
//      case.
//   5. Otherwise the 16-bit values are written to dst with aligned 128-bit
//      stores, in both the wide and the narrow case, so those dst addresses
//      must be 16-byte aligned.
// Both groups in s are then shifted down by one entry for the next iteration.
523 | | #define DIST_WTD_CONVOLVE_VERTICAL_FILTER_8TAP \ |
524 | 250k | do { \ |
525 | 250k | __m256i s[8]; \ |
526 | 250k | __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); \ |
527 | 250k | __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); \ |
528 | 250k | __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); \ |
529 | 250k | __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); \ |
530 | 250k | __m256i s4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride)); \ |
531 | 250k | __m256i s5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride)); \ |
532 | 250k | \ |
533 | 250k | s[0] = _mm256_unpacklo_epi16(s0, s1); \ |
534 | 250k | s[1] = _mm256_unpacklo_epi16(s2, s3); \ |
535 | 250k | s[2] = _mm256_unpacklo_epi16(s4, s5); \ |
536 | 250k | \ |
537 | 250k | s[4] = _mm256_unpackhi_epi16(s0, s1); \ |
538 | 250k | s[5] = _mm256_unpackhi_epi16(s2, s3); \ |
539 | 250k | s[6] = _mm256_unpackhi_epi16(s4, s5); \ |
540 | 250k | \ |
541 | 2.72M | for (i = 0; i < h; i += 2) { \ |
542 | 2.47M | const int16_t *data = &im_block[i * im_stride]; \ |
543 | 2.47M | \ |
544 | 2.47M | const __m256i s6 = \ |
545 | 2.47M | _mm256_loadu_si256((__m256i *)(data + 6 * im_stride)); \ |
546 | 2.47M | const __m256i s7 = \ |
547 | 2.47M | _mm256_loadu_si256((__m256i *)(data + 7 * im_stride)); \ |
548 | 2.47M | \ |
549 | 2.47M | s[3] = _mm256_unpacklo_epi16(s6, s7); \ |
550 | 2.47M | s[7] = _mm256_unpackhi_epi16(s6, s7); \ |
551 | 2.47M | \ |
552 | 2.47M | const __m256i res_a = convolve(s, coeffs_y); \ |
553 | 2.47M | const __m256i res_a_round = _mm256_sra_epi32( \ |
554 | 2.47M | _mm256_add_epi32(res_a, round_const_v), round_shift_v); \ |
555 | 2.47M | \ |
556 | 2.47M | if (w - j > 4) { \ |
557 | 2.33M | const __m256i res_b = convolve(s + 4, coeffs_y); \ |
558 | 2.33M | const __m256i res_b_round = _mm256_sra_epi32( \ |
559 | 2.33M | _mm256_add_epi32(res_b, round_const_v), round_shift_v); \ |
560 | 2.33M | const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_b_round); \ |
561 | 2.33M | const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const); \ |
562 | 2.33M | \ |
563 | 2.33M | if (do_average) { \ |
564 | 960k | const __m256i data_ref_0 = \ |
565 | 960k | load_line2_avx2(&dst[i * dst_stride + j], \ |
566 | 960k | &dst[i * dst_stride + j + dst_stride]); \ |
567 | 960k | const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned, \ |
568 | 960k | &wt, use_dist_wtd_comp_avg); \ |
569 | 960k | \ |
570 | 960k | const __m256i round_result = convolve_rounding( \ |
571 | 960k | &comp_avg_res, &offset_const, &rounding_const, rounding_shift); \ |
572 | 960k | \ |
573 | 960k | const __m256i res_8 = \ |
574 | 960k | _mm256_packus_epi16(round_result, round_result); \ |
575 | 960k | const __m128i res_0 = _mm256_castsi256_si128(res_8); \ |
576 | 960k | const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); \ |
577 | 960k | \ |
578 | 960k | _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); \ |
579 | 960k | _mm_storel_epi64( \ |
580 | 960k | (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); \ |
581 | 1.37M | } else { \ |
582 | 1.37M | const __m128i res_0 = _mm256_castsi256_si128(res_unsigned); \ |
583 | 1.37M | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); \ |
584 | 1.37M | \ |
585 | 1.37M | const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1); \ |
586 | 1.37M | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), \ |
587 | 1.37M | res_1); \ |
588 | 1.37M | } \ |
589 | 2.33M | } else { \ |
590 | 141k | const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_a_round); \ |
591 | 141k | const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const); \ |
592 | 141k | \ |
593 | 141k | if (do_average) { \ |
594 | 61.9k | const __m256i data_ref_0 = \ |
595 | 61.9k | load_line2_avx2(&dst[i * dst_stride + j], \ |
596 | 61.9k | &dst[i * dst_stride + j + dst_stride]); \ |
597 | 61.9k | \ |
598 | 61.9k | const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned, \ |
599 | 61.9k | &wt, use_dist_wtd_comp_avg); \ |
600 | 61.9k | \ |
601 | 61.9k | const __m256i round_result = convolve_rounding( \ |
602 | 61.9k | &comp_avg_res, &offset_const, &rounding_const, rounding_shift); \ |
603 | 61.9k | \ |
604 | 61.9k | const __m256i res_8 = \ |
605 | 61.9k | _mm256_packus_epi16(round_result, round_result); \ |
606 | 61.9k | const __m128i res_0 = _mm256_castsi256_si128(res_8); \ |
607 | 61.9k | const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); \ |
608 | 61.9k | \ |
609 | 61.9k | *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0); \ |
610 | 61.9k | *(int *)(&dst0[i * dst_stride0 + j + dst_stride0]) = \ |
611 | 61.9k | _mm_cvtsi128_si32(res_1); \ |
612 | 61.9k | \ |
613 | 79.7k | } else { \ |
614 | 79.7k | const __m128i res_0 = _mm256_castsi256_si128(res_unsigned); \ |
615 | 79.7k | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); \ |
616 | 79.7k | \ |
617 | 79.7k | const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1); \ |
618 | 79.7k | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), \ |
619 | 79.7k | res_1); \ |
620 | 79.7k | } \ |
621 | 141k | } \ |
622 | 2.47M | \ |
623 | 2.47M | s[0] = s[1]; \ |
624 | 2.47M | s[1] = s[2]; \ |
625 | 2.47M | s[2] = s[3]; \ |
626 | 2.47M | \ |
627 | 2.47M | s[4] = s[5]; \ |
628 | 2.47M | s[5] = s[6]; \ |
629 | 2.47M | s[6] = s[7]; \ |
630 | 2.47M | } \ |
631 | 250k | } while (0) |
632 | | |
633 | | static inline void prepare_coeffs_2t_ssse3( |
634 | | const InterpFilterParams *const filter_params, const int32_t subpel_q4, |
635 | 37.0k | __m128i *const coeffs /* [4] */) { |
636 | 37.0k | const int16_t *const filter = av1_get_interp_filter_subpel_kernel( |
637 | 37.0k | filter_params, subpel_q4 & SUBPEL_MASK); |
638 | 37.0k | const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); |
639 | | |
640 | | // right shift all filter co-efficients by 1 to reduce the bits required. |
641 | | // This extra right shift will be taken care of at the end while rounding |
642 | | // the result. |
643 | | // Since all filter co-efficients are even, this change will not affect the |
644 | | // end result |
645 | 37.0k | assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), |
646 | 37.0k | _mm_set1_epi16((short)0xffff))); |
647 | | |
648 | 37.0k | const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1); |
649 | | |
650 | | // coeffs 3 4 3 4 3 4 3 4 |
651 | 37.0k | coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0806u)); |
652 | 37.0k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:prepare_coeffs_2t_ssse3 Unexecuted instantiation: highbd_convolve_avx2.c:prepare_coeffs_2t_ssse3 convolve_2d_avx2.c:prepare_coeffs_2t_ssse3 Line | Count | Source | 635 | 19.1k | __m128i *const coeffs /* [4] */) { | 636 | 19.1k | const int16_t *const filter = av1_get_interp_filter_subpel_kernel( | 637 | 19.1k | filter_params, subpel_q4 & SUBPEL_MASK); | 638 | 19.1k | const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); | 639 | | | 640 | | // right shift all filter co-efficients by 1 to reduce the bits required. | 641 | | // This extra right shift will be taken care of at the end while rounding | 642 | | // the result. | 643 | | // Since all filter co-efficients are even, this change will not affect the | 644 | | // end result | 645 | 19.1k | assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), | 646 | 19.1k | _mm_set1_epi16((short)0xffff))); | 647 | | | 648 | 19.1k | const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1); | 649 | | | 650 | | // coeffs 3 4 3 4 3 4 3 4 | 651 | 19.1k | coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0806u)); | 652 | 19.1k | } |
convolve_avx2.c:prepare_coeffs_2t_ssse3 Line | Count | Source | 635 | 17.8k | __m128i *const coeffs /* [4] */) { | 636 | 17.8k | const int16_t *const filter = av1_get_interp_filter_subpel_kernel( | 637 | 17.8k | filter_params, subpel_q4 & SUBPEL_MASK); | 638 | 17.8k | const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); | 639 | | | 640 | | // right shift all filter co-efficients by 1 to reduce the bits required. | 641 | | // This extra right shift will be taken care of at the end while rounding | 642 | | // the result. | 643 | | // Since all filter co-efficients are even, this change will not affect the | 644 | | // end result | 645 | 17.8k | assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), | 646 | 17.8k | _mm_set1_epi16((short)0xffff))); | 647 | | | 648 | 17.8k | const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1); | 649 | | | 650 | | // coeffs 3 4 3 4 3 4 3 4 | 651 | 17.8k | coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0806u)); | 652 | 17.8k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:prepare_coeffs_2t_ssse3 Unexecuted instantiation: wiener_convolve_avx2.c:prepare_coeffs_2t_ssse3 Unexecuted instantiation: highbd_convolve_2d_avx2.c:prepare_coeffs_2t_ssse3 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:prepare_coeffs_2t_ssse3 |
653 | | |
654 | | static inline void prepare_coeffs_4t_ssse3( |
655 | | const InterpFilterParams *const filter_params, const int32_t subpel_q4, |
656 | 787k | __m128i *const coeffs /* [4] */) { |
657 | 787k | const int16_t *const filter = av1_get_interp_filter_subpel_kernel( |
658 | 787k | filter_params, subpel_q4 & SUBPEL_MASK); |
659 | 787k | const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); |
660 | | |
661 | | // right shift all filter co-efficients by 1 to reduce the bits required. |
662 | | // This extra right shift will be taken care of at the end while rounding |
663 | | // the result. |
664 | | // Since all filter co-efficients are even, this change will not affect the |
665 | | // end result |
666 | 787k | assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), |
667 | 787k | _mm_set1_epi16((short)0xffff))); |
668 | | |
669 | 787k | const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1); |
670 | | |
671 | | // coeffs 2 3 2 3 2 3 2 3 |
672 | 787k | coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0604u)); |
673 | | // coeffs 4 5 4 5 4 5 4 5 |
674 | 787k | coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0a08u)); |
675 | 787k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:prepare_coeffs_4t_ssse3 Unexecuted instantiation: highbd_convolve_avx2.c:prepare_coeffs_4t_ssse3 convolve_2d_avx2.c:prepare_coeffs_4t_ssse3 Line | Count | Source | 656 | 527k | __m128i *const coeffs /* [4] */) { | 657 | 527k | const int16_t *const filter = av1_get_interp_filter_subpel_kernel( | 658 | 527k | filter_params, subpel_q4 & SUBPEL_MASK); | 659 | 527k | const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); | 660 | | | 661 | | // right shift all filter co-efficients by 1 to reduce the bits required. | 662 | | // This extra right shift will be taken care of at the end while rounding | 663 | | // the result. | 664 | | // Since all filter co-efficients are even, this change will not affect the | 665 | | // end result | 666 | 527k | assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), | 667 | 527k | _mm_set1_epi16((short)0xffff))); | 668 | | | 669 | 527k | const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1); | 670 | | | 671 | | // coeffs 2 3 2 3 2 3 2 3 | 672 | 527k | coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0604u)); | 673 | | // coeffs 4 5 4 5 4 5 4 5 | 674 | 527k | coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0a08u)); | 675 | 527k | } |
convolve_avx2.c:prepare_coeffs_4t_ssse3 Line | Count | Source | 656 | 259k | __m128i *const coeffs /* [4] */) { | 657 | 259k | const int16_t *const filter = av1_get_interp_filter_subpel_kernel( | 658 | 259k | filter_params, subpel_q4 & SUBPEL_MASK); | 659 | 259k | const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); | 660 | | | 661 | | // right shift all filter co-efficients by 1 to reduce the bits required. | 662 | | // This extra right shift will be taken care of at the end while rounding | 663 | | // the result. | 664 | | // Since all filter co-efficients are even, this change will not affect the | 665 | | // end result | 666 | 259k | assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), | 667 | 259k | _mm_set1_epi16((short)0xffff))); | 668 | | | 669 | 259k | const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1); | 670 | | | 671 | | // coeffs 2 3 2 3 2 3 2 3 | 672 | 259k | coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0604u)); | 673 | | // coeffs 4 5 4 5 4 5 4 5 | 674 | 259k | coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0a08u)); | 675 | 259k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:prepare_coeffs_4t_ssse3 Unexecuted instantiation: wiener_convolve_avx2.c:prepare_coeffs_4t_ssse3 Unexecuted instantiation: highbd_convolve_2d_avx2.c:prepare_coeffs_4t_ssse3 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:prepare_coeffs_4t_ssse3 |
676 | | |
677 | | static inline void prepare_coeffs_6t_ssse3( |
678 | | const InterpFilterParams *const filter_params, const int32_t subpel_q4, |
679 | 63.2k | __m128i *const coeffs /* [4] */) { |
680 | 63.2k | const int16_t *const filter = av1_get_interp_filter_subpel_kernel( |
681 | 63.2k | filter_params, subpel_q4 & SUBPEL_MASK); |
682 | 63.2k | const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); |
683 | | |
684 | | // right shift all filter co-efficients by 1 to reduce the bits required. |
685 | | // This extra right shift will be taken care of at the end while rounding |
686 | | // the result. |
687 | | // Since all filter co-efficients are even, this change will not affect the |
688 | | // end result |
689 | 63.2k | assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), |
690 | 63.2k | _mm_set1_epi16((short)0xffff))); |
691 | | |
692 | 63.2k | const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1); |
693 | | |
694 | | // coeffs 2 3 2 3 2 3 2 3 |
695 | 63.2k | coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0402u)); |
696 | | // coeffs 4 5 4 5 4 5 4 5 |
697 | 63.2k | coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0806u)); |
698 | | // coeffs 5 6 5 6 5 6 5 6 |
699 | 63.2k | coeffs[2] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0c0au)); |
700 | 63.2k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:prepare_coeffs_6t_ssse3 Unexecuted instantiation: highbd_convolve_avx2.c:prepare_coeffs_6t_ssse3 Unexecuted instantiation: convolve_2d_avx2.c:prepare_coeffs_6t_ssse3 convolve_avx2.c:prepare_coeffs_6t_ssse3 Line | Count | Source | 679 | 63.2k | __m128i *const coeffs /* [4] */) { | 680 | 63.2k | const int16_t *const filter = av1_get_interp_filter_subpel_kernel( | 681 | 63.2k | filter_params, subpel_q4 & SUBPEL_MASK); | 682 | 63.2k | const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); | 683 | | | 684 | | // right shift all filter co-efficients by 1 to reduce the bits required. | 685 | | // This extra right shift will be taken care of at the end while rounding | 686 | | // the result. | 687 | | // Since all filter co-efficients are even, this change will not affect the | 688 | | // end result | 689 | 63.2k | assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), | 690 | 63.2k | _mm_set1_epi16((short)0xffff))); | 691 | | | 692 | 63.2k | const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1); | 693 | | | 694 | | // coeffs 2 3 2 3 2 3 2 3 | 695 | 63.2k | coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0402u)); | 696 | | // coeffs 4 5 4 5 4 5 4 5 | 697 | 63.2k | coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0806u)); | 698 | | // coeffs 5 6 5 6 5 6 5 6 | 699 | 63.2k | coeffs[2] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0c0au)); | 700 | 63.2k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:prepare_coeffs_6t_ssse3 Unexecuted instantiation: wiener_convolve_avx2.c:prepare_coeffs_6t_ssse3 Unexecuted instantiation: highbd_convolve_2d_avx2.c:prepare_coeffs_6t_ssse3 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:prepare_coeffs_6t_ssse3 |
701 | | |
702 | | static inline void prepare_coeffs_ssse3( |
703 | | const InterpFilterParams *const filter_params, const int32_t subpel_q4, |
704 | 6.74k | __m128i *const coeffs /* [4] */) { |
705 | 6.74k | const int16_t *const filter = av1_get_interp_filter_subpel_kernel( |
706 | 6.74k | filter_params, subpel_q4 & SUBPEL_MASK); |
707 | 6.74k | const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); |
708 | | |
709 | | // right shift all filter co-efficients by 1 to reduce the bits required. |
710 | | // This extra right shift will be taken care of at the end while rounding |
711 | | // the result. |
712 | | // Since all filter co-efficients are even, this change will not affect the |
713 | | // end result |
714 | 6.74k | assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), |
715 | 6.74k | _mm_set1_epi16((short)0xffff))); |
716 | | |
717 | 6.74k | const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1); |
718 | | |
719 | | // coeffs 0 1 0 1 0 1 0 1 |
720 | 6.74k | coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0200u)); |
721 | | // coeffs 2 3 2 3 2 3 2 3 |
722 | 6.74k | coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0604u)); |
723 | | // coeffs 4 5 4 5 4 5 4 5 |
724 | 6.74k | coeffs[2] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0a08u)); |
725 | | // coeffs 6 7 6 7 6 7 6 7 |
726 | 6.74k | coeffs[3] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0e0cu)); |
727 | 6.74k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:prepare_coeffs_ssse3 Unexecuted instantiation: highbd_convolve_avx2.c:prepare_coeffs_ssse3 Unexecuted instantiation: convolve_2d_avx2.c:prepare_coeffs_ssse3 convolve_avx2.c:prepare_coeffs_ssse3 Line | Count | Source | 704 | 6.74k | __m128i *const coeffs /* [4] */) { | 705 | 6.74k | const int16_t *const filter = av1_get_interp_filter_subpel_kernel( | 706 | 6.74k | filter_params, subpel_q4 & SUBPEL_MASK); | 707 | 6.74k | const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); | 708 | | | 709 | | // right shift all filter co-efficients by 1 to reduce the bits required. | 710 | | // This extra right shift will be taken care of at the end while rounding | 711 | | // the result. | 712 | | // Since all filter co-efficients are even, this change will not affect the | 713 | | // end result | 714 | 6.74k | assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), | 715 | 6.74k | _mm_set1_epi16((short)0xffff))); | 716 | | | 717 | 6.74k | const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1); | 718 | | | 719 | | // coeffs 0 1 0 1 0 1 0 1 | 720 | 6.74k | coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0200u)); | 721 | | // coeffs 2 3 2 3 2 3 2 3 | 722 | 6.74k | coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0604u)); | 723 | | // coeffs 4 5 4 5 4 5 4 5 | 724 | 6.74k | coeffs[2] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0a08u)); | 725 | | // coeffs 6 7 6 7 6 7 6 7 | 726 | 6.74k | coeffs[3] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0e0cu)); | 727 | 6.74k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:prepare_coeffs_ssse3 Unexecuted instantiation: wiener_convolve_avx2.c:prepare_coeffs_ssse3 Unexecuted instantiation: highbd_convolve_2d_avx2.c:prepare_coeffs_ssse3 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:prepare_coeffs_ssse3 |
728 | | |
729 | | static inline void prepare_coeffs_2t_lowbd( |
730 | | const InterpFilterParams *const filter_params, const int subpel_q4, |
731 | 28.2k | __m256i *const coeffs /* [4] */) { |
732 | 28.2k | const int16_t *const filter = av1_get_interp_filter_subpel_kernel( |
733 | 28.2k | filter_params, subpel_q4 & SUBPEL_MASK); |
734 | 28.2k | const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); |
735 | 28.2k | const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8); |
736 | | |
737 | | // right shift all filter co-efficients by 1 to reduce the bits required. |
738 | | // This extra right shift will be taken care of at the end while rounding |
739 | | // the result. |
740 | | // Since all filter co-efficients are even, this change will not affect the |
741 | | // end result |
742 | 28.2k | assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), |
743 | 28.2k | _mm_set1_epi16((int16_t)0xffff))); |
744 | | |
745 | 28.2k | const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1); |
746 | | |
747 | | // coeffs 3 4 3 4 3 4 3 4 |
748 | 28.2k | coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0806u)); |
749 | 28.2k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:prepare_coeffs_2t_lowbd Unexecuted instantiation: highbd_convolve_avx2.c:prepare_coeffs_2t_lowbd convolve_2d_avx2.c:prepare_coeffs_2t_lowbd Line | Count | Source | 731 | 17.5k | __m256i *const coeffs /* [4] */) { | 732 | 17.5k | const int16_t *const filter = av1_get_interp_filter_subpel_kernel( | 733 | 17.5k | filter_params, subpel_q4 & SUBPEL_MASK); | 734 | 17.5k | const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); | 735 | 17.5k | const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8); | 736 | | | 737 | | // right shift all filter co-efficients by 1 to reduce the bits required. | 738 | | // This extra right shift will be taken care of at the end while rounding | 739 | | // the result. | 740 | | // Since all filter co-efficients are even, this change will not affect the | 741 | | // end result | 742 | 17.5k | assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), | 743 | 17.5k | _mm_set1_epi16((int16_t)0xffff))); | 744 | | | 745 | 17.5k | const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1); | 746 | | | 747 | | // coeffs 3 4 3 4 3 4 3 4 | 748 | 17.5k | coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0806u)); | 749 | 17.5k | } |
convolve_avx2.c:prepare_coeffs_2t_lowbd Line | Count | Source | 731 | 10.6k | __m256i *const coeffs /* [4] */) { | 732 | 10.6k | const int16_t *const filter = av1_get_interp_filter_subpel_kernel( | 733 | 10.6k | filter_params, subpel_q4 & SUBPEL_MASK); | 734 | 10.6k | const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); | 735 | 10.6k | const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8); | 736 | | | 737 | | // right shift all filter co-efficients by 1 to reduce the bits required. | 738 | | // This extra right shift will be taken care of at the end while rounding | 739 | | // the result. | 740 | | // Since all filter co-efficients are even, this change will not affect the | 741 | | // end result | 742 | 10.6k | assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), | 743 | 10.6k | _mm_set1_epi16((int16_t)0xffff))); | 744 | | | 745 | 10.6k | const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1); | 746 | | | 747 | | // coeffs 3 4 3 4 3 4 3 4 | 748 | 10.6k | coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0806u)); | 749 | 10.6k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:prepare_coeffs_2t_lowbd Unexecuted instantiation: wiener_convolve_avx2.c:prepare_coeffs_2t_lowbd Unexecuted instantiation: highbd_convolve_2d_avx2.c:prepare_coeffs_2t_lowbd Unexecuted instantiation: highbd_jnt_convolve_avx2.c:prepare_coeffs_2t_lowbd |
750 | | |
751 | | static inline void prepare_coeffs_4t_lowbd( |
752 | | const InterpFilterParams *const filter_params, const int subpel_q4, |
753 | 168k | __m256i *const coeffs /* [4] */) { |
754 | 168k | const int16_t *const filter = av1_get_interp_filter_subpel_kernel( |
755 | 168k | filter_params, subpel_q4 & SUBPEL_MASK); |
756 | 168k | const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); |
757 | 168k | const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8); |
758 | | |
759 | | // right shift all filter co-efficients by 1 to reduce the bits required. |
760 | | // This extra right shift will be taken care of at the end while rounding |
761 | | // the result. |
762 | | // Since all filter co-efficients are even, this change will not affect the |
763 | | // end result |
764 | 168k | assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), |
765 | 168k | _mm_set1_epi16((short)0xffff))); |
766 | | |
767 | 168k | const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1); |
768 | | |
769 | | // coeffs 2 3 2 3 2 3 2 3 |
770 | 168k | coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0604u)); |
771 | | // coeffs 4 5 4 5 4 5 4 5 |
772 | 168k | coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0a08u)); |
773 | 168k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:prepare_coeffs_4t_lowbd Unexecuted instantiation: highbd_convolve_avx2.c:prepare_coeffs_4t_lowbd convolve_2d_avx2.c:prepare_coeffs_4t_lowbd Line | Count | Source | 753 | 34.9k | __m256i *const coeffs /* [4] */) { | 754 | 34.9k | const int16_t *const filter = av1_get_interp_filter_subpel_kernel( | 755 | 34.9k | filter_params, subpel_q4 & SUBPEL_MASK); | 756 | 34.9k | const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); | 757 | 34.9k | const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8); | 758 | | | 759 | | // right shift all filter co-efficients by 1 to reduce the bits required. | 760 | | // This extra right shift will be taken care of at the end while rounding | 761 | | // the result. | 762 | | // Since all filter co-efficients are even, this change will not affect the | 763 | | // end result | 764 | 34.9k | assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), | 765 | 34.9k | _mm_set1_epi16((short)0xffff))); | 766 | | | 767 | 34.9k | const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1); | 768 | | | 769 | | // coeffs 2 3 2 3 2 3 2 3 | 770 | 34.9k | coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0604u)); | 771 | | // coeffs 4 5 4 5 4 5 4 5 | 772 | 34.9k | coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0a08u)); | 773 | 34.9k | } |
convolve_avx2.c:prepare_coeffs_4t_lowbd Line | Count | Source | 753 | 133k | __m256i *const coeffs /* [4] */) { | 754 | 133k | const int16_t *const filter = av1_get_interp_filter_subpel_kernel( | 755 | 133k | filter_params, subpel_q4 & SUBPEL_MASK); | 756 | 133k | const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); | 757 | 133k | const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8); | 758 | | | 759 | | // right shift all filter co-efficients by 1 to reduce the bits required. | 760 | | // This extra right shift will be taken care of at the end while rounding | 761 | | // the result. | 762 | | // Since all filter co-efficients are even, this change will not affect the | 763 | | // end result | 764 | 133k | assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), | 765 | 133k | _mm_set1_epi16((short)0xffff))); | 766 | | | 767 | 133k | const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1); | 768 | | | 769 | | // coeffs 2 3 2 3 2 3 2 3 | 770 | 133k | coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0604u)); | 771 | | // coeffs 4 5 4 5 4 5 4 5 | 772 | 133k | coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0a08u)); | 773 | 133k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:prepare_coeffs_4t_lowbd Unexecuted instantiation: wiener_convolve_avx2.c:prepare_coeffs_4t_lowbd Unexecuted instantiation: highbd_convolve_2d_avx2.c:prepare_coeffs_4t_lowbd Unexecuted instantiation: highbd_jnt_convolve_avx2.c:prepare_coeffs_4t_lowbd |
774 | | |
775 | | static inline void prepare_coeffs_6t_lowbd( |
776 | | const InterpFilterParams *const filter_params, const int subpel_q4, |
777 | 975k | __m256i *const coeffs /* [4] */) { |
778 | 975k | const int16_t *const filter = av1_get_interp_filter_subpel_kernel( |
779 | 975k | filter_params, subpel_q4 & SUBPEL_MASK); |
780 | 975k | const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); |
781 | 975k | const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8); |
782 | | |
783 | | // right shift all filter co-efficients by 1 to reduce the bits required. |
784 | | // This extra right shift will be taken care of at the end while rounding |
785 | | // the result. |
786 | | // Since all filter co-efficients are even, this change will not affect the |
787 | | // end result |
788 | 975k | assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), |
789 | 975k | _mm_set1_epi16((int16_t)0xffff))); |
790 | | |
791 | 975k | const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1); |
792 | | |
793 | | // coeffs 1 2 1 2 1 2 1 2 |
794 | 975k | coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0402u)); |
795 | | // coeffs 3 4 3 4 3 4 3 4 |
796 | 975k | coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0806u)); |
797 | | // coeffs 5 6 5 6 5 6 5 6 |
798 | 975k | coeffs[2] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0c0au)); |
799 | 975k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:prepare_coeffs_6t_lowbd Unexecuted instantiation: highbd_convolve_avx2.c:prepare_coeffs_6t_lowbd convolve_2d_avx2.c:prepare_coeffs_6t_lowbd Line | Count | Source | 777 | 632k | __m256i *const coeffs /* [4] */) { | 778 | 632k | const int16_t *const filter = av1_get_interp_filter_subpel_kernel( | 779 | 632k | filter_params, subpel_q4 & SUBPEL_MASK); | 780 | 632k | const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); | 781 | 632k | const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8); | 782 | | | 783 | | // right shift all filter co-efficients by 1 to reduce the bits required. | 784 | | // This extra right shift will be taken care of at the end while rounding | 785 | | // the result. | 786 | | // Since all filter co-efficients are even, this change will not affect the | 787 | | // end result | 788 | 632k | assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), | 789 | 632k | _mm_set1_epi16((int16_t)0xffff))); | 790 | | | 791 | 632k | const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1); | 792 | | | 793 | | // coeffs 1 2 1 2 1 2 1 2 | 794 | 632k | coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0402u)); | 795 | | // coeffs 3 4 3 4 3 4 3 4 | 796 | 632k | coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0806u)); | 797 | | // coeffs 5 6 5 6 5 6 5 6 | 798 | 632k | coeffs[2] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0c0au)); | 799 | 632k | } |
convolve_avx2.c:prepare_coeffs_6t_lowbd Line | Count | Source | 777 | 343k | __m256i *const coeffs /* [4] */) { | 778 | 343k | const int16_t *const filter = av1_get_interp_filter_subpel_kernel( | 779 | 343k | filter_params, subpel_q4 & SUBPEL_MASK); | 780 | 343k | const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); | 781 | 343k | const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8); | 782 | | | 783 | | // right shift all filter co-efficients by 1 to reduce the bits required. | 784 | | // This extra right shift will be taken care of at the end while rounding | 785 | | // the result. | 786 | | // Since all filter co-efficients are even, this change will not affect the | 787 | | // end result | 788 | 343k | assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), | 789 | 343k | _mm_set1_epi16((int16_t)0xffff))); | 790 | | | 791 | 343k | const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1); | 792 | | | 793 | | // coeffs 1 2 1 2 1 2 1 2 | 794 | 343k | coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0402u)); | 795 | | // coeffs 3 4 3 4 3 4 3 4 | 796 | 343k | coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0806u)); | 797 | | // coeffs 5 6 5 6 5 6 5 6 | 798 | 343k | coeffs[2] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0c0au)); | 799 | 343k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:prepare_coeffs_6t_lowbd Unexecuted instantiation: wiener_convolve_avx2.c:prepare_coeffs_6t_lowbd Unexecuted instantiation: highbd_convolve_2d_avx2.c:prepare_coeffs_6t_lowbd Unexecuted instantiation: highbd_jnt_convolve_avx2.c:prepare_coeffs_6t_lowbd |
800 | | |
801 | | static inline void prepare_coeffs_lowbd( |
802 | | const InterpFilterParams *const filter_params, const int subpel_q4, |
803 | 400k | __m256i *const coeffs /* [4] */) { |
804 | 400k | const int16_t *const filter = av1_get_interp_filter_subpel_kernel( |
805 | 400k | filter_params, subpel_q4 & SUBPEL_MASK); |
806 | 400k | const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); |
807 | 400k | const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8); |
808 | | |
809 | | // right shift all filter co-efficients by 1 to reduce the bits required. |
810 | | // This extra right shift will be taken care of at the end while rounding |
811 | | // the result. |
812 | | // Since all filter co-efficients are even, this change will not affect the |
813 | | // end result |
814 | 400k | assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), |
815 | 400k | _mm_set1_epi16((short)0xffff))); |
816 | | |
817 | 400k | const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1); |
818 | | |
819 | | // coeffs 0 1 0 1 0 1 0 1 |
820 | 400k | coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0200u)); |
821 | | // coeffs 2 3 2 3 2 3 2 3 |
822 | 400k | coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0604u)); |
823 | | // coeffs 4 5 4 5 4 5 4 5 |
824 | 400k | coeffs[2] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0a08u)); |
825 | | // coeffs 6 7 6 7 6 7 6 7 |
826 | 400k | coeffs[3] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0e0cu)); |
827 | 400k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:prepare_coeffs_lowbd Unexecuted instantiation: highbd_convolve_avx2.c:prepare_coeffs_lowbd convolve_2d_avx2.c:prepare_coeffs_lowbd Line | Count | Source | 803 | 57.6k | __m256i *const coeffs /* [4] */) { | 804 | 57.6k | const int16_t *const filter = av1_get_interp_filter_subpel_kernel( | 805 | 57.6k | filter_params, subpel_q4 & SUBPEL_MASK); | 806 | 57.6k | const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); | 807 | 57.6k | const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8); | 808 | | | 809 | | // right shift all filter co-efficients by 1 to reduce the bits required. | 810 | | // This extra right shift will be taken care of at the end while rounding | 811 | | // the result. | 812 | | // Since all filter co-efficients are even, this change will not affect the | 813 | | // end result | 814 | 57.6k | assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), | 815 | 57.6k | _mm_set1_epi16((short)0xffff))); | 816 | | | 817 | 57.6k | const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1); | 818 | | | 819 | | // coeffs 0 1 0 1 0 1 0 1 | 820 | 57.6k | coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0200u)); | 821 | | // coeffs 2 3 2 3 2 3 2 3 | 822 | 57.6k | coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0604u)); | 823 | | // coeffs 4 5 4 5 4 5 4 5 | 824 | 57.6k | coeffs[2] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0a08u)); | 825 | | // coeffs 6 7 6 7 6 7 6 7 | 826 | 57.6k | coeffs[3] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0e0cu)); | 827 | 57.6k | } |
convolve_avx2.c:prepare_coeffs_lowbd Line | Count | Source | 803 | 36.1k | __m256i *const coeffs /* [4] */) { | 804 | 36.1k | const int16_t *const filter = av1_get_interp_filter_subpel_kernel( | 805 | 36.1k | filter_params, subpel_q4 & SUBPEL_MASK); | 806 | 36.1k | const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); | 807 | 36.1k | const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8); | 808 | | | 809 | | // right shift all filter co-efficients by 1 to reduce the bits required. | 810 | | // This extra right shift will be taken care of at the end while rounding | 811 | | // the result. | 812 | | // Since all filter co-efficients are even, this change will not affect the | 813 | | // end result | 814 | 36.1k | assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), | 815 | 36.1k | _mm_set1_epi16((short)0xffff))); | 816 | | | 817 | 36.1k | const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1); | 818 | | | 819 | | // coeffs 0 1 0 1 0 1 0 1 | 820 | 36.1k | coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0200u)); | 821 | | // coeffs 2 3 2 3 2 3 2 3 | 822 | 36.1k | coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0604u)); | 823 | | // coeffs 4 5 4 5 4 5 4 5 | 824 | 36.1k | coeffs[2] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0a08u)); | 825 | | // coeffs 6 7 6 7 6 7 6 7 | 826 | 36.1k | coeffs[3] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0e0cu)); | 827 | 36.1k | } |
jnt_convolve_avx2.c:prepare_coeffs_lowbd Line | Count | Source | 803 | 306k | __m256i *const coeffs /* [4] */) { | 804 | 306k | const int16_t *const filter = av1_get_interp_filter_subpel_kernel( | 805 | 306k | filter_params, subpel_q4 & SUBPEL_MASK); | 806 | 306k | const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); | 807 | 306k | const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8); | 808 | | | 809 | | // right shift all filter co-efficients by 1 to reduce the bits required. | 810 | | // This extra right shift will be taken care of at the end while rounding | 811 | | // the result. | 812 | | // Since all filter co-efficients are even, this change will not affect the | 813 | | // end result | 814 | 306k | assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), | 815 | 306k | _mm_set1_epi16((short)0xffff))); | 816 | | | 817 | 306k | const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1); | 818 | | | 819 | | // coeffs 0 1 0 1 0 1 0 1 | 820 | 306k | coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0200u)); | 821 | | // coeffs 2 3 2 3 2 3 2 3 | 822 | 306k | coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0604u)); | 823 | | // coeffs 4 5 4 5 4 5 4 5 | 824 | 306k | coeffs[2] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0a08u)); | 825 | | // coeffs 6 7 6 7 6 7 6 7 | 826 | 306k | coeffs[3] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0e0cu)); | 827 | 306k | } |
Unexecuted instantiation: wiener_convolve_avx2.c:prepare_coeffs_lowbd Unexecuted instantiation: highbd_convolve_2d_avx2.c:prepare_coeffs_lowbd Unexecuted instantiation: highbd_jnt_convolve_avx2.c:prepare_coeffs_lowbd |
828 | | |
829 | | static inline void prepare_coeffs_2t( |
830 | | const InterpFilterParams *const filter_params, const int subpel_q4, |
831 | 36.6k | __m256i *const coeffs /* [4] */) { |
832 | 36.6k | const int16_t *filter = av1_get_interp_filter_subpel_kernel( |
833 | 36.6k | filter_params, subpel_q4 & SUBPEL_MASK); |
834 | | |
835 | 36.6k | const __m128i coeff_8 = _mm_loadu_si128((__m128i *)(filter + 1)); |
836 | 36.6k | const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8); |
837 | | |
838 | | // coeffs 3 4 3 4 3 4 3 4 |
839 | 36.6k | coeffs[0] = _mm256_shuffle_epi32(coeff, 0x55); |
840 | 36.6k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:prepare_coeffs_2t Unexecuted instantiation: highbd_convolve_avx2.c:prepare_coeffs_2t convolve_2d_avx2.c:prepare_coeffs_2t Line | Count | Source | 831 | 36.6k | __m256i *const coeffs /* [4] */) { | 832 | 36.6k | const int16_t *filter = av1_get_interp_filter_subpel_kernel( | 833 | 36.6k | filter_params, subpel_q4 & SUBPEL_MASK); | 834 | | | 835 | 36.6k | const __m128i coeff_8 = _mm_loadu_si128((__m128i *)(filter + 1)); | 836 | 36.6k | const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8); | 837 | | | 838 | | // coeffs 3 4 3 4 3 4 3 4 | 839 | | coeffs[0] = _mm256_shuffle_epi32(coeff, 0x55); | 840 | 36.6k | } |
Unexecuted instantiation: convolve_avx2.c:prepare_coeffs_2t Unexecuted instantiation: jnt_convolve_avx2.c:prepare_coeffs_2t Unexecuted instantiation: wiener_convolve_avx2.c:prepare_coeffs_2t Unexecuted instantiation: highbd_convolve_2d_avx2.c:prepare_coeffs_2t Unexecuted instantiation: highbd_jnt_convolve_avx2.c:prepare_coeffs_2t |
841 | | |
842 | | static inline void prepare_coeffs_4t( |
843 | | const InterpFilterParams *const filter_params, const int subpel_q4, |
844 | 693k | __m256i *const coeffs /* [4] */) { |
845 | 693k | const int16_t *filter = av1_get_interp_filter_subpel_kernel( |
846 | 693k | filter_params, subpel_q4 & SUBPEL_MASK); |
847 | | |
848 | 693k | const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter); |
849 | 693k | const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8); |
850 | | // coeffs 2 3 2 3 2 3 2 3 |
851 | 693k | coeffs[0] = _mm256_shuffle_epi32(coeff, 0x55); |
852 | | // coeffs 4 5 4 5 4 5 4 5 |
853 | 693k | coeffs[1] = _mm256_shuffle_epi32(coeff, 0xaa); |
854 | 693k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:prepare_coeffs_4t Unexecuted instantiation: highbd_convolve_avx2.c:prepare_coeffs_4t convolve_2d_avx2.c:prepare_coeffs_4t Line | Count | Source | 844 | 693k | __m256i *const coeffs /* [4] */) { | 845 | 693k | const int16_t *filter = av1_get_interp_filter_subpel_kernel( | 846 | 693k | filter_params, subpel_q4 & SUBPEL_MASK); | 847 | | | 848 | 693k | const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter); | 849 | 693k | const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8); | 850 | | // coeffs 2 3 2 3 2 3 2 3 | 851 | 693k | coeffs[0] = _mm256_shuffle_epi32(coeff, 0x55); | 852 | | // coeffs 4 5 4 5 4 5 4 5 | 853 | | coeffs[1] = _mm256_shuffle_epi32(coeff, 0xaa); | 854 | 693k | } |
Unexecuted instantiation: convolve_avx2.c:prepare_coeffs_4t Unexecuted instantiation: jnt_convolve_avx2.c:prepare_coeffs_4t Unexecuted instantiation: wiener_convolve_avx2.c:prepare_coeffs_4t Unexecuted instantiation: highbd_convolve_2d_avx2.c:prepare_coeffs_4t Unexecuted instantiation: highbd_jnt_convolve_avx2.c:prepare_coeffs_4t |
855 | | |
856 | | static inline void prepare_coeffs_6t( |
857 | | const InterpFilterParams *const filter_params, const int subpel_q4, |
858 | 508k | __m256i *const coeffs /* [4] */) { |
859 | 508k | const int16_t *filter = av1_get_interp_filter_subpel_kernel( |
860 | 508k | filter_params, subpel_q4 & SUBPEL_MASK); |
861 | | |
862 | 508k | const __m128i coeff_8 = _mm_loadu_si128((__m128i *)(filter + 1)); |
863 | 508k | const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8); |
864 | | |
865 | | // coeffs 1 2 1 2 1 2 1 2 |
866 | 508k | coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00); |
867 | | // coeffs 3 4 3 4 3 4 3 4 |
868 | 508k | coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55); |
869 | | // coeffs 5 6 5 6 5 6 5 6 |
870 | 508k | coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa); |
871 | 508k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:prepare_coeffs_6t Unexecuted instantiation: highbd_convolve_avx2.c:prepare_coeffs_6t convolve_2d_avx2.c:prepare_coeffs_6t Line | Count | Source | 858 | 508k | __m256i *const coeffs /* [4] */) { | 859 | 508k | const int16_t *filter = av1_get_interp_filter_subpel_kernel( | 860 | 508k | filter_params, subpel_q4 & SUBPEL_MASK); | 861 | | | 862 | 508k | const __m128i coeff_8 = _mm_loadu_si128((__m128i *)(filter + 1)); | 863 | 508k | const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8); | 864 | | | 865 | | // coeffs 1 2 1 2 1 2 1 2 | 866 | 508k | coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00); | 867 | | // coeffs 3 4 3 4 3 4 3 4 | 868 | 508k | coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55); | 869 | | // coeffs 5 6 5 6 5 6 5 6 | 870 | | coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa); | 871 | 508k | } |
Unexecuted instantiation: convolve_avx2.c:prepare_coeffs_6t Unexecuted instantiation: jnt_convolve_avx2.c:prepare_coeffs_6t Unexecuted instantiation: wiener_convolve_avx2.c:prepare_coeffs_6t Unexecuted instantiation: highbd_convolve_2d_avx2.c:prepare_coeffs_6t Unexecuted instantiation: highbd_jnt_convolve_avx2.c:prepare_coeffs_6t |
872 | | |
873 | | static inline void prepare_coeffs(const InterpFilterParams *const filter_params, |
874 | | const int subpel_q4, |
875 | 8.46M | __m256i *const coeffs /* [4] */) { |
876 | 8.46M | const int16_t *filter = av1_get_interp_filter_subpel_kernel( |
877 | 8.46M | filter_params, subpel_q4 & SUBPEL_MASK); |
878 | | |
879 | 8.46M | const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter); |
880 | 8.46M | const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8); |
881 | | |
882 | | // coeffs 0 1 0 1 0 1 0 1 |
883 | 8.46M | coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00); |
884 | | // coeffs 2 3 2 3 2 3 2 3 |
885 | 8.46M | coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55); |
886 | | // coeffs 4 5 4 5 4 5 4 5 |
887 | 8.46M | coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa); |
888 | | // coeffs 6 7 6 7 6 7 6 7 |
889 | 8.46M | coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff); |
890 | 8.46M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:prepare_coeffs highbd_convolve_avx2.c:prepare_coeffs Line | Count | Source | 875 | 1.61M | __m256i *const coeffs /* [4] */) { | 876 | 1.61M | const int16_t *filter = av1_get_interp_filter_subpel_kernel( | 877 | 1.61M | filter_params, subpel_q4 & SUBPEL_MASK); | 878 | | | 879 | 1.61M | const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter); | 880 | 1.61M | const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8); | 881 | | | 882 | | // coeffs 0 1 0 1 0 1 0 1 | 883 | 1.61M | coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00); | 884 | | // coeffs 2 3 2 3 2 3 2 3 | 885 | 1.61M | coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55); | 886 | | // coeffs 4 5 4 5 4 5 4 5 | 887 | 1.61M | coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa); | 888 | | // coeffs 6 7 6 7 6 7 6 7 | 889 | | coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff); | 890 | 1.61M | } |
convolve_2d_avx2.c:prepare_coeffs Line | Count | Source | 875 | 50.6k | __m256i *const coeffs /* [4] */) { | 876 | 50.6k | const int16_t *filter = av1_get_interp_filter_subpel_kernel( | 877 | 50.6k | filter_params, subpel_q4 & SUBPEL_MASK); | 878 | | | 879 | 50.6k | const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter); | 880 | 50.6k | const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8); | 881 | | | 882 | | // coeffs 0 1 0 1 0 1 0 1 | 883 | 50.6k | coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00); | 884 | | // coeffs 2 3 2 3 2 3 2 3 | 885 | 50.6k | coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55); | 886 | | // coeffs 4 5 4 5 4 5 4 5 | 887 | 50.6k | coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa); | 888 | | // coeffs 6 7 6 7 6 7 6 7 | 889 | | coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff); | 890 | 50.6k | } |
Unexecuted instantiation: convolve_avx2.c:prepare_coeffs jnt_convolve_avx2.c:prepare_coeffs Line | Count | Source | 875 | 162k | __m256i *const coeffs /* [4] */) { | 876 | 162k | const int16_t *filter = av1_get_interp_filter_subpel_kernel( | 877 | 162k | filter_params, subpel_q4 & SUBPEL_MASK); | 878 | | | 879 | 162k | const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter); | 880 | 162k | const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8); | 881 | | | 882 | | // coeffs 0 1 0 1 0 1 0 1 | 883 | 162k | coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00); | 884 | | // coeffs 2 3 2 3 2 3 2 3 | 885 | 162k | coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55); | 886 | | // coeffs 4 5 4 5 4 5 4 5 | 887 | 162k | coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa); | 888 | | // coeffs 6 7 6 7 6 7 6 7 | 889 | | coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff); | 890 | 162k | } |
Unexecuted instantiation: wiener_convolve_avx2.c:prepare_coeffs highbd_convolve_2d_avx2.c:prepare_coeffs Line | Count | Source | 875 | 5.56M | __m256i *const coeffs /* [4] */) { | 876 | 5.56M | const int16_t *filter = av1_get_interp_filter_subpel_kernel( | 877 | 5.56M | filter_params, subpel_q4 & SUBPEL_MASK); | 878 | | | 879 | 5.56M | const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter); | 880 | 5.56M | const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8); | 881 | | | 882 | | // coeffs 0 1 0 1 0 1 0 1 | 883 | 5.56M | coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00); | 884 | | // coeffs 2 3 2 3 2 3 2 3 | 885 | 5.56M | coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55); | 886 | | // coeffs 4 5 4 5 4 5 4 5 | 887 | 5.56M | coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa); | 888 | | // coeffs 6 7 6 7 6 7 6 7 | 889 | | coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff); | 890 | 5.56M | } |
highbd_jnt_convolve_avx2.c:prepare_coeffs Line | Count | Source | 875 | 1.06M | __m256i *const coeffs /* [4] */) { | 876 | 1.06M | const int16_t *filter = av1_get_interp_filter_subpel_kernel( | 877 | 1.06M | filter_params, subpel_q4 & SUBPEL_MASK); | 878 | | | 879 | 1.06M | const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter); | 880 | 1.06M | const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8); | 881 | | | 882 | | // coeffs 0 1 0 1 0 1 0 1 | 883 | 1.06M | coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00); | 884 | | // coeffs 2 3 2 3 2 3 2 3 | 885 | 1.06M | coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55); | 886 | | // coeffs 4 5 4 5 4 5 4 5 | 887 | 1.06M | coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa); | 888 | | // coeffs 6 7 6 7 6 7 6 7 | 889 | | coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff); | 890 | 1.06M | } |
|
891 | | |
892 | | static inline void prepare_coeffs_12taps( |
893 | | const InterpFilterParams *const filter_params, const int subpel_q4, |
894 | 0 | __m256i *const coeffs /* [4] */) { |
895 | 0 | const int16_t *filter = av1_get_interp_filter_subpel_kernel( |
896 | 0 | filter_params, subpel_q4 & SUBPEL_MASK); |
897 | |
|
898 | 0 | __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter); |
899 | 0 | __m256i coeff = _mm256_broadcastsi128_si256(coeff_8); |
900 | | |
901 | | // coeffs 0 1 0 1 0 1 0 1 |
902 | 0 | coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00); |
903 | | // coeffs 2 3 2 3 2 3 2 3 |
904 | 0 | coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55); |
905 | | // coeffs 4 5 4 5 4 5 4 5 |
906 | 0 | coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa); |
907 | | // coeffs 6 7 6 7 6 7 6 7 |
908 | 0 | coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff); |
909 | | // coeffs 8 9 10 11 0 0 0 0 |
910 | 0 | coeff_8 = _mm_loadl_epi64((__m128i *)(filter + 8)); |
911 | 0 | coeff = _mm256_broadcastq_epi64(coeff_8); |
912 | 0 | coeffs[4] = _mm256_shuffle_epi32(coeff, 0x00); // coeffs 8 9 8 9 8 9 8 9 |
913 | 0 | coeffs[5] = _mm256_shuffle_epi32(coeff, 0x55); // coeffs 10 11 10 11.. 10 11 |
914 | 0 | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:prepare_coeffs_12taps Unexecuted instantiation: highbd_convolve_avx2.c:prepare_coeffs_12taps Unexecuted instantiation: convolve_2d_avx2.c:prepare_coeffs_12taps Unexecuted instantiation: convolve_avx2.c:prepare_coeffs_12taps Unexecuted instantiation: jnt_convolve_avx2.c:prepare_coeffs_12taps Unexecuted instantiation: wiener_convolve_avx2.c:prepare_coeffs_12taps Unexecuted instantiation: highbd_convolve_2d_avx2.c:prepare_coeffs_12taps Unexecuted instantiation: highbd_jnt_convolve_avx2.c:prepare_coeffs_12taps |
915 | | |
// 4-tap low-bitdepth convolution on 128-bit registers.
//
// Each _mm_maddubs_epi16 multiplies the unsigned bytes of ss[i] by the signed
// bytes of coeffs[i] and adds adjacent products into 16-bit lanes; the two
// partial sums are then added per 16-bit lane and returned.
static inline __m128i convolve_lowbd_4tap_ssse3(const __m128i ss[2],
                                                const __m128i coeffs[2]) {
  const __m128i first_pair = _mm_maddubs_epi16(ss[0], coeffs[0]);
  const __m128i second_pair = _mm_maddubs_epi16(ss[1], coeffs[1]);

  return _mm_add_epi16(first_pair, second_pair);
}
convolve_avx2.c:convolve_lowbd_4tap_ssse3 Line | Count | Source | 917 | 676k | const __m128i coeffs[2]) { | 918 | 676k | const __m128i res_01 = _mm_maddubs_epi16(ss[0], coeffs[0]); | 919 | 676k | const __m128i res_23 = _mm_maddubs_epi16(ss[1], coeffs[1]); | 920 | | | 921 | 676k | return _mm_add_epi16(res_01, res_23); | 922 | 676k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_lowbd_4tap_ssse3 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_lowbd_4tap_ssse3 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_lowbd_4tap_ssse3 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_lowbd_4tap_ssse3 |
923 | | |
// 6-tap low-bitdepth convolution on 128-bit registers.
//
// Each _mm_maddubs_epi16 multiplies the unsigned bytes of ss[i] by the signed
// bytes of coeffs[i] and adds adjacent products into 16-bit lanes; the three
// partial sums are then accumulated per 16-bit lane and returned.
static inline __m128i convolve_lowbd_6tap_ssse3(const __m128i ss[3],
                                                const __m128i coeffs[3]) {
  const __m128i pair0 = _mm_maddubs_epi16(ss[0], coeffs[0]);
  const __m128i pair1 = _mm_maddubs_epi16(ss[1], coeffs[1]);
  const __m128i pair2 = _mm_maddubs_epi16(ss[2], coeffs[2]);

  // Outer pairs first, then the middle pair.
  const __m128i outer = _mm_add_epi16(pair0, pair2);
  return _mm_add_epi16(outer, pair1);
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_lowbd_6tap_ssse3 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_lowbd_6tap_ssse3 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_lowbd_6tap_ssse3 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_lowbd_6tap_ssse3 |
934 | | |
// 8-tap low-bitdepth convolution on 128-bit registers.
//
// Each _mm_maddubs_epi16 multiplies the unsigned bytes of ss[i] by the signed
// bytes of coeffs[i] and adds adjacent products into 16-bit lanes; the four
// partial sums are then accumulated per 16-bit lane and returned.
static inline __m128i convolve_lowbd_ssse3(const __m128i ss[4],
                                           const __m128i coeffs[4]) {
  const __m128i pair0 = _mm_maddubs_epi16(ss[0], coeffs[0]);
  const __m128i pair1 = _mm_maddubs_epi16(ss[1], coeffs[1]);
  const __m128i pair2 = _mm_maddubs_epi16(ss[2], coeffs[2]);
  const __m128i pair3 = _mm_maddubs_epi16(ss[3], coeffs[3]);

  // Sum as (pair0 + pair2) + (pair1 + pair3).
  const __m128i even = _mm_add_epi16(pair0, pair2);
  const __m128i odd = _mm_add_epi16(pair1, pair3);
  return _mm_add_epi16(even, odd);
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_lowbd_ssse3 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_lowbd_ssse3 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_lowbd_ssse3 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_lowbd_ssse3 |
947 | | |
// 8-tap low-bitdepth convolution on 256-bit registers.
//
// Reads s[0..3] and coeffs[0..3]. Each _mm256_maddubs_epi16 multiplies the
// unsigned bytes of s[i] by the signed bytes of coeffs[i] and adds adjacent
// products into 16-bit lanes; the four partial sums are accumulated per
// 16-bit lane and returned.
static inline __m256i convolve_lowbd(const __m256i *const s,
                                     const __m256i *const coeffs) {
  const __m256i pair0 = _mm256_maddubs_epi16(s[0], coeffs[0]);
  const __m256i pair1 = _mm256_maddubs_epi16(s[1], coeffs[1]);
  const __m256i pair2 = _mm256_maddubs_epi16(s[2], coeffs[2]);
  const __m256i pair3 = _mm256_maddubs_epi16(s[3], coeffs[3]);

  // Sum as (pair0 + pair2) + (pair1 + pair3).
  // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
  const __m256i even = _mm256_add_epi16(pair0, pair2);
  const __m256i odd = _mm256_add_epi16(pair1, pair3);
  return _mm256_add_epi16(even, odd);
}
convolve_avx2.c:convolve_lowbd Line | Count | Source | 949 | 617k | const __m256i *const coeffs) { | 950 | 617k | const __m256i res_01 = _mm256_maddubs_epi16(s[0], coeffs[0]); | 951 | 617k | const __m256i res_23 = _mm256_maddubs_epi16(s[1], coeffs[1]); | 952 | 617k | const __m256i res_45 = _mm256_maddubs_epi16(s[2], coeffs[2]); | 953 | 617k | const __m256i res_67 = _mm256_maddubs_epi16(s[3], coeffs[3]); | 954 | | | 955 | | // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | 956 | 617k | const __m256i res = _mm256_add_epi16(_mm256_add_epi16(res_01, res_45), | 957 | 617k | _mm256_add_epi16(res_23, res_67)); | 958 | | | 959 | 617k | return res; | 960 | 617k | } |
jnt_convolve_avx2.c:convolve_lowbd Line | Count | Source | 949 | 5.26M | const __m256i *const coeffs) { | 950 | 5.26M | const __m256i res_01 = _mm256_maddubs_epi16(s[0], coeffs[0]); | 951 | 5.26M | const __m256i res_23 = _mm256_maddubs_epi16(s[1], coeffs[1]); | 952 | 5.26M | const __m256i res_45 = _mm256_maddubs_epi16(s[2], coeffs[2]); | 953 | 5.26M | const __m256i res_67 = _mm256_maddubs_epi16(s[3], coeffs[3]); | 954 | | | 955 | | // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | 956 | 5.26M | const __m256i res = _mm256_add_epi16(_mm256_add_epi16(res_01, res_45), | 957 | 5.26M | _mm256_add_epi16(res_23, res_67)); | 958 | | | 959 | 5.26M | return res; | 960 | 5.26M | } |
wiener_convolve_avx2.c:convolve_lowbd Line | Count | Source | 949 | 18.6M | const __m256i *const coeffs) { | 950 | 18.6M | const __m256i res_01 = _mm256_maddubs_epi16(s[0], coeffs[0]); | 951 | 18.6M | const __m256i res_23 = _mm256_maddubs_epi16(s[1], coeffs[1]); | 952 | 18.6M | const __m256i res_45 = _mm256_maddubs_epi16(s[2], coeffs[2]); | 953 | 18.6M | const __m256i res_67 = _mm256_maddubs_epi16(s[3], coeffs[3]); | 954 | | | 955 | | // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | 956 | 18.6M | const __m256i res = _mm256_add_epi16(_mm256_add_epi16(res_01, res_45), | 957 | 18.6M | _mm256_add_epi16(res_23, res_67)); | 958 | | | 959 | 18.6M | return res; | 960 | 18.6M | } |
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_lowbd Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_lowbd |
961 | | |
// 6-tap low-bitdepth convolution on 256-bit registers.
//
// Reads s[0..2] and coeffs[0..2]. Each _mm256_maddubs_epi16 multiplies the
// unsigned bytes of s[i] by the signed bytes of coeffs[i] and adds adjacent
// products into 16-bit lanes; the three partial sums are accumulated per
// 16-bit lane and returned.
static inline __m256i convolve_lowbd_6tap(const __m256i *const s,
                                          const __m256i *const coeffs) {
  const __m256i pair0 = _mm256_maddubs_epi16(s[0], coeffs[0]);
  const __m256i pair1 = _mm256_maddubs_epi16(s[1], coeffs[1]);
  const __m256i pair2 = _mm256_maddubs_epi16(s[2], coeffs[2]);

  // Outer pairs first, then the middle pair.
  // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
  const __m256i outer = _mm256_add_epi16(pair0, pair2);
  return _mm256_add_epi16(outer, pair1);
}
convolve_avx2.c:convolve_lowbd_6tap Line | Count | Source | 963 | 6.81M | const __m256i *const coeffs) { | 964 | 6.81M | const __m256i res_01 = _mm256_maddubs_epi16(s[0], coeffs[0]); | 965 | 6.81M | const __m256i res_23 = _mm256_maddubs_epi16(s[1], coeffs[1]); | 966 | 6.81M | const __m256i res_45 = _mm256_maddubs_epi16(s[2], coeffs[2]); | 967 | | | 968 | | // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | 969 | 6.81M | const __m256i res = | 970 | 6.81M | _mm256_add_epi16(_mm256_add_epi16(res_01, res_45), res_23); | 971 | | | 972 | 6.81M | return res; | 973 | 6.81M | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_lowbd_6tap Unexecuted instantiation: wiener_convolve_avx2.c:convolve_lowbd_6tap Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_lowbd_6tap Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_lowbd_6tap |
974 | | |
// 4-tap low-bitdepth convolution on 256-bit registers.
//
// Reads s[0..1] and coeffs[0..1]. Each _mm256_maddubs_epi16 multiplies the
// unsigned bytes of s[i] by the signed bytes of coeffs[i] and adds adjacent
// products into 16-bit lanes; the two partial sums are added per 16-bit lane
// and returned.
static inline __m256i convolve_lowbd_4tap(const __m256i *const s,
                                          const __m256i *const coeffs) {
  const __m256i first_pair = _mm256_maddubs_epi16(s[0], coeffs[0]);
  const __m256i second_pair = _mm256_maddubs_epi16(s[1], coeffs[1]);

  // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
  return _mm256_add_epi16(second_pair, first_pair);
}
convolve_avx2.c:convolve_lowbd_4tap Line | Count | Source | 976 | 1.13M | const __m256i *const coeffs) { | 977 | 1.13M | const __m256i res_23 = _mm256_maddubs_epi16(s[0], coeffs[0]); | 978 | 1.13M | const __m256i res_45 = _mm256_maddubs_epi16(s[1], coeffs[1]); | 979 | | | 980 | | // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | 981 | 1.13M | const __m256i res = _mm256_add_epi16(res_45, res_23); | 982 | | | 983 | 1.13M | return res; | 984 | 1.13M | } |
jnt_convolve_avx2.c:convolve_lowbd_4tap Line | Count | Source | 976 | 1.85M | const __m256i *const coeffs) { | 977 | 1.85M | const __m256i res_23 = _mm256_maddubs_epi16(s[0], coeffs[0]); | 978 | 1.85M | const __m256i res_45 = _mm256_maddubs_epi16(s[1], coeffs[1]); | 979 | | | 980 | | // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | 981 | 1.85M | const __m256i res = _mm256_add_epi16(res_45, res_23); | 982 | | | 983 | 1.85M | return res; | 984 | 1.85M | } |
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_lowbd_4tap Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_lowbd_4tap Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_lowbd_4tap |
985 | | |
// 6-tap convolution of 16-bit samples on 256-bit registers.
//
// Reads s[0..2] and coeffs[0..2]. Each _mm256_madd_epi16 multiplies signed
// 16-bit elements and adds adjacent products into 32-bit lanes; the three
// partial sums are accumulated per 32-bit lane and returned.
static inline __m256i convolve_6tap(const __m256i *const s,
                                    const __m256i *const coeffs) {
  const __m256i prod0 = _mm256_madd_epi16(s[0], coeffs[0]);
  const __m256i prod1 = _mm256_madd_epi16(s[1], coeffs[1]);
  const __m256i prod2 = _mm256_madd_epi16(s[2], coeffs[2]);

  const __m256i first_two = _mm256_add_epi32(prod0, prod1);
  return _mm256_add_epi32(first_two, prod2);
}
Unexecuted instantiation: convolve_avx2.c:convolve_6tap Unexecuted instantiation: jnt_convolve_avx2.c:convolve_6tap Unexecuted instantiation: wiener_convolve_avx2.c:convolve_6tap Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_6tap Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_6tap |
996 | | |
// 12-tap convolution of 16-bit samples on 256-bit registers.
//
// Reads s[0..5] and coeffs[0..5]. Each _mm256_madd_epi16 multiplies signed
// 16-bit elements and adds adjacent products into 32-bit lanes; the six
// partial sums are accumulated per 32-bit lane and returned.
static inline __m256i convolve_12taps(const __m256i *const s,
                                      const __m256i *const coeffs) {
  const __m256i prod0 = _mm256_madd_epi16(s[0], coeffs[0]);
  const __m256i prod1 = _mm256_madd_epi16(s[1], coeffs[1]);
  const __m256i prod2 = _mm256_madd_epi16(s[2], coeffs[2]);
  const __m256i prod3 = _mm256_madd_epi16(s[3], coeffs[3]);
  const __m256i prod4 = _mm256_madd_epi16(s[4], coeffs[4]);
  const __m256i prod5 = _mm256_madd_epi16(s[5], coeffs[5]);

  // Sum as (prod4 + prod5) + ((prod0 + prod1) + (prod2 + prod3)).
  const __m256i sum_01 = _mm256_add_epi32(prod0, prod1);
  const __m256i sum_23 = _mm256_add_epi32(prod2, prod3);
  const __m256i sum_45 = _mm256_add_epi32(prod4, prod5);
  const __m256i sum_0123 = _mm256_add_epi32(sum_01, sum_23);
  return _mm256_add_epi32(sum_45, sum_0123);
}
1012 | | |
// 8-tap convolution of 16-bit samples on 256-bit registers.
//
// Reads s[0..3] and coeffs[0..3]. Each _mm256_madd_epi16 multiplies signed
// 16-bit elements and adds adjacent products into 32-bit lanes; the four
// partial sums are accumulated per 32-bit lane and returned.
static inline __m256i convolve(const __m256i *const s,
                               const __m256i *const coeffs) {
  const __m256i prod0 = _mm256_madd_epi16(s[0], coeffs[0]);
  const __m256i prod1 = _mm256_madd_epi16(s[1], coeffs[1]);
  const __m256i prod2 = _mm256_madd_epi16(s[2], coeffs[2]);
  const __m256i prod3 = _mm256_madd_epi16(s[3], coeffs[3]);

  // Sum as (prod0 + prod1) + (prod2 + prod3).
  const __m256i sum_01 = _mm256_add_epi32(prod0, prod1);
  const __m256i sum_23 = _mm256_add_epi32(prod2, prod3);
  return _mm256_add_epi32(sum_01, sum_23);
}
convolve_2d_avx2.c:convolve Line | Count | Source | 1014 | 3.61M | const __m256i *const coeffs) { | 1015 | 3.61M | const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]); | 1016 | 3.61M | const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]); | 1017 | 3.61M | const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]); | 1018 | 3.61M | const __m256i res_3 = _mm256_madd_epi16(s[3], coeffs[3]); | 1019 | | | 1020 | 3.61M | const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1), | 1021 | 3.61M | _mm256_add_epi32(res_2, res_3)); | 1022 | | | 1023 | 3.61M | return res; | 1024 | 3.61M | } |
Unexecuted instantiation: convolve_avx2.c:convolve jnt_convolve_avx2.c:convolve Line | Count | Source | 1014 | 4.77M | const __m256i *const coeffs) { | 1015 | 4.77M | const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]); | 1016 | 4.77M | const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]); | 1017 | 4.77M | const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]); | 1018 | 4.77M | const __m256i res_3 = _mm256_madd_epi16(s[3], coeffs[3]); | 1019 | | | 1020 | 4.77M | const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1), | 1021 | 4.77M | _mm256_add_epi32(res_2, res_3)); | 1022 | | | 1023 | 4.77M | return res; | 1024 | 4.77M | } |
wiener_convolve_avx2.c:convolve Line | Count | Source | 1014 | 32.4M | const __m256i *const coeffs) { | 1015 | 32.4M | const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]); | 1016 | 32.4M | const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]); | 1017 | 32.4M | const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]); | 1018 | 32.4M | const __m256i res_3 = _mm256_madd_epi16(s[3], coeffs[3]); | 1019 | | | 1020 | 32.4M | const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1), | 1021 | 32.4M | _mm256_add_epi32(res_2, res_3)); | 1022 | | | 1023 | 32.4M | return res; | 1024 | 32.4M | } |
highbd_convolve_2d_avx2.c:convolve Line | Count | Source | 1014 | 110M | const __m256i *const coeffs) { | 1015 | 110M | const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]); | 1016 | 110M | const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]); | 1017 | 110M | const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]); | 1018 | 110M | const __m256i res_3 = _mm256_madd_epi16(s[3], coeffs[3]); | 1019 | | | 1020 | 110M | const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1), | 1021 | 110M | _mm256_add_epi32(res_2, res_3)); | 1022 | | | 1023 | 110M | return res; | 1024 | 110M | } |
highbd_jnt_convolve_avx2.c:convolve Line | Count | Source | 1014 | 44.1M | const __m256i *const coeffs) { | 1015 | 44.1M | const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]); | 1016 | 44.1M | const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]); | 1017 | 44.1M | const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]); | 1018 | 44.1M | const __m256i res_3 = _mm256_madd_epi16(s[3], coeffs[3]); | 1019 | | | 1020 | 44.1M | const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1), | 1021 | 44.1M | _mm256_add_epi32(res_2, res_3)); | 1022 | | | 1023 | 44.1M | return res; | 1024 | 44.1M | } |
|
1025 | | |
1026 | | static inline __m256i convolve_4tap(const __m256i *const s, |
1027 | 3.57M | const __m256i *const coeffs) { |
1028 | 3.57M | const __m256i res_1 = _mm256_madd_epi16(s[0], coeffs[0]); |
1029 | 3.57M | const __m256i res_2 = _mm256_madd_epi16(s[1], coeffs[1]); |
1030 | | |
1031 | 3.57M | const __m256i res = _mm256_add_epi32(res_1, res_2); |
1032 | 3.57M | return res; |
1033 | 3.57M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_4tap Unexecuted instantiation: highbd_convolve_avx2.c:convolve_4tap convolve_2d_avx2.c:convolve_4tap Line | Count | Source | 1027 | 3.34M | const __m256i *const coeffs) { | 1028 | 3.34M | const __m256i res_1 = _mm256_madd_epi16(s[0], coeffs[0]); | 1029 | 3.34M | const __m256i res_2 = _mm256_madd_epi16(s[1], coeffs[1]); | 1030 | | | 1031 | 3.34M | const __m256i res = _mm256_add_epi32(res_1, res_2); | 1032 | 3.34M | return res; | 1033 | 3.34M | } |
Unexecuted instantiation: convolve_avx2.c:convolve_4tap jnt_convolve_avx2.c:convolve_4tap Line | Count | Source | 1027 | 223k | const __m256i *const coeffs) { | 1028 | 223k | const __m256i res_1 = _mm256_madd_epi16(s[0], coeffs[0]); | 1029 | 223k | const __m256i res_2 = _mm256_madd_epi16(s[1], coeffs[1]); | 1030 | | | 1031 | 223k | const __m256i res = _mm256_add_epi32(res_1, res_2); | 1032 | 223k | return res; | 1033 | 223k | } |
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_4tap Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_4tap Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_4tap |
1034 | | |
1035 | | static inline __m128i convolve_lowbd_x_2tap_ssse3(const __m128i data, |
1036 | | const __m128i *const coeffs, |
1037 | 76.0k | const __m128i *const filt) { |
1038 | 76.0k | __m128i s; |
1039 | 76.0k | s = _mm_shuffle_epi8(data, filt[0]); |
1040 | | |
1041 | 76.0k | return _mm_maddubs_epi16(s, coeffs[0]); |
1042 | 76.0k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_lowbd_x_2tap_ssse3 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_lowbd_x_2tap_ssse3 convolve_2d_avx2.c:convolve_lowbd_x_2tap_ssse3 Line | Count | Source | 1037 | 76.0k | const __m128i *const filt) { | 1038 | 76.0k | __m128i s; | 1039 | 76.0k | s = _mm_shuffle_epi8(data, filt[0]); | 1040 | | | 1041 | 76.0k | return _mm_maddubs_epi16(s, coeffs[0]); | 1042 | 76.0k | } |
Unexecuted instantiation: convolve_avx2.c:convolve_lowbd_x_2tap_ssse3 Unexecuted instantiation: jnt_convolve_avx2.c:convolve_lowbd_x_2tap_ssse3 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_lowbd_x_2tap_ssse3 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_lowbd_x_2tap_ssse3 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_lowbd_x_2tap_ssse3 |
1043 | | |
1044 | | static inline __m128i convolve_lowbd_x_4tap_ssse3(const __m128i data, |
1045 | | const __m128i *const coeffs, |
1046 | 2.81M | const __m128i *const filt) { |
1047 | 2.81M | __m128i s[2]; |
1048 | | |
1049 | 2.81M | s[0] = _mm_shuffle_epi8(data, filt[0]); |
1050 | 2.81M | s[1] = _mm_shuffle_epi8(data, filt[1]); |
1051 | | |
1052 | 2.81M | return convolve_lowbd_4tap_ssse3(s, coeffs); |
1053 | 2.81M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_lowbd_x_4tap_ssse3 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_lowbd_x_4tap_ssse3 convolve_2d_avx2.c:convolve_lowbd_x_4tap_ssse3 Line | Count | Source | 1046 | 2.81M | const __m128i *const filt) { | 1047 | 2.81M | __m128i s[2]; | 1048 | | | 1049 | 2.81M | s[0] = _mm_shuffle_epi8(data, filt[0]); | 1050 | 2.81M | s[1] = _mm_shuffle_epi8(data, filt[1]); | 1051 | | | 1052 | 2.81M | return convolve_lowbd_4tap_ssse3(s, coeffs); | 1053 | 2.81M | } |
Unexecuted instantiation: convolve_avx2.c:convolve_lowbd_x_4tap_ssse3 Unexecuted instantiation: jnt_convolve_avx2.c:convolve_lowbd_x_4tap_ssse3 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_lowbd_x_4tap_ssse3 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_lowbd_x_4tap_ssse3 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_lowbd_x_4tap_ssse3 |
1054 | | |
1055 | | static inline __m256i convolve_lowbd_x(const __m256i data, |
1056 | | const __m256i *const coeffs, |
1057 | 25.9M | const __m256i *const filt) { |
1058 | 25.9M | __m256i s[4]; |
1059 | | |
1060 | 25.9M | s[0] = _mm256_shuffle_epi8(data, filt[0]); |
1061 | 25.9M | s[1] = _mm256_shuffle_epi8(data, filt[1]); |
1062 | 25.9M | s[2] = _mm256_shuffle_epi8(data, filt[2]); |
1063 | 25.9M | s[3] = _mm256_shuffle_epi8(data, filt[3]); |
1064 | | |
1065 | 25.9M | return convolve_lowbd(s, coeffs); |
1066 | 25.9M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_lowbd_x Unexecuted instantiation: highbd_convolve_avx2.c:convolve_lowbd_x convolve_2d_avx2.c:convolve_lowbd_x Line | Count | Source | 1057 | 2.46M | const __m256i *const filt) { | 1058 | 2.46M | __m256i s[4]; | 1059 | | | 1060 | 2.46M | s[0] = _mm256_shuffle_epi8(data, filt[0]); | 1061 | 2.46M | s[1] = _mm256_shuffle_epi8(data, filt[1]); | 1062 | 2.46M | s[2] = _mm256_shuffle_epi8(data, filt[2]); | 1063 | 2.46M | s[3] = _mm256_shuffle_epi8(data, filt[3]); | 1064 | | | 1065 | 2.46M | return convolve_lowbd(s, coeffs); | 1066 | 2.46M | } |
convolve_avx2.c:convolve_lowbd_x Line | Count | Source | 1057 | 398k | const __m256i *const filt) { | 1058 | 398k | __m256i s[4]; | 1059 | | | 1060 | 398k | s[0] = _mm256_shuffle_epi8(data, filt[0]); | 1061 | 398k | s[1] = _mm256_shuffle_epi8(data, filt[1]); | 1062 | 398k | s[2] = _mm256_shuffle_epi8(data, filt[2]); | 1063 | 398k | s[3] = _mm256_shuffle_epi8(data, filt[3]); | 1064 | | | 1065 | 398k | return convolve_lowbd(s, coeffs); | 1066 | 398k | } |
jnt_convolve_avx2.c:convolve_lowbd_x Line | Count | Source | 1057 | 4.52M | const __m256i *const filt) { | 1058 | 4.52M | __m256i s[4]; | 1059 | | | 1060 | 4.52M | s[0] = _mm256_shuffle_epi8(data, filt[0]); | 1061 | 4.52M | s[1] = _mm256_shuffle_epi8(data, filt[1]); | 1062 | 4.52M | s[2] = _mm256_shuffle_epi8(data, filt[2]); | 1063 | 4.52M | s[3] = _mm256_shuffle_epi8(data, filt[3]); | 1064 | | | 1065 | 4.52M | return convolve_lowbd(s, coeffs); | 1066 | 4.52M | } |
wiener_convolve_avx2.c:convolve_lowbd_x Line | Count | Source | 1057 | 18.5M | const __m256i *const filt) { | 1058 | 18.5M | __m256i s[4]; | 1059 | | | 1060 | 18.5M | s[0] = _mm256_shuffle_epi8(data, filt[0]); | 1061 | 18.5M | s[1] = _mm256_shuffle_epi8(data, filt[1]); | 1062 | 18.5M | s[2] = _mm256_shuffle_epi8(data, filt[2]); | 1063 | 18.5M | s[3] = _mm256_shuffle_epi8(data, filt[3]); | 1064 | | | 1065 | 18.5M | return convolve_lowbd(s, coeffs); | 1066 | 18.5M | } |
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_lowbd_x Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_lowbd_x |
1067 | | |
1068 | | static inline __m256i convolve_lowbd_x_6tap(const __m256i data, |
1069 | | const __m256i *const coeffs, |
1070 | 14.5M | const __m256i *const filt) { |
1071 | 14.5M | __m256i s[4]; |
1072 | | |
1073 | 14.5M | s[0] = _mm256_shuffle_epi8(data, filt[0]); |
1074 | 14.5M | s[1] = _mm256_shuffle_epi8(data, filt[1]); |
1075 | 14.5M | s[2] = _mm256_shuffle_epi8(data, filt[2]); |
1076 | | |
1077 | 14.5M | return convolve_lowbd_6tap(s, coeffs); |
1078 | 14.5M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_lowbd_x_6tap Unexecuted instantiation: highbd_convolve_avx2.c:convolve_lowbd_x_6tap convolve_2d_avx2.c:convolve_lowbd_x_6tap Line | Count | Source | 1070 | 10.9M | const __m256i *const filt) { | 1071 | 10.9M | __m256i s[4]; | 1072 | | | 1073 | 10.9M | s[0] = _mm256_shuffle_epi8(data, filt[0]); | 1074 | 10.9M | s[1] = _mm256_shuffle_epi8(data, filt[1]); | 1075 | 10.9M | s[2] = _mm256_shuffle_epi8(data, filt[2]); | 1076 | | | 1077 | 10.9M | return convolve_lowbd_6tap(s, coeffs); | 1078 | 10.9M | } |
convolve_avx2.c:convolve_lowbd_x_6tap Line | Count | Source | 1070 | 3.60M | const __m256i *const filt) { | 1071 | 3.60M | __m256i s[4]; | 1072 | | | 1073 | 3.60M | s[0] = _mm256_shuffle_epi8(data, filt[0]); | 1074 | 3.60M | s[1] = _mm256_shuffle_epi8(data, filt[1]); | 1075 | 3.60M | s[2] = _mm256_shuffle_epi8(data, filt[2]); | 1076 | | | 1077 | 3.60M | return convolve_lowbd_6tap(s, coeffs); | 1078 | 3.60M | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_lowbd_x_6tap Unexecuted instantiation: wiener_convolve_avx2.c:convolve_lowbd_x_6tap Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_lowbd_x_6tap Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_lowbd_x_6tap |
1079 | | |
1080 | | static inline __m256i convolve_lowbd_x_4tap(const __m256i data, |
1081 | | const __m256i *const coeffs, |
1082 | 2.80M | const __m256i *const filt) { |
1083 | 2.80M | __m256i s[2]; |
1084 | | |
1085 | 2.80M | s[0] = _mm256_shuffle_epi8(data, filt[0]); |
1086 | 2.80M | s[1] = _mm256_shuffle_epi8(data, filt[1]); |
1087 | | |
1088 | 2.80M | return convolve_lowbd_4tap(s, coeffs); |
1089 | 2.80M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_lowbd_x_4tap Unexecuted instantiation: highbd_convolve_avx2.c:convolve_lowbd_x_4tap convolve_2d_avx2.c:convolve_lowbd_x_4tap Line | Count | Source | 1082 | 861k | const __m256i *const filt) { | 1083 | 861k | __m256i s[2]; | 1084 | | | 1085 | 861k | s[0] = _mm256_shuffle_epi8(data, filt[0]); | 1086 | 861k | s[1] = _mm256_shuffle_epi8(data, filt[1]); | 1087 | | | 1088 | 861k | return convolve_lowbd_4tap(s, coeffs); | 1089 | 861k | } |
convolve_avx2.c:convolve_lowbd_x_4tap Line | Count | Source | 1082 | 425k | const __m256i *const filt) { | 1083 | 425k | __m256i s[2]; | 1084 | | | 1085 | 425k | s[0] = _mm256_shuffle_epi8(data, filt[0]); | 1086 | 425k | s[1] = _mm256_shuffle_epi8(data, filt[1]); | 1087 | | | 1088 | 425k | return convolve_lowbd_4tap(s, coeffs); | 1089 | 425k | } |
jnt_convolve_avx2.c:convolve_lowbd_x_4tap Line | Count | Source | 1082 | 1.52M | const __m256i *const filt) { | 1083 | 1.52M | __m256i s[2]; | 1084 | | | 1085 | 1.52M | s[0] = _mm256_shuffle_epi8(data, filt[0]); | 1086 | 1.52M | s[1] = _mm256_shuffle_epi8(data, filt[1]); | 1087 | | | 1088 | 1.52M | return convolve_lowbd_4tap(s, coeffs); | 1089 | 1.52M | } |
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_lowbd_x_4tap Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_lowbd_x_4tap Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_lowbd_x_4tap |
1090 | | |
1091 | | static inline __m256i convolve_lowbd_x_2tap(const __m256i data, |
1092 | | const __m256i *const coeffs, |
1093 | 435k | const __m256i *const filt) { |
1094 | 435k | __m256i s; |
1095 | 435k | s = _mm256_shuffle_epi8(data, filt[0]); |
1096 | | |
1097 | 435k | return _mm256_maddubs_epi16(s, coeffs[0]); |
1098 | 435k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_lowbd_x_2tap Unexecuted instantiation: highbd_convolve_avx2.c:convolve_lowbd_x_2tap convolve_2d_avx2.c:convolve_lowbd_x_2tap Line | Count | Source | 1093 | 435k | const __m256i *const filt) { | 1094 | 435k | __m256i s; | 1095 | 435k | s = _mm256_shuffle_epi8(data, filt[0]); | 1096 | | | 1097 | 435k | return _mm256_maddubs_epi16(s, coeffs[0]); | 1098 | 435k | } |
Unexecuted instantiation: convolve_avx2.c:convolve_lowbd_x_2tap Unexecuted instantiation: jnt_convolve_avx2.c:convolve_lowbd_x_2tap Unexecuted instantiation: wiener_convolve_avx2.c:convolve_lowbd_x_2tap Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_lowbd_x_2tap Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_lowbd_x_2tap |
1099 | | |
1100 | | static inline void add_store_aligned_256(CONV_BUF_TYPE *const dst, |
1101 | | const __m256i *const res, |
1102 | 0 | const int do_average) { |
1103 | 0 | __m256i d; |
1104 | 0 | if (do_average) { |
1105 | 0 | d = _mm256_load_si256((__m256i *)dst); |
1106 | 0 | d = _mm256_add_epi32(d, *res); |
1107 | 0 | d = _mm256_srai_epi32(d, 1); |
1108 | 0 | } else { |
1109 | 0 | d = *res; |
1110 | 0 | } |
1111 | 0 | _mm256_store_si256((__m256i *)dst, d); |
1112 | 0 | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:add_store_aligned_256 Unexecuted instantiation: highbd_convolve_avx2.c:add_store_aligned_256 Unexecuted instantiation: convolve_2d_avx2.c:add_store_aligned_256 Unexecuted instantiation: convolve_avx2.c:add_store_aligned_256 Unexecuted instantiation: jnt_convolve_avx2.c:add_store_aligned_256 Unexecuted instantiation: wiener_convolve_avx2.c:add_store_aligned_256 Unexecuted instantiation: highbd_convolve_2d_avx2.c:add_store_aligned_256 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:add_store_aligned_256 |
1113 | | |
1114 | | static inline __m256i comp_avg(const __m256i *const data_ref_0, |
1115 | | const __m256i *const res_unsigned, |
1116 | | const __m256i *const wt, |
1117 | 163M | const int use_dist_wtd_comp_avg) { |
1118 | 163M | __m256i res; |
1119 | 163M | if (use_dist_wtd_comp_avg) { |
1120 | 1.48M | const __m256i data_lo = _mm256_unpacklo_epi16(*data_ref_0, *res_unsigned); |
1121 | 1.48M | const __m256i data_hi = _mm256_unpackhi_epi16(*data_ref_0, *res_unsigned); |
1122 | | |
1123 | 1.48M | const __m256i wt_res_lo = _mm256_madd_epi16(data_lo, *wt); |
1124 | 1.48M | const __m256i wt_res_hi = _mm256_madd_epi16(data_hi, *wt); |
1125 | | |
1126 | 1.48M | const __m256i res_lo = _mm256_srai_epi32(wt_res_lo, DIST_PRECISION_BITS); |
1127 | 1.48M | const __m256i res_hi = _mm256_srai_epi32(wt_res_hi, DIST_PRECISION_BITS); |
1128 | | |
1129 | 1.48M | res = _mm256_packs_epi32(res_lo, res_hi); |
1130 | 161M | } else { |
1131 | 161M | const __m256i wt_res = _mm256_add_epi16(*data_ref_0, *res_unsigned); |
1132 | 161M | res = _mm256_srai_epi16(wt_res, 1); |
1133 | 161M | } |
1134 | 163M | return res; |
1135 | 163M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:comp_avg Unexecuted instantiation: highbd_convolve_avx2.c:comp_avg Unexecuted instantiation: convolve_2d_avx2.c:comp_avg Unexecuted instantiation: convolve_avx2.c:comp_avg jnt_convolve_avx2.c:comp_avg Line | Count | Source | 1117 | 163M | const int use_dist_wtd_comp_avg) { | 1118 | 163M | __m256i res; | 1119 | 163M | if (use_dist_wtd_comp_avg) { | 1120 | 1.48M | const __m256i data_lo = _mm256_unpacklo_epi16(*data_ref_0, *res_unsigned); | 1121 | 1.48M | const __m256i data_hi = _mm256_unpackhi_epi16(*data_ref_0, *res_unsigned); | 1122 | | | 1123 | 1.48M | const __m256i wt_res_lo = _mm256_madd_epi16(data_lo, *wt); | 1124 | 1.48M | const __m256i wt_res_hi = _mm256_madd_epi16(data_hi, *wt); | 1125 | | | 1126 | 1.48M | const __m256i res_lo = _mm256_srai_epi32(wt_res_lo, DIST_PRECISION_BITS); | 1127 | 1.48M | const __m256i res_hi = _mm256_srai_epi32(wt_res_hi, DIST_PRECISION_BITS); | 1128 | | | 1129 | 1.48M | res = _mm256_packs_epi32(res_lo, res_hi); | 1130 | 161M | } else { | 1131 | 161M | const __m256i wt_res = _mm256_add_epi16(*data_ref_0, *res_unsigned); | 1132 | 161M | res = _mm256_srai_epi16(wt_res, 1); | 1133 | 161M | } | 1134 | 163M | return res; | 1135 | 163M | } |
Unexecuted instantiation: wiener_convolve_avx2.c:comp_avg Unexecuted instantiation: highbd_convolve_2d_avx2.c:comp_avg Unexecuted instantiation: highbd_jnt_convolve_avx2.c:comp_avg |
1136 | | |
1137 | | static inline __m256i convolve_rounding(const __m256i *const res_unsigned, |
1138 | | const __m256i *const offset_const, |
1139 | | const __m256i *const round_const, |
1140 | 162M | const int round_shift) { |
1141 | 162M | const __m256i res_signed = _mm256_sub_epi16(*res_unsigned, *offset_const); |
1142 | 162M | const __m256i res_round = _mm256_srai_epi16( |
1143 | 162M | _mm256_add_epi16(res_signed, *round_const), round_shift); |
1144 | 162M | return res_round; |
1145 | 162M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_rounding Unexecuted instantiation: highbd_convolve_avx2.c:convolve_rounding Unexecuted instantiation: convolve_2d_avx2.c:convolve_rounding Unexecuted instantiation: convolve_avx2.c:convolve_rounding jnt_convolve_avx2.c:convolve_rounding Line | Count | Source | 1140 | 162M | const int round_shift) { | 1141 | 162M | const __m256i res_signed = _mm256_sub_epi16(*res_unsigned, *offset_const); | 1142 | 162M | const __m256i res_round = _mm256_srai_epi16( | 1143 | 162M | _mm256_add_epi16(res_signed, *round_const), round_shift); | 1144 | 162M | return res_round; | 1145 | 162M | } |
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_rounding Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_rounding Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_rounding |
1146 | | |
1147 | | static inline __m256i highbd_comp_avg(const __m256i *const data_ref_0, |
1148 | | const __m256i *const res_unsigned, |
1149 | | const __m256i *const wt0, |
1150 | | const __m256i *const wt1, |
1151 | 12.3M | const int use_dist_wtd_comp_avg) { |
1152 | 12.3M | __m256i res; |
1153 | 12.3M | if (use_dist_wtd_comp_avg) { |
1154 | 2.13M | const __m256i wt0_res = _mm256_mullo_epi32(*data_ref_0, *wt0); |
1155 | 2.13M | const __m256i wt1_res = _mm256_mullo_epi32(*res_unsigned, *wt1); |
1156 | 2.13M | const __m256i wt_res = _mm256_add_epi32(wt0_res, wt1_res); |
1157 | 2.13M | res = _mm256_srai_epi32(wt_res, DIST_PRECISION_BITS); |
1158 | 10.1M | } else { |
1159 | 10.1M | const __m256i wt_res = _mm256_add_epi32(*data_ref_0, *res_unsigned); |
1160 | 10.1M | res = _mm256_srai_epi32(wt_res, 1); |
1161 | 10.1M | } |
1162 | 12.3M | return res; |
1163 | 12.3M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:highbd_comp_avg Unexecuted instantiation: highbd_convolve_avx2.c:highbd_comp_avg Unexecuted instantiation: convolve_2d_avx2.c:highbd_comp_avg Unexecuted instantiation: convolve_avx2.c:highbd_comp_avg Unexecuted instantiation: jnt_convolve_avx2.c:highbd_comp_avg Unexecuted instantiation: wiener_convolve_avx2.c:highbd_comp_avg Unexecuted instantiation: highbd_convolve_2d_avx2.c:highbd_comp_avg highbd_jnt_convolve_avx2.c:highbd_comp_avg Line | Count | Source | 1151 | 12.3M | const int use_dist_wtd_comp_avg) { | 1152 | 12.3M | __m256i res; | 1153 | 12.3M | if (use_dist_wtd_comp_avg) { | 1154 | 2.13M | const __m256i wt0_res = _mm256_mullo_epi32(*data_ref_0, *wt0); | 1155 | 2.13M | const __m256i wt1_res = _mm256_mullo_epi32(*res_unsigned, *wt1); | 1156 | 2.13M | const __m256i wt_res = _mm256_add_epi32(wt0_res, wt1_res); | 1157 | 2.13M | res = _mm256_srai_epi32(wt_res, DIST_PRECISION_BITS); | 1158 | 10.1M | } else { | 1159 | 10.1M | const __m256i wt_res = _mm256_add_epi32(*data_ref_0, *res_unsigned); | 1160 | 10.1M | res = _mm256_srai_epi32(wt_res, 1); | 1161 | 10.1M | } | 1162 | 12.3M | return res; | 1163 | 12.3M | } |
|
1164 | | |
1165 | | static inline __m256i highbd_convolve_rounding( |
1166 | | const __m256i *const res_unsigned, const __m256i *const offset_const, |
1167 | 12.3M | const __m256i *const round_const, const int round_shift) { |
1168 | 12.3M | const __m256i res_signed = _mm256_sub_epi32(*res_unsigned, *offset_const); |
1169 | 12.3M | const __m256i res_round = _mm256_srai_epi32( |
1170 | 12.3M | _mm256_add_epi32(res_signed, *round_const), round_shift); |
1171 | | |
1172 | 12.3M | return res_round; |
1173 | 12.3M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:highbd_convolve_rounding Unexecuted instantiation: highbd_convolve_avx2.c:highbd_convolve_rounding Unexecuted instantiation: convolve_2d_avx2.c:highbd_convolve_rounding Unexecuted instantiation: convolve_avx2.c:highbd_convolve_rounding Unexecuted instantiation: jnt_convolve_avx2.c:highbd_convolve_rounding Unexecuted instantiation: wiener_convolve_avx2.c:highbd_convolve_rounding Unexecuted instantiation: highbd_convolve_2d_avx2.c:highbd_convolve_rounding highbd_jnt_convolve_avx2.c:highbd_convolve_rounding Line | Count | Source | 1167 | 12.3M | const __m256i *const round_const, const int round_shift) { | 1168 | 12.3M | const __m256i res_signed = _mm256_sub_epi32(*res_unsigned, *offset_const); | 1169 | 12.3M | const __m256i res_round = _mm256_srai_epi32( | 1170 | 12.3M | _mm256_add_epi32(res_signed, *round_const), round_shift); | 1171 | | | 1172 | 12.3M | return res_round; | 1173 | 12.3M | } |
|
1174 | | |
1175 | 4.68M | static inline __m256i round_sr_x_avx2(const __m256i data) { |
1176 | | // we can perform the below steps: |
1177 | | // data = (data + 2) >> 2 |
1178 | | // data = (data + 8) >> 4, |
1179 | | // in the below form as well |
1180 | | // data = (data + 0x22) >> 6 |
1181 | 4.68M | const __m256i value = _mm256_set1_epi16(34); |
1182 | 4.68M | const __m256i reg = _mm256_add_epi16(data, value); |
1183 | 4.68M | return _mm256_srai_epi16(reg, 6); |
1184 | 4.68M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:round_sr_x_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:round_sr_x_avx2 Unexecuted instantiation: convolve_2d_avx2.c:round_sr_x_avx2 convolve_avx2.c:round_sr_x_avx2 Line | Count | Source | 1175 | 4.68M | static inline __m256i round_sr_x_avx2(const __m256i data) { | 1176 | | // we can perform the below steps: | 1177 | | // data = (data + 2) >> 2 | 1178 | | // data = (data + 8) >> 4, | 1179 | | // in the below form as well | 1180 | | // data = (data + 0x22) >> 6 | 1181 | 4.68M | const __m256i value = _mm256_set1_epi16(34); | 1182 | 4.68M | const __m256i reg = _mm256_add_epi16(data, value); | 1183 | 4.68M | return _mm256_srai_epi16(reg, 6); | 1184 | 4.68M | } |
Unexecuted instantiation: jnt_convolve_avx2.c:round_sr_x_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:round_sr_x_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:round_sr_x_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:round_sr_x_avx2 |
1185 | | |
1186 | | static inline __m128i convolve_x_4tap_4x2_ssse3(const uint8_t *const src, |
1187 | | const ptrdiff_t src_stride, |
1188 | 383k | __m128i *const coeffs) { |
1189 | 383k | __m128i data[2]; |
1190 | 383k | const __m128i f_l0 = _mm_load_si128((__m128i const *)filt1_global_sse2); |
1191 | 383k | const __m128i f_l1 = _mm_load_si128((__m128i const *)filt2_global_sse2); |
1192 | 383k | const __m128i src_1 = |
1193 | 383k | load_8bit_8x2_to_1_reg_sse2(src, (int)(sizeof(*src) * src_stride)); |
1194 | | |
1195 | 383k | data[0] = _mm_shuffle_epi8(src_1, f_l0); |
1196 | 383k | data[1] = _mm_shuffle_epi8(src_1, f_l1); |
1197 | 383k | return convolve_lowbd_4tap_ssse3(data, coeffs); |
1198 | 383k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_x_4tap_4x2_ssse3 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_x_4tap_4x2_ssse3 Unexecuted instantiation: convolve_2d_avx2.c:convolve_x_4tap_4x2_ssse3 convolve_avx2.c:convolve_x_4tap_4x2_ssse3 Line | Count | Source | 1188 | 383k | __m128i *const coeffs) { | 1189 | 383k | __m128i data[2]; | 1190 | 383k | const __m128i f_l0 = _mm_load_si128((__m128i const *)filt1_global_sse2); | 1191 | 383k | const __m128i f_l1 = _mm_load_si128((__m128i const *)filt2_global_sse2); | 1192 | 383k | const __m128i src_1 = | 1193 | 383k | load_8bit_8x2_to_1_reg_sse2(src, (int)(sizeof(*src) * src_stride)); | 1194 | | | 1195 | 383k | data[0] = _mm_shuffle_epi8(src_1, f_l0); | 1196 | 383k | data[1] = _mm_shuffle_epi8(src_1, f_l1); | 1197 | 383k | return convolve_lowbd_4tap_ssse3(data, coeffs); | 1198 | 383k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_x_4tap_4x2_ssse3 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_x_4tap_4x2_ssse3 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_x_4tap_4x2_ssse3 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_x_4tap_4x2_ssse3 |
1199 | | |
// 128-bit rounding helper: adds 34 to every 16-bit lane of `data` and
// arithmetically shifts the result right by six bits.
static inline __m128i round_sr_x_ssse3(const __m128i data) {
  const __m128i bias = _mm_set1_epi16(0x22);
  return _mm_srai_epi16(_mm_add_epi16(data, bias), 6);
}
1205 | | |
// Stores two rows of four bytes each from `reg`: bits 0..31 go to `dst` and
// bits 32..63 go to `dst + dst_stride`.
//
// Both rows are written through xx_storel_32(). The second row used to be
// written by dereferencing `dst + dst_stride` as a uint32_t *, which is
// undefined behaviour on a uint8_t buffer (type-punned access, and the
// address need not be 4-byte aligned). Shifting the register right by four
// bytes moves bits 32..63 into the low dword so the same helper can be used.
static inline void store_8bit_4x2_sse2(const __m128i reg, uint8_t *const dst,
                                       const ptrdiff_t dst_stride) {
  xx_storel_32(dst, reg);
  xx_storel_32(dst + dst_stride, _mm_srli_si128(reg, 4));
}
1212 | | |
// Narrows the 16-bit lanes of `reg` to bytes with unsigned saturation and
// writes the result as two 4-byte rows via store_8bit_4x2_sse2().
static inline void pack_store_u8_4x2_sse2(const __m128i reg, uint8_t *const dst,
                                          const ptrdiff_t dst_stride) {
  store_8bit_4x2_sse2(_mm_packus_epi16(reg, reg), dst, dst_stride);
}
1218 | | |
1219 | | static inline __m128i convolve_x_4tap_2x2_ssse3(const uint8_t *const src, |
1220 | | const ptrdiff_t src_stride, |
1221 | 64.2k | __m128i *const coeffs) { |
1222 | 64.2k | __m128i data[2]; |
1223 | 64.2k | const __m128i f_0 = _mm_load_si128((__m128i const *)filt3_global_sse2); |
1224 | 64.2k | const __m128i f_1 = _mm_load_si128((__m128i const *)filt4_global_sse2); |
1225 | 64.2k | const __m128i reg = |
1226 | 64.2k | load_8bit_8x2_to_1_reg_sse2(src, (int)(sizeof(*src) * src_stride)); |
1227 | | |
1228 | 64.2k | data[0] = _mm_shuffle_epi8(reg, f_0); |
1229 | 64.2k | data[1] = _mm_shuffle_epi8(reg, f_1); |
1230 | 64.2k | return convolve_lowbd_4tap_ssse3(data, coeffs); |
1231 | 64.2k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_x_4tap_2x2_ssse3 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_x_4tap_2x2_ssse3 Unexecuted instantiation: convolve_2d_avx2.c:convolve_x_4tap_2x2_ssse3 convolve_avx2.c:convolve_x_4tap_2x2_ssse3 Line | Count | Source | 1221 | 64.2k | __m128i *const coeffs) { | 1222 | 64.2k | __m128i data[2]; | 1223 | 64.2k | const __m128i f_0 = _mm_load_si128((__m128i const *)filt3_global_sse2); | 1224 | 64.2k | const __m128i f_1 = _mm_load_si128((__m128i const *)filt4_global_sse2); | 1225 | 64.2k | const __m128i reg = | 1226 | 64.2k | load_8bit_8x2_to_1_reg_sse2(src, (int)(sizeof(*src) * src_stride)); | 1227 | | | 1228 | 64.2k | data[0] = _mm_shuffle_epi8(reg, f_0); | 1229 | 64.2k | data[1] = _mm_shuffle_epi8(reg, f_1); | 1230 | 64.2k | return convolve_lowbd_4tap_ssse3(data, coeffs); | 1231 | 64.2k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_x_4tap_2x2_ssse3 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_x_4tap_2x2_ssse3 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_x_4tap_2x2_ssse3 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_x_4tap_2x2_ssse3 |
1232 | | |
// Narrows the 16-bit lanes of `reg` to bytes with unsigned saturation and
// stores a 2x2 block: packed bytes 0 and 1 go to dst[0..1], packed bytes 2
// and 3 go to dst[dst_stride .. dst_stride + 1].
//
// The bytes are written individually. The earlier form stored through
// `(int16_t *)dst`, which is undefined behaviour on a uint8_t buffer
// (type-punned access, and a 2-pixel-wide destination need not be 2-byte
// aligned). The byte order matches what the 16-bit little-endian stores
// produced.
static inline void pack_store_u8_2x2_sse2(const __m128i reg, uint8_t *const dst,
                                          const ptrdiff_t dst_stride) {
  const __m128i packed = _mm_packus_epi16(reg, reg);
  // Low dword holds the four packed pixels: row 0 in bits 0..15, row 1 in
  // bits 16..31.
  const uint32_t pixels = (uint32_t)_mm_cvtsi128_si32(packed);

  dst[0] = (uint8_t)pixels;
  dst[1] = (uint8_t)(pixels >> 8);
  dst[dst_stride] = (uint8_t)(pixels >> 16);
  dst[dst_stride + 1] = (uint8_t)(pixels >> 24);
}
1239 | | |
1240 | | static inline __m128i convolve_x_2tap_ssse3(const __m128i *data, |
1241 | 53.5k | const __m128i *coeff) { |
1242 | 53.5k | return _mm_maddubs_epi16(data[0], coeff[0]); |
1243 | 53.5k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_x_2tap_ssse3 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_x_2tap_ssse3 Unexecuted instantiation: convolve_2d_avx2.c:convolve_x_2tap_ssse3 convolve_avx2.c:convolve_x_2tap_ssse3 Line | Count | Source | 1241 | 53.5k | const __m128i *coeff) { | 1242 | 53.5k | return _mm_maddubs_epi16(data[0], coeff[0]); | 1243 | 53.5k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_x_2tap_ssse3 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_x_2tap_ssse3 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_x_2tap_ssse3 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_x_2tap_ssse3 |
1244 | | |
1245 | | static inline __m128i load8_x_4x2_sse4(const void *const src, |
1246 | 10.8k | const ptrdiff_t offset) { |
1247 | 10.8k | const __m128i s = _mm_cvtsi32_si128(loadu_int32(src)); |
1248 | 10.8k | return _mm_insert_epi32(s, loadu_int32((uint8_t *)src + offset), 1); |
1249 | 10.8k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:load8_x_4x2_sse4 Unexecuted instantiation: highbd_convolve_avx2.c:load8_x_4x2_sse4 Unexecuted instantiation: convolve_2d_avx2.c:load8_x_4x2_sse4 convolve_avx2.c:load8_x_4x2_sse4 Line | Count | Source | 1246 | 10.8k | const ptrdiff_t offset) { | 1247 | 10.8k | const __m128i s = _mm_cvtsi32_si128(loadu_int32(src)); | 1248 | | return _mm_insert_epi32(s, loadu_int32((uint8_t *)src + offset), 1); | 1249 | 10.8k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:load8_x_4x2_sse4 Unexecuted instantiation: wiener_convolve_avx2.c:load8_x_4x2_sse4 Unexecuted instantiation: highbd_convolve_2d_avx2.c:load8_x_4x2_sse4 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:load8_x_4x2_sse4 |
1250 | | |
1251 | | static inline __m128i load_x_u8_4x2_sse4(const uint8_t *const src, |
1252 | 10.8k | const ptrdiff_t stride) { |
1253 | 10.8k | return load8_x_4x2_sse4(src, sizeof(*src) * stride); |
1254 | 10.8k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:load_x_u8_4x2_sse4 Unexecuted instantiation: highbd_convolve_avx2.c:load_x_u8_4x2_sse4 Unexecuted instantiation: convolve_2d_avx2.c:load_x_u8_4x2_sse4 convolve_avx2.c:load_x_u8_4x2_sse4 Line | Count | Source | 1252 | 10.8k | const ptrdiff_t stride) { | 1253 | 10.8k | return load8_x_4x2_sse4(src, sizeof(*src) * stride); | 1254 | 10.8k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:load_x_u8_4x2_sse4 Unexecuted instantiation: wiener_convolve_avx2.c:load_x_u8_4x2_sse4 Unexecuted instantiation: highbd_convolve_2d_avx2.c:load_x_u8_4x2_sse4 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:load_x_u8_4x2_sse4 |
1255 | | |
1256 | | static inline __m128i convolve_x_2tap_2x2_ssse3(const uint8_t *const src, |
1257 | | const ptrdiff_t stride, |
1258 | 3.11k | const __m128i *coeffs) { |
1259 | 3.11k | const __m128i flt = _mm_load_si128((__m128i const *)filt5_global_sse2); |
1260 | 3.11k | const __m128i reg = load_x_u8_4x2_sse4(src, stride); |
1261 | 3.11k | const __m128i data = _mm_shuffle_epi8(reg, flt); |
1262 | 3.11k | return convolve_x_2tap_ssse3(&data, coeffs); |
1263 | 3.11k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_x_2tap_2x2_ssse3 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_x_2tap_2x2_ssse3 Unexecuted instantiation: convolve_2d_avx2.c:convolve_x_2tap_2x2_ssse3 convolve_avx2.c:convolve_x_2tap_2x2_ssse3 Line | Count | Source | 1258 | 3.11k | const __m128i *coeffs) { | 1259 | 3.11k | const __m128i flt = _mm_load_si128((__m128i const *)filt5_global_sse2); | 1260 | 3.11k | const __m128i reg = load_x_u8_4x2_sse4(src, stride); | 1261 | 3.11k | const __m128i data = _mm_shuffle_epi8(reg, flt); | 1262 | 3.11k | return convolve_x_2tap_ssse3(&data, coeffs); | 1263 | 3.11k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_x_2tap_2x2_ssse3 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_x_2tap_2x2_ssse3 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_x_2tap_2x2_ssse3 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_x_2tap_2x2_ssse3 |
1264 | | |
1265 | | static inline __m128i convolve_x_2tap_4x2_ssse3(const uint8_t *const src, |
1266 | | const ptrdiff_t stride, |
1267 | 15.1k | const __m128i *coeffs) { |
1268 | 15.1k | const __m128i flt = _mm_load_si128((__m128i const *)filt1_global_sse2); |
1269 | 15.1k | const __m128i data = |
1270 | 15.1k | load_8bit_8x2_to_1_reg_sse2(src, (int)(sizeof(*src) * stride)); |
1271 | 15.1k | const __m128i res = _mm_shuffle_epi8(data, flt); |
1272 | 15.1k | return convolve_x_2tap_ssse3(&res, coeffs); |
1273 | 15.1k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_x_2tap_4x2_ssse3 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_x_2tap_4x2_ssse3 Unexecuted instantiation: convolve_2d_avx2.c:convolve_x_2tap_4x2_ssse3 convolve_avx2.c:convolve_x_2tap_4x2_ssse3 Line | Count | Source | 1267 | 15.1k | const __m128i *coeffs) { | 1268 | 15.1k | const __m128i flt = _mm_load_si128((__m128i const *)filt1_global_sse2); | 1269 | 15.1k | const __m128i data = | 1270 | 15.1k | load_8bit_8x2_to_1_reg_sse2(src, (int)(sizeof(*src) * stride)); | 1271 | 15.1k | const __m128i res = _mm_shuffle_epi8(data, flt); | 1272 | 15.1k | return convolve_x_2tap_ssse3(&res, coeffs); | 1273 | 15.1k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_x_2tap_4x2_ssse3 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_x_2tap_4x2_ssse3 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_x_2tap_4x2_ssse3 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_x_2tap_4x2_ssse3 |
1274 | | |
1275 | | static inline void convolve_x_2tap_8x2_ssse3(const uint8_t *const src, |
1276 | | const ptrdiff_t stride, |
1277 | | const __m128i *coeffs, |
1278 | 17.6k | __m128i *data) { |
1279 | 17.6k | __m128i res[2]; |
1280 | 17.6k | const __m128i reg_00 = _mm_loadu_si128((__m128i *)src); |
1281 | 17.6k | const __m128i reg_10 = _mm_loadu_si128((__m128i *)(src + stride)); |
1282 | 17.6k | const __m128i reg_01 = _mm_srli_si128(reg_00, 1); |
1283 | 17.6k | const __m128i reg_11 = _mm_srli_si128(reg_10, 1); |
1284 | 17.6k | res[0] = _mm_unpacklo_epi8(reg_00, reg_01); |
1285 | 17.6k | res[1] = _mm_unpacklo_epi8(reg_10, reg_11); |
1286 | | |
1287 | 17.6k | data[0] = convolve_x_2tap_ssse3(&res[0], coeffs); |
1288 | 17.6k | data[1] = convolve_x_2tap_ssse3(&res[1], coeffs); |
1289 | 17.6k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_x_2tap_8x2_ssse3 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_x_2tap_8x2_ssse3 Unexecuted instantiation: convolve_2d_avx2.c:convolve_x_2tap_8x2_ssse3 convolve_avx2.c:convolve_x_2tap_8x2_ssse3 Line | Count | Source | 1278 | 17.6k | __m128i *data) { | 1279 | 17.6k | __m128i res[2]; | 1280 | 17.6k | const __m128i reg_00 = _mm_loadu_si128((__m128i *)src); | 1281 | 17.6k | const __m128i reg_10 = _mm_loadu_si128((__m128i *)(src + stride)); | 1282 | 17.6k | const __m128i reg_01 = _mm_srli_si128(reg_00, 1); | 1283 | 17.6k | const __m128i reg_11 = _mm_srli_si128(reg_10, 1); | 1284 | 17.6k | res[0] = _mm_unpacklo_epi8(reg_00, reg_01); | 1285 | 17.6k | res[1] = _mm_unpacklo_epi8(reg_10, reg_11); | 1286 | | | 1287 | 17.6k | data[0] = convolve_x_2tap_ssse3(&res[0], coeffs); | 1288 | 17.6k | data[1] = convolve_x_2tap_ssse3(&res[1], coeffs); | 1289 | 17.6k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_x_2tap_8x2_ssse3 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_x_2tap_8x2_ssse3 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_x_2tap_8x2_ssse3 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_x_2tap_8x2_ssse3 |
1290 | | |
1291 | | static inline __m256i loadu_x_8bit_16x2_avx2(const void *const src, |
1292 | 776k | const ptrdiff_t offset) { |
1293 | 776k | const __m128i reg0 = _mm_loadu_si128((__m128i *)src); |
1294 | 776k | const __m128i reg1 = _mm_loadu_si128((__m128i *)((uint8_t *)src + offset)); |
1295 | 776k | return _mm256_setr_m128i(reg0, reg1); |
1296 | 776k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:loadu_x_8bit_16x2_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:loadu_x_8bit_16x2_avx2 Unexecuted instantiation: convolve_2d_avx2.c:loadu_x_8bit_16x2_avx2 convolve_avx2.c:loadu_x_8bit_16x2_avx2 Line | Count | Source | 1292 | 776k | const ptrdiff_t offset) { | 1293 | 776k | const __m128i reg0 = _mm_loadu_si128((__m128i *)src); | 1294 | 776k | const __m128i reg1 = _mm_loadu_si128((__m128i *)((uint8_t *)src + offset)); | 1295 | 776k | return _mm256_setr_m128i(reg0, reg1); | 1296 | 776k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:loadu_x_8bit_16x2_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:loadu_x_8bit_16x2_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:loadu_x_8bit_16x2_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:loadu_x_8bit_16x2_avx2 |
1297 | | |
1298 | | static inline __m256i convolve_x_2tap_avx2(const __m256i *data, |
1299 | 255k | const __m256i *coeffs) { |
1300 | 255k | return _mm256_maddubs_epi16(data[0], coeffs[0]); |
1301 | 255k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_x_2tap_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_x_2tap_avx2 Unexecuted instantiation: convolve_2d_avx2.c:convolve_x_2tap_avx2 convolve_avx2.c:convolve_x_2tap_avx2 Line | Count | Source | 1299 | 255k | const __m256i *coeffs) { | 1300 | 255k | return _mm256_maddubs_epi16(data[0], coeffs[0]); | 1301 | 255k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_x_2tap_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_x_2tap_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_x_2tap_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_x_2tap_avx2 |
1302 | | |
1303 | | static inline void convolve_x_2tap_16x2_avx2(const uint8_t *const src, |
1304 | | const ptrdiff_t stride, |
1305 | | const __m256i *coeffs, |
1306 | 13.6k | __m256i *data) { |
1307 | 13.6k | const __m256i reg0 = loadu_x_8bit_16x2_avx2(src, stride); |
1308 | 13.6k | const __m256i reg1 = loadu_x_8bit_16x2_avx2(src + 1, stride); |
1309 | 13.6k | const __m256i res0 = _mm256_unpacklo_epi8(reg0, reg1); |
1310 | 13.6k | const __m256i res1 = _mm256_unpackhi_epi8(reg0, reg1); |
1311 | 13.6k | data[0] = convolve_x_2tap_avx2(&res0, coeffs); |
1312 | 13.6k | data[1] = convolve_x_2tap_avx2(&res1, coeffs); |
1313 | 13.6k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_x_2tap_16x2_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_x_2tap_16x2_avx2 Unexecuted instantiation: convolve_2d_avx2.c:convolve_x_2tap_16x2_avx2 convolve_avx2.c:convolve_x_2tap_16x2_avx2 Line | Count | Source | 1306 | 13.6k | __m256i *data) { | 1307 | 13.6k | const __m256i reg0 = loadu_x_8bit_16x2_avx2(src, stride); | 1308 | 13.6k | const __m256i reg1 = loadu_x_8bit_16x2_avx2(src + 1, stride); | 1309 | 13.6k | const __m256i res0 = _mm256_unpacklo_epi8(reg0, reg1); | 1310 | 13.6k | const __m256i res1 = _mm256_unpackhi_epi8(reg0, reg1); | 1311 | 13.6k | data[0] = convolve_x_2tap_avx2(&res0, coeffs); | 1312 | 13.6k | data[1] = convolve_x_2tap_avx2(&res1, coeffs); | 1313 | 13.6k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_x_2tap_16x2_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_x_2tap_16x2_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_x_2tap_16x2_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_x_2tap_16x2_avx2 |
1314 | | |
1315 | | static inline void store_u8_16x2_avx2(const __m256i src, uint8_t *const dst, |
1316 | 2.01M | const ptrdiff_t stride) { |
1317 | 2.01M | const __m128i reg0 = _mm256_castsi256_si128(src); |
1318 | 2.01M | const __m128i reg1 = _mm256_extracti128_si256(src, 1); |
1319 | 2.01M | _mm_storeu_si128((__m128i *)dst, reg0); |
1320 | 2.01M | _mm_storeu_si128((__m128i *)((uint8_t *)dst + stride), reg1); |
1321 | 2.01M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:store_u8_16x2_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:store_u8_16x2_avx2 Unexecuted instantiation: convolve_2d_avx2.c:store_u8_16x2_avx2 convolve_avx2.c:store_u8_16x2_avx2 Line | Count | Source | 1316 | 2.01M | const ptrdiff_t stride) { | 1317 | 2.01M | const __m128i reg0 = _mm256_castsi256_si128(src); | 1318 | | const __m128i reg1 = _mm256_extracti128_si256(src, 1); | 1319 | 2.01M | _mm_storeu_si128((__m128i *)dst, reg0); | 1320 | 2.01M | _mm_storeu_si128((__m128i *)((uint8_t *)dst + stride), reg1); | 1321 | 2.01M | } |
Unexecuted instantiation: jnt_convolve_avx2.c:store_u8_16x2_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:store_u8_16x2_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:store_u8_16x2_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:store_u8_16x2_avx2 |
1322 | | |
1323 | | static inline void store_u8_8x2_avx2(const __m256i src, uint8_t *const dst, |
1324 | 583k | const ptrdiff_t stride) { |
1325 | 583k | const __m128i reg0 = _mm256_castsi256_si128(src); |
1326 | 583k | const __m128i reg1 = _mm256_extracti128_si256(src, 1); |
1327 | 583k | _mm_storel_epi64((__m128i *)dst, reg0); |
1328 | 583k | _mm_storel_epi64((__m128i *)(dst + stride), reg1); |
1329 | 583k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:store_u8_8x2_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:store_u8_8x2_avx2 Unexecuted instantiation: convolve_2d_avx2.c:store_u8_8x2_avx2 convolve_avx2.c:store_u8_8x2_avx2 Line | Count | Source | 1324 | 583k | const ptrdiff_t stride) { | 1325 | 583k | const __m128i reg0 = _mm256_castsi256_si128(src); | 1326 | | const __m128i reg1 = _mm256_extracti128_si256(src, 1); | 1327 | 583k | _mm_storel_epi64((__m128i *)dst, reg0); | 1328 | 583k | _mm_storel_epi64((__m128i *)(dst + stride), reg1); | 1329 | 583k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:store_u8_8x2_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:store_u8_8x2_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:store_u8_8x2_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:store_u8_8x2_avx2 |
1330 | | |
1331 | | static inline void pack_store_16x2_avx2(const __m256i data0, |
1332 | | const __m256i data1, uint8_t *const dst, |
1333 | 2.01M | const ptrdiff_t stride) { |
1334 | 2.01M | const __m256i res = _mm256_packus_epi16(data0, data1); |
1335 | 2.01M | store_u8_16x2_avx2(res, dst, stride); |
1336 | 2.01M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:pack_store_16x2_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:pack_store_16x2_avx2 Unexecuted instantiation: convolve_2d_avx2.c:pack_store_16x2_avx2 convolve_avx2.c:pack_store_16x2_avx2 Line | Count | Source | 1333 | 2.01M | const ptrdiff_t stride) { | 1334 | 2.01M | const __m256i res = _mm256_packus_epi16(data0, data1); | 1335 | 2.01M | store_u8_16x2_avx2(res, dst, stride); | 1336 | 2.01M | } |
Unexecuted instantiation: jnt_convolve_avx2.c:pack_store_16x2_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:pack_store_16x2_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:pack_store_16x2_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:pack_store_16x2_avx2 |
1337 | | |
1338 | | static inline void pack_store_8x2_avx2(const __m256i data, uint8_t *const dst, |
1339 | 583k | const ptrdiff_t stride) { |
1340 | 583k | const __m256i res = _mm256_packus_epi16(data, data); |
1341 | 583k | store_u8_8x2_avx2(res, dst, stride); |
1342 | 583k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:pack_store_8x2_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:pack_store_8x2_avx2 Unexecuted instantiation: convolve_2d_avx2.c:pack_store_8x2_avx2 convolve_avx2.c:pack_store_8x2_avx2 Line | Count | Source | 1339 | 583k | const ptrdiff_t stride) { | 1340 | 583k | const __m256i res = _mm256_packus_epi16(data, data); | 1341 | 583k | store_u8_8x2_avx2(res, dst, stride); | 1342 | 583k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:pack_store_8x2_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:pack_store_8x2_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:pack_store_8x2_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:pack_store_8x2_avx2 |
1343 | | |
1344 | | static inline void round_pack_store_16x2_avx2(const __m256i *data, |
1345 | | uint8_t *const dst, |
1346 | 388k | const ptrdiff_t dst_stride) { |
1347 | 388k | __m256i reg[2]; |
1348 | | |
1349 | 388k | reg[0] = round_sr_x_avx2(data[0]); |
1350 | 388k | reg[1] = round_sr_x_avx2(data[1]); |
1351 | 388k | pack_store_16x2_avx2(reg[0], reg[1], dst, dst_stride); |
1352 | 388k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:round_pack_store_16x2_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:round_pack_store_16x2_avx2 Unexecuted instantiation: convolve_2d_avx2.c:round_pack_store_16x2_avx2 convolve_avx2.c:round_pack_store_16x2_avx2 Line | Count | Source | 1346 | 388k | const ptrdiff_t dst_stride) { | 1347 | 388k | __m256i reg[2]; | 1348 | | | 1349 | 388k | reg[0] = round_sr_x_avx2(data[0]); | 1350 | 388k | reg[1] = round_sr_x_avx2(data[1]); | 1351 | 388k | pack_store_16x2_avx2(reg[0], reg[1], dst, dst_stride); | 1352 | 388k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:round_pack_store_16x2_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:round_pack_store_16x2_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:round_pack_store_16x2_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:round_pack_store_16x2_avx2 |
1353 | | |
1354 | | static inline void convolve_x_2tap_32_avx2(const uint8_t *const src, |
1355 | | const __m256i *coeffs, |
1356 | 114k | __m256i *data) { |
1357 | 114k | const __m256i res0 = _mm256_loadu_si256((__m256i *)src); |
1358 | 114k | const __m256i res1 = _mm256_loadu_si256((__m256i *)(src + 1)); |
1359 | 114k | const __m256i reg0 = _mm256_unpacklo_epi8(res0, res1); |
1360 | 114k | const __m256i reg1 = _mm256_unpackhi_epi8(res0, res1); |
1361 | | |
1362 | 114k | data[0] = convolve_x_2tap_avx2(&reg0, coeffs); |
1363 | 114k | data[1] = convolve_x_2tap_avx2(&reg1, coeffs); |
1364 | 114k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_x_2tap_32_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_x_2tap_32_avx2 Unexecuted instantiation: convolve_2d_avx2.c:convolve_x_2tap_32_avx2 convolve_avx2.c:convolve_x_2tap_32_avx2 Line | Count | Source | 1356 | 114k | __m256i *data) { | 1357 | 114k | const __m256i res0 = _mm256_loadu_si256((__m256i *)src); | 1358 | 114k | const __m256i res1 = _mm256_loadu_si256((__m256i *)(src + 1)); | 1359 | 114k | const __m256i reg0 = _mm256_unpacklo_epi8(res0, res1); | 1360 | 114k | const __m256i reg1 = _mm256_unpackhi_epi8(res0, res1); | 1361 | | | 1362 | 114k | data[0] = convolve_x_2tap_avx2(&reg0, coeffs); | 1363 | 114k | data[1] = convolve_x_2tap_avx2(&reg1, coeffs); | 1364 | 114k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_x_2tap_32_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_x_2tap_32_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_x_2tap_32_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_x_2tap_32_avx2 |
1365 | | |
1366 | | static inline void pack_store_32_avx2(const __m256i data0, const __m256i data1, |
1367 | 1.78M | uint8_t *const dst) { |
1368 | 1.78M | const __m256i reg = _mm256_packus_epi16(data0, data1); |
1369 | 1.78M | _mm256_storeu_si256((__m256i *)dst, reg); |
1370 | 1.78M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:pack_store_32_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:pack_store_32_avx2 Unexecuted instantiation: convolve_2d_avx2.c:pack_store_32_avx2 convolve_avx2.c:pack_store_32_avx2 Line | Count | Source | 1367 | 1.78M | uint8_t *const dst) { | 1368 | 1.78M | const __m256i reg = _mm256_packus_epi16(data0, data1); | 1369 | 1.78M | _mm256_storeu_si256((__m256i *)dst, reg); | 1370 | 1.78M | } |
Unexecuted instantiation: jnt_convolve_avx2.c:pack_store_32_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:pack_store_32_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:pack_store_32_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:pack_store_32_avx2 |
1371 | | |
1372 | | static inline void round_pack_store_32_avx2(const __m256i *data, |
1373 | 1.52M | uint8_t *const dst) { |
1374 | 1.52M | __m256i reg[2]; |
1375 | | |
1376 | 1.52M | reg[0] = round_sr_x_avx2(data[0]); |
1377 | 1.52M | reg[1] = round_sr_x_avx2(data[1]); |
1378 | 1.52M | pack_store_32_avx2(reg[0], reg[1], dst); |
1379 | 1.52M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:round_pack_store_32_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:round_pack_store_32_avx2 Unexecuted instantiation: convolve_2d_avx2.c:round_pack_store_32_avx2 convolve_avx2.c:round_pack_store_32_avx2 Line | Count | Source | 1373 | 1.52M | uint8_t *const dst) { | 1374 | 1.52M | __m256i reg[2]; | 1375 | | | 1376 | 1.52M | reg[0] = round_sr_x_avx2(data[0]); | 1377 | 1.52M | reg[1] = round_sr_x_avx2(data[1]); | 1378 | 1.52M | pack_store_32_avx2(reg[0], reg[1], dst); | 1379 | 1.52M | } |
Unexecuted instantiation: jnt_convolve_avx2.c:round_pack_store_32_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:round_pack_store_32_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:round_pack_store_32_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:round_pack_store_32_avx2 |
1380 | | |
1381 | | static inline void convolve_round_2tap_32_avx2(const uint8_t *const src, |
1382 | | const __m256i *coeffs, |
1383 | 114k | uint8_t *const dst) { |
1384 | 114k | __m256i data[2]; |
1385 | | |
1386 | 114k | convolve_x_2tap_32_avx2(src, coeffs, data); |
1387 | 114k | round_pack_store_32_avx2(data, dst); |
1388 | 114k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_round_2tap_32_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_round_2tap_32_avx2 Unexecuted instantiation: convolve_2d_avx2.c:convolve_round_2tap_32_avx2 convolve_avx2.c:convolve_round_2tap_32_avx2 Line | Count | Source | 1383 | 114k | uint8_t *const dst) { | 1384 | 114k | __m256i data[2]; | 1385 | | | 1386 | 114k | convolve_x_2tap_32_avx2(src, coeffs, data); | 1387 | 114k | round_pack_store_32_avx2(data, dst); | 1388 | 114k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_round_2tap_32_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_round_2tap_32_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_round_2tap_32_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_round_2tap_32_avx2 |
1389 | | |
1390 | | static inline void load_avg_store_2tap_32_avx2(const uint8_t *const src, |
1391 | 100k | uint8_t *const dst) { |
1392 | 100k | const __m256i res0 = _mm256_loadu_si256((__m256i *)src); |
1393 | 100k | const __m256i res1 = _mm256_loadu_si256((__m256i *)(src + 1)); |
1394 | 100k | const __m256i data = _mm256_avg_epu8(res0, res1); |
1395 | 100k | _mm256_storeu_si256((__m256i *)dst, data); |
1396 | 100k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:load_avg_store_2tap_32_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:load_avg_store_2tap_32_avx2 Unexecuted instantiation: convolve_2d_avx2.c:load_avg_store_2tap_32_avx2 convolve_avx2.c:load_avg_store_2tap_32_avx2 Line | Count | Source | 1391 | 100k | uint8_t *const dst) { | 1392 | 100k | const __m256i res0 = _mm256_loadu_si256((__m256i *)src); | 1393 | 100k | const __m256i res1 = _mm256_loadu_si256((__m256i *)(src + 1)); | 1394 | 100k | const __m256i data = _mm256_avg_epu8(res0, res1); | 1395 | 100k | _mm256_storeu_si256((__m256i *)dst, data); | 1396 | 100k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:load_avg_store_2tap_32_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:load_avg_store_2tap_32_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:load_avg_store_2tap_32_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:load_avg_store_2tap_32_avx2 |
1397 | | |
1398 | | static inline __m256i load_convolve_8tap_8x2_avx2(const uint8_t *const src, |
1399 | | const ptrdiff_t stride, |
1400 | | const __m256i *coeffs, |
1401 | 57.3k | const __m256i *flt) { |
1402 | 57.3k | const __m256i res = loadu_x_8bit_16x2_avx2(src, stride); |
1403 | 57.3k | return convolve_lowbd_x(res, coeffs, flt); |
1404 | 57.3k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:load_convolve_8tap_8x2_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:load_convolve_8tap_8x2_avx2 Unexecuted instantiation: convolve_2d_avx2.c:load_convolve_8tap_8x2_avx2 convolve_avx2.c:load_convolve_8tap_8x2_avx2 Line | Count | Source | 1401 | 57.3k | const __m256i *flt) { | 1402 | 57.3k | const __m256i res = loadu_x_8bit_16x2_avx2(src, stride); | 1403 | 57.3k | return convolve_lowbd_x(res, coeffs, flt); | 1404 | 57.3k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:load_convolve_8tap_8x2_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:load_convolve_8tap_8x2_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:load_convolve_8tap_8x2_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:load_convolve_8tap_8x2_avx2 |
1405 | | |
1406 | | static inline void load_convolve_8tap_16x2_avx2(const uint8_t *const src, |
1407 | | const int32_t src_stride, |
1408 | | const __m256i *coeffs, |
1409 | | const __m256i *flt, |
1410 | 28.6k | __m256i *reg) { |
1411 | 28.6k | reg[0] = load_convolve_8tap_8x2_avx2(src + 0, src_stride, coeffs, flt); |
1412 | 28.6k | reg[1] = load_convolve_8tap_8x2_avx2(src + 8, src_stride, coeffs, flt); |
1413 | 28.6k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:load_convolve_8tap_16x2_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:load_convolve_8tap_16x2_avx2 Unexecuted instantiation: convolve_2d_avx2.c:load_convolve_8tap_16x2_avx2 convolve_avx2.c:load_convolve_8tap_16x2_avx2 Line | Count | Source | 1410 | 28.6k | __m256i *reg) { | 1411 | 28.6k | reg[0] = load_convolve_8tap_8x2_avx2(src + 0, src_stride, coeffs, flt); | 1412 | 28.6k | reg[1] = load_convolve_8tap_8x2_avx2(src + 8, src_stride, coeffs, flt); | 1413 | 28.6k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:load_convolve_8tap_16x2_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:load_convolve_8tap_16x2_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:load_convolve_8tap_16x2_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:load_convolve_8tap_16x2_avx2 |
1414 | | |
1415 | | static inline void load_convolve_8tap_32_avx2(const uint8_t *const src, |
1416 | | const __m256i *coeffs, |
1417 | | const __m256i *filt, |
1418 | 155k | __m256i *data) { |
1419 | 155k | const __m256i reg_0 = _mm256_loadu_si256((__m256i *)src); |
1420 | 155k | const __m256i reg_8 = _mm256_loadu_si256((__m256i *)(src + 8)); |
1421 | | |
1422 | 155k | data[0] = convolve_lowbd_x(reg_0, coeffs, filt); |
1423 | 155k | data[1] = convolve_lowbd_x(reg_8, coeffs, filt); |
1424 | 155k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:load_convolve_8tap_32_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:load_convolve_8tap_32_avx2 Unexecuted instantiation: convolve_2d_avx2.c:load_convolve_8tap_32_avx2 convolve_avx2.c:load_convolve_8tap_32_avx2 Line | Count | Source | 1418 | 155k | __m256i *data) { | 1419 | 155k | const __m256i reg_0 = _mm256_loadu_si256((__m256i *)src); | 1420 | 155k | const __m256i reg_8 = _mm256_loadu_si256((__m256i *)(src + 8)); | 1421 | | | 1422 | 155k | data[0] = convolve_lowbd_x(reg_0, coeffs, filt); | 1423 | 155k | data[1] = convolve_lowbd_x(reg_8, coeffs, filt); | 1424 | 155k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:load_convolve_8tap_32_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:load_convolve_8tap_32_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:load_convolve_8tap_32_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:load_convolve_8tap_32_avx2 |
1425 | | |
1426 | | static inline void load_convolve_round_8tap_32_avx2(const uint8_t *const src, |
1427 | | const __m256i *coeffs, |
1428 | | const __m256i *filt, |
1429 | 155k | uint8_t *const dst) { |
1430 | 155k | __m256i data[2]; |
1431 | | |
1432 | 155k | load_convolve_8tap_32_avx2(src, coeffs, filt, data); |
1433 | 155k | round_pack_store_32_avx2(data, dst); |
1434 | 155k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:load_convolve_round_8tap_32_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:load_convolve_round_8tap_32_avx2 Unexecuted instantiation: convolve_2d_avx2.c:load_convolve_round_8tap_32_avx2 convolve_avx2.c:load_convolve_round_8tap_32_avx2 Line | Count | Source | 1429 | 155k | uint8_t *const dst) { | 1430 | 155k | __m256i data[2]; | 1431 | | | 1432 | 155k | load_convolve_8tap_32_avx2(src, coeffs, filt, data); | 1433 | 155k | round_pack_store_32_avx2(data, dst); | 1434 | 155k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:load_convolve_round_8tap_32_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:load_convolve_round_8tap_32_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:load_convolve_round_8tap_32_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:load_convolve_round_8tap_32_avx2 |
1435 | | |
1436 | | static inline void load_convolve_6tap_32_avx2(const uint8_t *const src, |
1437 | | const __m256i *coeffs, |
1438 | | const __m256i *filt, |
1439 | 1.25M | __m256i *data) { |
1440 | 1.25M | const __m256i reg0 = _mm256_loadu_si256((__m256i *)src); |
1441 | 1.25M | const __m256i reg1 = _mm256_loadu_si256((__m256i *)(src + 8)); |
1442 | | |
1443 | 1.25M | data[0] = convolve_lowbd_x_6tap(reg0, coeffs, filt); |
1444 | 1.25M | data[1] = convolve_lowbd_x_6tap(reg1, coeffs, filt); |
1445 | 1.25M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:load_convolve_6tap_32_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:load_convolve_6tap_32_avx2 Unexecuted instantiation: convolve_2d_avx2.c:load_convolve_6tap_32_avx2 convolve_avx2.c:load_convolve_6tap_32_avx2 Line | Count | Source | 1439 | 1.25M | __m256i *data) { | 1440 | 1.25M | const __m256i reg0 = _mm256_loadu_si256((__m256i *)src); | 1441 | 1.25M | const __m256i reg1 = _mm256_loadu_si256((__m256i *)(src + 8)); | 1442 | | | 1443 | 1.25M | data[0] = convolve_lowbd_x_6tap(reg0, coeffs, filt); | 1444 | 1.25M | data[1] = convolve_lowbd_x_6tap(reg1, coeffs, filt); | 1445 | 1.25M | } |
Unexecuted instantiation: jnt_convolve_avx2.c:load_convolve_6tap_32_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:load_convolve_6tap_32_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:load_convolve_6tap_32_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:load_convolve_6tap_32_avx2 |
1446 | | |
1447 | | static inline void convolve_sr_store_6tap_32_avx2(const uint8_t *const src, |
1448 | | const __m256i *coeffs, |
1449 | | const __m256i *filt, |
1450 | 1.25M | uint8_t *const dst) { |
1451 | 1.25M | __m256i data[2]; |
1452 | | |
1453 | 1.25M | load_convolve_6tap_32_avx2(src, coeffs, filt, data); |
1454 | 1.25M | round_pack_store_32_avx2(data, dst); |
1455 | 1.25M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_sr_store_6tap_32_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_sr_store_6tap_32_avx2 Unexecuted instantiation: convolve_2d_avx2.c:convolve_sr_store_6tap_32_avx2 convolve_avx2.c:convolve_sr_store_6tap_32_avx2 Line | Count | Source | 1450 | 1.25M | uint8_t *const dst) { | 1451 | 1.25M | __m256i data[2]; | 1452 | | | 1453 | 1.25M | load_convolve_6tap_32_avx2(src, coeffs, filt, data); | 1454 | 1.25M | round_pack_store_32_avx2(data, dst); | 1455 | 1.25M | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_sr_store_6tap_32_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_sr_store_6tap_32_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_sr_store_6tap_32_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_sr_store_6tap_32_avx2 |
1456 | | |
1457 | | static inline __m256i load_convolve_6tap_8x2_avx2(const uint8_t *const src, |
1458 | | const ptrdiff_t stride, |
1459 | | const __m256i *coeffs, |
1460 | 691k | const __m256i *filt) { |
1461 | 691k | const __m256i data = loadu_x_8bit_16x2_avx2(src, stride); |
1462 | 691k | return convolve_lowbd_x_6tap(data, coeffs, filt); |
1463 | 691k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:load_convolve_6tap_8x2_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:load_convolve_6tap_8x2_avx2 Unexecuted instantiation: convolve_2d_avx2.c:load_convolve_6tap_8x2_avx2 convolve_avx2.c:load_convolve_6tap_8x2_avx2 Line | Count | Source | 1460 | 691k | const __m256i *filt) { | 1461 | 691k | const __m256i data = loadu_x_8bit_16x2_avx2(src, stride); | 1462 | 691k | return convolve_lowbd_x_6tap(data, coeffs, filt); | 1463 | 691k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:load_convolve_6tap_8x2_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:load_convolve_6tap_8x2_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:load_convolve_6tap_8x2_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:load_convolve_6tap_8x2_avx2 |
1464 | | |
1465 | | static inline void load_convolve_6tap_16x2_avx2(const uint8_t *const src, |
1466 | | const int32_t src_stride, |
1467 | | const __m256i *coeffs, |
1468 | | const __m256i *filt, |
1469 | 345k | __m256i *data) { |
1470 | 345k | data[0] = load_convolve_6tap_8x2_avx2(src + 0, src_stride, coeffs, filt); |
1471 | 345k | data[1] = load_convolve_6tap_8x2_avx2(src + 8, src_stride, coeffs, filt); |
1472 | 345k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:load_convolve_6tap_16x2_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:load_convolve_6tap_16x2_avx2 Unexecuted instantiation: convolve_2d_avx2.c:load_convolve_6tap_16x2_avx2 convolve_avx2.c:load_convolve_6tap_16x2_avx2 Line | Count | Source | 1469 | 345k | __m256i *data) { | 1470 | 345k | data[0] = load_convolve_6tap_8x2_avx2(src + 0, src_stride, coeffs, filt); | 1471 | 345k | data[1] = load_convolve_6tap_8x2_avx2(src + 8, src_stride, coeffs, filt); | 1472 | 345k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:load_convolve_6tap_16x2_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:load_convolve_6tap_16x2_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:load_convolve_6tap_16x2_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:load_convolve_6tap_16x2_avx2 |
1473 | | |
1474 | 611k | static inline __m128i round_sr_y_ssse3(const __m128i data) { |
1475 | 611k | const __m128i value = _mm_set1_epi16(32); |
1476 | 611k | const __m128i reg = _mm_add_epi16(data, value); |
1477 | 611k | return _mm_srai_epi16(reg, FILTER_BITS - 1); |
1478 | 611k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:round_sr_y_ssse3 Unexecuted instantiation: highbd_convolve_avx2.c:round_sr_y_ssse3 Unexecuted instantiation: convolve_2d_avx2.c:round_sr_y_ssse3 convolve_avx2.c:round_sr_y_ssse3 Line | Count | Source | 1474 | 611k | static inline __m128i round_sr_y_ssse3(const __m128i data) { | 1475 | 611k | const __m128i value = _mm_set1_epi16(32); | 1476 | 611k | const __m128i reg = _mm_add_epi16(data, value); | 1477 | 611k | return _mm_srai_epi16(reg, FILTER_BITS - 1); | 1478 | 611k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:round_sr_y_ssse3 Unexecuted instantiation: wiener_convolve_avx2.c:round_sr_y_ssse3 Unexecuted instantiation: highbd_convolve_2d_avx2.c:round_sr_y_ssse3 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:round_sr_y_ssse3 |
1479 | | |
1480 | 4.35M | static inline __m256i round_sr_y_avx2(const __m256i data) { |
1481 | 4.35M | const __m256i value = _mm256_set1_epi16(32); |
1482 | 4.35M | const __m256i reg = _mm256_add_epi16(data, value); |
1483 | 4.35M | return _mm256_srai_epi16(reg, FILTER_BITS - 1); |
1484 | 4.35M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:round_sr_y_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:round_sr_y_avx2 Unexecuted instantiation: convolve_2d_avx2.c:round_sr_y_avx2 convolve_avx2.c:round_sr_y_avx2 Line | Count | Source | 1480 | 4.35M | static inline __m256i round_sr_y_avx2(const __m256i data) { | 1481 | 4.35M | const __m256i value = _mm256_set1_epi16(32); | 1482 | 4.35M | const __m256i reg = _mm256_add_epi16(data, value); | 1483 | 4.35M | return _mm256_srai_epi16(reg, FILTER_BITS - 1); | 1484 | 4.35M | } |
Unexecuted instantiation: jnt_convolve_avx2.c:round_sr_y_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:round_sr_y_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:round_sr_y_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:round_sr_y_avx2 |
1485 | | |
1486 | | static inline void round_pack_store_y_8x2_avx2(const __m256i res, |
1487 | | uint8_t *const dst, |
1488 | 583k | const ptrdiff_t dst_stride) { |
1489 | 583k | __m256i r; |
1490 | | |
1491 | 583k | r = round_sr_y_avx2(res); |
1492 | 583k | pack_store_8x2_avx2(r, dst, dst_stride); |
1493 | 583k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:round_pack_store_y_8x2_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:round_pack_store_y_8x2_avx2 Unexecuted instantiation: convolve_2d_avx2.c:round_pack_store_y_8x2_avx2 convolve_avx2.c:round_pack_store_y_8x2_avx2 Line | Count | Source | 1488 | 583k | const ptrdiff_t dst_stride) { | 1489 | 583k | __m256i r; | 1490 | | | 1491 | 583k | r = round_sr_y_avx2(res); | 1492 | 583k | pack_store_8x2_avx2(r, dst, dst_stride); | 1493 | 583k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:round_pack_store_y_8x2_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:round_pack_store_y_8x2_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:round_pack_store_y_8x2_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:round_pack_store_y_8x2_avx2 |
1494 | | |
1495 | | static inline void round_pack_store_y_16x2_avx2(const __m256i res[2], |
1496 | | uint8_t *const dst, |
1497 | 1.62M | const ptrdiff_t dst_stride) { |
1498 | 1.62M | __m256i r[2]; |
1499 | | |
1500 | 1.62M | r[0] = round_sr_y_avx2(res[0]); |
1501 | 1.62M | r[1] = round_sr_y_avx2(res[1]); |
1502 | 1.62M | pack_store_16x2_avx2(r[0], r[1], dst, dst_stride); |
1503 | 1.62M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:round_pack_store_y_16x2_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:round_pack_store_y_16x2_avx2 Unexecuted instantiation: convolve_2d_avx2.c:round_pack_store_y_16x2_avx2 convolve_avx2.c:round_pack_store_y_16x2_avx2 Line | Count | Source | 1497 | 1.62M | const ptrdiff_t dst_stride) { | 1498 | 1.62M | __m256i r[2]; | 1499 | | | 1500 | 1.62M | r[0] = round_sr_y_avx2(res[0]); | 1501 | 1.62M | r[1] = round_sr_y_avx2(res[1]); | 1502 | 1.62M | pack_store_16x2_avx2(r[0], r[1], dst, dst_stride); | 1503 | 1.62M | } |
Unexecuted instantiation: jnt_convolve_avx2.c:round_pack_store_y_16x2_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:round_pack_store_y_16x2_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:round_pack_store_y_16x2_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:round_pack_store_y_16x2_avx2 |
1504 | | |
1505 | | static inline void round_pack_store_y_32_avx2(const __m256i res[2], |
1506 | 256k | uint8_t *const dst) { |
1507 | 256k | __m256i r[2]; |
1508 | | |
1509 | 256k | r[0] = round_sr_y_avx2(res[0]); |
1510 | 256k | r[1] = round_sr_y_avx2(res[1]); |
1511 | 256k | pack_store_32_avx2(r[0], r[1], dst); |
1512 | 256k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:round_pack_store_y_32_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:round_pack_store_y_32_avx2 Unexecuted instantiation: convolve_2d_avx2.c:round_pack_store_y_32_avx2 convolve_avx2.c:round_pack_store_y_32_avx2 Line | Count | Source | 1506 | 256k | uint8_t *const dst) { | 1507 | 256k | __m256i r[2]; | 1508 | | | 1509 | 256k | r[0] = round_sr_y_avx2(res[0]); | 1510 | 256k | r[1] = round_sr_y_avx2(res[1]); | 1511 | 256k | pack_store_32_avx2(r[0], r[1], dst); | 1512 | 256k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:round_pack_store_y_32_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:round_pack_store_y_32_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:round_pack_store_y_32_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:round_pack_store_y_32_avx2 |
1513 | | |
1514 | | static inline void round_pack_store_y_32x2_avx2(const __m256i res[4], |
1515 | | uint8_t *const dst, |
1516 | 128k | const ptrdiff_t dst_stride) { |
1517 | 128k | round_pack_store_y_32_avx2(res, dst); |
1518 | 128k | round_pack_store_y_32_avx2(res + 2, dst + dst_stride); |
1519 | 128k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:round_pack_store_y_32x2_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:round_pack_store_y_32x2_avx2 Unexecuted instantiation: convolve_2d_avx2.c:round_pack_store_y_32x2_avx2 convolve_avx2.c:round_pack_store_y_32x2_avx2 Line | Count | Source | 1516 | 128k | const ptrdiff_t dst_stride) { | 1517 | 128k | round_pack_store_y_32_avx2(res, dst); | 1518 | 128k | round_pack_store_y_32_avx2(res + 2, dst + dst_stride); | 1519 | 128k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:round_pack_store_y_32x2_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:round_pack_store_y_32x2_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:round_pack_store_y_32x2_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:round_pack_store_y_32x2_avx2 |
1520 | | |
1521 | | static inline void convolve_y_2tap_2x2_ssse3(const uint8_t *const data, |
1522 | | const ptrdiff_t stride, |
1523 | | const __m128i *coeffs, |
1524 | 3.55k | __m128i d[2], __m128i *res) { |
1525 | 3.55k | d[1] = _mm_cvtsi32_si128(loadu_int16(data + 1 * stride)); |
1526 | 3.55k | const __m128i src_01a = _mm_unpacklo_epi16(d[0], d[1]); |
1527 | 3.55k | d[0] = _mm_cvtsi32_si128(loadu_int16(data + 2 * stride)); |
1528 | 3.55k | const __m128i src_12a = _mm_unpacklo_epi16(d[1], d[0]); |
1529 | | |
1530 | 3.55k | const __m128i s = _mm_unpacklo_epi8(src_01a, src_12a); |
1531 | | |
1532 | 3.55k | *res = _mm_maddubs_epi16(s, coeffs[0]); |
1533 | 3.55k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_2tap_2x2_ssse3 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_2tap_2x2_ssse3 Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_2tap_2x2_ssse3 convolve_avx2.c:convolve_y_2tap_2x2_ssse3 Line | Count | Source | 1524 | 3.55k | __m128i d[2], __m128i *res) { | 1525 | 3.55k | d[1] = _mm_cvtsi32_si128(loadu_int16(data + 1 * stride)); | 1526 | 3.55k | const __m128i src_01a = _mm_unpacklo_epi16(d[0], d[1]); | 1527 | 3.55k | d[0] = _mm_cvtsi32_si128(loadu_int16(data + 2 * stride)); | 1528 | 3.55k | const __m128i src_12a = _mm_unpacklo_epi16(d[1], d[0]); | 1529 | | | 1530 | 3.55k | const __m128i s = _mm_unpacklo_epi8(src_01a, src_12a); | 1531 | | | 1532 | 3.55k | *res = _mm_maddubs_epi16(s, coeffs[0]); | 1533 | 3.55k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_2tap_2x2_ssse3 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_2tap_2x2_ssse3 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_2tap_2x2_ssse3 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_2tap_2x2_ssse3 |
1534 | | |
1535 | | static inline void convolve_y_4tap_2x2_ssse3(const uint8_t *const data, |
1536 | | const ptrdiff_t stride, |
1537 | | const __m128i coeffs[2], |
1538 | | __m128i d[4], __m128i s[2], |
1539 | 34.1k | __m128i *res) { |
1540 | 34.1k | d[3] = _mm_cvtsi32_si128(loadu_int16(data + 3 * stride)); |
1541 | 34.1k | const __m128i src_23a = _mm_unpacklo_epi16(d[2], d[3]); |
1542 | 34.1k | d[2] = _mm_cvtsi32_si128(loadu_int16(data + 4 * stride)); |
1543 | 34.1k | const __m128i src_34a = _mm_unpacklo_epi16(d[3], d[2]); |
1544 | | |
1545 | 34.1k | s[1] = _mm_unpacklo_epi8(src_23a, src_34a); |
1546 | | |
1547 | 34.1k | *res = convolve_lowbd_4tap_ssse3(s, coeffs); |
1548 | 34.1k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_4tap_2x2_ssse3 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_4tap_2x2_ssse3 Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_4tap_2x2_ssse3 convolve_avx2.c:convolve_y_4tap_2x2_ssse3 Line | Count | Source | 1539 | 34.1k | __m128i *res) { | 1540 | 34.1k | d[3] = _mm_cvtsi32_si128(loadu_int16(data + 3 * stride)); | 1541 | 34.1k | const __m128i src_23a = _mm_unpacklo_epi16(d[2], d[3]); | 1542 | 34.1k | d[2] = _mm_cvtsi32_si128(loadu_int16(data + 4 * stride)); | 1543 | 34.1k | const __m128i src_34a = _mm_unpacklo_epi16(d[3], d[2]); | 1544 | | | 1545 | 34.1k | s[1] = _mm_unpacklo_epi8(src_23a, src_34a); | 1546 | | | 1547 | 34.1k | *res = convolve_lowbd_4tap_ssse3(s, coeffs); | 1548 | 34.1k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_4tap_2x2_ssse3 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_4tap_2x2_ssse3 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_4tap_2x2_ssse3 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_4tap_2x2_ssse3 |
1549 | | |
1550 | | static inline void convolve_y_6tap_2x2_ssse3(const uint8_t *const data, |
1551 | | const ptrdiff_t stride, |
1552 | | const __m128i coeffs[3], |
1553 | | __m128i d[6], __m128i s[3], |
1554 | 48.7k | __m128i *res) { |
1555 | 48.7k | d[5] = _mm_cvtsi32_si128(loadu_int16(data + 5 * stride)); |
1556 | 48.7k | const __m128i src_45a = _mm_unpacklo_epi16(d[4], d[5]); |
1557 | 48.7k | d[4] = _mm_cvtsi32_si128(loadu_int16(data + 6 * stride)); |
1558 | 48.7k | const __m128i src_56a = _mm_unpacklo_epi16(d[5], d[4]); |
1559 | | |
1560 | 48.7k | s[2] = _mm_unpacklo_epi8(src_45a, src_56a); |
1561 | | |
1562 | 48.7k | *res = convolve_lowbd_6tap_ssse3(s, coeffs); |
1563 | 48.7k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_6tap_2x2_ssse3 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_6tap_2x2_ssse3 Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_6tap_2x2_ssse3 convolve_avx2.c:convolve_y_6tap_2x2_ssse3 Line | Count | Source | 1554 | 48.7k | __m128i *res) { | 1555 | 48.7k | d[5] = _mm_cvtsi32_si128(loadu_int16(data + 5 * stride)); | 1556 | 48.7k | const __m128i src_45a = _mm_unpacklo_epi16(d[4], d[5]); | 1557 | 48.7k | d[4] = _mm_cvtsi32_si128(loadu_int16(data + 6 * stride)); | 1558 | 48.7k | const __m128i src_56a = _mm_unpacklo_epi16(d[5], d[4]); | 1559 | | | 1560 | 48.7k | s[2] = _mm_unpacklo_epi8(src_45a, src_56a); | 1561 | | | 1562 | 48.7k | *res = convolve_lowbd_6tap_ssse3(s, coeffs); | 1563 | 48.7k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_6tap_2x2_ssse3 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_6tap_2x2_ssse3 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_6tap_2x2_ssse3 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_6tap_2x2_ssse3 |
1564 | | |
1565 | | static inline void convolve_y_8tap_2x2_ssse3(const uint8_t *const data, |
1566 | | const ptrdiff_t stride, |
1567 | | const __m128i coeffs[4], |
1568 | | __m128i d[8], __m128i s[4], |
1569 | 5.93k | __m128i *res) { |
1570 | 5.93k | d[7] = _mm_cvtsi32_si128(loadu_int16(data + 7 * stride)); |
1571 | 5.93k | const __m128i src_67a = _mm_unpacklo_epi16(d[6], d[7]); |
1572 | 5.93k | d[6] = _mm_cvtsi32_si128(loadu_int16(data + 8 * stride)); |
1573 | 5.93k | const __m128i src_78a = _mm_unpacklo_epi16(d[7], d[6]); |
1574 | | |
1575 | 5.93k | s[3] = _mm_unpacklo_epi8(src_67a, src_78a); |
1576 | | |
1577 | 5.93k | *res = convolve_lowbd_ssse3(s, coeffs); |
1578 | 5.93k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_8tap_2x2_ssse3 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_8tap_2x2_ssse3 Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_8tap_2x2_ssse3 convolve_avx2.c:convolve_y_8tap_2x2_ssse3 Line | Count | Source | 1569 | 5.93k | __m128i *res) { | 1570 | 5.93k | d[7] = _mm_cvtsi32_si128(loadu_int16(data + 7 * stride)); | 1571 | 5.93k | const __m128i src_67a = _mm_unpacklo_epi16(d[6], d[7]); | 1572 | 5.93k | d[6] = _mm_cvtsi32_si128(loadu_int16(data + 8 * stride)); | 1573 | 5.93k | const __m128i src_78a = _mm_unpacklo_epi16(d[7], d[6]); | 1574 | | | 1575 | 5.93k | s[3] = _mm_unpacklo_epi8(src_67a, src_78a); | 1576 | | | 1577 | 5.93k | *res = convolve_lowbd_ssse3(s, coeffs); | 1578 | 5.93k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_8tap_2x2_ssse3 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_8tap_2x2_ssse3 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_8tap_2x2_ssse3 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_8tap_2x2_ssse3 |
1579 | | |
1580 | | static inline void convolve_y_2tap_4x2_ssse3(const uint8_t *const data, |
1581 | | const ptrdiff_t stride, |
1582 | | const __m128i *coeffs, |
1583 | 15.3k | __m128i d[2], __m128i *res) { |
1584 | 15.3k | d[1] = _mm_cvtsi32_si128(loadu_int32(data + 1 * stride)); |
1585 | 15.3k | const __m128i src_01a = _mm_unpacklo_epi32(d[0], d[1]); |
1586 | 15.3k | d[0] = _mm_cvtsi32_si128(loadu_int32(data + 2 * stride)); |
1587 | 15.3k | const __m128i src_12a = _mm_unpacklo_epi32(d[1], d[0]); |
1588 | | |
1589 | 15.3k | const __m128i s = _mm_unpacklo_epi8(src_01a, src_12a); |
1590 | | |
1591 | 15.3k | *res = _mm_maddubs_epi16(s, coeffs[0]); |
1592 | 15.3k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_2tap_4x2_ssse3 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_2tap_4x2_ssse3 Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_2tap_4x2_ssse3 convolve_avx2.c:convolve_y_2tap_4x2_ssse3 Line | Count | Source | 1583 | 15.3k | __m128i d[2], __m128i *res) { | 1584 | 15.3k | d[1] = _mm_cvtsi32_si128(loadu_int32(data + 1 * stride)); | 1585 | 15.3k | const __m128i src_01a = _mm_unpacklo_epi32(d[0], d[1]); | 1586 | 15.3k | d[0] = _mm_cvtsi32_si128(loadu_int32(data + 2 * stride)); | 1587 | 15.3k | const __m128i src_12a = _mm_unpacklo_epi32(d[1], d[0]); | 1588 | | | 1589 | 15.3k | const __m128i s = _mm_unpacklo_epi8(src_01a, src_12a); | 1590 | | | 1591 | 15.3k | *res = _mm_maddubs_epi16(s, coeffs[0]); | 1592 | 15.3k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_2tap_4x2_ssse3 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_2tap_4x2_ssse3 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_2tap_4x2_ssse3 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_2tap_4x2_ssse3 |
1593 | | |
1594 | | static inline void convolve_y_4tap_4x2_ssse3(const uint8_t *const data, |
1595 | | const ptrdiff_t stride, |
1596 | | const __m128i coeffs[2], |
1597 | | __m128i d[4], __m128i s[2], |
1598 | 194k | __m128i *res) { |
1599 | 194k | d[3] = _mm_cvtsi32_si128(loadu_int32(data + 3 * stride)); |
1600 | 194k | const __m128i src_23a = _mm_unpacklo_epi32(d[2], d[3]); |
1601 | 194k | d[2] = _mm_cvtsi32_si128(loadu_int32(data + 4 * stride)); |
1602 | 194k | const __m128i src_34a = _mm_unpacklo_epi32(d[3], d[2]); |
1603 | | |
1604 | 194k | s[1] = _mm_unpacklo_epi8(src_23a, src_34a); |
1605 | | |
1606 | 194k | *res = convolve_lowbd_4tap_ssse3(s, coeffs); |
1607 | 194k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_4tap_4x2_ssse3 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_4tap_4x2_ssse3 Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_4tap_4x2_ssse3 convolve_avx2.c:convolve_y_4tap_4x2_ssse3 Line | Count | Source | 1598 | 194k | __m128i *res) { | 1599 | 194k | d[3] = _mm_cvtsi32_si128(loadu_int32(data + 3 * stride)); | 1600 | 194k | const __m128i src_23a = _mm_unpacklo_epi32(d[2], d[3]); | 1601 | 194k | d[2] = _mm_cvtsi32_si128(loadu_int32(data + 4 * stride)); | 1602 | 194k | const __m128i src_34a = _mm_unpacklo_epi32(d[3], d[2]); | 1603 | | | 1604 | 194k | s[1] = _mm_unpacklo_epi8(src_23a, src_34a); | 1605 | | | 1606 | 194k | *res = convolve_lowbd_4tap_ssse3(s, coeffs); | 1607 | 194k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_4tap_4x2_ssse3 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_4tap_4x2_ssse3 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_4tap_4x2_ssse3 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_4tap_4x2_ssse3 |
1608 | | |
1609 | | static inline void convolve_y_6tap_4x2_ssse3(const uint8_t *const data, |
1610 | | const ptrdiff_t stride, |
1611 | | const __m128i coeffs[3], |
1612 | | __m128i d[6], __m128i s[3], |
1613 | 279k | __m128i *res) { |
1614 | 279k | d[5] = _mm_cvtsi32_si128(loadu_int32(data + 5 * stride)); |
1615 | 279k | const __m128i src_45a = _mm_unpacklo_epi32(d[4], d[5]); |
1616 | 279k | d[4] = _mm_cvtsi32_si128(loadu_int32(data + 6 * stride)); |
1617 | 279k | const __m128i src_56a = _mm_unpacklo_epi32(d[5], d[4]); |
1618 | | |
1619 | 279k | s[2] = _mm_unpacklo_epi8(src_45a, src_56a); |
1620 | | |
1621 | 279k | *res = convolve_lowbd_6tap_ssse3(s, coeffs); |
1622 | 279k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_6tap_4x2_ssse3 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_6tap_4x2_ssse3 Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_6tap_4x2_ssse3 convolve_avx2.c:convolve_y_6tap_4x2_ssse3 Line | Count | Source | 1613 | 279k | __m128i *res) { | 1614 | 279k | d[5] = _mm_cvtsi32_si128(loadu_int32(data + 5 * stride)); | 1615 | 279k | const __m128i src_45a = _mm_unpacklo_epi32(d[4], d[5]); | 1616 | 279k | d[4] = _mm_cvtsi32_si128(loadu_int32(data + 6 * stride)); | 1617 | 279k | const __m128i src_56a = _mm_unpacklo_epi32(d[5], d[4]); | 1618 | | | 1619 | 279k | s[2] = _mm_unpacklo_epi8(src_45a, src_56a); | 1620 | | | 1621 | 279k | *res = convolve_lowbd_6tap_ssse3(s, coeffs); | 1622 | 279k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_6tap_4x2_ssse3 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_6tap_4x2_ssse3 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_6tap_4x2_ssse3 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_6tap_4x2_ssse3 |
1623 | | |
1624 | | static inline void convolve_y_8tap_4x2_ssse3(const uint8_t *const data, |
1625 | | const ptrdiff_t stride, |
1626 | | const __m128i coeffs[4], |
1627 | | __m128i d[8], __m128i s[4], |
1628 | 30.0k | __m128i *res) { |
1629 | 30.0k | d[7] = _mm_cvtsi32_si128(loadu_int32(data + 7 * stride)); |
1630 | 30.0k | const __m128i src_67a = _mm_unpacklo_epi32(d[6], d[7]); |
1631 | 30.0k | d[6] = _mm_cvtsi32_si128(loadu_int32(data + 8 * stride)); |
1632 | 30.0k | const __m128i src_78a = _mm_unpacklo_epi32(d[7], d[6]); |
1633 | | |
1634 | 30.0k | s[3] = _mm_unpacklo_epi8(src_67a, src_78a); |
1635 | | |
1636 | 30.0k | res[0] = convolve_lowbd_ssse3(s, coeffs); |
1637 | 30.0k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_8tap_4x2_ssse3 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_8tap_4x2_ssse3 Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_8tap_4x2_ssse3 convolve_avx2.c:convolve_y_8tap_4x2_ssse3 Line | Count | Source | 1628 | 30.0k | __m128i *res) { | 1629 | 30.0k | d[7] = _mm_cvtsi32_si128(loadu_int32(data + 7 * stride)); | 1630 | 30.0k | const __m128i src_67a = _mm_unpacklo_epi32(d[6], d[7]); | 1631 | 30.0k | d[6] = _mm_cvtsi32_si128(loadu_int32(data + 8 * stride)); | 1632 | 30.0k | const __m128i src_78a = _mm_unpacklo_epi32(d[7], d[6]); | 1633 | | | 1634 | 30.0k | s[3] = _mm_unpacklo_epi8(src_67a, src_78a); | 1635 | | | 1636 | 30.0k | res[0] = convolve_lowbd_ssse3(s, coeffs); | 1637 | 30.0k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_8tap_4x2_ssse3 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_8tap_4x2_ssse3 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_8tap_4x2_ssse3 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_8tap_4x2_ssse3 |
1638 | | |
1639 | | static inline void convolve_y_2tap_8x2_avx2(const uint8_t *const data, |
1640 | | const ptrdiff_t stride, |
1641 | | const __m256i *coeffs, __m128i d[2], |
1642 | 12.7k | __m256i *res) { |
1643 | 12.7k | d[1] = _mm_loadu_si128((__m128i *)(data + 1 * stride)); |
1644 | 12.7k | const __m256i src_01a = _mm256_setr_m128i(d[0], d[1]); |
1645 | 12.7k | d[0] = _mm_loadu_si128((__m128i *)(data + 2 * stride)); |
1646 | 12.7k | const __m256i src_12a = _mm256_setr_m128i(d[1], d[0]); |
1647 | | |
1648 | 12.7k | const __m256i s = _mm256_unpacklo_epi8(src_01a, src_12a); |
1649 | | |
1650 | 12.7k | *res = _mm256_maddubs_epi16(s, coeffs[0]); |
1651 | 12.7k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_2tap_8x2_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_2tap_8x2_avx2 Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_2tap_8x2_avx2 convolve_avx2.c:convolve_y_2tap_8x2_avx2 Line | Count | Source | 1642 | 12.7k | __m256i *res) { | 1643 | 12.7k | d[1] = _mm_loadu_si128((__m128i *)(data + 1 * stride)); | 1644 | 12.7k | const __m256i src_01a = _mm256_setr_m128i(d[0], d[1]); | 1645 | 12.7k | d[0] = _mm_loadu_si128((__m128i *)(data + 2 * stride)); | 1646 | 12.7k | const __m256i src_12a = _mm256_setr_m128i(d[1], d[0]); | 1647 | | | 1648 | 12.7k | const __m256i s = _mm256_unpacklo_epi8(src_01a, src_12a); | 1649 | | | 1650 | 12.7k | *res = _mm256_maddubs_epi16(s, coeffs[0]); | 1651 | 12.7k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_2tap_8x2_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_2tap_8x2_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_2tap_8x2_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_2tap_8x2_avx2 |
1652 | | |
1653 | | static inline void convolve_y_4tap_8x2_avx2(const uint8_t *const data, |
1654 | | const ptrdiff_t stride, |
1655 | | const __m256i coeffs[2], |
1656 | | __m128i d[4], __m256i s[2], |
1657 | 169k | __m256i *res) { |
1658 | 169k | d[3] = _mm_loadu_si128((__m128i *)(data + 3 * stride)); |
1659 | 169k | const __m256i src_23a = _mm256_setr_m128i(d[2], d[3]); |
1660 | 169k | d[2] = _mm_loadu_si128((__m128i *)(data + 4 * stride)); |
1661 | 169k | const __m256i src_34a = _mm256_setr_m128i(d[3], d[2]); |
1662 | | |
1663 | 169k | s[1] = _mm256_unpacklo_epi8(src_23a, src_34a); |
1664 | | |
1665 | 169k | *res = convolve_lowbd_4tap(s, coeffs); |
1666 | 169k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_4tap_8x2_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_4tap_8x2_avx2 Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_4tap_8x2_avx2 convolve_avx2.c:convolve_y_4tap_8x2_avx2 Line | Count | Source | 1657 | 169k | __m256i *res) { | 1658 | 169k | d[3] = _mm_loadu_si128((__m128i *)(data + 3 * stride)); | 1659 | 169k | const __m256i src_23a = _mm256_setr_m128i(d[2], d[3]); | 1660 | 169k | d[2] = _mm_loadu_si128((__m128i *)(data + 4 * stride)); | 1661 | 169k | const __m256i src_34a = _mm256_setr_m128i(d[3], d[2]); | 1662 | | | 1663 | 169k | s[1] = _mm256_unpacklo_epi8(src_23a, src_34a); | 1664 | | | 1665 | 169k | *res = convolve_lowbd_4tap(s, coeffs); | 1666 | 169k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_4tap_8x2_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_4tap_8x2_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_4tap_8x2_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_4tap_8x2_avx2 |
1667 | | |
1668 | | static inline void convolve_y_6tap_8x2_avx2(const uint8_t *const data, |
1669 | | const ptrdiff_t stride, |
1670 | | const __m256i coeffs[3], |
1671 | | __m128i d[6], __m256i s[3], |
1672 | 370k | __m256i *res) { |
1673 | 370k | d[5] = _mm_loadu_si128((__m128i *)(data + 5 * stride)); |
1674 | 370k | const __m256i src_45a = _mm256_setr_m128i(d[4], d[5]); |
1675 | 370k | d[4] = _mm_loadu_si128((__m128i *)(data + 6 * stride)); |
1676 | 370k | const __m256i src_56a = _mm256_setr_m128i(d[5], d[4]); |
1677 | | |
1678 | 370k | s[2] = _mm256_unpacklo_epi8(src_45a, src_56a); |
1679 | | |
1680 | 370k | *res = convolve_lowbd_6tap(s, coeffs); |
1681 | 370k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_6tap_8x2_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_6tap_8x2_avx2 Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_6tap_8x2_avx2 convolve_avx2.c:convolve_y_6tap_8x2_avx2 Line | Count | Source | 1672 | 370k | __m256i *res) { | 1673 | 370k | d[5] = _mm_loadu_si128((__m128i *)(data + 5 * stride)); | 1674 | 370k | const __m256i src_45a = _mm256_setr_m128i(d[4], d[5]); | 1675 | 370k | d[4] = _mm_loadu_si128((__m128i *)(data + 6 * stride)); | 1676 | 370k | const __m256i src_56a = _mm256_setr_m128i(d[5], d[4]); | 1677 | | | 1678 | 370k | s[2] = _mm256_unpacklo_epi8(src_45a, src_56a); | 1679 | | | 1680 | 370k | *res = convolve_lowbd_6tap(s, coeffs); | 1681 | 370k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_6tap_8x2_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_6tap_8x2_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_6tap_8x2_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_6tap_8x2_avx2 |
1682 | | |
1683 | | static inline void convolve_y_8tap_8x2_avx2(const uint8_t *const data, |
1684 | | const ptrdiff_t stride, |
1685 | | const __m256i coeffs[4], |
1686 | | __m128i d[8], __m256i s[4], |
1687 | 30.5k | __m256i *res) { |
1688 | 30.5k | d[7] = _mm_loadu_si128((__m128i *)(data + 7 * stride)); |
1689 | 30.5k | const __m256i src_67a = _mm256_setr_m128i(d[6], d[7]); |
1690 | 30.5k | d[6] = _mm_loadu_si128((__m128i *)(data + 8 * stride)); |
1691 | 30.5k | const __m256i src_78a = _mm256_setr_m128i(d[7], d[6]); |
1692 | | |
1693 | 30.5k | s[3] = _mm256_unpacklo_epi8(src_67a, src_78a); |
1694 | | |
1695 | 30.5k | *res = convolve_lowbd(s, coeffs); |
1696 | 30.5k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_8tap_8x2_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_8tap_8x2_avx2 Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_8tap_8x2_avx2 convolve_avx2.c:convolve_y_8tap_8x2_avx2 Line | Count | Source | 1687 | 30.5k | __m256i *res) { | 1688 | 30.5k | d[7] = _mm_loadu_si128((__m128i *)(data + 7 * stride)); | 1689 | 30.5k | const __m256i src_67a = _mm256_setr_m128i(d[6], d[7]); | 1690 | 30.5k | d[6] = _mm_loadu_si128((__m128i *)(data + 8 * stride)); | 1691 | 30.5k | const __m256i src_78a = _mm256_setr_m128i(d[7], d[6]); | 1692 | | | 1693 | 30.5k | s[3] = _mm256_unpacklo_epi8(src_67a, src_78a); | 1694 | | | 1695 | 30.5k | *res = convolve_lowbd(s, coeffs); | 1696 | 30.5k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_8tap_8x2_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_8tap_8x2_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_8tap_8x2_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_8tap_8x2_avx2 |
1697 | | |
1698 | | static inline void convolve_y_2tap_16x2_avx2(const uint8_t *const data, |
1699 | | const ptrdiff_t stride, |
1700 | | const __m256i *coeffs, |
1701 | 13.9k | __m128i d[2], __m256i res[2]) { |
1702 | 13.9k | d[1] = _mm_loadu_si128((__m128i *)(data + 1 * stride)); |
1703 | 13.9k | const __m256i src_01a = _mm256_setr_m128i(d[0], d[1]); |
1704 | 13.9k | d[0] = _mm_loadu_si128((__m128i *)(data + 2 * stride)); |
1705 | 13.9k | const __m256i src_12a = _mm256_setr_m128i(d[1], d[0]); |
1706 | | |
1707 | 13.9k | const __m256i s0 = _mm256_unpacklo_epi8(src_01a, src_12a); |
1708 | 13.9k | const __m256i s1 = _mm256_unpackhi_epi8(src_01a, src_12a); |
1709 | | |
1710 | 13.9k | res[0] = _mm256_maddubs_epi16(s0, coeffs[0]); |
1711 | 13.9k | res[1] = _mm256_maddubs_epi16(s1, coeffs[0]); |
1712 | 13.9k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_2tap_16x2_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_2tap_16x2_avx2 Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_2tap_16x2_avx2 convolve_avx2.c:convolve_y_2tap_16x2_avx2 Line | Count | Source | 1701 | 13.9k | __m128i d[2], __m256i res[2]) { | 1702 | 13.9k | d[1] = _mm_loadu_si128((__m128i *)(data + 1 * stride)); | 1703 | 13.9k | const __m256i src_01a = _mm256_setr_m128i(d[0], d[1]); | 1704 | 13.9k | d[0] = _mm_loadu_si128((__m128i *)(data + 2 * stride)); | 1705 | 13.9k | const __m256i src_12a = _mm256_setr_m128i(d[1], d[0]); | 1706 | | | 1707 | 13.9k | const __m256i s0 = _mm256_unpacklo_epi8(src_01a, src_12a); | 1708 | 13.9k | const __m256i s1 = _mm256_unpackhi_epi8(src_01a, src_12a); | 1709 | | | 1710 | 13.9k | res[0] = _mm256_maddubs_epi16(s0, coeffs[0]); | 1711 | 13.9k | res[1] = _mm256_maddubs_epi16(s1, coeffs[0]); | 1712 | 13.9k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_2tap_16x2_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_2tap_16x2_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_2tap_16x2_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_2tap_16x2_avx2 |
1713 | | |
1714 | | static inline void convolve_y_4tap_16x2_avx2(const uint8_t *const data, |
1715 | | const ptrdiff_t stride, |
1716 | | const __m256i coeffs[2], |
1717 | | __m128i d[4], __m256i s[4], |
1718 | 97.3k | __m256i res[2]) { |
1719 | 97.3k | d[3] = _mm_loadu_si128((__m128i *)(data + 3 * stride)); |
1720 | 97.3k | const __m256i src_23a = _mm256_setr_m128i(d[2], d[3]); |
1721 | 97.3k | d[2] = _mm_loadu_si128((__m128i *)(data + 4 * stride)); |
1722 | 97.3k | const __m256i src_34a = _mm256_setr_m128i(d[3], d[2]); |
1723 | | |
1724 | 97.3k | s[1] = _mm256_unpacklo_epi8(src_23a, src_34a); |
1725 | 97.3k | s[3] = _mm256_unpackhi_epi8(src_23a, src_34a); |
1726 | | |
1727 | 97.3k | res[0] = convolve_lowbd_4tap(s, coeffs); |
1728 | 97.3k | res[1] = convolve_lowbd_4tap(s + 2, coeffs); |
1729 | 97.3k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_4tap_16x2_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_4tap_16x2_avx2 Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_4tap_16x2_avx2 convolve_avx2.c:convolve_y_4tap_16x2_avx2 Line | Count | Source | 1718 | 97.3k | __m256i res[2]) { | 1719 | 97.3k | d[3] = _mm_loadu_si128((__m128i *)(data + 3 * stride)); | 1720 | 97.3k | const __m256i src_23a = _mm256_setr_m128i(d[2], d[3]); | 1721 | 97.3k | d[2] = _mm_loadu_si128((__m128i *)(data + 4 * stride)); | 1722 | 97.3k | const __m256i src_34a = _mm256_setr_m128i(d[3], d[2]); | 1723 | | | 1724 | 97.3k | s[1] = _mm256_unpacklo_epi8(src_23a, src_34a); | 1725 | 97.3k | s[3] = _mm256_unpackhi_epi8(src_23a, src_34a); | 1726 | | | 1727 | 97.3k | res[0] = convolve_lowbd_4tap(s, coeffs); | 1728 | 97.3k | res[1] = convolve_lowbd_4tap(s + 2, coeffs); | 1729 | 97.3k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_4tap_16x2_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_4tap_16x2_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_4tap_16x2_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_4tap_16x2_avx2 |
1730 | | |
1731 | | static inline void convolve_y_6tap_16x2_avx2(const uint8_t *const data, |
1732 | | const ptrdiff_t stride, |
1733 | | const __m256i coeffs[3], |
1734 | | __m128i d[6], __m256i s[6], |
1735 | 1.42M | __m256i res[2]) { |
1736 | 1.42M | d[5] = _mm_loadu_si128((__m128i *)(data + 5 * stride)); |
1737 | 1.42M | const __m256i src_45a = _mm256_setr_m128i(d[4], d[5]); |
1738 | 1.42M | d[4] = _mm_loadu_si128((__m128i *)(data + 6 * stride)); |
1739 | 1.42M | const __m256i src_56a = _mm256_setr_m128i(d[5], d[4]); |
1740 | | |
1741 | 1.42M | s[2] = _mm256_unpacklo_epi8(src_45a, src_56a); |
1742 | 1.42M | s[5] = _mm256_unpackhi_epi8(src_45a, src_56a); |
1743 | | |
1744 | 1.42M | res[0] = convolve_lowbd_6tap(s, coeffs); |
1745 | 1.42M | res[1] = convolve_lowbd_6tap(s + 3, coeffs); |
1746 | 1.42M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_6tap_16x2_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_6tap_16x2_avx2 Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_6tap_16x2_avx2 convolve_avx2.c:convolve_y_6tap_16x2_avx2 Line | Count | Source | 1735 | 1.42M | __m256i res[2]) { | 1736 | 1.42M | d[5] = _mm_loadu_si128((__m128i *)(data + 5 * stride)); | 1737 | 1.42M | const __m256i src_45a = _mm256_setr_m128i(d[4], d[5]); | 1738 | 1.42M | d[4] = _mm_loadu_si128((__m128i *)(data + 6 * stride)); | 1739 | 1.42M | const __m256i src_56a = _mm256_setr_m128i(d[5], d[4]); | 1740 | | | 1741 | 1.42M | s[2] = _mm256_unpacklo_epi8(src_45a, src_56a); | 1742 | 1.42M | s[5] = _mm256_unpackhi_epi8(src_45a, src_56a); | 1743 | | | 1744 | 1.42M | res[0] = convolve_lowbd_6tap(s, coeffs); | 1745 | 1.42M | res[1] = convolve_lowbd_6tap(s + 3, coeffs); | 1746 | 1.42M | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_6tap_16x2_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_6tap_16x2_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_6tap_16x2_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_6tap_16x2_avx2 |
1747 | | |
1748 | | static inline void convolve_y_8tap_16x2_avx2(const uint8_t *const data, |
1749 | | const ptrdiff_t stride, |
1750 | | const __m256i coeffs[4], |
1751 | | __m128i d[8], __m256i s[8], |
1752 | 94.4k | __m256i res[2]) { |
1753 | 94.4k | d[7] = _mm_loadu_si128((__m128i *)(data + 7 * stride)); |
1754 | 94.4k | const __m256i src_67a = _mm256_setr_m128i(d[6], d[7]); |
1755 | 94.4k | d[6] = _mm_loadu_si128((__m128i *)(data + 8 * stride)); |
1756 | 94.4k | const __m256i src_78a = _mm256_setr_m128i(d[7], d[6]); |
1757 | | |
1758 | 94.4k | s[3] = _mm256_unpacklo_epi8(src_67a, src_78a); |
1759 | 94.4k | s[7] = _mm256_unpackhi_epi8(src_67a, src_78a); |
1760 | | |
1761 | 94.4k | res[0] = convolve_lowbd(s, coeffs); |
1762 | 94.4k | res[1] = convolve_lowbd(s + 4, coeffs); |
1763 | 94.4k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_8tap_16x2_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_8tap_16x2_avx2 Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_8tap_16x2_avx2 convolve_avx2.c:convolve_y_8tap_16x2_avx2 Line | Count | Source | 1752 | 94.4k | __m256i res[2]) { | 1753 | 94.4k | d[7] = _mm_loadu_si128((__m128i *)(data + 7 * stride)); | 1754 | 94.4k | const __m256i src_67a = _mm256_setr_m128i(d[6], d[7]); | 1755 | 94.4k | d[6] = _mm_loadu_si128((__m128i *)(data + 8 * stride)); | 1756 | 94.4k | const __m256i src_78a = _mm256_setr_m128i(d[7], d[6]); | 1757 | | | 1758 | 94.4k | s[3] = _mm256_unpacklo_epi8(src_67a, src_78a); | 1759 | 94.4k | s[7] = _mm256_unpackhi_epi8(src_67a, src_78a); | 1760 | | | 1761 | 94.4k | res[0] = convolve_lowbd(s, coeffs); | 1762 | 94.4k | res[1] = convolve_lowbd(s + 4, coeffs); | 1763 | 94.4k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_8tap_16x2_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_8tap_16x2_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_8tap_16x2_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_8tap_16x2_avx2 |
1764 | | |
// One step of a vertical 2-tap filter over a 32-byte-wide column, yielding
// four result registers (two per row pair).
//
// Reads the cached row in d[0] (presumably the row at data, left there by the
// caller or a previous call -- confirm against the caller), then loads the
// 32-byte rows at data + 1 * stride and data + 2 * stride. Each adjacent row
// pair is byte-interleaved and multiplied against coeffs[0] with
// _mm256_maddubs_epi16.
//
// The unpack intrinsics work within each 128-bit lane, so the "lo" results
// cover byte columns 0-7 and 16-23 and the "hi" results cover byte columns
// 8-15 and 24-31.
//
// On return:
//   d[1]   = row loaded from data + 1 * stride
//   d[0]   = row loaded from data + 2 * stride
//   res[0] = rows (0,1), lo interleave
//   res[1] = rows (0,1), hi interleave
//   res[2] = rows (1,2), lo interleave
//   res[3] = rows (1,2), hi interleave
static inline void convolve_y_2tap_32x2_avx2(const uint8_t *const data,
                                             const ptrdiff_t stride,
                                             const __m256i *coeffs,
                                             __m256i d[2], __m256i res[4]) {
  const __m256i filter = coeffs[0];

  // Grab the cached row before its slot is overwritten below.
  const __m256i row0 = d[0];
  const __m256i row1 =
      _mm256_loadu_si256((const __m256i *)(data + 1 * stride));
  d[1] = row1;
  const __m256i row2 =
      _mm256_loadu_si256((const __m256i *)(data + 2 * stride));
  d[0] = row2;

  const __m256i rows01_lo = _mm256_unpacklo_epi8(row0, row1);
  const __m256i rows01_hi = _mm256_unpackhi_epi8(row0, row1);
  const __m256i rows12_lo = _mm256_unpacklo_epi8(row1, row2);
  const __m256i rows12_hi = _mm256_unpackhi_epi8(row1, row2);

  res[0] = _mm256_maddubs_epi16(rows01_lo, filter);
  res[1] = _mm256_maddubs_epi16(rows01_hi, filter);
  res[2] = _mm256_maddubs_epi16(rows12_lo, filter);
  res[3] = _mm256_maddubs_epi16(rows12_hi, filter);
}
1781 | | |
1782 | | static inline void convolve_y_4tap_32x2_avx2(const uint8_t *const data, |
1783 | | const ptrdiff_t stride, |
1784 | | const __m256i coeffs[2], |
1785 | | __m256i d[4], __m256i s1[4], |
1786 | 87.2k | __m256i s2[4], __m256i res[4]) { |
1787 | 87.2k | d[3] = _mm256_loadu_si256((__m256i *)(data + 3 * stride)); |
1788 | 87.2k | s1[1] = _mm256_unpacklo_epi8(d[2], d[3]); |
1789 | 87.2k | s1[3] = _mm256_unpackhi_epi8(d[2], d[3]); |
1790 | 87.2k | d[2] = _mm256_loadu_si256((__m256i *)(data + 4 * stride)); |
1791 | 87.2k | s2[1] = _mm256_unpacklo_epi8(d[3], d[2]); |
1792 | 87.2k | s2[3] = _mm256_unpackhi_epi8(d[3], d[2]); |
1793 | | |
1794 | 87.2k | res[0] = convolve_lowbd_4tap(s1, coeffs); |
1795 | 87.2k | res[1] = convolve_lowbd_4tap(s1 + 2, coeffs); |
1796 | 87.2k | res[2] = convolve_lowbd_4tap(s2, coeffs); |
1797 | 87.2k | res[3] = convolve_lowbd_4tap(s2 + 2, coeffs); |
1798 | 87.2k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_4tap_32x2_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_4tap_32x2_avx2 Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_4tap_32x2_avx2 convolve_avx2.c:convolve_y_4tap_32x2_avx2 Line | Count | Source | 1786 | 87.2k | __m256i s2[4], __m256i res[4]) { | 1787 | 87.2k | d[3] = _mm256_loadu_si256((__m256i *)(data + 3 * stride)); | 1788 | 87.2k | s1[1] = _mm256_unpacklo_epi8(d[2], d[3]); | 1789 | 87.2k | s1[3] = _mm256_unpackhi_epi8(d[2], d[3]); | 1790 | 87.2k | d[2] = _mm256_loadu_si256((__m256i *)(data + 4 * stride)); | 1791 | 87.2k | s2[1] = _mm256_unpacklo_epi8(d[3], d[2]); | 1792 | 87.2k | s2[3] = _mm256_unpackhi_epi8(d[3], d[2]); | 1793 | | | 1794 | 87.2k | res[0] = convolve_lowbd_4tap(s1, coeffs); | 1795 | 87.2k | res[1] = convolve_lowbd_4tap(s1 + 2, coeffs); | 1796 | 87.2k | res[2] = convolve_lowbd_4tap(s2, coeffs); | 1797 | 87.2k | res[3] = convolve_lowbd_4tap(s2 + 2, coeffs); | 1798 | 87.2k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_4tap_32x2_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_4tap_32x2_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_4tap_32x2_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_4tap_32x2_avx2 |
1799 | | #endif // AOM_AOM_DSP_X86_CONVOLVE_AVX2_H_ |