/src/aom/aom_dsp/x86/convolve_avx2.h
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) 2018, Alliance for Open Media. All rights reserved. |
3 | | * |
4 | | * This source code is subject to the terms of the BSD 2 Clause License and |
5 | | * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
6 | | * was not distributed with this source code in the LICENSE file, you can |
7 | | * obtain it at www.aomedia.org/license/software. If the Alliance for Open |
8 | | * Media Patent License 1.0 was not distributed with this source code in the |
9 | | * PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
10 | | */ |
11 | | |
12 | | #ifndef AOM_AOM_DSP_X86_CONVOLVE_AVX2_H_ |
13 | | #define AOM_AOM_DSP_X86_CONVOLVE_AVX2_H_ |
14 | | |
15 | | #include <immintrin.h> |
16 | | |
17 | | #include "aom_ports/mem.h" |
18 | | |
19 | | #include "aom_dsp/x86/mem_sse2.h" |
20 | | #include "aom_dsp/x86/synonyms.h" |
21 | | |
22 | | #include "av1/common/convolve.h" |
23 | | #include "av1/common/filter.h" |
24 | | |
// Constant offsets 32, 64 (32 << 1) and 96 (32 + 64).
// filt_global_avx2 in this header is laid out as four 32-byte groups, so these
// are presumably the byte offsets of its 2nd, 3rd and 4th group -- the uses
// are not visible in this header; confirm at the call sites.
25 | 820k | #define SECOND_32_BLK (32) |
26 | 731k | #define THIRD_32_BLK (32 << 1) |
27 | 365k | #define FOURTH_32_BLK (SECOND_32_BLK + THIRD_32_BLK) |
28 | | |
29 | | // filters for 16 |
// 128 byte indices arranged as four 32-byte groups. Within a group each
// 16-byte half lists eight overlapping adjacent pairs (k, k+1), (k+1, k+2), ...
// and both halves are identical. The groups start at k = 0, 2, 4 and 6, i.e.
// they carry the same values as filt1..filt4_global_avx2 in this header.
// Presumably used as byte-shuffle control masks -- confirm at the call sites.
30 | | DECLARE_ALIGNED(32, static const uint8_t, filt_global_avx2[]) = { |
31 | | 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 0, 1, 1, |
32 | | 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 2, 3, 3, 4, 4, 5, |
33 | | 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 2, 3, 3, 4, 4, 5, 5, 6, 6, |
34 | | 7, 7, 8, 8, 9, 9, 10, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, |
35 | | 10, 11, 11, 12, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, |
36 | | 12, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 6, 7, |
37 | | 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 |
38 | | }; |
39 | | |
// 64 byte indices arranged as two 32-byte groups. Each 16-byte half lists four
// overlapping 4-byte windows (k, k+1, k+2, k+3); both halves of a group are
// identical. The first group uses k = 0..3, the second k = 4..7.
// Presumably byte-shuffle control masks -- confirm at the call sites.
40 | | DECLARE_ALIGNED(32, static const uint8_t, filt_d4_global_avx2[]) = { |
41 | | 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 0, 1, 2, 3, 1, 2, |
42 | | 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, |
43 | | 7, 8, 9, 10, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, |
44 | | }; |
45 | | |
// 32 byte indices: four overlapping 4-byte windows (k, k+1, k+2, k+3) for
// k = 2..5, written once per 16-byte half (both halves identical).
// Presumably a byte-shuffle control mask -- confirm at the call sites.
46 | | DECLARE_ALIGNED(32, static const uint8_t, filt4_d4_global_avx2[]) = { |
47 | | 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, |
48 | | 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, |
49 | | }; |
50 | | |
// 32 bytes: the indices 3, 4, ..., 10, each followed by 255, written once per
// 16-byte half (both halves identical).
// NOTE(review): if this is used as a shuffle_epi8 control, an index with the
// top bit set (255) zeroes the destination byte, which would zero-extend
// source bytes 3..10 to 16-bit elements -- confirm at the call sites.
51 | | DECLARE_ALIGNED(32, static const uint8_t, filt_center_global_avx2[32]) = { |
52 | | 3, 255, 4, 255, 5, 255, 6, 255, 7, 255, 8, 255, 9, 255, 10, 255, |
53 | | 3, 255, 4, 255, 5, 255, 6, 255, 7, 255, 8, 255, 9, 255, 10, 255 |
54 | | }; |
55 | | |
// Five 16-byte index tables made of overlapping adjacent byte pairs (k, k+1):
//   filt1: pairs starting at 0, 1, 2, 3 and at 8, 9, 10, 11
//   filt2: pairs starting at 2, 3, 4, 5 and at 10, 11, 12, 13
//   filt3: pairs starting at 0, 1 and at 8, 9; upper 8 bytes are zero
//   filt4: pairs starting at 2, 3 and at 10, 11; upper 8 bytes are zero
//   filt5: pairs starting at 0, 1 and at 4, 5; upper 8 bytes are zero
// Presumably byte-shuffle control masks for 128-bit registers -- confirm at
// the call sites.
// NOTE(review): each table is 16 bytes but is declared with 32-byte alignment.
56 | | DECLARE_ALIGNED(32, static const uint8_t, |
57 | | filt1_global_sse2[16]) = { 0, 1, 1, 2, 2, 3, 3, 4, |
58 | | 8, 9, 9, 10, 10, 11, 11, 12 }; |
59 | | |
60 | | DECLARE_ALIGNED(32, static const uint8_t, |
61 | | filt2_global_sse2[16]) = { 2, 3, 3, 4, 4, 5, 5, 6, |
62 | | 10, 11, 11, 12, 12, 13, 13, 14 }; |
63 | | |
64 | | DECLARE_ALIGNED(32, static const uint8_t, |
65 | | filt3_global_sse2[16]) = { 0, 1, 1, 2, 8, 9, 9, 10, |
66 | | 0, 0, 0, 0, 0, 0, 0, 0 }; |
67 | | |
68 | | DECLARE_ALIGNED(32, static const uint8_t, |
69 | | filt4_global_sse2[16]) = { 2, 3, 3, 4, 10, 11, 11, 12, |
70 | | 0, 0, 0, 0, 0, 0, 0, 0 }; |
71 | | |
72 | | DECLARE_ALIGNED(32, static const uint8_t, |
73 | | filt5_global_sse2[16]) = { 0, 1, 1, 2, 4, 5, 5, 6, |
74 | | 0, 0, 0, 0, 0, 0, 0, 0 }; |
75 | | |
// Four 32-byte index tables. Each 16-byte half lists eight overlapping
// adjacent byte pairs (k, k+1), (k+1, k+2), ...; both halves are identical.
// Starting k: filt1 = 0, filt2 = 2, filt3 = 4, filt4 = 6. The values are the
// same as the four consecutive 32-byte groups of filt_global_avx2.
// Presumably byte-shuffle control masks -- confirm at the call sites.
76 | | DECLARE_ALIGNED(32, static const uint8_t, |
77 | | filt1_global_avx2[32]) = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, |
78 | | 6, 6, 7, 7, 8, 0, 1, 1, 2, 2, 3, |
79 | | 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; |
80 | | |
81 | | DECLARE_ALIGNED(32, static const uint8_t, |
82 | | filt2_global_avx2[32]) = { 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, |
83 | | 8, 8, 9, 9, 10, 2, 3, 3, 4, 4, 5, |
84 | | 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 }; |
85 | | |
86 | | DECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = { |
87 | | 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, |
88 | | 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 |
89 | | }; |
90 | | |
91 | | DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = { |
92 | | 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, |
93 | | 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 |
94 | | }; |
95 | | |
// Horizontal pass for narrow blocks, writing 4 int16 results per source row
// into im_block (row r lands at im_block[r * 4]).
//
// CONVOLVE_LOWBD is the 128-bit convolve helper to call; it is invoked as
// CONVOLVE_LOWBD(data, coeffs_h, filt).
//
// Expands in the caller's scope and relies on these names existing there:
// i, im_h, src_ptr, src_stride, coeffs_h, filt, round_const_h, im_block.
//
// The loop handles two source rows per iteration (loaded together by
// load_8bit_8x2_to_1_reg_sse2) while i < im_h - 2 and stores both results
// with one 128-bit _mm_store_si128. The statements after the loop process one
// more row at the final value of i using 64-bit load/store. Every result is
// rounded as (res + round_const_h) >> 2 (arithmetic shift).
//
// NOTE(review): the expansion is not wrapped in do { } while (0) and declares
// data_1 and res in the caller's scope, so it can appear only once per scope.
// NOTE(review): with an even im_h the last row would not be processed;
// presumably im_h is always odd here -- confirm at the call sites.
96 | | #define CONVOLVE_SR_HOR_FILTER_W4(CONVOLVE_LOWBD) \ |
97 | 3.07M | for (i = 0; i < (im_h - 2); i += 2) { \ |
98 | 2.48M | __m128i data = \ |
99 | 2.48M | load_8bit_8x2_to_1_reg_sse2(&src_ptr[(i * src_stride)], src_stride); \ |
100 | 2.48M | __m128i res = CONVOLVE_LOWBD(data, coeffs_h, filt); \ |
101 | 2.48M | res = _mm_srai_epi16(_mm_add_epi16(res, round_const_h), 2); \ |
102 | 2.48M | _mm_store_si128((__m128i *)&im_block[i * 4], res); \ |
103 | 2.48M | } \ |
104 | 584k | __m128i data_1 = _mm_loadl_epi64((__m128i *)&src_ptr[(i * src_stride)]); \ |
105 | 584k | __m128i res = CONVOLVE_LOWBD(data_1, coeffs_h, filt); \ |
106 | 584k | res = _mm_srai_epi16(_mm_add_epi16(res, round_const_h), 2); \ |
107 | 584k | _mm_storel_epi64((__m128i *)&im_block[i * 4], res); |
108 | | |
// Convenience wrappers: CONVOLVE_SR_HOR_FILTER_W4 bound to the 2-tap and
// 4-tap 128-bit helpers convolve_lowbd_x_2tap_ssse3 and
// convolve_lowbd_x_4tap_ssse3 (both defined outside this excerpt).
109 | | #define CONVOLVE_SR_HOR_FILTER_2TAP_W4 \ |
110 | 15.6k | CONVOLVE_SR_HOR_FILTER_W4(convolve_lowbd_x_2tap_ssse3) |
111 | | |
112 | | #define CONVOLVE_SR_HOR_FILTER_4TAP_W4 \ |
113 | 568k | CONVOLVE_SR_HOR_FILTER_W4(convolve_lowbd_x_4tap_ssse3) |
114 | | |
// Rounds a vector of 32-bit vertical-filter sums and writes two narrow output
// rows.
//
//   w             - output width in pixels; 4 or 2 (anything other than 4 is
//                   asserted to be 2)
//   res           - 32-bit sums; the low 128-bit lane ends up in the row at
//                   dst, the high lane in the row at dst + dst_stride
//   dst           - first output row
//   dst_stride    - distance in bytes between the two output rows
//   round_const_v - value added to every sum before the shift
//
// Steps: (res + round_const_v) >> 11 (arithmetic), pack to 16 bits with signed
// saturation, pack to 8 bits with unsigned saturation, then store 4 bytes per
// row (w == 4, via xx_storel_32) or 2 bytes per row (w == 2).
//
// NOTE(review): the w == 2 path stores through a uint16_t * cast of a uint8_t
// pointer; confirm that the alignment/aliasing assumptions are acceptable for
// the supported compilers.
//
// The text fused onto the closing-brace row is the coverage tool's
// per-translation-unit instantiation listing, not part of the function.
115 | | static inline void sr_2d_ver_round_and_store_w4(int w, __m256i res, |
116 | | uint8_t *dst, int dst_stride, |
117 | 1.71M | __m256i round_const_v) { |
118 | 1.71M | const __m256i res_round = |
119 | 1.71M | _mm256_srai_epi32(_mm256_add_epi32(res, round_const_v), 11); |
120 | | |
121 | 1.71M | const __m256i res_16bit = _mm256_packs_epi32(res_round, res_round); |
122 | 1.71M | const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit); |
123 | | |
124 | 1.71M | const __m128i r0 = _mm256_castsi256_si128(res_8b); |
125 | 1.71M | const __m128i r1 = _mm256_extracti128_si256(res_8b, 1); |
126 | | |
127 | 1.71M | __m128i *const p0 = (__m128i *)dst; |
128 | 1.71M | __m128i *const p1 = (__m128i *)(dst + dst_stride); |
129 | | |
130 | 1.71M | if (w == 4) { |
131 | 1.42M | xx_storel_32(p0, r0); |
132 | 1.42M | xx_storel_32(p1, r1); |
133 | 1.42M | } else { |
134 | 294k | assert(w == 2); |
135 | 294k | *(uint16_t *)p0 = (uint16_t)_mm_cvtsi128_si32(r0); |
136 | 294k | *(uint16_t *)p1 = (uint16_t)_mm_cvtsi128_si32(r1); |
137 | 294k | } |
138 | 1.71M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:sr_2d_ver_round_and_store_w4 Unexecuted instantiation: highbd_convolve_avx2.c:sr_2d_ver_round_and_store_w4 convolve_2d_avx2.c:sr_2d_ver_round_and_store_w4 Line | Count | Source | 117 | 1.71M | __m256i round_const_v) { | 118 | 1.71M | const __m256i res_round = | 119 | 1.71M | _mm256_srai_epi32(_mm256_add_epi32(res, round_const_v), 11); | 120 | | | 121 | 1.71M | const __m256i res_16bit = _mm256_packs_epi32(res_round, res_round); | 122 | 1.71M | const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit); | 123 | | | 124 | 1.71M | const __m128i r0 = _mm256_castsi256_si128(res_8b); | 125 | 1.71M | const __m128i r1 = _mm256_extracti128_si256(res_8b, 1); | 126 | | | 127 | 1.71M | __m128i *const p0 = (__m128i *)dst; | 128 | 1.71M | __m128i *const p1 = (__m128i *)(dst + dst_stride); | 129 | | | 130 | 1.71M | if (w == 4) { | 131 | 1.42M | xx_storel_32(p0, r0); | 132 | 1.42M | xx_storel_32(p1, r1); | 133 | 1.42M | } else { | 134 | 294k | assert(w == 2); | 135 | 294k | *(uint16_t *)p0 = (uint16_t)_mm_cvtsi128_si32(r0); | 136 | 294k | *(uint16_t *)p1 = (uint16_t)_mm_cvtsi128_si32(r1); | 137 | 294k | } | 138 | 1.71M | } |
Unexecuted instantiation: convolve_avx2.c:sr_2d_ver_round_and_store_w4 Unexecuted instantiation: jnt_convolve_avx2.c:sr_2d_ver_round_and_store_w4 Unexecuted instantiation: wiener_convolve_avx2.c:sr_2d_ver_round_and_store_w4 Unexecuted instantiation: highbd_convolve_2d_avx2.c:sr_2d_ver_round_and_store_w4 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:sr_2d_ver_round_and_store_w4 |
139 | | |
// 2-tap vertical pass for narrow blocks; im_block holds 4 int16 per row.
//
// Expands in the caller's scope and relies on: i, h, w, im_block, coeffs_v,
// dst_ptr, dst_stride, round_const_v. It declares s[2] in that scope and
// advances dst_ptr by two rows per iteration.
//
// Each iteration produces two output rows: rows (i, i+1) are paired in the
// low 128-bit lane and rows (i+1, i+2) in the high lane, interleaved with
// unpacklo_epi16 and multiplied-and-added against coeffs_v[0].
// sr_2d_ver_round_and_store_w4 then rounds and stores both rows.
// s[0] carries row i+2 into the next iteration as its row i.
// Note that the last iteration reads row h of im_block.
140 | | #define CONVOLVE_SR_VER_FILTER_2TAP_W4 \ |
141 | 15.6k | __m128i s[2]; \ |
142 | 15.6k | s[0] = _mm_loadl_epi64((__m128i *)(im_block + 0 * 4)); \ |
143 | 15.6k | \ |
144 | 59.7k | for (i = 0; i < h; i += 2) { \ |
145 | 44.0k | const int16_t *data = &im_block[i * 4]; \ |
146 | 44.0k | s[1] = _mm_loadl_epi64((__m128i *)(data + 1 * 4)); \ |
147 | 44.0k | const __m256i src_0 = _mm256_setr_m128i(s[0], s[1]); \ |
148 | 44.0k | s[0] = _mm_loadl_epi64((__m128i *)(data + 2 * 4)); \ |
149 | 44.0k | const __m256i src_1 = _mm256_setr_m128i(s[1], s[0]); \ |
150 | 44.0k | const __m256i ss = _mm256_unpacklo_epi16(src_0, src_1); \ |
151 | 44.0k | \ |
152 | 44.0k | const __m256i res = _mm256_madd_epi16(ss, coeffs_v[0]); \ |
153 | 44.0k | \ |
154 | 44.0k | sr_2d_ver_round_and_store_w4(w, res, dst_ptr, dst_stride, round_const_v); \ |
155 | 44.0k | dst_ptr += 2 * dst_stride; \ |
156 | 44.0k | } |
157 | | |
// 4-tap vertical pass for narrow blocks; im_block holds 4 int16 per row.
//
// Expands in the caller's scope and relies on: i, h, w, im_block, coeffs_v,
// dst_ptr, dst_stride, round_const_v. It declares s[4], ss[2], src_0 and
// src_1 in that scope and advances dst_ptr by two rows per iteration.
//
// Rows 0..2 are loaded up front to build ss[0] (rows 0|1 interleaved in the
// low lane, rows 1|2 in the high lane). Each iteration loads rows i+3 and
// i+4, builds ss[1] the same way from rows i+2..i+4, calls
// convolve_4tap(ss, coeffs_v), rounds/stores two output rows, and then slides
// the window with ss[0] = ss[1]. s[2] carries row i+4 into the next iteration.
// Note that the last iteration reads row h + 2 of im_block.
158 | | #define CONVOLVE_SR_VER_FILTER_4TAP_W4 \ |
159 | 377k | __m128i s[4]; \ |
160 | 377k | __m256i ss[2]; \ |
161 | 377k | s[0] = _mm_loadl_epi64((__m128i *)(im_block + 0 * 4)); \ |
162 | 377k | s[1] = _mm_loadl_epi64((__m128i *)(im_block + 1 * 4)); \ |
163 | 377k | s[2] = _mm_loadl_epi64((__m128i *)(im_block + 2 * 4)); \ |
164 | 377k | \ |
165 | 377k | const __m256i src_0 = _mm256_setr_m128i(s[0], s[1]); \ |
166 | 377k | const __m256i src_1 = _mm256_setr_m128i(s[1], s[2]); \ |
167 | 377k | \ |
168 | 377k | ss[0] = _mm256_unpacklo_epi16(src_0, src_1); \ |
169 | 377k | \ |
170 | 1.08M | for (i = 0; i < h; i += 2) { \ |
171 | 708k | const int16_t *data = &im_block[i * 4]; \ |
172 | 708k | s[3] = _mm_loadl_epi64((__m128i *)(data + 3 * 4)); \ |
173 | 708k | const __m256i src_2 = _mm256_setr_m128i(s[2], s[3]); \ |
174 | 708k | s[2] = _mm_loadl_epi64((__m128i *)(data + 4 * 4)); \ |
175 | 708k | const __m256i src_3 = _mm256_setr_m128i(s[3], s[2]); \ |
176 | 708k | ss[1] = _mm256_unpacklo_epi16(src_2, src_3); \ |
177 | 708k | \ |
178 | 708k | const __m256i res = convolve_4tap(ss, coeffs_v); \ |
179 | 708k | \ |
180 | 708k | sr_2d_ver_round_and_store_w4(w, res, dst_ptr, dst_stride, round_const_v); \ |
181 | 708k | dst_ptr += 2 * dst_stride; \ |
182 | 708k | \ |
183 | 708k | ss[0] = ss[1]; \ |
184 | 708k | } |
185 | | |
// 6-tap vertical pass for narrow blocks; im_block holds 4 int16 per row.
//
// Expands in the caller's scope and relies on: i, h, w, im_block, coeffs_v,
// dst_ptr, dst_stride, round_const_v. It declares s[6], ss[3] and
// src_0..src_3 in that scope and advances dst_ptr by two rows per iteration.
//
// Rows 0..4 are loaded up front to build ss[0] and ss[1] (each holds one row
// pair interleaved in the low lane and the pair one row later in the high
// lane). Each iteration loads rows i+5 and i+6 to build ss[2], calls
// convolve_6tap(ss, coeffs_v), rounds/stores two output rows, and slides the
// window (ss[0] = ss[1]; ss[1] = ss[2]). s[4] carries row i+6 forward.
// Note that the last iteration reads row h + 4 of im_block.
186 | | #define CONVOLVE_SR_VER_FILTER_6TAP_W4 \ |
187 | 181k | __m128i s[6]; \ |
188 | 181k | __m256i ss[3]; \ |
189 | 181k | s[0] = _mm_loadl_epi64((__m128i *)(im_block + 0 * 4)); \ |
190 | 181k | s[1] = _mm_loadl_epi64((__m128i *)(im_block + 1 * 4)); \ |
191 | 181k | s[2] = _mm_loadl_epi64((__m128i *)(im_block + 2 * 4)); \ |
192 | 181k | s[3] = _mm_loadl_epi64((__m128i *)(im_block + 3 * 4)); \ |
193 | 181k | s[4] = _mm_loadl_epi64((__m128i *)(im_block + 4 * 4)); \ |
194 | 181k | \ |
195 | 181k | const __m256i src_0 = _mm256_setr_m128i(s[0], s[1]); \ |
196 | 181k | const __m256i src_1 = _mm256_setr_m128i(s[1], s[2]); \ |
197 | 181k | const __m256i src_2 = _mm256_setr_m128i(s[2], s[3]); \ |
198 | 181k | const __m256i src_3 = _mm256_setr_m128i(s[3], s[4]); \ |
199 | 181k | \ |
200 | 181k | ss[0] = _mm256_unpacklo_epi16(src_0, src_1); \ |
201 | 181k | ss[1] = _mm256_unpacklo_epi16(src_2, src_3); \ |
202 | 181k | \ |
203 | 1.09M | for (i = 0; i < h; i += 2) { \ |
204 | 914k | const int16_t *data = &im_block[i * 4]; \ |
205 | 914k | s[5] = _mm_loadl_epi64((__m128i *)(data + 5 * 4)); \ |
206 | 914k | const __m256i src_4 = _mm256_setr_m128i(s[4], s[5]); \ |
207 | 914k | s[4] = _mm_loadl_epi64((__m128i *)(data + 6 * 4)); \ |
208 | 914k | const __m256i src_5 = _mm256_setr_m128i(s[5], s[4]); \ |
209 | 914k | ss[2] = _mm256_unpacklo_epi16(src_4, src_5); \ |
210 | 914k | \ |
211 | 914k | const __m256i res = convolve_6tap(ss, coeffs_v); \ |
212 | 914k | \ |
213 | 914k | sr_2d_ver_round_and_store_w4(w, res, dst_ptr, dst_stride, round_const_v); \ |
214 | 914k | dst_ptr += 2 * dst_stride; \ |
215 | 914k | \ |
216 | 914k | ss[0] = ss[1]; \ |
217 | 914k | ss[1] = ss[2]; \ |
218 | 914k | } |
219 | | |
// 8-tap vertical pass for narrow blocks; im_block holds 4 int16 per row.
//
// Expands in the caller's scope and relies on: i, h, w, im_block, coeffs_v,
// dst_ptr, dst_stride, round_const_v. It declares s[8], ss[4] and
// src_0..src_5 in that scope and advances dst_ptr by two rows per iteration.
//
// Rows 0..6 are loaded up front to build ss[0..2] (each holds one row pair
// interleaved in the low lane and the pair one row later in the high lane).
// Each iteration loads rows i+7 and i+8 to build ss[3], calls
// convolve(ss, coeffs_v), rounds/stores two output rows, and slides the window
// (ss[0] = ss[1]; ss[1] = ss[2]; ss[2] = ss[3]). s[6] carries row i+8 forward.
// Note that the last iteration reads row h + 6 of im_block.
220 | | #define CONVOLVE_SR_VER_FILTER_8TAP_W4 \ |
221 | 9.94k | __m128i s[8]; \ |
222 | 9.94k | __m256i ss[4]; \ |
223 | 9.94k | s[0] = _mm_loadl_epi64((__m128i *)(im_block + 0 * 4)); \ |
224 | 9.94k | s[1] = _mm_loadl_epi64((__m128i *)(im_block + 1 * 4)); \ |
225 | 9.94k | s[2] = _mm_loadl_epi64((__m128i *)(im_block + 2 * 4)); \ |
226 | 9.94k | s[3] = _mm_loadl_epi64((__m128i *)(im_block + 3 * 4)); \ |
227 | 9.94k | s[4] = _mm_loadl_epi64((__m128i *)(im_block + 4 * 4)); \ |
228 | 9.94k | s[5] = _mm_loadl_epi64((__m128i *)(im_block + 5 * 4)); \ |
229 | 9.94k | s[6] = _mm_loadl_epi64((__m128i *)(im_block + 6 * 4)); \ |
230 | 9.94k | \ |
231 | 9.94k | const __m256i src_0 = _mm256_setr_m128i(s[0], s[1]); \ |
232 | 9.94k | const __m256i src_1 = _mm256_setr_m128i(s[1], s[2]); \ |
233 | 9.94k | const __m256i src_2 = _mm256_setr_m128i(s[2], s[3]); \ |
234 | 9.94k | const __m256i src_3 = _mm256_setr_m128i(s[3], s[4]); \ |
235 | 9.94k | const __m256i src_4 = _mm256_setr_m128i(s[4], s[5]); \ |
236 | 9.94k | const __m256i src_5 = _mm256_setr_m128i(s[5], s[6]); \ |
237 | 9.94k | \ |
238 | 9.94k | ss[0] = _mm256_unpacklo_epi16(src_0, src_1); \ |
239 | 9.94k | ss[1] = _mm256_unpacklo_epi16(src_2, src_3); \ |
240 | 9.94k | ss[2] = _mm256_unpacklo_epi16(src_4, src_5); \ |
241 | 9.94k | \ |
242 | 60.5k | for (i = 0; i < h; i += 2) { \ |
243 | 50.6k | const int16_t *data = &im_block[i * 4]; \ |
244 | 50.6k | s[7] = _mm_loadl_epi64((__m128i *)(data + 7 * 4)); \ |
245 | 50.6k | const __m256i src_6 = _mm256_setr_m128i(s[6], s[7]); \ |
246 | 50.6k | s[6] = _mm_loadl_epi64((__m128i *)(data + 8 * 4)); \ |
247 | 50.6k | const __m256i src_7 = _mm256_setr_m128i(s[7], s[6]); \ |
248 | 50.6k | ss[3] = _mm256_unpacklo_epi16(src_6, src_7); \ |
249 | 50.6k | \ |
250 | 50.6k | const __m256i res = convolve(ss, coeffs_v); \ |
251 | 50.6k | \ |
252 | 50.6k | sr_2d_ver_round_and_store_w4(w, res, dst_ptr, dst_stride, round_const_v); \ |
253 | 50.6k | dst_ptr += 2 * dst_stride; \ |
254 | 50.6k | \ |
255 | 50.6k | ss[0] = ss[1]; \ |
256 | 50.6k | ss[1] = ss[2]; \ |
257 | 50.6k | ss[2] = ss[3]; \ |
258 | 50.6k | } |
259 | | |
// Horizontal pass for one column strip starting at column j, writing int16
// results into im_block at im_block[i * im_stride].
//
// CONVOLVE_LOWBD is the 256-bit convolve helper to call; it is invoked as
// CONVOLVE_LOWBD(data, coeffs_h, filt).
//
// Expands in the caller's scope and relies on: i, j, im_h, src_ptr,
// src_stride, coeffs_h, filt, round_const_h, im_block, im_stride.
//
// The loop handles two source rows per iteration while i < im_h - 2: row i is
// loaded (16 unaligned bytes) into the low 128-bit lane and row i+1 into the
// high lane, the result is rounded as (res + round_const_h) >> 2 and written
// with one aligned 256-bit store. The statements after the loop process one
// more row at the final value of i.
//
// NOTE(review): the tail loads only the low 128-bit lane
// (_mm256_castsi128_si256) but still stores a full 256 bits, so the upper
// half of that store holds whatever the helper computes from an unspecified
// lane; presumably im_block has room for it and it is never read -- confirm.
// NOTE(review): the expansion is not wrapped in do { } while (0) and declares
// data_1 and res in the caller's scope.
260 | | #define CONVOLVE_SR_HORIZONTAL_FILTER(CONVOLVE_LOWBD) \ |
261 | | for (i = 0; i < (im_h - 2); i += 2) { \ |
262 | | __m256i data = _mm256_castsi128_si256( \ |
263 | | _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j])); \ |
264 | | data = _mm256_inserti128_si256( \ |
265 | | data, \ |
266 | | _mm_loadu_si128( \ |
267 | | (__m128i *)&src_ptr[(i * src_stride) + j + src_stride]), \ |
268 | | 1); \ |
269 | | __m256i res = CONVOLVE_LOWBD(data, coeffs_h, filt); \ |
270 | | res = _mm256_srai_epi16(_mm256_add_epi16(res, round_const_h), 2); \ |
271 | | _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); \ |
272 | | } \ |
273 | | __m256i data_1 = _mm256_castsi128_si256( \ |
274 | | _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j])); \ |
275 | | __m256i res = CONVOLVE_LOWBD(data_1, coeffs_h, filt); \ |
276 | | res = _mm256_srai_epi16(_mm256_add_epi16(res, round_const_h), 2); \ |
277 | | _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); |
278 | | |
// Convenience wrappers: CONVOLVE_SR_HORIZONTAL_FILTER bound to the 2-, 4-, 6-
// and 8-tap 256-bit helpers (convolve_lowbd_x_2tap, convolve_lowbd_x_4tap,
// convolve_lowbd_x_6tap, convolve_lowbd_x), all defined outside this excerpt.
279 | | #define CONVOLVE_SR_HORIZONTAL_FILTER_2TAP \ |
280 | | CONVOLVE_SR_HORIZONTAL_FILTER(convolve_lowbd_x_2tap) |
281 | | |
282 | | #define CONVOLVE_SR_HORIZONTAL_FILTER_4TAP \ |
283 | | CONVOLVE_SR_HORIZONTAL_FILTER(convolve_lowbd_x_4tap) |
284 | | |
285 | | #define CONVOLVE_SR_HORIZONTAL_FILTER_6TAP \ |
286 | | CONVOLVE_SR_HORIZONTAL_FILTER(convolve_lowbd_x_6tap) |
287 | | |
288 | | #define CONVOLVE_SR_HORIZONTAL_FILTER_8TAP \ |
289 | | CONVOLVE_SR_HORIZONTAL_FILTER(convolve_lowbd_x) |
290 | | |
// Rounds two vectors of 32-bit vertical-filter sums and writes two 8-pixel
// output rows.
//
//   res_a, res_b  - 32-bit sums; after packing, the low 128-bit lane of the
//                   combined result goes to the row at dst and the high lane
//                   to the row at dst + dst_stride
//   dst           - first output row
//   dst_stride    - distance in bytes between the two output rows
//   round_const_v - value added to every sum before the shift
//
// Steps: (x + round_const_v) >> 11 (arithmetic) for both inputs, pack the
// pair to 16 bits with signed saturation, pack to 8 bits with unsigned
// saturation, then store the low 8 bytes of each 128-bit lane.
//
// The text fused onto the closing-brace row is the coverage tool's
// per-translation-unit instantiation listing, not part of the function.
291 | | static inline void sr_2d_ver_round_and_store(__m256i res_a, __m256i res_b, |
292 | | uint8_t *dst, int dst_stride, |
293 | 11.8M | __m256i round_const_v) { |
294 | 11.8M | const __m256i res_a_round = |
295 | 11.8M | _mm256_srai_epi32(_mm256_add_epi32(res_a, round_const_v), 11); |
296 | 11.8M | const __m256i res_b_round = |
297 | 11.8M | _mm256_srai_epi32(_mm256_add_epi32(res_b, round_const_v), 11); |
298 | 11.8M | const __m256i r16 = _mm256_packs_epi32(res_a_round, res_b_round); |
299 | 11.8M | const __m256i r8 = _mm256_packus_epi16(r16, r16); |
300 | | |
301 | 11.8M | _mm_storel_epi64((__m128i *)dst, _mm256_castsi256_si128(r8)); |
302 | 11.8M | _mm_storel_epi64((__m128i *)(dst + dst_stride), |
303 | 11.8M | _mm256_extracti128_si256(r8, 1)); |
304 | 11.8M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:sr_2d_ver_round_and_store Unexecuted instantiation: highbd_convolve_avx2.c:sr_2d_ver_round_and_store convolve_2d_avx2.c:sr_2d_ver_round_and_store Line | Count | Source | 293 | 11.8M | __m256i round_const_v) { | 294 | 11.8M | const __m256i res_a_round = | 295 | 11.8M | _mm256_srai_epi32(_mm256_add_epi32(res_a, round_const_v), 11); | 296 | 11.8M | const __m256i res_b_round = | 297 | 11.8M | _mm256_srai_epi32(_mm256_add_epi32(res_b, round_const_v), 11); | 298 | 11.8M | const __m256i r16 = _mm256_packs_epi32(res_a_round, res_b_round); | 299 | 11.8M | const __m256i r8 = _mm256_packus_epi16(r16, r16); | 300 | | | 301 | 11.8M | _mm_storel_epi64((__m128i *)dst, _mm256_castsi256_si128(r8)); | 302 | 11.8M | _mm_storel_epi64((__m128i *)(dst + dst_stride), | 303 | | _mm256_extracti128_si256(r8, 1)); | 304 | 11.8M | } |
Unexecuted instantiation: convolve_avx2.c:sr_2d_ver_round_and_store Unexecuted instantiation: jnt_convolve_avx2.c:sr_2d_ver_round_and_store Unexecuted instantiation: wiener_convolve_avx2.c:sr_2d_ver_round_and_store Unexecuted instantiation: highbd_convolve_2d_avx2.c:sr_2d_ver_round_and_store Unexecuted instantiation: highbd_jnt_convolve_avx2.c:sr_2d_ver_round_and_store |
305 | | |
// 2-tap vertical pass over one strip of im_block.
//
// Expands in the caller's scope and relies on: i, h, im_block, im_stride,
// coeffs_v, dst_ptr, dst_stride, round_const_v. dst_ptr is advanced by two
// rows per iteration.
//
// Each iteration does two unaligned 256-bit loads, at data + 0 * im_stride
// and data + 1 * im_stride where data = &im_block[i * im_stride], interleaves
// their 16-bit elements (unpacklo -> s[0], unpackhi -> s[1]), multiplies and
// adds both against coeffs_v[0], and hands the two sums to
// sr_2d_ver_round_and_store, which writes two output rows.
// Presumably each 256-bit load spans two consecutive im_block rows (one per
// 128-bit lane) -- this depends on im_stride, which is not visible here.
306 | | #define CONVOLVE_SR_VERTICAL_FILTER_2TAP \ |
307 | 433k | for (i = 0; i < h; i += 2) { \ |
308 | 403k | __m256i s[2]; \ |
309 | 403k | const int16_t *data = &im_block[i * im_stride]; \ |
310 | 403k | const __m256i s1 = _mm256_loadu_si256((__m256i *)(data + 0 * im_stride)); \ |
311 | 403k | const __m256i s2 = _mm256_loadu_si256((__m256i *)(data + 1 * im_stride)); \ |
312 | 403k | s[0] = _mm256_unpacklo_epi16(s1, s2); \ |
313 | 403k | s[1] = _mm256_unpackhi_epi16(s1, s2); \ |
314 | 403k | \ |
315 | 403k | __m256i res_a = _mm256_madd_epi16(s[0], coeffs_v[0]); \ |
316 | 403k | __m256i res_b = _mm256_madd_epi16(s[1], coeffs_v[0]); \ |
317 | 403k | \ |
318 | 403k | sr_2d_ver_round_and_store(res_a, res_b, dst_ptr, dst_stride, \ |
319 | 403k | round_const_v); \ |
320 | 403k | dst_ptr += 2 * dst_stride; \ |
321 | 403k | } |
322 | | |
// 4-tap vertical pass over one strip of im_block.
//
// Expands in the caller's scope and relies on: i, h, im_block, im_stride,
// coeffs_v, dst_ptr, dst_stride, round_const_v. It declares s[6], src_0 and
// src_1 in that scope and advances dst_ptr by two rows per iteration.
//
// s[0]/s[1] hold the unpacklo (low-half) operands and s[2]/s[3] the unpackhi
// (high-half) operands, so convolve_4tap(s, ...) and convolve_4tap(s + 2, ...)
// produce the two sums passed to sr_2d_ver_round_and_store. The loads at
// offsets 0 and 1 (in units of im_stride) are done up front; each iteration
// loads offsets i+2 and i+3 and then slides the window (s[0] = s[1];
// s[2] = s[3]).
// NOTE(review): s is declared with 6 elements but only s[0..3] are used.
323 | | #define CONVOLVE_SR_VERTICAL_FILTER_4TAP \ |
324 | 546k | __m256i s[6]; \ |
325 | 546k | __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); \ |
326 | 546k | __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); \ |
327 | 546k | \ |
328 | 546k | s[0] = _mm256_unpacklo_epi16(src_0, src_1); \ |
329 | 546k | s[2] = _mm256_unpackhi_epi16(src_0, src_1); \ |
330 | 546k | \ |
331 | 2.09M | for (i = 0; i < h; i += 2) { \ |
332 | 1.54M | const int16_t *data = &im_block[i * im_stride]; \ |
333 | 1.54M | const __m256i s4 = _mm256_loadu_si256((__m256i *)(data + 2 * im_stride)); \ |
334 | 1.54M | const __m256i s5 = _mm256_loadu_si256((__m256i *)(data + 3 * im_stride)); \ |
335 | 1.54M | s[1] = _mm256_unpacklo_epi16(s4, s5); \ |
336 | 1.54M | s[3] = _mm256_unpackhi_epi16(s4, s5); \ |
337 | 1.54M | \ |
338 | 1.54M | __m256i res_a = convolve_4tap(s, coeffs_v); \ |
339 | 1.54M | __m256i res_b = convolve_4tap(s + 2, coeffs_v); \ |
340 | 1.54M | \ |
341 | 1.54M | sr_2d_ver_round_and_store(res_a, res_b, dst_ptr, dst_stride, \ |
342 | 1.54M | round_const_v); \ |
343 | 1.54M | dst_ptr += 2 * dst_stride; \ |
344 | 1.54M | \ |
345 | 1.54M | s[0] = s[1]; \ |
346 | 1.54M | s[2] = s[3]; \ |
347 | 1.54M | } |
348 | | |
// 6-tap vertical pass over one strip of im_block.
//
// Expands in the caller's scope and relies on: i, h, im_block, im_stride,
// coeffs_v, dst_ptr, dst_stride, round_const_v. It declares s[8] and
// src_0..src_3 in that scope and advances dst_ptr by two rows per iteration.
//
// s[0..2] hold the unpacklo (low-half) operands and s[3..5] the unpackhi
// (high-half) operands, so convolve_6tap(s, ...) and convolve_6tap(s + 3, ...)
// produce the two sums passed to sr_2d_ver_round_and_store. The loads at
// offsets 0..3 (in units of im_stride) are done up front; each iteration loads
// offsets i+4 and i+5 and then slides both windows down by one slot.
// NOTE(review): s is declared with 8 elements but only s[0..5] are used.
349 | | #define CONVOLVE_SR_VERTICAL_FILTER_6TAP \ |
350 | 766k | __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); \ |
351 | 766k | __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); \ |
352 | 766k | __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); \ |
353 | 766k | __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); \ |
354 | 766k | \ |
355 | 766k | __m256i s[8]; \ |
356 | 766k | s[0] = _mm256_unpacklo_epi16(src_0, src_1); \ |
357 | 766k | s[1] = _mm256_unpacklo_epi16(src_2, src_3); \ |
358 | 766k | \ |
359 | 766k | s[3] = _mm256_unpackhi_epi16(src_0, src_1); \ |
360 | 766k | s[4] = _mm256_unpackhi_epi16(src_2, src_3); \ |
361 | 766k | \ |
362 | 9.24M | for (i = 0; i < h; i += 2) { \ |
363 | 8.48M | const int16_t *data = &im_block[i * im_stride]; \ |
364 | 8.48M | \ |
365 | 8.48M | const __m256i s6 = _mm256_loadu_si256((__m256i *)(data + 4 * im_stride)); \ |
366 | 8.48M | const __m256i s7 = _mm256_loadu_si256((__m256i *)(data + 5 * im_stride)); \ |
367 | 8.48M | \ |
368 | 8.48M | s[2] = _mm256_unpacklo_epi16(s6, s7); \ |
369 | 8.48M | s[5] = _mm256_unpackhi_epi16(s6, s7); \ |
370 | 8.48M | \ |
371 | 8.48M | __m256i res_a = convolve_6tap(s, coeffs_v); \ |
372 | 8.48M | __m256i res_b = convolve_6tap(s + 3, coeffs_v); \ |
373 | 8.48M | \ |
374 | 8.48M | sr_2d_ver_round_and_store(res_a, res_b, dst_ptr, dst_stride, \ |
375 | 8.48M | round_const_v); \ |
376 | 8.48M | dst_ptr += 2 * dst_stride; \ |
377 | 8.48M | \ |
378 | 8.48M | s[0] = s[1]; \ |
379 | 8.48M | s[1] = s[2]; \ |
380 | 8.48M | \ |
381 | 8.48M | s[3] = s[4]; \ |
382 | 8.48M | s[4] = s[5]; \ |
383 | 8.48M | } |
384 | | |
// 8-tap vertical pass over one strip of im_block.
//
// Expands in the caller's scope and relies on: i, h, im_block, im_stride,
// coeffs_v, dst_ptr, dst_stride, round_const_v. It declares s[8] and
// src_0..src_5 in that scope and advances dst_ptr by two rows per iteration.
//
// s[0..3] hold the unpacklo (low-half) operands and s[4..7] the unpackhi
// (high-half) operands, so convolve(s, ...) and convolve(s + 4, ...) produce
// the two sums passed to sr_2d_ver_round_and_store. The loads at offsets 0..5
// (in units of im_stride) are done up front; each iteration loads offsets i+6
// and i+7 and then slides both windows down by one slot.
385 | | #define CONVOLVE_SR_VERTICAL_FILTER_8TAP \ |
386 | 115k | __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); \ |
387 | 115k | __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); \ |
388 | 115k | __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); \ |
389 | 115k | __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); \ |
390 | 115k | __m256i src_4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride)); \ |
391 | 115k | __m256i src_5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride)); \ |
392 | 115k | \ |
393 | 115k | __m256i s[8]; \ |
394 | 115k | s[0] = _mm256_unpacklo_epi16(src_0, src_1); \ |
395 | 115k | s[1] = _mm256_unpacklo_epi16(src_2, src_3); \ |
396 | 115k | s[2] = _mm256_unpacklo_epi16(src_4, src_5); \ |
397 | 115k | \ |
398 | 115k | s[4] = _mm256_unpackhi_epi16(src_0, src_1); \ |
399 | 115k | s[5] = _mm256_unpackhi_epi16(src_2, src_3); \ |
400 | 115k | s[6] = _mm256_unpackhi_epi16(src_4, src_5); \ |
401 | 115k | \ |
402 | 1.53M | for (i = 0; i < h; i += 2) { \ |
403 | 1.41M | const int16_t *data = &im_block[i * im_stride]; \ |
404 | 1.41M | \ |
405 | 1.41M | const __m256i s6 = _mm256_loadu_si256((__m256i *)(data + 6 * im_stride)); \ |
406 | 1.41M | const __m256i s7 = _mm256_loadu_si256((__m256i *)(data + 7 * im_stride)); \ |
407 | 1.41M | \ |
408 | 1.41M | s[3] = _mm256_unpacklo_epi16(s6, s7); \ |
409 | 1.41M | s[7] = _mm256_unpackhi_epi16(s6, s7); \ |
410 | 1.41M | \ |
411 | 1.41M | __m256i res_a = convolve(s, coeffs_v); \ |
412 | 1.41M | __m256i res_b = convolve(s + 4, coeffs_v); \ |
413 | 1.41M | \ |
414 | 1.41M | sr_2d_ver_round_and_store(res_a, res_b, dst_ptr, dst_stride, \ |
415 | 1.41M | round_const_v); \ |
416 | 1.41M | dst_ptr += 2 * dst_stride; \ |
417 | 1.41M | \ |
418 | 1.41M | s[0] = s[1]; \ |
419 | 1.41M | s[1] = s[2]; \ |
420 | 1.41M | s[2] = s[3]; \ |
421 | 1.41M | \ |
422 | 1.41M | s[4] = s[5]; \ |
423 | 1.41M | s[5] = s[6]; \ |
424 | 1.41M | s[6] = s[7]; \ |
425 | 1.41M | } |
426 | | |
// 12-tap vertical pass over one strip of im_block. Every row of this macro
// has a hit count of 0 in this coverage report, i.e. it was never executed.
//
// Expands in the caller's scope and relies on: i, j, h, w, s, im_block,
// im_stride, coeffs_v, sum_round_v, sum_shift_v, round_const_v,
// round_shift_v, dst, dst_stride. Unlike the 4/6/8-tap variants it does NOT
// declare s; the caller must provide an __m256i array with at least 12
// elements. It also writes through dst[i * dst_stride + j] rather than an
// advancing dst_ptr.
//
// s[0..5] hold the unpacklo (low-half) operands and s[6..11] the unpackhi
// (high-half) operands for convolve_12taps(s, ...) / convolve_12taps(s + 6,
// ...). The loads at offsets 0..9 (in units of im_stride) are done up front;
// each iteration loads offsets i+10 and i+11 (into locals that are named s6
// and s7) and slides both windows down by one slot at the end.
//
// Rounding is done in two stages, each an add followed by a variable-count
// arithmetic shift (_mm256_sra_epi32): first sum_round_v / sum_shift_v, then
// round_const_v / round_shift_v. The result is packed to 16 bits (signed
// saturation) and 8 bits (unsigned saturation) and stored per row as 8 bytes
// when w - j > 4, 4 bytes when w == 4, otherwise 2 bytes.
//
// NOTE(review): the first width test uses w - j while the second uses w; any
// case with w - j <= 4 and w != 4 falls into the 2-byte store. Presumably
// only w == 2 reaches that branch -- confirm at the call sites.
// NOTE(review): the 2-byte store goes through a uint16_t * cast.
427 | | #define CONVOLVE_SR_VERTICAL_FILTER_12TAP \ |
428 | 0 | __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); \ |
429 | 0 | __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); \ |
430 | 0 | __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); \ |
431 | 0 | __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); \ |
432 | 0 | __m256i src_4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride)); \ |
433 | 0 | __m256i src_5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride)); \ |
434 | 0 | __m256i src_6 = _mm256_loadu_si256((__m256i *)(im_block + 6 * im_stride)); \ |
435 | 0 | __m256i src_7 = _mm256_loadu_si256((__m256i *)(im_block + 7 * im_stride)); \ |
436 | 0 | __m256i src_8 = _mm256_loadu_si256((__m256i *)(im_block + 8 * im_stride)); \ |
437 | 0 | __m256i src_9 = _mm256_loadu_si256((__m256i *)(im_block + 9 * im_stride)); \ |
438 | 0 | \ |
439 | 0 | s[0] = _mm256_unpacklo_epi16(src_0, src_1); \ |
440 | 0 | s[1] = _mm256_unpacklo_epi16(src_2, src_3); \ |
441 | 0 | s[2] = _mm256_unpacklo_epi16(src_4, src_5); \ |
442 | 0 | s[3] = _mm256_unpacklo_epi16(src_6, src_7); \ |
443 | 0 | s[4] = _mm256_unpacklo_epi16(src_8, src_9); \ |
444 | 0 | \ |
445 | 0 | s[6] = _mm256_unpackhi_epi16(src_0, src_1); \ |
446 | 0 | s[7] = _mm256_unpackhi_epi16(src_2, src_3); \ |
447 | 0 | s[8] = _mm256_unpackhi_epi16(src_4, src_5); \ |
448 | 0 | s[9] = _mm256_unpackhi_epi16(src_6, src_7); \ |
449 | 0 | s[10] = _mm256_unpackhi_epi16(src_8, src_9); \ |
450 | 0 | \ |
451 | 0 | for (i = 0; i < h; i += 2) { \ |
452 | 0 | const int16_t *data = &im_block[i * im_stride]; \ |
453 | 0 | \ |
454 | 0 | const __m256i s6 = _mm256_loadu_si256((__m256i *)(data + 10 * im_stride)); \ |
455 | 0 | const __m256i s7 = _mm256_loadu_si256((__m256i *)(data + 11 * im_stride)); \ |
456 | 0 | \ |
457 | 0 | s[5] = _mm256_unpacklo_epi16(s6, s7); \ |
458 | 0 | s[11] = _mm256_unpackhi_epi16(s6, s7); \ |
459 | 0 | \ |
460 | 0 | __m256i res_a = convolve_12taps(s, coeffs_v); \ |
461 | 0 | __m256i res_b = convolve_12taps(s + 6, coeffs_v); \ |
462 | 0 | \ |
463 | 0 | res_a = \ |
464 | 0 | _mm256_sra_epi32(_mm256_add_epi32(res_a, sum_round_v), sum_shift_v); \ |
465 | 0 | res_b = \ |
466 | 0 | _mm256_sra_epi32(_mm256_add_epi32(res_b, sum_round_v), sum_shift_v); \ |
467 | 0 | \ |
468 | 0 | const __m256i res_a_round = _mm256_sra_epi32( \ |
469 | 0 | _mm256_add_epi32(res_a, round_const_v), round_shift_v); \ |
470 | 0 | const __m256i res_b_round = _mm256_sra_epi32( \ |
471 | 0 | _mm256_add_epi32(res_b, round_const_v), round_shift_v); \ |
472 | 0 | \ |
473 | 0 | const __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round); \ |
474 | 0 | const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit); \ |
475 | 0 | \ |
476 | 0 | const __m128i res_0 = _mm256_castsi256_si128(res_8b); \ |
477 | 0 | const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1); \ |
478 | 0 | \ |
479 | 0 | __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j]; \ |
480 | 0 | __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride]; \ |
481 | 0 | if (w - j > 4) { \ |
482 | 0 | _mm_storel_epi64(p_0, res_0); \ |
483 | 0 | _mm_storel_epi64(p_1, res_1); \ |
484 | 0 | } else if (w == 4) { \ |
485 | 0 | xx_storel_32(p_0, res_0); \ |
486 | 0 | xx_storel_32(p_1, res_1); \ |
487 | 0 | } else { \ |
488 | 0 | *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0); \ |
489 | 0 | *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1); \ |
490 | 0 | } \ |
491 | 0 | \ |
492 | 0 | s[0] = s[1]; \ |
493 | 0 | s[1] = s[2]; \ |
494 | 0 | s[2] = s[3]; \ |
495 | 0 | s[3] = s[4]; \ |
496 | 0 | s[4] = s[5]; \ |
497 | 0 | \ |
498 | 0 | s[6] = s[7]; \ |
499 | 0 | s[7] = s[8]; \ |
500 | 0 | s[8] = s[9]; \ |
501 | 0 | s[9] = s[10]; \ |
502 | 0 | s[10] = s[11]; \ |
503 | 0 | } |
504 | | |
// Writes one pair of result rows (rows i and i+1, columns starting at j_off)
// of a compound convolve.
//
//   res_unsigned - __m256i of 16-bit results; the low 128-bit lane belongs to
//                  row i and the high lane to row i+1
//   j_off        - column offset of this strip
//
// Expands in the caller's scope and relies on: i, w, do_average, dst,
// dst_stride, dst0, dst_stride0, wt, use_dist_wtd_comp_avg, offset_const,
// rounding_const.
//
// When do_average is set: the matching two rows already stored in dst are
// loaded with load_line2_avx2 and combined with res_unsigned by comp_avg;
// offset_const is subtracted, the value is rounded as
// (x + rounding_const) >> 4 (arithmetic), packed to 8 bits with unsigned
// saturation, and written to dst0 -- 8 bytes per row when w - j_off > 4,
// otherwise 4 bytes per row.
// Otherwise: each 128-bit lane of res_unsigned is written to dst with an
// aligned 128-bit store, to be consumed by a later averaging pass (presumably
// dst is the 16-bit intermediate compound buffer -- its type is not visible
// here).
//
// NOTE(review): the 4-byte path stores through an (int *) cast of an address
// inside dst0; other code in this header uses xx_storel_32 for the same
// purpose. Confirm the alignment/aliasing assumptions.
505 | | #define JNT_CONVOLVE_PROCESS_OUTPUT(res_unsigned, j_off) \ |
506 | 2.65M | do { \ |
507 | 2.65M | if (do_average) { \ |
508 | 1.10M | const __m256i data_ref_0 = \ |
509 | 1.10M | load_line2_avx2(&dst[i * dst_stride + (j_off)], \ |
510 | 1.10M | &dst[i * dst_stride + (j_off) + dst_stride]); \ |
511 | 1.10M | const __m256i comp_avg_res = \ |
512 | 1.10M | comp_avg(&data_ref_0, &(res_unsigned), &wt, use_dist_wtd_comp_avg); \ |
513 | 1.10M | const __m256i res_signed = _mm256_sub_epi16(comp_avg_res, offset_const); \ |
514 | 1.10M | const __m256i round_result = \ |
515 | 1.10M | _mm256_srai_epi16(_mm256_add_epi16(res_signed, rounding_const), 4); \ |
516 | 1.10M | const __m256i res_8 = _mm256_packus_epi16(round_result, round_result); \ |
517 | 1.10M | const __m128i res_0 = _mm256_castsi256_si128(res_8); \ |
518 | 1.10M | const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); \ |
519 | 1.10M | if (w - (j_off) > 4) { \ |
520 | 1.06M | _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + (j_off)]), \ |
521 | 1.06M | res_0); \ |
522 | 1.06M | _mm_storel_epi64( \ |
523 | 1.06M | (__m128i *)(&dst0[i * dst_stride0 + (j_off) + dst_stride0]), \ |
524 | 1.06M | res_1); \ |
525 | 1.06M | } else { \ |
526 | 34.9k | *(int *)(&dst0[i * dst_stride0 + (j_off)]) = _mm_cvtsi128_si32(res_0); \ |
527 | 34.9k | *(int *)(&dst0[i * dst_stride0 + (j_off) + dst_stride0]) = \ |
528 | 34.9k | _mm_cvtsi128_si32(res_1); \ |
529 | 34.9k | } \ |
530 | 1.55M | } else { \ |
531 | 1.55M | const __m128i res_0 = _mm256_castsi256_si128(res_unsigned); \ |
532 | 1.55M | _mm_store_si128((__m128i *)(&dst[i * dst_stride + (j_off)]), res_0); \ |
533 | 1.55M | const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1); \ |
534 | 1.55M | _mm_store_si128( \ |
535 | 1.55M | (__m128i *)(&dst[i * dst_stride + (j_off) + dst_stride]), res_1); \ |
536 | 1.55M | } \ |
537 | 2.65M | } while (0) |
538 | | |
// Horizontal pass of the compound 2D convolve for one column strip.
//
//   src_h_start - pointer to the first source row of the strip
//   convolve_fn - 256-bit convolve helper, invoked as
//                 convolve_fn(data, coeffs, filt)
//   coeffs      - coefficient argument forwarded to convolve_fn
//
// Expands in the caller's scope and relies on: i, im_h, src_stride, filt,
// round_const_h, im_block, im_stride.
//
// Two source rows are loaded per iteration with load_line2_avx2, the source
// pointer advances by 2 * src_stride, and the result is rounded as
// (res + round_const_h) >> 2 (arithmetic) and written with one aligned
// 256-bit store at im_block[i * im_stride]. The loop runs while i < im_h in
// steps of 2, so with an odd im_h the final iteration also loads and stores
// the row after the last one -- presumably both buffers leave room for that;
// confirm at the call sites.
539 | | #define JNT_CONVOLVE_HORIZONTAL_FILTER(src_h_start, convolve_fn, coeffs) \ |
540 | 353k | do { \ |
541 | 353k | const uint8_t *src_h = (src_h_start); \ |
542 | 4.90M | for (i = 0; i < im_h; i += 2) { \ |
543 | 4.55M | const __m256i data = load_line2_avx2(src_h, src_h + src_stride); \ |
544 | 4.55M | src_h += (src_stride << 1); \ |
545 | 4.55M | __m256i res = convolve_fn(data, coeffs, filt); \ |
546 | 4.55M | res = _mm256_srai_epi16(_mm256_add_epi16(res, round_const_h), 2); \ |
547 | 4.55M | _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); \ |
548 | 4.55M | } \ |
549 | 353k | } while (0) |
550 | | |
// 8-tap vertical pass of the compound 2D convolve for one column strip.
//
// Expands in the caller's scope and relies on: i, j, h, w, im_block,
// im_stride, coeffs_y, round_const_v, offset_const, plus every name that
// JNT_CONVOLVE_PROCESS_OUTPUT needs.
//
// s[0..3] hold the unpacklo (low-half) operands and s[4..7] the unpackhi
// (high-half) operands. The loads at offsets 0..5 (in units of im_stride) are
// done up front; each iteration loads offsets i+6 and i+7, computes
// convolve(s, coeffs_y) and rounds it as (x + round_const_v) >> 7
// (arithmetic).
// When w - j > 4 the high half is computed too (convolve(s + 4, ...)) and
// packed together with the low half; otherwise the low half is packed with
// itself. offset_const is added to the packed 16-bit result before it is
// handed to JNT_CONVOLVE_PROCESS_OUTPUT. Both windows then slide down by one
// slot.
551 | | #define JNT_CONVOLVE_VERTICAL_FILTER_8TAP \ |
552 | 238k | do { \ |
553 | 238k | __m256i s[8]; \ |
554 | 238k | __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); \ |
555 | 238k | __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); \ |
556 | 238k | __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); \ |
557 | 238k | __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); \ |
558 | 238k | __m256i s4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride)); \ |
559 | 238k | __m256i s5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride)); \ |
560 | 238k | \ |
561 | 238k | s[0] = _mm256_unpacklo_epi16(s0, s1); \ |
562 | 238k | s[1] = _mm256_unpacklo_epi16(s2, s3); \ |
563 | 238k | s[2] = _mm256_unpacklo_epi16(s4, s5); \ |
564 | 238k | \ |
565 | 238k | s[4] = _mm256_unpackhi_epi16(s0, s1); \ |
566 | 238k | s[5] = _mm256_unpackhi_epi16(s2, s3); \ |
567 | 238k | s[6] = _mm256_unpackhi_epi16(s4, s5); \ |
568 | 238k | \ |
569 | 2.89M | for (i = 0; i < h; i += 2) { \ |
570 | 2.65M | const int16_t *data = &im_block[i * im_stride]; \ |
571 | 2.65M | \ |
572 | 2.65M | const __m256i s6 = \ |
573 | 2.65M | _mm256_loadu_si256((__m256i *)(data + 6 * im_stride)); \ |
574 | 2.65M | const __m256i s7 = \ |
575 | 2.65M | _mm256_loadu_si256((__m256i *)(data + 7 * im_stride)); \ |
576 | 2.65M | \ |
577 | 2.65M | s[3] = _mm256_unpacklo_epi16(s6, s7); \ |
578 | 2.65M | s[7] = _mm256_unpackhi_epi16(s6, s7); \ |
579 | 2.65M | \ |
580 | 2.65M | const __m256i res_a = convolve(s, coeffs_y); \ |
581 | 2.65M | const __m256i res_a_round = \ |
582 | 2.65M | _mm256_srai_epi32(_mm256_add_epi32(res_a, round_const_v), 7); \ |
583 | 2.65M | \ |
584 | 2.65M | if (w - j > 4) { \ |
585 | 2.57M | const __m256i res_b = convolve(s + 4, coeffs_y); \ |
586 | 2.57M | const __m256i res_b_round = \ |
587 | 2.57M | _mm256_srai_epi32(_mm256_add_epi32(res_b, round_const_v), 7); \ |
588 | 2.57M | const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_b_round); \ |
589 | 2.57M | const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const); \ |
590 | 2.57M | JNT_CONVOLVE_PROCESS_OUTPUT(res_unsigned, j); \ |
591 | 2.57M | } else { \ |
592 | 78.4k | const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_a_round); \ |
593 | 78.4k | const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const); \ |
594 | 78.4k | JNT_CONVOLVE_PROCESS_OUTPUT(res_unsigned, j); \ |
595 | 78.4k | } \ |
596 | 2.65M | \ |
597 | 2.65M | s[0] = s[1]; \ |
598 | 2.65M | s[1] = s[2]; \ |
599 | 2.65M | s[2] = s[3]; \ |
600 | 2.65M | \ |
601 | 2.65M | s[4] = s[5]; \ |
602 | 2.65M | s[5] = s[6]; \ |
603 | 2.65M | s[6] = s[7]; \ |
604 | 2.65M | } \ |
605 | 238k | } while (0) |
606 | | |
607 | | static inline void prepare_coeffs_2t_ssse3( |
608 | | const InterpFilterParams *const filter_params, const int32_t subpel_q4, |
609 | 34.7k | __m128i *const coeffs /* [4] */) { |
610 | 34.7k | const int16_t *const filter = av1_get_interp_filter_subpel_kernel( |
611 | 34.7k | filter_params, subpel_q4 & SUBPEL_MASK); |
612 | 34.7k | const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); |
613 | | |
614 | | // right shift all filter co-efficients by 1 to reduce the bits required. |
615 | | // This extra right shift will be taken care of at the end while rounding |
616 | | // the result. |
617 | | // Since all filter co-efficients are even, this change will not affect the |
618 | | // end result |
619 | 34.7k | assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), |
620 | 34.7k | _mm_set1_epi16((short)0xffff))); |
621 | | |
622 | 34.7k | const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1); |
623 | | |
624 | | // coeffs 3 4 3 4 3 4 3 4 |
625 | 34.7k | coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0806u)); |
626 | 34.7k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:prepare_coeffs_2t_ssse3 Unexecuted instantiation: highbd_convolve_avx2.c:prepare_coeffs_2t_ssse3 convolve_2d_avx2.c:prepare_coeffs_2t_ssse3 Line | Count | Source | 609 | 15.6k | __m128i *const coeffs /* [4] */) { | 610 | 15.6k | const int16_t *const filter = av1_get_interp_filter_subpel_kernel( | 611 | 15.6k | filter_params, subpel_q4 & SUBPEL_MASK); | 612 | 15.6k | const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); | 613 | | | 614 | | // right shift all filter co-efficients by 1 to reduce the bits required. | 615 | | // This extra right shift will be taken care of at the end while rounding | 616 | | // the result. | 617 | | // Since all filter co-efficients are even, this change will not affect the | 618 | | // end result | 619 | 15.6k | assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), | 620 | 15.6k | _mm_set1_epi16((short)0xffff))); | 621 | | | 622 | 15.6k | const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1); | 623 | | | 624 | | // coeffs 3 4 3 4 3 4 3 4 | 625 | 15.6k | coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0806u)); | 626 | 15.6k | } |
convolve_avx2.c:prepare_coeffs_2t_ssse3 Line | Count | Source | 609 | 19.0k | __m128i *const coeffs /* [4] */) { | 610 | 19.0k | const int16_t *const filter = av1_get_interp_filter_subpel_kernel( | 611 | 19.0k | filter_params, subpel_q4 & SUBPEL_MASK); | 612 | 19.0k | const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); | 613 | | | 614 | | // right shift all filter co-efficients by 1 to reduce the bits required. | 615 | | // This extra right shift will be taken care of at the end while rounding | 616 | | // the result. | 617 | | // Since all filter co-efficients are even, this change will not affect the | 618 | | // end result | 619 | 19.0k | assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), | 620 | 19.0k | _mm_set1_epi16((short)0xffff))); | 621 | | | 622 | 19.0k | const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1); | 623 | | | 624 | | // coeffs 3 4 3 4 3 4 3 4 | 625 | 19.0k | coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0806u)); | 626 | 19.0k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:prepare_coeffs_2t_ssse3 Unexecuted instantiation: wiener_convolve_avx2.c:prepare_coeffs_2t_ssse3 Unexecuted instantiation: highbd_convolve_2d_avx2.c:prepare_coeffs_2t_ssse3 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:prepare_coeffs_2t_ssse3 |
627 | | |
628 | | static inline void prepare_coeffs_4t_ssse3( |
629 | | const InterpFilterParams *const filter_params, const int32_t subpel_q4, |
630 | 854k | __m128i *const coeffs /* [4] */) { |
631 | 854k | const int16_t *const filter = av1_get_interp_filter_subpel_kernel( |
632 | 854k | filter_params, subpel_q4 & SUBPEL_MASK); |
633 | 854k | const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); |
634 | | |
635 | | // right shift all filter co-efficients by 1 to reduce the bits required. |
636 | | // This extra right shift will be taken care of at the end while rounding |
637 | | // the result. |
638 | | // Since all filter co-efficients are even, this change will not affect the |
639 | | // end result |
640 | 854k | assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), |
641 | 854k | _mm_set1_epi16((short)0xffff))); |
642 | | |
643 | 854k | const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1); |
644 | | |
645 | | // coeffs 2 3 2 3 2 3 2 3 |
646 | 854k | coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0604u)); |
647 | | // coeffs 4 5 4 5 4 5 4 5 |
648 | 854k | coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0a08u)); |
649 | 854k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:prepare_coeffs_4t_ssse3 Unexecuted instantiation: highbd_convolve_avx2.c:prepare_coeffs_4t_ssse3 convolve_2d_avx2.c:prepare_coeffs_4t_ssse3 Line | Count | Source | 630 | 568k | __m128i *const coeffs /* [4] */) { | 631 | 568k | const int16_t *const filter = av1_get_interp_filter_subpel_kernel( | 632 | 568k | filter_params, subpel_q4 & SUBPEL_MASK); | 633 | 568k | const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); | 634 | | | 635 | | // right shift all filter co-efficients by 1 to reduce the bits required. | 636 | | // This extra right shift will be taken care of at the end while rounding | 637 | | // the result. | 638 | | // Since all filter co-efficients are even, this change will not affect the | 639 | | // end result | 640 | 568k | assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), | 641 | 568k | _mm_set1_epi16((short)0xffff))); | 642 | | | 643 | 568k | const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1); | 644 | | | 645 | | // coeffs 2 3 2 3 2 3 2 3 | 646 | 568k | coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0604u)); | 647 | | // coeffs 4 5 4 5 4 5 4 5 | 648 | 568k | coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0a08u)); | 649 | 568k | } |
convolve_avx2.c:prepare_coeffs_4t_ssse3 Line | Count | Source | 630 | 285k | __m128i *const coeffs /* [4] */) { | 631 | 285k | const int16_t *const filter = av1_get_interp_filter_subpel_kernel( | 632 | 285k | filter_params, subpel_q4 & SUBPEL_MASK); | 633 | 285k | const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); | 634 | | | 635 | | // right shift all filter co-efficients by 1 to reduce the bits required. | 636 | | // This extra right shift will be taken care of at the end while rounding | 637 | | // the result. | 638 | | // Since all filter co-efficients are even, this change will not affect the | 639 | | // end result | 640 | 285k | assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), | 641 | 285k | _mm_set1_epi16((short)0xffff))); | 642 | | | 643 | 285k | const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1); | 644 | | | 645 | | // coeffs 2 3 2 3 2 3 2 3 | 646 | 285k | coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0604u)); | 647 | | // coeffs 4 5 4 5 4 5 4 5 | 648 | 285k | coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0a08u)); | 649 | 285k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:prepare_coeffs_4t_ssse3 Unexecuted instantiation: wiener_convolve_avx2.c:prepare_coeffs_4t_ssse3 Unexecuted instantiation: highbd_convolve_2d_avx2.c:prepare_coeffs_4t_ssse3 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:prepare_coeffs_4t_ssse3 |
650 | | |
651 | | static inline void prepare_coeffs_6t_ssse3( |
652 | | const InterpFilterParams *const filter_params, const int32_t subpel_q4, |
653 | 67.2k | __m128i *const coeffs /* [4] */) { |
654 | 67.2k | const int16_t *const filter = av1_get_interp_filter_subpel_kernel( |
655 | 67.2k | filter_params, subpel_q4 & SUBPEL_MASK); |
656 | 67.2k | const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); |
657 | | |
658 | | // right shift all filter co-efficients by 1 to reduce the bits required. |
659 | | // This extra right shift will be taken care of at the end while rounding |
660 | | // the result. |
661 | | // Since all filter co-efficients are even, this change will not affect the |
662 | | // end result |
663 | 67.2k | assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), |
664 | 67.2k | _mm_set1_epi16((short)0xffff))); |
665 | | |
666 | 67.2k | const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1); |
667 | | |
668 | | // coeffs 2 3 2 3 2 3 2 3 |
669 | 67.2k | coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0402u)); |
670 | | // coeffs 4 5 4 5 4 5 4 5 |
671 | 67.2k | coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0806u)); |
672 | | // coeffs 5 6 5 6 5 6 5 6 |
673 | 67.2k | coeffs[2] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0c0au)); |
674 | 67.2k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:prepare_coeffs_6t_ssse3 Unexecuted instantiation: highbd_convolve_avx2.c:prepare_coeffs_6t_ssse3 Unexecuted instantiation: convolve_2d_avx2.c:prepare_coeffs_6t_ssse3 convolve_avx2.c:prepare_coeffs_6t_ssse3 Line | Count | Source | 653 | 67.2k | __m128i *const coeffs /* [4] */) { | 654 | 67.2k | const int16_t *const filter = av1_get_interp_filter_subpel_kernel( | 655 | 67.2k | filter_params, subpel_q4 & SUBPEL_MASK); | 656 | 67.2k | const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); | 657 | | | 658 | | // right shift all filter co-efficients by 1 to reduce the bits required. | 659 | | // This extra right shift will be taken care of at the end while rounding | 660 | | // the result. | 661 | | // Since all filter co-efficients are even, this change will not affect the | 662 | | // end result | 663 | 67.2k | assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), | 664 | 67.2k | _mm_set1_epi16((short)0xffff))); | 665 | | | 666 | 67.2k | const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1); | 667 | | | 668 | | // coeffs 2 3 2 3 2 3 2 3 | 669 | 67.2k | coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0402u)); | 670 | | // coeffs 4 5 4 5 4 5 4 5 | 671 | 67.2k | coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0806u)); | 672 | | // coeffs 5 6 5 6 5 6 5 6 | 673 | 67.2k | coeffs[2] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0c0au)); | 674 | 67.2k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:prepare_coeffs_6t_ssse3 Unexecuted instantiation: wiener_convolve_avx2.c:prepare_coeffs_6t_ssse3 Unexecuted instantiation: highbd_convolve_2d_avx2.c:prepare_coeffs_6t_ssse3 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:prepare_coeffs_6t_ssse3 |
675 | | |
676 | | static inline void prepare_coeffs_ssse3( |
677 | | const InterpFilterParams *const filter_params, const int32_t subpel_q4, |
678 | 5.89k | __m128i *const coeffs /* [4] */) { |
679 | 5.89k | const int16_t *const filter = av1_get_interp_filter_subpel_kernel( |
680 | 5.89k | filter_params, subpel_q4 & SUBPEL_MASK); |
681 | 5.89k | const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); |
682 | | |
683 | | // right shift all filter co-efficients by 1 to reduce the bits required. |
684 | | // This extra right shift will be taken care of at the end while rounding |
685 | | // the result. |
686 | | // Since all filter co-efficients are even, this change will not affect the |
687 | | // end result |
688 | 5.89k | assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), |
689 | 5.89k | _mm_set1_epi16((short)0xffff))); |
690 | | |
691 | 5.89k | const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1); |
692 | | |
693 | | // coeffs 0 1 0 1 0 1 0 1 |
694 | 5.89k | coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0200u)); |
695 | | // coeffs 2 3 2 3 2 3 2 3 |
696 | 5.89k | coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0604u)); |
697 | | // coeffs 4 5 4 5 4 5 4 5 |
698 | 5.89k | coeffs[2] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0a08u)); |
699 | | // coeffs 6 7 6 7 6 7 6 7 |
700 | 5.89k | coeffs[3] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0e0cu)); |
701 | 5.89k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:prepare_coeffs_ssse3 Unexecuted instantiation: highbd_convolve_avx2.c:prepare_coeffs_ssse3 Unexecuted instantiation: convolve_2d_avx2.c:prepare_coeffs_ssse3 convolve_avx2.c:prepare_coeffs_ssse3 Line | Count | Source | 678 | 5.89k | __m128i *const coeffs /* [4] */) { | 679 | 5.89k | const int16_t *const filter = av1_get_interp_filter_subpel_kernel( | 680 | 5.89k | filter_params, subpel_q4 & SUBPEL_MASK); | 681 | 5.89k | const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); | 682 | | | 683 | | // right shift all filter co-efficients by 1 to reduce the bits required. | 684 | | // This extra right shift will be taken care of at the end while rounding | 685 | | // the result. | 686 | | // Since all filter co-efficients are even, this change will not affect the | 687 | | // end result | 688 | 5.89k | assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), | 689 | 5.89k | _mm_set1_epi16((short)0xffff))); | 690 | | | 691 | 5.89k | const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1); | 692 | | | 693 | | // coeffs 0 1 0 1 0 1 0 1 | 694 | 5.89k | coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0200u)); | 695 | | // coeffs 2 3 2 3 2 3 2 3 | 696 | 5.89k | coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0604u)); | 697 | | // coeffs 4 5 4 5 4 5 4 5 | 698 | 5.89k | coeffs[2] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0a08u)); | 699 | | // coeffs 6 7 6 7 6 7 6 7 | 700 | 5.89k | coeffs[3] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0e0cu)); | 701 | 5.89k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:prepare_coeffs_ssse3 Unexecuted instantiation: wiener_convolve_avx2.c:prepare_coeffs_ssse3 Unexecuted instantiation: highbd_convolve_2d_avx2.c:prepare_coeffs_ssse3 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:prepare_coeffs_ssse3 |
702 | | |
703 | | static inline void prepare_coeffs_2t_lowbd( |
704 | | const InterpFilterParams *const filter_params, const int subpel_q4, |
705 | 25.1k | __m256i *const coeffs /* [4] */) { |
706 | 25.1k | const int16_t *const filter = av1_get_interp_filter_subpel_kernel( |
707 | 25.1k | filter_params, subpel_q4 & SUBPEL_MASK); |
708 | 25.1k | const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); |
709 | 25.1k | const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8); |
710 | | |
711 | | // right shift all filter co-efficients by 1 to reduce the bits required. |
712 | | // This extra right shift will be taken care of at the end while rounding |
713 | | // the result. |
714 | | // Since all filter co-efficients are even, this change will not affect the |
715 | | // end result |
716 | 25.1k | assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), |
717 | 25.1k | _mm_set1_epi16((int16_t)0xffff))); |
718 | | |
719 | 25.1k | const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1); |
720 | | |
721 | | // coeffs 3 4 3 4 3 4 3 4 |
722 | 25.1k | coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0806u)); |
723 | 25.1k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:prepare_coeffs_2t_lowbd Unexecuted instantiation: highbd_convolve_avx2.c:prepare_coeffs_2t_lowbd convolve_2d_avx2.c:prepare_coeffs_2t_lowbd Line | Count | Source | 705 | 14.7k | __m256i *const coeffs /* [4] */) { | 706 | 14.7k | const int16_t *const filter = av1_get_interp_filter_subpel_kernel( | 707 | 14.7k | filter_params, subpel_q4 & SUBPEL_MASK); | 708 | 14.7k | const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); | 709 | 14.7k | const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8); | 710 | | | 711 | | // right shift all filter co-efficients by 1 to reduce the bits required. | 712 | | // This extra right shift will be taken care of at the end while rounding | 713 | | // the result. | 714 | | // Since all filter co-efficients are even, this change will not affect the | 715 | | // end result | 716 | 14.7k | assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), | 717 | 14.7k | _mm_set1_epi16((int16_t)0xffff))); | 718 | | | 719 | 14.7k | const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1); | 720 | | | 721 | | // coeffs 3 4 3 4 3 4 3 4 | 722 | 14.7k | coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0806u)); | 723 | 14.7k | } |
convolve_avx2.c:prepare_coeffs_2t_lowbd Line | Count | Source | 705 | 10.3k | __m256i *const coeffs /* [4] */) { | 706 | 10.3k | const int16_t *const filter = av1_get_interp_filter_subpel_kernel( | 707 | 10.3k | filter_params, subpel_q4 & SUBPEL_MASK); | 708 | 10.3k | const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); | 709 | 10.3k | const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8); | 710 | | | 711 | | // right shift all filter co-efficients by 1 to reduce the bits required. | 712 | | // This extra right shift will be taken care of at the end while rounding | 713 | | // the result. | 714 | | // Since all filter co-efficients are even, this change will not affect the | 715 | | // end result | 716 | 10.3k | assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), | 717 | 10.3k | _mm_set1_epi16((int16_t)0xffff))); | 718 | | | 719 | 10.3k | const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1); | 720 | | | 721 | | // coeffs 3 4 3 4 3 4 3 4 | 722 | 10.3k | coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0806u)); | 723 | 10.3k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:prepare_coeffs_2t_lowbd Unexecuted instantiation: wiener_convolve_avx2.c:prepare_coeffs_2t_lowbd Unexecuted instantiation: highbd_convolve_2d_avx2.c:prepare_coeffs_2t_lowbd Unexecuted instantiation: highbd_jnt_convolve_avx2.c:prepare_coeffs_2t_lowbd |
724 | | |
725 | | static inline void prepare_coeffs_4t_lowbd( |
726 | | const InterpFilterParams *const filter_params, const int subpel_q4, |
727 | 186k | __m256i *const coeffs /* [4] */) { |
728 | 186k | const int16_t *const filter = av1_get_interp_filter_subpel_kernel( |
729 | 186k | filter_params, subpel_q4 & SUBPEL_MASK); |
730 | 186k | const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); |
731 | 186k | const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8); |
732 | | |
733 | | // right shift all filter co-efficients by 1 to reduce the bits required. |
734 | | // This extra right shift will be taken care of at the end while rounding |
735 | | // the result. |
736 | | // Since all filter co-efficients are even, this change will not affect the |
737 | | // end result |
738 | 186k | assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), |
739 | 186k | _mm_set1_epi16((short)0xffff))); |
740 | | |
741 | 186k | const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1); |
742 | | |
743 | | // coeffs 2 3 2 3 2 3 2 3 |
744 | 186k | coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0604u)); |
745 | | // coeffs 4 5 4 5 4 5 4 5 |
746 | 186k | coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0a08u)); |
747 | 186k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:prepare_coeffs_4t_lowbd Unexecuted instantiation: highbd_convolve_avx2.c:prepare_coeffs_4t_lowbd convolve_2d_avx2.c:prepare_coeffs_4t_lowbd Line | Count | Source | 727 | 38.0k | __m256i *const coeffs /* [4] */) { | 728 | 38.0k | const int16_t *const filter = av1_get_interp_filter_subpel_kernel( | 729 | 38.0k | filter_params, subpel_q4 & SUBPEL_MASK); | 730 | 38.0k | const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); | 731 | 38.0k | const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8); | 732 | | | 733 | | // right shift all filter co-efficients by 1 to reduce the bits required. | 734 | | // This extra right shift will be taken care of at the end while rounding | 735 | | // the result. | 736 | | // Since all filter co-efficients are even, this change will not affect the | 737 | | // end result | 738 | 38.0k | assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), | 739 | 38.0k | _mm_set1_epi16((short)0xffff))); | 740 | | | 741 | 38.0k | const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1); | 742 | | | 743 | | // coeffs 2 3 2 3 2 3 2 3 | 744 | 38.0k | coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0604u)); | 745 | | // coeffs 4 5 4 5 4 5 4 5 | 746 | 38.0k | coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0a08u)); | 747 | 38.0k | } |
convolve_avx2.c:prepare_coeffs_4t_lowbd Line | Count | Source | 727 | 148k | __m256i *const coeffs /* [4] */) { | 728 | 148k | const int16_t *const filter = av1_get_interp_filter_subpel_kernel( | 729 | 148k | filter_params, subpel_q4 & SUBPEL_MASK); | 730 | 148k | const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); | 731 | 148k | const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8); | 732 | | | 733 | | // right shift all filter co-efficients by 1 to reduce the bits required. | 734 | | // This extra right shift will be taken care of at the end while rounding | 735 | | // the result. | 736 | | // Since all filter co-efficients are even, this change will not affect the | 737 | | // end result | 738 | 148k | assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), | 739 | 148k | _mm_set1_epi16((short)0xffff))); | 740 | | | 741 | 148k | const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1); | 742 | | | 743 | | // coeffs 2 3 2 3 2 3 2 3 | 744 | 148k | coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0604u)); | 745 | | // coeffs 4 5 4 5 4 5 4 5 | 746 | 148k | coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0a08u)); | 747 | 148k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:prepare_coeffs_4t_lowbd Unexecuted instantiation: wiener_convolve_avx2.c:prepare_coeffs_4t_lowbd Unexecuted instantiation: highbd_convolve_2d_avx2.c:prepare_coeffs_4t_lowbd Unexecuted instantiation: highbd_jnt_convolve_avx2.c:prepare_coeffs_4t_lowbd |
748 | | |
749 | | static inline void prepare_coeffs_6t_lowbd( |
750 | | const InterpFilterParams *const filter_params, const int subpel_q4, |
751 | 1.14M | __m256i *const coeffs /* [4] */) { |
752 | 1.14M | const int16_t *const filter = av1_get_interp_filter_subpel_kernel( |
753 | 1.14M | filter_params, subpel_q4 & SUBPEL_MASK); |
754 | 1.14M | const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); |
755 | 1.14M | const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8); |
756 | | |
757 | | // right shift all filter co-efficients by 1 to reduce the bits required. |
758 | | // This extra right shift will be taken care of at the end while rounding |
759 | | // the result. |
760 | | // Since all filter co-efficients are even, this change will not affect the |
761 | | // end result |
762 | 1.14M | assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), |
763 | 1.14M | _mm_set1_epi16((int16_t)0xffff))); |
764 | | |
765 | 1.14M | const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1); |
766 | | |
767 | | // coeffs 1 2 1 2 1 2 1 2 |
768 | 1.14M | coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0402u)); |
769 | | // coeffs 3 4 3 4 3 4 3 4 |
770 | 1.14M | coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0806u)); |
771 | | // coeffs 5 6 5 6 5 6 5 6 |
772 | 1.14M | coeffs[2] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0c0au)); |
773 | 1.14M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:prepare_coeffs_6t_lowbd Unexecuted instantiation: highbd_convolve_avx2.c:prepare_coeffs_6t_lowbd convolve_2d_avx2.c:prepare_coeffs_6t_lowbd Line | Count | Source | 751 | 730k | __m256i *const coeffs /* [4] */) { | 752 | 730k | const int16_t *const filter = av1_get_interp_filter_subpel_kernel( | 753 | 730k | filter_params, subpel_q4 & SUBPEL_MASK); | 754 | 730k | const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); | 755 | 730k | const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8); | 756 | | | 757 | | // right shift all filter co-efficients by 1 to reduce the bits required. | 758 | | // This extra right shift will be taken care of at the end while rounding | 759 | | // the result. | 760 | | // Since all filter co-efficients are even, this change will not affect the | 761 | | // end result | 762 | 730k | assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), | 763 | 730k | _mm_set1_epi16((int16_t)0xffff))); | 764 | | | 765 | 730k | const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1); | 766 | | | 767 | | // coeffs 1 2 1 2 1 2 1 2 | 768 | 730k | coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0402u)); | 769 | | // coeffs 3 4 3 4 3 4 3 4 | 770 | 730k | coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0806u)); | 771 | | // coeffs 5 6 5 6 5 6 5 6 | 772 | 730k | coeffs[2] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0c0au)); | 773 | 730k | } |
convolve_avx2.c:prepare_coeffs_6t_lowbd Line | Count | Source | 751 | 410k | __m256i *const coeffs /* [4] */) { | 752 | 410k | const int16_t *const filter = av1_get_interp_filter_subpel_kernel( | 753 | 410k | filter_params, subpel_q4 & SUBPEL_MASK); | 754 | 410k | const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); | 755 | 410k | const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8); | 756 | | | 757 | | // right shift all filter co-efficients by 1 to reduce the bits required. | 758 | | // This extra right shift will be taken care of at the end while rounding | 759 | | // the result. | 760 | | // Since all filter co-efficients are even, this change will not affect the | 761 | | // end result | 762 | 410k | assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), | 763 | 410k | _mm_set1_epi16((int16_t)0xffff))); | 764 | | | 765 | 410k | const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1); | 766 | | | 767 | | // coeffs 1 2 1 2 1 2 1 2 | 768 | 410k | coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0402u)); | 769 | | // coeffs 3 4 3 4 3 4 3 4 | 770 | 410k | coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0806u)); | 771 | | // coeffs 5 6 5 6 5 6 5 6 | 772 | 410k | coeffs[2] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0c0au)); | 773 | 410k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:prepare_coeffs_6t_lowbd Unexecuted instantiation: wiener_convolve_avx2.c:prepare_coeffs_6t_lowbd Unexecuted instantiation: highbd_convolve_2d_avx2.c:prepare_coeffs_6t_lowbd Unexecuted instantiation: highbd_jnt_convolve_avx2.c:prepare_coeffs_6t_lowbd |
774 | | |
775 | | static inline void prepare_coeffs_lowbd( |
776 | | const InterpFilterParams *const filter_params, const int subpel_q4, |
777 | 455k | __m256i *const coeffs /* [4] */) { |
778 | 455k | const int16_t *const filter = av1_get_interp_filter_subpel_kernel( |
779 | 455k | filter_params, subpel_q4 & SUBPEL_MASK); |
780 | 455k | const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); |
781 | 455k | const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8); |
782 | | |
783 | | // right shift all filter co-efficients by 1 to reduce the bits required. |
784 | | // This extra right shift will be taken care of at the end while rounding |
785 | | // the result. |
786 | | // Since all filter co-efficients are even, this change will not affect the |
787 | | // end result |
788 | 455k | assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), |
789 | 455k | _mm_set1_epi16((short)0xffff))); |
790 | | |
791 | 455k | const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1); |
792 | | |
793 | | // coeffs 0 1 0 1 0 1 0 1 |
794 | 455k | coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0200u)); |
795 | | // coeffs 2 3 2 3 2 3 2 3 |
796 | 455k | coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0604u)); |
797 | | // coeffs 4 5 4 5 4 5 4 5 |
798 | 455k | coeffs[2] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0a08u)); |
799 | | // coeffs 6 7 6 7 6 7 6 7 |
800 | 455k | coeffs[3] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0e0cu)); |
801 | 455k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:prepare_coeffs_lowbd Unexecuted instantiation: highbd_convolve_avx2.c:prepare_coeffs_lowbd convolve_2d_avx2.c:prepare_coeffs_lowbd Line | Count | Source | 777 | 47.8k | __m256i *const coeffs /* [4] */) { | 778 | 47.8k | const int16_t *const filter = av1_get_interp_filter_subpel_kernel( | 779 | 47.8k | filter_params, subpel_q4 & SUBPEL_MASK); | 780 | 47.8k | const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); | 781 | 47.8k | const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8); | 782 | | | 783 | | // right shift all filter co-efficients by 1 to reduce the bits required. | 784 | | // This extra right shift will be taken care of at the end while rounding | 785 | | // the result. | 786 | | // Since all filter co-efficients are even, this change will not affect the | 787 | | // end result | 788 | 47.8k | assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), | 789 | 47.8k | _mm_set1_epi16((short)0xffff))); | 790 | | | 791 | 47.8k | const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1); | 792 | | | 793 | | // coeffs 0 1 0 1 0 1 0 1 | 794 | 47.8k | coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0200u)); | 795 | | // coeffs 2 3 2 3 2 3 2 3 | 796 | 47.8k | coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0604u)); | 797 | | // coeffs 4 5 4 5 4 5 4 5 | 798 | 47.8k | coeffs[2] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0a08u)); | 799 | | // coeffs 6 7 6 7 6 7 6 7 | 800 | 47.8k | coeffs[3] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0e0cu)); | 801 | 47.8k | } |
convolve_avx2.c:prepare_coeffs_lowbd Line | Count | Source | 777 | 33.3k | __m256i *const coeffs /* [4] */) { | 778 | 33.3k | const int16_t *const filter = av1_get_interp_filter_subpel_kernel( | 779 | 33.3k | filter_params, subpel_q4 & SUBPEL_MASK); | 780 | 33.3k | const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); | 781 | 33.3k | const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8); | 782 | | | 783 | | // right shift all filter co-efficients by 1 to reduce the bits required. | 784 | | // This extra right shift will be taken care of at the end while rounding | 785 | | // the result. | 786 | | // Since all filter co-efficients are even, this change will not affect the | 787 | | // end result | 788 | 33.3k | assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), | 789 | 33.3k | _mm_set1_epi16((short)0xffff))); | 790 | | | 791 | 33.3k | const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1); | 792 | | | 793 | | // coeffs 0 1 0 1 0 1 0 1 | 794 | 33.3k | coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0200u)); | 795 | | // coeffs 2 3 2 3 2 3 2 3 | 796 | 33.3k | coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0604u)); | 797 | | // coeffs 4 5 4 5 4 5 4 5 | 798 | 33.3k | coeffs[2] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0a08u)); | 799 | | // coeffs 6 7 6 7 6 7 6 7 | 800 | 33.3k | coeffs[3] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0e0cu)); | 801 | 33.3k | } |
jnt_convolve_avx2.c:prepare_coeffs_lowbd Line | Count | Source | 777 | 374k | __m256i *const coeffs /* [4] */) { | 778 | 374k | const int16_t *const filter = av1_get_interp_filter_subpel_kernel( | 779 | 374k | filter_params, subpel_q4 & SUBPEL_MASK); | 780 | 374k | const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); | 781 | 374k | const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8); | 782 | | | 783 | | // right shift all filter co-efficients by 1 to reduce the bits required. | 784 | | // This extra right shift will be taken care of at the end while rounding | 785 | | // the result. | 786 | | // Since all filter co-efficients are even, this change will not affect the | 787 | | // end result | 788 | 374k | assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), | 789 | 374k | _mm_set1_epi16((short)0xffff))); | 790 | | | 791 | 374k | const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1); | 792 | | | 793 | | // coeffs 0 1 0 1 0 1 0 1 | 794 | 374k | coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0200u)); | 795 | | // coeffs 2 3 2 3 2 3 2 3 | 796 | 374k | coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0604u)); | 797 | | // coeffs 4 5 4 5 4 5 4 5 | 798 | 374k | coeffs[2] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0a08u)); | 799 | | // coeffs 6 7 6 7 6 7 6 7 | 800 | 374k | coeffs[3] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0e0cu)); | 801 | 374k | } |
Unexecuted instantiation: wiener_convolve_avx2.c:prepare_coeffs_lowbd Unexecuted instantiation: highbd_convolve_2d_avx2.c:prepare_coeffs_lowbd Unexecuted instantiation: highbd_jnt_convolve_avx2.c:prepare_coeffs_lowbd |
802 | | |
803 | | static inline void prepare_coeffs_2t( |
804 | | const InterpFilterParams *const filter_params, const int subpel_q4, |
805 | 30.4k | __m256i *const coeffs /* [4] */) { |
806 | 30.4k | const int16_t *filter = av1_get_interp_filter_subpel_kernel( |
807 | 30.4k | filter_params, subpel_q4 & SUBPEL_MASK); |
808 | | |
809 | 30.4k | const __m128i coeff_8 = _mm_loadu_si128((__m128i *)(filter + 1)); |
810 | 30.4k | const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8); |
811 | | |
812 | | // coeffs 3 4 3 4 3 4 3 4 |
813 | 30.4k | coeffs[0] = _mm256_shuffle_epi32(coeff, 0x55); |
814 | 30.4k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:prepare_coeffs_2t Unexecuted instantiation: highbd_convolve_avx2.c:prepare_coeffs_2t convolve_2d_avx2.c:prepare_coeffs_2t Line | Count | Source | 805 | 30.4k | __m256i *const coeffs /* [4] */) { | 806 | 30.4k | const int16_t *filter = av1_get_interp_filter_subpel_kernel( | 807 | 30.4k | filter_params, subpel_q4 & SUBPEL_MASK); | 808 | | | 809 | 30.4k | const __m128i coeff_8 = _mm_loadu_si128((__m128i *)(filter + 1)); | 810 | 30.4k | const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8); | 811 | | | 812 | | // coeffs 3 4 3 4 3 4 3 4 | 813 | | coeffs[0] = _mm256_shuffle_epi32(coeff, 0x55); | 814 | 30.4k | } |
Unexecuted instantiation: convolve_avx2.c:prepare_coeffs_2t Unexecuted instantiation: jnt_convolve_avx2.c:prepare_coeffs_2t Unexecuted instantiation: wiener_convolve_avx2.c:prepare_coeffs_2t Unexecuted instantiation: highbd_convolve_2d_avx2.c:prepare_coeffs_2t Unexecuted instantiation: highbd_jnt_convolve_avx2.c:prepare_coeffs_2t |
815 | | |
816 | | static inline void prepare_coeffs_4t( |
817 | | const InterpFilterParams *const filter_params, const int subpel_q4, |
818 | 779k | __m256i *const coeffs /* [4] */) { |
819 | 779k | const int16_t *filter = av1_get_interp_filter_subpel_kernel( |
820 | 779k | filter_params, subpel_q4 & SUBPEL_MASK); |
821 | | |
822 | 779k | const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter); |
823 | 779k | const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8); |
824 | | // coeffs 2 3 2 3 2 3 2 3 |
825 | 779k | coeffs[0] = _mm256_shuffle_epi32(coeff, 0x55); |
826 | | // coeffs 4 5 4 5 4 5 4 5 |
827 | 779k | coeffs[1] = _mm256_shuffle_epi32(coeff, 0xaa); |
828 | 779k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:prepare_coeffs_4t Unexecuted instantiation: highbd_convolve_avx2.c:prepare_coeffs_4t convolve_2d_avx2.c:prepare_coeffs_4t Line | Count | Source | 818 | 779k | __m256i *const coeffs /* [4] */) { | 819 | 779k | const int16_t *filter = av1_get_interp_filter_subpel_kernel( | 820 | 779k | filter_params, subpel_q4 & SUBPEL_MASK); | 821 | | | 822 | 779k | const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter); | 823 | 779k | const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8); | 824 | | // coeffs 2 3 2 3 2 3 2 3 | 825 | 779k | coeffs[0] = _mm256_shuffle_epi32(coeff, 0x55); | 826 | | // coeffs 4 5 4 5 4 5 4 5 | 827 | | coeffs[1] = _mm256_shuffle_epi32(coeff, 0xaa); | 828 | 779k | } |
Unexecuted instantiation: convolve_avx2.c:prepare_coeffs_4t Unexecuted instantiation: jnt_convolve_avx2.c:prepare_coeffs_4t Unexecuted instantiation: wiener_convolve_avx2.c:prepare_coeffs_4t Unexecuted instantiation: highbd_convolve_2d_avx2.c:prepare_coeffs_4t Unexecuted instantiation: highbd_jnt_convolve_avx2.c:prepare_coeffs_4t |
829 | | |
830 | | static inline void prepare_coeffs_6t( |
831 | | const InterpFilterParams *const filter_params, const int subpel_q4, |
832 | 563k | __m256i *const coeffs /* [4] */) { |
833 | 563k | const int16_t *filter = av1_get_interp_filter_subpel_kernel( |
834 | 563k | filter_params, subpel_q4 & SUBPEL_MASK); |
835 | | |
836 | 563k | const __m128i coeff_8 = _mm_loadu_si128((__m128i *)(filter + 1)); |
837 | 563k | const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8); |
838 | | |
839 | | // coeffs 1 2 1 2 1 2 1 2 |
840 | 563k | coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00); |
841 | | // coeffs 3 4 3 4 3 4 3 4 |
842 | 563k | coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55); |
843 | | // coeffs 5 6 5 6 5 6 5 6 |
844 | 563k | coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa); |
845 | 563k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:prepare_coeffs_6t Unexecuted instantiation: highbd_convolve_avx2.c:prepare_coeffs_6t convolve_2d_avx2.c:prepare_coeffs_6t Line | Count | Source | 832 | 563k | __m256i *const coeffs /* [4] */) { | 833 | 563k | const int16_t *filter = av1_get_interp_filter_subpel_kernel( | 834 | 563k | filter_params, subpel_q4 & SUBPEL_MASK); | 835 | | | 836 | 563k | const __m128i coeff_8 = _mm_loadu_si128((__m128i *)(filter + 1)); | 837 | 563k | const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8); | 838 | | | 839 | | // coeffs 1 2 1 2 1 2 1 2 | 840 | 563k | coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00); | 841 | | // coeffs 3 4 3 4 3 4 3 4 | 842 | 563k | coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55); | 843 | | // coeffs 5 6 5 6 5 6 5 6 | 844 | | coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa); | 845 | 563k | } |
Unexecuted instantiation: convolve_avx2.c:prepare_coeffs_6t Unexecuted instantiation: jnt_convolve_avx2.c:prepare_coeffs_6t Unexecuted instantiation: wiener_convolve_avx2.c:prepare_coeffs_6t Unexecuted instantiation: highbd_convolve_2d_avx2.c:prepare_coeffs_6t Unexecuted instantiation: highbd_jnt_convolve_avx2.c:prepare_coeffs_6t |
846 | | |
847 | | static inline void prepare_coeffs(const InterpFilterParams *const filter_params, |
848 | | const int subpel_q4, |
849 | 8.64M | __m256i *const coeffs /* [4] */) { |
850 | 8.64M | const int16_t *filter = av1_get_interp_filter_subpel_kernel( |
851 | 8.64M | filter_params, subpel_q4 & SUBPEL_MASK); |
852 | | |
853 | 8.64M | const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter); |
854 | 8.64M | const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8); |
855 | | |
856 | | // coeffs 0 1 0 1 0 1 0 1 |
857 | 8.64M | coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00); |
858 | | // coeffs 2 3 2 3 2 3 2 3 |
859 | 8.64M | coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55); |
860 | | // coeffs 4 5 4 5 4 5 4 5 |
861 | 8.64M | coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa); |
862 | | // coeffs 6 7 6 7 6 7 6 7 |
863 | 8.64M | coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff); |
864 | 8.64M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:prepare_coeffs highbd_convolve_avx2.c:prepare_coeffs Line | Count | Source | 849 | 1.66M | __m256i *const coeffs /* [4] */) { | 850 | 1.66M | const int16_t *filter = av1_get_interp_filter_subpel_kernel( | 851 | 1.66M | filter_params, subpel_q4 & SUBPEL_MASK); | 852 | | | 853 | 1.66M | const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter); | 854 | 1.66M | const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8); | 855 | | | 856 | | // coeffs 0 1 0 1 0 1 0 1 | 857 | 1.66M | coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00); | 858 | | // coeffs 2 3 2 3 2 3 2 3 | 859 | 1.66M | coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55); | 860 | | // coeffs 4 5 4 5 4 5 4 5 | 861 | 1.66M | coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa); | 862 | | // coeffs 6 7 6 7 6 7 6 7 | 863 | | coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff); | 864 | 1.66M | } |
convolve_2d_avx2.c:prepare_coeffs Line | Count | Source | 849 | 41.9k | __m256i *const coeffs /* [4] */) { | 850 | 41.9k | const int16_t *filter = av1_get_interp_filter_subpel_kernel( | 851 | 41.9k | filter_params, subpel_q4 & SUBPEL_MASK); | 852 | | | 853 | 41.9k | const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter); | 854 | 41.9k | const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8); | 855 | | | 856 | | // coeffs 0 1 0 1 0 1 0 1 | 857 | 41.9k | coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00); | 858 | | // coeffs 2 3 2 3 2 3 2 3 | 859 | 41.9k | coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55); | 860 | | // coeffs 4 5 4 5 4 5 4 5 | 861 | 41.9k | coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa); | 862 | | // coeffs 6 7 6 7 6 7 6 7 | 863 | | coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff); | 864 | 41.9k | } |
Unexecuted instantiation: convolve_avx2.c:prepare_coeffs jnt_convolve_avx2.c:prepare_coeffs Line | Count | Source | 849 | 201k | __m256i *const coeffs /* [4] */) { | 850 | 201k | const int16_t *filter = av1_get_interp_filter_subpel_kernel( | 851 | 201k | filter_params, subpel_q4 & SUBPEL_MASK); | 852 | | | 853 | 201k | const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter); | 854 | 201k | const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8); | 855 | | | 856 | | // coeffs 0 1 0 1 0 1 0 1 | 857 | 201k | coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00); | 858 | | // coeffs 2 3 2 3 2 3 2 3 | 859 | 201k | coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55); | 860 | | // coeffs 4 5 4 5 4 5 4 5 | 861 | 201k | coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa); | 862 | | // coeffs 6 7 6 7 6 7 6 7 | 863 | | coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff); | 864 | 201k | } |
Unexecuted instantiation: wiener_convolve_avx2.c:prepare_coeffs highbd_convolve_2d_avx2.c:prepare_coeffs Line | Count | Source | 849 | 5.38M | __m256i *const coeffs /* [4] */) { | 850 | 5.38M | const int16_t *filter = av1_get_interp_filter_subpel_kernel( | 851 | 5.38M | filter_params, subpel_q4 & SUBPEL_MASK); | 852 | | | 853 | 5.38M | const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter); | 854 | 5.38M | const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8); | 855 | | | 856 | | // coeffs 0 1 0 1 0 1 0 1 | 857 | 5.38M | coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00); | 858 | | // coeffs 2 3 2 3 2 3 2 3 | 859 | 5.38M | coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55); | 860 | | // coeffs 4 5 4 5 4 5 4 5 | 861 | 5.38M | coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa); | 862 | | // coeffs 6 7 6 7 6 7 6 7 | 863 | | coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff); | 864 | 5.38M | } |
highbd_jnt_convolve_avx2.c:prepare_coeffs Line | Count | Source | 849 | 1.35M | __m256i *const coeffs /* [4] */) { | 850 | 1.35M | const int16_t *filter = av1_get_interp_filter_subpel_kernel( | 851 | 1.35M | filter_params, subpel_q4 & SUBPEL_MASK); | 852 | | | 853 | 1.35M | const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter); | 854 | 1.35M | const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8); | 855 | | | 856 | | // coeffs 0 1 0 1 0 1 0 1 | 857 | 1.35M | coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00); | 858 | | // coeffs 2 3 2 3 2 3 2 3 | 859 | 1.35M | coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55); | 860 | | // coeffs 4 5 4 5 4 5 4 5 | 861 | 1.35M | coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa); | 862 | | // coeffs 6 7 6 7 6 7 6 7 | 863 | | coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff); | 864 | 1.35M | } |
|
865 | | |
866 | | static inline void prepare_coeffs_12taps( |
867 | | const InterpFilterParams *const filter_params, const int subpel_q4, |
868 | 0 | __m256i *const coeffs /* [4] */) { |
869 | 0 | const int16_t *filter = av1_get_interp_filter_subpel_kernel( |
870 | 0 | filter_params, subpel_q4 & SUBPEL_MASK); |
871 | |
|
872 | 0 | __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter); |
873 | 0 | __m256i coeff = _mm256_broadcastsi128_si256(coeff_8); |
874 | | |
875 | | // coeffs 0 1 0 1 0 1 0 1 |
876 | 0 | coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00); |
877 | | // coeffs 2 3 2 3 2 3 2 3 |
878 | 0 | coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55); |
879 | | // coeffs 4 5 4 5 4 5 4 5 |
880 | 0 | coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa); |
881 | | // coeffs 6 7 6 7 6 7 6 7 |
882 | 0 | coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff); |
883 | | // coeffs 8 9 10 11 0 0 0 0 |
884 | 0 | coeff_8 = _mm_loadl_epi64((__m128i *)(filter + 8)); |
885 | 0 | coeff = _mm256_broadcastq_epi64(coeff_8); |
886 | 0 | coeffs[4] = _mm256_shuffle_epi32(coeff, 0x00); // coeffs 8 9 8 9 8 9 8 9 |
887 | 0 | coeffs[5] = _mm256_shuffle_epi32(coeff, 0x55); // coeffs 10 11 10 11.. 10 11 |
888 | 0 | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:prepare_coeffs_12taps Unexecuted instantiation: highbd_convolve_avx2.c:prepare_coeffs_12taps Unexecuted instantiation: convolve_2d_avx2.c:prepare_coeffs_12taps Unexecuted instantiation: convolve_avx2.c:prepare_coeffs_12taps Unexecuted instantiation: jnt_convolve_avx2.c:prepare_coeffs_12taps Unexecuted instantiation: wiener_convolve_avx2.c:prepare_coeffs_12taps Unexecuted instantiation: highbd_convolve_2d_avx2.c:prepare_coeffs_12taps Unexecuted instantiation: highbd_jnt_convolve_avx2.c:prepare_coeffs_12taps |
889 | | |
// 4-tap multiply-accumulate on 128-bit vectors.
// For i in {0, 1}, _mm_maddubs_epi16 multiplies the unsigned bytes of ss[i] by
// the signed bytes of coeffs[i] and sums adjacent products into 16-bit lanes;
// the two partial results are then added with 16-bit wrapping addition.
static inline __m128i convolve_lowbd_4tap_ssse3(const __m128i ss[2],
                                                const __m128i coeffs[2]) {
  return _mm_add_epi16(_mm_maddubs_epi16(ss[0], coeffs[0]),
                       _mm_maddubs_epi16(ss[1], coeffs[1]));
}
convolve_avx2.c:convolve_lowbd_4tap_ssse3 Line | Count | Source | 891 | 740k | const __m128i coeffs[2]) { | 892 | 740k | const __m128i res_01 = _mm_maddubs_epi16(ss[0], coeffs[0]); | 893 | 740k | const __m128i res_23 = _mm_maddubs_epi16(ss[1], coeffs[1]); | 894 | | | 895 | 740k | return _mm_add_epi16(res_01, res_23); | 896 | 740k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_lowbd_4tap_ssse3 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_lowbd_4tap_ssse3 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_lowbd_4tap_ssse3 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_lowbd_4tap_ssse3 |
897 | | |
// 6-tap multiply-accumulate on 128-bit vectors.
// Each _mm_maddubs_epi16 multiplies the unsigned bytes of ss[i] by the signed
// bytes of coeffs[i] and sums adjacent products into 16-bit lanes; the three
// partial results are combined with 16-bit wrapping additions.
static inline __m128i convolve_lowbd_6tap_ssse3(const __m128i ss[3],
                                                const __m128i coeffs[3]) {
  __m128i acc = _mm_maddubs_epi16(ss[0], coeffs[0]);
  // Same summation order as before: (taps 0-1 + taps 4-5) + taps 2-3.
  acc = _mm_add_epi16(acc, _mm_maddubs_epi16(ss[2], coeffs[2]));
  acc = _mm_add_epi16(acc, _mm_maddubs_epi16(ss[1], coeffs[1]));
  return acc;
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_lowbd_6tap_ssse3 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_lowbd_6tap_ssse3 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_lowbd_6tap_ssse3 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_lowbd_6tap_ssse3 |
908 | | |
// 8-tap multiply-accumulate on 128-bit vectors.
// Each _mm_maddubs_epi16 multiplies the unsigned bytes of ss[i] by the signed
// bytes of coeffs[i] and sums adjacent products into 16-bit lanes; the four
// partial results are combined with 16-bit wrapping additions.
static inline __m128i convolve_lowbd_ssse3(const __m128i ss[4],
                                           const __m128i coeffs[4]) {
  __m128i prod[4];
  for (int i = 0; i < 4; ++i) {
    prod[i] = _mm_maddubs_epi16(ss[i], coeffs[i]);
  }

  // Pairwise reduction: (taps 0-1 + taps 4-5) + (taps 2-3 + taps 6-7).
  const __m128i sum_0145 = _mm_add_epi16(prod[0], prod[2]);
  const __m128i sum_2367 = _mm_add_epi16(prod[1], prod[3]);
  return _mm_add_epi16(sum_0145, sum_2367);
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_lowbd_ssse3 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_lowbd_ssse3 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_lowbd_ssse3 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_lowbd_ssse3 |
921 | | |
// 8-tap multiply-accumulate on 256-bit vectors.
// Each _mm256_maddubs_epi16 multiplies the unsigned bytes of s[i] by the
// signed bytes of coeffs[i] and sums adjacent products into 16-bit lanes; the
// four partial results are combined with 16-bit wrapping additions.
// Reads s[0..3] and coeffs[0..3].
static inline __m256i convolve_lowbd(const __m256i *const s,
                                     const __m256i *const coeffs) {
  __m256i prod[4];
  for (int i = 0; i < 4; ++i) {
    prod[i] = _mm256_maddubs_epi16(s[i], coeffs[i]);
  }

  // Pairwise reduction: (taps 0-1 + taps 4-5) + (taps 2-3 + taps 6-7).
  // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
  const __m256i sum_0145 = _mm256_add_epi16(prod[0], prod[2]);
  const __m256i sum_2367 = _mm256_add_epi16(prod[1], prod[3]);
  return _mm256_add_epi16(sum_0145, sum_2367);
}
convolve_avx2.c:convolve_lowbd Line | Count | Source | 923 | 573k | const __m256i *const coeffs) { | 924 | 573k | const __m256i res_01 = _mm256_maddubs_epi16(s[0], coeffs[0]); | 925 | 573k | const __m256i res_23 = _mm256_maddubs_epi16(s[1], coeffs[1]); | 926 | 573k | const __m256i res_45 = _mm256_maddubs_epi16(s[2], coeffs[2]); | 927 | 573k | const __m256i res_67 = _mm256_maddubs_epi16(s[3], coeffs[3]); | 928 | | | 929 | | // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | 930 | 573k | const __m256i res = _mm256_add_epi16(_mm256_add_epi16(res_01, res_45), | 931 | 573k | _mm256_add_epi16(res_23, res_67)); | 932 | | | 933 | 573k | return res; | 934 | 573k | } |
jnt_convolve_avx2.c:convolve_lowbd Line | Count | Source | 923 | 6.81M | const __m256i *const coeffs) { | 924 | 6.81M | const __m256i res_01 = _mm256_maddubs_epi16(s[0], coeffs[0]); | 925 | 6.81M | const __m256i res_23 = _mm256_maddubs_epi16(s[1], coeffs[1]); | 926 | 6.81M | const __m256i res_45 = _mm256_maddubs_epi16(s[2], coeffs[2]); | 927 | 6.81M | const __m256i res_67 = _mm256_maddubs_epi16(s[3], coeffs[3]); | 928 | | | 929 | | // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | 930 | 6.81M | const __m256i res = _mm256_add_epi16(_mm256_add_epi16(res_01, res_45), | 931 | 6.81M | _mm256_add_epi16(res_23, res_67)); | 932 | | | 933 | 6.81M | return res; | 934 | 6.81M | } |
wiener_convolve_avx2.c:convolve_lowbd Line | Count | Source | 923 | 11.9M | const __m256i *const coeffs) { | 924 | 11.9M | const __m256i res_01 = _mm256_maddubs_epi16(s[0], coeffs[0]); | 925 | 11.9M | const __m256i res_23 = _mm256_maddubs_epi16(s[1], coeffs[1]); | 926 | 11.9M | const __m256i res_45 = _mm256_maddubs_epi16(s[2], coeffs[2]); | 927 | 11.9M | const __m256i res_67 = _mm256_maddubs_epi16(s[3], coeffs[3]); | 928 | | | 929 | | // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | 930 | 11.9M | const __m256i res = _mm256_add_epi16(_mm256_add_epi16(res_01, res_45), | 931 | 11.9M | _mm256_add_epi16(res_23, res_67)); | 932 | | | 933 | 11.9M | return res; | 934 | 11.9M | } |
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_lowbd Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_lowbd |
935 | | |
// 6-tap multiply-accumulate on 256-bit vectors.
// Each _mm256_maddubs_epi16 multiplies the unsigned bytes of s[i] by the
// signed bytes of coeffs[i] and sums adjacent products into 16-bit lanes; the
// three partial results are combined with 16-bit wrapping additions.
// Reads s[0..2] and coeffs[0..2].
static inline __m256i convolve_lowbd_6tap(const __m256i *const s,
                                          const __m256i *const coeffs) {
  // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
  __m256i acc = _mm256_maddubs_epi16(s[0], coeffs[0]);
  // Same summation order as before: (first pair + third pair) + second pair.
  acc = _mm256_add_epi16(acc, _mm256_maddubs_epi16(s[2], coeffs[2]));
  acc = _mm256_add_epi16(acc, _mm256_maddubs_epi16(s[1], coeffs[1]));
  return acc;
}
convolve_avx2.c:convolve_lowbd_6tap Line | Count | Source | 937 | 8.02M | const __m256i *const coeffs) { | 938 | 8.02M | const __m256i res_01 = _mm256_maddubs_epi16(s[0], coeffs[0]); | 939 | 8.02M | const __m256i res_23 = _mm256_maddubs_epi16(s[1], coeffs[1]); | 940 | 8.02M | const __m256i res_45 = _mm256_maddubs_epi16(s[2], coeffs[2]); | 941 | | | 942 | | // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | 943 | 8.02M | const __m256i res = | 944 | 8.02M | _mm256_add_epi16(_mm256_add_epi16(res_01, res_45), res_23); | 945 | | | 946 | 8.02M | return res; | 947 | 8.02M | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_lowbd_6tap Unexecuted instantiation: wiener_convolve_avx2.c:convolve_lowbd_6tap Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_lowbd_6tap Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_lowbd_6tap |
948 | | |
// 4-tap multiply-accumulate on 256-bit vectors.
// Each _mm256_maddubs_epi16 multiplies the unsigned bytes of s[i] by the
// signed bytes of coeffs[i] and sums adjacent products into 16-bit lanes; the
// two partial results are added with 16-bit wrapping addition.
// Reads s[0..1] and coeffs[0..1].
static inline __m256i convolve_lowbd_4tap(const __m256i *const s,
                                          const __m256i *const coeffs) {
  const __m256i first_pair = _mm256_maddubs_epi16(s[0], coeffs[0]);
  const __m256i second_pair = _mm256_maddubs_epi16(s[1], coeffs[1]);

  // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
  return _mm256_add_epi16(second_pair, first_pair);
}
convolve_avx2.c:convolve_lowbd_4tap Line | Count | Source | 950 | 1.56M | const __m256i *const coeffs) { | 951 | 1.56M | const __m256i res_23 = _mm256_maddubs_epi16(s[0], coeffs[0]); | 952 | 1.56M | const __m256i res_45 = _mm256_maddubs_epi16(s[1], coeffs[1]); | 953 | | | 954 | | // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | 955 | 1.56M | const __m256i res = _mm256_add_epi16(res_45, res_23); | 956 | | | 957 | 1.56M | return res; | 958 | 1.56M | } |
jnt_convolve_avx2.c:convolve_lowbd_4tap Line | Count | Source | 950 | 2.04M | const __m256i *const coeffs) { | 951 | 2.04M | const __m256i res_23 = _mm256_maddubs_epi16(s[0], coeffs[0]); | 952 | 2.04M | const __m256i res_45 = _mm256_maddubs_epi16(s[1], coeffs[1]); | 953 | | | 954 | | // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | 955 | 2.04M | const __m256i res = _mm256_add_epi16(res_45, res_23); | 956 | | | 957 | 2.04M | return res; | 958 | 2.04M | } |
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_lowbd_4tap Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_lowbd_4tap Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_lowbd_4tap |
959 | | |
// 6-tap multiply-accumulate on 16-bit inputs.
// Each _mm256_madd_epi16 multiplies the signed 16-bit lanes of s[i] by those
// of coeffs[i] and sums adjacent products into 32-bit lanes; the three partial
// results are then added as 32-bit integers.
// Reads s[0..2] and coeffs[0..2].
static inline __m256i convolve_6tap(const __m256i *const s,
                                    const __m256i *const coeffs) {
  __m256i acc = _mm256_madd_epi16(s[0], coeffs[0]);
  acc = _mm256_add_epi32(acc, _mm256_madd_epi16(s[1], coeffs[1]));
  acc = _mm256_add_epi32(acc, _mm256_madd_epi16(s[2], coeffs[2]));
  return acc;
}
Unexecuted instantiation: convolve_avx2.c:convolve_6tap Unexecuted instantiation: jnt_convolve_avx2.c:convolve_6tap Unexecuted instantiation: wiener_convolve_avx2.c:convolve_6tap Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_6tap Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_6tap |
970 | | |
// 12-tap multiply-accumulate on 16-bit inputs.
// Each _mm256_madd_epi16 multiplies the signed 16-bit lanes of s[i] by those
// of coeffs[i] and sums adjacent products into 32-bit lanes; the six partial
// results are then added as 32-bit integers.
// Reads s[0..5] and coeffs[0..5].
static inline __m256i convolve_12taps(const __m256i *const s,
                                      const __m256i *const coeffs) {
  __m256i prod[6];
  for (int i = 0; i < 6; ++i) {
    prod[i] = _mm256_madd_epi16(s[i], coeffs[i]);
  }

  // Reduce the first four products pairwise, then fold in the last two.
  const __m256i sum_0_to_3 = _mm256_add_epi32(
      _mm256_add_epi32(prod[0], prod[1]), _mm256_add_epi32(prod[2], prod[3]));
  const __m256i sum_4_5 = _mm256_add_epi32(prod[4], prod[5]);
  return _mm256_add_epi32(sum_4_5, sum_0_to_3);
}
986 | | |
// 8-tap multiply-accumulate on 16-bit inputs.
// Each _mm256_madd_epi16 multiplies the signed 16-bit lanes of s[i] by those
// of coeffs[i] and sums adjacent products into 32-bit lanes; the four partial
// results are then added as 32-bit integers.
// Reads s[0..3] and coeffs[0..3].
static inline __m256i convolve(const __m256i *const s,
                               const __m256i *const coeffs) {
  __m256i prod[4];
  for (int i = 0; i < 4; ++i) {
    prod[i] = _mm256_madd_epi16(s[i], coeffs[i]);
  }

  // Pairwise reduction: (p0 + p1) + (p2 + p3).
  const __m256i sum_01 = _mm256_add_epi32(prod[0], prod[1]);
  const __m256i sum_23 = _mm256_add_epi32(prod[2], prod[3]);
  return _mm256_add_epi32(sum_01, sum_23);
}
convolve_2d_avx2.c:convolve Line | Count | Source | 988 | 2.88M | const __m256i *const coeffs) { | 989 | 2.88M | const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]); | 990 | 2.88M | const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]); | 991 | 2.88M | const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]); | 992 | 2.88M | const __m256i res_3 = _mm256_madd_epi16(s[3], coeffs[3]); | 993 | | | 994 | 2.88M | const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1), | 995 | 2.88M | _mm256_add_epi32(res_2, res_3)); | 996 | | | 997 | 2.88M | return res; | 998 | 2.88M | } |
Unexecuted instantiation: convolve_avx2.c:convolve jnt_convolve_avx2.c:convolve Line | Count | Source | 988 | 5.22M | const __m256i *const coeffs) { | 989 | 5.22M | const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]); | 990 | 5.22M | const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]); | 991 | 5.22M | const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]); | 992 | 5.22M | const __m256i res_3 = _mm256_madd_epi16(s[3], coeffs[3]); | 993 | | | 994 | 5.22M | const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1), | 995 | 5.22M | _mm256_add_epi32(res_2, res_3)); | 996 | | | 997 | 5.22M | return res; | 998 | 5.22M | } |
wiener_convolve_avx2.c:convolve Line | Count | Source | 988 | 22.3M | const __m256i *const coeffs) { | 989 | 22.3M | const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]); | 990 | 22.3M | const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]); | 991 | 22.3M | const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]); | 992 | 22.3M | const __m256i res_3 = _mm256_madd_epi16(s[3], coeffs[3]); | 993 | | | 994 | 22.3M | const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1), | 995 | 22.3M | _mm256_add_epi32(res_2, res_3)); | 996 | | | 997 | 22.3M | return res; | 998 | 22.3M | } |
highbd_convolve_2d_avx2.c:convolve Line | Count | Source | 988 | 120M | const __m256i *const coeffs) { | 989 | 120M | const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]); | 990 | 120M | const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]); | 991 | 120M | const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]); | 992 | 120M | const __m256i res_3 = _mm256_madd_epi16(s[3], coeffs[3]); | 993 | | | 994 | 120M | const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1), | 995 | 120M | _mm256_add_epi32(res_2, res_3)); | 996 | | | 997 | 120M | return res; | 998 | 120M | } |
highbd_jnt_convolve_avx2.c:convolve Line | Count | Source | 988 | 66.4M | const __m256i *const coeffs) { | 989 | 66.4M | const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]); | 990 | 66.4M | const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]); | 991 | 66.4M | const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]); | 992 | 66.4M | const __m256i res_3 = _mm256_madd_epi16(s[3], coeffs[3]); | 993 | | | 994 | 66.4M | const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1), | 995 | 66.4M | _mm256_add_epi32(res_2, res_3)); | 996 | | | 997 | 66.4M | return res; | 998 | 66.4M | } |
|
999 | | |
1000 | | static inline __m256i convolve_4tap(const __m256i *const s, |
1001 | 3.80M | const __m256i *const coeffs) { |
1002 | 3.80M | const __m256i res_1 = _mm256_madd_epi16(s[0], coeffs[0]); |
1003 | 3.80M | const __m256i res_2 = _mm256_madd_epi16(s[1], coeffs[1]); |
1004 | | |
1005 | 3.80M | const __m256i res = _mm256_add_epi32(res_1, res_2); |
1006 | 3.80M | return res; |
1007 | 3.80M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_4tap Unexecuted instantiation: highbd_convolve_avx2.c:convolve_4tap convolve_2d_avx2.c:convolve_4tap Line | Count | Source | 1001 | 3.80M | const __m256i *const coeffs) { | 1002 | 3.80M | const __m256i res_1 = _mm256_madd_epi16(s[0], coeffs[0]); | 1003 | 3.80M | const __m256i res_2 = _mm256_madd_epi16(s[1], coeffs[1]); | 1004 | | | 1005 | 3.80M | const __m256i res = _mm256_add_epi32(res_1, res_2); | 1006 | 3.80M | return res; | 1007 | 3.80M | } |
Unexecuted instantiation: convolve_avx2.c:convolve_4tap Unexecuted instantiation: jnt_convolve_avx2.c:convolve_4tap Unexecuted instantiation: wiener_convolve_avx2.c:convolve_4tap Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_4tap Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_4tap |
1008 | | |
1009 | | static inline __m128i convolve_lowbd_x_2tap_ssse3(const __m128i data, |
1010 | | const __m128i *const coeffs, |
1011 | 59.7k | const __m128i *const filt) { |
1012 | 59.7k | __m128i s; |
1013 | 59.7k | s = _mm_shuffle_epi8(data, filt[0]); |
1014 | | |
1015 | 59.7k | return _mm_maddubs_epi16(s, coeffs[0]); |
1016 | 59.7k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_lowbd_x_2tap_ssse3 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_lowbd_x_2tap_ssse3 convolve_2d_avx2.c:convolve_lowbd_x_2tap_ssse3 Line | Count | Source | 1011 | 59.7k | const __m128i *const filt) { | 1012 | 59.7k | __m128i s; | 1013 | 59.7k | s = _mm_shuffle_epi8(data, filt[0]); | 1014 | | | 1015 | 59.7k | return _mm_maddubs_epi16(s, coeffs[0]); | 1016 | 59.7k | } |
Unexecuted instantiation: convolve_avx2.c:convolve_lowbd_x_2tap_ssse3 Unexecuted instantiation: jnt_convolve_avx2.c:convolve_lowbd_x_2tap_ssse3 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_lowbd_x_2tap_ssse3 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_lowbd_x_2tap_ssse3 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_lowbd_x_2tap_ssse3 |
1017 | | |
1018 | | static inline __m128i convolve_lowbd_x_4tap_ssse3(const __m128i data, |
1019 | | const __m128i *const coeffs, |
1020 | 3.01M | const __m128i *const filt) { |
1021 | 3.01M | __m128i s[2]; |
1022 | | |
1023 | 3.01M | s[0] = _mm_shuffle_epi8(data, filt[0]); |
1024 | 3.01M | s[1] = _mm_shuffle_epi8(data, filt[1]); |
1025 | | |
1026 | 3.01M | return convolve_lowbd_4tap_ssse3(s, coeffs); |
1027 | 3.01M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_lowbd_x_4tap_ssse3 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_lowbd_x_4tap_ssse3 convolve_2d_avx2.c:convolve_lowbd_x_4tap_ssse3 Line | Count | Source | 1020 | 3.01M | const __m128i *const filt) { | 1021 | 3.01M | __m128i s[2]; | 1022 | | | 1023 | 3.01M | s[0] = _mm_shuffle_epi8(data, filt[0]); | 1024 | 3.01M | s[1] = _mm_shuffle_epi8(data, filt[1]); | 1025 | | | 1026 | 3.01M | return convolve_lowbd_4tap_ssse3(s, coeffs); | 1027 | 3.01M | } |
Unexecuted instantiation: convolve_avx2.c:convolve_lowbd_x_4tap_ssse3 Unexecuted instantiation: jnt_convolve_avx2.c:convolve_lowbd_x_4tap_ssse3 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_lowbd_x_4tap_ssse3 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_lowbd_x_4tap_ssse3 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_lowbd_x_4tap_ssse3 |
1028 | | |
1029 | | static inline __m256i convolve_lowbd_x(const __m256i data, |
1030 | | const __m256i *const coeffs, |
1031 | 20.2M | const __m256i *const filt) { |
1032 | 20.2M | __m256i s[4]; |
1033 | | |
1034 | 20.2M | s[0] = _mm256_shuffle_epi8(data, filt[0]); |
1035 | 20.2M | s[1] = _mm256_shuffle_epi8(data, filt[1]); |
1036 | 20.2M | s[2] = _mm256_shuffle_epi8(data, filt[2]); |
1037 | 20.2M | s[3] = _mm256_shuffle_epi8(data, filt[3]); |
1038 | | |
1039 | 20.2M | return convolve_lowbd(s, coeffs); |
1040 | 20.2M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_lowbd_x Unexecuted instantiation: highbd_convolve_avx2.c:convolve_lowbd_x convolve_2d_avx2.c:convolve_lowbd_x Line | Count | Source | 1031 | 1.96M | const __m256i *const filt) { | 1032 | 1.96M | __m256i s[4]; | 1033 | | | 1034 | 1.96M | s[0] = _mm256_shuffle_epi8(data, filt[0]); | 1035 | 1.96M | s[1] = _mm256_shuffle_epi8(data, filt[1]); | 1036 | 1.96M | s[2] = _mm256_shuffle_epi8(data, filt[2]); | 1037 | 1.96M | s[3] = _mm256_shuffle_epi8(data, filt[3]); | 1038 | | | 1039 | 1.96M | return convolve_lowbd(s, coeffs); | 1040 | 1.96M | } |
convolve_avx2.c:convolve_lowbd_x Line | Count | Source | 1031 | 390k | const __m256i *const filt) { | 1032 | 390k | __m256i s[4]; | 1033 | | | 1034 | 390k | s[0] = _mm256_shuffle_epi8(data, filt[0]); | 1035 | 390k | s[1] = _mm256_shuffle_epi8(data, filt[1]); | 1036 | 390k | s[2] = _mm256_shuffle_epi8(data, filt[2]); | 1037 | 390k | s[3] = _mm256_shuffle_epi8(data, filt[3]); | 1038 | | | 1039 | 390k | return convolve_lowbd(s, coeffs); | 1040 | 390k | } |
jnt_convolve_avx2.c:convolve_lowbd_x Line | Count | Source | 1031 | 5.89M | const __m256i *const filt) { | 1032 | 5.89M | __m256i s[4]; | 1033 | | | 1034 | 5.89M | s[0] = _mm256_shuffle_epi8(data, filt[0]); | 1035 | 5.89M | s[1] = _mm256_shuffle_epi8(data, filt[1]); | 1036 | 5.89M | s[2] = _mm256_shuffle_epi8(data, filt[2]); | 1037 | 5.89M | s[3] = _mm256_shuffle_epi8(data, filt[3]); | 1038 | | | 1039 | 5.89M | return convolve_lowbd(s, coeffs); | 1040 | 5.89M | } |
wiener_convolve_avx2.c:convolve_lowbd_x Line | Count | Source | 1031 | 11.9M | const __m256i *const filt) { | 1032 | 11.9M | __m256i s[4]; | 1033 | | | 1034 | 11.9M | s[0] = _mm256_shuffle_epi8(data, filt[0]); | 1035 | 11.9M | s[1] = _mm256_shuffle_epi8(data, filt[1]); | 1036 | 11.9M | s[2] = _mm256_shuffle_epi8(data, filt[2]); | 1037 | 11.9M | s[3] = _mm256_shuffle_epi8(data, filt[3]); | 1038 | | | 1039 | 11.9M | return convolve_lowbd(s, coeffs); | 1040 | 11.9M | } |
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_lowbd_x Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_lowbd_x |
1041 | | |
1042 | | static inline __m256i convolve_lowbd_x_6tap(const __m256i data, |
1043 | | const __m256i *const coeffs, |
1044 | 16.5M | const __m256i *const filt) { |
1045 | 16.5M | __m256i s[4]; |
1046 | | |
1047 | 16.5M | s[0] = _mm256_shuffle_epi8(data, filt[0]); |
1048 | 16.5M | s[1] = _mm256_shuffle_epi8(data, filt[1]); |
1049 | 16.5M | s[2] = _mm256_shuffle_epi8(data, filt[2]); |
1050 | | |
1051 | 16.5M | return convolve_lowbd_6tap(s, coeffs); |
1052 | 16.5M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_lowbd_x_6tap Unexecuted instantiation: highbd_convolve_avx2.c:convolve_lowbd_x_6tap convolve_2d_avx2.c:convolve_lowbd_x_6tap Line | Count | Source | 1044 | 12.2M | const __m256i *const filt) { | 1045 | 12.2M | __m256i s[4]; | 1046 | | | 1047 | 12.2M | s[0] = _mm256_shuffle_epi8(data, filt[0]); | 1048 | 12.2M | s[1] = _mm256_shuffle_epi8(data, filt[1]); | 1049 | 12.2M | s[2] = _mm256_shuffle_epi8(data, filt[2]); | 1050 | | | 1051 | 12.2M | return convolve_lowbd_6tap(s, coeffs); | 1052 | 12.2M | } |
convolve_avx2.c:convolve_lowbd_x_6tap Line | Count | Source | 1044 | 4.26M | const __m256i *const filt) { | 1045 | 4.26M | __m256i s[4]; | 1046 | | | 1047 | 4.26M | s[0] = _mm256_shuffle_epi8(data, filt[0]); | 1048 | 4.26M | s[1] = _mm256_shuffle_epi8(data, filt[1]); | 1049 | 4.26M | s[2] = _mm256_shuffle_epi8(data, filt[2]); | 1050 | | | 1051 | 4.26M | return convolve_lowbd_6tap(s, coeffs); | 1052 | 4.26M | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_lowbd_x_6tap Unexecuted instantiation: wiener_convolve_avx2.c:convolve_lowbd_x_6tap Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_lowbd_x_6tap Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_lowbd_x_6tap |
1053 | | |
1054 | | static inline __m256i convolve_lowbd_x_4tap(const __m256i data, |
1055 | | const __m256i *const coeffs, |
1056 | 3.40M | const __m256i *const filt) { |
1057 | 3.40M | __m256i s[2]; |
1058 | | |
1059 | 3.40M | s[0] = _mm256_shuffle_epi8(data, filt[0]); |
1060 | 3.40M | s[1] = _mm256_shuffle_epi8(data, filt[1]); |
1061 | | |
1062 | 3.40M | return convolve_lowbd_4tap(s, coeffs); |
1063 | 3.40M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_lowbd_x_4tap Unexecuted instantiation: highbd_convolve_avx2.c:convolve_lowbd_x_4tap convolve_2d_avx2.c:convolve_lowbd_x_4tap Line | Count | Source | 1056 | 1.04M | const __m256i *const filt) { | 1057 | 1.04M | __m256i s[2]; | 1058 | | | 1059 | 1.04M | s[0] = _mm256_shuffle_epi8(data, filt[0]); | 1060 | 1.04M | s[1] = _mm256_shuffle_epi8(data, filt[1]); | 1061 | | | 1062 | 1.04M | return convolve_lowbd_4tap(s, coeffs); | 1063 | 1.04M | } |
convolve_avx2.c:convolve_lowbd_x_4tap Line | Count | Source | 1056 | 627k | const __m256i *const filt) { | 1057 | 627k | __m256i s[2]; | 1058 | | | 1059 | 627k | s[0] = _mm256_shuffle_epi8(data, filt[0]); | 1060 | 627k | s[1] = _mm256_shuffle_epi8(data, filt[1]); | 1061 | | | 1062 | 627k | return convolve_lowbd_4tap(s, coeffs); | 1063 | 627k | } |
jnt_convolve_avx2.c:convolve_lowbd_x_4tap Line | Count | Source | 1056 | 1.73M | const __m256i *const filt) { | 1057 | 1.73M | __m256i s[2]; | 1058 | | | 1059 | 1.73M | s[0] = _mm256_shuffle_epi8(data, filt[0]); | 1060 | 1.73M | s[1] = _mm256_shuffle_epi8(data, filt[1]); | 1061 | | | 1062 | 1.73M | return convolve_lowbd_4tap(s, coeffs); | 1063 | 1.73M | } |
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_lowbd_x_4tap Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_lowbd_x_4tap Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_lowbd_x_4tap |
1064 | | |
1065 | | static inline __m256i convolve_lowbd_x_2tap(const __m256i data, |
1066 | | const __m256i *const coeffs, |
1067 | 433k | const __m256i *const filt) { |
1068 | 433k | __m256i s; |
1069 | 433k | s = _mm256_shuffle_epi8(data, filt[0]); |
1070 | | |
1071 | 433k | return _mm256_maddubs_epi16(s, coeffs[0]); |
1072 | 433k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_lowbd_x_2tap Unexecuted instantiation: highbd_convolve_avx2.c:convolve_lowbd_x_2tap convolve_2d_avx2.c:convolve_lowbd_x_2tap Line | Count | Source | 1067 | 433k | const __m256i *const filt) { | 1068 | 433k | __m256i s; | 1069 | 433k | s = _mm256_shuffle_epi8(data, filt[0]); | 1070 | | | 1071 | 433k | return _mm256_maddubs_epi16(s, coeffs[0]); | 1072 | 433k | } |
Unexecuted instantiation: convolve_avx2.c:convolve_lowbd_x_2tap Unexecuted instantiation: jnt_convolve_avx2.c:convolve_lowbd_x_2tap Unexecuted instantiation: wiener_convolve_avx2.c:convolve_lowbd_x_2tap Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_lowbd_x_2tap Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_lowbd_x_2tap |
1073 | | |
1074 | | static inline void add_store_aligned_256(CONV_BUF_TYPE *const dst, |
1075 | | const __m256i *const res, |
1076 | 0 | const int do_average) { |
1077 | 0 | __m256i d; |
1078 | 0 | if (do_average) { |
1079 | 0 | d = _mm256_load_si256((__m256i *)dst); |
1080 | 0 | d = _mm256_add_epi32(d, *res); |
1081 | 0 | d = _mm256_srai_epi32(d, 1); |
1082 | 0 | } else { |
1083 | 0 | d = *res; |
1084 | 0 | } |
1085 | 0 | _mm256_store_si256((__m256i *)dst, d); |
1086 | 0 | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:add_store_aligned_256 Unexecuted instantiation: highbd_convolve_avx2.c:add_store_aligned_256 Unexecuted instantiation: convolve_2d_avx2.c:add_store_aligned_256 Unexecuted instantiation: convolve_avx2.c:add_store_aligned_256 Unexecuted instantiation: jnt_convolve_avx2.c:add_store_aligned_256 Unexecuted instantiation: wiener_convolve_avx2.c:add_store_aligned_256 Unexecuted instantiation: highbd_convolve_2d_avx2.c:add_store_aligned_256 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:add_store_aligned_256 |
1087 | | |
1088 | | static inline __m256i comp_avg(const __m256i *const data_ref_0, |
1089 | | const __m256i *const res_unsigned, |
1090 | | const __m256i *const wt, |
1091 | 218M | const int use_dist_wtd_comp_avg) { |
1092 | 218M | __m256i res; |
1093 | 218M | if (use_dist_wtd_comp_avg) { |
1094 | 1.76M | const __m256i data_lo = _mm256_unpacklo_epi16(*data_ref_0, *res_unsigned); |
1095 | 1.76M | const __m256i data_hi = _mm256_unpackhi_epi16(*data_ref_0, *res_unsigned); |
1096 | | |
1097 | 1.76M | const __m256i wt_res_lo = _mm256_madd_epi16(data_lo, *wt); |
1098 | 1.76M | const __m256i wt_res_hi = _mm256_madd_epi16(data_hi, *wt); |
1099 | | |
1100 | 1.76M | const __m256i res_lo = _mm256_srai_epi32(wt_res_lo, DIST_PRECISION_BITS); |
1101 | 1.76M | const __m256i res_hi = _mm256_srai_epi32(wt_res_hi, DIST_PRECISION_BITS); |
1102 | | |
1103 | 1.76M | res = _mm256_packs_epi32(res_lo, res_hi); |
1104 | 216M | } else { |
1105 | 216M | const __m256i wt_res = _mm256_add_epi16(*data_ref_0, *res_unsigned); |
1106 | 216M | res = _mm256_srai_epi16(wt_res, 1); |
1107 | 216M | } |
1108 | 218M | return res; |
1109 | 218M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:comp_avg Unexecuted instantiation: highbd_convolve_avx2.c:comp_avg Unexecuted instantiation: convolve_2d_avx2.c:comp_avg Unexecuted instantiation: convolve_avx2.c:comp_avg jnt_convolve_avx2.c:comp_avg Line | Count | Source | 1091 | 218M | const int use_dist_wtd_comp_avg) { | 1092 | 218M | __m256i res; | 1093 | 218M | if (use_dist_wtd_comp_avg) { | 1094 | 1.76M | const __m256i data_lo = _mm256_unpacklo_epi16(*data_ref_0, *res_unsigned); | 1095 | 1.76M | const __m256i data_hi = _mm256_unpackhi_epi16(*data_ref_0, *res_unsigned); | 1096 | | | 1097 | 1.76M | const __m256i wt_res_lo = _mm256_madd_epi16(data_lo, *wt); | 1098 | 1.76M | const __m256i wt_res_hi = _mm256_madd_epi16(data_hi, *wt); | 1099 | | | 1100 | 1.76M | const __m256i res_lo = _mm256_srai_epi32(wt_res_lo, DIST_PRECISION_BITS); | 1101 | 1.76M | const __m256i res_hi = _mm256_srai_epi32(wt_res_hi, DIST_PRECISION_BITS); | 1102 | | | 1103 | 1.76M | res = _mm256_packs_epi32(res_lo, res_hi); | 1104 | 216M | } else { | 1105 | 216M | const __m256i wt_res = _mm256_add_epi16(*data_ref_0, *res_unsigned); | 1106 | 216M | res = _mm256_srai_epi16(wt_res, 1); | 1107 | 216M | } | 1108 | 218M | return res; | 1109 | 218M | } |
Unexecuted instantiation: wiener_convolve_avx2.c:comp_avg Unexecuted instantiation: highbd_convolve_2d_avx2.c:comp_avg Unexecuted instantiation: highbd_jnt_convolve_avx2.c:comp_avg |
1110 | | |
1111 | | static inline __m256i convolve_rounding(const __m256i *const res_unsigned, |
1112 | | const __m256i *const offset_const, |
1113 | | const __m256i *const round_const, |
1114 | 217M | const int round_shift) { |
1115 | 217M | const __m256i res_signed = _mm256_sub_epi16(*res_unsigned, *offset_const); |
1116 | 217M | const __m256i res_round = _mm256_srai_epi16( |
1117 | 217M | _mm256_add_epi16(res_signed, *round_const), round_shift); |
1118 | 217M | return res_round; |
1119 | 217M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_rounding Unexecuted instantiation: highbd_convolve_avx2.c:convolve_rounding Unexecuted instantiation: convolve_2d_avx2.c:convolve_rounding Unexecuted instantiation: convolve_avx2.c:convolve_rounding jnt_convolve_avx2.c:convolve_rounding Line | Count | Source | 1114 | 217M | const int round_shift) { | 1115 | 217M | const __m256i res_signed = _mm256_sub_epi16(*res_unsigned, *offset_const); | 1116 | 217M | const __m256i res_round = _mm256_srai_epi16( | 1117 | 217M | _mm256_add_epi16(res_signed, *round_const), round_shift); | 1118 | 217M | return res_round; | 1119 | 217M | } |
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_rounding Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_rounding Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_rounding |
1120 | | |
1121 | | static inline __m256i highbd_comp_avg(const __m256i *const data_ref_0, |
1122 | | const __m256i *const res_unsigned, |
1123 | | const __m256i *const wt0, |
1124 | | const __m256i *const wt1, |
1125 | 19.4M | const int use_dist_wtd_comp_avg) { |
1126 | 19.4M | __m256i res; |
1127 | 19.4M | if (use_dist_wtd_comp_avg) { |
1128 | 2.41M | const __m256i wt0_res = _mm256_mullo_epi32(*data_ref_0, *wt0); |
1129 | 2.41M | const __m256i wt1_res = _mm256_mullo_epi32(*res_unsigned, *wt1); |
1130 | 2.41M | const __m256i wt_res = _mm256_add_epi32(wt0_res, wt1_res); |
1131 | 2.41M | res = _mm256_srai_epi32(wt_res, DIST_PRECISION_BITS); |
1132 | 17.0M | } else { |
1133 | 17.0M | const __m256i wt_res = _mm256_add_epi32(*data_ref_0, *res_unsigned); |
1134 | 17.0M | res = _mm256_srai_epi32(wt_res, 1); |
1135 | 17.0M | } |
1136 | 19.4M | return res; |
1137 | 19.4M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:highbd_comp_avg Unexecuted instantiation: highbd_convolve_avx2.c:highbd_comp_avg Unexecuted instantiation: convolve_2d_avx2.c:highbd_comp_avg Unexecuted instantiation: convolve_avx2.c:highbd_comp_avg Unexecuted instantiation: jnt_convolve_avx2.c:highbd_comp_avg Unexecuted instantiation: wiener_convolve_avx2.c:highbd_comp_avg Unexecuted instantiation: highbd_convolve_2d_avx2.c:highbd_comp_avg highbd_jnt_convolve_avx2.c:highbd_comp_avg Line | Count | Source | 1125 | 19.4M | const int use_dist_wtd_comp_avg) { | 1126 | 19.4M | __m256i res; | 1127 | 19.4M | if (use_dist_wtd_comp_avg) { | 1128 | 2.41M | const __m256i wt0_res = _mm256_mullo_epi32(*data_ref_0, *wt0); | 1129 | 2.41M | const __m256i wt1_res = _mm256_mullo_epi32(*res_unsigned, *wt1); | 1130 | 2.41M | const __m256i wt_res = _mm256_add_epi32(wt0_res, wt1_res); | 1131 | 2.41M | res = _mm256_srai_epi32(wt_res, DIST_PRECISION_BITS); | 1132 | 17.0M | } else { | 1133 | 17.0M | const __m256i wt_res = _mm256_add_epi32(*data_ref_0, *res_unsigned); | 1134 | 17.0M | res = _mm256_srai_epi32(wt_res, 1); | 1135 | 17.0M | } | 1136 | 19.4M | return res; | 1137 | 19.4M | } |
|
1138 | | |
1139 | | static inline __m256i highbd_convolve_rounding( |
1140 | | const __m256i *const res_unsigned, const __m256i *const offset_const, |
1141 | 19.4M | const __m256i *const round_const, const int round_shift) { |
1142 | 19.4M | const __m256i res_signed = _mm256_sub_epi32(*res_unsigned, *offset_const); |
1143 | 19.4M | const __m256i res_round = _mm256_srai_epi32( |
1144 | 19.4M | _mm256_add_epi32(res_signed, *round_const), round_shift); |
1145 | | |
1146 | 19.4M | return res_round; |
1147 | 19.4M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:highbd_convolve_rounding Unexecuted instantiation: highbd_convolve_avx2.c:highbd_convolve_rounding Unexecuted instantiation: convolve_2d_avx2.c:highbd_convolve_rounding Unexecuted instantiation: convolve_avx2.c:highbd_convolve_rounding Unexecuted instantiation: jnt_convolve_avx2.c:highbd_convolve_rounding Unexecuted instantiation: wiener_convolve_avx2.c:highbd_convolve_rounding Unexecuted instantiation: highbd_convolve_2d_avx2.c:highbd_convolve_rounding highbd_jnt_convolve_avx2.c:highbd_convolve_rounding Line | Count | Source | 1141 | 19.4M | const __m256i *const round_const, const int round_shift) { | 1142 | 19.4M | const __m256i res_signed = _mm256_sub_epi32(*res_unsigned, *offset_const); | 1143 | 19.4M | const __m256i res_round = _mm256_srai_epi32( | 1144 | 19.4M | _mm256_add_epi32(res_signed, *round_const), round_shift); | 1145 | | | 1146 | 19.4M | return res_round; | 1147 | 19.4M | } |
|
1148 | | |
1149 | 5.55M | static inline __m256i round_sr_x_avx2(const __m256i data) { |
1150 | | // we can perform the below steps: |
1151 | | // data = (data + 2) >> 2 |
1152 | | // data = (data + 8) >> 4, |
1153 | | // in the below form as well |
1154 | | // data = (data + 0x22) >> 6 |
1155 | 5.55M | const __m256i value = _mm256_set1_epi16(34); |
1156 | 5.55M | const __m256i reg = _mm256_add_epi16(data, value); |
1157 | 5.55M | return _mm256_srai_epi16(reg, 6); |
1158 | 5.55M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:round_sr_x_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:round_sr_x_avx2 Unexecuted instantiation: convolve_2d_avx2.c:round_sr_x_avx2 convolve_avx2.c:round_sr_x_avx2 Line | Count | Source | 1149 | 5.55M | static inline __m256i round_sr_x_avx2(const __m256i data) { | 1150 | | // we can perform the below steps: | 1151 | | // data = (data + 2) >> 2 | 1152 | | // data = (data + 8) >> 4, | 1153 | | // in the below form as well | 1154 | | // data = (data + 0x22) >> 6 | 1155 | 5.55M | const __m256i value = _mm256_set1_epi16(34); | 1156 | 5.55M | const __m256i reg = _mm256_add_epi16(data, value); | 1157 | 5.55M | return _mm256_srai_epi16(reg, 6); | 1158 | 5.55M | } |
Unexecuted instantiation: jnt_convolve_avx2.c:round_sr_x_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:round_sr_x_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:round_sr_x_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:round_sr_x_avx2 |
1159 | | |
1160 | | static inline __m128i convolve_x_4tap_4x2_ssse3(const uint8_t *const src, |
1161 | | const ptrdiff_t src_stride, |
1162 | 429k | __m128i *const coeffs) { |
1163 | 429k | __m128i data[2]; |
1164 | 429k | const __m128i f_l0 = _mm_load_si128((__m128i const *)filt1_global_sse2); |
1165 | 429k | const __m128i f_l1 = _mm_load_si128((__m128i const *)filt2_global_sse2); |
1166 | 429k | const __m128i src_1 = |
1167 | 429k | load_8bit_8x2_to_1_reg_sse2(src, (int)(sizeof(*src) * src_stride)); |
1168 | | |
1169 | 429k | data[0] = _mm_shuffle_epi8(src_1, f_l0); |
1170 | 429k | data[1] = _mm_shuffle_epi8(src_1, f_l1); |
1171 | 429k | return convolve_lowbd_4tap_ssse3(data, coeffs); |
1172 | 429k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_x_4tap_4x2_ssse3 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_x_4tap_4x2_ssse3 Unexecuted instantiation: convolve_2d_avx2.c:convolve_x_4tap_4x2_ssse3 convolve_avx2.c:convolve_x_4tap_4x2_ssse3 Line | Count | Source | 1162 | 429k | __m128i *const coeffs) { | 1163 | 429k | __m128i data[2]; | 1164 | 429k | const __m128i f_l0 = _mm_load_si128((__m128i const *)filt1_global_sse2); | 1165 | 429k | const __m128i f_l1 = _mm_load_si128((__m128i const *)filt2_global_sse2); | 1166 | 429k | const __m128i src_1 = | 1167 | 429k | load_8bit_8x2_to_1_reg_sse2(src, (int)(sizeof(*src) * src_stride)); | 1168 | | | 1169 | 429k | data[0] = _mm_shuffle_epi8(src_1, f_l0); | 1170 | 429k | data[1] = _mm_shuffle_epi8(src_1, f_l1); | 1171 | 429k | return convolve_lowbd_4tap_ssse3(data, coeffs); | 1172 | 429k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_x_4tap_4x2_ssse3 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_x_4tap_4x2_ssse3 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_x_4tap_4x2_ssse3 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_x_4tap_4x2_ssse3 |
1173 | | |
1174 | 560k | static inline __m128i round_sr_x_ssse3(const __m128i data) { |
1175 | 560k | const __m128i val = _mm_set1_epi16(34); |
1176 | 560k | const __m128i reg = _mm_add_epi16(data, val); |
1177 | 560k | return _mm_srai_epi16(reg, 6); |
1178 | 560k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:round_sr_x_ssse3 Unexecuted instantiation: highbd_convolve_avx2.c:round_sr_x_ssse3 Unexecuted instantiation: convolve_2d_avx2.c:round_sr_x_ssse3 convolve_avx2.c:round_sr_x_ssse3 Line | Count | Source | 1174 | 560k | static inline __m128i round_sr_x_ssse3(const __m128i data) { | 1175 | 560k | const __m128i val = _mm_set1_epi16(34); | 1176 | 560k | const __m128i reg = _mm_add_epi16(data, val); | 1177 | 560k | return _mm_srai_epi16(reg, 6); | 1178 | 560k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:round_sr_x_ssse3 Unexecuted instantiation: wiener_convolve_avx2.c:round_sr_x_ssse3 Unexecuted instantiation: highbd_convolve_2d_avx2.c:round_sr_x_ssse3 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:round_sr_x_ssse3 |
1179 | | |
1180 | | static inline void store_8bit_4x2_sse2(const __m128i reg, uint8_t *const dst, |
1181 | 985k | const ptrdiff_t dst_stride) { |
1182 | 985k | xx_storel_32(dst, reg); |
1183 | 985k | *(uint32_t *)(dst + dst_stride) = |
1184 | 985k | ((uint32_t)_mm_extract_epi16(reg, 3) << 16) | _mm_extract_epi16(reg, 2); |
1185 | 985k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:store_8bit_4x2_sse2 Unexecuted instantiation: highbd_convolve_avx2.c:store_8bit_4x2_sse2 Unexecuted instantiation: convolve_2d_avx2.c:store_8bit_4x2_sse2 convolve_avx2.c:store_8bit_4x2_sse2 Line | Count | Source | 1181 | 985k | const ptrdiff_t dst_stride) { | 1182 | 985k | xx_storel_32(dst, reg); | 1183 | 985k | *(uint32_t *)(dst + dst_stride) = | 1184 | 985k | ((uint32_t)_mm_extract_epi16(reg, 3) << 16) | _mm_extract_epi16(reg, 2); | 1185 | 985k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:store_8bit_4x2_sse2 Unexecuted instantiation: wiener_convolve_avx2.c:store_8bit_4x2_sse2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:store_8bit_4x2_sse2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:store_8bit_4x2_sse2 |
1186 | | |
1187 | | static inline void pack_store_u8_4x2_sse2(const __m128i reg, uint8_t *const dst, |
1188 | 985k | const ptrdiff_t dst_stride) { |
1189 | 985k | const __m128i reg_pack = _mm_packus_epi16(reg, reg); |
1190 | 985k | store_8bit_4x2_sse2(reg_pack, dst, dst_stride); |
1191 | 985k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:pack_store_u8_4x2_sse2 Unexecuted instantiation: highbd_convolve_avx2.c:pack_store_u8_4x2_sse2 Unexecuted instantiation: convolve_2d_avx2.c:pack_store_u8_4x2_sse2 convolve_avx2.c:pack_store_u8_4x2_sse2 Line | Count | Source | 1188 | 985k | const ptrdiff_t dst_stride) { | 1189 | 985k | const __m128i reg_pack = _mm_packus_epi16(reg, reg); | 1190 | 985k | store_8bit_4x2_sse2(reg_pack, dst, dst_stride); | 1191 | 985k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:pack_store_u8_4x2_sse2 Unexecuted instantiation: wiener_convolve_avx2.c:pack_store_u8_4x2_sse2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:pack_store_u8_4x2_sse2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:pack_store_u8_4x2_sse2 |
1192 | | |
1193 | | static inline __m128i convolve_x_4tap_2x2_ssse3(const uint8_t *const src, |
1194 | | const ptrdiff_t src_stride, |
1195 | 74.3k | __m128i *const coeffs) { |
1196 | 74.3k | __m128i data[2]; |
1197 | 74.3k | const __m128i f_0 = _mm_load_si128((__m128i const *)filt3_global_sse2); |
1198 | 74.3k | const __m128i f_1 = _mm_load_si128((__m128i const *)filt4_global_sse2); |
1199 | 74.3k | const __m128i reg = |
1200 | 74.3k | load_8bit_8x2_to_1_reg_sse2(src, (int)(sizeof(*src) * src_stride)); |
1201 | | |
1202 | 74.3k | data[0] = _mm_shuffle_epi8(reg, f_0); |
1203 | 74.3k | data[1] = _mm_shuffle_epi8(reg, f_1); |
1204 | 74.3k | return convolve_lowbd_4tap_ssse3(data, coeffs); |
1205 | 74.3k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_x_4tap_2x2_ssse3 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_x_4tap_2x2_ssse3 Unexecuted instantiation: convolve_2d_avx2.c:convolve_x_4tap_2x2_ssse3 convolve_avx2.c:convolve_x_4tap_2x2_ssse3 Line | Count | Source | 1195 | 74.3k | __m128i *const coeffs) { | 1196 | 74.3k | __m128i data[2]; | 1197 | 74.3k | const __m128i f_0 = _mm_load_si128((__m128i const *)filt3_global_sse2); | 1198 | 74.3k | const __m128i f_1 = _mm_load_si128((__m128i const *)filt4_global_sse2); | 1199 | 74.3k | const __m128i reg = | 1200 | 74.3k | load_8bit_8x2_to_1_reg_sse2(src, (int)(sizeof(*src) * src_stride)); | 1201 | | | 1202 | 74.3k | data[0] = _mm_shuffle_epi8(reg, f_0); | 1203 | 74.3k | data[1] = _mm_shuffle_epi8(reg, f_1); | 1204 | 74.3k | return convolve_lowbd_4tap_ssse3(data, coeffs); | 1205 | 74.3k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_x_4tap_2x2_ssse3 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_x_4tap_2x2_ssse3 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_x_4tap_2x2_ssse3 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_x_4tap_2x2_ssse3 |
1206 | | |
1207 | | static inline void pack_store_u8_2x2_sse2(const __m128i reg, uint8_t *const dst, |
1208 | 173k | const ptrdiff_t dst_stride) { |
1209 | 173k | const __m128i data = _mm_packus_epi16(reg, reg); |
1210 | 173k | *(int16_t *)dst = (int16_t)_mm_cvtsi128_si32(data); |
1211 | 173k | *(int16_t *)(dst + dst_stride) = (int16_t)_mm_extract_epi16(data, 1); |
1212 | 173k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:pack_store_u8_2x2_sse2 Unexecuted instantiation: highbd_convolve_avx2.c:pack_store_u8_2x2_sse2 Unexecuted instantiation: convolve_2d_avx2.c:pack_store_u8_2x2_sse2 convolve_avx2.c:pack_store_u8_2x2_sse2 Line | Count | Source | 1208 | 173k | const ptrdiff_t dst_stride) { | 1209 | 173k | const __m128i data = _mm_packus_epi16(reg, reg); | 1210 | 173k | *(int16_t *)dst = (int16_t)_mm_cvtsi128_si32(data); | 1211 | | *(int16_t *)(dst + dst_stride) = (int16_t)_mm_extract_epi16(data, 1); | 1212 | 173k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:pack_store_u8_2x2_sse2 Unexecuted instantiation: wiener_convolve_avx2.c:pack_store_u8_2x2_sse2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:pack_store_u8_2x2_sse2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:pack_store_u8_2x2_sse2 |
1213 | | |
1214 | | static inline __m128i convolve_x_2tap_ssse3(const __m128i *data, |
1215 | 56.5k | const __m128i *coeff) { |
1216 | 56.5k | return _mm_maddubs_epi16(data[0], coeff[0]); |
1217 | 56.5k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_x_2tap_ssse3 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_x_2tap_ssse3 Unexecuted instantiation: convolve_2d_avx2.c:convolve_x_2tap_ssse3 convolve_avx2.c:convolve_x_2tap_ssse3 Line | Count | Source | 1215 | 56.5k | const __m128i *coeff) { | 1216 | 56.5k | return _mm_maddubs_epi16(data[0], coeff[0]); | 1217 | 56.5k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_x_2tap_ssse3 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_x_2tap_ssse3 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_x_2tap_ssse3 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_x_2tap_ssse3 |
1218 | | |
1219 | | static inline __m128i load8_x_4x2_sse4(const void *const src, |
1220 | 10.6k | const ptrdiff_t offset) { |
1221 | 10.6k | const __m128i s = _mm_cvtsi32_si128(loadu_int32(src)); |
1222 | 10.6k | return _mm_insert_epi32(s, loadu_int32((uint8_t *)src + offset), 1); |
1223 | 10.6k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:load8_x_4x2_sse4 Unexecuted instantiation: highbd_convolve_avx2.c:load8_x_4x2_sse4 Unexecuted instantiation: convolve_2d_avx2.c:load8_x_4x2_sse4 convolve_avx2.c:load8_x_4x2_sse4 Line | Count | Source | 1220 | 10.6k | const ptrdiff_t offset) { | 1221 | 10.6k | const __m128i s = _mm_cvtsi32_si128(loadu_int32(src)); | 1222 | | return _mm_insert_epi32(s, loadu_int32((uint8_t *)src + offset), 1); | 1223 | 10.6k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:load8_x_4x2_sse4 Unexecuted instantiation: wiener_convolve_avx2.c:load8_x_4x2_sse4 Unexecuted instantiation: highbd_convolve_2d_avx2.c:load8_x_4x2_sse4 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:load8_x_4x2_sse4 |
1224 | | |
1225 | | static inline __m128i load_x_u8_4x2_sse4(const uint8_t *const src, |
1226 | 10.6k | const ptrdiff_t stride) { |
1227 | 10.6k | return load8_x_4x2_sse4(src, sizeof(*src) * stride); |
1228 | 10.6k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:load_x_u8_4x2_sse4 Unexecuted instantiation: highbd_convolve_avx2.c:load_x_u8_4x2_sse4 Unexecuted instantiation: convolve_2d_avx2.c:load_x_u8_4x2_sse4 convolve_avx2.c:load_x_u8_4x2_sse4 Line | Count | Source | 1226 | 10.6k | const ptrdiff_t stride) { | 1227 | 10.6k | return load8_x_4x2_sse4(src, sizeof(*src) * stride); | 1228 | 10.6k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:load_x_u8_4x2_sse4 Unexecuted instantiation: wiener_convolve_avx2.c:load_x_u8_4x2_sse4 Unexecuted instantiation: highbd_convolve_2d_avx2.c:load_x_u8_4x2_sse4 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:load_x_u8_4x2_sse4 |
1229 | | |
1230 | | static inline __m128i convolve_x_2tap_2x2_ssse3(const uint8_t *const src, |
1231 | | const ptrdiff_t stride, |
1232 | 4.21k | const __m128i *coeffs) { |
1233 | 4.21k | const __m128i flt = _mm_load_si128((__m128i const *)filt5_global_sse2); |
1234 | 4.21k | const __m128i reg = load_x_u8_4x2_sse4(src, stride); |
1235 | 4.21k | const __m128i data = _mm_shuffle_epi8(reg, flt); |
1236 | 4.21k | return convolve_x_2tap_ssse3(&data, coeffs); |
1237 | 4.21k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_x_2tap_2x2_ssse3 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_x_2tap_2x2_ssse3 Unexecuted instantiation: convolve_2d_avx2.c:convolve_x_2tap_2x2_ssse3 convolve_avx2.c:convolve_x_2tap_2x2_ssse3 Line | Count | Source | 1232 | 4.21k | const __m128i *coeffs) { | 1233 | 4.21k | const __m128i flt = _mm_load_si128((__m128i const *)filt5_global_sse2); | 1234 | 4.21k | const __m128i reg = load_x_u8_4x2_sse4(src, stride); | 1235 | 4.21k | const __m128i data = _mm_shuffle_epi8(reg, flt); | 1236 | 4.21k | return convolve_x_2tap_ssse3(&data, coeffs); | 1237 | 4.21k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_x_2tap_2x2_ssse3 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_x_2tap_2x2_ssse3 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_x_2tap_2x2_ssse3 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_x_2tap_2x2_ssse3 |
1238 | | |
1239 | | static inline __m128i convolve_x_2tap_4x2_ssse3(const uint8_t *const src, |
1240 | | const ptrdiff_t stride, |
1241 | 17.7k | const __m128i *coeffs) { |
1242 | 17.7k | const __m128i flt = _mm_load_si128((__m128i const *)filt1_global_sse2); |
1243 | 17.7k | const __m128i data = |
1244 | 17.7k | load_8bit_8x2_to_1_reg_sse2(src, (int)(sizeof(*src) * stride)); |
1245 | 17.7k | const __m128i res = _mm_shuffle_epi8(data, flt); |
1246 | 17.7k | return convolve_x_2tap_ssse3(&res, coeffs); |
1247 | 17.7k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_x_2tap_4x2_ssse3 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_x_2tap_4x2_ssse3 Unexecuted instantiation: convolve_2d_avx2.c:convolve_x_2tap_4x2_ssse3 convolve_avx2.c:convolve_x_2tap_4x2_ssse3 Line | Count | Source | 1241 | 17.7k | const __m128i *coeffs) { | 1242 | 17.7k | const __m128i flt = _mm_load_si128((__m128i const *)filt1_global_sse2); | 1243 | 17.7k | const __m128i data = | 1244 | 17.7k | load_8bit_8x2_to_1_reg_sse2(src, (int)(sizeof(*src) * stride)); | 1245 | 17.7k | const __m128i res = _mm_shuffle_epi8(data, flt); | 1246 | 17.7k | return convolve_x_2tap_ssse3(&res, coeffs); | 1247 | 17.7k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_x_2tap_4x2_ssse3 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_x_2tap_4x2_ssse3 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_x_2tap_4x2_ssse3 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_x_2tap_4x2_ssse3 |
1248 | | |
1249 | | static inline void convolve_x_2tap_8x2_ssse3(const uint8_t *const src, |
1250 | | const ptrdiff_t stride, |
1251 | | const __m128i *coeffs, |
1252 | 17.3k | __m128i *data) { |
1253 | 17.3k | __m128i res[2]; |
1254 | 17.3k | const __m128i reg_00 = _mm_loadu_si128((__m128i *)src); |
1255 | 17.3k | const __m128i reg_10 = _mm_loadu_si128((__m128i *)(src + stride)); |
1256 | 17.3k | const __m128i reg_01 = _mm_srli_si128(reg_00, 1); |
1257 | 17.3k | const __m128i reg_11 = _mm_srli_si128(reg_10, 1); |
1258 | 17.3k | res[0] = _mm_unpacklo_epi8(reg_00, reg_01); |
1259 | 17.3k | res[1] = _mm_unpacklo_epi8(reg_10, reg_11); |
1260 | | |
1261 | 17.3k | data[0] = convolve_x_2tap_ssse3(&res[0], coeffs); |
1262 | 17.3k | data[1] = convolve_x_2tap_ssse3(&res[1], coeffs); |
1263 | 17.3k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_x_2tap_8x2_ssse3 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_x_2tap_8x2_ssse3 Unexecuted instantiation: convolve_2d_avx2.c:convolve_x_2tap_8x2_ssse3 convolve_avx2.c:convolve_x_2tap_8x2_ssse3 Line | Count | Source | 1252 | 17.3k | __m128i *data) { | 1253 | 17.3k | __m128i res[2]; | 1254 | 17.3k | const __m128i reg_00 = _mm_loadu_si128((__m128i *)src); | 1255 | 17.3k | const __m128i reg_10 = _mm_loadu_si128((__m128i *)(src + stride)); | 1256 | 17.3k | const __m128i reg_01 = _mm_srli_si128(reg_00, 1); | 1257 | 17.3k | const __m128i reg_11 = _mm_srli_si128(reg_10, 1); | 1258 | 17.3k | res[0] = _mm_unpacklo_epi8(reg_00, reg_01); | 1259 | 17.3k | res[1] = _mm_unpacklo_epi8(reg_10, reg_11); | 1260 | | | 1261 | 17.3k | data[0] = convolve_x_2tap_ssse3(&res[0], coeffs); | 1262 | 17.3k | data[1] = convolve_x_2tap_ssse3(&res[1], coeffs); | 1263 | 17.3k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_x_2tap_8x2_ssse3 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_x_2tap_8x2_ssse3 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_x_2tap_8x2_ssse3 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_x_2tap_8x2_ssse3 |
1264 | | |
// Loads two unaligned 16-byte rows into one YMM register: the low 128-bit
// lane holds the bytes at `src`, the high lane the bytes at `src + offset`
// (offset counted in bytes).
static inline __m256i loadu_x_8bit_16x2_avx2(const void *const src,
                                             const ptrdiff_t offset) {
  // Byte-granular arithmetic that keeps the const qualifier of `src`.
  const uint8_t *const row0 = (const uint8_t *)src;
  const uint8_t *const row1 = row0 + offset;
  const __m128i lo = _mm_loadu_si128((const __m128i *)row0);
  const __m128i hi = _mm_loadu_si128((const __m128i *)row1);
  return _mm256_setr_m128i(lo, hi);
}
1271 | | |
// Core of the 2-tap filter: multiplies the unsigned bytes of data[0] by the
// signed bytes of coeffs[0] and adds each adjacent pair of products into a
// saturated 16-bit sum (_mm256_maddubs_epi16). Only element 0 of either
// array is read.
static inline __m256i convolve_x_2tap_avx2(const __m256i *data,
                                           const __m256i *coeffs) {
  const __m256i pixel_pairs = *data;
  const __m256i taps = *coeffs;
  return _mm256_maddubs_epi16(pixel_pairs, taps);
}
1276 | | |
1277 | | static inline void convolve_x_2tap_16x2_avx2(const uint8_t *const src, |
1278 | | const ptrdiff_t stride, |
1279 | | const __m256i *coeffs, |
1280 | 11.8k | __m256i *data) { |
1281 | 11.8k | const __m256i reg0 = loadu_x_8bit_16x2_avx2(src, stride); |
1282 | 11.8k | const __m256i reg1 = loadu_x_8bit_16x2_avx2(src + 1, stride); |
1283 | 11.8k | const __m256i res0 = _mm256_unpacklo_epi8(reg0, reg1); |
1284 | 11.8k | const __m256i res1 = _mm256_unpackhi_epi8(reg0, reg1); |
1285 | 11.8k | data[0] = convolve_x_2tap_avx2(&res0, coeffs); |
1286 | 11.8k | data[1] = convolve_x_2tap_avx2(&res1, coeffs); |
1287 | 11.8k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_x_2tap_16x2_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_x_2tap_16x2_avx2 Unexecuted instantiation: convolve_2d_avx2.c:convolve_x_2tap_16x2_avx2 convolve_avx2.c:convolve_x_2tap_16x2_avx2 Line | Count | Source | 1280 | 11.8k | __m256i *data) { | 1281 | 11.8k | const __m256i reg0 = loadu_x_8bit_16x2_avx2(src, stride); | 1282 | 11.8k | const __m256i reg1 = loadu_x_8bit_16x2_avx2(src + 1, stride); | 1283 | 11.8k | const __m256i res0 = _mm256_unpacklo_epi8(reg0, reg1); | 1284 | 11.8k | const __m256i res1 = _mm256_unpackhi_epi8(reg0, reg1); | 1285 | 11.8k | data[0] = convolve_x_2tap_avx2(&res0, coeffs); | 1286 | 11.8k | data[1] = convolve_x_2tap_avx2(&res1, coeffs); | 1287 | 11.8k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_x_2tap_16x2_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_x_2tap_16x2_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_x_2tap_16x2_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_x_2tap_16x2_avx2 |
1288 | | |
// Writes a YMM register out as two 16-byte rows with unaligned stores: the
// low 128-bit lane goes to `dst`, the high lane to `dst + stride`.
static inline void store_u8_16x2_avx2(const __m256i src, uint8_t *const dst,
                                      const ptrdiff_t stride) {
  uint8_t *const second_row = dst + stride;
  const __m128i lane_lo = _mm256_castsi256_si128(src);
  const __m128i lane_hi = _mm256_extracti128_si256(src, 1);
  _mm_storeu_si128((__m128i *)dst, lane_lo);
  _mm_storeu_si128((__m128i *)second_row, lane_hi);
}
1296 | | |
// Writes two 8-byte rows: the low 64 bits of the low 128-bit lane go to
// `dst`, the low 64 bits of the high lane go to `dst + stride`. The upper
// 64 bits of each lane are ignored.
static inline void store_u8_8x2_avx2(const __m256i src, uint8_t *const dst,
                                     const ptrdiff_t stride) {
  uint8_t *const second_row = dst + stride;
  const __m128i lane_lo = _mm256_castsi256_si128(src);
  const __m128i lane_hi = _mm256_extracti128_si256(src, 1);
  _mm_storel_epi64((__m128i *)dst, lane_lo);
  _mm_storel_epi64((__m128i *)second_row, lane_hi);
}
1304 | | |
// Narrows two registers of signed 16-bit values to unsigned bytes with
// saturation and stores them as two 16-byte rows. The pack works per 128-bit
// lane, so each output row is the matching lane of data0 followed by the
// matching lane of data1.
static inline void pack_store_16x2_avx2(const __m256i data0,
                                        const __m256i data1, uint8_t *const dst,
                                        const ptrdiff_t stride) {
  const __m256i pixels = _mm256_packus_epi16(data0, data1);
  store_u8_16x2_avx2(pixels, dst, stride);
}
1311 | | |
// Narrows one register of signed 16-bit values to unsigned bytes with
// saturation and stores it as two 8-byte rows. `data` is packed against
// itself, so the low 8 bytes of each 128-bit lane carry that lane's eight
// results; store_u8_8x2_avx2() writes exactly those bytes.
static inline void pack_store_8x2_avx2(const __m256i data, uint8_t *const dst,
                                       const ptrdiff_t stride) {
  const __m256i pixels = _mm256_packus_epi16(data, data);
  store_u8_8x2_avx2(pixels, dst, stride);
}
1317 | | |
1318 | | static inline void round_pack_store_16x2_avx2(const __m256i *data, |
1319 | | uint8_t *const dst, |
1320 | 468k | const ptrdiff_t dst_stride) { |
1321 | 468k | __m256i reg[2]; |
1322 | | |
1323 | 468k | reg[0] = round_sr_x_avx2(data[0]); |
1324 | 468k | reg[1] = round_sr_x_avx2(data[1]); |
1325 | 468k | pack_store_16x2_avx2(reg[0], reg[1], dst, dst_stride); |
1326 | 468k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:round_pack_store_16x2_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:round_pack_store_16x2_avx2 Unexecuted instantiation: convolve_2d_avx2.c:round_pack_store_16x2_avx2 convolve_avx2.c:round_pack_store_16x2_avx2 Line | Count | Source | 1320 | 468k | const ptrdiff_t dst_stride) { | 1321 | 468k | __m256i reg[2]; | 1322 | | | 1323 | 468k | reg[0] = round_sr_x_avx2(data[0]); | 1324 | 468k | reg[1] = round_sr_x_avx2(data[1]); | 1325 | 468k | pack_store_16x2_avx2(reg[0], reg[1], dst, dst_stride); | 1326 | 468k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:round_pack_store_16x2_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:round_pack_store_16x2_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:round_pack_store_16x2_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:round_pack_store_16x2_avx2 |
1327 | | |
1328 | | static inline void convolve_x_2tap_32_avx2(const uint8_t *const src, |
1329 | | const __m256i *coeffs, |
1330 | 124k | __m256i *data) { |
1331 | 124k | const __m256i res0 = _mm256_loadu_si256((__m256i *)src); |
1332 | 124k | const __m256i res1 = _mm256_loadu_si256((__m256i *)(src + 1)); |
1333 | 124k | const __m256i reg0 = _mm256_unpacklo_epi8(res0, res1); |
1334 | 124k | const __m256i reg1 = _mm256_unpackhi_epi8(res0, res1); |
1335 | | |
1336 | 124k | data[0] = convolve_x_2tap_avx2(®0, coeffs); |
1337 | 124k | data[1] = convolve_x_2tap_avx2(®1, coeffs); |
1338 | 124k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_x_2tap_32_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_x_2tap_32_avx2 Unexecuted instantiation: convolve_2d_avx2.c:convolve_x_2tap_32_avx2 convolve_avx2.c:convolve_x_2tap_32_avx2 Line | Count | Source | 1330 | 124k | __m256i *data) { | 1331 | 124k | const __m256i res0 = _mm256_loadu_si256((__m256i *)src); | 1332 | 124k | const __m256i res1 = _mm256_loadu_si256((__m256i *)(src + 1)); | 1333 | 124k | const __m256i reg0 = _mm256_unpacklo_epi8(res0, res1); | 1334 | 124k | const __m256i reg1 = _mm256_unpackhi_epi8(res0, res1); | 1335 | | | 1336 | 124k | data[0] = convolve_x_2tap_avx2(®0, coeffs); | 1337 | 124k | data[1] = convolve_x_2tap_avx2(®1, coeffs); | 1338 | 124k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_x_2tap_32_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_x_2tap_32_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_x_2tap_32_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_x_2tap_32_avx2 |
1339 | | |
// Narrows two registers of signed 16-bit values to 32 unsigned bytes with
// saturation and writes them to `dst` with one unaligned 32-byte store. The
// pack works per 128-bit lane: each output lane is the matching lane of
// data0 followed by the matching lane of data1.
static inline void pack_store_32_avx2(const __m256i data0, const __m256i data1,
                                      uint8_t *const dst) {
  const __m256i pixels = _mm256_packus_epi16(data0, data1);
  _mm256_storeu_si256((__m256i *)dst, pixels);
}
1345 | | |
1346 | | static inline void round_pack_store_32_avx2(const __m256i *data, |
1347 | 1.74M | uint8_t *const dst) { |
1348 | 1.74M | __m256i reg[2]; |
1349 | | |
1350 | 1.74M | reg[0] = round_sr_x_avx2(data[0]); |
1351 | 1.74M | reg[1] = round_sr_x_avx2(data[1]); |
1352 | 1.74M | pack_store_32_avx2(reg[0], reg[1], dst); |
1353 | 1.74M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:round_pack_store_32_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:round_pack_store_32_avx2 Unexecuted instantiation: convolve_2d_avx2.c:round_pack_store_32_avx2 convolve_avx2.c:round_pack_store_32_avx2 Line | Count | Source | 1347 | 1.74M | uint8_t *const dst) { | 1348 | 1.74M | __m256i reg[2]; | 1349 | | | 1350 | 1.74M | reg[0] = round_sr_x_avx2(data[0]); | 1351 | 1.74M | reg[1] = round_sr_x_avx2(data[1]); | 1352 | 1.74M | pack_store_32_avx2(reg[0], reg[1], dst); | 1353 | 1.74M | } |
Unexecuted instantiation: jnt_convolve_avx2.c:round_pack_store_32_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:round_pack_store_32_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:round_pack_store_32_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:round_pack_store_32_avx2 |
1354 | | |
// Complete 2-tap horizontal pass for 32 pixels: filters with
// convolve_x_2tap_32_avx2(), then rounds, packs and stores the result at
// `dst` with round_pack_store_32_avx2().
static inline void convolve_round_2tap_32_avx2(const uint8_t *const src,
                                               const __m256i *coeffs,
                                               uint8_t *const dst) {
  __m256i sums[2];  // filled by the convolve step, consumed by the store step

  convolve_x_2tap_32_avx2(src, coeffs, sums);
  round_pack_store_32_avx2(sums, dst);
}
1363 | | |
// Writes dst[x] = (src[x] + src[x + 1] + 1) >> 1 for x = 0..31, i.e. the
// rounded average of each pixel and its right-hand neighbour
// (_mm256_avg_epu8). Reads bytes src[0..32]; both the loads and the store
// are unaligned.
static inline void load_avg_store_2tap_32_avx2(const uint8_t *const src,
                                               uint8_t *const dst) {
  const __m256i px = _mm256_loadu_si256((const __m256i *)src);
  const __m256i px_next = _mm256_loadu_si256((const __m256i *)(src + 1));
  _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu8(px, px_next));
}
1371 | | |
1372 | | static inline __m256i load_convolve_8tap_8x2_avx2(const uint8_t *const src, |
1373 | | const ptrdiff_t stride, |
1374 | | const __m256i *coeffs, |
1375 | 47.0k | const __m256i *flt) { |
1376 | 47.0k | const __m256i res = loadu_x_8bit_16x2_avx2(src, stride); |
1377 | 47.0k | return convolve_lowbd_x(res, coeffs, flt); |
1378 | 47.0k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:load_convolve_8tap_8x2_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:load_convolve_8tap_8x2_avx2 Unexecuted instantiation: convolve_2d_avx2.c:load_convolve_8tap_8x2_avx2 convolve_avx2.c:load_convolve_8tap_8x2_avx2 Line | Count | Source | 1375 | 47.0k | const __m256i *flt) { | 1376 | 47.0k | const __m256i res = loadu_x_8bit_16x2_avx2(src, stride); | 1377 | 47.0k | return convolve_lowbd_x(res, coeffs, flt); | 1378 | 47.0k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:load_convolve_8tap_8x2_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:load_convolve_8tap_8x2_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:load_convolve_8tap_8x2_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:load_convolve_8tap_8x2_avx2 |
1379 | | |
1380 | | static inline void load_convolve_8tap_16x2_avx2(const uint8_t *const src, |
1381 | | const int32_t src_stride, |
1382 | | const __m256i *coeffs, |
1383 | | const __m256i *flt, |
1384 | 23.5k | __m256i *reg) { |
1385 | 23.5k | reg[0] = load_convolve_8tap_8x2_avx2(src + 0, src_stride, coeffs, flt); |
1386 | 23.5k | reg[1] = load_convolve_8tap_8x2_avx2(src + 8, src_stride, coeffs, flt); |
1387 | 23.5k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:load_convolve_8tap_16x2_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:load_convolve_8tap_16x2_avx2 Unexecuted instantiation: convolve_2d_avx2.c:load_convolve_8tap_16x2_avx2 convolve_avx2.c:load_convolve_8tap_16x2_avx2 Line | Count | Source | 1384 | 23.5k | __m256i *reg) { | 1385 | 23.5k | reg[0] = load_convolve_8tap_8x2_avx2(src + 0, src_stride, coeffs, flt); | 1386 | 23.5k | reg[1] = load_convolve_8tap_8x2_avx2(src + 8, src_stride, coeffs, flt); | 1387 | 23.5k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:load_convolve_8tap_16x2_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:load_convolve_8tap_16x2_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:load_convolve_8tap_16x2_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:load_convolve_8tap_16x2_avx2 |
1388 | | |
1389 | | static inline void load_convolve_8tap_32_avx2(const uint8_t *const src, |
1390 | | const __m256i *coeffs, |
1391 | | const __m256i *filt, |
1392 | 160k | __m256i *data) { |
1393 | 160k | const __m256i reg_0 = _mm256_loadu_si256((__m256i *)src); |
1394 | 160k | const __m256i reg_8 = _mm256_loadu_si256((__m256i *)(src + 8)); |
1395 | | |
1396 | 160k | data[0] = convolve_lowbd_x(reg_0, coeffs, filt); |
1397 | 160k | data[1] = convolve_lowbd_x(reg_8, coeffs, filt); |
1398 | 160k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:load_convolve_8tap_32_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:load_convolve_8tap_32_avx2 Unexecuted instantiation: convolve_2d_avx2.c:load_convolve_8tap_32_avx2 convolve_avx2.c:load_convolve_8tap_32_avx2 Line | Count | Source | 1392 | 160k | __m256i *data) { | 1393 | 160k | const __m256i reg_0 = _mm256_loadu_si256((__m256i *)src); | 1394 | 160k | const __m256i reg_8 = _mm256_loadu_si256((__m256i *)(src + 8)); | 1395 | | | 1396 | 160k | data[0] = convolve_lowbd_x(reg_0, coeffs, filt); | 1397 | 160k | data[1] = convolve_lowbd_x(reg_8, coeffs, filt); | 1398 | 160k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:load_convolve_8tap_32_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:load_convolve_8tap_32_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:load_convolve_8tap_32_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:load_convolve_8tap_32_avx2 |
1399 | | |
// Complete 8-tap horizontal pass for 32 pixels: filters with
// load_convolve_8tap_32_avx2(), then rounds, packs and stores the result at
// `dst` with round_pack_store_32_avx2().
static inline void load_convolve_round_8tap_32_avx2(const uint8_t *const src,
                                                    const __m256i *coeffs,
                                                    const __m256i *filt,
                                                    uint8_t *const dst) {
  __m256i sums[2];  // filled by the convolve step, consumed by the store step

  load_convolve_8tap_32_avx2(src, coeffs, filt, sums);
  round_pack_store_32_avx2(sums, dst);
}
1409 | | |
1410 | | static inline void load_convolve_6tap_32_avx2(const uint8_t *const src, |
1411 | | const __m256i *coeffs, |
1412 | | const __m256i *filt, |
1413 | 1.46M | __m256i *data) { |
1414 | 1.46M | const __m256i reg0 = _mm256_loadu_si256((__m256i *)src); |
1415 | 1.46M | const __m256i reg1 = _mm256_loadu_si256((__m256i *)(src + 8)); |
1416 | | |
1417 | 1.46M | data[0] = convolve_lowbd_x_6tap(reg0, coeffs, filt); |
1418 | 1.46M | data[1] = convolve_lowbd_x_6tap(reg1, coeffs, filt); |
1419 | 1.46M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:load_convolve_6tap_32_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:load_convolve_6tap_32_avx2 Unexecuted instantiation: convolve_2d_avx2.c:load_convolve_6tap_32_avx2 convolve_avx2.c:load_convolve_6tap_32_avx2 Line | Count | Source | 1413 | 1.46M | __m256i *data) { | 1414 | 1.46M | const __m256i reg0 = _mm256_loadu_si256((__m256i *)src); | 1415 | 1.46M | const __m256i reg1 = _mm256_loadu_si256((__m256i *)(src + 8)); | 1416 | | | 1417 | 1.46M | data[0] = convolve_lowbd_x_6tap(reg0, coeffs, filt); | 1418 | 1.46M | data[1] = convolve_lowbd_x_6tap(reg1, coeffs, filt); | 1419 | 1.46M | } |
Unexecuted instantiation: jnt_convolve_avx2.c:load_convolve_6tap_32_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:load_convolve_6tap_32_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:load_convolve_6tap_32_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:load_convolve_6tap_32_avx2 |
1420 | | |
// Complete 6-tap horizontal pass for 32 pixels: filters with
// load_convolve_6tap_32_avx2(), then rounds, packs and stores the result at
// `dst` with round_pack_store_32_avx2().
static inline void convolve_sr_store_6tap_32_avx2(const uint8_t *const src,
                                                  const __m256i *coeffs,
                                                  const __m256i *filt,
                                                  uint8_t *const dst) {
  __m256i sums[2];  // filled by the convolve step, consumed by the store step

  load_convolve_6tap_32_avx2(src, coeffs, filt, sums);
  round_pack_store_32_avx2(sums, dst);
}
1430 | | |
1431 | | static inline __m256i load_convolve_6tap_8x2_avx2(const uint8_t *const src, |
1432 | | const ptrdiff_t stride, |
1433 | | const __m256i *coeffs, |
1434 | 867k | const __m256i *filt) { |
1435 | 867k | const __m256i data = loadu_x_8bit_16x2_avx2(src, stride); |
1436 | 867k | return convolve_lowbd_x_6tap(data, coeffs, filt); |
1437 | 867k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:load_convolve_6tap_8x2_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:load_convolve_6tap_8x2_avx2 Unexecuted instantiation: convolve_2d_avx2.c:load_convolve_6tap_8x2_avx2 convolve_avx2.c:load_convolve_6tap_8x2_avx2 Line | Count | Source | 1434 | 867k | const __m256i *filt) { | 1435 | 867k | const __m256i data = loadu_x_8bit_16x2_avx2(src, stride); | 1436 | 867k | return convolve_lowbd_x_6tap(data, coeffs, filt); | 1437 | 867k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:load_convolve_6tap_8x2_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:load_convolve_6tap_8x2_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:load_convolve_6tap_8x2_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:load_convolve_6tap_8x2_avx2 |
1438 | | |
1439 | | static inline void load_convolve_6tap_16x2_avx2(const uint8_t *const src, |
1440 | | const int32_t src_stride, |
1441 | | const __m256i *coeffs, |
1442 | | const __m256i *filt, |
1443 | 433k | __m256i *data) { |
1444 | 433k | data[0] = load_convolve_6tap_8x2_avx2(src + 0, src_stride, coeffs, filt); |
1445 | 433k | data[1] = load_convolve_6tap_8x2_avx2(src + 8, src_stride, coeffs, filt); |
1446 | 433k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:load_convolve_6tap_16x2_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:load_convolve_6tap_16x2_avx2 Unexecuted instantiation: convolve_2d_avx2.c:load_convolve_6tap_16x2_avx2 convolve_avx2.c:load_convolve_6tap_16x2_avx2 Line | Count | Source | 1443 | 433k | __m256i *data) { | 1444 | 433k | data[0] = load_convolve_6tap_8x2_avx2(src + 0, src_stride, coeffs, filt); | 1445 | 433k | data[1] = load_convolve_6tap_8x2_avx2(src + 8, src_stride, coeffs, filt); | 1446 | 433k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:load_convolve_6tap_16x2_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:load_convolve_6tap_16x2_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:load_convolve_6tap_16x2_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:load_convolve_6tap_16x2_avx2 |
1447 | | |
1448 | 632k | static inline __m128i round_sr_y_ssse3(const __m128i data) { |
1449 | 632k | const __m128i value = _mm_set1_epi16(32); |
1450 | 632k | const __m128i reg = _mm_add_epi16(data, value); |
1451 | 632k | return _mm_srai_epi16(reg, FILTER_BITS - 1); |
1452 | 632k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:round_sr_y_ssse3 Unexecuted instantiation: highbd_convolve_avx2.c:round_sr_y_ssse3 Unexecuted instantiation: convolve_2d_avx2.c:round_sr_y_ssse3 convolve_avx2.c:round_sr_y_ssse3 Line | Count | Source | 1448 | 632k | static inline __m128i round_sr_y_ssse3(const __m128i data) { | 1449 | 632k | const __m128i value = _mm_set1_epi16(32); | 1450 | 632k | const __m128i reg = _mm_add_epi16(data, value); | 1451 | 632k | return _mm_srai_epi16(reg, FILTER_BITS - 1); | 1452 | 632k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:round_sr_y_ssse3 Unexecuted instantiation: wiener_convolve_avx2.c:round_sr_y_ssse3 Unexecuted instantiation: highbd_convolve_2d_avx2.c:round_sr_y_ssse3 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:round_sr_y_ssse3 |
1453 | | |
1454 | 5.08M | static inline __m256i round_sr_y_avx2(const __m256i data) { |
1455 | 5.08M | const __m256i value = _mm256_set1_epi16(32); |
1456 | 5.08M | const __m256i reg = _mm256_add_epi16(data, value); |
1457 | 5.08M | return _mm256_srai_epi16(reg, FILTER_BITS - 1); |
1458 | 5.08M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:round_sr_y_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:round_sr_y_avx2 Unexecuted instantiation: convolve_2d_avx2.c:round_sr_y_avx2 convolve_avx2.c:round_sr_y_avx2 Line | Count | Source | 1454 | 5.08M | static inline __m256i round_sr_y_avx2(const __m256i data) { | 1455 | 5.08M | const __m256i value = _mm256_set1_epi16(32); | 1456 | 5.08M | const __m256i reg = _mm256_add_epi16(data, value); | 1457 | 5.08M | return _mm256_srai_epi16(reg, FILTER_BITS - 1); | 1458 | 5.08M | } |
Unexecuted instantiation: jnt_convolve_avx2.c:round_sr_y_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:round_sr_y_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:round_sr_y_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:round_sr_y_avx2 |
1459 | | |
// Rounds `res` with round_sr_y_avx2() and passes the rounded vector on to
// pack_store_8x2_avx2() together with the destination pointer and stride.
static inline void round_pack_store_y_8x2_avx2(const __m256i res,
                                               uint8_t *const dst,
                                               const ptrdiff_t dst_stride) {
  pack_store_8x2_avx2(round_sr_y_avx2(res), dst, dst_stride);
}
1468 | | |
1469 | | static inline void round_pack_store_y_16x2_avx2(const __m256i res[2], |
1470 | | uint8_t *const dst, |
1471 | 1.88M | const ptrdiff_t dst_stride) { |
1472 | 1.88M | __m256i r[2]; |
1473 | | |
1474 | 1.88M | r[0] = round_sr_y_avx2(res[0]); |
1475 | 1.88M | r[1] = round_sr_y_avx2(res[1]); |
1476 | 1.88M | pack_store_16x2_avx2(r[0], r[1], dst, dst_stride); |
1477 | 1.88M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:round_pack_store_y_16x2_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:round_pack_store_y_16x2_avx2 Unexecuted instantiation: convolve_2d_avx2.c:round_pack_store_y_16x2_avx2 convolve_avx2.c:round_pack_store_y_16x2_avx2 Line | Count | Source | 1471 | 1.88M | const ptrdiff_t dst_stride) { | 1472 | 1.88M | __m256i r[2]; | 1473 | | | 1474 | 1.88M | r[0] = round_sr_y_avx2(res[0]); | 1475 | 1.88M | r[1] = round_sr_y_avx2(res[1]); | 1476 | 1.88M | pack_store_16x2_avx2(r[0], r[1], dst, dst_stride); | 1477 | 1.88M | } |
Unexecuted instantiation: jnt_convolve_avx2.c:round_pack_store_y_16x2_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:round_pack_store_y_16x2_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:round_pack_store_y_16x2_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:round_pack_store_y_16x2_avx2 |
1478 | | |
1479 | | static inline void round_pack_store_y_32_avx2(const __m256i res[2], |
1480 | 342k | uint8_t *const dst) { |
1481 | 342k | __m256i r[2]; |
1482 | | |
1483 | 342k | r[0] = round_sr_y_avx2(res[0]); |
1484 | 342k | r[1] = round_sr_y_avx2(res[1]); |
1485 | 342k | pack_store_32_avx2(r[0], r[1], dst); |
1486 | 342k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:round_pack_store_y_32_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:round_pack_store_y_32_avx2 Unexecuted instantiation: convolve_2d_avx2.c:round_pack_store_y_32_avx2 convolve_avx2.c:round_pack_store_y_32_avx2 Line | Count | Source | 1480 | 342k | uint8_t *const dst) { | 1481 | 342k | __m256i r[2]; | 1482 | | | 1483 | 342k | r[0] = round_sr_y_avx2(res[0]); | 1484 | 342k | r[1] = round_sr_y_avx2(res[1]); | 1485 | 342k | pack_store_32_avx2(r[0], r[1], dst); | 1486 | 342k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:round_pack_store_y_32_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:round_pack_store_y_32_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:round_pack_store_y_32_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:round_pack_store_y_32_avx2 |
1487 | | |
// Two-row wrapper around round_pack_store_y_32_avx2(): res[0..1] go to `dst`
// and res[2..3] go to the next row at dst + dst_stride.
static inline void round_pack_store_y_32x2_avx2(const __m256i res[4],
                                                uint8_t *const dst,
                                                const ptrdiff_t dst_stride) {
  uint8_t *const next_row = dst + dst_stride;

  round_pack_store_y_32_avx2(&res[0], dst);
  round_pack_store_y_32_avx2(&res[2], next_row);
}
1494 | | |
// One step of the vertical 2-tap filter for a 2-pixel-wide block, producing
// two output rows (SSSE3).
//
// data, stride: rows are fetched with loadu_int16() at data + 1 * stride and
//               data + 2 * stride.
// coeffs:       coeffs[0] is the signed-byte operand of _mm_maddubs_epi16().
// d:            rolling row cache. d[0] is consumed as the first row
//               (NOTE(review): presumably the row at data + 0 * stride, set
//               up by the caller - confirm). On return d[1] holds the row
//               from data + 1 * stride and d[0] the row from
//               data + 2 * stride.
// res:          receives the 16-bit multiply-add sums; no rounding is done
//               here.
static inline void convolve_y_2tap_2x2_ssse3(const uint8_t *const data,
                                             const ptrdiff_t stride,
                                             const __m128i *coeffs,
                                             __m128i d[2], __m128i *res) {
  const __m128i row_a = d[0];
  const __m128i row_b = _mm_cvtsi32_si128(loadu_int16(data + stride));
  d[1] = row_b;
  const __m128i row_c = _mm_cvtsi32_si128(loadu_int16(data + 2 * stride));
  d[0] = row_c;

  // Put rows (a, b) and (b, c) side by side, then byte-interleave so that
  // every adjacent byte pair holds two vertically neighbouring pixels.
  const __m128i pair_ab = _mm_unpacklo_epi16(row_a, row_b);
  const __m128i pair_bc = _mm_unpacklo_epi16(row_b, row_c);
  const __m128i interleaved = _mm_unpacklo_epi8(pair_ab, pair_bc);

  *res = _mm_maddubs_epi16(interleaved, coeffs[0]);
}
1508 | | |
1509 | | static inline void convolve_y_4tap_2x2_ssse3(const uint8_t *const data, |
1510 | | const ptrdiff_t stride, |
1511 | | const __m128i coeffs[2], |
1512 | | __m128i d[4], __m128i s[2], |
1513 | 37.0k | __m128i *res) { |
1514 | 37.0k | d[3] = _mm_cvtsi32_si128(loadu_int16(data + 3 * stride)); |
1515 | 37.0k | const __m128i src_23a = _mm_unpacklo_epi16(d[2], d[3]); |
1516 | 37.0k | d[2] = _mm_cvtsi32_si128(loadu_int16(data + 4 * stride)); |
1517 | 37.0k | const __m128i src_34a = _mm_unpacklo_epi16(d[3], d[2]); |
1518 | | |
1519 | 37.0k | s[1] = _mm_unpacklo_epi8(src_23a, src_34a); |
1520 | | |
1521 | 37.0k | *res = convolve_lowbd_4tap_ssse3(s, coeffs); |
1522 | 37.0k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_4tap_2x2_ssse3 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_4tap_2x2_ssse3 Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_4tap_2x2_ssse3 convolve_avx2.c:convolve_y_4tap_2x2_ssse3 Line | Count | Source | 1513 | 37.0k | __m128i *res) { | 1514 | 37.0k | d[3] = _mm_cvtsi32_si128(loadu_int16(data + 3 * stride)); | 1515 | 37.0k | const __m128i src_23a = _mm_unpacklo_epi16(d[2], d[3]); | 1516 | 37.0k | d[2] = _mm_cvtsi32_si128(loadu_int16(data + 4 * stride)); | 1517 | 37.0k | const __m128i src_34a = _mm_unpacklo_epi16(d[3], d[2]); | 1518 | | | 1519 | 37.0k | s[1] = _mm_unpacklo_epi8(src_23a, src_34a); | 1520 | | | 1521 | 37.0k | *res = convolve_lowbd_4tap_ssse3(s, coeffs); | 1522 | 37.0k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_4tap_2x2_ssse3 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_4tap_2x2_ssse3 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_4tap_2x2_ssse3 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_4tap_2x2_ssse3 |
1523 | | |
1524 | | static inline void convolve_y_6tap_2x2_ssse3(const uint8_t *const data, |
1525 | | const ptrdiff_t stride, |
1526 | | const __m128i coeffs[3], |
1527 | | __m128i d[6], __m128i s[3], |
1528 | 49.3k | __m128i *res) { |
1529 | 49.3k | d[5] = _mm_cvtsi32_si128(loadu_int16(data + 5 * stride)); |
1530 | 49.3k | const __m128i src_45a = _mm_unpacklo_epi16(d[4], d[5]); |
1531 | 49.3k | d[4] = _mm_cvtsi32_si128(loadu_int16(data + 6 * stride)); |
1532 | 49.3k | const __m128i src_56a = _mm_unpacklo_epi16(d[5], d[4]); |
1533 | | |
1534 | 49.3k | s[2] = _mm_unpacklo_epi8(src_45a, src_56a); |
1535 | | |
1536 | 49.3k | *res = convolve_lowbd_6tap_ssse3(s, coeffs); |
1537 | 49.3k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_6tap_2x2_ssse3 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_6tap_2x2_ssse3 Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_6tap_2x2_ssse3 convolve_avx2.c:convolve_y_6tap_2x2_ssse3 Line | Count | Source | 1528 | 49.3k | __m128i *res) { | 1529 | 49.3k | d[5] = _mm_cvtsi32_si128(loadu_int16(data + 5 * stride)); | 1530 | 49.3k | const __m128i src_45a = _mm_unpacklo_epi16(d[4], d[5]); | 1531 | 49.3k | d[4] = _mm_cvtsi32_si128(loadu_int16(data + 6 * stride)); | 1532 | 49.3k | const __m128i src_56a = _mm_unpacklo_epi16(d[5], d[4]); | 1533 | | | 1534 | 49.3k | s[2] = _mm_unpacklo_epi8(src_45a, src_56a); | 1535 | | | 1536 | 49.3k | *res = convolve_lowbd_6tap_ssse3(s, coeffs); | 1537 | 49.3k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_6tap_2x2_ssse3 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_6tap_2x2_ssse3 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_6tap_2x2_ssse3 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_6tap_2x2_ssse3 |
1538 | | |
1539 | | static inline void convolve_y_8tap_2x2_ssse3(const uint8_t *const data, |
1540 | | const ptrdiff_t stride, |
1541 | | const __m128i coeffs[4], |
1542 | | __m128i d[8], __m128i s[4], |
1543 | 5.12k | __m128i *res) { |
1544 | 5.12k | d[7] = _mm_cvtsi32_si128(loadu_int16(data + 7 * stride)); |
1545 | 5.12k | const __m128i src_67a = _mm_unpacklo_epi16(d[6], d[7]); |
1546 | 5.12k | d[6] = _mm_cvtsi32_si128(loadu_int16(data + 8 * stride)); |
1547 | 5.12k | const __m128i src_78a = _mm_unpacklo_epi16(d[7], d[6]); |
1548 | | |
1549 | 5.12k | s[3] = _mm_unpacklo_epi8(src_67a, src_78a); |
1550 | | |
1551 | 5.12k | *res = convolve_lowbd_ssse3(s, coeffs); |
1552 | 5.12k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_8tap_2x2_ssse3 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_8tap_2x2_ssse3 Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_8tap_2x2_ssse3 convolve_avx2.c:convolve_y_8tap_2x2_ssse3 Line | Count | Source | 1543 | 5.12k | __m128i *res) { | 1544 | 5.12k | d[7] = _mm_cvtsi32_si128(loadu_int16(data + 7 * stride)); | 1545 | 5.12k | const __m128i src_67a = _mm_unpacklo_epi16(d[6], d[7]); | 1546 | 5.12k | d[6] = _mm_cvtsi32_si128(loadu_int16(data + 8 * stride)); | 1547 | 5.12k | const __m128i src_78a = _mm_unpacklo_epi16(d[7], d[6]); | 1548 | | | 1549 | 5.12k | s[3] = _mm_unpacklo_epi8(src_67a, src_78a); | 1550 | | | 1551 | 5.12k | *res = convolve_lowbd_ssse3(s, coeffs); | 1552 | 5.12k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_8tap_2x2_ssse3 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_8tap_2x2_ssse3 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_8tap_2x2_ssse3 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_8tap_2x2_ssse3 |
1553 | | |
// One step of the vertical 2-tap filter for a 4-pixel-wide block, producing
// two output rows (SSSE3).
//
// data, stride: rows are fetched with loadu_int32() at data + 1 * stride and
//               data + 2 * stride.
// coeffs:       coeffs[0] is the signed-byte operand of _mm_maddubs_epi16().
// d:            rolling row cache. d[0] is consumed as the first row
//               (NOTE(review): presumably the row at data + 0 * stride, set
//               up by the caller - confirm). On return d[1] holds the row
//               from data + 1 * stride and d[0] the row from
//               data + 2 * stride.
// res:          receives the 16-bit multiply-add sums; no rounding is done
//               here.
static inline void convolve_y_2tap_4x2_ssse3(const uint8_t *const data,
                                             const ptrdiff_t stride,
                                             const __m128i *coeffs,
                                             __m128i d[2], __m128i *res) {
  const __m128i row_a = d[0];
  const __m128i row_b = _mm_cvtsi32_si128(loadu_int32(data + stride));
  d[1] = row_b;
  const __m128i row_c = _mm_cvtsi32_si128(loadu_int32(data + 2 * stride));
  d[0] = row_c;

  // Put rows (a, b) and (b, c) side by side, then byte-interleave so that
  // every adjacent byte pair holds two vertically neighbouring pixels.
  const __m128i pair_ab = _mm_unpacklo_epi32(row_a, row_b);
  const __m128i pair_bc = _mm_unpacklo_epi32(row_b, row_c);
  const __m128i interleaved = _mm_unpacklo_epi8(pair_ab, pair_bc);

  *res = _mm_maddubs_epi16(interleaved, coeffs[0]);
}
1567 | | |
1568 | | static inline void convolve_y_4tap_4x2_ssse3(const uint8_t *const data, |
1569 | | const ptrdiff_t stride, |
1570 | | const __m128i coeffs[2], |
1571 | | __m128i d[4], __m128i s[2], |
1572 | 199k | __m128i *res) { |
1573 | 199k | d[3] = _mm_cvtsi32_si128(loadu_int32(data + 3 * stride)); |
1574 | 199k | const __m128i src_23a = _mm_unpacklo_epi32(d[2], d[3]); |
1575 | 199k | d[2] = _mm_cvtsi32_si128(loadu_int32(data + 4 * stride)); |
1576 | 199k | const __m128i src_34a = _mm_unpacklo_epi32(d[3], d[2]); |
1577 | | |
1578 | 199k | s[1] = _mm_unpacklo_epi8(src_23a, src_34a); |
1579 | | |
1580 | 199k | *res = convolve_lowbd_4tap_ssse3(s, coeffs); |
1581 | 199k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_4tap_4x2_ssse3 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_4tap_4x2_ssse3 Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_4tap_4x2_ssse3 convolve_avx2.c:convolve_y_4tap_4x2_ssse3 Line | Count | Source | 1572 | 199k | __m128i *res) { | 1573 | 199k | d[3] = _mm_cvtsi32_si128(loadu_int32(data + 3 * stride)); | 1574 | 199k | const __m128i src_23a = _mm_unpacklo_epi32(d[2], d[3]); | 1575 | 199k | d[2] = _mm_cvtsi32_si128(loadu_int32(data + 4 * stride)); | 1576 | 199k | const __m128i src_34a = _mm_unpacklo_epi32(d[3], d[2]); | 1577 | | | 1578 | 199k | s[1] = _mm_unpacklo_epi8(src_23a, src_34a); | 1579 | | | 1580 | 199k | *res = convolve_lowbd_4tap_ssse3(s, coeffs); | 1581 | 199k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_4tap_4x2_ssse3 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_4tap_4x2_ssse3 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_4tap_4x2_ssse3 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_4tap_4x2_ssse3 |
1582 | | |
1583 | | static inline void convolve_y_6tap_4x2_ssse3(const uint8_t *const data, |
1584 | | const ptrdiff_t stride, |
1585 | | const __m128i coeffs[3], |
1586 | | __m128i d[6], __m128i s[3], |
1587 | 299k | __m128i *res) { |
1588 | 299k | d[5] = _mm_cvtsi32_si128(loadu_int32(data + 5 * stride)); |
1589 | 299k | const __m128i src_45a = _mm_unpacklo_epi32(d[4], d[5]); |
1590 | 299k | d[4] = _mm_cvtsi32_si128(loadu_int32(data + 6 * stride)); |
1591 | 299k | const __m128i src_56a = _mm_unpacklo_epi32(d[5], d[4]); |
1592 | | |
1593 | 299k | s[2] = _mm_unpacklo_epi8(src_45a, src_56a); |
1594 | | |
1595 | 299k | *res = convolve_lowbd_6tap_ssse3(s, coeffs); |
1596 | 299k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_6tap_4x2_ssse3 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_6tap_4x2_ssse3 Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_6tap_4x2_ssse3 convolve_avx2.c:convolve_y_6tap_4x2_ssse3 Line | Count | Source | 1587 | 299k | __m128i *res) { | 1588 | 299k | d[5] = _mm_cvtsi32_si128(loadu_int32(data + 5 * stride)); | 1589 | 299k | const __m128i src_45a = _mm_unpacklo_epi32(d[4], d[5]); | 1590 | 299k | d[4] = _mm_cvtsi32_si128(loadu_int32(data + 6 * stride)); | 1591 | 299k | const __m128i src_56a = _mm_unpacklo_epi32(d[5], d[4]); | 1592 | | | 1593 | 299k | s[2] = _mm_unpacklo_epi8(src_45a, src_56a); | 1594 | | | 1595 | 299k | *res = convolve_lowbd_6tap_ssse3(s, coeffs); | 1596 | 299k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_6tap_4x2_ssse3 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_6tap_4x2_ssse3 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_6tap_4x2_ssse3 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_6tap_4x2_ssse3 |
1597 | | |
1598 | | static inline void convolve_y_8tap_4x2_ssse3(const uint8_t *const data, |
1599 | | const ptrdiff_t stride, |
1600 | | const __m128i coeffs[4], |
1601 | | __m128i d[8], __m128i s[4], |
1602 | 25.4k | __m128i *res) { |
1603 | 25.4k | d[7] = _mm_cvtsi32_si128(loadu_int32(data + 7 * stride)); |
1604 | 25.4k | const __m128i src_67a = _mm_unpacklo_epi32(d[6], d[7]); |
1605 | 25.4k | d[6] = _mm_cvtsi32_si128(loadu_int32(data + 8 * stride)); |
1606 | 25.4k | const __m128i src_78a = _mm_unpacklo_epi32(d[7], d[6]); |
1607 | | |
1608 | 25.4k | s[3] = _mm_unpacklo_epi8(src_67a, src_78a); |
1609 | | |
1610 | 25.4k | res[0] = convolve_lowbd_ssse3(s, coeffs); |
1611 | 25.4k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_8tap_4x2_ssse3 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_8tap_4x2_ssse3 Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_8tap_4x2_ssse3 convolve_avx2.c:convolve_y_8tap_4x2_ssse3 Line | Count | Source | 1602 | 25.4k | __m128i *res) { | 1603 | 25.4k | d[7] = _mm_cvtsi32_si128(loadu_int32(data + 7 * stride)); | 1604 | 25.4k | const __m128i src_67a = _mm_unpacklo_epi32(d[6], d[7]); | 1605 | 25.4k | d[6] = _mm_cvtsi32_si128(loadu_int32(data + 8 * stride)); | 1606 | 25.4k | const __m128i src_78a = _mm_unpacklo_epi32(d[7], d[6]); | 1607 | | | 1608 | 25.4k | s[3] = _mm_unpacklo_epi8(src_67a, src_78a); | 1609 | | | 1610 | 25.4k | res[0] = convolve_lowbd_ssse3(s, coeffs); | 1611 | 25.4k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_8tap_4x2_ssse3 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_8tap_4x2_ssse3 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_8tap_4x2_ssse3 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_8tap_4x2_ssse3 |
1612 | | |
// One step of the vertical 2-tap filter for an 8-pixel-wide block, producing
// two output rows (AVX2).
//
// data, stride: 16 bytes are loaded (unaligned) at data + 1 * stride and at
//               data + 2 * stride; only the low 8 bytes of each row feed the
//               result because just _mm256_unpacklo_epi8() is used.
// coeffs:       coeffs[0] is the signed-byte operand of
//               _mm256_maddubs_epi16().
// d:            rolling row cache. d[0] is consumed as the first row
//               (NOTE(review): presumably the row at data + 0 * stride, set
//               up by the caller - confirm). On return d[1] holds the row
//               from data + 1 * stride and d[0] the row from
//               data + 2 * stride.
// res:          receives the 16-bit multiply-add sums; the low 128-bit lane
//               comes from rows (a, b), the high lane from rows (b, c).
static inline void convolve_y_2tap_8x2_avx2(const uint8_t *const data,
                                            const ptrdiff_t stride,
                                            const __m256i *coeffs, __m128i d[2],
                                            __m256i *res) {
  const __m128i row_a = d[0];
  const __m128i row_b = _mm_loadu_si128((const __m128i *)(data + stride));
  d[1] = row_b;
  const __m128i row_c = _mm_loadu_si128((const __m128i *)(data + 2 * stride));
  d[0] = row_c;

  // Low lane = older row, high lane = newer row; the per-lane byte
  // interleave then pairs vertically neighbouring pixels.
  const __m256i lanes_ab = _mm256_setr_m128i(row_a, row_b);
  const __m256i lanes_bc = _mm256_setr_m128i(row_b, row_c);
  const __m256i interleaved = _mm256_unpacklo_epi8(lanes_ab, lanes_bc);

  *res = _mm256_maddubs_epi16(interleaved, coeffs[0]);
}
1626 | | |
1627 | | static inline void convolve_y_4tap_8x2_avx2(const uint8_t *const data, |
1628 | | const ptrdiff_t stride, |
1629 | | const __m256i coeffs[2], |
1630 | | __m128i d[4], __m256i s[2], |
1631 | 180k | __m256i *res) { |
1632 | 180k | d[3] = _mm_loadu_si128((__m128i *)(data + 3 * stride)); |
1633 | 180k | const __m256i src_23a = _mm256_setr_m128i(d[2], d[3]); |
1634 | 180k | d[2] = _mm_loadu_si128((__m128i *)(data + 4 * stride)); |
1635 | 180k | const __m256i src_34a = _mm256_setr_m128i(d[3], d[2]); |
1636 | | |
1637 | 180k | s[1] = _mm256_unpacklo_epi8(src_23a, src_34a); |
1638 | | |
1639 | 180k | *res = convolve_lowbd_4tap(s, coeffs); |
1640 | 180k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_4tap_8x2_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_4tap_8x2_avx2 Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_4tap_8x2_avx2 convolve_avx2.c:convolve_y_4tap_8x2_avx2 Line | Count | Source | 1631 | 180k | __m256i *res) { | 1632 | 180k | d[3] = _mm_loadu_si128((__m128i *)(data + 3 * stride)); | 1633 | 180k | const __m256i src_23a = _mm256_setr_m128i(d[2], d[3]); | 1634 | 180k | d[2] = _mm_loadu_si128((__m128i *)(data + 4 * stride)); | 1635 | 180k | const __m256i src_34a = _mm256_setr_m128i(d[3], d[2]); | 1636 | | | 1637 | 180k | s[1] = _mm256_unpacklo_epi8(src_23a, src_34a); | 1638 | | | 1639 | 180k | *res = convolve_lowbd_4tap(s, coeffs); | 1640 | 180k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_4tap_8x2_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_4tap_8x2_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_4tap_8x2_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_4tap_8x2_avx2 |
1641 | | |
1642 | | static inline void convolve_y_6tap_8x2_avx2(const uint8_t *const data, |
1643 | | const ptrdiff_t stride, |
1644 | | const __m256i coeffs[3], |
1645 | | __m128i d[6], __m256i s[3], |
1646 | 407k | __m256i *res) { |
1647 | 407k | d[5] = _mm_loadu_si128((__m128i *)(data + 5 * stride)); |
1648 | 407k | const __m256i src_45a = _mm256_setr_m128i(d[4], d[5]); |
1649 | 407k | d[4] = _mm_loadu_si128((__m128i *)(data + 6 * stride)); |
1650 | 407k | const __m256i src_56a = _mm256_setr_m128i(d[5], d[4]); |
1651 | | |
1652 | 407k | s[2] = _mm256_unpacklo_epi8(src_45a, src_56a); |
1653 | | |
1654 | 407k | *res = convolve_lowbd_6tap(s, coeffs); |
1655 | 407k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_6tap_8x2_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_6tap_8x2_avx2 Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_6tap_8x2_avx2 convolve_avx2.c:convolve_y_6tap_8x2_avx2 Line | Count | Source | 1646 | 407k | __m256i *res) { | 1647 | 407k | d[5] = _mm_loadu_si128((__m128i *)(data + 5 * stride)); | 1648 | 407k | const __m256i src_45a = _mm256_setr_m128i(d[4], d[5]); | 1649 | 407k | d[4] = _mm_loadu_si128((__m128i *)(data + 6 * stride)); | 1650 | 407k | const __m256i src_56a = _mm256_setr_m128i(d[5], d[4]); | 1651 | | | 1652 | 407k | s[2] = _mm256_unpacklo_epi8(src_45a, src_56a); | 1653 | | | 1654 | 407k | *res = convolve_lowbd_6tap(s, coeffs); | 1655 | 407k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_6tap_8x2_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_6tap_8x2_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_6tap_8x2_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_6tap_8x2_avx2 |
1656 | | |
1657 | | static inline void convolve_y_8tap_8x2_avx2(const uint8_t *const data, |
1658 | | const ptrdiff_t stride, |
1659 | | const __m256i coeffs[4], |
1660 | | __m128i d[8], __m256i s[4], |
1661 | 24.1k | __m256i *res) { |
1662 | 24.1k | d[7] = _mm_loadu_si128((__m128i *)(data + 7 * stride)); |
1663 | 24.1k | const __m256i src_67a = _mm256_setr_m128i(d[6], d[7]); |
1664 | 24.1k | d[6] = _mm_loadu_si128((__m128i *)(data + 8 * stride)); |
1665 | 24.1k | const __m256i src_78a = _mm256_setr_m128i(d[7], d[6]); |
1666 | | |
1667 | 24.1k | s[3] = _mm256_unpacklo_epi8(src_67a, src_78a); |
1668 | | |
1669 | 24.1k | *res = convolve_lowbd(s, coeffs); |
1670 | 24.1k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_8tap_8x2_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_8tap_8x2_avx2 Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_8tap_8x2_avx2 convolve_avx2.c:convolve_y_8tap_8x2_avx2 Line | Count | Source | 1661 | 24.1k | __m256i *res) { | 1662 | 24.1k | d[7] = _mm_loadu_si128((__m128i *)(data + 7 * stride)); | 1663 | 24.1k | const __m256i src_67a = _mm256_setr_m128i(d[6], d[7]); | 1664 | 24.1k | d[6] = _mm_loadu_si128((__m128i *)(data + 8 * stride)); | 1665 | 24.1k | const __m256i src_78a = _mm256_setr_m128i(d[7], d[6]); | 1666 | | | 1667 | 24.1k | s[3] = _mm256_unpacklo_epi8(src_67a, src_78a); | 1668 | | | 1669 | 24.1k | *res = convolve_lowbd(s, coeffs); | 1670 | 24.1k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_8tap_8x2_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_8tap_8x2_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_8tap_8x2_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_8tap_8x2_avx2 |
1671 | | |
// One step of the vertical 2-tap filter for a 16-pixel-wide block, producing
// two output rows (AVX2).
//
// data, stride: 16 bytes are loaded (unaligned) at data + 1 * stride and at
//               data + 2 * stride.
// coeffs:       coeffs[0] is the signed-byte operand of
//               _mm256_maddubs_epi16().
// d:            rolling row cache. d[0] is consumed as the first row
//               (NOTE(review): presumably the row at data + 0 * stride, set
//               up by the caller - confirm). On return d[1] holds the row
//               from data + 1 * stride and d[0] the row from
//               data + 2 * stride.
// res:          res[0] receives the sums built from the low 8 bytes of each
//               row, res[1] those built from the high 8 bytes. No rounding
//               is done here.
static inline void convolve_y_2tap_16x2_avx2(const uint8_t *const data,
                                             const ptrdiff_t stride,
                                             const __m256i *coeffs,
                                             __m128i d[2], __m256i res[2]) {
  const __m128i row_a = d[0];
  const __m128i row_b = _mm_loadu_si128((const __m128i *)(data + stride));
  d[1] = row_b;
  const __m128i row_c = _mm_loadu_si128((const __m128i *)(data + 2 * stride));
  d[0] = row_c;

  // Low lane = older row, high lane = newer row; the per-lane byte
  // interleaves then pair vertically neighbouring pixels.
  const __m256i lanes_ab = _mm256_setr_m128i(row_a, row_b);
  const __m256i lanes_bc = _mm256_setr_m128i(row_b, row_c);
  const __m256i interleaved_lo = _mm256_unpacklo_epi8(lanes_ab, lanes_bc);
  const __m256i interleaved_hi = _mm256_unpackhi_epi8(lanes_ab, lanes_bc);

  res[0] = _mm256_maddubs_epi16(interleaved_lo, coeffs[0]);
  res[1] = _mm256_maddubs_epi16(interleaved_hi, coeffs[0]);
}
1687 | | |
1688 | | static inline void convolve_y_4tap_16x2_avx2(const uint8_t *const data, |
1689 | | const ptrdiff_t stride, |
1690 | | const __m256i coeffs[2], |
1691 | | __m128i d[4], __m256i s[4], |
1692 | 111k | __m256i res[2]) { |
1693 | 111k | d[3] = _mm_loadu_si128((__m128i *)(data + 3 * stride)); |
1694 | 111k | const __m256i src_23a = _mm256_setr_m128i(d[2], d[3]); |
1695 | 111k | d[2] = _mm_loadu_si128((__m128i *)(data + 4 * stride)); |
1696 | 111k | const __m256i src_34a = _mm256_setr_m128i(d[3], d[2]); |
1697 | | |
1698 | 111k | s[1] = _mm256_unpacklo_epi8(src_23a, src_34a); |
1699 | 111k | s[3] = _mm256_unpackhi_epi8(src_23a, src_34a); |
1700 | | |
1701 | 111k | res[0] = convolve_lowbd_4tap(s, coeffs); |
1702 | 111k | res[1] = convolve_lowbd_4tap(s + 2, coeffs); |
1703 | 111k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_4tap_16x2_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_4tap_16x2_avx2 Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_4tap_16x2_avx2 convolve_avx2.c:convolve_y_4tap_16x2_avx2 Line | Count | Source | 1692 | 111k | __m256i res[2]) { | 1693 | 111k | d[3] = _mm_loadu_si128((__m128i *)(data + 3 * stride)); | 1694 | 111k | const __m256i src_23a = _mm256_setr_m128i(d[2], d[3]); | 1695 | 111k | d[2] = _mm_loadu_si128((__m128i *)(data + 4 * stride)); | 1696 | 111k | const __m256i src_34a = _mm256_setr_m128i(d[3], d[2]); | 1697 | | | 1698 | 111k | s[1] = _mm256_unpacklo_epi8(src_23a, src_34a); | 1699 | 111k | s[3] = _mm256_unpackhi_epi8(src_23a, src_34a); | 1700 | | | 1701 | 111k | res[0] = convolve_lowbd_4tap(s, coeffs); | 1702 | 111k | res[1] = convolve_lowbd_4tap(s + 2, coeffs); | 1703 | 111k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_4tap_16x2_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_4tap_16x2_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_4tap_16x2_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_4tap_16x2_avx2 |
1704 | | |
1705 | | static inline void convolve_y_6tap_16x2_avx2(const uint8_t *const data, |
1706 | | const ptrdiff_t stride, |
1707 | | const __m256i coeffs[3], |
1708 | | __m128i d[6], __m256i s[6], |
1709 | 1.68M | __m256i res[2]) { |
1710 | 1.68M | d[5] = _mm_loadu_si128((__m128i *)(data + 5 * stride)); |
1711 | 1.68M | const __m256i src_45a = _mm256_setr_m128i(d[4], d[5]); |
1712 | 1.68M | d[4] = _mm_loadu_si128((__m128i *)(data + 6 * stride)); |
1713 | 1.68M | const __m256i src_56a = _mm256_setr_m128i(d[5], d[4]); |
1714 | | |
1715 | 1.68M | s[2] = _mm256_unpacklo_epi8(src_45a, src_56a); |
1716 | 1.68M | s[5] = _mm256_unpackhi_epi8(src_45a, src_56a); |
1717 | | |
1718 | 1.68M | res[0] = convolve_lowbd_6tap(s, coeffs); |
1719 | 1.68M | res[1] = convolve_lowbd_6tap(s + 3, coeffs); |
1720 | 1.68M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_6tap_16x2_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_6tap_16x2_avx2 Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_6tap_16x2_avx2 convolve_avx2.c:convolve_y_6tap_16x2_avx2 Line | Count | Source | 1709 | 1.68M | __m256i res[2]) { | 1710 | 1.68M | d[5] = _mm_loadu_si128((__m128i *)(data + 5 * stride)); | 1711 | 1.68M | const __m256i src_45a = _mm256_setr_m128i(d[4], d[5]); | 1712 | 1.68M | d[4] = _mm_loadu_si128((__m128i *)(data + 6 * stride)); | 1713 | 1.68M | const __m256i src_56a = _mm256_setr_m128i(d[5], d[4]); | 1714 | | | 1715 | 1.68M | s[2] = _mm256_unpacklo_epi8(src_45a, src_56a); | 1716 | 1.68M | s[5] = _mm256_unpackhi_epi8(src_45a, src_56a); | 1717 | | | 1718 | 1.68M | res[0] = convolve_lowbd_6tap(s, coeffs); | 1719 | 1.68M | res[1] = convolve_lowbd_6tap(s + 3, coeffs); | 1720 | 1.68M | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_6tap_16x2_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_6tap_16x2_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_6tap_16x2_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_6tap_16x2_avx2 |
1721 | | |
1722 | | static inline void convolve_y_8tap_16x2_avx2(const uint8_t *const data, |
1723 | | const ptrdiff_t stride, |
1724 | | const __m256i coeffs[4], |
1725 | | __m128i d[8], __m256i s[8], |
1726 | 79.1k | __m256i res[2]) { |
1727 | 79.1k | d[7] = _mm_loadu_si128((__m128i *)(data + 7 * stride)); |
1728 | 79.1k | const __m256i src_67a = _mm256_setr_m128i(d[6], d[7]); |
1729 | 79.1k | d[6] = _mm_loadu_si128((__m128i *)(data + 8 * stride)); |
1730 | 79.1k | const __m256i src_78a = _mm256_setr_m128i(d[7], d[6]); |
1731 | | |
1732 | 79.1k | s[3] = _mm256_unpacklo_epi8(src_67a, src_78a); |
1733 | 79.1k | s[7] = _mm256_unpackhi_epi8(src_67a, src_78a); |
1734 | | |
1735 | 79.1k | res[0] = convolve_lowbd(s, coeffs); |
1736 | 79.1k | res[1] = convolve_lowbd(s + 4, coeffs); |
1737 | 79.1k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_8tap_16x2_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_8tap_16x2_avx2 Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_8tap_16x2_avx2 convolve_avx2.c:convolve_y_8tap_16x2_avx2 Line | Count | Source | 1726 | 79.1k | __m256i res[2]) { | 1727 | 79.1k | d[7] = _mm_loadu_si128((__m128i *)(data + 7 * stride)); | 1728 | 79.1k | const __m256i src_67a = _mm256_setr_m128i(d[6], d[7]); | 1729 | 79.1k | d[6] = _mm_loadu_si128((__m128i *)(data + 8 * stride)); | 1730 | 79.1k | const __m256i src_78a = _mm256_setr_m128i(d[7], d[6]); | 1731 | | | 1732 | 79.1k | s[3] = _mm256_unpacklo_epi8(src_67a, src_78a); | 1733 | 79.1k | s[7] = _mm256_unpackhi_epi8(src_67a, src_78a); | 1734 | | | 1735 | 79.1k | res[0] = convolve_lowbd(s, coeffs); | 1736 | 79.1k | res[1] = convolve_lowbd(s + 4, coeffs); | 1737 | 79.1k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_8tap_16x2_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_8tap_16x2_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_8tap_16x2_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_8tap_16x2_avx2 |
1738 | | |
// Vertical 2-tap step for a 32-pixel-wide column, yielding two output rows.
//
// Reads 32 bytes at data + 1 * stride and at data + 2 * stride.  Each output
// row is formed by byte-interleaving two vertically adjacent rows and feeding
// the result to _mm256_maddubs_epi16 with coeffs[0], which multiplies each
// unsigned source byte by the matching signed coefficient byte and adds
// adjacent products into 16-bit sums.
//
// State carried through the arguments:
//   d[1] <- row 1, d[0] <- row 2 (d[0] must hold the preceding row on entry).
//
// res[0], res[1]: low / high interleave of (d[0] on entry, row 1).
// res[2], res[3]: low / high interleave of (row 1, row 2).
static inline void convolve_y_2tap_32x2_avx2(const uint8_t *const data,
                                             const ptrdiff_t stride,
                                             const __m256i *coeffs,
                                             __m256i d[2], __m256i res[4]) {
  const __m256i *const row1 = (const __m256i *)(data + 1 * stride);
  const __m256i *const row2 = (const __m256i *)(data + 2 * stride);

  // First output row: pair the row carried in d[0] with the row below it.
  d[1] = _mm256_loadu_si256(row1);
  const __m256i lo_01 = _mm256_unpacklo_epi8(d[0], d[1]);
  const __m256i hi_01 = _mm256_unpackhi_epi8(d[0], d[1]);

  // Second output row: d[0] is refreshed only after the first pair has been
  // interleaved.
  d[0] = _mm256_loadu_si256(row2);
  const __m256i lo_12 = _mm256_unpacklo_epi8(d[1], d[0]);
  const __m256i hi_12 = _mm256_unpackhi_epi8(d[1], d[0]);

  res[0] = _mm256_maddubs_epi16(lo_01, coeffs[0]);
  res[1] = _mm256_maddubs_epi16(hi_01, coeffs[0]);
  res[2] = _mm256_maddubs_epi16(lo_12, coeffs[0]);
  res[3] = _mm256_maddubs_epi16(hi_12, coeffs[0]);
}
1755 | | |
1756 | | static inline void convolve_y_4tap_32x2_avx2(const uint8_t *const data, |
1757 | | const ptrdiff_t stride, |
1758 | | const __m256i coeffs[2], |
1759 | | __m256i d[4], __m256i s1[4], |
1760 | 132k | __m256i s2[4], __m256i res[4]) { |
1761 | 132k | d[3] = _mm256_loadu_si256((__m256i *)(data + 3 * stride)); |
1762 | 132k | s1[1] = _mm256_unpacklo_epi8(d[2], d[3]); |
1763 | 132k | s1[3] = _mm256_unpackhi_epi8(d[2], d[3]); |
1764 | 132k | d[2] = _mm256_loadu_si256((__m256i *)(data + 4 * stride)); |
1765 | 132k | s2[1] = _mm256_unpacklo_epi8(d[3], d[2]); |
1766 | 132k | s2[3] = _mm256_unpackhi_epi8(d[3], d[2]); |
1767 | | |
1768 | 132k | res[0] = convolve_lowbd_4tap(s1, coeffs); |
1769 | 132k | res[1] = convolve_lowbd_4tap(s1 + 2, coeffs); |
1770 | 132k | res[2] = convolve_lowbd_4tap(s2, coeffs); |
1771 | 132k | res[3] = convolve_lowbd_4tap(s2 + 2, coeffs); |
1772 | 132k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_4tap_32x2_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_4tap_32x2_avx2 Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_4tap_32x2_avx2 convolve_avx2.c:convolve_y_4tap_32x2_avx2 Line | Count | Source | 1760 | 132k | __m256i s2[4], __m256i res[4]) { | 1761 | 132k | d[3] = _mm256_loadu_si256((__m256i *)(data + 3 * stride)); | 1762 | 132k | s1[1] = _mm256_unpacklo_epi8(d[2], d[3]); | 1763 | 132k | s1[3] = _mm256_unpackhi_epi8(d[2], d[3]); | 1764 | 132k | d[2] = _mm256_loadu_si256((__m256i *)(data + 4 * stride)); | 1765 | 132k | s2[1] = _mm256_unpacklo_epi8(d[3], d[2]); | 1766 | 132k | s2[3] = _mm256_unpackhi_epi8(d[3], d[2]); | 1767 | | | 1768 | 132k | res[0] = convolve_lowbd_4tap(s1, coeffs); | 1769 | 132k | res[1] = convolve_lowbd_4tap(s1 + 2, coeffs); | 1770 | 132k | res[2] = convolve_lowbd_4tap(s2, coeffs); | 1771 | 132k | res[3] = convolve_lowbd_4tap(s2 + 2, coeffs); | 1772 | 132k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_4tap_32x2_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_4tap_32x2_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_4tap_32x2_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_4tap_32x2_avx2 |
1773 | | #endif // AOM_AOM_DSP_X86_CONVOLVE_AVX2_H_ |