/src/aom/aom_dsp/x86/convolve_avx2.h
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) 2018, Alliance for Open Media. All rights reserved. |
3 | | * |
4 | | * This source code is subject to the terms of the BSD 2 Clause License and |
5 | | * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
6 | | * was not distributed with this source code in the LICENSE file, you can |
7 | | * obtain it at www.aomedia.org/license/software. If the Alliance for Open |
8 | | * Media Patent License 1.0 was not distributed with this source code in the |
9 | | * PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
10 | | */ |
11 | | |
12 | | #ifndef AOM_AOM_DSP_X86_CONVOLVE_AVX2_H_ |
13 | | #define AOM_AOM_DSP_X86_CONVOLVE_AVX2_H_ |
14 | | |
15 | | #include <immintrin.h> |
16 | | |
17 | | #include "aom_ports/mem.h" |
18 | | |
19 | | #include "aom_dsp/x86/mem_sse2.h" |
20 | | #include "aom_dsp/x86/synonyms.h" |
21 | | |
22 | | #include "av1/common/convolve.h" |
23 | | #include "av1/common/filter.h" |
24 | | |
// Named constants with the values 32, 64 (32 << 1) and 96 (the sum of the
// other two).
// NOTE(review): judging by the names these are the offsets of the 2nd, 3rd and
// 4th 32-wide block of some buffer; the users are outside this excerpt.
25 | 527k | #define SECOND_32_BLK (32) |
26 | 448k | #define THIRD_32_BLK (32 << 1) |
27 | 224k | #define FOURTH_32_BLK (SECOND_32_BLK + THIRD_32_BLK) |
28 | | |
// 128-byte, 32-byte-aligned index table made of four 32-byte groups. Inside a
// group the same 16 bytes appear twice (once per 128-bit half) and list the
// overlapping index pairs (k, k + 1); the first pair starts at k = 0, 2, 4 and
// 6 for groups 0, 1, 2 and 3 respectively.
// NOTE(review): presumably byte-shuffle controls that pair neighbouring source
// pixels for the horizontal kernels; the consumers are outside this excerpt.
29 | | // filters for 16 |
30 | | DECLARE_ALIGNED(32, static const uint8_t, filt_global_avx2[]) = { |
31 | | 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 0, 1, 1, |
32 | | 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 2, 3, 3, 4, 4, 5, |
33 | | 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 2, 3, 3, 4, 4, 5, 5, 6, 6, |
34 | | 7, 7, 8, 8, 9, 9, 10, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, |
35 | | 10, 11, 11, 12, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, |
36 | | 12, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 6, 7, |
37 | | 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 |
38 | | }; |
39 | | |
// 64-byte, 32-byte-aligned index table made of two 32-byte groups. Each group
// repeats 16 bytes twice (once per 128-bit half); the 16 bytes are four sliding
// windows of four consecutive indices (k .. k + 3), with k = 0..3 in the first
// group and k = 4..7 in the second.
// NOTE(review): presumably byte-shuffle controls; consumers are not visible here.
40 | | DECLARE_ALIGNED(32, static const uint8_t, filt_d4_global_avx2[]) = { |
41 | | 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 0, 1, 2, 3, 1, 2, |
42 | | 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, |
43 | | 7, 8, 9, 10, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, |
44 | | }; |
45 | | |
// 32-byte, 32-byte-aligned index table: four sliding windows of four
// consecutive indices (k .. k + 3) for k = 2..5, written once per 128-bit half.
// NOTE(review): presumably a byte-shuffle control; consumers are not visible here.
46 | | DECLARE_ALIGNED(32, static const uint8_t, filt4_d4_global_avx2[]) = { |
47 | | 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, |
48 | | 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, |
49 | | }; |
50 | | |
// 32-byte, 32-byte-aligned index table: the indices 3..10, each followed by
// 255, written once per 128-bit half.
// NOTE(review): if this is used as a byte-shuffle control, an index with the
// high bit set (255) yields a zero byte, which would widen source bytes 3..10
// to 16 bits; confirm at the use site, which is outside this excerpt.
51 | | DECLARE_ALIGNED(32, static const uint8_t, filt_center_global_avx2[32]) = { |
52 | | 3, 255, 4, 255, 5, 255, 6, 255, 7, 255, 8, 255, 9, 255, 10, 255, |
53 | | 3, 255, 4, 255, 5, 255, 6, 255, 7, 255, 8, 255, 9, 255, 10, 255 |
54 | | }; |
55 | | |
// 16-byte index table (declared with 32-byte alignment): overlapping pairs
// (k, k + 1) for k = 0..3 in the first eight bytes and for k = 8..11 in the
// last eight.
// NOTE(review): presumably a byte-shuffle control applied to a register that
// holds one 8-byte row in each half; confirm at the use site.
56 | | DECLARE_ALIGNED(32, static const uint8_t, |
57 | | filt1_global_sse2[16]) = { 0, 1, 1, 2, 2, 3, 3, 4, |
58 | | 8, 9, 9, 10, 10, 11, 11, 12 }; |
59 | | |
// 16-byte index table (declared with 32-byte alignment): overlapping pairs
// (k, k + 1) for k = 2..5 in the first eight bytes and for k = 10..13 in the
// last eight, i.e. the filt1_global_sse2 pattern shifted by two.
// NOTE(review): presumably a byte-shuffle control; confirm at the use site.
60 | | DECLARE_ALIGNED(32, static const uint8_t, |
61 | | filt2_global_sse2[16]) = { 2, 3, 3, 4, 4, 5, 5, 6, |
62 | | 10, 11, 11, 12, 12, 13, 13, 14 }; |
63 | | |
// 16-byte index table (declared with 32-byte alignment): pairs (0,1), (1,2)
// followed by pairs (8,9), (9,10); the last eight bytes are all 0.
// NOTE(review): presumably a byte-shuffle control for a narrow (two outputs
// per row) case; confirm at the use site.
64 | | DECLARE_ALIGNED(32, static const uint8_t, |
65 | | filt3_global_sse2[16]) = { 0, 1, 1, 2, 8, 9, 9, 10, |
66 | | 0, 0, 0, 0, 0, 0, 0, 0 }; |
67 | | |
// 16-byte index table (declared with 32-byte alignment): pairs (2,3), (3,4)
// followed by pairs (10,11), (11,12), i.e. the filt3_global_sse2 pattern
// shifted by two; the last eight bytes are all 0.
// NOTE(review): presumably a byte-shuffle control; confirm at the use site.
68 | | DECLARE_ALIGNED(32, static const uint8_t, |
69 | | filt4_global_sse2[16]) = { 2, 3, 3, 4, 10, 11, 11, 12, |
70 | | 0, 0, 0, 0, 0, 0, 0, 0 }; |
71 | | |
// 16-byte index table (declared with 32-byte alignment): pairs (0,1), (1,2)
// followed by pairs (4,5), (5,6); the last eight bytes are all 0.
// NOTE(review): presumably a byte-shuffle control; confirm at the use site.
72 | | DECLARE_ALIGNED(32, static const uint8_t, |
73 | | filt5_global_sse2[16]) = { 0, 1, 1, 2, 4, 5, 5, 6, |
74 | | 0, 0, 0, 0, 0, 0, 0, 0 }; |
75 | | |
// 32-byte, 32-byte-aligned index table: overlapping pairs (k, k + 1) for
// k = 0..7, written once per 128-bit half. Same bytes as the first 32-byte
// group of filt_global_avx2.
// NOTE(review): presumably a byte-shuffle control; confirm at the use site.
76 | | DECLARE_ALIGNED(32, static const uint8_t, |
77 | | filt1_global_avx2[32]) = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, |
78 | | 6, 6, 7, 7, 8, 0, 1, 1, 2, 2, 3, |
79 | | 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; |
80 | | |
// 32-byte, 32-byte-aligned index table: overlapping pairs (k, k + 1) for
// k = 2..9, written once per 128-bit half. Same bytes as the second 32-byte
// group of filt_global_avx2.
// NOTE(review): presumably a byte-shuffle control; confirm at the use site.
81 | | DECLARE_ALIGNED(32, static const uint8_t, |
82 | | filt2_global_avx2[32]) = { 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, |
83 | | 8, 8, 9, 9, 10, 2, 3, 3, 4, 4, 5, |
84 | | 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 }; |
85 | | |
// 32-byte, 32-byte-aligned index table: overlapping pairs (k, k + 1) for
// k = 4..11, written once per 128-bit half. Same bytes as the third 32-byte
// group of filt_global_avx2.
// NOTE(review): presumably a byte-shuffle control; confirm at the use site.
86 | | DECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = { |
87 | | 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, |
88 | | 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 |
89 | | }; |
90 | | |
// 32-byte, 32-byte-aligned index table: overlapping pairs (k, k + 1) for
// k = 6..13, written once per 128-bit half. Same bytes as the fourth 32-byte
// group of filt_global_avx2.
// NOTE(review): presumably a byte-shuffle control; confirm at the use site.
91 | | DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = { |
92 | | 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, |
93 | | 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 |
94 | | }; |
95 | | |
// Horizontal pass of the 4-wide path, parameterised by the kernel
// CONVOLVE_LOWBD(data, coeffs_h, filt).
// Names taken from the expansion site: i, im_h, src_ptr, src_stride, coeffs_h,
// filt, round_const_h and im_block.
//  - Loop (i = 0, 2, ... while i < im_h - 2): two source rows are gathered into
//    one register by load_8bit_8x2_to_1_reg_sse2, filtered, offset by
//    round_const_h, shifted right arithmetically by 2 and written as 128 bits
//    to im_block[i * 4]. _mm_store_si128 is an aligned store, so that address
//    has to be 16-byte aligned.
//  - Tail: exactly one more row, at the value i holds after the loop, is loaded
//    (64 bits), filtered the same way and written as 64 bits.
// Because i advances by 2, loop plus tail cover every row only when im_h is
// odd; for an even im_h row im_h - 1 is never written.
// NOTE(review): presumably callers always pass an odd im_h; confirm.
// The expansion is not wrapped in a block and declares data_1 and res at the
// expansion site, so it can appear only once per scope.
96 | | #define CONVOLVE_SR_HOR_FILTER_W4(CONVOLVE_LOWBD) \ |
97 | 2.90M | for (i = 0; i < (im_h - 2); i += 2) { \ |
98 | 2.35M | __m128i data = \ |
99 | 2.35M | load_8bit_8x2_to_1_reg_sse2(&src_ptr[(i * src_stride)], src_stride); \ |
100 | 2.35M | __m128i res = CONVOLVE_LOWBD(data, coeffs_h, filt); \ |
101 | 2.35M | res = _mm_srai_epi16(_mm_add_epi16(res, round_const_h), 2); \ |
102 | 2.35M | _mm_store_si128((__m128i *)&im_block[i * 4], res); \ |
103 | 2.35M | } \ |
104 | 551k | __m128i data_1 = _mm_loadl_epi64((__m128i *)&src_ptr[(i * src_stride)]); \ |
105 | 551k | __m128i res = CONVOLVE_LOWBD(data_1, coeffs_h, filt); \ |
106 | 551k | res = _mm_srai_epi16(_mm_add_epi16(res, round_const_h), 2); \ |
107 | 551k | _mm_storel_epi64((__m128i *)&im_block[i * 4], res); |
108 | | |
// CONVOLVE_SR_HOR_FILTER_W4 instantiated with the kernel
// convolve_lowbd_x_2tap_ssse3 (defined outside this excerpt).
109 | | #define CONVOLVE_SR_HOR_FILTER_2TAP_W4 \ |
110 | 22.8k | CONVOLVE_SR_HOR_FILTER_W4(convolve_lowbd_x_2tap_ssse3) |
111 | | |
// CONVOLVE_SR_HOR_FILTER_W4 instantiated with the kernel
// convolve_lowbd_x_4tap_ssse3 (defined outside this excerpt).
112 | | #define CONVOLVE_SR_HOR_FILTER_4TAP_W4 \ |
113 | 528k | CONVOLVE_SR_HOR_FILTER_W4(convolve_lowbd_x_4tap_ssse3) |
114 | | |
// Final rounding and store of the vertical pass for blocks of width 4 or 2;
// writes two destination rows.
//   w              block width: 4 stores 4 bytes per row; otherwise w == 2 is
//                  asserted and 2 bytes per row are stored.
//   res            32-bit sums. The low 128-bit half feeds the row at dst, the
//                  high half the row at dst + dst_stride.
//   dst            address of the first destination row.
//   dst_stride     byte distance from the first to the second row.
//   round_const_v  added to every sum before the arithmetic right shift by 11.
// After the shift the values are narrowed to 16 bits with signed saturation and
// then to 8 bits with unsigned saturation.
// NOTE(review): the w == 2 path writes through a uint16_t * cast of dst, which
// relies on the target accepting an unaligned, type-punned 16-bit store.
// NOTE(review): assert() is used, but <assert.h> is not among the includes
// visible in this header; presumably it arrives through another header.
115 | | static inline void sr_2d_ver_round_and_store_w4(int w, __m256i res, |
116 | | uint8_t *dst, int dst_stride, |
117 | 1.63M | __m256i round_const_v) { |
118 | 1.63M | const __m256i res_round = |
119 | 1.63M | _mm256_srai_epi32(_mm256_add_epi32(res, round_const_v), 11); |
120 | | |
// Both pack steps receive the same operand twice, so the wanted bytes end up in
// the lowest bytes of each 128-bit half.
121 | 1.63M | const __m256i res_16bit = _mm256_packs_epi32(res_round, res_round); |
122 | 1.63M | const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit); |
123 | | |
124 | 1.63M | const __m128i r0 = _mm256_castsi256_si128(res_8b); |
125 | 1.63M | const __m128i r1 = _mm256_extracti128_si256(res_8b, 1); |
126 | | |
127 | 1.63M | __m128i *const p0 = (__m128i *)dst; |
128 | 1.63M | __m128i *const p1 = (__m128i *)(dst + dst_stride); |
129 | | |
130 | 1.63M | if (w == 4) { |
131 | 1.35M | xx_storel_32(p0, r0); |
132 | 1.35M | xx_storel_32(p1, r1); |
133 | 1.35M | } else { |
134 | 283k | assert(w == 2); |
135 | 283k | *(uint16_t *)p0 = (uint16_t)_mm_cvtsi128_si32(r0); |
136 | 283k | *(uint16_t *)p1 = (uint16_t)_mm_cvtsi128_si32(r1); |
137 | 283k | } |
138 | 1.63M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:sr_2d_ver_round_and_store_w4 Unexecuted instantiation: highbd_convolve_avx2.c:sr_2d_ver_round_and_store_w4 convolve_2d_avx2.c:sr_2d_ver_round_and_store_w4 Line | Count | Source | 117 | 1.63M | __m256i round_const_v) { | 118 | 1.63M | const __m256i res_round = | 119 | 1.63M | _mm256_srai_epi32(_mm256_add_epi32(res, round_const_v), 11); | 120 | | | 121 | 1.63M | const __m256i res_16bit = _mm256_packs_epi32(res_round, res_round); | 122 | 1.63M | const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit); | 123 | | | 124 | 1.63M | const __m128i r0 = _mm256_castsi256_si128(res_8b); | 125 | 1.63M | const __m128i r1 = _mm256_extracti128_si256(res_8b, 1); | 126 | | | 127 | 1.63M | __m128i *const p0 = (__m128i *)dst; | 128 | 1.63M | __m128i *const p1 = (__m128i *)(dst + dst_stride); | 129 | | | 130 | 1.63M | if (w == 4) { | 131 | 1.35M | xx_storel_32(p0, r0); | 132 | 1.35M | xx_storel_32(p1, r1); | 133 | 1.35M | } else { | 134 | 283k | assert(w == 2); | 135 | 283k | *(uint16_t *)p0 = (uint16_t)_mm_cvtsi128_si32(r0); | 136 | 283k | *(uint16_t *)p1 = (uint16_t)_mm_cvtsi128_si32(r1); | 137 | 283k | } | 138 | 1.63M | } |
Unexecuted instantiation: convolve_avx2.c:sr_2d_ver_round_and_store_w4 Unexecuted instantiation: jnt_convolve_avx2.c:sr_2d_ver_round_and_store_w4 Unexecuted instantiation: wiener_convolve_avx2.c:sr_2d_ver_round_and_store_w4 Unexecuted instantiation: highbd_convolve_2d_avx2.c:sr_2d_ver_round_and_store_w4 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:sr_2d_ver_round_and_store_w4 |
139 | | |
// Vertical 2-tap pass of the 4-wide path; two output rows per iteration.
// Names taken from the expansion site: i, h, w, im_block, coeffs_v, dst_ptr,
// dst_stride and round_const_v. Declares s[] at the expansion site and
// advances dst_ptr.
// im_block is read as rows of four int16 values (row r at im_block + r * 4).
// Row 0 is loaded up front; iteration i loads rows i + 1 and i + 2 and keeps
// row i + 2 in s[0] for the next iteration. After the interleave the low
// 128-bit half pairs rows (i, i + 1) and the high half rows (i + 1, i + 2);
// both are multiplied and pair-summed against coeffs_v[0], then handed to
// sr_2d_ver_round_and_store_w4.
140 | | #define CONVOLVE_SR_VER_FILTER_2TAP_W4 \ |
141 | 22.8k | __m128i s[2]; \ |
142 | 22.8k | s[0] = _mm_loadl_epi64((__m128i *)(im_block + 0 * 4)); \ |
143 | 22.8k | \ |
144 | 87.1k | for (i = 0; i < h; i += 2) { \ |
145 | 64.2k | const int16_t *data = &im_block[i * 4]; \ |
146 | 64.2k | s[1] = _mm_loadl_epi64((__m128i *)(data + 1 * 4)); \ |
147 | 64.2k | const __m256i src_0 = _mm256_setr_m128i(s[0], s[1]); \ |
148 | 64.2k | s[0] = _mm_loadl_epi64((__m128i *)(data + 2 * 4)); \ |
149 | 64.2k | const __m256i src_1 = _mm256_setr_m128i(s[1], s[0]); \ |
150 | 64.2k | const __m256i ss = _mm256_unpacklo_epi16(src_0, src_1); \ |
151 | 64.2k | \ |
152 | 64.2k | const __m256i res = _mm256_madd_epi16(ss, coeffs_v[0]); \ |
153 | 64.2k | \ |
154 | 64.2k | sr_2d_ver_round_and_store_w4(w, res, dst_ptr, dst_stride, round_const_v); \ |
155 | 64.2k | dst_ptr += 2 * dst_stride; \ |
156 | 64.2k | } |
157 | | |
// Vertical 4-tap pass of the 4-wide path; two output rows per iteration.
// Names taken from the expansion site: i, h, w, im_block, coeffs_v, dst_ptr,
// dst_stride and round_const_v. Declares s[], ss[], src_0 and src_1 at the
// expansion site and advances dst_ptr.
// im_block is read as rows of four int16 values (row r at im_block + r * 4).
// Rows 0..2 are loaded up front and ss[0] is built from them. Iteration i loads
// rows i + 3 and i + 4, builds ss[1] (low half pairs rows (i + 2, i + 3), high
// half rows (i + 3, i + 4)), filters with convolve_4tap and stores through
// sr_2d_ver_round_and_store_w4. ss[1] then becomes ss[0], and s[2] carries row
// i + 4 into the next iteration.
158 | | #define CONVOLVE_SR_VER_FILTER_4TAP_W4 \ |
159 | 347k | __m128i s[4]; \ |
160 | 347k | __m256i ss[2]; \ |
161 | 347k | s[0] = _mm_loadl_epi64((__m128i *)(im_block + 0 * 4)); \ |
162 | 347k | s[1] = _mm_loadl_epi64((__m128i *)(im_block + 1 * 4)); \ |
163 | 347k | s[2] = _mm_loadl_epi64((__m128i *)(im_block + 2 * 4)); \ |
164 | 347k | \ |
165 | 347k | const __m256i src_0 = _mm256_setr_m128i(s[0], s[1]); \ |
166 | 347k | const __m256i src_1 = _mm256_setr_m128i(s[1], s[2]); \ |
167 | 347k | \ |
168 | 347k | ss[0] = _mm256_unpacklo_epi16(src_0, src_1); \ |
169 | 347k | \ |
170 | 1.00M | for (i = 0; i < h; i += 2) { \ |
171 | 658k | const int16_t *data = &im_block[i * 4]; \ |
172 | 658k | s[3] = _mm_loadl_epi64((__m128i *)(data + 3 * 4)); \ |
173 | 658k | const __m256i src_2 = _mm256_setr_m128i(s[2], s[3]); \ |
174 | 658k | s[2] = _mm_loadl_epi64((__m128i *)(data + 4 * 4)); \ |
175 | 658k | const __m256i src_3 = _mm256_setr_m128i(s[3], s[2]); \ |
176 | 658k | ss[1] = _mm256_unpacklo_epi16(src_2, src_3); \ |
177 | 658k | \ |
178 | 658k | const __m256i res = convolve_4tap(ss, coeffs_v); \ |
179 | 658k | \ |
180 | 658k | sr_2d_ver_round_and_store_w4(w, res, dst_ptr, dst_stride, round_const_v); \ |
181 | 658k | dst_ptr += 2 * dst_stride; \ |
182 | 658k | \ |
183 | 658k | ss[0] = ss[1]; \ |
184 | 658k | } |
185 | | |
// Vertical 6-tap pass of the 4-wide path; two output rows per iteration.
// Names taken from the expansion site: i, h, w, im_block, coeffs_v, dst_ptr,
// dst_stride and round_const_v. Declares s[], ss[] and src_0..src_3 at the
// expansion site and advances dst_ptr.
// im_block is read as rows of four int16 values (row r at im_block + r * 4).
// Rows 0..4 are loaded up front to build ss[0] and ss[1]. Iteration i loads
// rows i + 5 and i + 6, builds ss[2], filters with convolve_6tap and stores
// through sr_2d_ver_round_and_store_w4. The ss[] window then slides by one
// entry, and s[4] carries row i + 6 into the next iteration.
186 | | #define CONVOLVE_SR_VER_FILTER_6TAP_W4 \ |
187 | 170k | __m128i s[6]; \ |
188 | 170k | __m256i ss[3]; \ |
189 | 170k | s[0] = _mm_loadl_epi64((__m128i *)(im_block + 0 * 4)); \ |
190 | 170k | s[1] = _mm_loadl_epi64((__m128i *)(im_block + 1 * 4)); \ |
191 | 170k | s[2] = _mm_loadl_epi64((__m128i *)(im_block + 2 * 4)); \ |
192 | 170k | s[3] = _mm_loadl_epi64((__m128i *)(im_block + 3 * 4)); \ |
193 | 170k | s[4] = _mm_loadl_epi64((__m128i *)(im_block + 4 * 4)); \ |
194 | 170k | \ |
195 | 170k | const __m256i src_0 = _mm256_setr_m128i(s[0], s[1]); \ |
196 | 170k | const __m256i src_1 = _mm256_setr_m128i(s[1], s[2]); \ |
197 | 170k | const __m256i src_2 = _mm256_setr_m128i(s[2], s[3]); \ |
198 | 170k | const __m256i src_3 = _mm256_setr_m128i(s[3], s[4]); \ |
199 | 170k | \ |
200 | 170k | ss[0] = _mm256_unpacklo_epi16(src_0, src_1); \ |
201 | 170k | ss[1] = _mm256_unpacklo_epi16(src_2, src_3); \ |
202 | 170k | \ |
203 | 1.03M | for (i = 0; i < h; i += 2) { \ |
204 | 860k | const int16_t *data = &im_block[i * 4]; \ |
205 | 860k | s[5] = _mm_loadl_epi64((__m128i *)(data + 5 * 4)); \ |
206 | 860k | const __m256i src_4 = _mm256_setr_m128i(s[4], s[5]); \ |
207 | 860k | s[4] = _mm_loadl_epi64((__m128i *)(data + 6 * 4)); \ |
208 | 860k | const __m256i src_5 = _mm256_setr_m128i(s[5], s[4]); \ |
209 | 860k | ss[2] = _mm256_unpacklo_epi16(src_4, src_5); \ |
210 | 860k | \ |
211 | 860k | const __m256i res = convolve_6tap(ss, coeffs_v); \ |
212 | 860k | \ |
213 | 860k | sr_2d_ver_round_and_store_w4(w, res, dst_ptr, dst_stride, round_const_v); \ |
214 | 860k | dst_ptr += 2 * dst_stride; \ |
215 | 860k | \ |
216 | 860k | ss[0] = ss[1]; \ |
217 | 860k | ss[1] = ss[2]; \ |
218 | 860k | } |
219 | | |
// Vertical 8-tap pass of the 4-wide path; two output rows per iteration.
// Names taken from the expansion site: i, h, w, im_block, coeffs_v, dst_ptr,
// dst_stride and round_const_v. Declares s[], ss[] and src_0..src_5 at the
// expansion site and advances dst_ptr.
// im_block is read as rows of four int16 values (row r at im_block + r * 4).
// Rows 0..6 are loaded up front to build ss[0..2]. Iteration i loads rows
// i + 7 and i + 8, builds ss[3], filters with convolve() and stores through
// sr_2d_ver_round_and_store_w4. The ss[] window then slides by one entry, and
// s[6] carries row i + 8 into the next iteration.
220 | | #define CONVOLVE_SR_VER_FILTER_8TAP_W4 \ |
221 | 10.2k | __m128i s[8]; \ |
222 | 10.2k | __m256i ss[4]; \ |
223 | 10.2k | s[0] = _mm_loadl_epi64((__m128i *)(im_block + 0 * 4)); \ |
224 | 10.2k | s[1] = _mm_loadl_epi64((__m128i *)(im_block + 1 * 4)); \ |
225 | 10.2k | s[2] = _mm_loadl_epi64((__m128i *)(im_block + 2 * 4)); \ |
226 | 10.2k | s[3] = _mm_loadl_epi64((__m128i *)(im_block + 3 * 4)); \ |
227 | 10.2k | s[4] = _mm_loadl_epi64((__m128i *)(im_block + 4 * 4)); \ |
228 | 10.2k | s[5] = _mm_loadl_epi64((__m128i *)(im_block + 5 * 4)); \ |
229 | 10.2k | s[6] = _mm_loadl_epi64((__m128i *)(im_block + 6 * 4)); \ |
230 | 10.2k | \ |
231 | 10.2k | const __m256i src_0 = _mm256_setr_m128i(s[0], s[1]); \ |
232 | 10.2k | const __m256i src_1 = _mm256_setr_m128i(s[1], s[2]); \ |
233 | 10.2k | const __m256i src_2 = _mm256_setr_m128i(s[2], s[3]); \ |
234 | 10.2k | const __m256i src_3 = _mm256_setr_m128i(s[3], s[4]); \ |
235 | 10.2k | const __m256i src_4 = _mm256_setr_m128i(s[4], s[5]); \ |
236 | 10.2k | const __m256i src_5 = _mm256_setr_m128i(s[5], s[6]); \ |
237 | 10.2k | \ |
238 | 10.2k | ss[0] = _mm256_unpacklo_epi16(src_0, src_1); \ |
239 | 10.2k | ss[1] = _mm256_unpacklo_epi16(src_2, src_3); \ |
240 | 10.2k | ss[2] = _mm256_unpacklo_epi16(src_4, src_5); \ |
241 | 10.2k | \ |
242 | 62.7k | for (i = 0; i < h; i += 2) { \ |
243 | 52.5k | const int16_t *data = &im_block[i * 4]; \ |
244 | 52.5k | s[7] = _mm_loadl_epi64((__m128i *)(data + 7 * 4)); \ |
245 | 52.5k | const __m256i src_6 = _mm256_setr_m128i(s[6], s[7]); \ |
246 | 52.5k | s[6] = _mm_loadl_epi64((__m128i *)(data + 8 * 4)); \ |
247 | 52.5k | const __m256i src_7 = _mm256_setr_m128i(s[7], s[6]); \ |
248 | 52.5k | ss[3] = _mm256_unpacklo_epi16(src_6, src_7); \ |
249 | 52.5k | \ |
250 | 52.5k | const __m256i res = convolve(ss, coeffs_v); \ |
251 | 52.5k | \ |
252 | 52.5k | sr_2d_ver_round_and_store_w4(w, res, dst_ptr, dst_stride, round_const_v); \ |
253 | 52.5k | dst_ptr += 2 * dst_stride; \ |
254 | 52.5k | \ |
255 | 52.5k | ss[0] = ss[1]; \ |
256 | 52.5k | ss[1] = ss[2]; \ |
257 | 52.5k | ss[2] = ss[3]; \ |
258 | 52.5k | } |
259 | | |
// Horizontal pass of the wide path, parameterised by the kernel
// CONVOLVE_LOWBD(data, coeffs_h, filt).
// Names taken from the expansion site: i, j, im_h, src_ptr, src_stride,
// coeffs_h, filt, round_const_h, im_block and im_stride.
//  - Loop (i = 0, 2, ... while i < im_h - 2): 16 source bytes starting at
//    column j of row i go to the low 128-bit half and 16 bytes of row i + 1 to
//    the high half. The result is offset by round_const_h, shifted right
//    arithmetically by 2 and written as 256 bits to im_block[i * im_stride].
//    _mm256_store_si256 is an aligned store, so that address has to be 32-byte
//    aligned.
//  - Tail: the row at the value i holds after the loop is loaded into the low
//    half only. _mm256_castsi128_si256 leaves the high half undefined and the
//    full 256 bits are still stored, so the 16 bytes following that row's
//    results receive unspecified data.
// Because i advances by 2, loop plus tail cover every row only when im_h is
// odd.
// NOTE(review): presumably callers always pass an odd im_h, and im_stride is 8
// so that the two rows held in one 256-bit store are consecutive; confirm.
// The expansion is not wrapped in a block and declares data_1 and res at the
// expansion site, so it can appear only once per scope.
260 | | #define CONVOLVE_SR_HORIZONTAL_FILTER(CONVOLVE_LOWBD) \ |
261 | 13.8M | for (i = 0; i < (im_h - 2); i += 2) { \ |
262 | 12.6M | __m256i data = _mm256_castsi128_si256( \ |
263 | 12.6M | _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j])); \ |
264 | 12.6M | data = _mm256_inserti128_si256( \ |
265 | 12.6M | data, \ |
266 | 12.6M | _mm_loadu_si128( \ |
267 | 12.6M | (__m128i *)&src_ptr[(i * src_stride) + j + src_stride]), \ |
268 | 12.6M | 1); \ |
269 | 12.6M | __m256i res = CONVOLVE_LOWBD(data, coeffs_h, filt); \ |
270 | 12.6M | res = _mm256_srai_epi16(_mm256_add_epi16(res, round_const_h), 2); \ |
271 | 12.6M | _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); \ |
272 | 12.6M | } \ |
273 | 1.24M | __m256i data_1 = _mm256_castsi128_si256( \ |
274 | 1.24M | _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j])); \ |
275 | 1.24M | __m256i res = CONVOLVE_LOWBD(data_1, coeffs_h, filt); \ |
276 | 1.24M | res = _mm256_srai_epi16(_mm256_add_epi16(res, round_const_h), 2); \ |
277 | 1.24M | _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); |
278 | | |
// CONVOLVE_SR_HORIZONTAL_FILTER instantiated with convolve_lowbd_x_2tap
// (defined outside this excerpt).
279 | | #define CONVOLVE_SR_HORIZONTAL_FILTER_2TAP \ |
280 | 39.8k | CONVOLVE_SR_HORIZONTAL_FILTER(convolve_lowbd_x_2tap) |
281 | | |
// CONVOLVE_SR_HORIZONTAL_FILTER instantiated with convolve_lowbd_x_4tap
// (defined outside this excerpt).
282 | | #define CONVOLVE_SR_HORIZONTAL_FILTER_4TAP \ |
283 | 73.0k | CONVOLVE_SR_HORIZONTAL_FILTER(convolve_lowbd_x_4tap) |
284 | | |
// CONVOLVE_SR_HORIZONTAL_FILTER instantiated with convolve_lowbd_x_6tap
// (defined outside this excerpt).
285 | | #define CONVOLVE_SR_HORIZONTAL_FILTER_6TAP \ |
286 | 976k | CONVOLVE_SR_HORIZONTAL_FILTER(convolve_lowbd_x_6tap) |
287 | | |
// CONVOLVE_SR_HORIZONTAL_FILTER instantiated with convolve_lowbd_x, the kernel
// used for the 8-tap case (defined outside this excerpt).
288 | | #define CONVOLVE_SR_HORIZONTAL_FILTER_8TAP \ |
289 | 154k | CONVOLVE_SR_HORIZONTAL_FILTER(convolve_lowbd_x) |
290 | | |
// Final rounding and store of the vertical pass for the wide path; writes
// 8 bytes to each of two destination rows.
//   res_a, res_b   32-bit sums. Within each 128-bit half the four sums of
//                  res_a come first and the four of res_b second; the low half
//                  feeds the row at dst, the high half the row at
//                  dst + dst_stride.
//   dst            address of the first destination row.
//   dst_stride     byte distance from the first to the second row.
//   round_const_v  added to every sum before the arithmetic right shift by 11.
// After the shift the values are narrowed to 16 bits with signed saturation and
// then to 8 bits with unsigned saturation.
291 | | static inline void sr_2d_ver_round_and_store(__m256i res_a, __m256i res_b, |
292 | | uint8_t *dst, int dst_stride, |
293 | 10.4M | __m256i round_const_v) { |
294 | 10.4M | const __m256i res_a_round = |
295 | 10.4M | _mm256_srai_epi32(_mm256_add_epi32(res_a, round_const_v), 11); |
296 | 10.4M | const __m256i res_b_round = |
297 | 10.4M | _mm256_srai_epi32(_mm256_add_epi32(res_b, round_const_v), 11); |
298 | 10.4M | const __m256i r16 = _mm256_packs_epi32(res_a_round, res_b_round); |
// r16 is packed with itself, so the eight result bytes of each row sit in the
// low 64 bits of the corresponding 128-bit half.
299 | 10.4M | const __m256i r8 = _mm256_packus_epi16(r16, r16); |
300 | | |
301 | 10.4M | _mm_storel_epi64((__m128i *)dst, _mm256_castsi256_si128(r8)); |
302 | 10.4M | _mm_storel_epi64((__m128i *)(dst + dst_stride), |
303 | 10.4M | _mm256_extracti128_si256(r8, 1)); |
304 | 10.4M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:sr_2d_ver_round_and_store Unexecuted instantiation: highbd_convolve_avx2.c:sr_2d_ver_round_and_store convolve_2d_avx2.c:sr_2d_ver_round_and_store Line | Count | Source | 293 | 10.4M | __m256i round_const_v) { | 294 | 10.4M | const __m256i res_a_round = | 295 | 10.4M | _mm256_srai_epi32(_mm256_add_epi32(res_a, round_const_v), 11); | 296 | 10.4M | const __m256i res_b_round = | 297 | 10.4M | _mm256_srai_epi32(_mm256_add_epi32(res_b, round_const_v), 11); | 298 | 10.4M | const __m256i r16 = _mm256_packs_epi32(res_a_round, res_b_round); | 299 | 10.4M | const __m256i r8 = _mm256_packus_epi16(r16, r16); | 300 | | | 301 | 10.4M | _mm_storel_epi64((__m128i *)dst, _mm256_castsi256_si128(r8)); | 302 | 10.4M | _mm_storel_epi64((__m128i *)(dst + dst_stride), | 303 | | _mm256_extracti128_si256(r8, 1)); | 304 | 10.4M | } |
Unexecuted instantiation: convolve_avx2.c:sr_2d_ver_round_and_store Unexecuted instantiation: jnt_convolve_avx2.c:sr_2d_ver_round_and_store Unexecuted instantiation: wiener_convolve_avx2.c:sr_2d_ver_round_and_store Unexecuted instantiation: highbd_convolve_2d_avx2.c:sr_2d_ver_round_and_store Unexecuted instantiation: highbd_jnt_convolve_avx2.c:sr_2d_ver_round_and_store |
305 | | |
// Vertical 2-tap pass of the wide path; two output rows per iteration.
// Names taken from the expansion site: i, h, im_block, im_stride, coeffs_v,
// dst_ptr, dst_stride and round_const_v. Advances dst_ptr.
// Iteration i loads 256 bits at im_block + i * im_stride and at one im_stride
// further, interleaves their 16-bit elements (low and high halves separately),
// multiplies and pair-sums both against coeffs_v[0], and stores through
// sr_2d_ver_round_and_store.
306 | | #define CONVOLVE_SR_VERTICAL_FILTER_2TAP \ |
307 | 528k | for (i = 0; i < h; i += 2) { \ |
308 | 488k | __m256i s[2]; \ |
309 | 488k | const int16_t *data = &im_block[i * im_stride]; \ |
310 | 488k | const __m256i s1 = _mm256_loadu_si256((__m256i *)(data + 0 * im_stride)); \ |
311 | 488k | const __m256i s2 = _mm256_loadu_si256((__m256i *)(data + 1 * im_stride)); \ |
312 | 488k | s[0] = _mm256_unpacklo_epi16(s1, s2); \ |
313 | 488k | s[1] = _mm256_unpackhi_epi16(s1, s2); \ |
314 | 488k | \ |
315 | 488k | __m256i res_a = _mm256_madd_epi16(s[0], coeffs_v[0]); \ |
316 | 488k | __m256i res_b = _mm256_madd_epi16(s[1], coeffs_v[0]); \ |
317 | 488k | \ |
318 | 488k | sr_2d_ver_round_and_store(res_a, res_b, dst_ptr, dst_stride, \ |
319 | 488k | round_const_v); \ |
320 | 488k | dst_ptr += 2 * dst_stride; \ |
321 | 488k | } |
322 | | |
// Vertical 4-tap pass of the wide path; two output rows per iteration.
// Names taken from the expansion site: i, h, im_block, im_stride, coeffs_v,
// dst_ptr, dst_stride and round_const_v. Declares s[], src_0 and src_1 at the
// expansion site and advances dst_ptr.
// s[0..1] hold the low-half interleaves and s[2..3] the high-half interleaves;
// convolve_4tap is called once on s and once on s + 2. s[] is declared with six
// entries, but only s[0..3] are referenced here.
// The loads at offsets 0 and 1 * im_stride are done up front. Iteration i loads
// at offsets (i + 2) and (i + 3) * im_stride, filters, stores through
// sr_2d_ver_round_and_store, then slides both windows by one entry.
323 | | #define CONVOLVE_SR_VERTICAL_FILTER_4TAP \ |
324 | 436k | __m256i s[6]; \ |
325 | 436k | __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); \ |
326 | 436k | __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); \ |
327 | 436k | \ |
328 | 436k | s[0] = _mm256_unpacklo_epi16(src_0, src_1); \ |
329 | 436k | s[2] = _mm256_unpackhi_epi16(src_0, src_1); \ |
330 | 436k | \ |
331 | 1.76M | for (i = 0; i < h; i += 2) { \ |
332 | 1.32M | const int16_t *data = &im_block[i * im_stride]; \ |
333 | 1.32M | const __m256i s4 = _mm256_loadu_si256((__m256i *)(data + 2 * im_stride)); \ |
334 | 1.32M | const __m256i s5 = _mm256_loadu_si256((__m256i *)(data + 3 * im_stride)); \ |
335 | 1.32M | s[1] = _mm256_unpacklo_epi16(s4, s5); \ |
336 | 1.32M | s[3] = _mm256_unpackhi_epi16(s4, s5); \ |
337 | 1.32M | \ |
338 | 1.32M | __m256i res_a = convolve_4tap(s, coeffs_v); \ |
339 | 1.32M | __m256i res_b = convolve_4tap(s + 2, coeffs_v); \ |
340 | 1.32M | \ |
341 | 1.32M | sr_2d_ver_round_and_store(res_a, res_b, dst_ptr, dst_stride, \ |
342 | 1.32M | round_const_v); \ |
343 | 1.32M | dst_ptr += 2 * dst_stride; \ |
344 | 1.32M | \ |
345 | 1.32M | s[0] = s[1]; \ |
346 | 1.32M | s[2] = s[3]; \ |
347 | 1.32M | } |
348 | | |
// Vertical 6-tap pass of the wide path; two output rows per iteration.
// Names taken from the expansion site: i, h, im_block, im_stride, coeffs_v,
// dst_ptr, dst_stride and round_const_v. Declares s[] and src_0..src_3 at the
// expansion site and advances dst_ptr.
// s[0..2] hold the low-half interleaves and s[3..5] the high-half interleaves;
// convolve_6tap is called once on s and once on s + 3. s[] is declared with
// eight entries, but only s[0..5] are referenced here.
// The loads at offsets 0..3 * im_stride are done up front. Iteration i loads at
// offsets (i + 4) and (i + 5) * im_stride, filters, stores through
// sr_2d_ver_round_and_store, then slides both windows by one entry.
349 | | #define CONVOLVE_SR_VERTICAL_FILTER_6TAP \ |
350 | 631k | __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); \ |
351 | 631k | __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); \ |
352 | 631k | __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); \ |
353 | 631k | __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); \ |
354 | 631k | \ |
355 | 631k | __m256i s[8]; \ |
356 | 631k | s[0] = _mm256_unpacklo_epi16(src_0, src_1); \ |
357 | 631k | s[1] = _mm256_unpacklo_epi16(src_2, src_3); \ |
358 | 631k | \ |
359 | 631k | s[3] = _mm256_unpackhi_epi16(src_0, src_1); \ |
360 | 631k | s[4] = _mm256_unpackhi_epi16(src_2, src_3); \ |
361 | 631k | \ |
362 | 7.62M | for (i = 0; i < h; i += 2) { \ |
363 | 6.99M | const int16_t *data = &im_block[i * im_stride]; \ |
364 | 6.99M | \ |
365 | 6.99M | const __m256i s6 = _mm256_loadu_si256((__m256i *)(data + 4 * im_stride)); \ |
366 | 6.99M | const __m256i s7 = _mm256_loadu_si256((__m256i *)(data + 5 * im_stride)); \ |
367 | 6.99M | \ |
368 | 6.99M | s[2] = _mm256_unpacklo_epi16(s6, s7); \ |
369 | 6.99M | s[5] = _mm256_unpackhi_epi16(s6, s7); \ |
370 | 6.99M | \ |
371 | 6.99M | __m256i res_a = convolve_6tap(s, coeffs_v); \ |
372 | 6.99M | __m256i res_b = convolve_6tap(s + 3, coeffs_v); \ |
373 | 6.99M | \ |
374 | 6.99M | sr_2d_ver_round_and_store(res_a, res_b, dst_ptr, dst_stride, \ |
375 | 6.99M | round_const_v); \ |
376 | 6.99M | dst_ptr += 2 * dst_stride; \ |
377 | 6.99M | \ |
378 | 6.99M | s[0] = s[1]; \ |
379 | 6.99M | s[1] = s[2]; \ |
380 | 6.99M | \ |
381 | 6.99M | s[3] = s[4]; \ |
382 | 6.99M | s[4] = s[5]; \ |
383 | 6.99M | } |
384 | | |
// Vertical 8-tap pass of the wide path; two output rows per iteration.
// Names taken from the expansion site: i, h, im_block, im_stride, coeffs_v,
// dst_ptr, dst_stride and round_const_v. Declares s[] and src_0..src_5 at the
// expansion site and advances dst_ptr.
// s[0..3] hold the low-half interleaves and s[4..7] the high-half interleaves;
// convolve() is called once on s and once on s + 4.
// The loads at offsets 0..5 * im_stride are done up front. Iteration i loads at
// offsets (i + 6) and (i + 7) * im_stride, filters, stores through
// sr_2d_ver_round_and_store, then slides both windows by one entry.
385 | | #define CONVOLVE_SR_VERTICAL_FILTER_8TAP \ |
386 | 136k | __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); \ |
387 | 136k | __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); \ |
388 | 136k | __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); \ |
389 | 136k | __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); \ |
390 | 136k | __m256i src_4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride)); \ |
391 | 136k | __m256i src_5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride)); \ |
392 | 136k | \ |
393 | 136k | __m256i s[8]; \ |
394 | 136k | s[0] = _mm256_unpacklo_epi16(src_0, src_1); \ |
395 | 136k | s[1] = _mm256_unpacklo_epi16(src_2, src_3); \ |
396 | 136k | s[2] = _mm256_unpacklo_epi16(src_4, src_5); \ |
397 | 136k | \ |
398 | 136k | s[4] = _mm256_unpackhi_epi16(src_0, src_1); \ |
399 | 136k | s[5] = _mm256_unpackhi_epi16(src_2, src_3); \ |
400 | 136k | s[6] = _mm256_unpackhi_epi16(src_4, src_5); \ |
401 | 136k | \ |
402 | 1.80M | for (i = 0; i < h; i += 2) { \ |
403 | 1.66M | const int16_t *data = &im_block[i * im_stride]; \ |
404 | 1.66M | \ |
405 | 1.66M | const __m256i s6 = _mm256_loadu_si256((__m256i *)(data + 6 * im_stride)); \ |
406 | 1.66M | const __m256i s7 = _mm256_loadu_si256((__m256i *)(data + 7 * im_stride)); \ |
407 | 1.66M | \ |
408 | 1.66M | s[3] = _mm256_unpacklo_epi16(s6, s7); \ |
409 | 1.66M | s[7] = _mm256_unpackhi_epi16(s6, s7); \ |
410 | 1.66M | \ |
411 | 1.66M | __m256i res_a = convolve(s, coeffs_v); \ |
412 | 1.66M | __m256i res_b = convolve(s + 4, coeffs_v); \ |
413 | 1.66M | \ |
414 | 1.66M | sr_2d_ver_round_and_store(res_a, res_b, dst_ptr, dst_stride, \ |
415 | 1.66M | round_const_v); \ |
416 | 1.66M | dst_ptr += 2 * dst_stride; \ |
417 | 1.66M | \ |
418 | 1.66M | s[0] = s[1]; \ |
419 | 1.66M | s[1] = s[2]; \ |
420 | 1.66M | s[2] = s[3]; \ |
421 | 1.66M | \ |
422 | 1.66M | s[4] = s[5]; \ |
423 | 1.66M | s[5] = s[6]; \ |
424 | 1.66M | s[6] = s[7]; \ |
425 | 1.66M | } |
426 | | |
// Horizontal 12-tap pass.
// Names taken from the expansion site: w, i, j, im_h, src_ptr, src_stride,
// coeffs_h, round_const_h12, round_shift_h12, im_block and im_stride.
// Declares v_zero and s[12] at the expansion site; only s[0..5] are written
// here.
// Common to both branches: the source bytes are zero-extended to 16 bits,
// every 16-bit value is duplicated, and byte-wise alignr by 2 and by 10 then
// yields, in each 32-bit element of s[k], the two source samples at offsets 2k
// and 2k + 1 from one of four consecutive positions per 128-bit half. The
// result of convolve_12taps(s, coeffs_h) is offset by round_const_h12, shifted
// right arithmetically by round_shift_h12 and narrowed to 16 bits with signed
// saturation.
//  - w <= 4: two rows per iteration (row i in the low half, row i + 1 in the
//    high half). For w > 2, four 16-bit results per row are stored (64 bits);
//    otherwise two results per row are written element by element. The loop
//    runs while i < im_h in steps of 2 and always touches row i + 1, so for an
//    odd im_h it also reads source row im_h and writes im_block row im_h.
//  - w > 4: one row per iteration. The low half takes 16 bytes from column j,
//    the high half 16 bytes from column j + 4; the permute with 0x88 gathers
//    the four results of each half into the low 128 bits, which are written
//    with an aligned 128-bit store to im_block[i * im_stride].
427 | | #define CONVOLVE_SR_HORIZONTAL_FILTER_12TAP \ |
428 | 0 | const __m256i v_zero = _mm256_setzero_si256(); \ |
429 | 0 | __m256i s[12]; \ |
430 | 0 | if (w <= 4) { \ |
431 | 0 | for (i = 0; i < im_h; i += 2) { \ |
432 | 0 | const __m256i data = _mm256_permute2x128_si256( \ |
433 | 0 | _mm256_castsi128_si256( \ |
434 | 0 | _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride + j]))), \ |
435 | 0 | _mm256_castsi128_si256(_mm_loadu_si128( \ |
436 | 0 | (__m128i *)(&src_ptr[i * src_stride + src_stride + j]))), \ |
437 | 0 | 0x20); \ |
438 | 0 | const __m256i s_16lo = _mm256_unpacklo_epi8(data, v_zero); \ |
439 | 0 | const __m256i s_16hi = _mm256_unpackhi_epi8(data, v_zero); \ |
440 | 0 | const __m256i s_lolo = _mm256_unpacklo_epi16(s_16lo, s_16lo); \ |
441 | 0 | const __m256i s_lohi = _mm256_unpackhi_epi16(s_16lo, s_16lo); \ |
442 | 0 | \ |
443 | 0 | const __m256i s_hilo = _mm256_unpacklo_epi16(s_16hi, s_16hi); \ |
444 | 0 | const __m256i s_hihi = _mm256_unpackhi_epi16(s_16hi, s_16hi); \ |
445 | 0 | \ |
446 | 0 | s[0] = _mm256_alignr_epi8(s_lohi, s_lolo, 2); \ |
447 | 0 | s[1] = _mm256_alignr_epi8(s_lohi, s_lolo, 10); \ |
448 | 0 | s[2] = _mm256_alignr_epi8(s_hilo, s_lohi, 2); \ |
449 | 0 | s[3] = _mm256_alignr_epi8(s_hilo, s_lohi, 10); \ |
450 | 0 | s[4] = _mm256_alignr_epi8(s_hihi, s_hilo, 2); \ |
451 | 0 | s[5] = _mm256_alignr_epi8(s_hihi, s_hilo, 10); \ |
452 | 0 | \ |
453 | 0 | const __m256i res_lo = convolve_12taps(s, coeffs_h); \ |
454 | 0 | \ |
455 | 0 | __m256i res_32b_lo = _mm256_sra_epi32( \ |
456 | 0 | _mm256_add_epi32(res_lo, round_const_h12), round_shift_h12); \ |
457 | 0 | __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo); \ |
458 | 0 | const __m128i res_0 = _mm256_extracti128_si256(res_16b_lo, 0); \ |
459 | 0 | const __m128i res_1 = _mm256_extracti128_si256(res_16b_lo, 1); \ |
460 | 0 | if (w > 2) { \ |
461 | 0 | _mm_storel_epi64((__m128i *)&im_block[i * im_stride], res_0); \ |
462 | 0 | _mm_storel_epi64((__m128i *)&im_block[i * im_stride + im_stride], \ |
463 | 0 | res_1); \ |
464 | 0 | } else { \ |
465 | 0 | uint32_t horiz_2; \ |
466 | 0 | horiz_2 = (uint32_t)_mm_cvtsi128_si32(res_0); \ |
467 | 0 | im_block[i * im_stride] = (uint16_t)horiz_2; \ |
468 | 0 | im_block[i * im_stride + 1] = (uint16_t)(horiz_2 >> 16); \ |
469 | 0 | horiz_2 = (uint32_t)_mm_cvtsi128_si32(res_1); \ |
470 | 0 | im_block[i * im_stride + im_stride] = (uint16_t)horiz_2; \ |
471 | 0 | im_block[i * im_stride + im_stride + 1] = (uint16_t)(horiz_2 >> 16); \ |
472 | 0 | } \ |
473 | 0 | } \ |
474 | 0 | } else { \ |
475 | 0 | for (i = 0; i < im_h; i++) { \ |
476 | 0 | const __m256i data = _mm256_permute2x128_si256( \ |
477 | 0 | _mm256_castsi128_si256( \ |
478 | 0 | _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride + j]))), \ |
479 | 0 | _mm256_castsi128_si256( \ |
480 | 0 | _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride + j + 4]))), \ |
481 | 0 | 0x20); \ |
482 | 0 | const __m256i s_16lo = _mm256_unpacklo_epi8(data, v_zero); \ |
483 | 0 | const __m256i s_16hi = _mm256_unpackhi_epi8(data, v_zero); \ |
484 | 0 | \ |
485 | 0 | const __m256i s_lolo = _mm256_unpacklo_epi16(s_16lo, s_16lo); \ |
486 | 0 | const __m256i s_lohi = _mm256_unpackhi_epi16(s_16lo, s_16lo); \ |
487 | 0 | \ |
488 | 0 | const __m256i s_hilo = _mm256_unpacklo_epi16(s_16hi, s_16hi); \ |
489 | 0 | const __m256i s_hihi = _mm256_unpackhi_epi16(s_16hi, s_16hi); \ |
490 | 0 | \ |
491 | 0 | s[0] = _mm256_alignr_epi8(s_lohi, s_lolo, 2); \ |
492 | 0 | s[1] = _mm256_alignr_epi8(s_lohi, s_lolo, 10); \ |
493 | 0 | s[2] = _mm256_alignr_epi8(s_hilo, s_lohi, 2); \ |
494 | 0 | s[3] = _mm256_alignr_epi8(s_hilo, s_lohi, 10); \ |
495 | 0 | s[4] = _mm256_alignr_epi8(s_hihi, s_hilo, 2); \ |
496 | 0 | s[5] = _mm256_alignr_epi8(s_hihi, s_hilo, 10); \ |
497 | 0 | \ |
498 | 0 | const __m256i res_lo = convolve_12taps(s, coeffs_h); \ |
499 | 0 | \ |
500 | 0 | __m256i res_32b_lo = _mm256_sra_epi32( \ |
501 | 0 | _mm256_add_epi32(res_lo, round_const_h12), round_shift_h12); \ |
502 | 0 | \ |
503 | 0 | __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo); \ |
504 | 0 | _mm_store_si128((__m128i *)&im_block[i * im_stride], \ |
505 | 0 | _mm256_extracti128_si256( \ |
506 | 0 | _mm256_permute4x64_epi64(res_16b_lo, 0x88), 0)); \ |
507 | 0 | } \ |
508 | 0 | } |
509 | | |
// Vertical 12-tap pass; two output rows per iteration.
// Names taken from the expansion site: s (an array of at least 12 __m256i; it
// is NOT declared here), i, j, h, w, dst, dst_stride, im_block, im_stride,
// coeffs_v, sum_round_v, sum_shift_v, round_const_v and round_shift_v.
// Declares src_0..src_9 at the expansion site.
// s[0..5] hold the low-half interleaves and s[6..11] the high-half
// interleaves; convolve_12taps is called once on s and once on s + 6.
// The loads at offsets 0..9 * im_stride are done up front; iteration i loads at
// offsets (i + 10) and (i + 11) * im_stride.
// Rounding is done in two stages: add sum_round_v and shift right by
// sum_shift_v, then add round_const_v and shift right by round_shift_v (both
// shifts arithmetic). The result is narrowed to 16 bits with signed saturation
// and to 8 bits with unsigned saturation.
// Unlike the other vertical macros in this header, the store address is
// computed from dst, i and j rather than from a running dst_ptr. Per row it
// stores 8 bytes when w - j > 4, 4 bytes when w == 4, and 2 bytes otherwise.
510 | | #define CONVOLVE_SR_VERTICAL_FILTER_12TAP \ |
511 | 0 | __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); \ |
512 | 0 | __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); \ |
513 | 0 | __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); \ |
514 | 0 | __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); \ |
515 | 0 | __m256i src_4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride)); \ |
516 | 0 | __m256i src_5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride)); \ |
517 | 0 | __m256i src_6 = _mm256_loadu_si256((__m256i *)(im_block + 6 * im_stride)); \ |
518 | 0 | __m256i src_7 = _mm256_loadu_si256((__m256i *)(im_block + 7 * im_stride)); \ |
519 | 0 | __m256i src_8 = _mm256_loadu_si256((__m256i *)(im_block + 8 * im_stride)); \ |
520 | 0 | __m256i src_9 = _mm256_loadu_si256((__m256i *)(im_block + 9 * im_stride)); \ |
521 | 0 | \ |
522 | 0 | s[0] = _mm256_unpacklo_epi16(src_0, src_1); \ |
523 | 0 | s[1] = _mm256_unpacklo_epi16(src_2, src_3); \ |
524 | 0 | s[2] = _mm256_unpacklo_epi16(src_4, src_5); \ |
525 | 0 | s[3] = _mm256_unpacklo_epi16(src_6, src_7); \ |
526 | 0 | s[4] = _mm256_unpacklo_epi16(src_8, src_9); \ |
527 | 0 | \ |
528 | 0 | s[6] = _mm256_unpackhi_epi16(src_0, src_1); \ |
529 | 0 | s[7] = _mm256_unpackhi_epi16(src_2, src_3); \ |
530 | 0 | s[8] = _mm256_unpackhi_epi16(src_4, src_5); \ |
531 | 0 | s[9] = _mm256_unpackhi_epi16(src_6, src_7); \ |
532 | 0 | s[10] = _mm256_unpackhi_epi16(src_8, src_9); \ |
533 | 0 | \ |
534 | 0 | for (i = 0; i < h; i += 2) { \ |
535 | 0 | const int16_t *data = &im_block[i * im_stride]; \ |
536 | 0 | \ |
537 | 0 | const __m256i s6 = _mm256_loadu_si256((__m256i *)(data + 10 * im_stride)); \ |
538 | 0 | const __m256i s7 = _mm256_loadu_si256((__m256i *)(data + 11 * im_stride)); \ |
539 | 0 | \ |
540 | 0 | s[5] = _mm256_unpacklo_epi16(s6, s7); \ |
541 | 0 | s[11] = _mm256_unpackhi_epi16(s6, s7); \ |
542 | 0 | \ |
543 | 0 | __m256i res_a = convolve_12taps(s, coeffs_v); \ |
544 | 0 | __m256i res_b = convolve_12taps(s + 6, coeffs_v); \ |
545 | 0 | \ |
546 | 0 | res_a = \ |
547 | 0 | _mm256_sra_epi32(_mm256_add_epi32(res_a, sum_round_v), sum_shift_v); \ |
548 | 0 | res_b = \ |
549 | 0 | _mm256_sra_epi32(_mm256_add_epi32(res_b, sum_round_v), sum_shift_v); \ |
550 | 0 | \ |
551 | 0 | const __m256i res_a_round = _mm256_sra_epi32( \ |
552 | 0 | _mm256_add_epi32(res_a, round_const_v), round_shift_v); \ |
553 | 0 | const __m256i res_b_round = _mm256_sra_epi32( \ |
554 | 0 | _mm256_add_epi32(res_b, round_const_v), round_shift_v); \ |
555 | 0 | \ |
556 | 0 | const __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round); \ |
557 | 0 | const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit); \ |
558 | 0 | \ |
559 | 0 | const __m128i res_0 = _mm256_castsi256_si128(res_8b); \ |
560 | 0 | const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1); \ |
561 | 0 | \ |
562 | 0 | __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j]; \ |
563 | 0 | __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride]; \ |
564 | 0 | if (w - j > 4) { \ |
565 | 0 | _mm_storel_epi64(p_0, res_0); \ |
566 | 0 | _mm_storel_epi64(p_1, res_1); \ |
567 | 0 | } else if (w == 4) { \ |
568 | 0 | xx_storel_32(p_0, res_0); \ |
569 | 0 | xx_storel_32(p_1, res_1); \ |
570 | 0 | } else { \ |
571 | 0 | *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0); \ |
572 | 0 | *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1); \ |
573 | 0 | } \ |
574 | 0 | \ |
575 | 0 | s[0] = s[1]; \ |
576 | 0 | s[1] = s[2]; \ |
577 | 0 | s[2] = s[3]; \ |
578 | 0 | s[3] = s[4]; \ |
579 | 0 | s[4] = s[5]; \ |
580 | 0 | \ |
581 | 0 | s[6] = s[7]; \ |
582 | 0 | s[7] = s[8]; \ |
583 | 0 | s[8] = s[9]; \ |
584 | 0 | s[9] = s[10]; \ |
585 | 0 | s[10] = s[11]; \ |
586 | 0 | } |
587 | | |
// Horizontal 8-tap pass for the dist-wtd path; two rows per iteration, wrapped
// in do { ... } while (0) so it expands to a single statement.
// Names taken from the expansion site: i, im_h, src_h, src_stride, coeffs_x,
// filt, round_const_h, round_shift_h, im_block and im_stride. src_h is advanced
// by two rows per iteration, so it is modified by the expansion.
// Each iteration loads 16 bytes at src_h into the low 128-bit half; the high
// half is filled from src_h + src_stride only when row i + 1 exists
// (i + 1 < im_h). Otherwise it is left as produced by _mm256_castsi128_si256
// (undefined) and is still filtered and stored. The result of
// convolve_lowbd_x is offset by round_const_h, shifted right arithmetically by
// round_shift_h and written as 256 bits to im_block[i * im_stride].
// _mm256_store_si256 is an aligned store, so that address has to be 32-byte
// aligned.
588 | | #define DIST_WTD_CONVOLVE_HORIZONTAL_FILTER_8TAP \ |
589 | 207k | do { \ |
590 | 3.11M | for (i = 0; i < im_h; i += 2) { \ |
591 | 2.91M | __m256i data = \ |
592 | 2.91M | _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)src_h)); \ |
593 | 2.91M | if (i + 1 < im_h) \ |
594 | 2.91M | data = _mm256_inserti128_si256( \ |
595 | 2.91M | data, _mm_loadu_si128((__m128i *)(src_h + src_stride)), 1); \ |
596 | 2.91M | src_h += (src_stride << 1); \ |
597 | 2.91M | __m256i res = convolve_lowbd_x(data, coeffs_x, filt); \ |
598 | 2.91M | \ |
599 | 2.91M | res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), \ |
600 | 2.91M | round_shift_h); \ |
601 | 2.91M | \ |
602 | 2.91M | _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); \ |
603 | 2.91M | } \ |
604 | 207k | } while (0) |
605 | | |
// Vertical pass of the 8-tap distance-weighted 2D convolution.
// Expands in place and relies on these names from the enclosing scope:
// im_block, im_stride, h, w, i, j, coeffs_y, round_const_v, round_shift_v,
// offset_const, do_average, use_dist_wtd_comp_avg, wt, rounding_const,
// rounding_shift, dst, dst_stride, dst0 and dst_stride0.
//
// Before the loop, 256-bit vectors at row offsets 0..5 of im_block are
// interleaved pairwise (unpacklo -> s[0..2], unpackhi -> s[4..6]). Each
// iteration loads the vectors at row offsets i + 6 and i + 7, completes s[3]
// and s[7], and then slides s[] down by one pair so earlier rows are not
// reloaded. Two output rows (i and i + 1) are produced per iteration: the low
// 128-bit lane of the result goes to row i, the high lane to row i + 1.
//
// convolve(s, coeffs_y) is always evaluated; convolve(s + 4, coeffs_y) only
// when w - j > 4. Results are rounded (add round_const_v, arithmetic shift by
// round_shift_v), packed to signed 16 bit with saturation and offset by
// offset_const.
//  - do_average: the two existing dst rows are read with load_line2_avx2(),
//    combined with the new result by comp_avg(), passed through
//    convolve_rounding(), packed to unsigned 8 bit and written to dst0
//    (8 bytes per row when w - j > 4, otherwise 4 bytes per row).
//  - otherwise: the offset 16-bit result is written to dst with
//    _mm_store_si128, i.e. a full, 16-byte-aligned 128-bit store per row in
//    both the wide and the narrow case.
// NOTE(review): the narrow non-averaging path therefore writes 16 bytes per
// row even though only the low-half convolution is valid there -- presumably
// dst has enough padding/alignment for this; confirm against the callers.
606 | | #define DIST_WTD_CONVOLVE_VERTICAL_FILTER_8TAP \ |
607 | 252k | do { \ |
608 | 252k | __m256i s[8]; \ |
609 | 252k | __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); \ |
610 | 252k | __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); \ |
611 | 252k | __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); \ |
612 | 252k | __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); \ |
613 | 252k | __m256i s4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride)); \ |
614 | 252k | __m256i s5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride)); \ |
615 | 252k | \ |
616 | 252k | s[0] = _mm256_unpacklo_epi16(s0, s1); \ |
617 | 252k | s[1] = _mm256_unpacklo_epi16(s2, s3); \ |
618 | 252k | s[2] = _mm256_unpacklo_epi16(s4, s5); \ |
619 | 252k | \ |
620 | 252k | s[4] = _mm256_unpackhi_epi16(s0, s1); \ |
621 | 252k | s[5] = _mm256_unpackhi_epi16(s2, s3); \ |
622 | 252k | s[6] = _mm256_unpackhi_epi16(s4, s5); \ |
623 | 252k | \ |
624 | 2.63M | for (i = 0; i < h; i += 2) { \ |
625 | 2.37M | const int16_t *data = &im_block[i * im_stride]; \ |
626 | 2.37M | \ |
627 | 2.37M | const __m256i s6 = \ |
628 | 2.37M | _mm256_loadu_si256((__m256i *)(data + 6 * im_stride)); \ |
629 | 2.37M | const __m256i s7 = \ |
630 | 2.37M | _mm256_loadu_si256((__m256i *)(data + 7 * im_stride)); \ |
631 | 2.37M | \ |
632 | 2.37M | s[3] = _mm256_unpacklo_epi16(s6, s7); \ |
633 | 2.37M | s[7] = _mm256_unpackhi_epi16(s6, s7); \ |
634 | 2.37M | \ |
635 | 2.37M | const __m256i res_a = convolve(s, coeffs_y); \ |
636 | 2.37M | const __m256i res_a_round = _mm256_sra_epi32( \ |
637 | 2.37M | _mm256_add_epi32(res_a, round_const_v), round_shift_v); \ |
638 | 2.37M | \ |
639 | 2.37M | if (w - j > 4) { \ |
640 | 2.22M | const __m256i res_b = convolve(s + 4, coeffs_y); \ |
641 | 2.22M | const __m256i res_b_round = _mm256_sra_epi32( \ |
642 | 2.22M | _mm256_add_epi32(res_b, round_const_v), round_shift_v); \ |
643 | 2.22M | const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_b_round); \ |
644 | 2.22M | const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const); \ |
645 | 2.22M | \ |
646 | 2.22M | if (do_average) { \ |
647 | 954k | const __m256i data_ref_0 = \ |
648 | 954k | load_line2_avx2(&dst[i * dst_stride + j], \ |
649 | 954k | &dst[i * dst_stride + j + dst_stride]); \ |
650 | 954k | const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned, \ |
651 | 954k | &wt, use_dist_wtd_comp_avg); \ |
652 | 954k | \ |
653 | 954k | const __m256i round_result = convolve_rounding( \ |
654 | 954k | &comp_avg_res, &offset_const, &rounding_const, rounding_shift); \ |
655 | 954k | \ |
656 | 954k | const __m256i res_8 = \ |
657 | 954k | _mm256_packus_epi16(round_result, round_result); \ |
658 | 954k | const __m128i res_0 = _mm256_castsi256_si128(res_8); \ |
659 | 954k | const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); \ |
660 | 954k | \ |
661 | 954k | _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); \ |
662 | 954k | _mm_storel_epi64( \ |
663 | 954k | (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); \ |
664 | 1.27M | } else { \ |
665 | 1.27M | const __m128i res_0 = _mm256_castsi256_si128(res_unsigned); \ |
666 | 1.27M | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); \ |
667 | 1.27M | \ |
668 | 1.27M | const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1); \ |
669 | 1.27M | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), \ |
670 | 1.27M | res_1); \ |
671 | 1.27M | } \ |
672 | 2.22M | } else { \ |
673 | 150k | const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_a_round); \ |
674 | 150k | const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const); \ |
675 | 150k | \ |
676 | 150k | if (do_average) { \ |
677 | 66.8k | const __m256i data_ref_0 = \ |
678 | 66.8k | load_line2_avx2(&dst[i * dst_stride + j], \ |
679 | 66.8k | &dst[i * dst_stride + j + dst_stride]); \ |
680 | 66.8k | \ |
681 | 66.8k | const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned, \ |
682 | 66.8k | &wt, use_dist_wtd_comp_avg); \ |
683 | 66.8k | \ |
684 | 66.8k | const __m256i round_result = convolve_rounding( \ |
685 | 66.8k | &comp_avg_res, &offset_const, &rounding_const, rounding_shift); \ |
686 | 66.8k | \ |
687 | 66.8k | const __m256i res_8 = \ |
688 | 66.8k | _mm256_packus_epi16(round_result, round_result); \ |
689 | 66.8k | const __m128i res_0 = _mm256_castsi256_si128(res_8); \ |
690 | 66.8k | const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); \ |
691 | 66.8k | \ |
692 | 66.8k | *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0); \ |
693 | 66.8k | *(int *)(&dst0[i * dst_stride0 + j + dst_stride0]) = \ |
694 | 66.8k | _mm_cvtsi128_si32(res_1); \ |
695 | 66.8k | \ |
696 | 83.3k | } else { \ |
697 | 83.3k | const __m128i res_0 = _mm256_castsi256_si128(res_unsigned); \ |
698 | 83.3k | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); \ |
699 | 83.3k | \ |
700 | 83.3k | const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1); \ |
701 | 83.3k | _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), \ |
702 | 83.3k | res_1); \ |
703 | 83.3k | } \ |
704 | 150k | } \ |
705 | 2.37M | \ |
706 | 2.37M | s[0] = s[1]; \ |
707 | 2.37M | s[1] = s[2]; \ |
708 | 2.37M | s[2] = s[3]; \ |
709 | 2.37M | \ |
710 | 2.37M | s[4] = s[5]; \ |
711 | 2.37M | s[5] = s[6]; \ |
712 | 2.37M | s[6] = s[7]; \ |
713 | 2.37M | } \ |
714 | 252k | } while (0) |
715 | | |
716 | | static inline void prepare_coeffs_2t_ssse3( |
717 | | const InterpFilterParams *const filter_params, const int32_t subpel_q4, |
718 | 37.7k | __m128i *const coeffs /* [4] */) { |
719 | 37.7k | const int16_t *const filter = av1_get_interp_filter_subpel_kernel( |
720 | 37.7k | filter_params, subpel_q4 & SUBPEL_MASK); |
721 | 37.7k | const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); |
722 | | |
723 | | // right shift all filter co-efficients by 1 to reduce the bits required. |
724 | | // This extra right shift will be taken care of at the end while rounding |
725 | | // the result. |
726 | | // Since all filter co-efficients are even, this change will not affect the |
727 | | // end result |
728 | 37.7k | assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), |
729 | 37.7k | _mm_set1_epi16((short)0xffff))); |
730 | | |
731 | 37.7k | const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1); |
732 | | |
733 | | // coeffs 3 4 3 4 3 4 3 4 |
734 | 37.7k | coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0806u)); |
735 | 37.7k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:prepare_coeffs_2t_ssse3 Unexecuted instantiation: highbd_convolve_avx2.c:prepare_coeffs_2t_ssse3 convolve_2d_avx2.c:prepare_coeffs_2t_ssse3 Line | Count | Source | 718 | 22.8k | __m128i *const coeffs /* [4] */) { | 719 | 22.8k | const int16_t *const filter = av1_get_interp_filter_subpel_kernel( | 720 | 22.8k | filter_params, subpel_q4 & SUBPEL_MASK); | 721 | 22.8k | const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); | 722 | | | 723 | | // right shift all filter co-efficients by 1 to reduce the bits required. | 724 | | // This extra right shift will be taken care of at the end while rounding | 725 | | // the result. | 726 | | // Since all filter co-efficients are even, this change will not affect the | 727 | | // end result | 728 | 22.8k | assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), | 729 | 22.8k | _mm_set1_epi16((short)0xffff))); | 730 | | | 731 | 22.8k | const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1); | 732 | | | 733 | | // coeffs 3 4 3 4 3 4 3 4 | 734 | 22.8k | coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0806u)); | 735 | 22.8k | } |
convolve_avx2.c:prepare_coeffs_2t_ssse3 Line | Count | Source | 718 | 14.8k | __m128i *const coeffs /* [4] */) { | 719 | 14.8k | const int16_t *const filter = av1_get_interp_filter_subpel_kernel( | 720 | 14.8k | filter_params, subpel_q4 & SUBPEL_MASK); | 721 | 14.8k | const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); | 722 | | | 723 | | // right shift all filter co-efficients by 1 to reduce the bits required. | 724 | | // This extra right shift will be taken care of at the end while rounding | 725 | | // the result. | 726 | | // Since all filter co-efficients are even, this change will not affect the | 727 | | // end result | 728 | 14.8k | assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), | 729 | 14.8k | _mm_set1_epi16((short)0xffff))); | 730 | | | 731 | 14.8k | const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1); | 732 | | | 733 | | // coeffs 3 4 3 4 3 4 3 4 | 734 | 14.8k | coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0806u)); | 735 | 14.8k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:prepare_coeffs_2t_ssse3 Unexecuted instantiation: wiener_convolve_avx2.c:prepare_coeffs_2t_ssse3 Unexecuted instantiation: highbd_convolve_2d_avx2.c:prepare_coeffs_2t_ssse3 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:prepare_coeffs_2t_ssse3 |
736 | | |
737 | | static inline void prepare_coeffs_4t_ssse3( |
738 | | const InterpFilterParams *const filter_params, const int32_t subpel_q4, |
739 | 828k | __m128i *const coeffs /* [4] */) { |
740 | 828k | const int16_t *const filter = av1_get_interp_filter_subpel_kernel( |
741 | 828k | filter_params, subpel_q4 & SUBPEL_MASK); |
742 | 828k | const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); |
743 | | |
744 | | // right shift all filter co-efficients by 1 to reduce the bits required. |
745 | | // This extra right shift will be taken care of at the end while rounding |
746 | | // the result. |
747 | | // Since all filter co-efficients are even, this change will not affect the |
748 | | // end result |
749 | 828k | assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), |
750 | 828k | _mm_set1_epi16((short)0xffff))); |
751 | | |
752 | 828k | const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1); |
753 | | |
754 | | // coeffs 2 3 2 3 2 3 2 3 |
755 | 828k | coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0604u)); |
756 | | // coeffs 4 5 4 5 4 5 4 5 |
757 | 828k | coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0a08u)); |
758 | 828k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:prepare_coeffs_4t_ssse3 Unexecuted instantiation: highbd_convolve_avx2.c:prepare_coeffs_4t_ssse3 convolve_2d_avx2.c:prepare_coeffs_4t_ssse3 Line | Count | Source | 739 | 528k | __m128i *const coeffs /* [4] */) { | 740 | 528k | const int16_t *const filter = av1_get_interp_filter_subpel_kernel( | 741 | 528k | filter_params, subpel_q4 & SUBPEL_MASK); | 742 | 528k | const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); | 743 | | | 744 | | // right shift all filter co-efficients by 1 to reduce the bits required. | 745 | | // This extra right shift will be taken care of at the end while rounding | 746 | | // the result. | 747 | | // Since all filter co-efficients are even, this change will not affect the | 748 | | // end result | 749 | 528k | assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), | 750 | 528k | _mm_set1_epi16((short)0xffff))); | 751 | | | 752 | 528k | const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1); | 753 | | | 754 | | // coeffs 2 3 2 3 2 3 2 3 | 755 | 528k | coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0604u)); | 756 | | // coeffs 4 5 4 5 4 5 4 5 | 757 | 528k | coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0a08u)); | 758 | 528k | } |
convolve_avx2.c:prepare_coeffs_4t_ssse3 Line | Count | Source | 739 | 300k | __m128i *const coeffs /* [4] */) { | 740 | 300k | const int16_t *const filter = av1_get_interp_filter_subpel_kernel( | 741 | 300k | filter_params, subpel_q4 & SUBPEL_MASK); | 742 | 300k | const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); | 743 | | | 744 | | // right shift all filter co-efficients by 1 to reduce the bits required. | 745 | | // This extra right shift will be taken care of at the end while rounding | 746 | | // the result. | 747 | | // Since all filter co-efficients are even, this change will not affect the | 748 | | // end result | 749 | 300k | assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), | 750 | 300k | _mm_set1_epi16((short)0xffff))); | 751 | | | 752 | 300k | const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1); | 753 | | | 754 | | // coeffs 2 3 2 3 2 3 2 3 | 755 | 300k | coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0604u)); | 756 | | // coeffs 4 5 4 5 4 5 4 5 | 757 | 300k | coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0a08u)); | 758 | 300k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:prepare_coeffs_4t_ssse3 Unexecuted instantiation: wiener_convolve_avx2.c:prepare_coeffs_4t_ssse3 Unexecuted instantiation: highbd_convolve_2d_avx2.c:prepare_coeffs_4t_ssse3 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:prepare_coeffs_4t_ssse3 |
759 | | |
760 | | static inline void prepare_coeffs_6t_ssse3( |
761 | | const InterpFilterParams *const filter_params, const int32_t subpel_q4, |
762 | 61.9k | __m128i *const coeffs /* [4] */) { |
763 | 61.9k | const int16_t *const filter = av1_get_interp_filter_subpel_kernel( |
764 | 61.9k | filter_params, subpel_q4 & SUBPEL_MASK); |
765 | 61.9k | const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); |
766 | | |
767 | | // right shift all filter co-efficients by 1 to reduce the bits required. |
768 | | // This extra right shift will be taken care of at the end while rounding |
769 | | // the result. |
770 | | // Since all filter co-efficients are even, this change will not affect the |
771 | | // end result |
772 | 61.9k | assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), |
773 | 61.9k | _mm_set1_epi16((short)0xffff))); |
774 | | |
775 | 61.9k | const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1); |
776 | | |
777 | | // coeffs 2 3 2 3 2 3 2 3 |
778 | 61.9k | coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0402u)); |
779 | | // coeffs 4 5 4 5 4 5 4 5 |
780 | 61.9k | coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0806u)); |
781 | | // coeffs 5 6 5 6 5 6 5 6 |
782 | 61.9k | coeffs[2] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0c0au)); |
783 | 61.9k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:prepare_coeffs_6t_ssse3 Unexecuted instantiation: highbd_convolve_avx2.c:prepare_coeffs_6t_ssse3 Unexecuted instantiation: convolve_2d_avx2.c:prepare_coeffs_6t_ssse3 convolve_avx2.c:prepare_coeffs_6t_ssse3 Line | Count | Source | 762 | 61.9k | __m128i *const coeffs /* [4] */) { | 763 | 61.9k | const int16_t *const filter = av1_get_interp_filter_subpel_kernel( | 764 | 61.9k | filter_params, subpel_q4 & SUBPEL_MASK); | 765 | 61.9k | const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); | 766 | | | 767 | | // right shift all filter co-efficients by 1 to reduce the bits required. | 768 | | // This extra right shift will be taken care of at the end while rounding | 769 | | // the result. | 770 | | // Since all filter co-efficients are even, this change will not affect the | 771 | | // end result | 772 | 61.9k | assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), | 773 | 61.9k | _mm_set1_epi16((short)0xffff))); | 774 | | | 775 | 61.9k | const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1); | 776 | | | 777 | | // coeffs 2 3 2 3 2 3 2 3 | 778 | 61.9k | coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0402u)); | 779 | | // coeffs 4 5 4 5 4 5 4 5 | 780 | 61.9k | coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0806u)); | 781 | | // coeffs 5 6 5 6 5 6 5 6 | 782 | 61.9k | coeffs[2] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0c0au)); | 783 | 61.9k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:prepare_coeffs_6t_ssse3 Unexecuted instantiation: wiener_convolve_avx2.c:prepare_coeffs_6t_ssse3 Unexecuted instantiation: highbd_convolve_2d_avx2.c:prepare_coeffs_6t_ssse3 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:prepare_coeffs_6t_ssse3 |
784 | | |
785 | | static inline void prepare_coeffs_ssse3( |
786 | | const InterpFilterParams *const filter_params, const int32_t subpel_q4, |
787 | 5.08k | __m128i *const coeffs /* [4] */) { |
788 | 5.08k | const int16_t *const filter = av1_get_interp_filter_subpel_kernel( |
789 | 5.08k | filter_params, subpel_q4 & SUBPEL_MASK); |
790 | 5.08k | const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); |
791 | | |
792 | | // right shift all filter co-efficients by 1 to reduce the bits required. |
793 | | // This extra right shift will be taken care of at the end while rounding |
794 | | // the result. |
795 | | // Since all filter co-efficients are even, this change will not affect the |
796 | | // end result |
797 | 5.08k | assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), |
798 | 5.08k | _mm_set1_epi16((short)0xffff))); |
799 | | |
800 | 5.08k | const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1); |
801 | | |
802 | | // coeffs 0 1 0 1 0 1 0 1 |
803 | 5.08k | coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0200u)); |
804 | | // coeffs 2 3 2 3 2 3 2 3 |
805 | 5.08k | coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0604u)); |
806 | | // coeffs 4 5 4 5 4 5 4 5 |
807 | 5.08k | coeffs[2] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0a08u)); |
808 | | // coeffs 6 7 6 7 6 7 6 7 |
809 | 5.08k | coeffs[3] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0e0cu)); |
810 | 5.08k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:prepare_coeffs_ssse3 Unexecuted instantiation: highbd_convolve_avx2.c:prepare_coeffs_ssse3 Unexecuted instantiation: convolve_2d_avx2.c:prepare_coeffs_ssse3 convolve_avx2.c:prepare_coeffs_ssse3 Line | Count | Source | 787 | 5.08k | __m128i *const coeffs /* [4] */) { | 788 | 5.08k | const int16_t *const filter = av1_get_interp_filter_subpel_kernel( | 789 | 5.08k | filter_params, subpel_q4 & SUBPEL_MASK); | 790 | 5.08k | const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); | 791 | | | 792 | | // right shift all filter co-efficients by 1 to reduce the bits required. | 793 | | // This extra right shift will be taken care of at the end while rounding | 794 | | // the result. | 795 | | // Since all filter co-efficients are even, this change will not affect the | 796 | | // end result | 797 | 5.08k | assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), | 798 | 5.08k | _mm_set1_epi16((short)0xffff))); | 799 | | | 800 | 5.08k | const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1); | 801 | | | 802 | | // coeffs 0 1 0 1 0 1 0 1 | 803 | 5.08k | coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0200u)); | 804 | | // coeffs 2 3 2 3 2 3 2 3 | 805 | 5.08k | coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0604u)); | 806 | | // coeffs 4 5 4 5 4 5 4 5 | 807 | 5.08k | coeffs[2] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0a08u)); | 808 | | // coeffs 6 7 6 7 6 7 6 7 | 809 | 5.08k | coeffs[3] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0e0cu)); | 810 | 5.08k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:prepare_coeffs_ssse3 Unexecuted instantiation: wiener_convolve_avx2.c:prepare_coeffs_ssse3 Unexecuted instantiation: highbd_convolve_2d_avx2.c:prepare_coeffs_ssse3 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:prepare_coeffs_ssse3 |
811 | | |
812 | | static inline void prepare_coeffs_2t_lowbd( |
813 | | const InterpFilterParams *const filter_params, const int subpel_q4, |
814 | 30.4k | __m256i *const coeffs /* [4] */) { |
815 | 30.4k | const int16_t *const filter = av1_get_interp_filter_subpel_kernel( |
816 | 30.4k | filter_params, subpel_q4 & SUBPEL_MASK); |
817 | 30.4k | const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); |
818 | 30.4k | const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8); |
819 | | |
820 | | // right shift all filter co-efficients by 1 to reduce the bits required. |
821 | | // This extra right shift will be taken care of at the end while rounding |
822 | | // the result. |
823 | | // Since all filter co-efficients are even, this change will not affect the |
824 | | // end result |
825 | 30.4k | assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), |
826 | 30.4k | _mm_set1_epi16((int16_t)0xffff))); |
827 | | |
828 | 30.4k | const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1); |
829 | | |
830 | | // coeffs 3 4 3 4 3 4 3 4 |
831 | 30.4k | coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0806u)); |
832 | 30.4k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:prepare_coeffs_2t_lowbd Unexecuted instantiation: highbd_convolve_avx2.c:prepare_coeffs_2t_lowbd convolve_2d_avx2.c:prepare_coeffs_2t_lowbd Line | Count | Source | 814 | 20.4k | __m256i *const coeffs /* [4] */) { | 815 | 20.4k | const int16_t *const filter = av1_get_interp_filter_subpel_kernel( | 816 | 20.4k | filter_params, subpel_q4 & SUBPEL_MASK); | 817 | 20.4k | const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); | 818 | 20.4k | const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8); | 819 | | | 820 | | // right shift all filter co-efficients by 1 to reduce the bits required. | 821 | | // This extra right shift will be taken care of at the end while rounding | 822 | | // the result. | 823 | | // Since all filter co-efficients are even, this change will not affect the | 824 | | // end result | 825 | 20.4k | assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), | 826 | 20.4k | _mm_set1_epi16((int16_t)0xffff))); | 827 | | | 828 | 20.4k | const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1); | 829 | | | 830 | | // coeffs 3 4 3 4 3 4 3 4 | 831 | 20.4k | coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0806u)); | 832 | 20.4k | } |
convolve_avx2.c:prepare_coeffs_2t_lowbd Line | Count | Source | 814 | 9.94k | __m256i *const coeffs /* [4] */) { | 815 | 9.94k | const int16_t *const filter = av1_get_interp_filter_subpel_kernel( | 816 | 9.94k | filter_params, subpel_q4 & SUBPEL_MASK); | 817 | 9.94k | const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); | 818 | 9.94k | const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8); | 819 | | | 820 | | // right shift all filter co-efficients by 1 to reduce the bits required. | 821 | | // This extra right shift will be taken care of at the end while rounding | 822 | | // the result. | 823 | | // Since all filter co-efficients are even, this change will not affect the | 824 | | // end result | 825 | 9.94k | assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), | 826 | 9.94k | _mm_set1_epi16((int16_t)0xffff))); | 827 | | | 828 | 9.94k | const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1); | 829 | | | 830 | | // coeffs 3 4 3 4 3 4 3 4 | 831 | 9.94k | coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0806u)); | 832 | 9.94k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:prepare_coeffs_2t_lowbd Unexecuted instantiation: wiener_convolve_avx2.c:prepare_coeffs_2t_lowbd Unexecuted instantiation: highbd_convolve_2d_avx2.c:prepare_coeffs_2t_lowbd Unexecuted instantiation: highbd_jnt_convolve_avx2.c:prepare_coeffs_2t_lowbd |
833 | | |
834 | | static inline void prepare_coeffs_4t_lowbd( |
835 | | const InterpFilterParams *const filter_params, const int subpel_q4, |
836 | 163k | __m256i *const coeffs /* [4] */) { |
837 | 163k | const int16_t *const filter = av1_get_interp_filter_subpel_kernel( |
838 | 163k | filter_params, subpel_q4 & SUBPEL_MASK); |
839 | 163k | const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); |
840 | 163k | const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8); |
841 | | |
842 | | // right shift all filter co-efficients by 1 to reduce the bits required. |
843 | | // This extra right shift will be taken care of at the end while rounding |
844 | | // the result. |
845 | | // Since all filter co-efficients are even, this change will not affect the |
846 | | // end result |
847 | 163k | assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), |
848 | 163k | _mm_set1_epi16((short)0xffff))); |
849 | | |
850 | 163k | const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1); |
851 | | |
852 | | // coeffs 2 3 2 3 2 3 2 3 |
853 | 163k | coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0604u)); |
854 | | // coeffs 4 5 4 5 4 5 4 5 |
855 | 163k | coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0a08u)); |
856 | 163k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:prepare_coeffs_4t_lowbd Unexecuted instantiation: highbd_convolve_avx2.c:prepare_coeffs_4t_lowbd convolve_2d_avx2.c:prepare_coeffs_4t_lowbd Line | Count | Source | 836 | 39.7k | __m256i *const coeffs /* [4] */) { | 837 | 39.7k | const int16_t *const filter = av1_get_interp_filter_subpel_kernel( | 838 | 39.7k | filter_params, subpel_q4 & SUBPEL_MASK); | 839 | 39.7k | const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); | 840 | 39.7k | const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8); | 841 | | | 842 | | // right shift all filter co-efficients by 1 to reduce the bits required. | 843 | | // This extra right shift will be taken care of at the end while rounding | 844 | | // the result. | 845 | | // Since all filter co-efficients are even, this change will not affect the | 846 | | // end result | 847 | 39.7k | assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), | 848 | 39.7k | _mm_set1_epi16((short)0xffff))); | 849 | | | 850 | 39.7k | const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1); | 851 | | | 852 | | // coeffs 2 3 2 3 2 3 2 3 | 853 | 39.7k | coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0604u)); | 854 | | // coeffs 4 5 4 5 4 5 4 5 | 855 | 39.7k | coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0a08u)); | 856 | 39.7k | } |
convolve_avx2.c:prepare_coeffs_4t_lowbd Line | Count | Source | 836 | 123k | __m256i *const coeffs /* [4] */) { | 837 | 123k | const int16_t *const filter = av1_get_interp_filter_subpel_kernel( | 838 | 123k | filter_params, subpel_q4 & SUBPEL_MASK); | 839 | 123k | const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); | 840 | 123k | const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8); | 841 | | | 842 | | // right shift all filter co-efficients by 1 to reduce the bits required. | 843 | | // This extra right shift will be taken care of at the end while rounding | 844 | | // the result. | 845 | | // Since all filter co-efficients are even, this change will not affect the | 846 | | // end result | 847 | 123k | assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), | 848 | 123k | _mm_set1_epi16((short)0xffff))); | 849 | | | 850 | 123k | const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1); | 851 | | | 852 | | // coeffs 2 3 2 3 2 3 2 3 | 853 | 123k | coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0604u)); | 854 | | // coeffs 4 5 4 5 4 5 4 5 | 855 | 123k | coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0a08u)); | 856 | 123k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:prepare_coeffs_4t_lowbd Unexecuted instantiation: wiener_convolve_avx2.c:prepare_coeffs_4t_lowbd Unexecuted instantiation: highbd_convolve_2d_avx2.c:prepare_coeffs_4t_lowbd Unexecuted instantiation: highbd_jnt_convolve_avx2.c:prepare_coeffs_4t_lowbd |
857 | | |
858 | | static inline void prepare_coeffs_6t_lowbd( |
859 | | const InterpFilterParams *const filter_params, const int subpel_q4, |
860 | 978k | __m256i *const coeffs /* [4] */) { |
861 | 978k | const int16_t *const filter = av1_get_interp_filter_subpel_kernel( |
862 | 978k | filter_params, subpel_q4 & SUBPEL_MASK); |
863 | 978k | const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); |
864 | 978k | const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8); |
865 | | |
866 | | // right shift all filter co-efficients by 1 to reduce the bits required. |
867 | | // This extra right shift will be taken care of at the end while rounding |
868 | | // the result. |
869 | | // Since all filter co-efficients are even, this change will not affect the |
870 | | // end result |
871 | 978k | assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), |
872 | 978k | _mm_set1_epi16((int16_t)0xffff))); |
873 | | |
874 | 978k | const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1); |
875 | | |
876 | | // coeffs 1 2 1 2 1 2 1 2 |
877 | 978k | coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0402u)); |
878 | | // coeffs 3 4 3 4 3 4 3 4 |
879 | 978k | coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0806u)); |
880 | | // coeffs 5 6 5 6 5 6 5 6 |
881 | 978k | coeffs[2] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0c0au)); |
882 | 978k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:prepare_coeffs_6t_lowbd Unexecuted instantiation: highbd_convolve_avx2.c:prepare_coeffs_6t_lowbd convolve_2d_avx2.c:prepare_coeffs_6t_lowbd Line | Count | Source | 860 | 595k | __m256i *const coeffs /* [4] */) { | 861 | 595k | const int16_t *const filter = av1_get_interp_filter_subpel_kernel( | 862 | 595k | filter_params, subpel_q4 & SUBPEL_MASK); | 863 | 595k | const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); | 864 | 595k | const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8); | 865 | | | 866 | | // right shift all filter co-efficients by 1 to reduce the bits required. | 867 | | // This extra right shift will be taken care of at the end while rounding | 868 | | // the result. | 869 | | // Since all filter co-efficients are even, this change will not affect the | 870 | | // end result | 871 | 595k | assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), | 872 | 595k | _mm_set1_epi16((int16_t)0xffff))); | 873 | | | 874 | 594k | const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1); | 875 | | | 876 | | // coeffs 1 2 1 2 1 2 1 2 | 877 | 594k | coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0402u)); | 878 | | // coeffs 3 4 3 4 3 4 3 4 | 879 | 594k | coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0806u)); | 880 | | // coeffs 5 6 5 6 5 6 5 6 | 881 | 594k | coeffs[2] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0c0au)); | 882 | 594k | } |
convolve_avx2.c:prepare_coeffs_6t_lowbd Line | Count | Source | 860 | 383k | __m256i *const coeffs /* [4] */) { | 861 | 383k | const int16_t *const filter = av1_get_interp_filter_subpel_kernel( | 862 | 383k | filter_params, subpel_q4 & SUBPEL_MASK); | 863 | 383k | const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); | 864 | 383k | const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8); | 865 | | | 866 | | // right shift all filter co-efficients by 1 to reduce the bits required. | 867 | | // This extra right shift will be taken care of at the end while rounding | 868 | | // the result. | 869 | | // Since all filter co-efficients are even, this change will not affect the | 870 | | // end result | 871 | 383k | assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), | 872 | 383k | _mm_set1_epi16((int16_t)0xffff))); | 873 | | | 874 | 383k | const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1); | 875 | | | 876 | | // coeffs 1 2 1 2 1 2 1 2 | 877 | 383k | coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0402u)); | 878 | | // coeffs 3 4 3 4 3 4 3 4 | 879 | 383k | coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0806u)); | 880 | | // coeffs 5 6 5 6 5 6 5 6 | 881 | 383k | coeffs[2] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0c0au)); | 882 | 383k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:prepare_coeffs_6t_lowbd Unexecuted instantiation: wiener_convolve_avx2.c:prepare_coeffs_6t_lowbd Unexecuted instantiation: highbd_convolve_2d_avx2.c:prepare_coeffs_6t_lowbd Unexecuted instantiation: highbd_jnt_convolve_avx2.c:prepare_coeffs_6t_lowbd |
883 | | |
884 | | static inline void prepare_coeffs_lowbd( |
885 | | const InterpFilterParams *const filter_params, const int subpel_q4, |
886 | 427k | __m256i *const coeffs /* [4] */) { |
887 | 427k | const int16_t *const filter = av1_get_interp_filter_subpel_kernel( |
888 | 427k | filter_params, subpel_q4 & SUBPEL_MASK); |
889 | 427k | const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); |
890 | 427k | const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8); |
891 | | |
892 | | // right shift all filter co-efficients by 1 to reduce the bits required. |
893 | | // This extra right shift will be taken care of at the end while rounding |
894 | | // the result. |
895 | | // Since all filter co-efficients are even, this change will not affect the |
896 | | // end result |
897 | 427k | assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), |
898 | 427k | _mm_set1_epi16((short)0xffff))); |
899 | | |
900 | 427k | const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1); |
901 | | |
902 | | // coeffs 0 1 0 1 0 1 0 1 |
903 | 427k | coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0200u)); |
904 | | // coeffs 2 3 2 3 2 3 2 3 |
905 | 427k | coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0604u)); |
906 | | // coeffs 4 5 4 5 4 5 4 5 |
907 | 427k | coeffs[2] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0a08u)); |
908 | | // coeffs 6 7 6 7 6 7 6 7 |
909 | 427k | coeffs[3] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0e0cu)); |
910 | 427k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:prepare_coeffs_lowbd Unexecuted instantiation: highbd_convolve_avx2.c:prepare_coeffs_lowbd convolve_2d_avx2.c:prepare_coeffs_lowbd Line | Count | Source | 886 | 49.5k | __m256i *const coeffs /* [4] */) { | 887 | 49.5k | const int16_t *const filter = av1_get_interp_filter_subpel_kernel( | 888 | 49.5k | filter_params, subpel_q4 & SUBPEL_MASK); | 889 | 49.5k | const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); | 890 | 49.5k | const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8); | 891 | | | 892 | | // right shift all filter co-efficients by 1 to reduce the bits required. | 893 | | // This extra right shift will be taken care of at the end while rounding | 894 | | // the result. | 895 | | // Since all filter co-efficients are even, this change will not affect the | 896 | | // end result | 897 | 49.5k | assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), | 898 | 49.5k | _mm_set1_epi16((short)0xffff))); | 899 | | | 900 | 49.5k | const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1); | 901 | | | 902 | | // coeffs 0 1 0 1 0 1 0 1 | 903 | 49.5k | coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0200u)); | 904 | | // coeffs 2 3 2 3 2 3 2 3 | 905 | 49.5k | coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0604u)); | 906 | | // coeffs 4 5 4 5 4 5 4 5 | 907 | 49.5k | coeffs[2] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0a08u)); | 908 | | // coeffs 6 7 6 7 6 7 6 7 | 909 | 49.5k | coeffs[3] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0e0cu)); | 910 | 49.5k | } |
convolve_avx2.c:prepare_coeffs_lowbd Line | Count | Source | 886 | 35.4k | __m256i *const coeffs /* [4] */) { | 887 | 35.4k | const int16_t *const filter = av1_get_interp_filter_subpel_kernel( | 888 | 35.4k | filter_params, subpel_q4 & SUBPEL_MASK); | 889 | 35.4k | const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); | 890 | 35.4k | const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8); | 891 | | | 892 | | // right shift all filter co-efficients by 1 to reduce the bits required. | 893 | | // This extra right shift will be taken care of at the end while rounding | 894 | | // the result. | 895 | | // Since all filter co-efficients are even, this change will not affect the | 896 | | // end result | 897 | 35.4k | assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), | 898 | 35.4k | _mm_set1_epi16((short)0xffff))); | 899 | | | 900 | 35.4k | const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1); | 901 | | | 902 | | // coeffs 0 1 0 1 0 1 0 1 | 903 | 35.4k | coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0200u)); | 904 | | // coeffs 2 3 2 3 2 3 2 3 | 905 | 35.4k | coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0604u)); | 906 | | // coeffs 4 5 4 5 4 5 4 5 | 907 | 35.4k | coeffs[2] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0a08u)); | 908 | | // coeffs 6 7 6 7 6 7 6 7 | 909 | 35.4k | coeffs[3] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0e0cu)); | 910 | 35.4k | } |
jnt_convolve_avx2.c:prepare_coeffs_lowbd Line | Count | Source | 886 | 342k | __m256i *const coeffs /* [4] */) { | 887 | 342k | const int16_t *const filter = av1_get_interp_filter_subpel_kernel( | 888 | 342k | filter_params, subpel_q4 & SUBPEL_MASK); | 889 | 342k | const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); | 890 | 342k | const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8); | 891 | | | 892 | | // right shift all filter co-efficients by 1 to reduce the bits required. | 893 | | // This extra right shift will be taken care of at the end while rounding | 894 | | // the result. | 895 | | // Since all filter co-efficients are even, this change will not affect the | 896 | | // end result | 897 | 342k | assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), | 898 | 342k | _mm_set1_epi16((short)0xffff))); | 899 | | | 900 | 342k | const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1); | 901 | | | 902 | | // coeffs 0 1 0 1 0 1 0 1 | 903 | 342k | coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0200u)); | 904 | | // coeffs 2 3 2 3 2 3 2 3 | 905 | 342k | coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0604u)); | 906 | | // coeffs 4 5 4 5 4 5 4 5 | 907 | 342k | coeffs[2] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0a08u)); | 908 | | // coeffs 6 7 6 7 6 7 6 7 | 909 | 342k | coeffs[3] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0e0cu)); | 910 | 342k | } |
Unexecuted instantiation: wiener_convolve_avx2.c:prepare_coeffs_lowbd Unexecuted instantiation: highbd_convolve_2d_avx2.c:prepare_coeffs_lowbd Unexecuted instantiation: highbd_jnt_convolve_avx2.c:prepare_coeffs_lowbd |
911 | | |
912 | | static inline void prepare_coeffs_2t( |
913 | | const InterpFilterParams *const filter_params, const int subpel_q4, |
914 | 43.3k | __m256i *const coeffs /* [4] */) { |
915 | 43.3k | const int16_t *filter = av1_get_interp_filter_subpel_kernel( |
916 | 43.3k | filter_params, subpel_q4 & SUBPEL_MASK); |
917 | | |
918 | 43.3k | const __m128i coeff_8 = _mm_loadu_si128((__m128i *)(filter + 1)); |
919 | 43.3k | const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8); |
920 | | |
921 | | // coeffs 3 4 3 4 3 4 3 4 |
922 | 43.3k | coeffs[0] = _mm256_shuffle_epi32(coeff, 0x55); |
923 | 43.3k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:prepare_coeffs_2t Unexecuted instantiation: highbd_convolve_avx2.c:prepare_coeffs_2t convolve_2d_avx2.c:prepare_coeffs_2t Line | Count | Source | 914 | 43.3k | __m256i *const coeffs /* [4] */) { | 915 | 43.3k | const int16_t *filter = av1_get_interp_filter_subpel_kernel( | 916 | 43.3k | filter_params, subpel_q4 & SUBPEL_MASK); | 917 | | | 918 | 43.3k | const __m128i coeff_8 = _mm_loadu_si128((__m128i *)(filter + 1)); | 919 | 43.3k | const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8); | 920 | | | 921 | | // coeffs 3 4 3 4 3 4 3 4 | 922 | | coeffs[0] = _mm256_shuffle_epi32(coeff, 0x55); | 923 | 43.3k | } |
Unexecuted instantiation: convolve_avx2.c:prepare_coeffs_2t Unexecuted instantiation: jnt_convolve_avx2.c:prepare_coeffs_2t Unexecuted instantiation: wiener_convolve_avx2.c:prepare_coeffs_2t Unexecuted instantiation: highbd_convolve_2d_avx2.c:prepare_coeffs_2t Unexecuted instantiation: highbd_jnt_convolve_avx2.c:prepare_coeffs_2t |
924 | | |
925 | | static inline void prepare_coeffs_4t( |
926 | | const InterpFilterParams *const filter_params, const int subpel_q4, |
927 | 670k | __m256i *const coeffs /* [4] */) { |
928 | 670k | const int16_t *filter = av1_get_interp_filter_subpel_kernel( |
929 | 670k | filter_params, subpel_q4 & SUBPEL_MASK); |
930 | | |
931 | 670k | const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter); |
932 | 670k | const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8); |
933 | | // coeffs 2 3 2 3 2 3 2 3 |
934 | 670k | coeffs[0] = _mm256_shuffle_epi32(coeff, 0x55); |
935 | | // coeffs 4 5 4 5 4 5 4 5 |
936 | 670k | coeffs[1] = _mm256_shuffle_epi32(coeff, 0xaa); |
937 | 670k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:prepare_coeffs_4t Unexecuted instantiation: highbd_convolve_avx2.c:prepare_coeffs_4t convolve_2d_avx2.c:prepare_coeffs_4t Line | Count | Source | 927 | 670k | __m256i *const coeffs /* [4] */) { | 928 | 670k | const int16_t *filter = av1_get_interp_filter_subpel_kernel( | 929 | 670k | filter_params, subpel_q4 & SUBPEL_MASK); | 930 | | | 931 | 670k | const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter); | 932 | 670k | const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8); | 933 | | // coeffs 2 3 2 3 2 3 2 3 | 934 | 670k | coeffs[0] = _mm256_shuffle_epi32(coeff, 0x55); | 935 | | // coeffs 4 5 4 5 4 5 4 5 | 936 | | coeffs[1] = _mm256_shuffle_epi32(coeff, 0xaa); | 937 | 670k | } |
Unexecuted instantiation: convolve_avx2.c:prepare_coeffs_4t Unexecuted instantiation: jnt_convolve_avx2.c:prepare_coeffs_4t Unexecuted instantiation: wiener_convolve_avx2.c:prepare_coeffs_4t Unexecuted instantiation: highbd_convolve_2d_avx2.c:prepare_coeffs_4t Unexecuted instantiation: highbd_jnt_convolve_avx2.c:prepare_coeffs_4t |
938 | | |
939 | | static inline void prepare_coeffs_6t( |
940 | | const InterpFilterParams *const filter_params, const int subpel_q4, |
941 | 495k | __m256i *const coeffs /* [4] */) { |
942 | 495k | const int16_t *filter = av1_get_interp_filter_subpel_kernel( |
943 | 495k | filter_params, subpel_q4 & SUBPEL_MASK); |
944 | | |
945 | 495k | const __m128i coeff_8 = _mm_loadu_si128((__m128i *)(filter + 1)); |
946 | 495k | const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8); |
947 | | |
948 | | // coeffs 1 2 1 2 1 2 1 2 |
949 | 495k | coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00); |
950 | | // coeffs 3 4 3 4 3 4 3 4 |
951 | 495k | coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55); |
952 | | // coeffs 5 6 5 6 5 6 5 6 |
953 | 495k | coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa); |
954 | 495k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:prepare_coeffs_6t Unexecuted instantiation: highbd_convolve_avx2.c:prepare_coeffs_6t convolve_2d_avx2.c:prepare_coeffs_6t Line | Count | Source | 941 | 495k | __m256i *const coeffs /* [4] */) { | 942 | 495k | const int16_t *filter = av1_get_interp_filter_subpel_kernel( | 943 | 495k | filter_params, subpel_q4 & SUBPEL_MASK); | 944 | | | 945 | 495k | const __m128i coeff_8 = _mm_loadu_si128((__m128i *)(filter + 1)); | 946 | 495k | const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8); | 947 | | | 948 | | // coeffs 1 2 1 2 1 2 1 2 | 949 | 495k | coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00); | 950 | | // coeffs 3 4 3 4 3 4 3 4 | 951 | 495k | coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55); | 952 | | // coeffs 5 6 5 6 5 6 5 6 | 953 | | coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa); | 954 | 495k | } |
Unexecuted instantiation: convolve_avx2.c:prepare_coeffs_6t Unexecuted instantiation: jnt_convolve_avx2.c:prepare_coeffs_6t Unexecuted instantiation: wiener_convolve_avx2.c:prepare_coeffs_6t Unexecuted instantiation: highbd_convolve_2d_avx2.c:prepare_coeffs_6t Unexecuted instantiation: highbd_jnt_convolve_avx2.c:prepare_coeffs_6t |
955 | | |
956 | | static inline void prepare_coeffs(const InterpFilterParams *const filter_params, |
957 | | const int subpel_q4, |
958 | 7.67M | __m256i *const coeffs /* [4] */) { |
959 | 7.67M | const int16_t *filter = av1_get_interp_filter_subpel_kernel( |
960 | 7.67M | filter_params, subpel_q4 & SUBPEL_MASK); |
961 | | |
962 | 7.67M | const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter); |
963 | 7.67M | const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8); |
964 | | |
965 | | // coeffs 0 1 0 1 0 1 0 1 |
966 | 7.67M | coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00); |
967 | | // coeffs 2 3 2 3 2 3 2 3 |
968 | 7.67M | coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55); |
969 | | // coeffs 4 5 4 5 4 5 4 5 |
970 | 7.67M | coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa); |
971 | | // coeffs 6 7 6 7 6 7 6 7 |
972 | 7.67M | coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff); |
973 | 7.67M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:prepare_coeffs highbd_convolve_avx2.c:prepare_coeffs Line | Count | Source | 958 | 1.59M | __m256i *const coeffs /* [4] */) { | 959 | 1.59M | const int16_t *filter = av1_get_interp_filter_subpel_kernel( | 960 | 1.59M | filter_params, subpel_q4 & SUBPEL_MASK); | 961 | | | 962 | 1.59M | const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter); | 963 | 1.59M | const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8); | 964 | | | 965 | | // coeffs 0 1 0 1 0 1 0 1 | 966 | 1.59M | coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00); | 967 | | // coeffs 2 3 2 3 2 3 2 3 | 968 | 1.59M | coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55); | 969 | | // coeffs 4 5 4 5 4 5 4 5 | 970 | 1.59M | coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa); | 971 | | // coeffs 6 7 6 7 6 7 6 7 | 972 | | coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff); | 973 | 1.59M | } |
convolve_2d_avx2.c:prepare_coeffs Line | Count | Source | 958 | 46.4k | __m256i *const coeffs /* [4] */) { | 959 | 46.4k | const int16_t *filter = av1_get_interp_filter_subpel_kernel( | 960 | 46.4k | filter_params, subpel_q4 & SUBPEL_MASK); | 961 | | | 962 | 46.4k | const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter); | 963 | 46.4k | const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8); | 964 | | | 965 | | // coeffs 0 1 0 1 0 1 0 1 | 966 | 46.4k | coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00); | 967 | | // coeffs 2 3 2 3 2 3 2 3 | 968 | 46.4k | coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55); | 969 | | // coeffs 4 5 4 5 4 5 4 5 | 970 | 46.4k | coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa); | 971 | | // coeffs 6 7 6 7 6 7 6 7 | 972 | | coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff); | 973 | 46.4k | } |
Unexecuted instantiation: convolve_avx2.c:prepare_coeffs jnt_convolve_avx2.c:prepare_coeffs Line | Count | Source | 958 | 173k | __m256i *const coeffs /* [4] */) { | 959 | 173k | const int16_t *filter = av1_get_interp_filter_subpel_kernel( | 960 | 173k | filter_params, subpel_q4 & SUBPEL_MASK); | 961 | | | 962 | 173k | const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter); | 963 | 173k | const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8); | 964 | | | 965 | | // coeffs 0 1 0 1 0 1 0 1 | 966 | 173k | coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00); | 967 | | // coeffs 2 3 2 3 2 3 2 3 | 968 | 173k | coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55); | 969 | | // coeffs 4 5 4 5 4 5 4 5 | 970 | 173k | coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa); | 971 | | // coeffs 6 7 6 7 6 7 6 7 | 972 | | coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff); | 973 | 173k | } |
Unexecuted instantiation: wiener_convolve_avx2.c:prepare_coeffs highbd_convolve_2d_avx2.c:prepare_coeffs Line | Count | Source | 958 | 5.16M | __m256i *const coeffs /* [4] */) { | 959 | 5.16M | const int16_t *filter = av1_get_interp_filter_subpel_kernel( | 960 | 5.16M | filter_params, subpel_q4 & SUBPEL_MASK); | 961 | | | 962 | 5.16M | const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter); | 963 | 5.16M | const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8); | 964 | | | 965 | | // coeffs 0 1 0 1 0 1 0 1 | 966 | 5.16M | coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00); | 967 | | // coeffs 2 3 2 3 2 3 2 3 | 968 | 5.16M | coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55); | 969 | | // coeffs 4 5 4 5 4 5 4 5 | 970 | 5.16M | coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa); | 971 | | // coeffs 6 7 6 7 6 7 6 7 | 972 | | coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff); | 973 | 5.16M | } |
highbd_jnt_convolve_avx2.c:prepare_coeffs Line | Count | Source | 958 | 693k | __m256i *const coeffs /* [4] */) { | 959 | 693k | const int16_t *filter = av1_get_interp_filter_subpel_kernel( | 960 | 693k | filter_params, subpel_q4 & SUBPEL_MASK); | 961 | | | 962 | 693k | const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter); | 963 | 693k | const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8); | 964 | | | 965 | | // coeffs 0 1 0 1 0 1 0 1 | 966 | 693k | coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00); | 967 | | // coeffs 2 3 2 3 2 3 2 3 | 968 | 693k | coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55); | 969 | | // coeffs 4 5 4 5 4 5 4 5 | 970 | 693k | coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa); | 971 | | // coeffs 6 7 6 7 6 7 6 7 | 972 | | coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff); | 973 | 693k | } |
|
974 | | |
975 | | static inline void prepare_coeffs_12taps( |
976 | | const InterpFilterParams *const filter_params, const int subpel_q4, |
977 | 0 | __m256i *const coeffs /* [4] */) { |
978 | 0 | const int16_t *filter = av1_get_interp_filter_subpel_kernel( |
979 | 0 | filter_params, subpel_q4 & SUBPEL_MASK); |
980 | |
|
981 | 0 | __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter); |
982 | 0 | __m256i coeff = _mm256_broadcastsi128_si256(coeff_8); |
983 | | |
984 | | // coeffs 0 1 0 1 0 1 0 1 |
985 | 0 | coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00); |
986 | | // coeffs 2 3 2 3 2 3 2 3 |
987 | 0 | coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55); |
988 | | // coeffs 4 5 4 5 4 5 4 5 |
989 | 0 | coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa); |
990 | | // coeffs 6 7 6 7 6 7 6 7 |
991 | 0 | coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff); |
992 | | // coeffs 8 9 10 11 0 0 0 0 |
993 | 0 | coeff_8 = _mm_loadl_epi64((__m128i *)(filter + 8)); |
994 | 0 | coeff = _mm256_broadcastq_epi64(coeff_8); |
995 | 0 | coeffs[4] = _mm256_shuffle_epi32(coeff, 0x00); // coeffs 8 9 8 9 8 9 8 9 |
996 | 0 | coeffs[5] = _mm256_shuffle_epi32(coeff, 0x55); // coeffs 10 11 10 11.. 10 11 |
997 | 0 | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:prepare_coeffs_12taps Unexecuted instantiation: highbd_convolve_avx2.c:prepare_coeffs_12taps Unexecuted instantiation: convolve_2d_avx2.c:prepare_coeffs_12taps Unexecuted instantiation: convolve_avx2.c:prepare_coeffs_12taps Unexecuted instantiation: jnt_convolve_avx2.c:prepare_coeffs_12taps Unexecuted instantiation: wiener_convolve_avx2.c:prepare_coeffs_12taps Unexecuted instantiation: highbd_convolve_2d_avx2.c:prepare_coeffs_12taps Unexecuted instantiation: highbd_jnt_convolve_avx2.c:prepare_coeffs_12taps |
998 | | |
// 128-bit multiply-accumulate over two source/coefficient register pairs.
// Each ss[i] is combined with coeffs[i] by _mm_maddubs_epi16 and the two
// 16-bit partial sums are added together.
static inline __m128i convolve_lowbd_4tap_ssse3(const __m128i ss[2],
                                                const __m128i coeffs[2]) {
  __m128i acc = _mm_maddubs_epi16(ss[0], coeffs[0]);
  acc = _mm_add_epi16(acc, _mm_maddubs_epi16(ss[1], coeffs[1]));
  return acc;
}
1006 | | |
// 128-bit multiply-accumulate over three source/coefficient register pairs.
// Each ss[i] is combined with coeffs[i] by _mm_maddubs_epi16 and the three
// 16-bit partial sums are added together.
static inline __m128i convolve_lowbd_6tap_ssse3(const __m128i ss[3],
                                                const __m128i coeffs[3]) {
  const __m128i pair0 = _mm_maddubs_epi16(ss[0], coeffs[0]);
  const __m128i pair1 = _mm_maddubs_epi16(ss[1], coeffs[1]);
  const __m128i pair2 = _mm_maddubs_epi16(ss[2], coeffs[2]);

  // Outer pairs first, then the middle pair.
  const __m128i outer = _mm_add_epi16(pair0, pair2);
  return _mm_add_epi16(outer, pair1);
}
1017 | | |
// 128-bit multiply-accumulate over four source/coefficient register pairs.
// Each ss[i] is combined with coeffs[i] by _mm_maddubs_epi16 and the four
// 16-bit partial sums are added together.
static inline __m128i convolve_lowbd_ssse3(const __m128i ss[4],
                                           const __m128i coeffs[4]) {
  const __m128i pair0 = _mm_maddubs_epi16(ss[0], coeffs[0]);
  const __m128i pair1 = _mm_maddubs_epi16(ss[1], coeffs[1]);
  const __m128i pair2 = _mm_maddubs_epi16(ss[2], coeffs[2]);
  const __m128i pair3 = _mm_maddubs_epi16(ss[3], coeffs[3]);

  // Sum pairs 0 and 2, sum pairs 1 and 3, then combine the halves.
  const __m128i even = _mm_add_epi16(pair0, pair2);
  const __m128i odd = _mm_add_epi16(pair1, pair3);
  return _mm_add_epi16(even, odd);
}
1030 | | |
// 256-bit multiply-accumulate over four source/coefficient register pairs.
// Each s[i] is combined with coeffs[i] by _mm256_maddubs_epi16 and the four
// 16-bit partial sums are added together.
static inline __m256i convolve_lowbd(const __m256i *const s,
                                     const __m256i *const coeffs) {
  const __m256i pair0 = _mm256_maddubs_epi16(s[0], coeffs[0]);
  const __m256i pair1 = _mm256_maddubs_epi16(s[1], coeffs[1]);
  const __m256i pair2 = _mm256_maddubs_epi16(s[2], coeffs[2]);
  const __m256i pair3 = _mm256_maddubs_epi16(s[3], coeffs[3]);

  // Sum pairs 0 and 2, sum pairs 1 and 3, then combine the halves.
  const __m256i even = _mm256_add_epi16(pair0, pair2);
  const __m256i odd = _mm256_add_epi16(pair1, pair3);

  // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
  return _mm256_add_epi16(even, odd);
}
1044 | | |
// 256-bit multiply-accumulate over three source/coefficient register pairs.
// Each s[i] is combined with coeffs[i] by _mm256_maddubs_epi16 and the three
// 16-bit partial sums are added together.
static inline __m256i convolve_lowbd_6tap(const __m256i *const s,
                                          const __m256i *const coeffs) {
  const __m256i pair0 = _mm256_maddubs_epi16(s[0], coeffs[0]);
  const __m256i pair1 = _mm256_maddubs_epi16(s[1], coeffs[1]);
  const __m256i pair2 = _mm256_maddubs_epi16(s[2], coeffs[2]);

  // Outer pairs first, then the middle pair.
  const __m256i outer = _mm256_add_epi16(pair0, pair2);

  // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
  return _mm256_add_epi16(outer, pair1);
}
1057 | | |
// 256-bit multiply-accumulate over two source/coefficient register pairs.
// Each s[i] is combined with coeffs[i] by _mm256_maddubs_epi16 and the two
// 16-bit partial sums are added together.
static inline __m256i convolve_lowbd_4tap(const __m256i *const s,
                                          const __m256i *const coeffs) {
  const __m256i first = _mm256_maddubs_epi16(s[0], coeffs[0]);
  const __m256i second = _mm256_maddubs_epi16(s[1], coeffs[1]);

  // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
  return _mm256_add_epi16(second, first);
}
1068 | | |
1069 | | static inline __m256i convolve_6tap(const __m256i *const s, |
1070 | 14.8M | const __m256i *const coeffs) { |
1071 | 14.8M | const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]); |
1072 | 14.8M | const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]); |
1073 | 14.8M | const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]); |
1074 | | |
1075 | 14.8M | const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1), res_2); |
1076 | | |
1077 | 14.8M | return res; |
1078 | 14.8M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_6tap Unexecuted instantiation: highbd_convolve_avx2.c:convolve_6tap convolve_2d_avx2.c:convolve_6tap Line | Count | Source | 1070 | 14.8M | const __m256i *const coeffs) { | 1071 | 14.8M | const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]); | 1072 | 14.8M | const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]); | 1073 | 14.8M | const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]); | 1074 | | | 1075 | 14.8M | const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1), res_2); | 1076 | | | 1077 | 14.8M | return res; | 1078 | 14.8M | } |
Unexecuted instantiation: convolve_avx2.c:convolve_6tap Unexecuted instantiation: jnt_convolve_avx2.c:convolve_6tap Unexecuted instantiation: wiener_convolve_avx2.c:convolve_6tap Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_6tap Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_6tap |
1079 | | |
1080 | | static inline __m256i convolve_12taps(const __m256i *const s, |
1081 | 0 | const __m256i *const coeffs) { |
1082 | 0 | const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]); |
1083 | 0 | const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]); |
1084 | 0 | const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]); |
1085 | 0 | const __m256i res_3 = _mm256_madd_epi16(s[3], coeffs[3]); |
1086 | 0 | const __m256i res_4 = _mm256_madd_epi16(s[4], coeffs[4]); |
1087 | 0 | const __m256i res_5 = _mm256_madd_epi16(s[5], coeffs[5]); |
1088 | |
|
1089 | 0 | const __m256i res1 = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1), |
1090 | 0 | _mm256_add_epi32(res_2, res_3)); |
1091 | 0 | const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_4, res_5), res1); |
1092 | |
|
1093 | 0 | return res; |
1094 | 0 | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_12taps Unexecuted instantiation: highbd_convolve_avx2.c:convolve_12taps Unexecuted instantiation: convolve_2d_avx2.c:convolve_12taps Unexecuted instantiation: convolve_avx2.c:convolve_12taps Unexecuted instantiation: jnt_convolve_avx2.c:convolve_12taps Unexecuted instantiation: wiener_convolve_avx2.c:convolve_12taps Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_12taps Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_12taps |
1095 | | |
1096 | | static inline __m256i convolve(const __m256i *const s, |
1097 | 207M | const __m256i *const coeffs) { |
1098 | 207M | const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]); |
1099 | 207M | const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]); |
1100 | 207M | const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]); |
1101 | 207M | const __m256i res_3 = _mm256_madd_epi16(s[3], coeffs[3]); |
1102 | | |
1103 | 207M | const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1), |
1104 | 207M | _mm256_add_epi32(res_2, res_3)); |
1105 | | |
1106 | 207M | return res; |
1107 | 207M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve highbd_convolve_avx2.c:convolve Line | Count | Source | 1097 | 30.1M | const __m256i *const coeffs) { | 1098 | 30.1M | const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]); | 1099 | 30.1M | const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]); | 1100 | 30.1M | const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]); | 1101 | 30.1M | const __m256i res_3 = _mm256_madd_epi16(s[3], coeffs[3]); | 1102 | | | 1103 | 30.1M | const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1), | 1104 | 30.1M | _mm256_add_epi32(res_2, res_3)); | 1105 | | | 1106 | 30.1M | return res; | 1107 | 30.1M | } |
convolve_2d_avx2.c:convolve Line | Count | Source | 1097 | 3.38M | const __m256i *const coeffs) { | 1098 | 3.38M | const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]); | 1099 | 3.38M | const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]); | 1100 | 3.38M | const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]); | 1101 | 3.38M | const __m256i res_3 = _mm256_madd_epi16(s[3], coeffs[3]); | 1102 | | | 1103 | 3.38M | const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1), | 1104 | 3.38M | _mm256_add_epi32(res_2, res_3)); | 1105 | | | 1106 | 3.38M | return res; | 1107 | 3.38M | } |
Unexecuted instantiation: convolve_avx2.c:convolve jnt_convolve_avx2.c:convolve Line | Count | Source | 1097 | 4.59M | const __m256i *const coeffs) { | 1098 | 4.59M | const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]); | 1099 | 4.59M | const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]); | 1100 | 4.59M | const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]); | 1101 | 4.59M | const __m256i res_3 = _mm256_madd_epi16(s[3], coeffs[3]); | 1102 | | | 1103 | 4.59M | const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1), | 1104 | 4.59M | _mm256_add_epi32(res_2, res_3)); | 1105 | | | 1106 | 4.59M | return res; | 1107 | 4.59M | } |
wiener_convolve_avx2.c:convolve Line | Count | Source | 1097 | 19.2M | const __m256i *const coeffs) { | 1098 | 19.2M | const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]); | 1099 | 19.2M | const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]); | 1100 | 19.2M | const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]); | 1101 | 19.2M | const __m256i res_3 = _mm256_madd_epi16(s[3], coeffs[3]); | 1102 | | | 1103 | 19.2M | const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1), | 1104 | 19.2M | _mm256_add_epi32(res_2, res_3)); | 1105 | | | 1106 | 19.2M | return res; | 1107 | 19.2M | } |
highbd_convolve_2d_avx2.c:convolve Line | Count | Source | 1097 | 111M | const __m256i *const coeffs) { | 1098 | 111M | const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]); | 1099 | 111M | const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]); | 1100 | 111M | const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]); | 1101 | 111M | const __m256i res_3 = _mm256_madd_epi16(s[3], coeffs[3]); | 1102 | | | 1103 | 111M | const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1), | 1104 | 111M | _mm256_add_epi32(res_2, res_3)); | 1105 | | | 1106 | 111M | return res; | 1107 | 111M | } |
highbd_jnt_convolve_avx2.c:convolve Line | Count | Source | 1097 | 39.1M | const __m256i *const coeffs) { | 1098 | 39.1M | const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]); | 1099 | 39.1M | const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]); | 1100 | 39.1M | const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]); | 1101 | 39.1M | const __m256i res_3 = _mm256_madd_epi16(s[3], coeffs[3]); | 1102 | | | 1103 | 39.1M | const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1), | 1104 | 39.1M | _mm256_add_epi32(res_2, res_3)); | 1105 | | | 1106 | 39.1M | return res; | 1107 | 39.1M | } |
|
1108 | | |
1109 | | static inline __m256i convolve_4tap(const __m256i *const s, |
1110 | 3.55M | const __m256i *const coeffs) { |
1111 | 3.55M | const __m256i res_1 = _mm256_madd_epi16(s[0], coeffs[0]); |
1112 | 3.55M | const __m256i res_2 = _mm256_madd_epi16(s[1], coeffs[1]); |
1113 | | |
1114 | 3.55M | const __m256i res = _mm256_add_epi32(res_1, res_2); |
1115 | 3.55M | return res; |
1116 | 3.55M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_4tap Unexecuted instantiation: highbd_convolve_avx2.c:convolve_4tap convolve_2d_avx2.c:convolve_4tap Line | Count | Source | 1110 | 3.31M | const __m256i *const coeffs) { | 1111 | 3.31M | const __m256i res_1 = _mm256_madd_epi16(s[0], coeffs[0]); | 1112 | 3.31M | const __m256i res_2 = _mm256_madd_epi16(s[1], coeffs[1]); | 1113 | | | 1114 | 3.31M | const __m256i res = _mm256_add_epi32(res_1, res_2); | 1115 | 3.31M | return res; | 1116 | 3.31M | } |
Unexecuted instantiation: convolve_avx2.c:convolve_4tap jnt_convolve_avx2.c:convolve_4tap Line | Count | Source | 1110 | 244k | const __m256i *const coeffs) { | 1111 | 244k | const __m256i res_1 = _mm256_madd_epi16(s[0], coeffs[0]); | 1112 | 244k | const __m256i res_2 = _mm256_madd_epi16(s[1], coeffs[1]); | 1113 | | | 1114 | 244k | const __m256i res = _mm256_add_epi32(res_1, res_2); | 1115 | 244k | return res; | 1116 | 244k | } |
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_4tap Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_4tap Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_4tap |
1117 | | |
1118 | | static inline __m128i convolve_lowbd_x_2tap_ssse3(const __m128i data, |
1119 | | const __m128i *const coeffs, |
1120 | 87.1k | const __m128i *const filt) { |
1121 | 87.1k | __m128i s; |
1122 | 87.1k | s = _mm_shuffle_epi8(data, filt[0]); |
1123 | | |
1124 | 87.1k | return _mm_maddubs_epi16(s, coeffs[0]); |
1125 | 87.1k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_lowbd_x_2tap_ssse3 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_lowbd_x_2tap_ssse3 convolve_2d_avx2.c:convolve_lowbd_x_2tap_ssse3 Line | Count | Source | 1120 | 87.1k | const __m128i *const filt) { | 1121 | 87.1k | __m128i s; | 1122 | 87.1k | s = _mm_shuffle_epi8(data, filt[0]); | 1123 | | | 1124 | 87.1k | return _mm_maddubs_epi16(s, coeffs[0]); | 1125 | 87.1k | } |
Unexecuted instantiation: convolve_avx2.c:convolve_lowbd_x_2tap_ssse3 Unexecuted instantiation: jnt_convolve_avx2.c:convolve_lowbd_x_2tap_ssse3 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_lowbd_x_2tap_ssse3 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_lowbd_x_2tap_ssse3 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_lowbd_x_2tap_ssse3 |
1126 | | |
1127 | | static inline __m128i convolve_lowbd_x_4tap_ssse3(const __m128i data, |
1128 | | const __m128i *const coeffs, |
1129 | 2.81M | const __m128i *const filt) { |
1130 | 2.81M | __m128i s[2]; |
1131 | | |
1132 | 2.81M | s[0] = _mm_shuffle_epi8(data, filt[0]); |
1133 | 2.81M | s[1] = _mm_shuffle_epi8(data, filt[1]); |
1134 | | |
1135 | 2.81M | return convolve_lowbd_4tap_ssse3(s, coeffs); |
1136 | 2.81M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_lowbd_x_4tap_ssse3 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_lowbd_x_4tap_ssse3 convolve_2d_avx2.c:convolve_lowbd_x_4tap_ssse3 Line | Count | Source | 1129 | 2.81M | const __m128i *const filt) { | 1130 | 2.81M | __m128i s[2]; | 1131 | | | 1132 | 2.81M | s[0] = _mm_shuffle_epi8(data, filt[0]); | 1133 | 2.81M | s[1] = _mm_shuffle_epi8(data, filt[1]); | 1134 | | | 1135 | 2.81M | return convolve_lowbd_4tap_ssse3(s, coeffs); | 1136 | 2.81M | } |
Unexecuted instantiation: convolve_avx2.c:convolve_lowbd_x_4tap_ssse3 Unexecuted instantiation: jnt_convolve_avx2.c:convolve_lowbd_x_4tap_ssse3 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_lowbd_x_4tap_ssse3 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_lowbd_x_4tap_ssse3 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_lowbd_x_4tap_ssse3 |
1137 | | |
1138 | | static inline __m256i convolve_lowbd_x(const __m256i data, |
1139 | | const __m256i *const coeffs, |
1140 | 18.0M | const __m256i *const filt) { |
1141 | 18.0M | __m256i s[4]; |
1142 | | |
1143 | 18.0M | s[0] = _mm256_shuffle_epi8(data, filt[0]); |
1144 | 18.0M | s[1] = _mm256_shuffle_epi8(data, filt[1]); |
1145 | 18.0M | s[2] = _mm256_shuffle_epi8(data, filt[2]); |
1146 | 18.0M | s[3] = _mm256_shuffle_epi8(data, filt[3]); |
1147 | | |
1148 | 18.0M | return convolve_lowbd(s, coeffs); |
1149 | 18.0M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_lowbd_x Unexecuted instantiation: highbd_convolve_avx2.c:convolve_lowbd_x convolve_2d_avx2.c:convolve_lowbd_x Line | Count | Source | 1140 | 2.28M | const __m256i *const filt) { | 1141 | 2.28M | __m256i s[4]; | 1142 | | | 1143 | 2.28M | s[0] = _mm256_shuffle_epi8(data, filt[0]); | 1144 | 2.28M | s[1] = _mm256_shuffle_epi8(data, filt[1]); | 1145 | 2.28M | s[2] = _mm256_shuffle_epi8(data, filt[2]); | 1146 | 2.28M | s[3] = _mm256_shuffle_epi8(data, filt[3]); | 1147 | | | 1148 | 2.28M | return convolve_lowbd(s, coeffs); | 1149 | 2.28M | } |
convolve_avx2.c:convolve_lowbd_x Line | Count | Source | 1140 | 331k | const __m256i *const filt) { | 1141 | 331k | __m256i s[4]; | 1142 | | | 1143 | 331k | s[0] = _mm256_shuffle_epi8(data, filt[0]); | 1144 | 331k | s[1] = _mm256_shuffle_epi8(data, filt[1]); | 1145 | 331k | s[2] = _mm256_shuffle_epi8(data, filt[2]); | 1146 | 331k | s[3] = _mm256_shuffle_epi8(data, filt[3]); | 1147 | | | 1148 | 331k | return convolve_lowbd(s, coeffs); | 1149 | 331k | } |
jnt_convolve_avx2.c:convolve_lowbd_x Line | Count | Source | 1140 | 4.63M | const __m256i *const filt) { | 1141 | 4.63M | __m256i s[4]; | 1142 | | | 1143 | 4.63M | s[0] = _mm256_shuffle_epi8(data, filt[0]); | 1144 | 4.63M | s[1] = _mm256_shuffle_epi8(data, filt[1]); | 1145 | 4.63M | s[2] = _mm256_shuffle_epi8(data, filt[2]); | 1146 | 4.63M | s[3] = _mm256_shuffle_epi8(data, filt[3]); | 1147 | | | 1148 | 4.63M | return convolve_lowbd(s, coeffs); | 1149 | 4.63M | } |
wiener_convolve_avx2.c:convolve_lowbd_x Line | Count | Source | 1140 | 10.7M | const __m256i *const filt) { | 1141 | 10.7M | __m256i s[4]; | 1142 | | | 1143 | 10.7M | s[0] = _mm256_shuffle_epi8(data, filt[0]); | 1144 | 10.7M | s[1] = _mm256_shuffle_epi8(data, filt[1]); | 1145 | 10.7M | s[2] = _mm256_shuffle_epi8(data, filt[2]); | 1146 | 10.7M | s[3] = _mm256_shuffle_epi8(data, filt[3]); | 1147 | | | 1148 | 10.7M | return convolve_lowbd(s, coeffs); | 1149 | 10.7M | } |
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_lowbd_x Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_lowbd_x |
1150 | | |
1151 | | static inline __m256i convolve_lowbd_x_6tap(const __m256i data, |
1152 | | const __m256i *const coeffs, |
1153 | 13.2M | const __m256i *const filt) { |
1154 | 13.2M | __m256i s[4]; |
1155 | | |
1156 | 13.2M | s[0] = _mm256_shuffle_epi8(data, filt[0]); |
1157 | 13.2M | s[1] = _mm256_shuffle_epi8(data, filt[1]); |
1158 | 13.2M | s[2] = _mm256_shuffle_epi8(data, filt[2]); |
1159 | | |
1160 | 13.2M | return convolve_lowbd_6tap(s, coeffs); |
1161 | 13.2M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_lowbd_x_6tap Unexecuted instantiation: highbd_convolve_avx2.c:convolve_lowbd_x_6tap convolve_2d_avx2.c:convolve_lowbd_x_6tap Line | Count | Source | 1153 | 9.99M | const __m256i *const filt) { | 1154 | 9.99M | __m256i s[4]; | 1155 | | | 1156 | 9.99M | s[0] = _mm256_shuffle_epi8(data, filt[0]); | 1157 | 9.99M | s[1] = _mm256_shuffle_epi8(data, filt[1]); | 1158 | 9.99M | s[2] = _mm256_shuffle_epi8(data, filt[2]); | 1159 | | | 1160 | 9.99M | return convolve_lowbd_6tap(s, coeffs); | 1161 | 9.99M | } |
convolve_avx2.c:convolve_lowbd_x_6tap Line | Count | Source | 1153 | 3.29M | const __m256i *const filt) { | 1154 | 3.29M | __m256i s[4]; | 1155 | | | 1156 | 3.29M | s[0] = _mm256_shuffle_epi8(data, filt[0]); | 1157 | 3.29M | s[1] = _mm256_shuffle_epi8(data, filt[1]); | 1158 | 3.29M | s[2] = _mm256_shuffle_epi8(data, filt[2]); | 1159 | | | 1160 | 3.29M | return convolve_lowbd_6tap(s, coeffs); | 1161 | 3.29M | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_lowbd_x_6tap Unexecuted instantiation: wiener_convolve_avx2.c:convolve_lowbd_x_6tap Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_lowbd_x_6tap Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_lowbd_x_6tap |
1162 | | |
1163 | | static inline __m256i convolve_lowbd_x_4tap(const __m256i data, |
1164 | | const __m256i *const coeffs, |
1165 | 3.21M | const __m256i *const filt) { |
1166 | 3.21M | __m256i s[2]; |
1167 | | |
1168 | 3.21M | s[0] = _mm256_shuffle_epi8(data, filt[0]); |
1169 | 3.21M | s[1] = _mm256_shuffle_epi8(data, filt[1]); |
1170 | | |
1171 | 3.21M | return convolve_lowbd_4tap(s, coeffs); |
1172 | 3.21M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_lowbd_x_4tap Unexecuted instantiation: highbd_convolve_avx2.c:convolve_lowbd_x_4tap convolve_2d_avx2.c:convolve_lowbd_x_4tap Line | Count | Source | 1165 | 1.04M | const __m256i *const filt) { | 1166 | 1.04M | __m256i s[2]; | 1167 | | | 1168 | 1.04M | s[0] = _mm256_shuffle_epi8(data, filt[0]); | 1169 | 1.04M | s[1] = _mm256_shuffle_epi8(data, filt[1]); | 1170 | | | 1171 | 1.04M | return convolve_lowbd_4tap(s, coeffs); | 1172 | 1.04M | } |
convolve_avx2.c:convolve_lowbd_x_4tap Line | Count | Source | 1165 | 589k | const __m256i *const filt) { | 1166 | 589k | __m256i s[2]; | 1167 | | | 1168 | 589k | s[0] = _mm256_shuffle_epi8(data, filt[0]); | 1169 | 589k | s[1] = _mm256_shuffle_epi8(data, filt[1]); | 1170 | | | 1171 | 589k | return convolve_lowbd_4tap(s, coeffs); | 1172 | 589k | } |
jnt_convolve_avx2.c:convolve_lowbd_x_4tap Line | Count | Source | 1165 | 1.58M | const __m256i *const filt) { | 1166 | 1.58M | __m256i s[2]; | 1167 | | | 1168 | 1.58M | s[0] = _mm256_shuffle_epi8(data, filt[0]); | 1169 | 1.58M | s[1] = _mm256_shuffle_epi8(data, filt[1]); | 1170 | | | 1171 | 1.58M | return convolve_lowbd_4tap(s, coeffs); | 1172 | 1.58M | } |
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_lowbd_x_4tap Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_lowbd_x_4tap Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_lowbd_x_4tap |
1173 | | |
1174 | | static inline __m256i convolve_lowbd_x_2tap(const __m256i data, |
1175 | | const __m256i *const coeffs, |
1176 | 528k | const __m256i *const filt) { |
1177 | 528k | __m256i s; |
1178 | 528k | s = _mm256_shuffle_epi8(data, filt[0]); |
1179 | | |
1180 | 528k | return _mm256_maddubs_epi16(s, coeffs[0]); |
1181 | 528k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_lowbd_x_2tap Unexecuted instantiation: highbd_convolve_avx2.c:convolve_lowbd_x_2tap convolve_2d_avx2.c:convolve_lowbd_x_2tap Line | Count | Source | 1176 | 528k | const __m256i *const filt) { | 1177 | 528k | __m256i s; | 1178 | 528k | s = _mm256_shuffle_epi8(data, filt[0]); | 1179 | | | 1180 | 528k | return _mm256_maddubs_epi16(s, coeffs[0]); | 1181 | 528k | } |
Unexecuted instantiation: convolve_avx2.c:convolve_lowbd_x_2tap Unexecuted instantiation: jnt_convolve_avx2.c:convolve_lowbd_x_2tap Unexecuted instantiation: wiener_convolve_avx2.c:convolve_lowbd_x_2tap Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_lowbd_x_2tap Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_lowbd_x_2tap |
1182 | | |
1183 | | static inline void add_store_aligned_256(CONV_BUF_TYPE *const dst, |
1184 | | const __m256i *const res, |
1185 | 0 | const int do_average) { |
1186 | 0 | __m256i d; |
1187 | 0 | if (do_average) { |
1188 | 0 | d = _mm256_load_si256((__m256i *)dst); |
1189 | 0 | d = _mm256_add_epi32(d, *res); |
1190 | 0 | d = _mm256_srai_epi32(d, 1); |
1191 | 0 | } else { |
1192 | 0 | d = *res; |
1193 | 0 | } |
1194 | 0 | _mm256_store_si256((__m256i *)dst, d); |
1195 | 0 | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:add_store_aligned_256 Unexecuted instantiation: highbd_convolve_avx2.c:add_store_aligned_256 Unexecuted instantiation: convolve_2d_avx2.c:add_store_aligned_256 Unexecuted instantiation: convolve_avx2.c:add_store_aligned_256 Unexecuted instantiation: jnt_convolve_avx2.c:add_store_aligned_256 Unexecuted instantiation: wiener_convolve_avx2.c:add_store_aligned_256 Unexecuted instantiation: highbd_convolve_2d_avx2.c:add_store_aligned_256 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:add_store_aligned_256 |
1196 | | |
1197 | | static inline __m256i comp_avg(const __m256i *const data_ref_0, |
1198 | | const __m256i *const res_unsigned, |
1199 | | const __m256i *const wt, |
1200 | 124M | const int use_dist_wtd_comp_avg) { |
1201 | 124M | __m256i res; |
1202 | 124M | if (use_dist_wtd_comp_avg) { |
1203 | 2.06M | const __m256i data_lo = _mm256_unpacklo_epi16(*data_ref_0, *res_unsigned); |
1204 | 2.06M | const __m256i data_hi = _mm256_unpackhi_epi16(*data_ref_0, *res_unsigned); |
1205 | | |
1206 | 2.06M | const __m256i wt_res_lo = _mm256_madd_epi16(data_lo, *wt); |
1207 | 2.06M | const __m256i wt_res_hi = _mm256_madd_epi16(data_hi, *wt); |
1208 | | |
1209 | 2.06M | const __m256i res_lo = _mm256_srai_epi32(wt_res_lo, DIST_PRECISION_BITS); |
1210 | 2.06M | const __m256i res_hi = _mm256_srai_epi32(wt_res_hi, DIST_PRECISION_BITS); |
1211 | | |
1212 | 2.06M | res = _mm256_packs_epi32(res_lo, res_hi); |
1213 | 122M | } else { |
1214 | 122M | const __m256i wt_res = _mm256_add_epi16(*data_ref_0, *res_unsigned); |
1215 | 122M | res = _mm256_srai_epi16(wt_res, 1); |
1216 | 122M | } |
1217 | 124M | return res; |
1218 | 124M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:comp_avg Unexecuted instantiation: highbd_convolve_avx2.c:comp_avg Unexecuted instantiation: convolve_2d_avx2.c:comp_avg Unexecuted instantiation: convolve_avx2.c:comp_avg jnt_convolve_avx2.c:comp_avg Line | Count | Source | 1200 | 124M | const int use_dist_wtd_comp_avg) { | 1201 | 124M | __m256i res; | 1202 | 124M | if (use_dist_wtd_comp_avg) { | 1203 | 2.06M | const __m256i data_lo = _mm256_unpacklo_epi16(*data_ref_0, *res_unsigned); | 1204 | 2.06M | const __m256i data_hi = _mm256_unpackhi_epi16(*data_ref_0, *res_unsigned); | 1205 | | | 1206 | 2.06M | const __m256i wt_res_lo = _mm256_madd_epi16(data_lo, *wt); | 1207 | 2.06M | const __m256i wt_res_hi = _mm256_madd_epi16(data_hi, *wt); | 1208 | | | 1209 | 2.06M | const __m256i res_lo = _mm256_srai_epi32(wt_res_lo, DIST_PRECISION_BITS); | 1210 | 2.06M | const __m256i res_hi = _mm256_srai_epi32(wt_res_hi, DIST_PRECISION_BITS); | 1211 | | | 1212 | 2.06M | res = _mm256_packs_epi32(res_lo, res_hi); | 1213 | 122M | } else { | 1214 | 122M | const __m256i wt_res = _mm256_add_epi16(*data_ref_0, *res_unsigned); | 1215 | 122M | res = _mm256_srai_epi16(wt_res, 1); | 1216 | 122M | } | 1217 | 124M | return res; | 1218 | 124M | } |
Unexecuted instantiation: wiener_convolve_avx2.c:comp_avg Unexecuted instantiation: highbd_convolve_2d_avx2.c:comp_avg Unexecuted instantiation: highbd_jnt_convolve_avx2.c:comp_avg |
1219 | | |
1220 | | static inline __m256i convolve_rounding(const __m256i *const res_unsigned, |
1221 | | const __m256i *const offset_const, |
1222 | | const __m256i *const round_const, |
1223 | 124M | const int round_shift) { |
1224 | 124M | const __m256i res_signed = _mm256_sub_epi16(*res_unsigned, *offset_const); |
1225 | 124M | const __m256i res_round = _mm256_srai_epi16( |
1226 | 124M | _mm256_add_epi16(res_signed, *round_const), round_shift); |
1227 | 124M | return res_round; |
1228 | 124M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_rounding Unexecuted instantiation: highbd_convolve_avx2.c:convolve_rounding Unexecuted instantiation: convolve_2d_avx2.c:convolve_rounding Unexecuted instantiation: convolve_avx2.c:convolve_rounding jnt_convolve_avx2.c:convolve_rounding Line | Count | Source | 1223 | 124M | const int round_shift) { | 1224 | 124M | const __m256i res_signed = _mm256_sub_epi16(*res_unsigned, *offset_const); | 1225 | 124M | const __m256i res_round = _mm256_srai_epi16( | 1226 | 124M | _mm256_add_epi16(res_signed, *round_const), round_shift); | 1227 | 124M | return res_round; | 1228 | 124M | } |
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_rounding Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_rounding Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_rounding |
1229 | | |
1230 | | static inline __m256i highbd_comp_avg(const __m256i *const data_ref_0, |
1231 | | const __m256i *const res_unsigned, |
1232 | | const __m256i *const wt0, |
1233 | | const __m256i *const wt1, |
1234 | 12.5M | const int use_dist_wtd_comp_avg) { |
1235 | 12.5M | __m256i res; |
1236 | 12.5M | if (use_dist_wtd_comp_avg) { |
1237 | 1.29M | const __m256i wt0_res = _mm256_mullo_epi32(*data_ref_0, *wt0); |
1238 | 1.29M | const __m256i wt1_res = _mm256_mullo_epi32(*res_unsigned, *wt1); |
1239 | 1.29M | const __m256i wt_res = _mm256_add_epi32(wt0_res, wt1_res); |
1240 | 1.29M | res = _mm256_srai_epi32(wt_res, DIST_PRECISION_BITS); |
1241 | 11.2M | } else { |
1242 | 11.2M | const __m256i wt_res = _mm256_add_epi32(*data_ref_0, *res_unsigned); |
1243 | 11.2M | res = _mm256_srai_epi32(wt_res, 1); |
1244 | 11.2M | } |
1245 | 12.5M | return res; |
1246 | 12.5M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:highbd_comp_avg Unexecuted instantiation: highbd_convolve_avx2.c:highbd_comp_avg Unexecuted instantiation: convolve_2d_avx2.c:highbd_comp_avg Unexecuted instantiation: convolve_avx2.c:highbd_comp_avg Unexecuted instantiation: jnt_convolve_avx2.c:highbd_comp_avg Unexecuted instantiation: wiener_convolve_avx2.c:highbd_comp_avg Unexecuted instantiation: highbd_convolve_2d_avx2.c:highbd_comp_avg highbd_jnt_convolve_avx2.c:highbd_comp_avg Line | Count | Source | 1234 | 12.5M | const int use_dist_wtd_comp_avg) { | 1235 | 12.5M | __m256i res; | 1236 | 12.5M | if (use_dist_wtd_comp_avg) { | 1237 | 1.29M | const __m256i wt0_res = _mm256_mullo_epi32(*data_ref_0, *wt0); | 1238 | 1.29M | const __m256i wt1_res = _mm256_mullo_epi32(*res_unsigned, *wt1); | 1239 | 1.29M | const __m256i wt_res = _mm256_add_epi32(wt0_res, wt1_res); | 1240 | 1.29M | res = _mm256_srai_epi32(wt_res, DIST_PRECISION_BITS); | 1241 | 11.2M | } else { | 1242 | 11.2M | const __m256i wt_res = _mm256_add_epi32(*data_ref_0, *res_unsigned); | 1243 | 11.2M | res = _mm256_srai_epi32(wt_res, 1); | 1244 | 11.2M | } | 1245 | 12.5M | return res; | 1246 | 12.5M | } |
|
1247 | | |
1248 | | static inline __m256i highbd_convolve_rounding( |
1249 | | const __m256i *const res_unsigned, const __m256i *const offset_const, |
1250 | 12.5M | const __m256i *const round_const, const int round_shift) { |
1251 | 12.5M | const __m256i res_signed = _mm256_sub_epi32(*res_unsigned, *offset_const); |
1252 | 12.5M | const __m256i res_round = _mm256_srai_epi32( |
1253 | 12.5M | _mm256_add_epi32(res_signed, *round_const), round_shift); |
1254 | | |
1255 | 12.5M | return res_round; |
1256 | 12.5M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:highbd_convolve_rounding Unexecuted instantiation: highbd_convolve_avx2.c:highbd_convolve_rounding Unexecuted instantiation: convolve_2d_avx2.c:highbd_convolve_rounding Unexecuted instantiation: convolve_avx2.c:highbd_convolve_rounding Unexecuted instantiation: jnt_convolve_avx2.c:highbd_convolve_rounding Unexecuted instantiation: wiener_convolve_avx2.c:highbd_convolve_rounding Unexecuted instantiation: highbd_convolve_2d_avx2.c:highbd_convolve_rounding highbd_jnt_convolve_avx2.c:highbd_convolve_rounding Line | Count | Source | 1250 | 12.5M | const __m256i *const round_const, const int round_shift) { | 1251 | 12.5M | const __m256i res_signed = _mm256_sub_epi32(*res_unsigned, *offset_const); | 1252 | 12.5M | const __m256i res_round = _mm256_srai_epi32( | 1253 | 12.5M | _mm256_add_epi32(res_signed, *round_const), round_shift); | 1254 | | | 1255 | 12.5M | return res_round; | 1256 | 12.5M | } |
|
1257 | | |
1258 | 4.44M | static inline __m256i round_sr_x_avx2(const __m256i data) { |
1259 | | // we can perform the below steps: |
1260 | | // data = (data + 2) >> 2 |
1261 | | // data = (data + 8) >> 4, |
1262 | | // in the below form as well |
1263 | | // data = (data + 0x22) >> 6 |
1264 | 4.44M | const __m256i value = _mm256_set1_epi16(34); |
1265 | 4.44M | const __m256i reg = _mm256_add_epi16(data, value); |
1266 | 4.44M | return _mm256_srai_epi16(reg, 6); |
1267 | 4.44M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:round_sr_x_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:round_sr_x_avx2 Unexecuted instantiation: convolve_2d_avx2.c:round_sr_x_avx2 convolve_avx2.c:round_sr_x_avx2 Line | Count | Source | 1258 | 4.44M | static inline __m256i round_sr_x_avx2(const __m256i data) { | 1259 | | // we can perform the below steps: | 1260 | | // data = (data + 2) >> 2 | 1261 | | // data = (data + 8) >> 4, | 1262 | | // in the below form as well | 1263 | | // data = (data + 0x22) >> 6 | 1264 | 4.44M | const __m256i value = _mm256_set1_epi16(34); | 1265 | 4.44M | const __m256i reg = _mm256_add_epi16(data, value); | 1266 | 4.44M | return _mm256_srai_epi16(reg, 6); | 1267 | 4.44M | } |
Unexecuted instantiation: jnt_convolve_avx2.c:round_sr_x_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:round_sr_x_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:round_sr_x_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:round_sr_x_avx2 |
1268 | | |
1269 | | static inline __m128i convolve_x_4tap_4x2_ssse3(const uint8_t *const src, |
1270 | | const ptrdiff_t src_stride, |
1271 | 507k | __m128i *const coeffs) { |
1272 | 507k | __m128i data[2]; |
1273 | 507k | const __m128i f_l0 = _mm_load_si128((__m128i const *)filt1_global_sse2); |
1274 | 507k | const __m128i f_l1 = _mm_load_si128((__m128i const *)filt2_global_sse2); |
1275 | 507k | const __m128i src_1 = |
1276 | 507k | load_8bit_8x2_to_1_reg_sse2(src, (int)(sizeof(*src) * src_stride)); |
1277 | | |
1278 | 507k | data[0] = _mm_shuffle_epi8(src_1, f_l0); |
1279 | 507k | data[1] = _mm_shuffle_epi8(src_1, f_l1); |
1280 | 507k | return convolve_lowbd_4tap_ssse3(data, coeffs); |
1281 | 507k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_x_4tap_4x2_ssse3 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_x_4tap_4x2_ssse3 Unexecuted instantiation: convolve_2d_avx2.c:convolve_x_4tap_4x2_ssse3 convolve_avx2.c:convolve_x_4tap_4x2_ssse3 Line | Count | Source | 1271 | 507k | __m128i *const coeffs) { | 1272 | 507k | __m128i data[2]; | 1273 | 507k | const __m128i f_l0 = _mm_load_si128((__m128i const *)filt1_global_sse2); | 1274 | 507k | const __m128i f_l1 = _mm_load_si128((__m128i const *)filt2_global_sse2); | 1275 | 507k | const __m128i src_1 = | 1276 | 507k | load_8bit_8x2_to_1_reg_sse2(src, (int)(sizeof(*src) * src_stride)); | 1277 | | | 1278 | 507k | data[0] = _mm_shuffle_epi8(src_1, f_l0); | 1279 | 507k | data[1] = _mm_shuffle_epi8(src_1, f_l1); | 1280 | 507k | return convolve_lowbd_4tap_ssse3(data, coeffs); | 1281 | 507k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_x_4tap_4x2_ssse3 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_x_4tap_4x2_ssse3 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_x_4tap_4x2_ssse3 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_x_4tap_4x2_ssse3 |
1282 | | |
1283 | 630k | static inline __m128i round_sr_x_ssse3(const __m128i data) { |
1284 | 630k | const __m128i val = _mm_set1_epi16(34); |
1285 | 630k | const __m128i reg = _mm_add_epi16(data, val); |
1286 | 630k | return _mm_srai_epi16(reg, 6); |
1287 | 630k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:round_sr_x_ssse3 Unexecuted instantiation: highbd_convolve_avx2.c:round_sr_x_ssse3 Unexecuted instantiation: convolve_2d_avx2.c:round_sr_x_ssse3 convolve_avx2.c:round_sr_x_ssse3 Line | Count | Source | 1283 | 630k | static inline __m128i round_sr_x_ssse3(const __m128i data) { | 1284 | 630k | const __m128i val = _mm_set1_epi16(34); | 1285 | 630k | const __m128i reg = _mm_add_epi16(data, val); | 1286 | 630k | return _mm_srai_epi16(reg, 6); | 1287 | 630k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:round_sr_x_ssse3 Unexecuted instantiation: wiener_convolve_avx2.c:round_sr_x_ssse3 Unexecuted instantiation: highbd_convolve_2d_avx2.c:round_sr_x_ssse3 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:round_sr_x_ssse3 |
1288 | | |
1289 | | static inline void store_8bit_4x2_sse2(const __m128i reg, uint8_t *const dst, |
1290 | 1.01M | const ptrdiff_t dst_stride) { |
1291 | 1.01M | xx_storel_32(dst, reg); |
1292 | 1.01M | *(uint32_t *)(dst + dst_stride) = |
1293 | 1.01M | ((uint32_t)_mm_extract_epi16(reg, 3) << 16) | _mm_extract_epi16(reg, 2); |
1294 | 1.01M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:store_8bit_4x2_sse2 Unexecuted instantiation: highbd_convolve_avx2.c:store_8bit_4x2_sse2 Unexecuted instantiation: convolve_2d_avx2.c:store_8bit_4x2_sse2 convolve_avx2.c:store_8bit_4x2_sse2 Line | Count | Source | 1290 | 1.01M | const ptrdiff_t dst_stride) { | 1291 | 1.01M | xx_storel_32(dst, reg); | 1292 | 1.01M | *(uint32_t *)(dst + dst_stride) = | 1293 | 1.01M | ((uint32_t)_mm_extract_epi16(reg, 3) << 16) | _mm_extract_epi16(reg, 2); | 1294 | 1.01M | } |
Unexecuted instantiation: jnt_convolve_avx2.c:store_8bit_4x2_sse2 Unexecuted instantiation: wiener_convolve_avx2.c:store_8bit_4x2_sse2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:store_8bit_4x2_sse2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:store_8bit_4x2_sse2 |
1295 | | |
1296 | | static inline void pack_store_u8_4x2_sse2(const __m128i reg, uint8_t *const dst, |
1297 | 1.01M | const ptrdiff_t dst_stride) { |
1298 | 1.01M | const __m128i reg_pack = _mm_packus_epi16(reg, reg); |
1299 | 1.01M | store_8bit_4x2_sse2(reg_pack, dst, dst_stride); |
1300 | 1.01M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:pack_store_u8_4x2_sse2 Unexecuted instantiation: highbd_convolve_avx2.c:pack_store_u8_4x2_sse2 Unexecuted instantiation: convolve_2d_avx2.c:pack_store_u8_4x2_sse2 convolve_avx2.c:pack_store_u8_4x2_sse2 Line | Count | Source | 1297 | 1.01M | const ptrdiff_t dst_stride) { | 1298 | 1.01M | const __m128i reg_pack = _mm_packus_epi16(reg, reg); | 1299 | 1.01M | store_8bit_4x2_sse2(reg_pack, dst, dst_stride); | 1300 | 1.01M | } |
Unexecuted instantiation: jnt_convolve_avx2.c:pack_store_u8_4x2_sse2 Unexecuted instantiation: wiener_convolve_avx2.c:pack_store_u8_4x2_sse2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:pack_store_u8_4x2_sse2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:pack_store_u8_4x2_sse2 |
1301 | | |
1302 | | static inline __m128i convolve_x_4tap_2x2_ssse3(const uint8_t *const src, |
1303 | | const ptrdiff_t src_stride, |
1304 | 88.0k | __m128i *const coeffs) { |
1305 | 88.0k | __m128i data[2]; |
1306 | 88.0k | const __m128i f_0 = _mm_load_si128((__m128i const *)filt3_global_sse2); |
1307 | 88.0k | const __m128i f_1 = _mm_load_si128((__m128i const *)filt4_global_sse2); |
1308 | 88.0k | const __m128i reg = |
1309 | 88.0k | load_8bit_8x2_to_1_reg_sse2(src, (int)(sizeof(*src) * src_stride)); |
1310 | | |
1311 | 88.0k | data[0] = _mm_shuffle_epi8(reg, f_0); |
1312 | 88.0k | data[1] = _mm_shuffle_epi8(reg, f_1); |
1313 | 88.0k | return convolve_lowbd_4tap_ssse3(data, coeffs); |
1314 | 88.0k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_x_4tap_2x2_ssse3 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_x_4tap_2x2_ssse3 Unexecuted instantiation: convolve_2d_avx2.c:convolve_x_4tap_2x2_ssse3 convolve_avx2.c:convolve_x_4tap_2x2_ssse3 Line | Count | Source | 1304 | 88.0k | __m128i *const coeffs) { | 1305 | 88.0k | __m128i data[2]; | 1306 | 88.0k | const __m128i f_0 = _mm_load_si128((__m128i const *)filt3_global_sse2); | 1307 | 88.0k | const __m128i f_1 = _mm_load_si128((__m128i const *)filt4_global_sse2); | 1308 | 88.0k | const __m128i reg = | 1309 | 88.0k | load_8bit_8x2_to_1_reg_sse2(src, (int)(sizeof(*src) * src_stride)); | 1310 | | | 1311 | 88.0k | data[0] = _mm_shuffle_epi8(reg, f_0); | 1312 | 88.0k | data[1] = _mm_shuffle_epi8(reg, f_1); | 1313 | 88.0k | return convolve_lowbd_4tap_ssse3(data, coeffs); | 1314 | 88.0k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_x_4tap_2x2_ssse3 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_x_4tap_2x2_ssse3 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_x_4tap_2x2_ssse3 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_x_4tap_2x2_ssse3 |
1315 | | |
1316 | | static inline void pack_store_u8_2x2_sse2(const __m128i reg, uint8_t *const dst, |
1317 | 178k | const ptrdiff_t dst_stride) { |
1318 | 178k | const __m128i data = _mm_packus_epi16(reg, reg); |
1319 | 178k | *(int16_t *)dst = (int16_t)_mm_cvtsi128_si32(data); |
1320 | 178k | *(int16_t *)(dst + dst_stride) = (int16_t)_mm_extract_epi16(data, 1); |
1321 | 178k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:pack_store_u8_2x2_sse2 Unexecuted instantiation: highbd_convolve_avx2.c:pack_store_u8_2x2_sse2 Unexecuted instantiation: convolve_2d_avx2.c:pack_store_u8_2x2_sse2 convolve_avx2.c:pack_store_u8_2x2_sse2 Line | Count | Source | 1317 | 178k | const ptrdiff_t dst_stride) { | 1318 | 178k | const __m128i data = _mm_packus_epi16(reg, reg); | 1319 | 178k | *(int16_t *)dst = (int16_t)_mm_cvtsi128_si32(data); | 1320 | | *(int16_t *)(dst + dst_stride) = (int16_t)_mm_extract_epi16(data, 1); | 1321 | 178k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:pack_store_u8_2x2_sse2 Unexecuted instantiation: wiener_convolve_avx2.c:pack_store_u8_2x2_sse2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:pack_store_u8_2x2_sse2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:pack_store_u8_2x2_sse2 |
1322 | | |
1323 | | static inline __m128i convolve_x_2tap_ssse3(const __m128i *data, |
1324 | 35.1k | const __m128i *coeff) { |
1325 | 35.1k | return _mm_maddubs_epi16(data[0], coeff[0]); |
1326 | 35.1k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_x_2tap_ssse3 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_x_2tap_ssse3 Unexecuted instantiation: convolve_2d_avx2.c:convolve_x_2tap_ssse3 convolve_avx2.c:convolve_x_2tap_ssse3 Line | Count | Source | 1324 | 35.1k | const __m128i *coeff) { | 1325 | 35.1k | return _mm_maddubs_epi16(data[0], coeff[0]); | 1326 | 35.1k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_x_2tap_ssse3 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_x_2tap_ssse3 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_x_2tap_ssse3 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_x_2tap_ssse3 |
1327 | | |
1328 | | static inline __m128i load8_x_4x2_sse4(const void *const src, |
1329 | 11.5k | const ptrdiff_t offset) { |
1330 | 11.5k | const __m128i s = _mm_cvtsi32_si128(loadu_int32(src)); |
1331 | 11.5k | return _mm_insert_epi32(s, loadu_int32((uint8_t *)src + offset), 1); |
1332 | 11.5k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:load8_x_4x2_sse4 Unexecuted instantiation: highbd_convolve_avx2.c:load8_x_4x2_sse4 Unexecuted instantiation: convolve_2d_avx2.c:load8_x_4x2_sse4 convolve_avx2.c:load8_x_4x2_sse4 Line | Count | Source | 1329 | 11.5k | const ptrdiff_t offset) { | 1330 | 11.5k | const __m128i s = _mm_cvtsi32_si128(loadu_int32(src)); | 1331 | | return _mm_insert_epi32(s, loadu_int32((uint8_t *)src + offset), 1); | 1332 | 11.5k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:load8_x_4x2_sse4 Unexecuted instantiation: wiener_convolve_avx2.c:load8_x_4x2_sse4 Unexecuted instantiation: highbd_convolve_2d_avx2.c:load8_x_4x2_sse4 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:load8_x_4x2_sse4 |
1333 | | |
1334 | | static inline __m128i load_x_u8_4x2_sse4(const uint8_t *const src, |
1335 | 11.5k | const ptrdiff_t stride) { |
1336 | 11.5k | return load8_x_4x2_sse4(src, sizeof(*src) * stride); |
1337 | 11.5k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:load_x_u8_4x2_sse4 Unexecuted instantiation: highbd_convolve_avx2.c:load_x_u8_4x2_sse4 Unexecuted instantiation: convolve_2d_avx2.c:load_x_u8_4x2_sse4 convolve_avx2.c:load_x_u8_4x2_sse4 Line | Count | Source | 1335 | 11.5k | const ptrdiff_t stride) { | 1336 | 11.5k | return load8_x_4x2_sse4(src, sizeof(*src) * stride); | 1337 | 11.5k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:load_x_u8_4x2_sse4 Unexecuted instantiation: wiener_convolve_avx2.c:load_x_u8_4x2_sse4 Unexecuted instantiation: highbd_convolve_2d_avx2.c:load_x_u8_4x2_sse4 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:load_x_u8_4x2_sse4 |
1338 | | |
1339 | | static inline __m128i convolve_x_2tap_2x2_ssse3(const uint8_t *const src, |
1340 | | const ptrdiff_t stride, |
1341 | 2.69k | const __m128i *coeffs) { |
1342 | 2.69k | const __m128i flt = _mm_load_si128((__m128i const *)filt5_global_sse2); |
1343 | 2.69k | const __m128i reg = load_x_u8_4x2_sse4(src, stride); |
1344 | 2.69k | const __m128i data = _mm_shuffle_epi8(reg, flt); |
1345 | 2.69k | return convolve_x_2tap_ssse3(&data, coeffs); |
1346 | 2.69k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_x_2tap_2x2_ssse3 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_x_2tap_2x2_ssse3 Unexecuted instantiation: convolve_2d_avx2.c:convolve_x_2tap_2x2_ssse3 convolve_avx2.c:convolve_x_2tap_2x2_ssse3 Line | Count | Source | 1341 | 2.69k | const __m128i *coeffs) { | 1342 | 2.69k | const __m128i flt = _mm_load_si128((__m128i const *)filt5_global_sse2); | 1343 | 2.69k | const __m128i reg = load_x_u8_4x2_sse4(src, stride); | 1344 | 2.69k | const __m128i data = _mm_shuffle_epi8(reg, flt); | 1345 | 2.69k | return convolve_x_2tap_ssse3(&data, coeffs); | 1346 | 2.69k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_x_2tap_2x2_ssse3 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_x_2tap_2x2_ssse3 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_x_2tap_2x2_ssse3 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_x_2tap_2x2_ssse3 |
1347 | | |
1348 | | static inline __m128i convolve_x_2tap_4x2_ssse3(const uint8_t *const src, |
1349 | | const ptrdiff_t stride, |
1350 | 10.7k | const __m128i *coeffs) { |
1351 | 10.7k | const __m128i flt = _mm_load_si128((__m128i const *)filt1_global_sse2); |
1352 | 10.7k | const __m128i data = |
1353 | 10.7k | load_8bit_8x2_to_1_reg_sse2(src, (int)(sizeof(*src) * stride)); |
1354 | 10.7k | const __m128i res = _mm_shuffle_epi8(data, flt); |
1355 | 10.7k | return convolve_x_2tap_ssse3(&res, coeffs); |
1356 | 10.7k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_x_2tap_4x2_ssse3 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_x_2tap_4x2_ssse3 Unexecuted instantiation: convolve_2d_avx2.c:convolve_x_2tap_4x2_ssse3 convolve_avx2.c:convolve_x_2tap_4x2_ssse3 Line | Count | Source | 1350 | 10.7k | const __m128i *coeffs) { | 1351 | 10.7k | const __m128i flt = _mm_load_si128((__m128i const *)filt1_global_sse2); | 1352 | 10.7k | const __m128i data = | 1353 | 10.7k | load_8bit_8x2_to_1_reg_sse2(src, (int)(sizeof(*src) * stride)); | 1354 | 10.7k | const __m128i res = _mm_shuffle_epi8(data, flt); | 1355 | 10.7k | return convolve_x_2tap_ssse3(&res, coeffs); | 1356 | 10.7k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_x_2tap_4x2_ssse3 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_x_2tap_4x2_ssse3 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_x_2tap_4x2_ssse3 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_x_2tap_4x2_ssse3 |
1357 | | |
1358 | | static inline void convolve_x_2tap_8x2_ssse3(const uint8_t *const src, |
1359 | | const ptrdiff_t stride, |
1360 | | const __m128i *coeffs, |
1361 | 10.8k | __m128i *data) { |
1362 | 10.8k | __m128i res[2]; |
1363 | 10.8k | const __m128i reg_00 = _mm_loadu_si128((__m128i *)src); |
1364 | 10.8k | const __m128i reg_10 = _mm_loadu_si128((__m128i *)(src + stride)); |
1365 | 10.8k | const __m128i reg_01 = _mm_srli_si128(reg_00, 1); |
1366 | 10.8k | const __m128i reg_11 = _mm_srli_si128(reg_10, 1); |
1367 | 10.8k | res[0] = _mm_unpacklo_epi8(reg_00, reg_01); |
1368 | 10.8k | res[1] = _mm_unpacklo_epi8(reg_10, reg_11); |
1369 | | |
1370 | 10.8k | data[0] = convolve_x_2tap_ssse3(&res[0], coeffs); |
1371 | 10.8k | data[1] = convolve_x_2tap_ssse3(&res[1], coeffs); |
1372 | 10.8k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_x_2tap_8x2_ssse3 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_x_2tap_8x2_ssse3 Unexecuted instantiation: convolve_2d_avx2.c:convolve_x_2tap_8x2_ssse3 convolve_avx2.c:convolve_x_2tap_8x2_ssse3 Line | Count | Source | 1361 | 10.8k | __m128i *data) { | 1362 | 10.8k | __m128i res[2]; | 1363 | 10.8k | const __m128i reg_00 = _mm_loadu_si128((__m128i *)src); | 1364 | 10.8k | const __m128i reg_10 = _mm_loadu_si128((__m128i *)(src + stride)); | 1365 | 10.8k | const __m128i reg_01 = _mm_srli_si128(reg_00, 1); | 1366 | 10.8k | const __m128i reg_11 = _mm_srli_si128(reg_10, 1); | 1367 | 10.8k | res[0] = _mm_unpacklo_epi8(reg_00, reg_01); | 1368 | 10.8k | res[1] = _mm_unpacklo_epi8(reg_10, reg_11); | 1369 | | | 1370 | 10.8k | data[0] = convolve_x_2tap_ssse3(&res[0], coeffs); | 1371 | 10.8k | data[1] = convolve_x_2tap_ssse3(&res[1], coeffs); | 1372 | 10.8k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_x_2tap_8x2_ssse3 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_x_2tap_8x2_ssse3 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_x_2tap_8x2_ssse3 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_x_2tap_8x2_ssse3 |
1373 | | |
1374 | | static inline __m256i loadu_x_8bit_16x2_avx2(const void *const src, |
1375 | 896k | const ptrdiff_t offset) { |
1376 | 896k | const __m128i reg0 = _mm_loadu_si128((__m128i *)src); |
1377 | 896k | const __m128i reg1 = _mm_loadu_si128((__m128i *)((uint8_t *)src + offset)); |
1378 | 896k | return _mm256_setr_m128i(reg0, reg1); |
1379 | 896k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:loadu_x_8bit_16x2_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:loadu_x_8bit_16x2_avx2 Unexecuted instantiation: convolve_2d_avx2.c:loadu_x_8bit_16x2_avx2 convolve_avx2.c:loadu_x_8bit_16x2_avx2 Line | Count | Source | 1375 | 896k | const ptrdiff_t offset) { | 1376 | 896k | const __m128i reg0 = _mm_loadu_si128((__m128i *)src); | 1377 | 896k | const __m128i reg1 = _mm_loadu_si128((__m128i *)((uint8_t *)src + offset)); | 1378 | 896k | return _mm256_setr_m128i(reg0, reg1); | 1379 | 896k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:loadu_x_8bit_16x2_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:loadu_x_8bit_16x2_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:loadu_x_8bit_16x2_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:loadu_x_8bit_16x2_avx2 |
1380 | | |
1381 | | static inline __m256i convolve_x_2tap_avx2(const __m256i *data, |
1382 | 228k | const __m256i *coeffs) { |
1383 | 228k | return _mm256_maddubs_epi16(data[0], coeffs[0]); |
1384 | 228k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_x_2tap_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_x_2tap_avx2 Unexecuted instantiation: convolve_2d_avx2.c:convolve_x_2tap_avx2 convolve_avx2.c:convolve_x_2tap_avx2 Line | Count | Source | 1382 | 228k | const __m256i *coeffs) { | 1383 | 228k | return _mm256_maddubs_epi16(data[0], coeffs[0]); | 1384 | 228k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_x_2tap_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_x_2tap_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_x_2tap_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_x_2tap_avx2 |
1385 | | |
1386 | | static inline void convolve_x_2tap_16x2_avx2(const uint8_t *const src, |
1387 | | const ptrdiff_t stride, |
1388 | | const __m256i *coeffs, |
1389 | 9.00k | __m256i *data) { |
1390 | 9.00k | const __m256i reg0 = loadu_x_8bit_16x2_avx2(src, stride); |
1391 | 9.00k | const __m256i reg1 = loadu_x_8bit_16x2_avx2(src + 1, stride); |
1392 | 9.00k | const __m256i res0 = _mm256_unpacklo_epi8(reg0, reg1); |
1393 | 9.00k | const __m256i res1 = _mm256_unpackhi_epi8(reg0, reg1); |
1394 | 9.00k | data[0] = convolve_x_2tap_avx2(&res0, coeffs); |
1395 | 9.00k | data[1] = convolve_x_2tap_avx2(&res1, coeffs); |
1396 | 9.00k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_x_2tap_16x2_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_x_2tap_16x2_avx2 Unexecuted instantiation: convolve_2d_avx2.c:convolve_x_2tap_16x2_avx2 convolve_avx2.c:convolve_x_2tap_16x2_avx2 Line | Count | Source | 1389 | 9.00k | __m256i *data) { | 1390 | 9.00k | const __m256i reg0 = loadu_x_8bit_16x2_avx2(src, stride); | 1391 | 9.00k | const __m256i reg1 = loadu_x_8bit_16x2_avx2(src + 1, stride); | 1392 | 9.00k | const __m256i res0 = _mm256_unpacklo_epi8(reg0, reg1); | 1393 | 9.00k | const __m256i res1 = _mm256_unpackhi_epi8(reg0, reg1); | 1394 | 9.00k | data[0] = convolve_x_2tap_avx2(&res0, coeffs); | 1395 | 9.00k | data[1] = convolve_x_2tap_avx2(&res1, coeffs); | 1396 | 9.00k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_x_2tap_16x2_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_x_2tap_16x2_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_x_2tap_16x2_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_x_2tap_16x2_avx2 |
1397 | | |
1398 | | static inline void store_u8_16x2_avx2(const __m256i src, uint8_t *const dst, |
1399 | 1.87M | const ptrdiff_t stride) { |
1400 | 1.87M | const __m128i reg0 = _mm256_castsi256_si128(src); |
1401 | 1.87M | const __m128i reg1 = _mm256_extracti128_si256(src, 1); |
1402 | 1.87M | _mm_storeu_si128((__m128i *)dst, reg0); |
1403 | 1.87M | _mm_storeu_si128((__m128i *)((uint8_t *)dst + stride), reg1); |
1404 | 1.87M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:store_u8_16x2_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:store_u8_16x2_avx2 Unexecuted instantiation: convolve_2d_avx2.c:store_u8_16x2_avx2 convolve_avx2.c:store_u8_16x2_avx2 Line | Count | Source | 1399 | 1.87M | const ptrdiff_t stride) { | 1400 | 1.87M | const __m128i reg0 = _mm256_castsi256_si128(src); | 1401 | | const __m128i reg1 = _mm256_extracti128_si256(src, 1); | 1402 | 1.87M | _mm_storeu_si128((__m128i *)dst, reg0); | 1403 | 1.87M | _mm_storeu_si128((__m128i *)((uint8_t *)dst + stride), reg1); | 1404 | 1.87M | } |
Unexecuted instantiation: jnt_convolve_avx2.c:store_u8_16x2_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:store_u8_16x2_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:store_u8_16x2_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:store_u8_16x2_avx2 |
1405 | | |
1406 | | static inline void store_u8_8x2_avx2(const __m256i src, uint8_t *const dst, |
1407 | 576k | const ptrdiff_t stride) { |
1408 | 576k | const __m128i reg0 = _mm256_castsi256_si128(src); |
1409 | 576k | const __m128i reg1 = _mm256_extracti128_si256(src, 1); |
1410 | 576k | _mm_storel_epi64((__m128i *)dst, reg0); |
1411 | 576k | _mm_storel_epi64((__m128i *)(dst + stride), reg1); |
1412 | 576k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:store_u8_8x2_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:store_u8_8x2_avx2 Unexecuted instantiation: convolve_2d_avx2.c:store_u8_8x2_avx2 convolve_avx2.c:store_u8_8x2_avx2 Line | Count | Source | 1407 | 576k | const ptrdiff_t stride) { | 1408 | 576k | const __m128i reg0 = _mm256_castsi256_si128(src); | 1409 | | const __m128i reg1 = _mm256_extracti128_si256(src, 1); | 1410 | 576k | _mm_storel_epi64((__m128i *)dst, reg0); | 1411 | 576k | _mm_storel_epi64((__m128i *)(dst + stride), reg1); | 1412 | 576k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:store_u8_8x2_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:store_u8_8x2_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:store_u8_8x2_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:store_u8_8x2_avx2 |
1413 | | |
1414 | | static inline void pack_store_16x2_avx2(const __m256i data0, |
1415 | | const __m256i data1, uint8_t *const dst, |
1416 | 1.87M | const ptrdiff_t stride) { |
1417 | 1.87M | const __m256i res = _mm256_packus_epi16(data0, data1); |
1418 | 1.87M | store_u8_16x2_avx2(res, dst, stride); |
1419 | 1.87M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:pack_store_16x2_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:pack_store_16x2_avx2 Unexecuted instantiation: convolve_2d_avx2.c:pack_store_16x2_avx2 convolve_avx2.c:pack_store_16x2_avx2 Line | Count | Source | 1416 | 1.87M | const ptrdiff_t stride) { | 1417 | 1.87M | const __m256i res = _mm256_packus_epi16(data0, data1); | 1418 | 1.87M | store_u8_16x2_avx2(res, dst, stride); | 1419 | 1.87M | } |
Unexecuted instantiation: jnt_convolve_avx2.c:pack_store_16x2_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:pack_store_16x2_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:pack_store_16x2_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:pack_store_16x2_avx2 |
1420 | | |
1421 | | static inline void pack_store_8x2_avx2(const __m256i data, uint8_t *const dst, |
1422 | 576k | const ptrdiff_t stride) { |
1423 | 576k | const __m256i res = _mm256_packus_epi16(data, data); |
1424 | 576k | store_u8_8x2_avx2(res, dst, stride); |
1425 | 576k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:pack_store_8x2_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:pack_store_8x2_avx2 Unexecuted instantiation: convolve_2d_avx2.c:pack_store_8x2_avx2 convolve_avx2.c:pack_store_8x2_avx2 Line | Count | Source | 1422 | 576k | const ptrdiff_t stride) { | 1423 | 576k | const __m256i res = _mm256_packus_epi16(data, data); | 1424 | 576k | store_u8_8x2_avx2(res, dst, stride); | 1425 | 576k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:pack_store_8x2_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:pack_store_8x2_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:pack_store_8x2_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:pack_store_8x2_avx2 |
1426 | | |
1427 | | static inline void round_pack_store_16x2_avx2(const __m256i *data, |
1428 | | uint8_t *const dst, |
1429 | 448k | const ptrdiff_t dst_stride) { |
1430 | 448k | __m256i reg[2]; |
1431 | | |
1432 | 448k | reg[0] = round_sr_x_avx2(data[0]); |
1433 | 448k | reg[1] = round_sr_x_avx2(data[1]); |
1434 | 448k | pack_store_16x2_avx2(reg[0], reg[1], dst, dst_stride); |
1435 | 448k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:round_pack_store_16x2_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:round_pack_store_16x2_avx2 Unexecuted instantiation: convolve_2d_avx2.c:round_pack_store_16x2_avx2 convolve_avx2.c:round_pack_store_16x2_avx2 Line | Count | Source | 1429 | 448k | const ptrdiff_t dst_stride) { | 1430 | 448k | __m256i reg[2]; | 1431 | | | 1432 | 448k | reg[0] = round_sr_x_avx2(data[0]); | 1433 | 448k | reg[1] = round_sr_x_avx2(data[1]); | 1434 | 448k | pack_store_16x2_avx2(reg[0], reg[1], dst, dst_stride); | 1435 | 448k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:round_pack_store_16x2_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:round_pack_store_16x2_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:round_pack_store_16x2_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:round_pack_store_16x2_avx2 |
1436 | | |
1437 | | static inline void convolve_x_2tap_32_avx2(const uint8_t *const src, |
1438 | | const __m256i *coeffs, |
1439 | 105k | __m256i *data) { |
1440 | 105k | const __m256i res0 = _mm256_loadu_si256((__m256i *)src); |
1441 | 105k | const __m256i res1 = _mm256_loadu_si256((__m256i *)(src + 1)); |
1442 | 105k | const __m256i reg0 = _mm256_unpacklo_epi8(res0, res1); |
1443 | 105k | const __m256i reg1 = _mm256_unpackhi_epi8(res0, res1); |
1444 | | |
1445 | 105k | data[0] = convolve_x_2tap_avx2(&reg0, coeffs); |
1446 | 105k | data[1] = convolve_x_2tap_avx2(&reg1, coeffs); |
1447 | 105k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_x_2tap_32_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_x_2tap_32_avx2 Unexecuted instantiation: convolve_2d_avx2.c:convolve_x_2tap_32_avx2 convolve_avx2.c:convolve_x_2tap_32_avx2 Line | Count | Source | 1439 | 105k | __m256i *data) { | 1440 | 105k | const __m256i res0 = _mm256_loadu_si256((__m256i *)src); | 1441 | 105k | const __m256i res1 = _mm256_loadu_si256((__m256i *)(src + 1)); | 1442 | 105k | const __m256i reg0 = _mm256_unpacklo_epi8(res0, res1); | 1443 | 105k | const __m256i reg1 = _mm256_unpackhi_epi8(res0, res1); | 1444 | | | 1445 | 105k | data[0] = convolve_x_2tap_avx2(&reg0, coeffs); | 1446 | 105k | data[1] = convolve_x_2tap_avx2(&reg1, coeffs); | 1447 | 105k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_x_2tap_32_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_x_2tap_32_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_x_2tap_32_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_x_2tap_32_avx2 |
1448 | | |
1449 | | static inline void pack_store_32_avx2(const __m256i data0, const __m256i data1, |
1450 | 1.56M | uint8_t *const dst) { |
1451 | 1.56M | const __m256i reg = _mm256_packus_epi16(data0, data1); |
1452 | 1.56M | _mm256_storeu_si256((__m256i *)dst, reg); |
1453 | 1.56M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:pack_store_32_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:pack_store_32_avx2 Unexecuted instantiation: convolve_2d_avx2.c:pack_store_32_avx2 convolve_avx2.c:pack_store_32_avx2 Line | Count | Source | 1450 | 1.56M | uint8_t *const dst) { | 1451 | 1.56M | const __m256i reg = _mm256_packus_epi16(data0, data1); | 1452 | 1.56M | _mm256_storeu_si256((__m256i *)dst, reg); | 1453 | 1.56M | } |
Unexecuted instantiation: jnt_convolve_avx2.c:pack_store_32_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:pack_store_32_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:pack_store_32_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:pack_store_32_avx2 |
1454 | | |
1455 | | static inline void round_pack_store_32_avx2(const __m256i *data, |
1456 | 1.20M | uint8_t *const dst) { |
1457 | 1.20M | __m256i reg[2]; |
1458 | | |
1459 | 1.20M | reg[0] = round_sr_x_avx2(data[0]); |
1460 | 1.20M | reg[1] = round_sr_x_avx2(data[1]); |
1461 | 1.20M | pack_store_32_avx2(reg[0], reg[1], dst); |
1462 | 1.20M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:round_pack_store_32_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:round_pack_store_32_avx2 Unexecuted instantiation: convolve_2d_avx2.c:round_pack_store_32_avx2 convolve_avx2.c:round_pack_store_32_avx2 Line | Count | Source | 1456 | 1.20M | uint8_t *const dst) { | 1457 | 1.20M | __m256i reg[2]; | 1458 | | | 1459 | 1.20M | reg[0] = round_sr_x_avx2(data[0]); | 1460 | 1.20M | reg[1] = round_sr_x_avx2(data[1]); | 1461 | 1.20M | pack_store_32_avx2(reg[0], reg[1], dst); | 1462 | 1.20M | } |
Unexecuted instantiation: jnt_convolve_avx2.c:round_pack_store_32_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:round_pack_store_32_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:round_pack_store_32_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:round_pack_store_32_avx2 |
1463 | | |
1464 | | static inline void convolve_round_2tap_32_avx2(const uint8_t *const src, |
1465 | | const __m256i *coeffs, |
1466 | 105k | uint8_t *const dst) { |
1467 | 105k | __m256i data[2]; |
1468 | | |
1469 | 105k | convolve_x_2tap_32_avx2(src, coeffs, data); |
1470 | 105k | round_pack_store_32_avx2(data, dst); |
1471 | 105k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_round_2tap_32_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_round_2tap_32_avx2 Unexecuted instantiation: convolve_2d_avx2.c:convolve_round_2tap_32_avx2 convolve_avx2.c:convolve_round_2tap_32_avx2 Line | Count | Source | 1466 | 105k | uint8_t *const dst) { | 1467 | 105k | __m256i data[2]; | 1468 | | | 1469 | 105k | convolve_x_2tap_32_avx2(src, coeffs, data); | 1470 | 105k | round_pack_store_32_avx2(data, dst); | 1471 | 105k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_round_2tap_32_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_round_2tap_32_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_round_2tap_32_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_round_2tap_32_avx2 |
1472 | | |
1473 | | static inline void load_avg_store_2tap_32_avx2(const uint8_t *const src, |
1474 | 116k | uint8_t *const dst) { |
1475 | 116k | const __m256i res0 = _mm256_loadu_si256((__m256i *)src); |
1476 | 116k | const __m256i res1 = _mm256_loadu_si256((__m256i *)(src + 1)); |
1477 | 116k | const __m256i data = _mm256_avg_epu8(res0, res1); |
1478 | 116k | _mm256_storeu_si256((__m256i *)dst, data); |
1479 | 116k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:load_avg_store_2tap_32_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:load_avg_store_2tap_32_avx2 Unexecuted instantiation: convolve_2d_avx2.c:load_avg_store_2tap_32_avx2 convolve_avx2.c:load_avg_store_2tap_32_avx2 Line | Count | Source | 1474 | 116k | uint8_t *const dst) { | 1475 | 116k | const __m256i res0 = _mm256_loadu_si256((__m256i *)src); | 1476 | 116k | const __m256i res1 = _mm256_loadu_si256((__m256i *)(src + 1)); | 1477 | 116k | const __m256i data = _mm256_avg_epu8(res0, res1); | 1478 | 116k | _mm256_storeu_si256((__m256i *)dst, data); | 1479 | 116k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:load_avg_store_2tap_32_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:load_avg_store_2tap_32_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:load_avg_store_2tap_32_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:load_avg_store_2tap_32_avx2 |
1480 | | |
1481 | | static inline __m256i load_convolve_8tap_8x2_avx2(const uint8_t *const src, |
1482 | | const ptrdiff_t stride, |
1483 | | const __m256i *coeffs, |
1484 | 46.4k | const __m256i *flt) { |
1485 | 46.4k | const __m256i res = loadu_x_8bit_16x2_avx2(src, stride); |
1486 | 46.4k | return convolve_lowbd_x(res, coeffs, flt); |
1487 | 46.4k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:load_convolve_8tap_8x2_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:load_convolve_8tap_8x2_avx2 Unexecuted instantiation: convolve_2d_avx2.c:load_convolve_8tap_8x2_avx2 convolve_avx2.c:load_convolve_8tap_8x2_avx2 Line | Count | Source | 1484 | 46.4k | const __m256i *flt) { | 1485 | 46.4k | const __m256i res = loadu_x_8bit_16x2_avx2(src, stride); | 1486 | 46.4k | return convolve_lowbd_x(res, coeffs, flt); | 1487 | 46.4k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:load_convolve_8tap_8x2_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:load_convolve_8tap_8x2_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:load_convolve_8tap_8x2_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:load_convolve_8tap_8x2_avx2 |
1488 | | |
1489 | | static inline void load_convolve_8tap_16x2_avx2(const uint8_t *const src, |
1490 | | const int32_t src_stride, |
1491 | | const __m256i *coeffs, |
1492 | | const __m256i *flt, |
1493 | 23.2k | __m256i *reg) { |
1494 | 23.2k | reg[0] = load_convolve_8tap_8x2_avx2(src + 0, src_stride, coeffs, flt); |
1495 | 23.2k | reg[1] = load_convolve_8tap_8x2_avx2(src + 8, src_stride, coeffs, flt); |
1496 | 23.2k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:load_convolve_8tap_16x2_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:load_convolve_8tap_16x2_avx2 Unexecuted instantiation: convolve_2d_avx2.c:load_convolve_8tap_16x2_avx2 convolve_avx2.c:load_convolve_8tap_16x2_avx2 Line | Count | Source | 1493 | 23.2k | __m256i *reg) { | 1494 | 23.2k | reg[0] = load_convolve_8tap_8x2_avx2(src + 0, src_stride, coeffs, flt); | 1495 | 23.2k | reg[1] = load_convolve_8tap_8x2_avx2(src + 8, src_stride, coeffs, flt); | 1496 | 23.2k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:load_convolve_8tap_16x2_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:load_convolve_8tap_16x2_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:load_convolve_8tap_16x2_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:load_convolve_8tap_16x2_avx2 |
1497 | | |
1498 | | static inline void load_convolve_8tap_32_avx2(const uint8_t *const src, |
1499 | | const __m256i *coeffs, |
1500 | | const __m256i *filt, |
1501 | 128k | __m256i *data) { |
1502 | 128k | const __m256i reg_0 = _mm256_loadu_si256((__m256i *)src); |
1503 | 128k | const __m256i reg_8 = _mm256_loadu_si256((__m256i *)(src + 8)); |
1504 | | |
1505 | 128k | data[0] = convolve_lowbd_x(reg_0, coeffs, filt); |
1506 | 128k | data[1] = convolve_lowbd_x(reg_8, coeffs, filt); |
1507 | 128k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:load_convolve_8tap_32_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:load_convolve_8tap_32_avx2 Unexecuted instantiation: convolve_2d_avx2.c:load_convolve_8tap_32_avx2 convolve_avx2.c:load_convolve_8tap_32_avx2 Line | Count | Source | 1501 | 128k | __m256i *data) { | 1502 | 128k | const __m256i reg_0 = _mm256_loadu_si256((__m256i *)src); | 1503 | 128k | const __m256i reg_8 = _mm256_loadu_si256((__m256i *)(src + 8)); | 1504 | | | 1505 | 128k | data[0] = convolve_lowbd_x(reg_0, coeffs, filt); | 1506 | 128k | data[1] = convolve_lowbd_x(reg_8, coeffs, filt); | 1507 | 128k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:load_convolve_8tap_32_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:load_convolve_8tap_32_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:load_convolve_8tap_32_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:load_convolve_8tap_32_avx2 |
1508 | | |
// Filters the input at src via load_convolve_8tap_32_avx2() and passes the
// two resulting registers to round_pack_store_32_avx2(), which writes to dst.
//
//   src    - input pointer forwarded to load_convolve_8tap_32_avx2().
//   coeffs - coefficient registers for the filter.
//   filt   - shuffle-mask registers for the filter.
//   dst    - destination pointer forwarded to round_pack_store_32_avx2().
static inline void load_convolve_round_8tap_32_avx2(const uint8_t *const src,
                                                    const __m256i *coeffs,
                                                    const __m256i *filt,
                                                    uint8_t *const dst) {
  __m256i filtered[2];

  load_convolve_8tap_32_avx2(src, coeffs, filt, filtered);
  round_pack_store_32_avx2(filtered, dst);
}
Unexecuted instantiation: jnt_convolve_avx2.c:load_convolve_round_8tap_32_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:load_convolve_round_8tap_32_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:load_convolve_round_8tap_32_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:load_convolve_round_8tap_32_avx2 |
1518 | | |
1519 | | static inline void load_convolve_6tap_32_avx2(const uint8_t *const src, |
1520 | | const __m256i *coeffs, |
1521 | | const __m256i *filt, |
1522 | 968k | __m256i *data) { |
1523 | 968k | const __m256i reg0 = _mm256_loadu_si256((__m256i *)src); |
1524 | 968k | const __m256i reg1 = _mm256_loadu_si256((__m256i *)(src + 8)); |
1525 | | |
1526 | 968k | data[0] = convolve_lowbd_x_6tap(reg0, coeffs, filt); |
1527 | 968k | data[1] = convolve_lowbd_x_6tap(reg1, coeffs, filt); |
1528 | 968k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:load_convolve_6tap_32_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:load_convolve_6tap_32_avx2 Unexecuted instantiation: convolve_2d_avx2.c:load_convolve_6tap_32_avx2 convolve_avx2.c:load_convolve_6tap_32_avx2 Line | Count | Source | 1522 | 968k | __m256i *data) { | 1523 | 968k | const __m256i reg0 = _mm256_loadu_si256((__m256i *)src); | 1524 | 968k | const __m256i reg1 = _mm256_loadu_si256((__m256i *)(src + 8)); | 1525 | | | 1526 | 968k | data[0] = convolve_lowbd_x_6tap(reg0, coeffs, filt); | 1527 | 968k | data[1] = convolve_lowbd_x_6tap(reg1, coeffs, filt); | 1528 | 968k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:load_convolve_6tap_32_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:load_convolve_6tap_32_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:load_convolve_6tap_32_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:load_convolve_6tap_32_avx2 |
1529 | | |
// Filters the input at src via load_convolve_6tap_32_avx2() and passes the
// two resulting registers to round_pack_store_32_avx2(), which writes to dst.
//
//   src    - input pointer forwarded to load_convolve_6tap_32_avx2().
//   coeffs - coefficient registers for the filter.
//   filt   - shuffle-mask registers for the filter.
//   dst    - destination pointer forwarded to round_pack_store_32_avx2().
static inline void convolve_sr_store_6tap_32_avx2(const uint8_t *const src,
                                                  const __m256i *coeffs,
                                                  const __m256i *filt,
                                                  uint8_t *const dst) {
  __m256i filtered[2];

  load_convolve_6tap_32_avx2(src, coeffs, filt, filtered);
  round_pack_store_32_avx2(filtered, dst);
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_sr_store_6tap_32_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_sr_store_6tap_32_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_sr_store_6tap_32_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_sr_store_6tap_32_avx2 |
1539 | | |
1540 | | static inline __m256i load_convolve_6tap_8x2_avx2(const uint8_t *const src, |
1541 | | const ptrdiff_t stride, |
1542 | | const __m256i *coeffs, |
1543 | 831k | const __m256i *filt) { |
1544 | 831k | const __m256i data = loadu_x_8bit_16x2_avx2(src, stride); |
1545 | 831k | return convolve_lowbd_x_6tap(data, coeffs, filt); |
1546 | 831k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:load_convolve_6tap_8x2_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:load_convolve_6tap_8x2_avx2 Unexecuted instantiation: convolve_2d_avx2.c:load_convolve_6tap_8x2_avx2 convolve_avx2.c:load_convolve_6tap_8x2_avx2 Line | Count | Source | 1543 | 831k | const __m256i *filt) { | 1544 | 831k | const __m256i data = loadu_x_8bit_16x2_avx2(src, stride); | 1545 | 831k | return convolve_lowbd_x_6tap(data, coeffs, filt); | 1546 | 831k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:load_convolve_6tap_8x2_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:load_convolve_6tap_8x2_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:load_convolve_6tap_8x2_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:load_convolve_6tap_8x2_avx2 |
1547 | | |
1548 | | static inline void load_convolve_6tap_16x2_avx2(const uint8_t *const src, |
1549 | | const int32_t src_stride, |
1550 | | const __m256i *coeffs, |
1551 | | const __m256i *filt, |
1552 | 415k | __m256i *data) { |
1553 | 415k | data[0] = load_convolve_6tap_8x2_avx2(src + 0, src_stride, coeffs, filt); |
1554 | 415k | data[1] = load_convolve_6tap_8x2_avx2(src + 8, src_stride, coeffs, filt); |
1555 | 415k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:load_convolve_6tap_16x2_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:load_convolve_6tap_16x2_avx2 Unexecuted instantiation: convolve_2d_avx2.c:load_convolve_6tap_16x2_avx2 convolve_avx2.c:load_convolve_6tap_16x2_avx2 Line | Count | Source | 1552 | 415k | __m256i *data) { | 1553 | 415k | data[0] = load_convolve_6tap_8x2_avx2(src + 0, src_stride, coeffs, filt); | 1554 | 415k | data[1] = load_convolve_6tap_8x2_avx2(src + 8, src_stride, coeffs, filt); | 1555 | 415k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:load_convolve_6tap_16x2_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:load_convolve_6tap_16x2_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:load_convolve_6tap_16x2_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:load_convolve_6tap_16x2_avx2 |
1556 | | |
1557 | 584k | static inline __m128i round_sr_y_ssse3(const __m128i data) { |
1558 | 584k | const __m128i value = _mm_set1_epi16(32); |
1559 | 584k | const __m128i reg = _mm_add_epi16(data, value); |
1560 | 584k | return _mm_srai_epi16(reg, FILTER_BITS - 1); |
1561 | 584k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:round_sr_y_ssse3 Unexecuted instantiation: highbd_convolve_avx2.c:round_sr_y_ssse3 Unexecuted instantiation: convolve_2d_avx2.c:round_sr_y_ssse3 convolve_avx2.c:round_sr_y_ssse3 Line | Count | Source | 1557 | 584k | static inline __m128i round_sr_y_ssse3(const __m128i data) { | 1558 | 584k | const __m128i value = _mm_set1_epi16(32); | 1559 | 584k | const __m128i reg = _mm_add_epi16(data, value); | 1560 | 584k | return _mm_srai_epi16(reg, FILTER_BITS - 1); | 1561 | 584k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:round_sr_y_ssse3 Unexecuted instantiation: wiener_convolve_avx2.c:round_sr_y_ssse3 Unexecuted instantiation: highbd_convolve_2d_avx2.c:round_sr_y_ssse3 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:round_sr_y_ssse3 |
1562 | | |
1563 | 4.15M | static inline __m256i round_sr_y_avx2(const __m256i data) { |
1564 | 4.15M | const __m256i value = _mm256_set1_epi16(32); |
1565 | 4.15M | const __m256i reg = _mm256_add_epi16(data, value); |
1566 | 4.15M | return _mm256_srai_epi16(reg, FILTER_BITS - 1); |
1567 | 4.15M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:round_sr_y_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:round_sr_y_avx2 Unexecuted instantiation: convolve_2d_avx2.c:round_sr_y_avx2 convolve_avx2.c:round_sr_y_avx2 Line | Count | Source | 1563 | 4.15M | static inline __m256i round_sr_y_avx2(const __m256i data) { | 1564 | 4.15M | const __m256i value = _mm256_set1_epi16(32); | 1565 | 4.15M | const __m256i reg = _mm256_add_epi16(data, value); | 1566 | 4.15M | return _mm256_srai_epi16(reg, FILTER_BITS - 1); | 1567 | 4.15M | } |
Unexecuted instantiation: jnt_convolve_avx2.c:round_sr_y_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:round_sr_y_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:round_sr_y_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:round_sr_y_avx2 |
1568 | | |
1569 | | static inline void round_pack_store_y_8x2_avx2(const __m256i res, |
1570 | | uint8_t *const dst, |
1571 | 576k | const ptrdiff_t dst_stride) { |
1572 | 576k | __m256i r; |
1573 | | |
1574 | 576k | r = round_sr_y_avx2(res); |
1575 | 576k | pack_store_8x2_avx2(r, dst, dst_stride); |
1576 | 576k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:round_pack_store_y_8x2_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:round_pack_store_y_8x2_avx2 Unexecuted instantiation: convolve_2d_avx2.c:round_pack_store_y_8x2_avx2 convolve_avx2.c:round_pack_store_y_8x2_avx2 Line | Count | Source | 1571 | 576k | const ptrdiff_t dst_stride) { | 1572 | 576k | __m256i r; | 1573 | | | 1574 | 576k | r = round_sr_y_avx2(res); | 1575 | 576k | pack_store_8x2_avx2(r, dst, dst_stride); | 1576 | 576k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:round_pack_store_y_8x2_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:round_pack_store_y_8x2_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:round_pack_store_y_8x2_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:round_pack_store_y_8x2_avx2 |
1577 | | |
1578 | | static inline void round_pack_store_y_16x2_avx2(const __m256i res[2], |
1579 | | uint8_t *const dst, |
1580 | 1.42M | const ptrdiff_t dst_stride) { |
1581 | 1.42M | __m256i r[2]; |
1582 | | |
1583 | 1.42M | r[0] = round_sr_y_avx2(res[0]); |
1584 | 1.42M | r[1] = round_sr_y_avx2(res[1]); |
1585 | 1.42M | pack_store_16x2_avx2(r[0], r[1], dst, dst_stride); |
1586 | 1.42M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:round_pack_store_y_16x2_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:round_pack_store_y_16x2_avx2 Unexecuted instantiation: convolve_2d_avx2.c:round_pack_store_y_16x2_avx2 convolve_avx2.c:round_pack_store_y_16x2_avx2 Line | Count | Source | 1580 | 1.42M | const ptrdiff_t dst_stride) { | 1581 | 1.42M | __m256i r[2]; | 1582 | | | 1583 | 1.42M | r[0] = round_sr_y_avx2(res[0]); | 1584 | 1.42M | r[1] = round_sr_y_avx2(res[1]); | 1585 | 1.42M | pack_store_16x2_avx2(r[0], r[1], dst, dst_stride); | 1586 | 1.42M | } |
Unexecuted instantiation: jnt_convolve_avx2.c:round_pack_store_y_16x2_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:round_pack_store_y_16x2_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:round_pack_store_y_16x2_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:round_pack_store_y_16x2_avx2 |
1587 | | |
1588 | | static inline void round_pack_store_y_32_avx2(const __m256i res[2], |
1589 | 362k | uint8_t *const dst) { |
1590 | 362k | __m256i r[2]; |
1591 | | |
1592 | 362k | r[0] = round_sr_y_avx2(res[0]); |
1593 | 362k | r[1] = round_sr_y_avx2(res[1]); |
1594 | 362k | pack_store_32_avx2(r[0], r[1], dst); |
1595 | 362k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:round_pack_store_y_32_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:round_pack_store_y_32_avx2 Unexecuted instantiation: convolve_2d_avx2.c:round_pack_store_y_32_avx2 convolve_avx2.c:round_pack_store_y_32_avx2 Line | Count | Source | 1589 | 362k | uint8_t *const dst) { | 1590 | 362k | __m256i r[2]; | 1591 | | | 1592 | 362k | r[0] = round_sr_y_avx2(res[0]); | 1593 | 362k | r[1] = round_sr_y_avx2(res[1]); | 1594 | 362k | pack_store_32_avx2(r[0], r[1], dst); | 1595 | 362k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:round_pack_store_y_32_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:round_pack_store_y_32_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:round_pack_store_y_32_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:round_pack_store_y_32_avx2 |
1596 | | |
// Applies round_pack_store_y_32_avx2() twice: res[0..1] is written at dst and
// res[2..3] at dst + dst_stride.
static inline void round_pack_store_y_32x2_avx2(const __m256i res[4],
                                                uint8_t *const dst,
                                                const ptrdiff_t dst_stride) {
  uint8_t *const dst_next = dst + dst_stride;

  round_pack_store_y_32_avx2(&res[0], dst);
  round_pack_store_y_32_avx2(&res[2], dst_next);
}
Unexecuted instantiation: jnt_convolve_avx2.c:round_pack_store_y_32x2_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:round_pack_store_y_32x2_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:round_pack_store_y_32x2_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:round_pack_store_y_32x2_avx2 |
1603 | | |
// Loads via loadu_int16() at data + stride and data + 2 * stride, interleaves
// them with the register already held in d[0], and multiplies-accumulates the
// interleaved bytes against coeffs[0] with _mm_maddubs_epi16().
//
//   data   - base input pointer.
//   stride - byte offset between the two loaded positions.
//   coeffs - coeffs[0] is the multiplier operand.
//   d      - in/out: d[0] is read on entry; on exit d[1] holds the value
//            loaded at data + stride and d[0] the value loaded at
//            data + 2 * stride.
//   res    - output: receives the _mm_maddubs_epi16() result.
static inline void convolve_y_2tap_2x2_ssse3(const uint8_t *const data,
                                             const ptrdiff_t stride,
                                             const __m128i *coeffs,
                                             __m128i d[2], __m128i *res) {
  const __m128i row_a = _mm_cvtsi32_si128(loadu_int16(data + 1 * stride));
  d[1] = row_a;
  // d[0] still holds the caller-supplied register at this point.
  const __m128i pair_0a = _mm_unpacklo_epi16(d[0], row_a);

  const __m128i row_b = _mm_cvtsi32_si128(loadu_int16(data + 2 * stride));
  d[0] = row_b;
  const __m128i pair_ab = _mm_unpacklo_epi16(row_a, row_b);

  // Byte-interleave the two pairs so each 16-bit lane carries the two bytes
  // that _mm_maddubs_epi16() multiplies and sums.
  const __m128i interleaved = _mm_unpacklo_epi8(pair_0a, pair_ab);
  *res = _mm_maddubs_epi16(interleaved, coeffs[0]);
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_2tap_2x2_ssse3 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_2tap_2x2_ssse3 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_2tap_2x2_ssse3 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_2tap_2x2_ssse3 |
1617 | | |
1618 | | static inline void convolve_y_4tap_2x2_ssse3(const uint8_t *const data, |
1619 | | const ptrdiff_t stride, |
1620 | | const __m128i coeffs[2], |
1621 | | __m128i d[4], __m128i s[2], |
1622 | 33.6k | __m128i *res) { |
1623 | 33.6k | d[3] = _mm_cvtsi32_si128(loadu_int16(data + 3 * stride)); |
1624 | 33.6k | const __m128i src_23a = _mm_unpacklo_epi16(d[2], d[3]); |
1625 | 33.6k | d[2] = _mm_cvtsi32_si128(loadu_int16(data + 4 * stride)); |
1626 | 33.6k | const __m128i src_34a = _mm_unpacklo_epi16(d[3], d[2]); |
1627 | | |
1628 | 33.6k | s[1] = _mm_unpacklo_epi8(src_23a, src_34a); |
1629 | | |
1630 | 33.6k | *res = convolve_lowbd_4tap_ssse3(s, coeffs); |
1631 | 33.6k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_4tap_2x2_ssse3 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_4tap_2x2_ssse3 Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_4tap_2x2_ssse3 convolve_avx2.c:convolve_y_4tap_2x2_ssse3 Line | Count | Source | 1622 | 33.6k | __m128i *res) { | 1623 | 33.6k | d[3] = _mm_cvtsi32_si128(loadu_int16(data + 3 * stride)); | 1624 | 33.6k | const __m128i src_23a = _mm_unpacklo_epi16(d[2], d[3]); | 1625 | 33.6k | d[2] = _mm_cvtsi32_si128(loadu_int16(data + 4 * stride)); | 1626 | 33.6k | const __m128i src_34a = _mm_unpacklo_epi16(d[3], d[2]); | 1627 | | | 1628 | 33.6k | s[1] = _mm_unpacklo_epi8(src_23a, src_34a); | 1629 | | | 1630 | 33.6k | *res = convolve_lowbd_4tap_ssse3(s, coeffs); | 1631 | 33.6k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_4tap_2x2_ssse3 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_4tap_2x2_ssse3 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_4tap_2x2_ssse3 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_4tap_2x2_ssse3 |
1632 | | |
1633 | | static inline void convolve_y_6tap_2x2_ssse3(const uint8_t *const data, |
1634 | | const ptrdiff_t stride, |
1635 | | const __m128i coeffs[3], |
1636 | | __m128i d[6], __m128i s[3], |
1637 | 46.5k | __m128i *res) { |
1638 | 46.5k | d[5] = _mm_cvtsi32_si128(loadu_int16(data + 5 * stride)); |
1639 | 46.5k | const __m128i src_45a = _mm_unpacklo_epi16(d[4], d[5]); |
1640 | 46.5k | d[4] = _mm_cvtsi32_si128(loadu_int16(data + 6 * stride)); |
1641 | 46.5k | const __m128i src_56a = _mm_unpacklo_epi16(d[5], d[4]); |
1642 | | |
1643 | 46.5k | s[2] = _mm_unpacklo_epi8(src_45a, src_56a); |
1644 | | |
1645 | 46.5k | *res = convolve_lowbd_6tap_ssse3(s, coeffs); |
1646 | 46.5k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_6tap_2x2_ssse3 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_6tap_2x2_ssse3 Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_6tap_2x2_ssse3 convolve_avx2.c:convolve_y_6tap_2x2_ssse3 Line | Count | Source | 1637 | 46.5k | __m128i *res) { | 1638 | 46.5k | d[5] = _mm_cvtsi32_si128(loadu_int16(data + 5 * stride)); | 1639 | 46.5k | const __m128i src_45a = _mm_unpacklo_epi16(d[4], d[5]); | 1640 | 46.5k | d[4] = _mm_cvtsi32_si128(loadu_int16(data + 6 * stride)); | 1641 | 46.5k | const __m128i src_56a = _mm_unpacklo_epi16(d[5], d[4]); | 1642 | | | 1643 | 46.5k | s[2] = _mm_unpacklo_epi8(src_45a, src_56a); | 1644 | | | 1645 | 46.5k | *res = convolve_lowbd_6tap_ssse3(s, coeffs); | 1646 | 46.5k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_6tap_2x2_ssse3 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_6tap_2x2_ssse3 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_6tap_2x2_ssse3 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_6tap_2x2_ssse3 |
1647 | | |
1648 | | static inline void convolve_y_8tap_2x2_ssse3(const uint8_t *const data, |
1649 | | const ptrdiff_t stride, |
1650 | | const __m128i coeffs[4], |
1651 | | __m128i d[8], __m128i s[4], |
1652 | 4.46k | __m128i *res) { |
1653 | 4.46k | d[7] = _mm_cvtsi32_si128(loadu_int16(data + 7 * stride)); |
1654 | 4.46k | const __m128i src_67a = _mm_unpacklo_epi16(d[6], d[7]); |
1655 | 4.46k | d[6] = _mm_cvtsi32_si128(loadu_int16(data + 8 * stride)); |
1656 | 4.46k | const __m128i src_78a = _mm_unpacklo_epi16(d[7], d[6]); |
1657 | | |
1658 | 4.46k | s[3] = _mm_unpacklo_epi8(src_67a, src_78a); |
1659 | | |
1660 | 4.46k | *res = convolve_lowbd_ssse3(s, coeffs); |
1661 | 4.46k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_8tap_2x2_ssse3 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_8tap_2x2_ssse3 Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_8tap_2x2_ssse3 convolve_avx2.c:convolve_y_8tap_2x2_ssse3 Line | Count | Source | 1652 | 4.46k | __m128i *res) { | 1653 | 4.46k | d[7] = _mm_cvtsi32_si128(loadu_int16(data + 7 * stride)); | 1654 | 4.46k | const __m128i src_67a = _mm_unpacklo_epi16(d[6], d[7]); | 1655 | 4.46k | d[6] = _mm_cvtsi32_si128(loadu_int16(data + 8 * stride)); | 1656 | 4.46k | const __m128i src_78a = _mm_unpacklo_epi16(d[7], d[6]); | 1657 | | | 1658 | 4.46k | s[3] = _mm_unpacklo_epi8(src_67a, src_78a); | 1659 | | | 1660 | 4.46k | *res = convolve_lowbd_ssse3(s, coeffs); | 1661 | 4.46k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_8tap_2x2_ssse3 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_8tap_2x2_ssse3 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_8tap_2x2_ssse3 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_8tap_2x2_ssse3 |
1662 | | |
// Loads via loadu_int32() at data + stride and data + 2 * stride, interleaves
// them with the register already held in d[0], and multiplies-accumulates the
// interleaved bytes against coeffs[0] with _mm_maddubs_epi16().
//
//   data   - base input pointer.
//   stride - byte offset between the two loaded positions.
//   coeffs - coeffs[0] is the multiplier operand.
//   d      - in/out: d[0] is read on entry; on exit d[1] holds the value
//            loaded at data + stride and d[0] the value loaded at
//            data + 2 * stride.
//   res    - output: receives the _mm_maddubs_epi16() result.
static inline void convolve_y_2tap_4x2_ssse3(const uint8_t *const data,
                                             const ptrdiff_t stride,
                                             const __m128i *coeffs,
                                             __m128i d[2], __m128i *res) {
  const __m128i row_a = _mm_cvtsi32_si128(loadu_int32(data + 1 * stride));
  d[1] = row_a;
  // d[0] still holds the caller-supplied register at this point.
  const __m128i pair_0a = _mm_unpacklo_epi32(d[0], row_a);

  const __m128i row_b = _mm_cvtsi32_si128(loadu_int32(data + 2 * stride));
  d[0] = row_b;
  const __m128i pair_ab = _mm_unpacklo_epi32(row_a, row_b);

  // Byte-interleave the two pairs so each 16-bit lane carries the two bytes
  // that _mm_maddubs_epi16() multiplies and sums.
  const __m128i interleaved = _mm_unpacklo_epi8(pair_0a, pair_ab);
  *res = _mm_maddubs_epi16(interleaved, coeffs[0]);
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_2tap_4x2_ssse3 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_2tap_4x2_ssse3 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_2tap_4x2_ssse3 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_2tap_4x2_ssse3 |
1676 | | |
1677 | | static inline void convolve_y_4tap_4x2_ssse3(const uint8_t *const data, |
1678 | | const ptrdiff_t stride, |
1679 | | const __m128i coeffs[2], |
1680 | | __m128i d[4], __m128i s[2], |
1681 | 185k | __m128i *res) { |
1682 | 185k | d[3] = _mm_cvtsi32_si128(loadu_int32(data + 3 * stride)); |
1683 | 185k | const __m128i src_23a = _mm_unpacklo_epi32(d[2], d[3]); |
1684 | 185k | d[2] = _mm_cvtsi32_si128(loadu_int32(data + 4 * stride)); |
1685 | 185k | const __m128i src_34a = _mm_unpacklo_epi32(d[3], d[2]); |
1686 | | |
1687 | 185k | s[1] = _mm_unpacklo_epi8(src_23a, src_34a); |
1688 | | |
1689 | 185k | *res = convolve_lowbd_4tap_ssse3(s, coeffs); |
1690 | 185k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_4tap_4x2_ssse3 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_4tap_4x2_ssse3 Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_4tap_4x2_ssse3 convolve_avx2.c:convolve_y_4tap_4x2_ssse3 Line | Count | Source | 1681 | 185k | __m128i *res) { | 1682 | 185k | d[3] = _mm_cvtsi32_si128(loadu_int32(data + 3 * stride)); | 1683 | 185k | const __m128i src_23a = _mm_unpacklo_epi32(d[2], d[3]); | 1684 | 185k | d[2] = _mm_cvtsi32_si128(loadu_int32(data + 4 * stride)); | 1685 | 185k | const __m128i src_34a = _mm_unpacklo_epi32(d[3], d[2]); | 1686 | | | 1687 | 185k | s[1] = _mm_unpacklo_epi8(src_23a, src_34a); | 1688 | | | 1689 | 185k | *res = convolve_lowbd_4tap_ssse3(s, coeffs); | 1690 | 185k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_4tap_4x2_ssse3 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_4tap_4x2_ssse3 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_4tap_4x2_ssse3 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_4tap_4x2_ssse3 |
1691 | | |
1692 | | static inline void convolve_y_6tap_4x2_ssse3(const uint8_t *const data, |
1693 | | const ptrdiff_t stride, |
1694 | | const __m128i coeffs[3], |
1695 | | __m128i d[6], __m128i s[3], |
1696 | 274k | __m128i *res) { |
1697 | 274k | d[5] = _mm_cvtsi32_si128(loadu_int32(data + 5 * stride)); |
1698 | 274k | const __m128i src_45a = _mm_unpacklo_epi32(d[4], d[5]); |
1699 | 274k | d[4] = _mm_cvtsi32_si128(loadu_int32(data + 6 * stride)); |
1700 | 274k | const __m128i src_56a = _mm_unpacklo_epi32(d[5], d[4]); |
1701 | | |
1702 | 274k | s[2] = _mm_unpacklo_epi8(src_45a, src_56a); |
1703 | | |
1704 | 274k | *res = convolve_lowbd_6tap_ssse3(s, coeffs); |
1705 | 274k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_6tap_4x2_ssse3 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_6tap_4x2_ssse3 Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_6tap_4x2_ssse3 convolve_avx2.c:convolve_y_6tap_4x2_ssse3 Line | Count | Source | 1696 | 274k | __m128i *res) { | 1697 | 274k | d[5] = _mm_cvtsi32_si128(loadu_int32(data + 5 * stride)); | 1698 | 274k | const __m128i src_45a = _mm_unpacklo_epi32(d[4], d[5]); | 1699 | 274k | d[4] = _mm_cvtsi32_si128(loadu_int32(data + 6 * stride)); | 1700 | 274k | const __m128i src_56a = _mm_unpacklo_epi32(d[5], d[4]); | 1701 | | | 1702 | 274k | s[2] = _mm_unpacklo_epi8(src_45a, src_56a); | 1703 | | | 1704 | 274k | *res = convolve_lowbd_6tap_ssse3(s, coeffs); | 1705 | 274k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_6tap_4x2_ssse3 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_6tap_4x2_ssse3 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_6tap_4x2_ssse3 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_6tap_4x2_ssse3 |
1706 | | |
1707 | | static inline void convolve_y_8tap_4x2_ssse3(const uint8_t *const data, |
1708 | | const ptrdiff_t stride, |
1709 | | const __m128i coeffs[4], |
1710 | | __m128i d[8], __m128i s[4], |
1711 | 21.6k | __m128i *res) { |
1712 | 21.6k | d[7] = _mm_cvtsi32_si128(loadu_int32(data + 7 * stride)); |
1713 | 21.6k | const __m128i src_67a = _mm_unpacklo_epi32(d[6], d[7]); |
1714 | 21.6k | d[6] = _mm_cvtsi32_si128(loadu_int32(data + 8 * stride)); |
1715 | 21.6k | const __m128i src_78a = _mm_unpacklo_epi32(d[7], d[6]); |
1716 | | |
1717 | 21.6k | s[3] = _mm_unpacklo_epi8(src_67a, src_78a); |
1718 | | |
1719 | 21.6k | res[0] = convolve_lowbd_ssse3(s, coeffs); |
1720 | 21.6k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_8tap_4x2_ssse3 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_8tap_4x2_ssse3 Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_8tap_4x2_ssse3 convolve_avx2.c:convolve_y_8tap_4x2_ssse3 Line | Count | Source | 1711 | 21.6k | __m128i *res) { | 1712 | 21.6k | d[7] = _mm_cvtsi32_si128(loadu_int32(data + 7 * stride)); | 1713 | 21.6k | const __m128i src_67a = _mm_unpacklo_epi32(d[6], d[7]); | 1714 | 21.6k | d[6] = _mm_cvtsi32_si128(loadu_int32(data + 8 * stride)); | 1715 | 21.6k | const __m128i src_78a = _mm_unpacklo_epi32(d[7], d[6]); | 1716 | | | 1717 | 21.6k | s[3] = _mm_unpacklo_epi8(src_67a, src_78a); | 1718 | | | 1719 | 21.6k | res[0] = convolve_lowbd_ssse3(s, coeffs); | 1720 | 21.6k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_8tap_4x2_ssse3 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_8tap_4x2_ssse3 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_8tap_4x2_ssse3 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_8tap_4x2_ssse3 |
1721 | | |
// Performs unaligned 16-byte loads at data + stride and data + 2 * stride,
// pairs them with the register already held in d[0] into 256-bit registers,
// byte-interleaves the pairs, and multiplies-accumulates against coeffs[0]
// with _mm256_maddubs_epi16().
//
//   data   - base input pointer.
//   stride - byte offset between the two loaded positions.
//   coeffs - coeffs[0] is the multiplier operand.
//   d      - in/out: d[0] is read on entry; on exit d[1] holds the load from
//            data + stride and d[0] the load from data + 2 * stride.
//   res    - output: receives the _mm256_maddubs_epi16() result.
static inline void convolve_y_2tap_8x2_avx2(const uint8_t *const data,
                                            const ptrdiff_t stride,
                                            const __m256i *coeffs, __m128i d[2],
                                            __m256i *res) {
  const __m128i row_a =
      _mm_loadu_si128((const __m128i *)(data + 1 * stride));
  d[1] = row_a;
  // Low lane: caller-supplied d[0]; high lane: the first new load.
  const __m256i pair_0a = _mm256_setr_m128i(d[0], row_a);

  const __m128i row_b =
      _mm_loadu_si128((const __m128i *)(data + 2 * stride));
  d[0] = row_b;
  const __m256i pair_ab = _mm256_setr_m128i(row_a, row_b);

  // _mm256_unpacklo_epi8 works per 128-bit lane, so only the low 8 bytes of
  // each 16-byte load contribute.
  const __m256i interleaved = _mm256_unpacklo_epi8(pair_0a, pair_ab);
  *res = _mm256_maddubs_epi16(interleaved, coeffs[0]);
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_2tap_8x2_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_2tap_8x2_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_2tap_8x2_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_2tap_8x2_avx2 |
1735 | | |
1736 | | static inline void convolve_y_4tap_8x2_avx2(const uint8_t *const data, |
1737 | | const ptrdiff_t stride, |
1738 | | const __m256i coeffs[2], |
1739 | | __m128i d[4], __m256i s[2], |
1740 | 160k | __m256i *res) { |
1741 | 160k | d[3] = _mm_loadu_si128((__m128i *)(data + 3 * stride)); |
1742 | 160k | const __m256i src_23a = _mm256_setr_m128i(d[2], d[3]); |
1743 | 160k | d[2] = _mm_loadu_si128((__m128i *)(data + 4 * stride)); |
1744 | 160k | const __m256i src_34a = _mm256_setr_m128i(d[3], d[2]); |
1745 | | |
1746 | 160k | s[1] = _mm256_unpacklo_epi8(src_23a, src_34a); |
1747 | | |
1748 | 160k | *res = convolve_lowbd_4tap(s, coeffs); |
1749 | 160k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_4tap_8x2_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_4tap_8x2_avx2 Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_4tap_8x2_avx2 convolve_avx2.c:convolve_y_4tap_8x2_avx2 Line | Count | Source | 1740 | 160k | __m256i *res) { | 1741 | 160k | d[3] = _mm_loadu_si128((__m128i *)(data + 3 * stride)); | 1742 | 160k | const __m256i src_23a = _mm256_setr_m128i(d[2], d[3]); | 1743 | 160k | d[2] = _mm_loadu_si128((__m128i *)(data + 4 * stride)); | 1744 | 160k | const __m256i src_34a = _mm256_setr_m128i(d[3], d[2]); | 1745 | | | 1746 | 160k | s[1] = _mm256_unpacklo_epi8(src_23a, src_34a); | 1747 | | | 1748 | 160k | *res = convolve_lowbd_4tap(s, coeffs); | 1749 | 160k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_4tap_8x2_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_4tap_8x2_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_4tap_8x2_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_4tap_8x2_avx2 |
1750 | | |
1751 | | static inline void convolve_y_6tap_8x2_avx2(const uint8_t *const data, |
1752 | | const ptrdiff_t stride, |
1753 | | const __m256i coeffs[3], |
1754 | | __m128i d[6], __m256i s[3], |
1755 | 382k | __m256i *res) { |
1756 | 382k | d[5] = _mm_loadu_si128((__m128i *)(data + 5 * stride)); |
1757 | 382k | const __m256i src_45a = _mm256_setr_m128i(d[4], d[5]); |
1758 | 382k | d[4] = _mm_loadu_si128((__m128i *)(data + 6 * stride)); |
1759 | 382k | const __m256i src_56a = _mm256_setr_m128i(d[5], d[4]); |
1760 | | |
1761 | 382k | s[2] = _mm256_unpacklo_epi8(src_45a, src_56a); |
1762 | | |
1763 | 382k | *res = convolve_lowbd_6tap(s, coeffs); |
1764 | 382k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_6tap_8x2_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_6tap_8x2_avx2 Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_6tap_8x2_avx2 convolve_avx2.c:convolve_y_6tap_8x2_avx2 Line | Count | Source | 1755 | 382k | __m256i *res) { | 1756 | 382k | d[5] = _mm_loadu_si128((__m128i *)(data + 5 * stride)); | 1757 | 382k | const __m256i src_45a = _mm256_setr_m128i(d[4], d[5]); | 1758 | 382k | d[4] = _mm_loadu_si128((__m128i *)(data + 6 * stride)); | 1759 | 382k | const __m256i src_56a = _mm256_setr_m128i(d[5], d[4]); | 1760 | | | 1761 | 382k | s[2] = _mm256_unpacklo_epi8(src_45a, src_56a); | 1762 | | | 1763 | 382k | *res = convolve_lowbd_6tap(s, coeffs); | 1764 | 382k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_6tap_8x2_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_6tap_8x2_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_6tap_8x2_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_6tap_8x2_avx2 |
1765 | | |
1766 | | static inline void convolve_y_8tap_8x2_avx2(const uint8_t *const data, |
1767 | | const ptrdiff_t stride, |
1768 | | const __m256i coeffs[4], |
1769 | | __m128i d[8], __m256i s[4], |
1770 | 20.9k | __m256i *res) { |
1771 | 20.9k | d[7] = _mm_loadu_si128((__m128i *)(data + 7 * stride)); |
1772 | 20.9k | const __m256i src_67a = _mm256_setr_m128i(d[6], d[7]); |
1773 | 20.9k | d[6] = _mm_loadu_si128((__m128i *)(data + 8 * stride)); |
1774 | 20.9k | const __m256i src_78a = _mm256_setr_m128i(d[7], d[6]); |
1775 | | |
1776 | 20.9k | s[3] = _mm256_unpacklo_epi8(src_67a, src_78a); |
1777 | | |
1778 | 20.9k | *res = convolve_lowbd(s, coeffs); |
1779 | 20.9k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_8tap_8x2_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_8tap_8x2_avx2 Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_8tap_8x2_avx2 convolve_avx2.c:convolve_y_8tap_8x2_avx2 Line | Count | Source | 1770 | 20.9k | __m256i *res) { | 1771 | 20.9k | d[7] = _mm_loadu_si128((__m128i *)(data + 7 * stride)); | 1772 | 20.9k | const __m256i src_67a = _mm256_setr_m128i(d[6], d[7]); | 1773 | 20.9k | d[6] = _mm_loadu_si128((__m128i *)(data + 8 * stride)); | 1774 | 20.9k | const __m256i src_78a = _mm256_setr_m128i(d[7], d[6]); | 1775 | | | 1776 | 20.9k | s[3] = _mm256_unpacklo_epi8(src_67a, src_78a); | 1777 | | | 1778 | 20.9k | *res = convolve_lowbd(s, coeffs); | 1779 | 20.9k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_8tap_8x2_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_8tap_8x2_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_8tap_8x2_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_8tap_8x2_avx2 |
1780 | | |
// Vertical 2-tap step for a 16-wide, 2-row output.
//
// Loads the rows at data + 1 * stride and data + 2 * stride, pairs each row
// with the row below it byte by byte, and multiply-accumulates every byte pair
// against coeffs[0] with _mm256_maddubs_epi16.
//
//   data   - base pointer; rows are addressed as data + k * stride.
//   stride - distance in bytes between consecutive rows.
//   coeffs - only coeffs[0] is read.
//   d      - row cache. d[0] is read before it is overwritten, so the caller
//            must have filled it (presumably with the row at data - confirm
//            against the caller). On return d[1] holds the row at 1 * stride
//            and d[0] the row at 2 * stride.
//   res    - res[0] receives the products built from the low 8 bytes of each
//            128-bit lane, res[1] those built from the high 8 bytes.
static inline void convolve_y_2tap_16x2_avx2(const uint8_t *const data,
                                             const ptrdiff_t stride,
                                             const __m256i *coeffs,
                                             __m128i d[2], __m256i res[2]) {
  // Lane 0 = cached row, lane 1 = row 1.
  const __m128i row1 = _mm_loadu_si128((const __m128i *)(data + stride));
  d[1] = row1;
  const __m256i rows_01 = _mm256_setr_m128i(d[0], row1);

  // Lane 0 = row 1, lane 1 = row 2. d[0] is recycled to hold row 2.
  const __m128i row2 = _mm_loadu_si128((const __m128i *)(data + 2 * stride));
  d[0] = row2;
  const __m256i rows_12 = _mm256_setr_m128i(row1, row2);

  const __m256i pairs_lo = _mm256_unpacklo_epi8(rows_01, rows_12);
  const __m256i pairs_hi = _mm256_unpackhi_epi8(rows_01, rows_12);

  res[0] = _mm256_maddubs_epi16(pairs_lo, coeffs[0]);
  res[1] = _mm256_maddubs_epi16(pairs_hi, coeffs[0]);
}
1796 | | |
1797 | | static inline void convolve_y_4tap_16x2_avx2(const uint8_t *const data, |
1798 | | const ptrdiff_t stride, |
1799 | | const __m256i coeffs[2], |
1800 | | __m128i d[4], __m256i s[4], |
1801 | 92.8k | __m256i res[2]) { |
1802 | 92.8k | d[3] = _mm_loadu_si128((__m128i *)(data + 3 * stride)); |
1803 | 92.8k | const __m256i src_23a = _mm256_setr_m128i(d[2], d[3]); |
1804 | 92.8k | d[2] = _mm_loadu_si128((__m128i *)(data + 4 * stride)); |
1805 | 92.8k | const __m256i src_34a = _mm256_setr_m128i(d[3], d[2]); |
1806 | | |
1807 | 92.8k | s[1] = _mm256_unpacklo_epi8(src_23a, src_34a); |
1808 | 92.8k | s[3] = _mm256_unpackhi_epi8(src_23a, src_34a); |
1809 | | |
1810 | 92.8k | res[0] = convolve_lowbd_4tap(s, coeffs); |
1811 | 92.8k | res[1] = convolve_lowbd_4tap(s + 2, coeffs); |
1812 | 92.8k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_4tap_16x2_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_4tap_16x2_avx2 Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_4tap_16x2_avx2 convolve_avx2.c:convolve_y_4tap_16x2_avx2 Line | Count | Source | 1801 | 92.8k | __m256i res[2]) { | 1802 | 92.8k | d[3] = _mm_loadu_si128((__m128i *)(data + 3 * stride)); | 1803 | 92.8k | const __m256i src_23a = _mm256_setr_m128i(d[2], d[3]); | 1804 | 92.8k | d[2] = _mm_loadu_si128((__m128i *)(data + 4 * stride)); | 1805 | 92.8k | const __m256i src_34a = _mm256_setr_m128i(d[3], d[2]); | 1806 | | | 1807 | 92.8k | s[1] = _mm256_unpacklo_epi8(src_23a, src_34a); | 1808 | 92.8k | s[3] = _mm256_unpackhi_epi8(src_23a, src_34a); | 1809 | | | 1810 | 92.8k | res[0] = convolve_lowbd_4tap(s, coeffs); | 1811 | 92.8k | res[1] = convolve_lowbd_4tap(s + 2, coeffs); | 1812 | 92.8k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_4tap_16x2_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_4tap_16x2_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_4tap_16x2_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_4tap_16x2_avx2 |
1813 | | |
1814 | | static inline void convolve_y_6tap_16x2_avx2(const uint8_t *const data, |
1815 | | const ptrdiff_t stride, |
1816 | | const __m256i coeffs[3], |
1817 | | __m128i d[6], __m256i s[6], |
1818 | 1.24M | __m256i res[2]) { |
1819 | 1.24M | d[5] = _mm_loadu_si128((__m128i *)(data + 5 * stride)); |
1820 | 1.24M | const __m256i src_45a = _mm256_setr_m128i(d[4], d[5]); |
1821 | 1.24M | d[4] = _mm_loadu_si128((__m128i *)(data + 6 * stride)); |
1822 | 1.24M | const __m256i src_56a = _mm256_setr_m128i(d[5], d[4]); |
1823 | | |
1824 | 1.24M | s[2] = _mm256_unpacklo_epi8(src_45a, src_56a); |
1825 | 1.24M | s[5] = _mm256_unpackhi_epi8(src_45a, src_56a); |
1826 | | |
1827 | 1.24M | res[0] = convolve_lowbd_6tap(s, coeffs); |
1828 | 1.24M | res[1] = convolve_lowbd_6tap(s + 3, coeffs); |
1829 | 1.24M | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_6tap_16x2_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_6tap_16x2_avx2 Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_6tap_16x2_avx2 convolve_avx2.c:convolve_y_6tap_16x2_avx2 Line | Count | Source | 1818 | 1.24M | __m256i res[2]) { | 1819 | 1.24M | d[5] = _mm_loadu_si128((__m128i *)(data + 5 * stride)); | 1820 | 1.24M | const __m256i src_45a = _mm256_setr_m128i(d[4], d[5]); | 1821 | 1.24M | d[4] = _mm_loadu_si128((__m128i *)(data + 6 * stride)); | 1822 | 1.24M | const __m256i src_56a = _mm256_setr_m128i(d[5], d[4]); | 1823 | | | 1824 | 1.24M | s[2] = _mm256_unpacklo_epi8(src_45a, src_56a); | 1825 | 1.24M | s[5] = _mm256_unpackhi_epi8(src_45a, src_56a); | 1826 | | | 1827 | 1.24M | res[0] = convolve_lowbd_6tap(s, coeffs); | 1828 | 1.24M | res[1] = convolve_lowbd_6tap(s + 3, coeffs); | 1829 | 1.24M | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_6tap_16x2_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_6tap_16x2_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_6tap_16x2_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_6tap_16x2_avx2 |
1830 | | |
1831 | | static inline void convolve_y_8tap_16x2_avx2(const uint8_t *const data, |
1832 | | const ptrdiff_t stride, |
1833 | | const __m256i coeffs[4], |
1834 | | __m128i d[8], __m256i s[8], |
1835 | 79.4k | __m256i res[2]) { |
1836 | 79.4k | d[7] = _mm_loadu_si128((__m128i *)(data + 7 * stride)); |
1837 | 79.4k | const __m256i src_67a = _mm256_setr_m128i(d[6], d[7]); |
1838 | 79.4k | d[6] = _mm_loadu_si128((__m128i *)(data + 8 * stride)); |
1839 | 79.4k | const __m256i src_78a = _mm256_setr_m128i(d[7], d[6]); |
1840 | | |
1841 | 79.4k | s[3] = _mm256_unpacklo_epi8(src_67a, src_78a); |
1842 | 79.4k | s[7] = _mm256_unpackhi_epi8(src_67a, src_78a); |
1843 | | |
1844 | 79.4k | res[0] = convolve_lowbd(s, coeffs); |
1845 | 79.4k | res[1] = convolve_lowbd(s + 4, coeffs); |
1846 | 79.4k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_8tap_16x2_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_8tap_16x2_avx2 Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_8tap_16x2_avx2 convolve_avx2.c:convolve_y_8tap_16x2_avx2 Line | Count | Source | 1835 | 79.4k | __m256i res[2]) { | 1836 | 79.4k | d[7] = _mm_loadu_si128((__m128i *)(data + 7 * stride)); | 1837 | 79.4k | const __m256i src_67a = _mm256_setr_m128i(d[6], d[7]); | 1838 | 79.4k | d[6] = _mm_loadu_si128((__m128i *)(data + 8 * stride)); | 1839 | 79.4k | const __m256i src_78a = _mm256_setr_m128i(d[7], d[6]); | 1840 | | | 1841 | 79.4k | s[3] = _mm256_unpacklo_epi8(src_67a, src_78a); | 1842 | 79.4k | s[7] = _mm256_unpackhi_epi8(src_67a, src_78a); | 1843 | | | 1844 | 79.4k | res[0] = convolve_lowbd(s, coeffs); | 1845 | 79.4k | res[1] = convolve_lowbd(s + 4, coeffs); | 1846 | 79.4k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_8tap_16x2_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_8tap_16x2_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_8tap_16x2_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_8tap_16x2_avx2 |
1847 | | |
// Vertical 2-tap step for a 32-wide, 2-row output.
//
// Loads the 32-byte rows at data + 1 * stride and data + 2 * stride, pairs
// each row with the row below it byte by byte, and multiply-accumulates every
// byte pair against coeffs[0] with _mm256_maddubs_epi16.
//
//   data   - base pointer; rows are addressed as data + k * stride.
//   stride - distance in bytes between consecutive rows.
//   coeffs - only coeffs[0] is read.
//   d      - row cache. d[0] is read before it is overwritten, so the caller
//            must have filled it (presumably with the row at data - confirm
//            against the caller). On return d[1] holds the row at 1 * stride
//            and d[0] the row at 2 * stride.
//   res    - res[0]/res[1]: low/high unpack of (cached row, row 1);
//            res[2]/res[3]: low/high unpack of (row 1, row 2).
static inline void convolve_y_2tap_32x2_avx2(const uint8_t *const data,
                                             const ptrdiff_t stride,
                                             const __m256i *coeffs,
                                             __m256i d[2], __m256i res[4]) {
  __m256i pairs[4];

  // First output row: cached row interleaved with row 1.
  const __m256i row1 = _mm256_loadu_si256((const __m256i *)(data + stride));
  d[1] = row1;
  pairs[0] = _mm256_unpacklo_epi8(d[0], row1);
  pairs[1] = _mm256_unpackhi_epi8(d[0], row1);

  // Second output row: row 1 interleaved with row 2. d[0] is recycled to
  // hold row 2.
  const __m256i row2 =
      _mm256_loadu_si256((const __m256i *)(data + 2 * stride));
  d[0] = row2;
  pairs[2] = _mm256_unpacklo_epi8(row1, row2);
  pairs[3] = _mm256_unpackhi_epi8(row1, row2);

  for (int i = 0; i < 4; ++i) {
    res[i] = _mm256_maddubs_epi16(pairs[i], coeffs[0]);
  }
}
1864 | | |
1865 | | static inline void convolve_y_4tap_32x2_avx2(const uint8_t *const data, |
1866 | | const ptrdiff_t stride, |
1867 | | const __m256i coeffs[2], |
1868 | | __m256i d[4], __m256i s1[4], |
1869 | 141k | __m256i s2[4], __m256i res[4]) { |
1870 | 141k | d[3] = _mm256_loadu_si256((__m256i *)(data + 3 * stride)); |
1871 | 141k | s1[1] = _mm256_unpacklo_epi8(d[2], d[3]); |
1872 | 141k | s1[3] = _mm256_unpackhi_epi8(d[2], d[3]); |
1873 | 141k | d[2] = _mm256_loadu_si256((__m256i *)(data + 4 * stride)); |
1874 | 141k | s2[1] = _mm256_unpacklo_epi8(d[3], d[2]); |
1875 | 141k | s2[3] = _mm256_unpackhi_epi8(d[3], d[2]); |
1876 | | |
1877 | 141k | res[0] = convolve_lowbd_4tap(s1, coeffs); |
1878 | 141k | res[1] = convolve_lowbd_4tap(s1 + 2, coeffs); |
1879 | 141k | res[2] = convolve_lowbd_4tap(s2, coeffs); |
1880 | 141k | res[3] = convolve_lowbd_4tap(s2 + 2, coeffs); |
1881 | 141k | } Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_4tap_32x2_avx2 Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_4tap_32x2_avx2 Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_4tap_32x2_avx2 convolve_avx2.c:convolve_y_4tap_32x2_avx2 Line | Count | Source | 1869 | 141k | __m256i s2[4], __m256i res[4]) { | 1870 | 141k | d[3] = _mm256_loadu_si256((__m256i *)(data + 3 * stride)); | 1871 | 141k | s1[1] = _mm256_unpacklo_epi8(d[2], d[3]); | 1872 | 141k | s1[3] = _mm256_unpackhi_epi8(d[2], d[3]); | 1873 | 141k | d[2] = _mm256_loadu_si256((__m256i *)(data + 4 * stride)); | 1874 | 141k | s2[1] = _mm256_unpacklo_epi8(d[3], d[2]); | 1875 | 141k | s2[3] = _mm256_unpackhi_epi8(d[3], d[2]); | 1876 | | | 1877 | 141k | res[0] = convolve_lowbd_4tap(s1, coeffs); | 1878 | 141k | res[1] = convolve_lowbd_4tap(s1 + 2, coeffs); | 1879 | 141k | res[2] = convolve_lowbd_4tap(s2, coeffs); | 1880 | 141k | res[3] = convolve_lowbd_4tap(s2 + 2, coeffs); | 1881 | 141k | } |
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_4tap_32x2_avx2 Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_4tap_32x2_avx2 Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_4tap_32x2_avx2 Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_4tap_32x2_avx2 |
1882 | | #endif // AOM_AOM_DSP_X86_CONVOLVE_AVX2_H_ |