Coverage Report

Created: 2026-06-30 06:53

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/aom/aom_dsp/x86/convolve_avx2.h
Line
Count
Source
1
/*
2
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
3
 *
4
 * This source code is subject to the terms of the BSD 2 Clause License and
5
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6
 * was not distributed with this source code in the LICENSE file, you can
7
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8
 * Media Patent License 1.0 was not distributed with this source code in the
9
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10
 */
11
12
#ifndef AOM_AOM_DSP_X86_CONVOLVE_AVX2_H_
13
#define AOM_AOM_DSP_X86_CONVOLVE_AVX2_H_
14
15
#include <immintrin.h>
16
17
#include "aom_ports/mem.h"
18
19
#include "aom_dsp/x86/mem_sse2.h"
20
#include "aom_dsp/x86/synonyms.h"
21
22
#include "av1/common/convolve.h"
23
#include "av1/common/filter.h"
24
25
820k
#define SECOND_32_BLK (32)
26
731k
#define THIRD_32_BLK (32 << 1)
27
365k
#define FOURTH_32_BLK (SECOND_32_BLK + THIRD_32_BLK)
28
29
// filters for 16
30
DECLARE_ALIGNED(32, static const uint8_t, filt_global_avx2[]) = {
31
  0,  1,  1,  2,  2, 3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8,  0,  1,  1,
32
  2,  2,  3,  3,  4, 4,  5,  5,  6,  6,  7,  7,  8,  2,  3,  3,  4,  4,  5,
33
  5,  6,  6,  7,  7, 8,  8,  9,  9,  10, 2,  3,  3,  4,  4,  5,  5,  6,  6,
34
  7,  7,  8,  8,  9, 9,  10, 4,  5,  5,  6,  6,  7,  7,  8,  8,  9,  9,  10,
35
  10, 11, 11, 12, 4, 5,  5,  6,  6,  7,  7,  8,  8,  9,  9,  10, 10, 11, 11,
36
  12, 6,  7,  7,  8, 8,  9,  9,  10, 10, 11, 11, 12, 12, 13, 13, 14, 6,  7,
37
  7,  8,  8,  9,  9, 10, 10, 11, 11, 12, 12, 13, 13, 14
38
};
39
40
DECLARE_ALIGNED(32, static const uint8_t, filt_d4_global_avx2[]) = {
41
  0, 1, 2, 3,  1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 0, 1, 2, 3,  1, 2,
42
  3, 4, 2, 3,  4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7,  8, 9,
43
  7, 8, 9, 10, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10,
44
};
45
46
DECLARE_ALIGNED(32, static const uint8_t, filt4_d4_global_avx2[]) = {
47
  2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8,
48
  2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8,
49
};
50
51
DECLARE_ALIGNED(32, static const uint8_t, filt_center_global_avx2[32]) = {
52
  3, 255, 4, 255, 5, 255, 6, 255, 7, 255, 8, 255, 9, 255, 10, 255,
53
  3, 255, 4, 255, 5, 255, 6, 255, 7, 255, 8, 255, 9, 255, 10, 255
54
};
55
56
DECLARE_ALIGNED(32, static const uint8_t,
57
                filt1_global_sse2[16]) = { 0, 1, 1, 2,  2,  3,  3,  4,
58
                                           8, 9, 9, 10, 10, 11, 11, 12 };
59
60
DECLARE_ALIGNED(32, static const uint8_t,
61
                filt2_global_sse2[16]) = { 2,  3,  3,  4,  4,  5,  5,  6,
62
                                           10, 11, 11, 12, 12, 13, 13, 14 };
63
64
DECLARE_ALIGNED(32, static const uint8_t,
65
                filt3_global_sse2[16]) = { 0, 1, 1, 2, 8, 9, 9, 10,
66
                                           0, 0, 0, 0, 0, 0, 0, 0 };
67
68
DECLARE_ALIGNED(32, static const uint8_t,
69
                filt4_global_sse2[16]) = { 2, 3, 3, 4, 10, 11, 11, 12,
70
                                           0, 0, 0, 0, 0,  0,  0,  0 };
71
72
DECLARE_ALIGNED(32, static const uint8_t,
73
                filt5_global_sse2[16]) = { 0, 1, 1, 2, 4, 5, 5, 6,
74
                                           0, 0, 0, 0, 0, 0, 0, 0 };
75
76
DECLARE_ALIGNED(32, static const uint8_t,
77
                filt1_global_avx2[32]) = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5,
78
                                           6, 6, 7, 7, 8, 0, 1, 1, 2, 2, 3,
79
                                           3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
80
81
DECLARE_ALIGNED(32, static const uint8_t,
82
                filt2_global_avx2[32]) = { 2, 3, 3, 4, 4,  5, 5, 6, 6, 7, 7,
83
                                           8, 8, 9, 9, 10, 2, 3, 3, 4, 4, 5,
84
                                           5, 6, 6, 7, 7,  8, 8, 9, 9, 10 };
85
86
DECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = {
87
  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12,
88
  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
89
};
90
91
DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = {
92
  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14,
93
  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
94
};
95
96
#define CONVOLVE_SR_HOR_FILTER_W4(CONVOLVE_LOWBD)                            \
97
3.07M
  for (i = 0; i < (im_h - 2); i += 2) {                                      \
98
2.48M
    __m128i data =                                                           \
99
2.48M
        load_8bit_8x2_to_1_reg_sse2(&src_ptr[(i * src_stride)], src_stride); \
100
2.48M
    __m128i res = CONVOLVE_LOWBD(data, coeffs_h, filt);                      \
101
2.48M
    res = _mm_srai_epi16(_mm_add_epi16(res, round_const_h), 2);              \
102
2.48M
    _mm_store_si128((__m128i *)&im_block[i * 4], res);                       \
103
2.48M
  }                                                                          \
104
584k
  __m128i data_1 = _mm_loadl_epi64((__m128i *)&src_ptr[(i * src_stride)]);   \
105
584k
  __m128i res = CONVOLVE_LOWBD(data_1, coeffs_h, filt);                      \
106
584k
  res = _mm_srai_epi16(_mm_add_epi16(res, round_const_h), 2);                \
107
584k
  _mm_storel_epi64((__m128i *)&im_block[i * 4], res);
108
109
#define CONVOLVE_SR_HOR_FILTER_2TAP_W4 \
110
15.6k
  CONVOLVE_SR_HOR_FILTER_W4(convolve_lowbd_x_2tap_ssse3)
111
112
#define CONVOLVE_SR_HOR_FILTER_4TAP_W4 \
113
568k
  CONVOLVE_SR_HOR_FILTER_W4(convolve_lowbd_x_4tap_ssse3)
114
115
static inline void sr_2d_ver_round_and_store_w4(int w, __m256i res,
116
                                                uint8_t *dst, int dst_stride,
117
1.71M
                                                __m256i round_const_v) {
118
1.71M
  const __m256i res_round =
119
1.71M
      _mm256_srai_epi32(_mm256_add_epi32(res, round_const_v), 11);
120
121
1.71M
  const __m256i res_16bit = _mm256_packs_epi32(res_round, res_round);
122
1.71M
  const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit);
123
124
1.71M
  const __m128i r0 = _mm256_castsi256_si128(res_8b);
125
1.71M
  const __m128i r1 = _mm256_extracti128_si256(res_8b, 1);
126
127
1.71M
  __m128i *const p0 = (__m128i *)dst;
128
1.71M
  __m128i *const p1 = (__m128i *)(dst + dst_stride);
129
130
1.71M
  if (w == 4) {
131
1.42M
    xx_storel_32(p0, r0);
132
1.42M
    xx_storel_32(p1, r1);
133
1.42M
  } else {
134
294k
    assert(w == 2);
135
294k
    *(uint16_t *)p0 = (uint16_t)_mm_cvtsi128_si32(r0);
136
294k
    *(uint16_t *)p1 = (uint16_t)_mm_cvtsi128_si32(r1);
137
294k
  }
138
1.71M
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:sr_2d_ver_round_and_store_w4
Unexecuted instantiation: highbd_convolve_avx2.c:sr_2d_ver_round_and_store_w4
convolve_2d_avx2.c:sr_2d_ver_round_and_store_w4
Line
Count
Source
117
1.71M
                                                __m256i round_const_v) {
118
1.71M
  const __m256i res_round =
119
1.71M
      _mm256_srai_epi32(_mm256_add_epi32(res, round_const_v), 11);
120
121
1.71M
  const __m256i res_16bit = _mm256_packs_epi32(res_round, res_round);
122
1.71M
  const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit);
123
124
1.71M
  const __m128i r0 = _mm256_castsi256_si128(res_8b);
125
1.71M
  const __m128i r1 = _mm256_extracti128_si256(res_8b, 1);
126
127
1.71M
  __m128i *const p0 = (__m128i *)dst;
128
1.71M
  __m128i *const p1 = (__m128i *)(dst + dst_stride);
129
130
1.71M
  if (w == 4) {
131
1.42M
    xx_storel_32(p0, r0);
132
1.42M
    xx_storel_32(p1, r1);
133
1.42M
  } else {
134
294k
    assert(w == 2);
135
294k
    *(uint16_t *)p0 = (uint16_t)_mm_cvtsi128_si32(r0);
136
294k
    *(uint16_t *)p1 = (uint16_t)_mm_cvtsi128_si32(r1);
137
294k
  }
138
1.71M
}
Unexecuted instantiation: convolve_avx2.c:sr_2d_ver_round_and_store_w4
Unexecuted instantiation: jnt_convolve_avx2.c:sr_2d_ver_round_and_store_w4
Unexecuted instantiation: wiener_convolve_avx2.c:sr_2d_ver_round_and_store_w4
Unexecuted instantiation: highbd_convolve_2d_avx2.c:sr_2d_ver_round_and_store_w4
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:sr_2d_ver_round_and_store_w4
139
140
#define CONVOLVE_SR_VER_FILTER_2TAP_W4                                        \
141
15.6k
  __m128i s[2];                                                               \
142
15.6k
  s[0] = _mm_loadl_epi64((__m128i *)(im_block + 0 * 4));                      \
143
15.6k
                                                                              \
144
59.7k
  for (i = 0; i < h; i += 2) {                                                \
145
44.0k
    const int16_t *data = &im_block[i * 4];                                   \
146
44.0k
    s[1] = _mm_loadl_epi64((__m128i *)(data + 1 * 4));                        \
147
44.0k
    const __m256i src_0 = _mm256_setr_m128i(s[0], s[1]);                      \
148
44.0k
    s[0] = _mm_loadl_epi64((__m128i *)(data + 2 * 4));                        \
149
44.0k
    const __m256i src_1 = _mm256_setr_m128i(s[1], s[0]);                      \
150
44.0k
    const __m256i ss = _mm256_unpacklo_epi16(src_0, src_1);                   \
151
44.0k
                                                                              \
152
44.0k
    const __m256i res = _mm256_madd_epi16(ss, coeffs_v[0]);                   \
153
44.0k
                                                                              \
154
44.0k
    sr_2d_ver_round_and_store_w4(w, res, dst_ptr, dst_stride, round_const_v); \
155
44.0k
    dst_ptr += 2 * dst_stride;                                                \
156
44.0k
  }
157
158
#define CONVOLVE_SR_VER_FILTER_4TAP_W4                                        \
159
377k
  __m128i s[4];                                                               \
160
377k
  __m256i ss[2];                                                              \
161
377k
  s[0] = _mm_loadl_epi64((__m128i *)(im_block + 0 * 4));                      \
162
377k
  s[1] = _mm_loadl_epi64((__m128i *)(im_block + 1 * 4));                      \
163
377k
  s[2] = _mm_loadl_epi64((__m128i *)(im_block + 2 * 4));                      \
164
377k
                                                                              \
165
377k
  const __m256i src_0 = _mm256_setr_m128i(s[0], s[1]);                        \
166
377k
  const __m256i src_1 = _mm256_setr_m128i(s[1], s[2]);                        \
167
377k
                                                                              \
168
377k
  ss[0] = _mm256_unpacklo_epi16(src_0, src_1);                                \
169
377k
                                                                              \
170
1.08M
  for (i = 0; i < h; i += 2) {                                                \
171
708k
    const int16_t *data = &im_block[i * 4];                                   \
172
708k
    s[3] = _mm_loadl_epi64((__m128i *)(data + 3 * 4));                        \
173
708k
    const __m256i src_2 = _mm256_setr_m128i(s[2], s[3]);                      \
174
708k
    s[2] = _mm_loadl_epi64((__m128i *)(data + 4 * 4));                        \
175
708k
    const __m256i src_3 = _mm256_setr_m128i(s[3], s[2]);                      \
176
708k
    ss[1] = _mm256_unpacklo_epi16(src_2, src_3);                              \
177
708k
                                                                              \
178
708k
    const __m256i res = convolve_4tap(ss, coeffs_v);                          \
179
708k
                                                                              \
180
708k
    sr_2d_ver_round_and_store_w4(w, res, dst_ptr, dst_stride, round_const_v); \
181
708k
    dst_ptr += 2 * dst_stride;                                                \
182
708k
                                                                              \
183
708k
    ss[0] = ss[1];                                                            \
184
708k
  }
185
186
#define CONVOLVE_SR_VER_FILTER_6TAP_W4                                        \
187
181k
  __m128i s[6];                                                               \
188
181k
  __m256i ss[3];                                                              \
189
181k
  s[0] = _mm_loadl_epi64((__m128i *)(im_block + 0 * 4));                      \
190
181k
  s[1] = _mm_loadl_epi64((__m128i *)(im_block + 1 * 4));                      \
191
181k
  s[2] = _mm_loadl_epi64((__m128i *)(im_block + 2 * 4));                      \
192
181k
  s[3] = _mm_loadl_epi64((__m128i *)(im_block + 3 * 4));                      \
193
181k
  s[4] = _mm_loadl_epi64((__m128i *)(im_block + 4 * 4));                      \
194
181k
                                                                              \
195
181k
  const __m256i src_0 = _mm256_setr_m128i(s[0], s[1]);                        \
196
181k
  const __m256i src_1 = _mm256_setr_m128i(s[1], s[2]);                        \
197
181k
  const __m256i src_2 = _mm256_setr_m128i(s[2], s[3]);                        \
198
181k
  const __m256i src_3 = _mm256_setr_m128i(s[3], s[4]);                        \
199
181k
                                                                              \
200
181k
  ss[0] = _mm256_unpacklo_epi16(src_0, src_1);                                \
201
181k
  ss[1] = _mm256_unpacklo_epi16(src_2, src_3);                                \
202
181k
                                                                              \
203
1.09M
  for (i = 0; i < h; i += 2) {                                                \
204
914k
    const int16_t *data = &im_block[i * 4];                                   \
205
914k
    s[5] = _mm_loadl_epi64((__m128i *)(data + 5 * 4));                        \
206
914k
    const __m256i src_4 = _mm256_setr_m128i(s[4], s[5]);                      \
207
914k
    s[4] = _mm_loadl_epi64((__m128i *)(data + 6 * 4));                        \
208
914k
    const __m256i src_5 = _mm256_setr_m128i(s[5], s[4]);                      \
209
914k
    ss[2] = _mm256_unpacklo_epi16(src_4, src_5);                              \
210
914k
                                                                              \
211
914k
    const __m256i res = convolve_6tap(ss, coeffs_v);                          \
212
914k
                                                                              \
213
914k
    sr_2d_ver_round_and_store_w4(w, res, dst_ptr, dst_stride, round_const_v); \
214
914k
    dst_ptr += 2 * dst_stride;                                                \
215
914k
                                                                              \
216
914k
    ss[0] = ss[1];                                                            \
217
914k
    ss[1] = ss[2];                                                            \
218
914k
  }
219
220
#define CONVOLVE_SR_VER_FILTER_8TAP_W4                                        \
221
9.94k
  __m128i s[8];                                                               \
222
9.94k
  __m256i ss[4];                                                              \
223
9.94k
  s[0] = _mm_loadl_epi64((__m128i *)(im_block + 0 * 4));                      \
224
9.94k
  s[1] = _mm_loadl_epi64((__m128i *)(im_block + 1 * 4));                      \
225
9.94k
  s[2] = _mm_loadl_epi64((__m128i *)(im_block + 2 * 4));                      \
226
9.94k
  s[3] = _mm_loadl_epi64((__m128i *)(im_block + 3 * 4));                      \
227
9.94k
  s[4] = _mm_loadl_epi64((__m128i *)(im_block + 4 * 4));                      \
228
9.94k
  s[5] = _mm_loadl_epi64((__m128i *)(im_block + 5 * 4));                      \
229
9.94k
  s[6] = _mm_loadl_epi64((__m128i *)(im_block + 6 * 4));                      \
230
9.94k
                                                                              \
231
9.94k
  const __m256i src_0 = _mm256_setr_m128i(s[0], s[1]);                        \
232
9.94k
  const __m256i src_1 = _mm256_setr_m128i(s[1], s[2]);                        \
233
9.94k
  const __m256i src_2 = _mm256_setr_m128i(s[2], s[3]);                        \
234
9.94k
  const __m256i src_3 = _mm256_setr_m128i(s[3], s[4]);                        \
235
9.94k
  const __m256i src_4 = _mm256_setr_m128i(s[4], s[5]);                        \
236
9.94k
  const __m256i src_5 = _mm256_setr_m128i(s[5], s[6]);                        \
237
9.94k
                                                                              \
238
9.94k
  ss[0] = _mm256_unpacklo_epi16(src_0, src_1);                                \
239
9.94k
  ss[1] = _mm256_unpacklo_epi16(src_2, src_3);                                \
240
9.94k
  ss[2] = _mm256_unpacklo_epi16(src_4, src_5);                                \
241
9.94k
                                                                              \
242
60.5k
  for (i = 0; i < h; i += 2) {                                                \
243
50.6k
    const int16_t *data = &im_block[i * 4];                                   \
244
50.6k
    s[7] = _mm_loadl_epi64((__m128i *)(data + 7 * 4));                        \
245
50.6k
    const __m256i src_6 = _mm256_setr_m128i(s[6], s[7]);                      \
246
50.6k
    s[6] = _mm_loadl_epi64((__m128i *)(data + 8 * 4));                        \
247
50.6k
    const __m256i src_7 = _mm256_setr_m128i(s[7], s[6]);                      \
248
50.6k
    ss[3] = _mm256_unpacklo_epi16(src_6, src_7);                              \
249
50.6k
                                                                              \
250
50.6k
    const __m256i res = convolve(ss, coeffs_v);                               \
251
50.6k
                                                                              \
252
50.6k
    sr_2d_ver_round_and_store_w4(w, res, dst_ptr, dst_stride, round_const_v); \
253
50.6k
    dst_ptr += 2 * dst_stride;                                                \
254
50.6k
                                                                              \
255
50.6k
    ss[0] = ss[1];                                                            \
256
50.6k
    ss[1] = ss[2];                                                            \
257
50.6k
    ss[2] = ss[3];                                                            \
258
50.6k
  }
259
260
#define CONVOLVE_SR_HORIZONTAL_FILTER(CONVOLVE_LOWBD)                 \
261
  for (i = 0; i < (im_h - 2); i += 2) {                               \
262
    __m256i data = _mm256_castsi128_si256(                            \
263
        _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j]));  \
264
    data = _mm256_inserti128_si256(                                   \
265
        data,                                                         \
266
        _mm_loadu_si128(                                              \
267
            (__m128i *)&src_ptr[(i * src_stride) + j + src_stride]),  \
268
        1);                                                           \
269
    __m256i res = CONVOLVE_LOWBD(data, coeffs_h, filt);               \
270
    res = _mm256_srai_epi16(_mm256_add_epi16(res, round_const_h), 2); \
271
    _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);     \
272
  }                                                                   \
273
  __m256i data_1 = _mm256_castsi128_si256(                            \
274
      _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j]));    \
275
  __m256i res = CONVOLVE_LOWBD(data_1, coeffs_h, filt);               \
276
  res = _mm256_srai_epi16(_mm256_add_epi16(res, round_const_h), 2);   \
277
  _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);
278
279
#define CONVOLVE_SR_HORIZONTAL_FILTER_2TAP \
280
  CONVOLVE_SR_HORIZONTAL_FILTER(convolve_lowbd_x_2tap)
281
282
#define CONVOLVE_SR_HORIZONTAL_FILTER_4TAP \
283
  CONVOLVE_SR_HORIZONTAL_FILTER(convolve_lowbd_x_4tap)
284
285
#define CONVOLVE_SR_HORIZONTAL_FILTER_6TAP \
286
  CONVOLVE_SR_HORIZONTAL_FILTER(convolve_lowbd_x_6tap)
287
288
#define CONVOLVE_SR_HORIZONTAL_FILTER_8TAP \
289
  CONVOLVE_SR_HORIZONTAL_FILTER(convolve_lowbd_x)
290
291
static inline void sr_2d_ver_round_and_store(__m256i res_a, __m256i res_b,
292
                                             uint8_t *dst, int dst_stride,
293
11.8M
                                             __m256i round_const_v) {
294
11.8M
  const __m256i res_a_round =
295
11.8M
      _mm256_srai_epi32(_mm256_add_epi32(res_a, round_const_v), 11);
296
11.8M
  const __m256i res_b_round =
297
11.8M
      _mm256_srai_epi32(_mm256_add_epi32(res_b, round_const_v), 11);
298
11.8M
  const __m256i r16 = _mm256_packs_epi32(res_a_round, res_b_round);
299
11.8M
  const __m256i r8 = _mm256_packus_epi16(r16, r16);
300
301
11.8M
  _mm_storel_epi64((__m128i *)dst, _mm256_castsi256_si128(r8));
302
11.8M
  _mm_storel_epi64((__m128i *)(dst + dst_stride),
303
11.8M
                   _mm256_extracti128_si256(r8, 1));
304
11.8M
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:sr_2d_ver_round_and_store
Unexecuted instantiation: highbd_convolve_avx2.c:sr_2d_ver_round_and_store
convolve_2d_avx2.c:sr_2d_ver_round_and_store
Line
Count
Source
293
11.8M
                                             __m256i round_const_v) {
294
11.8M
  const __m256i res_a_round =
295
11.8M
      _mm256_srai_epi32(_mm256_add_epi32(res_a, round_const_v), 11);
296
11.8M
  const __m256i res_b_round =
297
11.8M
      _mm256_srai_epi32(_mm256_add_epi32(res_b, round_const_v), 11);
298
11.8M
  const __m256i r16 = _mm256_packs_epi32(res_a_round, res_b_round);
299
11.8M
  const __m256i r8 = _mm256_packus_epi16(r16, r16);
300
301
11.8M
  _mm_storel_epi64((__m128i *)dst, _mm256_castsi256_si128(r8));
302
11.8M
  _mm_storel_epi64((__m128i *)(dst + dst_stride),
303
                   _mm256_extracti128_si256(r8, 1));
304
11.8M
}
Unexecuted instantiation: convolve_avx2.c:sr_2d_ver_round_and_store
Unexecuted instantiation: jnt_convolve_avx2.c:sr_2d_ver_round_and_store
Unexecuted instantiation: wiener_convolve_avx2.c:sr_2d_ver_round_and_store
Unexecuted instantiation: highbd_convolve_2d_avx2.c:sr_2d_ver_round_and_store
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:sr_2d_ver_round_and_store
305
306
#define CONVOLVE_SR_VERTICAL_FILTER_2TAP                                      \
307
433k
  for (i = 0; i < h; i += 2) {                                                \
308
403k
    __m256i s[2];                                                             \
309
403k
    const int16_t *data = &im_block[i * im_stride];                           \
310
403k
    const __m256i s1 = _mm256_loadu_si256((__m256i *)(data + 0 * im_stride)); \
311
403k
    const __m256i s2 = _mm256_loadu_si256((__m256i *)(data + 1 * im_stride)); \
312
403k
    s[0] = _mm256_unpacklo_epi16(s1, s2);                                     \
313
403k
    s[1] = _mm256_unpackhi_epi16(s1, s2);                                     \
314
403k
                                                                              \
315
403k
    __m256i res_a = _mm256_madd_epi16(s[0], coeffs_v[0]);                     \
316
403k
    __m256i res_b = _mm256_madd_epi16(s[1], coeffs_v[0]);                     \
317
403k
                                                                              \
318
403k
    sr_2d_ver_round_and_store(res_a, res_b, dst_ptr, dst_stride,              \
319
403k
                              round_const_v);                                 \
320
403k
    dst_ptr += 2 * dst_stride;                                                \
321
403k
  }
322
323
#define CONVOLVE_SR_VERTICAL_FILTER_4TAP                                      \
324
546k
  __m256i s[6];                                                               \
325
546k
  __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));  \
326
546k
  __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));  \
327
546k
                                                                              \
328
546k
  s[0] = _mm256_unpacklo_epi16(src_0, src_1);                                 \
329
546k
  s[2] = _mm256_unpackhi_epi16(src_0, src_1);                                 \
330
546k
                                                                              \
331
2.09M
  for (i = 0; i < h; i += 2) {                                                \
332
1.54M
    const int16_t *data = &im_block[i * im_stride];                           \
333
1.54M
    const __m256i s4 = _mm256_loadu_si256((__m256i *)(data + 2 * im_stride)); \
334
1.54M
    const __m256i s5 = _mm256_loadu_si256((__m256i *)(data + 3 * im_stride)); \
335
1.54M
    s[1] = _mm256_unpacklo_epi16(s4, s5);                                     \
336
1.54M
    s[3] = _mm256_unpackhi_epi16(s4, s5);                                     \
337
1.54M
                                                                              \
338
1.54M
    __m256i res_a = convolve_4tap(s, coeffs_v);                               \
339
1.54M
    __m256i res_b = convolve_4tap(s + 2, coeffs_v);                           \
340
1.54M
                                                                              \
341
1.54M
    sr_2d_ver_round_and_store(res_a, res_b, dst_ptr, dst_stride,              \
342
1.54M
                              round_const_v);                                 \
343
1.54M
    dst_ptr += 2 * dst_stride;                                                \
344
1.54M
                                                                              \
345
1.54M
    s[0] = s[1];                                                              \
346
1.54M
    s[2] = s[3];                                                              \
347
1.54M
  }
348
349
#define CONVOLVE_SR_VERTICAL_FILTER_6TAP                                      \
350
766k
  __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));  \
351
766k
  __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));  \
352
766k
  __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));  \
353
766k
  __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));  \
354
766k
                                                                              \
355
766k
  __m256i s[8];                                                               \
356
766k
  s[0] = _mm256_unpacklo_epi16(src_0, src_1);                                 \
357
766k
  s[1] = _mm256_unpacklo_epi16(src_2, src_3);                                 \
358
766k
                                                                              \
359
766k
  s[3] = _mm256_unpackhi_epi16(src_0, src_1);                                 \
360
766k
  s[4] = _mm256_unpackhi_epi16(src_2, src_3);                                 \
361
766k
                                                                              \
362
9.24M
  for (i = 0; i < h; i += 2) {                                                \
363
8.48M
    const int16_t *data = &im_block[i * im_stride];                           \
364
8.48M
                                                                              \
365
8.48M
    const __m256i s6 = _mm256_loadu_si256((__m256i *)(data + 4 * im_stride)); \
366
8.48M
    const __m256i s7 = _mm256_loadu_si256((__m256i *)(data + 5 * im_stride)); \
367
8.48M
                                                                              \
368
8.48M
    s[2] = _mm256_unpacklo_epi16(s6, s7);                                     \
369
8.48M
    s[5] = _mm256_unpackhi_epi16(s6, s7);                                     \
370
8.48M
                                                                              \
371
8.48M
    __m256i res_a = convolve_6tap(s, coeffs_v);                               \
372
8.48M
    __m256i res_b = convolve_6tap(s + 3, coeffs_v);                           \
373
8.48M
                                                                              \
374
8.48M
    sr_2d_ver_round_and_store(res_a, res_b, dst_ptr, dst_stride,              \
375
8.48M
                              round_const_v);                                 \
376
8.48M
    dst_ptr += 2 * dst_stride;                                                \
377
8.48M
                                                                              \
378
8.48M
    s[0] = s[1];                                                              \
379
8.48M
    s[1] = s[2];                                                              \
380
8.48M
                                                                              \
381
8.48M
    s[3] = s[4];                                                              \
382
8.48M
    s[4] = s[5];                                                              \
383
8.48M
  }
384
385
#define CONVOLVE_SR_VERTICAL_FILTER_8TAP                                      \
386
115k
  __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));  \
387
115k
  __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));  \
388
115k
  __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));  \
389
115k
  __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));  \
390
115k
  __m256i src_4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride));  \
391
115k
  __m256i src_5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride));  \
392
115k
                                                                              \
393
115k
  __m256i s[8];                                                               \
394
115k
  s[0] = _mm256_unpacklo_epi16(src_0, src_1);                                 \
395
115k
  s[1] = _mm256_unpacklo_epi16(src_2, src_3);                                 \
396
115k
  s[2] = _mm256_unpacklo_epi16(src_4, src_5);                                 \
397
115k
                                                                              \
398
115k
  s[4] = _mm256_unpackhi_epi16(src_0, src_1);                                 \
399
115k
  s[5] = _mm256_unpackhi_epi16(src_2, src_3);                                 \
400
115k
  s[6] = _mm256_unpackhi_epi16(src_4, src_5);                                 \
401
115k
                                                                              \
402
1.53M
  for (i = 0; i < h; i += 2) {                                                \
403
1.41M
    const int16_t *data = &im_block[i * im_stride];                           \
404
1.41M
                                                                              \
405
1.41M
    const __m256i s6 = _mm256_loadu_si256((__m256i *)(data + 6 * im_stride)); \
406
1.41M
    const __m256i s7 = _mm256_loadu_si256((__m256i *)(data + 7 * im_stride)); \
407
1.41M
                                                                              \
408
1.41M
    s[3] = _mm256_unpacklo_epi16(s6, s7);                                     \
409
1.41M
    s[7] = _mm256_unpackhi_epi16(s6, s7);                                     \
410
1.41M
                                                                              \
411
1.41M
    __m256i res_a = convolve(s, coeffs_v);                                    \
412
1.41M
    __m256i res_b = convolve(s + 4, coeffs_v);                                \
413
1.41M
                                                                              \
414
1.41M
    sr_2d_ver_round_and_store(res_a, res_b, dst_ptr, dst_stride,              \
415
1.41M
                              round_const_v);                                 \
416
1.41M
    dst_ptr += 2 * dst_stride;                                                \
417
1.41M
                                                                              \
418
1.41M
    s[0] = s[1];                                                              \
419
1.41M
    s[1] = s[2];                                                              \
420
1.41M
    s[2] = s[3];                                                              \
421
1.41M
                                                                              \
422
1.41M
    s[4] = s[5];                                                              \
423
1.41M
    s[5] = s[6];                                                              \
424
1.41M
    s[6] = s[7];                                                              \
425
1.41M
  }
426
427
#define CONVOLVE_SR_VERTICAL_FILTER_12TAP                                      \
428
0
  __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));   \
429
0
  __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));   \
430
0
  __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));   \
431
0
  __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));   \
432
0
  __m256i src_4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride));   \
433
0
  __m256i src_5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride));   \
434
0
  __m256i src_6 = _mm256_loadu_si256((__m256i *)(im_block + 6 * im_stride));   \
435
0
  __m256i src_7 = _mm256_loadu_si256((__m256i *)(im_block + 7 * im_stride));   \
436
0
  __m256i src_8 = _mm256_loadu_si256((__m256i *)(im_block + 8 * im_stride));   \
437
0
  __m256i src_9 = _mm256_loadu_si256((__m256i *)(im_block + 9 * im_stride));   \
438
0
                                                                               \
439
0
  s[0] = _mm256_unpacklo_epi16(src_0, src_1);                                  \
440
0
  s[1] = _mm256_unpacklo_epi16(src_2, src_3);                                  \
441
0
  s[2] = _mm256_unpacklo_epi16(src_4, src_5);                                  \
442
0
  s[3] = _mm256_unpacklo_epi16(src_6, src_7);                                  \
443
0
  s[4] = _mm256_unpacklo_epi16(src_8, src_9);                                  \
444
0
                                                                               \
445
0
  s[6] = _mm256_unpackhi_epi16(src_0, src_1);                                  \
446
0
  s[7] = _mm256_unpackhi_epi16(src_2, src_3);                                  \
447
0
  s[8] = _mm256_unpackhi_epi16(src_4, src_5);                                  \
448
0
  s[9] = _mm256_unpackhi_epi16(src_6, src_7);                                  \
449
0
  s[10] = _mm256_unpackhi_epi16(src_8, src_9);                                 \
450
0
                                                                               \
451
0
  for (i = 0; i < h; i += 2) {                                                 \
452
0
    const int16_t *data = &im_block[i * im_stride];                            \
453
0
                                                                               \
454
0
    const __m256i s6 = _mm256_loadu_si256((__m256i *)(data + 10 * im_stride)); \
455
0
    const __m256i s7 = _mm256_loadu_si256((__m256i *)(data + 11 * im_stride)); \
456
0
                                                                               \
457
0
    s[5] = _mm256_unpacklo_epi16(s6, s7);                                      \
458
0
    s[11] = _mm256_unpackhi_epi16(s6, s7);                                     \
459
0
                                                                               \
460
0
    __m256i res_a = convolve_12taps(s, coeffs_v);                              \
461
0
    __m256i res_b = convolve_12taps(s + 6, coeffs_v);                          \
462
0
                                                                               \
463
0
    res_a =                                                                    \
464
0
        _mm256_sra_epi32(_mm256_add_epi32(res_a, sum_round_v), sum_shift_v);   \
465
0
    res_b =                                                                    \
466
0
        _mm256_sra_epi32(_mm256_add_epi32(res_b, sum_round_v), sum_shift_v);   \
467
0
                                                                               \
468
0
    const __m256i res_a_round = _mm256_sra_epi32(                              \
469
0
        _mm256_add_epi32(res_a, round_const_v), round_shift_v);                \
470
0
    const __m256i res_b_round = _mm256_sra_epi32(                              \
471
0
        _mm256_add_epi32(res_b, round_const_v), round_shift_v);                \
472
0
                                                                               \
473
0
    const __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round);    \
474
0
    const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit);          \
475
0
                                                                               \
476
0
    const __m128i res_0 = _mm256_castsi256_si128(res_8b);                      \
477
0
    const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);                 \
478
0
                                                                               \
479
0
    __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];                  \
480
0
    __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride];     \
481
0
    if (w - j > 4) {                                                           \
482
0
      _mm_storel_epi64(p_0, res_0);                                            \
483
0
      _mm_storel_epi64(p_1, res_1);                                            \
484
0
    } else if (w == 4) {                                                       \
485
0
      xx_storel_32(p_0, res_0);                                                \
486
0
      xx_storel_32(p_1, res_1);                                                \
487
0
    } else {                                                                   \
488
0
      *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);                   \
489
0
      *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);                   \
490
0
    }                                                                          \
491
0
                                                                               \
492
0
    s[0] = s[1];                                                               \
493
0
    s[1] = s[2];                                                               \
494
0
    s[2] = s[3];                                                               \
495
0
    s[3] = s[4];                                                               \
496
0
    s[4] = s[5];                                                               \
497
0
                                                                               \
498
0
    s[6] = s[7];                                                               \
499
0
    s[7] = s[8];                                                               \
500
0
    s[8] = s[9];                                                               \
501
0
    s[9] = s[10];                                                              \
502
0
    s[10] = s[11];                                                             \
503
0
  }
504
505
#define JNT_CONVOLVE_PROCESS_OUTPUT(res_unsigned, j_off)                       \
506
2.65M
  do {                                                                         \
507
2.65M
    if (do_average) {                                                          \
508
1.10M
      const __m256i data_ref_0 =                                               \
509
1.10M
          load_line2_avx2(&dst[i * dst_stride + (j_off)],                      \
510
1.10M
                          &dst[i * dst_stride + (j_off) + dst_stride]);        \
511
1.10M
      const __m256i comp_avg_res =                                             \
512
1.10M
          comp_avg(&data_ref_0, &(res_unsigned), &wt, use_dist_wtd_comp_avg);  \
513
1.10M
      const __m256i res_signed = _mm256_sub_epi16(comp_avg_res, offset_const); \
514
1.10M
      const __m256i round_result =                                             \
515
1.10M
          _mm256_srai_epi16(_mm256_add_epi16(res_signed, rounding_const), 4);  \
516
1.10M
      const __m256i res_8 = _mm256_packus_epi16(round_result, round_result);   \
517
1.10M
      const __m128i res_0 = _mm256_castsi256_si128(res_8);                     \
518
1.10M
      const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);                \
519
1.10M
      if (w - (j_off) > 4) {                                                   \
520
1.06M
        _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + (j_off)]),        \
521
1.06M
                         res_0);                                               \
522
1.06M
        _mm_storel_epi64(                                                      \
523
1.06M
            (__m128i *)(&dst0[i * dst_stride0 + (j_off) + dst_stride0]),       \
524
1.06M
            res_1);                                                            \
525
1.06M
      } else {                                                                 \
526
34.9k
        *(int *)(&dst0[i * dst_stride0 + (j_off)]) = _mm_cvtsi128_si32(res_0); \
527
34.9k
        *(int *)(&dst0[i * dst_stride0 + (j_off) + dst_stride0]) =             \
528
34.9k
            _mm_cvtsi128_si32(res_1);                                          \
529
34.9k
      }                                                                        \
530
1.55M
    } else {                                                                   \
531
1.55M
      const __m128i res_0 = _mm256_castsi256_si128(res_unsigned);              \
532
1.55M
      _mm_store_si128((__m128i *)(&dst[i * dst_stride + (j_off)]), res_0);     \
533
1.55M
      const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1);         \
534
1.55M
      _mm_store_si128(                                                         \
535
1.55M
          (__m128i *)(&dst[i * dst_stride + (j_off) + dst_stride]), res_1);    \
536
1.55M
    }                                                                          \
537
2.65M
  } while (0)
538
539
#define JNT_CONVOLVE_HORIZONTAL_FILTER(src_h_start, convolve_fn, coeffs) \
540
353k
  do {                                                                   \
541
353k
    const uint8_t *src_h = (src_h_start);                                \
542
4.90M
    for (i = 0; i < im_h; i += 2) {                                      \
543
4.55M
      const __m256i data = load_line2_avx2(src_h, src_h + src_stride);   \
544
4.55M
      src_h += (src_stride << 1);                                        \
545
4.55M
      __m256i res = convolve_fn(data, coeffs, filt);                     \
546
4.55M
      res = _mm256_srai_epi16(_mm256_add_epi16(res, round_const_h), 2);  \
547
4.55M
      _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);      \
548
4.55M
    }                                                                    \
549
353k
  } while (0)
550
551
#define JNT_CONVOLVE_VERTICAL_FILTER_8TAP                                     \
552
238k
  do {                                                                        \
553
238k
    __m256i s[8];                                                             \
554
238k
    __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));   \
555
238k
    __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));   \
556
238k
    __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));   \
557
238k
    __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));   \
558
238k
    __m256i s4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride));   \
559
238k
    __m256i s5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride));   \
560
238k
                                                                              \
561
238k
    s[0] = _mm256_unpacklo_epi16(s0, s1);                                     \
562
238k
    s[1] = _mm256_unpacklo_epi16(s2, s3);                                     \
563
238k
    s[2] = _mm256_unpacklo_epi16(s4, s5);                                     \
564
238k
                                                                              \
565
238k
    s[4] = _mm256_unpackhi_epi16(s0, s1);                                     \
566
238k
    s[5] = _mm256_unpackhi_epi16(s2, s3);                                     \
567
238k
    s[6] = _mm256_unpackhi_epi16(s4, s5);                                     \
568
238k
                                                                              \
569
2.89M
    for (i = 0; i < h; i += 2) {                                              \
570
2.65M
      const int16_t *data = &im_block[i * im_stride];                         \
571
2.65M
                                                                              \
572
2.65M
      const __m256i s6 =                                                      \
573
2.65M
          _mm256_loadu_si256((__m256i *)(data + 6 * im_stride));              \
574
2.65M
      const __m256i s7 =                                                      \
575
2.65M
          _mm256_loadu_si256((__m256i *)(data + 7 * im_stride));              \
576
2.65M
                                                                              \
577
2.65M
      s[3] = _mm256_unpacklo_epi16(s6, s7);                                   \
578
2.65M
      s[7] = _mm256_unpackhi_epi16(s6, s7);                                   \
579
2.65M
                                                                              \
580
2.65M
      const __m256i res_a = convolve(s, coeffs_y);                            \
581
2.65M
      const __m256i res_a_round =                                             \
582
2.65M
          _mm256_srai_epi32(_mm256_add_epi32(res_a, round_const_v), 7);       \
583
2.65M
                                                                              \
584
2.65M
      if (w - j > 4) {                                                        \
585
2.57M
        const __m256i res_b = convolve(s + 4, coeffs_y);                      \
586
2.57M
        const __m256i res_b_round =                                           \
587
2.57M
            _mm256_srai_epi32(_mm256_add_epi32(res_b, round_const_v), 7);     \
588
2.57M
        const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_b_round); \
589
2.57M
        const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const); \
590
2.57M
        JNT_CONVOLVE_PROCESS_OUTPUT(res_unsigned, j);                         \
591
2.57M
      } else {                                                                \
592
78.4k
        const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_a_round); \
593
78.4k
        const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const); \
594
78.4k
        JNT_CONVOLVE_PROCESS_OUTPUT(res_unsigned, j);                         \
595
78.4k
      }                                                                       \
596
2.65M
                                                                              \
597
2.65M
      s[0] = s[1];                                                            \
598
2.65M
      s[1] = s[2];                                                            \
599
2.65M
      s[2] = s[3];                                                            \
600
2.65M
                                                                              \
601
2.65M
      s[4] = s[5];                                                            \
602
2.65M
      s[5] = s[6];                                                            \
603
2.65M
      s[6] = s[7];                                                            \
604
2.65M
    }                                                                         \
605
238k
  } while (0)
606
607
static inline void prepare_coeffs_2t_ssse3(
608
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
609
34.7k
    __m128i *const coeffs /* [4] */) {
610
34.7k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
611
34.7k
      filter_params, subpel_q4 & SUBPEL_MASK);
612
34.7k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
613
614
  // right shift all filter co-efficients by 1 to reduce the bits required.
615
  // This extra right shift will be taken care of at the end while rounding
616
  // the result.
617
  // Since all filter co-efficients are even, this change will not affect the
618
  // end result
619
34.7k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
620
34.7k
                            _mm_set1_epi16((short)0xffff)));
621
622
34.7k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
623
624
  // coeffs 3 4 3 4 3 4 3 4
625
34.7k
  coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0806u));
626
34.7k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:prepare_coeffs_2t_ssse3
Unexecuted instantiation: highbd_convolve_avx2.c:prepare_coeffs_2t_ssse3
convolve_2d_avx2.c:prepare_coeffs_2t_ssse3
Line
Count
Source
609
15.6k
    __m128i *const coeffs /* [4] */) {
610
15.6k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
611
15.6k
      filter_params, subpel_q4 & SUBPEL_MASK);
612
15.6k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
613
614
  // right shift all filter co-efficients by 1 to reduce the bits required.
615
  // This extra right shift will be taken care of at the end while rounding
616
  // the result.
617
  // Since all filter co-efficients are even, this change will not affect the
618
  // end result
619
15.6k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
620
15.6k
                            _mm_set1_epi16((short)0xffff)));
621
622
15.6k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
623
624
  // coeffs 3 4 3 4 3 4 3 4
625
15.6k
  coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0806u));
626
15.6k
}
convolve_avx2.c:prepare_coeffs_2t_ssse3
Line
Count
Source
609
19.0k
    __m128i *const coeffs /* [4] */) {
610
19.0k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
611
19.0k
      filter_params, subpel_q4 & SUBPEL_MASK);
612
19.0k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
613
614
  // right shift all filter co-efficients by 1 to reduce the bits required.
615
  // This extra right shift will be taken care of at the end while rounding
616
  // the result.
617
  // Since all filter co-efficients are even, this change will not affect the
618
  // end result
619
19.0k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
620
19.0k
                            _mm_set1_epi16((short)0xffff)));
621
622
19.0k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
623
624
  // coeffs 3 4 3 4 3 4 3 4
625
19.0k
  coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0806u));
626
19.0k
}
Unexecuted instantiation: jnt_convolve_avx2.c:prepare_coeffs_2t_ssse3
Unexecuted instantiation: wiener_convolve_avx2.c:prepare_coeffs_2t_ssse3
Unexecuted instantiation: highbd_convolve_2d_avx2.c:prepare_coeffs_2t_ssse3
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:prepare_coeffs_2t_ssse3
627
628
static inline void prepare_coeffs_4t_ssse3(
629
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
630
854k
    __m128i *const coeffs /* [4] */) {
631
854k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
632
854k
      filter_params, subpel_q4 & SUBPEL_MASK);
633
854k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
634
635
  // right shift all filter co-efficients by 1 to reduce the bits required.
636
  // This extra right shift will be taken care of at the end while rounding
637
  // the result.
638
  // Since all filter co-efficients are even, this change will not affect the
639
  // end result
640
854k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
641
854k
                            _mm_set1_epi16((short)0xffff)));
642
643
854k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
644
645
  // coeffs 2 3 2 3 2 3 2 3
646
854k
  coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0604u));
647
  // coeffs 4 5 4 5 4 5 4 5
648
854k
  coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0a08u));
649
854k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:prepare_coeffs_4t_ssse3
Unexecuted instantiation: highbd_convolve_avx2.c:prepare_coeffs_4t_ssse3
convolve_2d_avx2.c:prepare_coeffs_4t_ssse3
Line
Count
Source
630
568k
    __m128i *const coeffs /* [4] */) {
631
568k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
632
568k
      filter_params, subpel_q4 & SUBPEL_MASK);
633
568k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
634
635
  // right shift all filter co-efficients by 1 to reduce the bits required.
636
  // This extra right shift will be taken care of at the end while rounding
637
  // the result.
638
  // Since all filter co-efficients are even, this change will not affect the
639
  // end result
640
568k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
641
568k
                            _mm_set1_epi16((short)0xffff)));
642
643
568k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
644
645
  // coeffs 2 3 2 3 2 3 2 3
646
568k
  coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0604u));
647
  // coeffs 4 5 4 5 4 5 4 5
648
568k
  coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0a08u));
649
568k
}
convolve_avx2.c:prepare_coeffs_4t_ssse3
Line
Count
Source
630
285k
    __m128i *const coeffs /* [4] */) {
631
285k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
632
285k
      filter_params, subpel_q4 & SUBPEL_MASK);
633
285k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
634
635
  // right shift all filter co-efficients by 1 to reduce the bits required.
636
  // This extra right shift will be taken care of at the end while rounding
637
  // the result.
638
  // Since all filter co-efficients are even, this change will not affect the
639
  // end result
640
285k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
641
285k
                            _mm_set1_epi16((short)0xffff)));
642
643
285k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
644
645
  // coeffs 2 3 2 3 2 3 2 3
646
285k
  coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0604u));
647
  // coeffs 4 5 4 5 4 5 4 5
648
285k
  coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0a08u));
649
285k
}
Unexecuted instantiation: jnt_convolve_avx2.c:prepare_coeffs_4t_ssse3
Unexecuted instantiation: wiener_convolve_avx2.c:prepare_coeffs_4t_ssse3
Unexecuted instantiation: highbd_convolve_2d_avx2.c:prepare_coeffs_4t_ssse3
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:prepare_coeffs_4t_ssse3
650
651
static inline void prepare_coeffs_6t_ssse3(
652
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
653
67.2k
    __m128i *const coeffs /* [4] */) {
654
67.2k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
655
67.2k
      filter_params, subpel_q4 & SUBPEL_MASK);
656
67.2k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
657
658
  // right shift all filter co-efficients by 1 to reduce the bits required.
659
  // This extra right shift will be taken care of at the end while rounding
660
  // the result.
661
  // Since all filter co-efficients are even, this change will not affect the
662
  // end result
663
67.2k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
664
67.2k
                            _mm_set1_epi16((short)0xffff)));
665
666
67.2k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
667
668
  // coeffs 1 2 1 2 1 2 1 2
669
67.2k
  coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0402u));
670
  // coeffs 3 4 3 4 3 4 3 4
671
67.2k
  coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0806u));
672
  // coeffs 5 6 5 6 5 6 5 6
673
67.2k
  coeffs[2] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0c0au));
674
67.2k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:prepare_coeffs_6t_ssse3
Unexecuted instantiation: highbd_convolve_avx2.c:prepare_coeffs_6t_ssse3
Unexecuted instantiation: convolve_2d_avx2.c:prepare_coeffs_6t_ssse3
convolve_avx2.c:prepare_coeffs_6t_ssse3
Line
Count
Source
653
67.2k
    __m128i *const coeffs /* [4] */) {
654
67.2k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
655
67.2k
      filter_params, subpel_q4 & SUBPEL_MASK);
656
67.2k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
657
658
  // right shift all filter co-efficients by 1 to reduce the bits required.
659
  // This extra right shift will be taken care of at the end while rounding
660
  // the result.
661
  // Since all filter co-efficients are even, this change will not affect the
662
  // end result
663
67.2k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
664
67.2k
                            _mm_set1_epi16((short)0xffff)));
665
666
67.2k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
667
668
  // coeffs 1 2 1 2 1 2 1 2
669
67.2k
  coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0402u));
670
  // coeffs 3 4 3 4 3 4 3 4
671
67.2k
  coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0806u));
672
  // coeffs 5 6 5 6 5 6 5 6
673
67.2k
  coeffs[2] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0c0au));
674
67.2k
}
Unexecuted instantiation: jnt_convolve_avx2.c:prepare_coeffs_6t_ssse3
Unexecuted instantiation: wiener_convolve_avx2.c:prepare_coeffs_6t_ssse3
Unexecuted instantiation: highbd_convolve_2d_avx2.c:prepare_coeffs_6t_ssse3
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:prepare_coeffs_6t_ssse3
675
676
static inline void prepare_coeffs_ssse3(
677
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
678
5.89k
    __m128i *const coeffs /* [4] */) {
679
5.89k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
680
5.89k
      filter_params, subpel_q4 & SUBPEL_MASK);
681
5.89k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
682
683
  // right shift all filter co-efficients by 1 to reduce the bits required.
684
  // This extra right shift will be taken care of at the end while rounding
685
  // the result.
686
  // Since all filter co-efficients are even, this change will not affect the
687
  // end result
688
5.89k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
689
5.89k
                            _mm_set1_epi16((short)0xffff)));
690
691
5.89k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
692
693
  // coeffs 0 1 0 1 0 1 0 1
694
5.89k
  coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0200u));
695
  // coeffs 2 3 2 3 2 3 2 3
696
5.89k
  coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0604u));
697
  // coeffs 4 5 4 5 4 5 4 5
698
5.89k
  coeffs[2] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0a08u));
699
  // coeffs 6 7 6 7 6 7 6 7
700
5.89k
  coeffs[3] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0e0cu));
701
5.89k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:prepare_coeffs_ssse3
Unexecuted instantiation: highbd_convolve_avx2.c:prepare_coeffs_ssse3
Unexecuted instantiation: convolve_2d_avx2.c:prepare_coeffs_ssse3
convolve_avx2.c:prepare_coeffs_ssse3
Line
Count
Source
678
5.89k
    __m128i *const coeffs /* [4] */) {
679
5.89k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
680
5.89k
      filter_params, subpel_q4 & SUBPEL_MASK);
681
5.89k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
682
683
  // right shift all filter co-efficients by 1 to reduce the bits required.
684
  // This extra right shift will be taken care of at the end while rounding
685
  // the result.
686
  // Since all filter co-efficients are even, this change will not affect the
687
  // end result
688
5.89k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
689
5.89k
                            _mm_set1_epi16((short)0xffff)));
690
691
5.89k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
692
693
  // coeffs 0 1 0 1 0 1 0 1
694
5.89k
  coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0200u));
695
  // coeffs 2 3 2 3 2 3 2 3
696
5.89k
  coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0604u));
697
  // coeffs 4 5 4 5 4 5 4 5
698
5.89k
  coeffs[2] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0a08u));
699
  // coeffs 6 7 6 7 6 7 6 7
700
5.89k
  coeffs[3] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0e0cu));
701
5.89k
}
Unexecuted instantiation: jnt_convolve_avx2.c:prepare_coeffs_ssse3
Unexecuted instantiation: wiener_convolve_avx2.c:prepare_coeffs_ssse3
Unexecuted instantiation: highbd_convolve_2d_avx2.c:prepare_coeffs_ssse3
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:prepare_coeffs_ssse3
702
703
static inline void prepare_coeffs_2t_lowbd(
704
    const InterpFilterParams *const filter_params, const int subpel_q4,
705
25.1k
    __m256i *const coeffs /* [4] */) {
706
25.1k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
707
25.1k
      filter_params, subpel_q4 & SUBPEL_MASK);
708
25.1k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
709
25.1k
  const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);
710
711
  // right shift all filter co-efficients by 1 to reduce the bits required.
712
  // This extra right shift will be taken care of at the end while rounding
713
  // the result.
714
  // Since all filter co-efficients are even, this change will not affect the
715
  // end result
716
25.1k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
717
25.1k
                            _mm_set1_epi16((int16_t)0xffff)));
718
719
25.1k
  const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);
720
721
  // coeffs 3 4 3 4 3 4 3 4
722
25.1k
  coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0806u));
723
25.1k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:prepare_coeffs_2t_lowbd
Unexecuted instantiation: highbd_convolve_avx2.c:prepare_coeffs_2t_lowbd
convolve_2d_avx2.c:prepare_coeffs_2t_lowbd
Line
Count
Source
705
14.7k
    __m256i *const coeffs /* [4] */) {
706
14.7k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
707
14.7k
      filter_params, subpel_q4 & SUBPEL_MASK);
708
14.7k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
709
14.7k
  const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);
710
711
  // right shift all filter co-efficients by 1 to reduce the bits required.
712
  // This extra right shift will be taken care of at the end while rounding
713
  // the result.
714
  // Since all filter co-efficients are even, this change will not affect the
715
  // end result
716
14.7k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
717
14.7k
                            _mm_set1_epi16((int16_t)0xffff)));
718
719
14.7k
  const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);
720
721
  // coeffs 3 4 3 4 3 4 3 4
722
14.7k
  coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0806u));
723
14.7k
}
convolve_avx2.c:prepare_coeffs_2t_lowbd
Line
Count
Source
705
10.3k
    __m256i *const coeffs /* [4] */) {
706
10.3k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
707
10.3k
      filter_params, subpel_q4 & SUBPEL_MASK);
708
10.3k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
709
10.3k
  const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);
710
711
  // right shift all filter co-efficients by 1 to reduce the bits required.
712
  // This extra right shift will be taken care of at the end while rounding
713
  // the result.
714
  // Since all filter co-efficients are even, this change will not affect the
715
  // end result
716
10.3k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
717
10.3k
                            _mm_set1_epi16((int16_t)0xffff)));
718
719
10.3k
  const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);
720
721
  // coeffs 3 4 3 4 3 4 3 4
722
10.3k
  coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0806u));
723
10.3k
}
Unexecuted instantiation: jnt_convolve_avx2.c:prepare_coeffs_2t_lowbd
Unexecuted instantiation: wiener_convolve_avx2.c:prepare_coeffs_2t_lowbd
Unexecuted instantiation: highbd_convolve_2d_avx2.c:prepare_coeffs_2t_lowbd
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:prepare_coeffs_2t_lowbd
724
725
// Prepares the packed byte coefficients for taps 2..5 of the interpolation
// kernel selected by subpel_q4 (masked with SUBPEL_MASK).
// The eight 16-bit kernel values are loaded, copied into both 128-bit lanes,
// halved, and byte-shuffled so that every 16-bit element of an output vector
// holds the low bytes of two adjacent coefficients.
// Only coeffs[0] and coeffs[1] are written.
// NOTE(review): keeping only the low byte assumes every halved coefficient
// fits in a signed 8-bit value - confirm against the filter tables.
static inline void prepare_coeffs_4t_lowbd(
726
    const InterpFilterParams *const filter_params, const int subpel_q4,
727
186k
    __m256i *const coeffs /* [4] */) {
728
186k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
729
186k
      filter_params, subpel_q4 & SUBPEL_MASK);
730
186k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
731
186k
  const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);
732
733
  // Right shift all filter coefficients by 1 to reduce the bits required.
734
  // This extra right shift will be taken care of at the end while rounding
735
  // the result.
736
  // Since all filter coefficients are even, this change will not affect the
737
  // end result. The assert checks that bit 0 of every coefficient is clear.
738
186k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
739
186k
                            _mm_set1_epi16((short)0xffff)));
740
741
186k
  const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);
742
743
  // coeffs 2 3 2 3 2 3 2 3 (shuffle picks bytes 4 and 6 of each lane)
744
186k
  coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0604u));
745
  // coeffs 4 5 4 5 4 5 4 5 (shuffle picks bytes 8 and 10 of each lane)
746
186k
  coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0a08u));
747
186k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:prepare_coeffs_4t_lowbd
Unexecuted instantiation: highbd_convolve_avx2.c:prepare_coeffs_4t_lowbd
convolve_2d_avx2.c:prepare_coeffs_4t_lowbd
Line
Count
Source
727
38.0k
    __m256i *const coeffs /* [4] */) {
728
38.0k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
729
38.0k
      filter_params, subpel_q4 & SUBPEL_MASK);
730
38.0k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
731
38.0k
  const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);
732
733
  // right shift all filter co-efficients by 1 to reduce the bits required.
734
  // This extra right shift will be taken care of at the end while rounding
735
  // the result.
736
  // Since all filter co-efficients are even, this change will not affect the
737
  // end result
738
38.0k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
739
38.0k
                            _mm_set1_epi16((short)0xffff)));
740
741
38.0k
  const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);
742
743
  // coeffs 2 3 2 3 2 3 2 3
744
38.0k
  coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0604u));
745
  // coeffs 4 5 4 5 4 5 4 5
746
38.0k
  coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0a08u));
747
38.0k
}
convolve_avx2.c:prepare_coeffs_4t_lowbd
Line
Count
Source
727
148k
    __m256i *const coeffs /* [4] */) {
728
148k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
729
148k
      filter_params, subpel_q4 & SUBPEL_MASK);
730
148k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
731
148k
  const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);
732
733
  // right shift all filter co-efficients by 1 to reduce the bits required.
734
  // This extra right shift will be taken care of at the end while rounding
735
  // the result.
736
  // Since all filter co-efficients are even, this change will not affect the
737
  // end result
738
148k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
739
148k
                            _mm_set1_epi16((short)0xffff)));
740
741
148k
  const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);
742
743
  // coeffs 2 3 2 3 2 3 2 3
744
148k
  coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0604u));
745
  // coeffs 4 5 4 5 4 5 4 5
746
148k
  coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0a08u));
747
148k
}
Unexecuted instantiation: jnt_convolve_avx2.c:prepare_coeffs_4t_lowbd
Unexecuted instantiation: wiener_convolve_avx2.c:prepare_coeffs_4t_lowbd
Unexecuted instantiation: highbd_convolve_2d_avx2.c:prepare_coeffs_4t_lowbd
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:prepare_coeffs_4t_lowbd
748
749
// Prepares the packed byte coefficients for taps 1..6 of the interpolation
// kernel selected by subpel_q4 (masked with SUBPEL_MASK).
// The eight 16-bit kernel values are loaded, copied into both 128-bit lanes,
// halved, and byte-shuffled so that every 16-bit element of an output vector
// holds the low bytes of two adjacent coefficients.
// Only coeffs[0], coeffs[1] and coeffs[2] are written.
// NOTE(review): keeping only the low byte assumes every halved coefficient
// fits in a signed 8-bit value - confirm against the filter tables.
static inline void prepare_coeffs_6t_lowbd(
750
    const InterpFilterParams *const filter_params, const int subpel_q4,
751
1.14M
    __m256i *const coeffs /* [4] */) {
752
1.14M
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
753
1.14M
      filter_params, subpel_q4 & SUBPEL_MASK);
754
1.14M
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
755
1.14M
  const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);
756
757
  // Right shift all filter coefficients by 1 to reduce the bits required.
758
  // This extra right shift will be taken care of at the end while rounding
759
  // the result.
760
  // Since all filter coefficients are even, this change will not affect the
761
  // end result. The assert checks that bit 0 of every coefficient is clear.
762
1.14M
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
763
1.14M
                            _mm_set1_epi16((int16_t)0xffff)));
764
765
1.14M
  const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);
766
767
  // coeffs 1 2 1 2 1 2 1 2 (shuffle picks bytes 2 and 4 of each lane)
768
1.14M
  coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0402u));
769
  // coeffs 3 4 3 4 3 4 3 4 (shuffle picks bytes 6 and 8 of each lane)
770
1.14M
  coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0806u));
771
  // coeffs 5 6 5 6 5 6 5 6 (shuffle picks bytes 10 and 12 of each lane)
772
1.14M
  coeffs[2] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0c0au));
773
1.14M
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:prepare_coeffs_6t_lowbd
Unexecuted instantiation: highbd_convolve_avx2.c:prepare_coeffs_6t_lowbd
convolve_2d_avx2.c:prepare_coeffs_6t_lowbd
Line
Count
Source
751
730k
    __m256i *const coeffs /* [4] */) {
752
730k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
753
730k
      filter_params, subpel_q4 & SUBPEL_MASK);
754
730k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
755
730k
  const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);
756
757
  // right shift all filter co-efficients by 1 to reduce the bits required.
758
  // This extra right shift will be taken care of at the end while rounding
759
  // the result.
760
  // Since all filter co-efficients are even, this change will not affect the
761
  // end result
762
730k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
763
730k
                            _mm_set1_epi16((int16_t)0xffff)));
764
765
730k
  const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);
766
767
  // coeffs 1 2 1 2 1 2 1 2
768
730k
  coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0402u));
769
  // coeffs 3 4 3 4 3 4 3 4
770
730k
  coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0806u));
771
  // coeffs 5 6 5 6 5 6 5 6
772
730k
  coeffs[2] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0c0au));
773
730k
}
convolve_avx2.c:prepare_coeffs_6t_lowbd
Line
Count
Source
751
410k
    __m256i *const coeffs /* [4] */) {
752
410k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
753
410k
      filter_params, subpel_q4 & SUBPEL_MASK);
754
410k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
755
410k
  const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);
756
757
  // right shift all filter co-efficients by 1 to reduce the bits required.
758
  // This extra right shift will be taken care of at the end while rounding
759
  // the result.
760
  // Since all filter co-efficients are even, this change will not affect the
761
  // end result
762
410k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
763
410k
                            _mm_set1_epi16((int16_t)0xffff)));
764
765
410k
  const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);
766
767
  // coeffs 1 2 1 2 1 2 1 2
768
410k
  coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0402u));
769
  // coeffs 3 4 3 4 3 4 3 4
770
410k
  coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0806u));
771
  // coeffs 5 6 5 6 5 6 5 6
772
410k
  coeffs[2] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0c0au));
773
410k
}
Unexecuted instantiation: jnt_convolve_avx2.c:prepare_coeffs_6t_lowbd
Unexecuted instantiation: wiener_convolve_avx2.c:prepare_coeffs_6t_lowbd
Unexecuted instantiation: highbd_convolve_2d_avx2.c:prepare_coeffs_6t_lowbd
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:prepare_coeffs_6t_lowbd
774
775
// Prepares the packed byte coefficients for all eight taps of the
// interpolation kernel selected by subpel_q4 (masked with SUBPEL_MASK).
// The eight 16-bit kernel values are loaded, copied into both 128-bit lanes,
// halved, and byte-shuffled so that every 16-bit element of an output vector
// holds the low bytes of two adjacent coefficients.
// Writes coeffs[0] through coeffs[3].
// NOTE(review): keeping only the low byte assumes every halved coefficient
// fits in a signed 8-bit value - confirm against the filter tables.
static inline void prepare_coeffs_lowbd(
776
    const InterpFilterParams *const filter_params, const int subpel_q4,
777
455k
    __m256i *const coeffs /* [4] */) {
778
455k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
779
455k
      filter_params, subpel_q4 & SUBPEL_MASK);
780
455k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
781
455k
  const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);
782
783
  // Right shift all filter coefficients by 1 to reduce the bits required.
784
  // This extra right shift will be taken care of at the end while rounding
785
  // the result.
786
  // Since all filter coefficients are even, this change will not affect the
787
  // end result. The assert checks that bit 0 of every coefficient is clear.
788
455k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
789
455k
                            _mm_set1_epi16((short)0xffff)));
790
791
455k
  const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);
792
793
  // coeffs 0 1 0 1 0 1 0 1 (shuffle picks bytes 0 and 2 of each lane)
794
455k
  coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0200u));
795
  // coeffs 2 3 2 3 2 3 2 3 (shuffle picks bytes 4 and 6 of each lane)
796
455k
  coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0604u));
797
  // coeffs 4 5 4 5 4 5 4 5 (shuffle picks bytes 8 and 10 of each lane)
798
455k
  coeffs[2] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0a08u));
799
  // coeffs 6 7 6 7 6 7 6 7 (shuffle picks bytes 12 and 14 of each lane)
800
455k
  coeffs[3] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0e0cu));
801
455k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:prepare_coeffs_lowbd
Unexecuted instantiation: highbd_convolve_avx2.c:prepare_coeffs_lowbd
convolve_2d_avx2.c:prepare_coeffs_lowbd
Line
Count
Source
777
47.8k
    __m256i *const coeffs /* [4] */) {
778
47.8k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
779
47.8k
      filter_params, subpel_q4 & SUBPEL_MASK);
780
47.8k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
781
47.8k
  const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);
782
783
  // right shift all filter co-efficients by 1 to reduce the bits required.
784
  // This extra right shift will be taken care of at the end while rounding
785
  // the result.
786
  // Since all filter co-efficients are even, this change will not affect the
787
  // end result
788
47.8k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
789
47.8k
                            _mm_set1_epi16((short)0xffff)));
790
791
47.8k
  const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);
792
793
  // coeffs 0 1 0 1 0 1 0 1
794
47.8k
  coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0200u));
795
  // coeffs 2 3 2 3 2 3 2 3
796
47.8k
  coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0604u));
797
  // coeffs 4 5 4 5 4 5 4 5
798
47.8k
  coeffs[2] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0a08u));
799
  // coeffs 6 7 6 7 6 7 6 7
800
47.8k
  coeffs[3] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0e0cu));
801
47.8k
}
convolve_avx2.c:prepare_coeffs_lowbd
Line
Count
Source
777
33.3k
    __m256i *const coeffs /* [4] */) {
778
33.3k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
779
33.3k
      filter_params, subpel_q4 & SUBPEL_MASK);
780
33.3k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
781
33.3k
  const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);
782
783
  // right shift all filter co-efficients by 1 to reduce the bits required.
784
  // This extra right shift will be taken care of at the end while rounding
785
  // the result.
786
  // Since all filter co-efficients are even, this change will not affect the
787
  // end result
788
33.3k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
789
33.3k
                            _mm_set1_epi16((short)0xffff)));
790
791
33.3k
  const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);
792
793
  // coeffs 0 1 0 1 0 1 0 1
794
33.3k
  coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0200u));
795
  // coeffs 2 3 2 3 2 3 2 3
796
33.3k
  coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0604u));
797
  // coeffs 4 5 4 5 4 5 4 5
798
33.3k
  coeffs[2] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0a08u));
799
  // coeffs 6 7 6 7 6 7 6 7
800
33.3k
  coeffs[3] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0e0cu));
801
33.3k
}
jnt_convolve_avx2.c:prepare_coeffs_lowbd
Line
Count
Source
777
374k
    __m256i *const coeffs /* [4] */) {
778
374k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
779
374k
      filter_params, subpel_q4 & SUBPEL_MASK);
780
374k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
781
374k
  const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);
782
783
  // right shift all filter co-efficients by 1 to reduce the bits required.
784
  // This extra right shift will be taken care of at the end while rounding
785
  // the result.
786
  // Since all filter co-efficients are even, this change will not affect the
787
  // end result
788
374k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
789
374k
                            _mm_set1_epi16((short)0xffff)));
790
791
374k
  const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);
792
793
  // coeffs 0 1 0 1 0 1 0 1
794
374k
  coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0200u));
795
  // coeffs 2 3 2 3 2 3 2 3
796
374k
  coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0604u));
797
  // coeffs 4 5 4 5 4 5 4 5
798
374k
  coeffs[2] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0a08u));
799
  // coeffs 6 7 6 7 6 7 6 7
800
374k
  coeffs[3] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0e0cu));
801
374k
}
Unexecuted instantiation: wiener_convolve_avx2.c:prepare_coeffs_lowbd
Unexecuted instantiation: highbd_convolve_2d_avx2.c:prepare_coeffs_lowbd
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:prepare_coeffs_lowbd
802
803
// Prepares the 16-bit coefficient pair (taps 3 and 4) of the interpolation
// kernel selected by subpel_q4 (masked with SUBPEL_MASK), replicated across
// the whole 256-bit vector. Only coeffs[0] is written.
static inline void prepare_coeffs_2t(
804
    const InterpFilterParams *const filter_params, const int subpel_q4,
805
30.4k
    __m256i *const coeffs /* [4] */) {
806
30.4k
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
807
30.4k
      filter_params, subpel_q4 & SUBPEL_MASK);
808
809
  // The 16-byte load starts at filter[1], so it covers filter[1]..filter[8].
  // NOTE(review): filter[8] lies one int16_t past the eighth tap - confirm
  // the kernel table always has readable storage there.
30.4k
  const __m128i coeff_8 = _mm_loadu_si128((__m128i *)(filter + 1));
810
30.4k
  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
811
812
  // coeffs 3 4 3 4 3 4 3 4 (32-bit element 1 of each lane, replicated)
813
30.4k
  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x55);
814
30.4k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:prepare_coeffs_2t
Unexecuted instantiation: highbd_convolve_avx2.c:prepare_coeffs_2t
convolve_2d_avx2.c:prepare_coeffs_2t
Line
Count
Source
805
30.4k
    __m256i *const coeffs /* [4] */) {
806
30.4k
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
807
30.4k
      filter_params, subpel_q4 & SUBPEL_MASK);
808
809
30.4k
  const __m128i coeff_8 = _mm_loadu_si128((__m128i *)(filter + 1));
810
30.4k
  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
811
812
  // coeffs 3 4 3 4 3 4 3 4
813
  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x55);
814
30.4k
}
Unexecuted instantiation: convolve_avx2.c:prepare_coeffs_2t
Unexecuted instantiation: jnt_convolve_avx2.c:prepare_coeffs_2t
Unexecuted instantiation: wiener_convolve_avx2.c:prepare_coeffs_2t
Unexecuted instantiation: highbd_convolve_2d_avx2.c:prepare_coeffs_2t
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:prepare_coeffs_2t
815
816
// Prepares the 16-bit coefficient pairs for taps 2..5 of the interpolation
// kernel selected by subpel_q4 (masked with SUBPEL_MASK). Each output vector
// holds one pair of adjacent coefficients replicated across all 32-bit
// elements. Only coeffs[0] and coeffs[1] are written.
static inline void prepare_coeffs_4t(
817
    const InterpFilterParams *const filter_params, const int subpel_q4,
818
779k
    __m256i *const coeffs /* [4] */) {
819
779k
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
820
779k
      filter_params, subpel_q4 & SUBPEL_MASK);
821
822
  // Load filter[0]..filter[7] and copy them into both 128-bit lanes.
779k
  const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter);
823
779k
  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
824
  // coeffs 2 3 2 3 2 3 2 3 (32-bit element 1 of each lane, replicated)
825
779k
  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x55);
826
  // coeffs 4 5 4 5 4 5 4 5 (32-bit element 2 of each lane, replicated)
827
779k
  coeffs[1] = _mm256_shuffle_epi32(coeff, 0xaa);
828
779k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:prepare_coeffs_4t
Unexecuted instantiation: highbd_convolve_avx2.c:prepare_coeffs_4t
convolve_2d_avx2.c:prepare_coeffs_4t
Line
Count
Source
818
779k
    __m256i *const coeffs /* [4] */) {
819
779k
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
820
779k
      filter_params, subpel_q4 & SUBPEL_MASK);
821
822
779k
  const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter);
823
779k
  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
824
  // coeffs 2 3 2 3 2 3 2 3
825
779k
  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x55);
826
  // coeffs 4 5 4 5 4 5 4 5
827
  coeffs[1] = _mm256_shuffle_epi32(coeff, 0xaa);
828
779k
}
Unexecuted instantiation: convolve_avx2.c:prepare_coeffs_4t
Unexecuted instantiation: jnt_convolve_avx2.c:prepare_coeffs_4t
Unexecuted instantiation: wiener_convolve_avx2.c:prepare_coeffs_4t
Unexecuted instantiation: highbd_convolve_2d_avx2.c:prepare_coeffs_4t
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:prepare_coeffs_4t
829
830
// Prepares the 16-bit coefficient pairs for taps 1..6 of the interpolation
// kernel selected by subpel_q4 (masked with SUBPEL_MASK). Each output vector
// holds one pair of adjacent coefficients replicated across all 32-bit
// elements. Only coeffs[0], coeffs[1] and coeffs[2] are written.
static inline void prepare_coeffs_6t(
831
    const InterpFilterParams *const filter_params, const int subpel_q4,
832
563k
    __m256i *const coeffs /* [4] */) {
833
563k
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
834
563k
      filter_params, subpel_q4 & SUBPEL_MASK);
835
836
  // The 16-byte load starts at filter[1], so it covers filter[1]..filter[8].
  // NOTE(review): filter[8] lies one int16_t past the eighth tap - confirm
  // the kernel table always has readable storage there.
563k
  const __m128i coeff_8 = _mm_loadu_si128((__m128i *)(filter + 1));
837
563k
  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
838
839
  // coeffs 1 2 1 2 1 2 1 2 (32-bit element 0 of each lane, replicated)
840
563k
  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
841
  // coeffs 3 4 3 4 3 4 3 4 (32-bit element 1 of each lane, replicated)
842
563k
  coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55);
843
  // coeffs 5 6 5 6 5 6 5 6 (32-bit element 2 of each lane, replicated)
844
563k
  coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa);
845
563k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:prepare_coeffs_6t
Unexecuted instantiation: highbd_convolve_avx2.c:prepare_coeffs_6t
convolve_2d_avx2.c:prepare_coeffs_6t
Line
Count
Source
832
563k
    __m256i *const coeffs /* [4] */) {
833
563k
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
834
563k
      filter_params, subpel_q4 & SUBPEL_MASK);
835
836
563k
  const __m128i coeff_8 = _mm_loadu_si128((__m128i *)(filter + 1));
837
563k
  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
838
839
  // coeffs 1 2 1 2 1 2 1 2
840
563k
  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
841
  // coeffs 3 4 3 4 3 4 3 4
842
563k
  coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55);
843
  // coeffs 5 6 5 6 5 6 5 6
844
  coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa);
845
563k
}
Unexecuted instantiation: convolve_avx2.c:prepare_coeffs_6t
Unexecuted instantiation: jnt_convolve_avx2.c:prepare_coeffs_6t
Unexecuted instantiation: wiener_convolve_avx2.c:prepare_coeffs_6t
Unexecuted instantiation: highbd_convolve_2d_avx2.c:prepare_coeffs_6t
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:prepare_coeffs_6t
846
847
// Prepares the 16-bit coefficient pairs for all eight taps of the
// interpolation kernel selected by subpel_q4 (masked with SUBPEL_MASK).
// Each output vector holds one pair of adjacent coefficients replicated
// across all 32-bit elements. Writes coeffs[0] through coeffs[3].
static inline void prepare_coeffs(const InterpFilterParams *const filter_params,
848
                                  const int subpel_q4,
849
8.64M
                                  __m256i *const coeffs /* [4] */) {
850
8.64M
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
851
8.64M
      filter_params, subpel_q4 & SUBPEL_MASK);
852
853
  // Load filter[0]..filter[7] and copy them into both 128-bit lanes.
8.64M
  const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter);
854
8.64M
  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
855
856
  // coeffs 0 1 0 1 0 1 0 1 (32-bit element 0 of each lane, replicated)
857
8.64M
  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
858
  // coeffs 2 3 2 3 2 3 2 3 (32-bit element 1 of each lane, replicated)
859
8.64M
  coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55);
860
  // coeffs 4 5 4 5 4 5 4 5 (32-bit element 2 of each lane, replicated)
861
8.64M
  coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa);
862
  // coeffs 6 7 6 7 6 7 6 7 (32-bit element 3 of each lane, replicated)
863
8.64M
  coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff);
864
8.64M
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:prepare_coeffs
highbd_convolve_avx2.c:prepare_coeffs
Line
Count
Source
849
1.66M
                                  __m256i *const coeffs /* [4] */) {
850
1.66M
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
851
1.66M
      filter_params, subpel_q4 & SUBPEL_MASK);
852
853
1.66M
  const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter);
854
1.66M
  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
855
856
  // coeffs 0 1 0 1 0 1 0 1
857
1.66M
  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
858
  // coeffs 2 3 2 3 2 3 2 3
859
1.66M
  coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55);
860
  // coeffs 4 5 4 5 4 5 4 5
861
1.66M
  coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa);
862
  // coeffs 6 7 6 7 6 7 6 7
863
  coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff);
864
1.66M
}
convolve_2d_avx2.c:prepare_coeffs
Line
Count
Source
849
41.9k
                                  __m256i *const coeffs /* [4] */) {
850
41.9k
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
851
41.9k
      filter_params, subpel_q4 & SUBPEL_MASK);
852
853
41.9k
  const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter);
854
41.9k
  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
855
856
  // coeffs 0 1 0 1 0 1 0 1
857
41.9k
  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
858
  // coeffs 2 3 2 3 2 3 2 3
859
41.9k
  coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55);
860
  // coeffs 4 5 4 5 4 5 4 5
861
41.9k
  coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa);
862
  // coeffs 6 7 6 7 6 7 6 7
863
  coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff);
864
41.9k
}
Unexecuted instantiation: convolve_avx2.c:prepare_coeffs
jnt_convolve_avx2.c:prepare_coeffs
Line
Count
Source
849
201k
                                  __m256i *const coeffs /* [4] */) {
850
201k
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
851
201k
      filter_params, subpel_q4 & SUBPEL_MASK);
852
853
201k
  const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter);
854
201k
  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
855
856
  // coeffs 0 1 0 1 0 1 0 1
857
201k
  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
858
  // coeffs 2 3 2 3 2 3 2 3
859
201k
  coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55);
860
  // coeffs 4 5 4 5 4 5 4 5
861
201k
  coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa);
862
  // coeffs 6 7 6 7 6 7 6 7
863
  coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff);
864
201k
}
Unexecuted instantiation: wiener_convolve_avx2.c:prepare_coeffs
highbd_convolve_2d_avx2.c:prepare_coeffs
Line
Count
Source
849
5.38M
                                  __m256i *const coeffs /* [4] */) {
850
5.38M
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
851
5.38M
      filter_params, subpel_q4 & SUBPEL_MASK);
852
853
5.38M
  const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter);
854
5.38M
  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
855
856
  // coeffs 0 1 0 1 0 1 0 1
857
5.38M
  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
858
  // coeffs 2 3 2 3 2 3 2 3
859
5.38M
  coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55);
860
  // coeffs 4 5 4 5 4 5 4 5
861
5.38M
  coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa);
862
  // coeffs 6 7 6 7 6 7 6 7
863
  coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff);
864
5.38M
}
highbd_jnt_convolve_avx2.c:prepare_coeffs
Line
Count
Source
849
1.35M
                                  __m256i *const coeffs /* [4] */) {
850
1.35M
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
851
1.35M
      filter_params, subpel_q4 & SUBPEL_MASK);
852
853
1.35M
  const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter);
854
1.35M
  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
855
856
  // coeffs 0 1 0 1 0 1 0 1
857
1.35M
  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
858
  // coeffs 2 3 2 3 2 3 2 3
859
1.35M
  coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55);
860
  // coeffs 4 5 4 5 4 5 4 5
861
1.35M
  coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa);
862
  // coeffs 6 7 6 7 6 7 6 7
863
  coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff);
864
1.35M
}
865
866
// Prepares the 16-bit coefficient pairs for a 12-tap interpolation kernel
// selected by subpel_q4 (masked with SUBPEL_MASK). Each output vector holds
// one pair of adjacent coefficients replicated across all 32-bit elements.
// Writes coeffs[0] through coeffs[5], so the caller's array must provide at
// least six entries.
static inline void prepare_coeffs_12taps(
867
    const InterpFilterParams *const filter_params, const int subpel_q4,
868
0
    __m256i *const coeffs /* [6] */) {
869
0
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
870
0
      filter_params, subpel_q4 & SUBPEL_MASK);
871
872
  // Load filter[0]..filter[7] and copy them into both 128-bit lanes.
0
  __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter);
873
0
  __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
874
875
  // coeffs 0 1 0 1 0 1 0 1
876
0
  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
877
  // coeffs 2 3 2 3 2 3 2 3
878
0
  coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55);
879
  // coeffs 4 5 4 5 4 5 4 5
880
0
  coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa);
881
  // coeffs 6 7 6 7 6 7 6 7
882
0
  coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff);
883
  // coeffs 8 9 10 11 0 0 0 0 (64-bit load of filter[8..11], upper half zero)
884
0
  coeff_8 = _mm_loadl_epi64((__m128i *)(filter + 8));
885
  // Replicate the low 64 bits (coeffs 8 9 10 11) into every 64-bit element.
0
  coeff = _mm256_broadcastq_epi64(coeff_8);
886
0
  coeffs[4] = _mm256_shuffle_epi32(coeff, 0x00);  // coeffs 8 9 8 9 8 9 8 9
887
0
  coeffs[5] = _mm256_shuffle_epi32(coeff, 0x55);  // coeffs 10 11 10 11.. 10 11
888
0
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:prepare_coeffs_12taps
Unexecuted instantiation: highbd_convolve_avx2.c:prepare_coeffs_12taps
Unexecuted instantiation: convolve_2d_avx2.c:prepare_coeffs_12taps
Unexecuted instantiation: convolve_avx2.c:prepare_coeffs_12taps
Unexecuted instantiation: jnt_convolve_avx2.c:prepare_coeffs_12taps
Unexecuted instantiation: wiener_convolve_avx2.c:prepare_coeffs_12taps
Unexecuted instantiation: highbd_convolve_2d_avx2.c:prepare_coeffs_12taps
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:prepare_coeffs_12taps
889
890
// Computes a 4-tap filter sum on 128-bit vectors.
// ss[0] and ss[1] supply the unsigned source bytes and coeffs[0] and
// coeffs[1] the signed byte coefficients for _mm_maddubs_epi16, which
// multiplies byte pairs and adds adjacent products into 16-bit results with
// signed saturation. Returns the two partial results added as 16-bit values.
static inline __m128i convolve_lowbd_4tap_ssse3(const __m128i ss[2],
891
3.75M
                                                const __m128i coeffs[2]) {
892
3.75M
  const __m128i res_01 = _mm_maddubs_epi16(ss[0], coeffs[0]);
893
3.75M
  const __m128i res_23 = _mm_maddubs_epi16(ss[1], coeffs[1]);
894
895
  // _mm_add_epi16 is a plain (non-saturating) 16-bit addition.
3.75M
  return _mm_add_epi16(res_01, res_23);
896
3.75M
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_lowbd_4tap_ssse3
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_lowbd_4tap_ssse3
convolve_2d_avx2.c:convolve_lowbd_4tap_ssse3
Line
Count
Source
891
3.01M
                                                const __m128i coeffs[2]) {
892
3.01M
  const __m128i res_01 = _mm_maddubs_epi16(ss[0], coeffs[0]);
893
3.01M
  const __m128i res_23 = _mm_maddubs_epi16(ss[1], coeffs[1]);
894
895
3.01M
  return _mm_add_epi16(res_01, res_23);
896
3.01M
}
convolve_avx2.c:convolve_lowbd_4tap_ssse3
Line
Count
Source
891
740k
                                                const __m128i coeffs[2]) {
892
740k
  const __m128i res_01 = _mm_maddubs_epi16(ss[0], coeffs[0]);
893
740k
  const __m128i res_23 = _mm_maddubs_epi16(ss[1], coeffs[1]);
894
895
740k
  return _mm_add_epi16(res_01, res_23);
896
740k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_lowbd_4tap_ssse3
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_lowbd_4tap_ssse3
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_lowbd_4tap_ssse3
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_lowbd_4tap_ssse3
897
898
// Computes a 6-tap filter sum on 128-bit vectors.
// ss[0..2] supply the unsigned source bytes and coeffs[0..2] the signed byte
// coefficients for _mm_maddubs_epi16, which multiplies byte pairs and adds
// adjacent products into 16-bit results with signed saturation.
// Returns the three partial results added as 16-bit values.
static inline __m128i convolve_lowbd_6tap_ssse3(const __m128i ss[3],
899
348k
                                                const __m128i coeffs[3]) {
900
348k
  const __m128i res_01 = _mm_maddubs_epi16(ss[0], coeffs[0]);
901
348k
  const __m128i res_23 = _mm_maddubs_epi16(ss[1], coeffs[1]);
902
348k
  const __m128i res_45 = _mm_maddubs_epi16(ss[2], coeffs[2]);
903
904
  // Summation order: (res_01 + res_45) + res_23, using non-saturating adds.
348k
  const __m128i res = _mm_add_epi16(_mm_add_epi16(res_01, res_45), res_23);
905
906
348k
  return res;
907
348k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_lowbd_6tap_ssse3
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_lowbd_6tap_ssse3
Unexecuted instantiation: convolve_2d_avx2.c:convolve_lowbd_6tap_ssse3
convolve_avx2.c:convolve_lowbd_6tap_ssse3
Line
Count
Source
899
348k
                                                const __m128i coeffs[3]) {
900
348k
  const __m128i res_01 = _mm_maddubs_epi16(ss[0], coeffs[0]);
901
348k
  const __m128i res_23 = _mm_maddubs_epi16(ss[1], coeffs[1]);
902
348k
  const __m128i res_45 = _mm_maddubs_epi16(ss[2], coeffs[2]);
903
904
348k
  const __m128i res = _mm_add_epi16(_mm_add_epi16(res_01, res_45), res_23);
905
906
348k
  return res;
907
348k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_lowbd_6tap_ssse3
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_lowbd_6tap_ssse3
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_lowbd_6tap_ssse3
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_lowbd_6tap_ssse3
908
909
// Computes an 8-tap filter sum on 128-bit vectors.
// ss[0..3] supply the unsigned source bytes and coeffs[0..3] the signed byte
// coefficients for _mm_maddubs_epi16, which multiplies byte pairs and adds
// adjacent products into 16-bit results with signed saturation.
// Returns the four partial results added as 16-bit values.
static inline __m128i convolve_lowbd_ssse3(const __m128i ss[4],
910
30.5k
                                           const __m128i coeffs[4]) {
911
30.5k
  const __m128i res_01 = _mm_maddubs_epi16(ss[0], coeffs[0]);
912
30.5k
  const __m128i res_23 = _mm_maddubs_epi16(ss[1], coeffs[1]);
913
30.5k
  const __m128i res_45 = _mm_maddubs_epi16(ss[2], coeffs[2]);
914
30.5k
  const __m128i res_67 = _mm_maddubs_epi16(ss[3], coeffs[3]);
915
916
  // Summation order: (res_01 + res_45) + (res_23 + res_67), using
  // non-saturating 16-bit adds.
30.5k
  const __m128i res = _mm_add_epi16(_mm_add_epi16(res_01, res_45),
917
30.5k
                                    _mm_add_epi16(res_23, res_67));
918
919
30.5k
  return res;
920
30.5k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_lowbd_ssse3
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_lowbd_ssse3
Unexecuted instantiation: convolve_2d_avx2.c:convolve_lowbd_ssse3
convolve_avx2.c:convolve_lowbd_ssse3
Line
Count
Source
910
30.5k
                                           const __m128i coeffs[4]) {
911
30.5k
  const __m128i res_01 = _mm_maddubs_epi16(ss[0], coeffs[0]);
912
30.5k
  const __m128i res_23 = _mm_maddubs_epi16(ss[1], coeffs[1]);
913
30.5k
  const __m128i res_45 = _mm_maddubs_epi16(ss[2], coeffs[2]);
914
30.5k
  const __m128i res_67 = _mm_maddubs_epi16(ss[3], coeffs[3]);
915
916
30.5k
  const __m128i res = _mm_add_epi16(_mm_add_epi16(res_01, res_45),
917
30.5k
                                    _mm_add_epi16(res_23, res_67));
918
919
30.5k
  return res;
920
30.5k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_lowbd_ssse3
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_lowbd_ssse3
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_lowbd_ssse3
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_lowbd_ssse3
921
922
static inline __m256i convolve_lowbd(const __m256i *const s,
923
21.3M
                                     const __m256i *const coeffs) {
924
21.3M
  const __m256i res_01 = _mm256_maddubs_epi16(s[0], coeffs[0]);
925
21.3M
  const __m256i res_23 = _mm256_maddubs_epi16(s[1], coeffs[1]);
926
21.3M
  const __m256i res_45 = _mm256_maddubs_epi16(s[2], coeffs[2]);
927
21.3M
  const __m256i res_67 = _mm256_maddubs_epi16(s[3], coeffs[3]);
928
929
  // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
930
21.3M
  const __m256i res = _mm256_add_epi16(_mm256_add_epi16(res_01, res_45),
931
21.3M
                                       _mm256_add_epi16(res_23, res_67));
932
933
21.3M
  return res;
934
21.3M
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_lowbd
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_lowbd
convolve_2d_avx2.c:convolve_lowbd
Line
Count
Source
923
1.96M
                                     const __m256i *const coeffs) {
924
1.96M
  const __m256i res_01 = _mm256_maddubs_epi16(s[0], coeffs[0]);
925
1.96M
  const __m256i res_23 = _mm256_maddubs_epi16(s[1], coeffs[1]);
926
1.96M
  const __m256i res_45 = _mm256_maddubs_epi16(s[2], coeffs[2]);
927
1.96M
  const __m256i res_67 = _mm256_maddubs_epi16(s[3], coeffs[3]);
928
929
  // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
930
1.96M
  const __m256i res = _mm256_add_epi16(_mm256_add_epi16(res_01, res_45),
931
1.96M
                                       _mm256_add_epi16(res_23, res_67));
932
933
1.96M
  return res;
934
1.96M
}
convolve_avx2.c:convolve_lowbd
Line
Count
Source
923
573k
                                     const __m256i *const coeffs) {
924
573k
  const __m256i res_01 = _mm256_maddubs_epi16(s[0], coeffs[0]);
925
573k
  const __m256i res_23 = _mm256_maddubs_epi16(s[1], coeffs[1]);
926
573k
  const __m256i res_45 = _mm256_maddubs_epi16(s[2], coeffs[2]);
927
573k
  const __m256i res_67 = _mm256_maddubs_epi16(s[3], coeffs[3]);
928
929
  // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
930
573k
  const __m256i res = _mm256_add_epi16(_mm256_add_epi16(res_01, res_45),
931
573k
                                       _mm256_add_epi16(res_23, res_67));
932
933
573k
  return res;
934
573k
}
jnt_convolve_avx2.c:convolve_lowbd
Line
Count
Source
923
6.81M
                                     const __m256i *const coeffs) {
924
6.81M
  const __m256i res_01 = _mm256_maddubs_epi16(s[0], coeffs[0]);
925
6.81M
  const __m256i res_23 = _mm256_maddubs_epi16(s[1], coeffs[1]);
926
6.81M
  const __m256i res_45 = _mm256_maddubs_epi16(s[2], coeffs[2]);
927
6.81M
  const __m256i res_67 = _mm256_maddubs_epi16(s[3], coeffs[3]);
928
929
  // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
930
6.81M
  const __m256i res = _mm256_add_epi16(_mm256_add_epi16(res_01, res_45),
931
6.81M
                                       _mm256_add_epi16(res_23, res_67));
932
933
6.81M
  return res;
934
6.81M
}
wiener_convolve_avx2.c:convolve_lowbd
Line
Count
Source
923
11.9M
                                     const __m256i *const coeffs) {
924
11.9M
  const __m256i res_01 = _mm256_maddubs_epi16(s[0], coeffs[0]);
925
11.9M
  const __m256i res_23 = _mm256_maddubs_epi16(s[1], coeffs[1]);
926
11.9M
  const __m256i res_45 = _mm256_maddubs_epi16(s[2], coeffs[2]);
927
11.9M
  const __m256i res_67 = _mm256_maddubs_epi16(s[3], coeffs[3]);
928
929
  // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
930
11.9M
  const __m256i res = _mm256_add_epi16(_mm256_add_epi16(res_01, res_45),
931
11.9M
                                       _mm256_add_epi16(res_23, res_67));
932
933
11.9M
  return res;
934
11.9M
}
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_lowbd
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_lowbd
935
936
static inline __m256i convolve_lowbd_6tap(const __m256i *const s,
937
20.2M
                                          const __m256i *const coeffs) {
938
20.2M
  const __m256i res_01 = _mm256_maddubs_epi16(s[0], coeffs[0]);
939
20.2M
  const __m256i res_23 = _mm256_maddubs_epi16(s[1], coeffs[1]);
940
20.2M
  const __m256i res_45 = _mm256_maddubs_epi16(s[2], coeffs[2]);
941
942
  // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
943
20.2M
  const __m256i res =
944
20.2M
      _mm256_add_epi16(_mm256_add_epi16(res_01, res_45), res_23);
945
946
20.2M
  return res;
947
20.2M
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_lowbd_6tap
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_lowbd_6tap
convolve_2d_avx2.c:convolve_lowbd_6tap
Line
Count
Source
937
12.2M
                                          const __m256i *const coeffs) {
938
12.2M
  const __m256i res_01 = _mm256_maddubs_epi16(s[0], coeffs[0]);
939
12.2M
  const __m256i res_23 = _mm256_maddubs_epi16(s[1], coeffs[1]);
940
12.2M
  const __m256i res_45 = _mm256_maddubs_epi16(s[2], coeffs[2]);
941
942
  // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
943
12.2M
  const __m256i res =
944
12.2M
      _mm256_add_epi16(_mm256_add_epi16(res_01, res_45), res_23);
945
946
12.2M
  return res;
947
12.2M
}
convolve_avx2.c:convolve_lowbd_6tap
Line
Count
Source
937
8.02M
                                          const __m256i *const coeffs) {
938
8.02M
  const __m256i res_01 = _mm256_maddubs_epi16(s[0], coeffs[0]);
939
8.02M
  const __m256i res_23 = _mm256_maddubs_epi16(s[1], coeffs[1]);
940
8.02M
  const __m256i res_45 = _mm256_maddubs_epi16(s[2], coeffs[2]);
941
942
  // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
943
8.02M
  const __m256i res =
944
8.02M
      _mm256_add_epi16(_mm256_add_epi16(res_01, res_45), res_23);
945
946
8.02M
  return res;
947
8.02M
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_lowbd_6tap
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_lowbd_6tap
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_lowbd_6tap
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_lowbd_6tap
948
949
static inline __m256i convolve_lowbd_4tap(const __m256i *const s,
950
4.65M
                                          const __m256i *const coeffs) {
951
4.65M
  const __m256i res_23 = _mm256_maddubs_epi16(s[0], coeffs[0]);
952
4.65M
  const __m256i res_45 = _mm256_maddubs_epi16(s[1], coeffs[1]);
953
954
  // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
955
4.65M
  const __m256i res = _mm256_add_epi16(res_45, res_23);
956
957
4.65M
  return res;
958
4.65M
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_lowbd_4tap
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_lowbd_4tap
convolve_2d_avx2.c:convolve_lowbd_4tap
Line
Count
Source
950
1.04M
                                          const __m256i *const coeffs) {
951
1.04M
  const __m256i res_23 = _mm256_maddubs_epi16(s[0], coeffs[0]);
952
1.04M
  const __m256i res_45 = _mm256_maddubs_epi16(s[1], coeffs[1]);
953
954
  // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
955
1.04M
  const __m256i res = _mm256_add_epi16(res_45, res_23);
956
957
1.04M
  return res;
958
1.04M
}
convolve_avx2.c:convolve_lowbd_4tap
Line
Count
Source
950
1.56M
                                          const __m256i *const coeffs) {
951
1.56M
  const __m256i res_23 = _mm256_maddubs_epi16(s[0], coeffs[0]);
952
1.56M
  const __m256i res_45 = _mm256_maddubs_epi16(s[1], coeffs[1]);
953
954
  // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
955
1.56M
  const __m256i res = _mm256_add_epi16(res_45, res_23);
956
957
1.56M
  return res;
958
1.56M
}
jnt_convolve_avx2.c:convolve_lowbd_4tap
Line
Count
Source
950
2.04M
                                          const __m256i *const coeffs) {
951
2.04M
  const __m256i res_23 = _mm256_maddubs_epi16(s[0], coeffs[0]);
952
2.04M
  const __m256i res_45 = _mm256_maddubs_epi16(s[1], coeffs[1]);
953
954
  // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
955
2.04M
  const __m256i res = _mm256_add_epi16(res_45, res_23);
956
957
2.04M
  return res;
958
2.04M
}
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_lowbd_4tap
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_lowbd_4tap
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_lowbd_4tap
959
960
static inline __m256i convolve_6tap(const __m256i *const s,
961
17.8M
                                    const __m256i *const coeffs) {
962
17.8M
  const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]);
963
17.8M
  const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]);
964
17.8M
  const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]);
965
966
17.8M
  const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1), res_2);
967
968
17.8M
  return res;
969
17.8M
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_6tap
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_6tap
convolve_2d_avx2.c:convolve_6tap
Line
Count
Source
961
17.8M
                                    const __m256i *const coeffs) {
962
17.8M
  const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]);
963
17.8M
  const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]);
964
17.8M
  const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]);
965
966
17.8M
  const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1), res_2);
967
968
17.8M
  return res;
969
17.8M
}
Unexecuted instantiation: convolve_avx2.c:convolve_6tap
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_6tap
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_6tap
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_6tap
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_6tap
970
971
static inline __m256i convolve_12taps(const __m256i *const s,
972
0
                                      const __m256i *const coeffs) {
973
0
  const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]);
974
0
  const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]);
975
0
  const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]);
976
0
  const __m256i res_3 = _mm256_madd_epi16(s[3], coeffs[3]);
977
0
  const __m256i res_4 = _mm256_madd_epi16(s[4], coeffs[4]);
978
0
  const __m256i res_5 = _mm256_madd_epi16(s[5], coeffs[5]);
979
980
0
  const __m256i res1 = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1),
981
0
                                        _mm256_add_epi32(res_2, res_3));
982
0
  const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_4, res_5), res1);
983
984
0
  return res;
985
0
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_12taps
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_12taps
Unexecuted instantiation: convolve_2d_avx2.c:convolve_12taps
Unexecuted instantiation: convolve_avx2.c:convolve_12taps
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_12taps
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_12taps
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_12taps
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_12taps
986
987
static inline __m256i convolve(const __m256i *const s,
988
250M
                               const __m256i *const coeffs) {
989
250M
  const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]);
990
250M
  const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]);
991
250M
  const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]);
992
250M
  const __m256i res_3 = _mm256_madd_epi16(s[3], coeffs[3]);
993
994
250M
  const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1),
995
250M
                                       _mm256_add_epi32(res_2, res_3));
996
997
250M
  return res;
998
250M
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve
highbd_convolve_avx2.c:convolve
Line
Count
Source
988
33.0M
                               const __m256i *const coeffs) {
989
33.0M
  const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]);
990
33.0M
  const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]);
991
33.0M
  const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]);
992
33.0M
  const __m256i res_3 = _mm256_madd_epi16(s[3], coeffs[3]);
993
994
33.0M
  const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1),
995
33.0M
                                       _mm256_add_epi32(res_2, res_3));
996
997
33.0M
  return res;
998
33.0M
}
convolve_2d_avx2.c:convolve
Line
Count
Source
988
2.88M
                               const __m256i *const coeffs) {
989
2.88M
  const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]);
990
2.88M
  const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]);
991
2.88M
  const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]);
992
2.88M
  const __m256i res_3 = _mm256_madd_epi16(s[3], coeffs[3]);
993
994
2.88M
  const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1),
995
2.88M
                                       _mm256_add_epi32(res_2, res_3));
996
997
2.88M
  return res;
998
2.88M
}
Unexecuted instantiation: convolve_avx2.c:convolve
jnt_convolve_avx2.c:convolve
Line
Count
Source
988
5.22M
                               const __m256i *const coeffs) {
989
5.22M
  const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]);
990
5.22M
  const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]);
991
5.22M
  const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]);
992
5.22M
  const __m256i res_3 = _mm256_madd_epi16(s[3], coeffs[3]);
993
994
5.22M
  const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1),
995
5.22M
                                       _mm256_add_epi32(res_2, res_3));
996
997
5.22M
  return res;
998
5.22M
}
wiener_convolve_avx2.c:convolve
Line
Count
Source
988
22.3M
                               const __m256i *const coeffs) {
989
22.3M
  const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]);
990
22.3M
  const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]);
991
22.3M
  const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]);
992
22.3M
  const __m256i res_3 = _mm256_madd_epi16(s[3], coeffs[3]);
993
994
22.3M
  const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1),
995
22.3M
                                       _mm256_add_epi32(res_2, res_3));
996
997
22.3M
  return res;
998
22.3M
}
highbd_convolve_2d_avx2.c:convolve
Line
Count
Source
988
120M
                               const __m256i *const coeffs) {
989
120M
  const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]);
990
120M
  const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]);
991
120M
  const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]);
992
120M
  const __m256i res_3 = _mm256_madd_epi16(s[3], coeffs[3]);
993
994
120M
  const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1),
995
120M
                                       _mm256_add_epi32(res_2, res_3));
996
997
120M
  return res;
998
120M
}
highbd_jnt_convolve_avx2.c:convolve
Line
Count
Source
988
66.4M
                               const __m256i *const coeffs) {
989
66.4M
  const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]);
990
66.4M
  const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]);
991
66.4M
  const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]);
992
66.4M
  const __m256i res_3 = _mm256_madd_epi16(s[3], coeffs[3]);
993
994
66.4M
  const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1),
995
66.4M
                                       _mm256_add_epi32(res_2, res_3));
996
997
66.4M
  return res;
998
66.4M
}
999
1000
static inline __m256i convolve_4tap(const __m256i *const s,
1001
3.80M
                                    const __m256i *const coeffs) {
1002
3.80M
  const __m256i res_1 = _mm256_madd_epi16(s[0], coeffs[0]);
1003
3.80M
  const __m256i res_2 = _mm256_madd_epi16(s[1], coeffs[1]);
1004
1005
3.80M
  const __m256i res = _mm256_add_epi32(res_1, res_2);
1006
3.80M
  return res;
1007
3.80M
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_4tap
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_4tap
convolve_2d_avx2.c:convolve_4tap
Line
Count
Source
1001
3.80M
                                    const __m256i *const coeffs) {
1002
3.80M
  const __m256i res_1 = _mm256_madd_epi16(s[0], coeffs[0]);
1003
3.80M
  const __m256i res_2 = _mm256_madd_epi16(s[1], coeffs[1]);
1004
1005
3.80M
  const __m256i res = _mm256_add_epi32(res_1, res_2);
1006
3.80M
  return res;
1007
3.80M
}
Unexecuted instantiation: convolve_avx2.c:convolve_4tap
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_4tap
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_4tap
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_4tap
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_4tap
1008
1009
static inline __m128i convolve_lowbd_x_2tap_ssse3(const __m128i data,
1010
                                                  const __m128i *const coeffs,
1011
59.7k
                                                  const __m128i *const filt) {
1012
59.7k
  __m128i s;
1013
59.7k
  s = _mm_shuffle_epi8(data, filt[0]);
1014
1015
59.7k
  return _mm_maddubs_epi16(s, coeffs[0]);
1016
59.7k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_lowbd_x_2tap_ssse3
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_lowbd_x_2tap_ssse3
convolve_2d_avx2.c:convolve_lowbd_x_2tap_ssse3
Line
Count
Source
1011
59.7k
                                                  const __m128i *const filt) {
1012
59.7k
  __m128i s;
1013
59.7k
  s = _mm_shuffle_epi8(data, filt[0]);
1014
1015
59.7k
  return _mm_maddubs_epi16(s, coeffs[0]);
1016
59.7k
}
Unexecuted instantiation: convolve_avx2.c:convolve_lowbd_x_2tap_ssse3
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_lowbd_x_2tap_ssse3
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_lowbd_x_2tap_ssse3
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_lowbd_x_2tap_ssse3
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_lowbd_x_2tap_ssse3
1017
1018
static inline __m128i convolve_lowbd_x_4tap_ssse3(const __m128i data,
1019
                                                  const __m128i *const coeffs,
1020
3.01M
                                                  const __m128i *const filt) {
1021
3.01M
  __m128i s[2];
1022
1023
3.01M
  s[0] = _mm_shuffle_epi8(data, filt[0]);
1024
3.01M
  s[1] = _mm_shuffle_epi8(data, filt[1]);
1025
1026
3.01M
  return convolve_lowbd_4tap_ssse3(s, coeffs);
1027
3.01M
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_lowbd_x_4tap_ssse3
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_lowbd_x_4tap_ssse3
convolve_2d_avx2.c:convolve_lowbd_x_4tap_ssse3
Line
Count
Source
1020
3.01M
                                                  const __m128i *const filt) {
1021
3.01M
  __m128i s[2];
1022
1023
3.01M
  s[0] = _mm_shuffle_epi8(data, filt[0]);
1024
3.01M
  s[1] = _mm_shuffle_epi8(data, filt[1]);
1025
1026
3.01M
  return convolve_lowbd_4tap_ssse3(s, coeffs);
1027
3.01M
}
Unexecuted instantiation: convolve_avx2.c:convolve_lowbd_x_4tap_ssse3
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_lowbd_x_4tap_ssse3
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_lowbd_x_4tap_ssse3
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_lowbd_x_4tap_ssse3
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_lowbd_x_4tap_ssse3
1028
1029
static inline __m256i convolve_lowbd_x(const __m256i data,
1030
                                       const __m256i *const coeffs,
1031
20.2M
                                       const __m256i *const filt) {
1032
20.2M
  __m256i s[4];
1033
1034
20.2M
  s[0] = _mm256_shuffle_epi8(data, filt[0]);
1035
20.2M
  s[1] = _mm256_shuffle_epi8(data, filt[1]);
1036
20.2M
  s[2] = _mm256_shuffle_epi8(data, filt[2]);
1037
20.2M
  s[3] = _mm256_shuffle_epi8(data, filt[3]);
1038
1039
20.2M
  return convolve_lowbd(s, coeffs);
1040
20.2M
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_lowbd_x
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_lowbd_x
convolve_2d_avx2.c:convolve_lowbd_x
Line
Count
Source
1031
1.96M
                                       const __m256i *const filt) {
1032
1.96M
  __m256i s[4];
1033
1034
1.96M
  s[0] = _mm256_shuffle_epi8(data, filt[0]);
1035
1.96M
  s[1] = _mm256_shuffle_epi8(data, filt[1]);
1036
1.96M
  s[2] = _mm256_shuffle_epi8(data, filt[2]);
1037
1.96M
  s[3] = _mm256_shuffle_epi8(data, filt[3]);
1038
1039
1.96M
  return convolve_lowbd(s, coeffs);
1040
1.96M
}
convolve_avx2.c:convolve_lowbd_x
Line
Count
Source
1031
390k
                                       const __m256i *const filt) {
1032
390k
  __m256i s[4];
1033
1034
390k
  s[0] = _mm256_shuffle_epi8(data, filt[0]);
1035
390k
  s[1] = _mm256_shuffle_epi8(data, filt[1]);
1036
390k
  s[2] = _mm256_shuffle_epi8(data, filt[2]);
1037
390k
  s[3] = _mm256_shuffle_epi8(data, filt[3]);
1038
1039
390k
  return convolve_lowbd(s, coeffs);
1040
390k
}
jnt_convolve_avx2.c:convolve_lowbd_x
Line
Count
Source
1031
5.89M
                                       const __m256i *const filt) {
1032
5.89M
  __m256i s[4];
1033
1034
5.89M
  s[0] = _mm256_shuffle_epi8(data, filt[0]);
1035
5.89M
  s[1] = _mm256_shuffle_epi8(data, filt[1]);
1036
5.89M
  s[2] = _mm256_shuffle_epi8(data, filt[2]);
1037
5.89M
  s[3] = _mm256_shuffle_epi8(data, filt[3]);
1038
1039
5.89M
  return convolve_lowbd(s, coeffs);
1040
5.89M
}
wiener_convolve_avx2.c:convolve_lowbd_x
Line
Count
Source
1031
11.9M
                                       const __m256i *const filt) {
1032
11.9M
  __m256i s[4];
1033
1034
11.9M
  s[0] = _mm256_shuffle_epi8(data, filt[0]);
1035
11.9M
  s[1] = _mm256_shuffle_epi8(data, filt[1]);
1036
11.9M
  s[2] = _mm256_shuffle_epi8(data, filt[2]);
1037
11.9M
  s[3] = _mm256_shuffle_epi8(data, filt[3]);
1038
1039
11.9M
  return convolve_lowbd(s, coeffs);
1040
11.9M
}
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_lowbd_x
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_lowbd_x
1041
1042
static inline __m256i convolve_lowbd_x_6tap(const __m256i data,
1043
                                            const __m256i *const coeffs,
1044
16.5M
                                            const __m256i *const filt) {
1045
16.5M
  __m256i s[4];
1046
1047
16.5M
  s[0] = _mm256_shuffle_epi8(data, filt[0]);
1048
16.5M
  s[1] = _mm256_shuffle_epi8(data, filt[1]);
1049
16.5M
  s[2] = _mm256_shuffle_epi8(data, filt[2]);
1050
1051
16.5M
  return convolve_lowbd_6tap(s, coeffs);
1052
16.5M
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_lowbd_x_6tap
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_lowbd_x_6tap
convolve_2d_avx2.c:convolve_lowbd_x_6tap
Line
Count
Source
1044
12.2M
                                            const __m256i *const filt) {
1045
12.2M
  __m256i s[4];
1046
1047
12.2M
  s[0] = _mm256_shuffle_epi8(data, filt[0]);
1048
12.2M
  s[1] = _mm256_shuffle_epi8(data, filt[1]);
1049
12.2M
  s[2] = _mm256_shuffle_epi8(data, filt[2]);
1050
1051
12.2M
  return convolve_lowbd_6tap(s, coeffs);
1052
12.2M
}
convolve_avx2.c:convolve_lowbd_x_6tap
Line
Count
Source
1044
4.26M
                                            const __m256i *const filt) {
1045
4.26M
  __m256i s[4];
1046
1047
4.26M
  s[0] = _mm256_shuffle_epi8(data, filt[0]);
1048
4.26M
  s[1] = _mm256_shuffle_epi8(data, filt[1]);
1049
4.26M
  s[2] = _mm256_shuffle_epi8(data, filt[2]);
1050
1051
4.26M
  return convolve_lowbd_6tap(s, coeffs);
1052
4.26M
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_lowbd_x_6tap
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_lowbd_x_6tap
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_lowbd_x_6tap
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_lowbd_x_6tap
1053
1054
static inline __m256i convolve_lowbd_x_4tap(const __m256i data,
1055
                                            const __m256i *const coeffs,
1056
3.40M
                                            const __m256i *const filt) {
1057
3.40M
  __m256i s[2];
1058
1059
3.40M
  s[0] = _mm256_shuffle_epi8(data, filt[0]);
1060
3.40M
  s[1] = _mm256_shuffle_epi8(data, filt[1]);
1061
1062
3.40M
  return convolve_lowbd_4tap(s, coeffs);
1063
3.40M
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_lowbd_x_4tap
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_lowbd_x_4tap
convolve_2d_avx2.c:convolve_lowbd_x_4tap
Line
Count
Source
1056
1.04M
                                            const __m256i *const filt) {
1057
1.04M
  __m256i s[2];
1058
1059
1.04M
  s[0] = _mm256_shuffle_epi8(data, filt[0]);
1060
1.04M
  s[1] = _mm256_shuffle_epi8(data, filt[1]);
1061
1062
1.04M
  return convolve_lowbd_4tap(s, coeffs);
1063
1.04M
}
convolve_avx2.c:convolve_lowbd_x_4tap
Line
Count
Source
1056
627k
                                            const __m256i *const filt) {
1057
627k
  __m256i s[2];
1058
1059
627k
  s[0] = _mm256_shuffle_epi8(data, filt[0]);
1060
627k
  s[1] = _mm256_shuffle_epi8(data, filt[1]);
1061
1062
627k
  return convolve_lowbd_4tap(s, coeffs);
1063
627k
}
jnt_convolve_avx2.c:convolve_lowbd_x_4tap
Line
Count
Source
1056
1.73M
                                            const __m256i *const filt) {
1057
1.73M
  __m256i s[2];
1058
1059
1.73M
  s[0] = _mm256_shuffle_epi8(data, filt[0]);
1060
1.73M
  s[1] = _mm256_shuffle_epi8(data, filt[1]);
1061
1062
1.73M
  return convolve_lowbd_4tap(s, coeffs);
1063
1.73M
}
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_lowbd_x_4tap
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_lowbd_x_4tap
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_lowbd_x_4tap
1064
1065
static inline __m256i convolve_lowbd_x_2tap(const __m256i data,
1066
                                            const __m256i *const coeffs,
1067
433k
                                            const __m256i *const filt) {
1068
433k
  __m256i s;
1069
433k
  s = _mm256_shuffle_epi8(data, filt[0]);
1070
1071
433k
  return _mm256_maddubs_epi16(s, coeffs[0]);
1072
433k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_lowbd_x_2tap
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_lowbd_x_2tap
convolve_2d_avx2.c:convolve_lowbd_x_2tap
Line
Count
Source
1067
433k
                                            const __m256i *const filt) {
1068
433k
  __m256i s;
1069
433k
  s = _mm256_shuffle_epi8(data, filt[0]);
1070
1071
433k
  return _mm256_maddubs_epi16(s, coeffs[0]);
1072
433k
}
Unexecuted instantiation: convolve_avx2.c:convolve_lowbd_x_2tap
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_lowbd_x_2tap
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_lowbd_x_2tap
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_lowbd_x_2tap
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_lowbd_x_2tap
1073
1074
static inline void add_store_aligned_256(CONV_BUF_TYPE *const dst,
1075
                                         const __m256i *const res,
1076
0
                                         const int do_average) {
1077
0
  __m256i d;
1078
0
  if (do_average) {
1079
0
    d = _mm256_load_si256((__m256i *)dst);
1080
0
    d = _mm256_add_epi32(d, *res);
1081
0
    d = _mm256_srai_epi32(d, 1);
1082
0
  } else {
1083
0
    d = *res;
1084
0
  }
1085
0
  _mm256_store_si256((__m256i *)dst, d);
1086
0
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:add_store_aligned_256
Unexecuted instantiation: highbd_convolve_avx2.c:add_store_aligned_256
Unexecuted instantiation: convolve_2d_avx2.c:add_store_aligned_256
Unexecuted instantiation: convolve_avx2.c:add_store_aligned_256
Unexecuted instantiation: jnt_convolve_avx2.c:add_store_aligned_256
Unexecuted instantiation: wiener_convolve_avx2.c:add_store_aligned_256
Unexecuted instantiation: highbd_convolve_2d_avx2.c:add_store_aligned_256
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:add_store_aligned_256
1087
1088
static inline __m256i comp_avg(const __m256i *const data_ref_0,
1089
                               const __m256i *const res_unsigned,
1090
                               const __m256i *const wt,
1091
218M
                               const int use_dist_wtd_comp_avg) {
1092
218M
  __m256i res;
1093
218M
  if (use_dist_wtd_comp_avg) {
1094
1.76M
    const __m256i data_lo = _mm256_unpacklo_epi16(*data_ref_0, *res_unsigned);
1095
1.76M
    const __m256i data_hi = _mm256_unpackhi_epi16(*data_ref_0, *res_unsigned);
1096
1097
1.76M
    const __m256i wt_res_lo = _mm256_madd_epi16(data_lo, *wt);
1098
1.76M
    const __m256i wt_res_hi = _mm256_madd_epi16(data_hi, *wt);
1099
1100
1.76M
    const __m256i res_lo = _mm256_srai_epi32(wt_res_lo, DIST_PRECISION_BITS);
1101
1.76M
    const __m256i res_hi = _mm256_srai_epi32(wt_res_hi, DIST_PRECISION_BITS);
1102
1103
1.76M
    res = _mm256_packs_epi32(res_lo, res_hi);
1104
216M
  } else {
1105
216M
    const __m256i wt_res = _mm256_add_epi16(*data_ref_0, *res_unsigned);
1106
216M
    res = _mm256_srai_epi16(wt_res, 1);
1107
216M
  }
1108
218M
  return res;
1109
218M
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:comp_avg
Unexecuted instantiation: highbd_convolve_avx2.c:comp_avg
Unexecuted instantiation: convolve_2d_avx2.c:comp_avg
Unexecuted instantiation: convolve_avx2.c:comp_avg
jnt_convolve_avx2.c:comp_avg
Line
Count
Source
1091
218M
                               const int use_dist_wtd_comp_avg) {
1092
218M
  __m256i res;
1093
218M
  if (use_dist_wtd_comp_avg) {
1094
1.76M
    const __m256i data_lo = _mm256_unpacklo_epi16(*data_ref_0, *res_unsigned);
1095
1.76M
    const __m256i data_hi = _mm256_unpackhi_epi16(*data_ref_0, *res_unsigned);
1096
1097
1.76M
    const __m256i wt_res_lo = _mm256_madd_epi16(data_lo, *wt);
1098
1.76M
    const __m256i wt_res_hi = _mm256_madd_epi16(data_hi, *wt);
1099
1100
1.76M
    const __m256i res_lo = _mm256_srai_epi32(wt_res_lo, DIST_PRECISION_BITS);
1101
1.76M
    const __m256i res_hi = _mm256_srai_epi32(wt_res_hi, DIST_PRECISION_BITS);
1102
1103
1.76M
    res = _mm256_packs_epi32(res_lo, res_hi);
1104
216M
  } else {
1105
216M
    const __m256i wt_res = _mm256_add_epi16(*data_ref_0, *res_unsigned);
1106
216M
    res = _mm256_srai_epi16(wt_res, 1);
1107
216M
  }
1108
218M
  return res;
1109
218M
}
Unexecuted instantiation: wiener_convolve_avx2.c:comp_avg
Unexecuted instantiation: highbd_convolve_2d_avx2.c:comp_avg
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:comp_avg
1110
1111
static inline __m256i convolve_rounding(const __m256i *const res_unsigned,
1112
                                        const __m256i *const offset_const,
1113
                                        const __m256i *const round_const,
1114
217M
                                        const int round_shift) {
1115
217M
  const __m256i res_signed = _mm256_sub_epi16(*res_unsigned, *offset_const);
1116
217M
  const __m256i res_round = _mm256_srai_epi16(
1117
217M
      _mm256_add_epi16(res_signed, *round_const), round_shift);
1118
217M
  return res_round;
1119
217M
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_rounding
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_rounding
Unexecuted instantiation: convolve_2d_avx2.c:convolve_rounding
Unexecuted instantiation: convolve_avx2.c:convolve_rounding
jnt_convolve_avx2.c:convolve_rounding
Line
Count
Source
1114
217M
                                        const int round_shift) {
1115
217M
  const __m256i res_signed = _mm256_sub_epi16(*res_unsigned, *offset_const);
1116
217M
  const __m256i res_round = _mm256_srai_epi16(
1117
217M
      _mm256_add_epi16(res_signed, *round_const), round_shift);
1118
217M
  return res_round;
1119
217M
}
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_rounding
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_rounding
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_rounding
1120
1121
static inline __m256i highbd_comp_avg(const __m256i *const data_ref_0,
1122
                                      const __m256i *const res_unsigned,
1123
                                      const __m256i *const wt0,
1124
                                      const __m256i *const wt1,
1125
19.4M
                                      const int use_dist_wtd_comp_avg) {
1126
19.4M
  __m256i res;
1127
19.4M
  if (use_dist_wtd_comp_avg) {
1128
2.41M
    const __m256i wt0_res = _mm256_mullo_epi32(*data_ref_0, *wt0);
1129
2.41M
    const __m256i wt1_res = _mm256_mullo_epi32(*res_unsigned, *wt1);
1130
2.41M
    const __m256i wt_res = _mm256_add_epi32(wt0_res, wt1_res);
1131
2.41M
    res = _mm256_srai_epi32(wt_res, DIST_PRECISION_BITS);
1132
17.0M
  } else {
1133
17.0M
    const __m256i wt_res = _mm256_add_epi32(*data_ref_0, *res_unsigned);
1134
17.0M
    res = _mm256_srai_epi32(wt_res, 1);
1135
17.0M
  }
1136
19.4M
  return res;
1137
19.4M
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:highbd_comp_avg
Unexecuted instantiation: highbd_convolve_avx2.c:highbd_comp_avg
Unexecuted instantiation: convolve_2d_avx2.c:highbd_comp_avg
Unexecuted instantiation: convolve_avx2.c:highbd_comp_avg
Unexecuted instantiation: jnt_convolve_avx2.c:highbd_comp_avg
Unexecuted instantiation: wiener_convolve_avx2.c:highbd_comp_avg
Unexecuted instantiation: highbd_convolve_2d_avx2.c:highbd_comp_avg
highbd_jnt_convolve_avx2.c:highbd_comp_avg
Line
Count
Source
1125
19.4M
                                      const int use_dist_wtd_comp_avg) {
1126
19.4M
  __m256i res;
1127
19.4M
  if (use_dist_wtd_comp_avg) {
1128
2.41M
    const __m256i wt0_res = _mm256_mullo_epi32(*data_ref_0, *wt0);
1129
2.41M
    const __m256i wt1_res = _mm256_mullo_epi32(*res_unsigned, *wt1);
1130
2.41M
    const __m256i wt_res = _mm256_add_epi32(wt0_res, wt1_res);
1131
2.41M
    res = _mm256_srai_epi32(wt_res, DIST_PRECISION_BITS);
1132
17.0M
  } else {
1133
17.0M
    const __m256i wt_res = _mm256_add_epi32(*data_ref_0, *res_unsigned);
1134
17.0M
    res = _mm256_srai_epi32(wt_res, 1);
1135
17.0M
  }
1136
19.4M
  return res;
1137
19.4M
}
1138
1139
static inline __m256i highbd_convolve_rounding(
1140
    const __m256i *const res_unsigned, const __m256i *const offset_const,
1141
19.4M
    const __m256i *const round_const, const int round_shift) {
1142
19.4M
  const __m256i res_signed = _mm256_sub_epi32(*res_unsigned, *offset_const);
1143
19.4M
  const __m256i res_round = _mm256_srai_epi32(
1144
19.4M
      _mm256_add_epi32(res_signed, *round_const), round_shift);
1145
1146
19.4M
  return res_round;
1147
19.4M
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:highbd_convolve_rounding
Unexecuted instantiation: highbd_convolve_avx2.c:highbd_convolve_rounding
Unexecuted instantiation: convolve_2d_avx2.c:highbd_convolve_rounding
Unexecuted instantiation: convolve_avx2.c:highbd_convolve_rounding
Unexecuted instantiation: jnt_convolve_avx2.c:highbd_convolve_rounding
Unexecuted instantiation: wiener_convolve_avx2.c:highbd_convolve_rounding
Unexecuted instantiation: highbd_convolve_2d_avx2.c:highbd_convolve_rounding
highbd_jnt_convolve_avx2.c:highbd_convolve_rounding
Line
Count
Source
1141
19.4M
    const __m256i *const round_const, const int round_shift) {
1142
19.4M
  const __m256i res_signed = _mm256_sub_epi32(*res_unsigned, *offset_const);
1143
19.4M
  const __m256i res_round = _mm256_srai_epi32(
1144
19.4M
      _mm256_add_epi32(res_signed, *round_const), round_shift);
1145
1146
19.4M
  return res_round;
1147
19.4M
}
1148
1149
5.55M
static inline __m256i round_sr_x_avx2(const __m256i data) {
1150
  // we can perform the below steps:
1151
  // data = (data + 2) >> 2
1152
  // data = (data + 8) >> 4,
1153
  // in the below form as well
1154
  // data = (data + 0x22) >> 6
1155
5.55M
  const __m256i value = _mm256_set1_epi16(34);
1156
5.55M
  const __m256i reg = _mm256_add_epi16(data, value);
1157
5.55M
  return _mm256_srai_epi16(reg, 6);
1158
5.55M
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:round_sr_x_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:round_sr_x_avx2
Unexecuted instantiation: convolve_2d_avx2.c:round_sr_x_avx2
convolve_avx2.c:round_sr_x_avx2
Line
Count
Source
1149
5.55M
static inline __m256i round_sr_x_avx2(const __m256i data) {
1150
  // we can perform the below steps:
1151
  // data = (data + 2) >> 2
1152
  // data = (data + 8) >> 4,
1153
  // in the below form as well
1154
  // data = (data + 0x22) >> 6
1155
5.55M
  const __m256i value = _mm256_set1_epi16(34);
1156
5.55M
  const __m256i reg = _mm256_add_epi16(data, value);
1157
5.55M
  return _mm256_srai_epi16(reg, 6);
1158
5.55M
}
Unexecuted instantiation: jnt_convolve_avx2.c:round_sr_x_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:round_sr_x_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:round_sr_x_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:round_sr_x_avx2
1159
1160
static inline __m128i convolve_x_4tap_4x2_ssse3(const uint8_t *const src,
1161
                                                const ptrdiff_t src_stride,
1162
429k
                                                __m128i *const coeffs) {
1163
429k
  __m128i data[2];
1164
429k
  const __m128i f_l0 = _mm_load_si128((__m128i const *)filt1_global_sse2);
1165
429k
  const __m128i f_l1 = _mm_load_si128((__m128i const *)filt2_global_sse2);
1166
429k
  const __m128i src_1 =
1167
429k
      load_8bit_8x2_to_1_reg_sse2(src, (int)(sizeof(*src) * src_stride));
1168
1169
429k
  data[0] = _mm_shuffle_epi8(src_1, f_l0);
1170
429k
  data[1] = _mm_shuffle_epi8(src_1, f_l1);
1171
429k
  return convolve_lowbd_4tap_ssse3(data, coeffs);
1172
429k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_x_4tap_4x2_ssse3
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_x_4tap_4x2_ssse3
Unexecuted instantiation: convolve_2d_avx2.c:convolve_x_4tap_4x2_ssse3
convolve_avx2.c:convolve_x_4tap_4x2_ssse3
Line
Count
Source
1162
429k
                                                __m128i *const coeffs) {
1163
429k
  __m128i data[2];
1164
429k
  const __m128i f_l0 = _mm_load_si128((__m128i const *)filt1_global_sse2);
1165
429k
  const __m128i f_l1 = _mm_load_si128((__m128i const *)filt2_global_sse2);
1166
429k
  const __m128i src_1 =
1167
429k
      load_8bit_8x2_to_1_reg_sse2(src, (int)(sizeof(*src) * src_stride));
1168
1169
429k
  data[0] = _mm_shuffle_epi8(src_1, f_l0);
1170
429k
  data[1] = _mm_shuffle_epi8(src_1, f_l1);
1171
429k
  return convolve_lowbd_4tap_ssse3(data, coeffs);
1172
429k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_x_4tap_4x2_ssse3
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_x_4tap_4x2_ssse3
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_x_4tap_4x2_ssse3
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_x_4tap_4x2_ssse3
1173
1174
560k
static inline __m128i round_sr_x_ssse3(const __m128i data) {
1175
560k
  const __m128i val = _mm_set1_epi16(34);
1176
560k
  const __m128i reg = _mm_add_epi16(data, val);
1177
560k
  return _mm_srai_epi16(reg, 6);
1178
560k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:round_sr_x_ssse3
Unexecuted instantiation: highbd_convolve_avx2.c:round_sr_x_ssse3
Unexecuted instantiation: convolve_2d_avx2.c:round_sr_x_ssse3
convolve_avx2.c:round_sr_x_ssse3
Line
Count
Source
1174
560k
static inline __m128i round_sr_x_ssse3(const __m128i data) {
1175
560k
  const __m128i val = _mm_set1_epi16(34);
1176
560k
  const __m128i reg = _mm_add_epi16(data, val);
1177
560k
  return _mm_srai_epi16(reg, 6);
1178
560k
}
Unexecuted instantiation: jnt_convolve_avx2.c:round_sr_x_ssse3
Unexecuted instantiation: wiener_convolve_avx2.c:round_sr_x_ssse3
Unexecuted instantiation: highbd_convolve_2d_avx2.c:round_sr_x_ssse3
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:round_sr_x_ssse3
1179
1180
static inline void store_8bit_4x2_sse2(const __m128i reg, uint8_t *const dst,
1181
985k
                                       const ptrdiff_t dst_stride) {
1182
985k
  xx_storel_32(dst, reg);
1183
985k
  *(uint32_t *)(dst + dst_stride) =
1184
985k
      ((uint32_t)_mm_extract_epi16(reg, 3) << 16) | _mm_extract_epi16(reg, 2);
1185
985k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:store_8bit_4x2_sse2
Unexecuted instantiation: highbd_convolve_avx2.c:store_8bit_4x2_sse2
Unexecuted instantiation: convolve_2d_avx2.c:store_8bit_4x2_sse2
convolve_avx2.c:store_8bit_4x2_sse2
Line
Count
Source
1181
985k
                                       const ptrdiff_t dst_stride) {
1182
985k
  xx_storel_32(dst, reg);
1183
985k
  *(uint32_t *)(dst + dst_stride) =
1184
985k
      ((uint32_t)_mm_extract_epi16(reg, 3) << 16) | _mm_extract_epi16(reg, 2);
1185
985k
}
Unexecuted instantiation: jnt_convolve_avx2.c:store_8bit_4x2_sse2
Unexecuted instantiation: wiener_convolve_avx2.c:store_8bit_4x2_sse2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:store_8bit_4x2_sse2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:store_8bit_4x2_sse2
1186
1187
static inline void pack_store_u8_4x2_sse2(const __m128i reg, uint8_t *const dst,
1188
985k
                                          const ptrdiff_t dst_stride) {
1189
985k
  const __m128i reg_pack = _mm_packus_epi16(reg, reg);
1190
985k
  store_8bit_4x2_sse2(reg_pack, dst, dst_stride);
1191
985k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:pack_store_u8_4x2_sse2
Unexecuted instantiation: highbd_convolve_avx2.c:pack_store_u8_4x2_sse2
Unexecuted instantiation: convolve_2d_avx2.c:pack_store_u8_4x2_sse2
convolve_avx2.c:pack_store_u8_4x2_sse2
Line
Count
Source
1188
985k
                                          const ptrdiff_t dst_stride) {
1189
985k
  const __m128i reg_pack = _mm_packus_epi16(reg, reg);
1190
985k
  store_8bit_4x2_sse2(reg_pack, dst, dst_stride);
1191
985k
}
Unexecuted instantiation: jnt_convolve_avx2.c:pack_store_u8_4x2_sse2
Unexecuted instantiation: wiener_convolve_avx2.c:pack_store_u8_4x2_sse2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:pack_store_u8_4x2_sse2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:pack_store_u8_4x2_sse2
1192
1193
static inline __m128i convolve_x_4tap_2x2_ssse3(const uint8_t *const src,
1194
                                                const ptrdiff_t src_stride,
1195
74.3k
                                                __m128i *const coeffs) {
1196
74.3k
  __m128i data[2];
1197
74.3k
  const __m128i f_0 = _mm_load_si128((__m128i const *)filt3_global_sse2);
1198
74.3k
  const __m128i f_1 = _mm_load_si128((__m128i const *)filt4_global_sse2);
1199
74.3k
  const __m128i reg =
1200
74.3k
      load_8bit_8x2_to_1_reg_sse2(src, (int)(sizeof(*src) * src_stride));
1201
1202
74.3k
  data[0] = _mm_shuffle_epi8(reg, f_0);
1203
74.3k
  data[1] = _mm_shuffle_epi8(reg, f_1);
1204
74.3k
  return convolve_lowbd_4tap_ssse3(data, coeffs);
1205
74.3k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_x_4tap_2x2_ssse3
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_x_4tap_2x2_ssse3
Unexecuted instantiation: convolve_2d_avx2.c:convolve_x_4tap_2x2_ssse3
convolve_avx2.c:convolve_x_4tap_2x2_ssse3
Line
Count
Source
1195
74.3k
                                                __m128i *const coeffs) {
1196
74.3k
  __m128i data[2];
1197
74.3k
  const __m128i f_0 = _mm_load_si128((__m128i const *)filt3_global_sse2);
1198
74.3k
  const __m128i f_1 = _mm_load_si128((__m128i const *)filt4_global_sse2);
1199
74.3k
  const __m128i reg =
1200
74.3k
      load_8bit_8x2_to_1_reg_sse2(src, (int)(sizeof(*src) * src_stride));
1201
1202
74.3k
  data[0] = _mm_shuffle_epi8(reg, f_0);
1203
74.3k
  data[1] = _mm_shuffle_epi8(reg, f_1);
1204
74.3k
  return convolve_lowbd_4tap_ssse3(data, coeffs);
1205
74.3k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_x_4tap_2x2_ssse3
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_x_4tap_2x2_ssse3
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_x_4tap_2x2_ssse3
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_x_4tap_2x2_ssse3
1206
1207
static inline void pack_store_u8_2x2_sse2(const __m128i reg, uint8_t *const dst,
1208
173k
                                          const ptrdiff_t dst_stride) {
1209
173k
  const __m128i data = _mm_packus_epi16(reg, reg);
1210
173k
  *(int16_t *)dst = (int16_t)_mm_cvtsi128_si32(data);
1211
173k
  *(int16_t *)(dst + dst_stride) = (int16_t)_mm_extract_epi16(data, 1);
1212
173k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:pack_store_u8_2x2_sse2
Unexecuted instantiation: highbd_convolve_avx2.c:pack_store_u8_2x2_sse2
Unexecuted instantiation: convolve_2d_avx2.c:pack_store_u8_2x2_sse2
convolve_avx2.c:pack_store_u8_2x2_sse2
Line
Count
Source
1208
173k
                                          const ptrdiff_t dst_stride) {
1209
173k
  const __m128i data = _mm_packus_epi16(reg, reg);
1210
173k
  *(int16_t *)dst = (int16_t)_mm_cvtsi128_si32(data);
1211
  *(int16_t *)(dst + dst_stride) = (int16_t)_mm_extract_epi16(data, 1);
1212
173k
}
Unexecuted instantiation: jnt_convolve_avx2.c:pack_store_u8_2x2_sse2
Unexecuted instantiation: wiener_convolve_avx2.c:pack_store_u8_2x2_sse2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:pack_store_u8_2x2_sse2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:pack_store_u8_2x2_sse2
1213
1214
static inline __m128i convolve_x_2tap_ssse3(const __m128i *data,
1215
56.5k
                                            const __m128i *coeff) {
1216
56.5k
  return _mm_maddubs_epi16(data[0], coeff[0]);
1217
56.5k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_x_2tap_ssse3
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_x_2tap_ssse3
Unexecuted instantiation: convolve_2d_avx2.c:convolve_x_2tap_ssse3
convolve_avx2.c:convolve_x_2tap_ssse3
Line
Count
Source
1215
56.5k
                                            const __m128i *coeff) {
1216
56.5k
  return _mm_maddubs_epi16(data[0], coeff[0]);
1217
56.5k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_x_2tap_ssse3
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_x_2tap_ssse3
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_x_2tap_ssse3
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_x_2tap_ssse3
1218
1219
static inline __m128i load8_x_4x2_sse4(const void *const src,
1220
10.6k
                                       const ptrdiff_t offset) {
1221
10.6k
  const __m128i s = _mm_cvtsi32_si128(loadu_int32(src));
1222
10.6k
  return _mm_insert_epi32(s, loadu_int32((uint8_t *)src + offset), 1);
1223
10.6k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:load8_x_4x2_sse4
Unexecuted instantiation: highbd_convolve_avx2.c:load8_x_4x2_sse4
Unexecuted instantiation: convolve_2d_avx2.c:load8_x_4x2_sse4
convolve_avx2.c:load8_x_4x2_sse4
Line
Count
Source
1220
10.6k
                                       const ptrdiff_t offset) {
1221
10.6k
  const __m128i s = _mm_cvtsi32_si128(loadu_int32(src));
1222
  return _mm_insert_epi32(s, loadu_int32((uint8_t *)src + offset), 1);
1223
10.6k
}
Unexecuted instantiation: jnt_convolve_avx2.c:load8_x_4x2_sse4
Unexecuted instantiation: wiener_convolve_avx2.c:load8_x_4x2_sse4
Unexecuted instantiation: highbd_convolve_2d_avx2.c:load8_x_4x2_sse4
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:load8_x_4x2_sse4
1224
1225
static inline __m128i load_x_u8_4x2_sse4(const uint8_t *const src,
1226
10.6k
                                         const ptrdiff_t stride) {
1227
10.6k
  return load8_x_4x2_sse4(src, sizeof(*src) * stride);
1228
10.6k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:load_x_u8_4x2_sse4
Unexecuted instantiation: highbd_convolve_avx2.c:load_x_u8_4x2_sse4
Unexecuted instantiation: convolve_2d_avx2.c:load_x_u8_4x2_sse4
convolve_avx2.c:load_x_u8_4x2_sse4
Line
Count
Source
1226
10.6k
                                         const ptrdiff_t stride) {
1227
10.6k
  return load8_x_4x2_sse4(src, sizeof(*src) * stride);
1228
10.6k
}
Unexecuted instantiation: jnt_convolve_avx2.c:load_x_u8_4x2_sse4
Unexecuted instantiation: wiener_convolve_avx2.c:load_x_u8_4x2_sse4
Unexecuted instantiation: highbd_convolve_2d_avx2.c:load_x_u8_4x2_sse4
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:load_x_u8_4x2_sse4
1229
1230
static inline __m128i convolve_x_2tap_2x2_ssse3(const uint8_t *const src,
1231
                                                const ptrdiff_t stride,
1232
4.21k
                                                const __m128i *coeffs) {
1233
4.21k
  const __m128i flt = _mm_load_si128((__m128i const *)filt5_global_sse2);
1234
4.21k
  const __m128i reg = load_x_u8_4x2_sse4(src, stride);
1235
4.21k
  const __m128i data = _mm_shuffle_epi8(reg, flt);
1236
4.21k
  return convolve_x_2tap_ssse3(&data, coeffs);
1237
4.21k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_x_2tap_2x2_ssse3
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_x_2tap_2x2_ssse3
Unexecuted instantiation: convolve_2d_avx2.c:convolve_x_2tap_2x2_ssse3
convolve_avx2.c:convolve_x_2tap_2x2_ssse3
Line
Count
Source
1232
4.21k
                                                const __m128i *coeffs) {
1233
4.21k
  const __m128i flt = _mm_load_si128((__m128i const *)filt5_global_sse2);
1234
4.21k
  const __m128i reg = load_x_u8_4x2_sse4(src, stride);
1235
4.21k
  const __m128i data = _mm_shuffle_epi8(reg, flt);
1236
4.21k
  return convolve_x_2tap_ssse3(&data, coeffs);
1237
4.21k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_x_2tap_2x2_ssse3
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_x_2tap_2x2_ssse3
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_x_2tap_2x2_ssse3
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_x_2tap_2x2_ssse3
1238
1239
static inline __m128i convolve_x_2tap_4x2_ssse3(const uint8_t *const src,
1240
                                                const ptrdiff_t stride,
1241
17.7k
                                                const __m128i *coeffs) {
1242
17.7k
  const __m128i flt = _mm_load_si128((__m128i const *)filt1_global_sse2);
1243
17.7k
  const __m128i data =
1244
17.7k
      load_8bit_8x2_to_1_reg_sse2(src, (int)(sizeof(*src) * stride));
1245
17.7k
  const __m128i res = _mm_shuffle_epi8(data, flt);
1246
17.7k
  return convolve_x_2tap_ssse3(&res, coeffs);
1247
17.7k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_x_2tap_4x2_ssse3
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_x_2tap_4x2_ssse3
Unexecuted instantiation: convolve_2d_avx2.c:convolve_x_2tap_4x2_ssse3
convolve_avx2.c:convolve_x_2tap_4x2_ssse3
Line
Count
Source
1241
17.7k
                                                const __m128i *coeffs) {
1242
17.7k
  const __m128i flt = _mm_load_si128((__m128i const *)filt1_global_sse2);
1243
17.7k
  const __m128i data =
1244
17.7k
      load_8bit_8x2_to_1_reg_sse2(src, (int)(sizeof(*src) * stride));
1245
17.7k
  const __m128i res = _mm_shuffle_epi8(data, flt);
1246
17.7k
  return convolve_x_2tap_ssse3(&res, coeffs);
1247
17.7k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_x_2tap_4x2_ssse3
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_x_2tap_4x2_ssse3
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_x_2tap_4x2_ssse3
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_x_2tap_4x2_ssse3
1248
1249
static inline void convolve_x_2tap_8x2_ssse3(const uint8_t *const src,
1250
                                             const ptrdiff_t stride,
1251
                                             const __m128i *coeffs,
1252
17.3k
                                             __m128i *data) {
1253
17.3k
  __m128i res[2];
1254
17.3k
  const __m128i reg_00 = _mm_loadu_si128((__m128i *)src);
1255
17.3k
  const __m128i reg_10 = _mm_loadu_si128((__m128i *)(src + stride));
1256
17.3k
  const __m128i reg_01 = _mm_srli_si128(reg_00, 1);
1257
17.3k
  const __m128i reg_11 = _mm_srli_si128(reg_10, 1);
1258
17.3k
  res[0] = _mm_unpacklo_epi8(reg_00, reg_01);
1259
17.3k
  res[1] = _mm_unpacklo_epi8(reg_10, reg_11);
1260
1261
17.3k
  data[0] = convolve_x_2tap_ssse3(&res[0], coeffs);
1262
17.3k
  data[1] = convolve_x_2tap_ssse3(&res[1], coeffs);
1263
17.3k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_x_2tap_8x2_ssse3
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_x_2tap_8x2_ssse3
Unexecuted instantiation: convolve_2d_avx2.c:convolve_x_2tap_8x2_ssse3
convolve_avx2.c:convolve_x_2tap_8x2_ssse3
Line
Count
Source
1252
17.3k
                                             __m128i *data) {
1253
17.3k
  __m128i res[2];
1254
17.3k
  const __m128i reg_00 = _mm_loadu_si128((__m128i *)src);
1255
17.3k
  const __m128i reg_10 = _mm_loadu_si128((__m128i *)(src + stride));
1256
17.3k
  const __m128i reg_01 = _mm_srli_si128(reg_00, 1);
1257
17.3k
  const __m128i reg_11 = _mm_srli_si128(reg_10, 1);
1258
17.3k
  res[0] = _mm_unpacklo_epi8(reg_00, reg_01);
1259
17.3k
  res[1] = _mm_unpacklo_epi8(reg_10, reg_11);
1260
1261
17.3k
  data[0] = convolve_x_2tap_ssse3(&res[0], coeffs);
1262
17.3k
  data[1] = convolve_x_2tap_ssse3(&res[1], coeffs);
1263
17.3k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_x_2tap_8x2_ssse3
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_x_2tap_8x2_ssse3
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_x_2tap_8x2_ssse3
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_x_2tap_8x2_ssse3
1264
1265
static inline __m256i loadu_x_8bit_16x2_avx2(const void *const src,
1266
937k
                                             const ptrdiff_t offset) {
1267
937k
  const __m128i reg0 = _mm_loadu_si128((__m128i *)src);
1268
937k
  const __m128i reg1 = _mm_loadu_si128((__m128i *)((uint8_t *)src + offset));
1269
937k
  return _mm256_setr_m128i(reg0, reg1);
1270
937k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:loadu_x_8bit_16x2_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:loadu_x_8bit_16x2_avx2
Unexecuted instantiation: convolve_2d_avx2.c:loadu_x_8bit_16x2_avx2
convolve_avx2.c:loadu_x_8bit_16x2_avx2
Line
Count
Source
1266
937k
                                             const ptrdiff_t offset) {
1267
937k
  const __m128i reg0 = _mm_loadu_si128((__m128i *)src);
1268
937k
  const __m128i reg1 = _mm_loadu_si128((__m128i *)((uint8_t *)src + offset));
1269
937k
  return _mm256_setr_m128i(reg0, reg1);
1270
937k
}
Unexecuted instantiation: jnt_convolve_avx2.c:loadu_x_8bit_16x2_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:loadu_x_8bit_16x2_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:loadu_x_8bit_16x2_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:loadu_x_8bit_16x2_avx2
1271
1272
static inline __m256i convolve_x_2tap_avx2(const __m256i *data,
1273
272k
                                           const __m256i *coeffs) {
1274
272k
  return _mm256_maddubs_epi16(data[0], coeffs[0]);
1275
272k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_x_2tap_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_x_2tap_avx2
Unexecuted instantiation: convolve_2d_avx2.c:convolve_x_2tap_avx2
convolve_avx2.c:convolve_x_2tap_avx2
Line
Count
Source
1273
272k
                                           const __m256i *coeffs) {
1274
272k
  return _mm256_maddubs_epi16(data[0], coeffs[0]);
1275
272k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_x_2tap_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_x_2tap_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_x_2tap_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_x_2tap_avx2
1276
1277
static inline void convolve_x_2tap_16x2_avx2(const uint8_t *const src,
1278
                                             const ptrdiff_t stride,
1279
                                             const __m256i *coeffs,
1280
11.8k
                                             __m256i *data) {
1281
11.8k
  const __m256i reg0 = loadu_x_8bit_16x2_avx2(src, stride);
1282
11.8k
  const __m256i reg1 = loadu_x_8bit_16x2_avx2(src + 1, stride);
1283
11.8k
  const __m256i res0 = _mm256_unpacklo_epi8(reg0, reg1);
1284
11.8k
  const __m256i res1 = _mm256_unpackhi_epi8(reg0, reg1);
1285
11.8k
  data[0] = convolve_x_2tap_avx2(&res0, coeffs);
1286
11.8k
  data[1] = convolve_x_2tap_avx2(&res1, coeffs);
1287
11.8k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_x_2tap_16x2_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_x_2tap_16x2_avx2
Unexecuted instantiation: convolve_2d_avx2.c:convolve_x_2tap_16x2_avx2
convolve_avx2.c:convolve_x_2tap_16x2_avx2
Line
Count
Source
1280
11.8k
                                             __m256i *data) {
1281
11.8k
  const __m256i reg0 = loadu_x_8bit_16x2_avx2(src, stride);
1282
11.8k
  const __m256i reg1 = loadu_x_8bit_16x2_avx2(src + 1, stride);
1283
11.8k
  const __m256i res0 = _mm256_unpacklo_epi8(reg0, reg1);
1284
11.8k
  const __m256i res1 = _mm256_unpackhi_epi8(reg0, reg1);
1285
11.8k
  data[0] = convolve_x_2tap_avx2(&res0, coeffs);
1286
11.8k
  data[1] = convolve_x_2tap_avx2(&res1, coeffs);
1287
11.8k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_x_2tap_16x2_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_x_2tap_16x2_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_x_2tap_16x2_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_x_2tap_16x2_avx2
1288
1289
static inline void store_u8_16x2_avx2(const __m256i src, uint8_t *const dst,
1290
2.35M
                                      const ptrdiff_t stride) {
1291
2.35M
  const __m128i reg0 = _mm256_castsi256_si128(src);
1292
2.35M
  const __m128i reg1 = _mm256_extracti128_si256(src, 1);
1293
2.35M
  _mm_storeu_si128((__m128i *)dst, reg0);
1294
2.35M
  _mm_storeu_si128((__m128i *)((uint8_t *)dst + stride), reg1);
1295
2.35M
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:store_u8_16x2_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:store_u8_16x2_avx2
Unexecuted instantiation: convolve_2d_avx2.c:store_u8_16x2_avx2
convolve_avx2.c:store_u8_16x2_avx2
Line
Count
Source
1290
2.35M
                                      const ptrdiff_t stride) {
1291
2.35M
  const __m128i reg0 = _mm256_castsi256_si128(src);
1292
  const __m128i reg1 = _mm256_extracti128_si256(src, 1);
1293
2.35M
  _mm_storeu_si128((__m128i *)dst, reg0);
1294
2.35M
  _mm_storeu_si128((__m128i *)((uint8_t *)dst + stride), reg1);
1295
2.35M
}
Unexecuted instantiation: jnt_convolve_avx2.c:store_u8_16x2_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:store_u8_16x2_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:store_u8_16x2_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:store_u8_16x2_avx2
1296
1297
static inline void store_u8_8x2_avx2(const __m256i src, uint8_t *const dst,
1298
625k
                                     const ptrdiff_t stride) {
1299
625k
  const __m128i reg0 = _mm256_castsi256_si128(src);
1300
625k
  const __m128i reg1 = _mm256_extracti128_si256(src, 1);
1301
625k
  _mm_storel_epi64((__m128i *)dst, reg0);
1302
625k
  _mm_storel_epi64((__m128i *)(dst + stride), reg1);
1303
625k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:store_u8_8x2_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:store_u8_8x2_avx2
Unexecuted instantiation: convolve_2d_avx2.c:store_u8_8x2_avx2
convolve_avx2.c:store_u8_8x2_avx2
Line
Count
Source
1298
625k
                                     const ptrdiff_t stride) {
1299
625k
  const __m128i reg0 = _mm256_castsi256_si128(src);
1300
  const __m128i reg1 = _mm256_extracti128_si256(src, 1);
1301
625k
  _mm_storel_epi64((__m128i *)dst, reg0);
1302
625k
  _mm_storel_epi64((__m128i *)(dst + stride), reg1);
1303
625k
}
Unexecuted instantiation: jnt_convolve_avx2.c:store_u8_8x2_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:store_u8_8x2_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:store_u8_8x2_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:store_u8_8x2_avx2
1304
1305
static inline void pack_store_16x2_avx2(const __m256i data0,
1306
                                        const __m256i data1, uint8_t *const dst,
1307
2.35M
                                        const ptrdiff_t stride) {
1308
2.35M
  const __m256i res = _mm256_packus_epi16(data0, data1);
1309
2.35M
  store_u8_16x2_avx2(res, dst, stride);
1310
2.35M
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:pack_store_16x2_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:pack_store_16x2_avx2
Unexecuted instantiation: convolve_2d_avx2.c:pack_store_16x2_avx2
convolve_avx2.c:pack_store_16x2_avx2
Line
Count
Source
1307
2.35M
                                        const ptrdiff_t stride) {
1308
2.35M
  const __m256i res = _mm256_packus_epi16(data0, data1);
1309
2.35M
  store_u8_16x2_avx2(res, dst, stride);
1310
2.35M
}
Unexecuted instantiation: jnt_convolve_avx2.c:pack_store_16x2_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:pack_store_16x2_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:pack_store_16x2_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:pack_store_16x2_avx2
1311
1312
static inline void pack_store_8x2_avx2(const __m256i data, uint8_t *const dst,
1313
625k
                                       const ptrdiff_t stride) {
1314
625k
  const __m256i res = _mm256_packus_epi16(data, data);
1315
625k
  store_u8_8x2_avx2(res, dst, stride);
1316
625k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:pack_store_8x2_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:pack_store_8x2_avx2
Unexecuted instantiation: convolve_2d_avx2.c:pack_store_8x2_avx2
convolve_avx2.c:pack_store_8x2_avx2
Line
Count
Source
1313
625k
                                       const ptrdiff_t stride) {
1314
625k
  const __m256i res = _mm256_packus_epi16(data, data);
1315
625k
  store_u8_8x2_avx2(res, dst, stride);
1316
625k
}
Unexecuted instantiation: jnt_convolve_avx2.c:pack_store_8x2_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:pack_store_8x2_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:pack_store_8x2_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:pack_store_8x2_avx2
1317
1318
static inline void round_pack_store_16x2_avx2(const __m256i *data,
1319
                                              uint8_t *const dst,
1320
468k
                                              const ptrdiff_t dst_stride) {
1321
468k
  __m256i reg[2];
1322
1323
468k
  reg[0] = round_sr_x_avx2(data[0]);
1324
468k
  reg[1] = round_sr_x_avx2(data[1]);
1325
468k
  pack_store_16x2_avx2(reg[0], reg[1], dst, dst_stride);
1326
468k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:round_pack_store_16x2_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:round_pack_store_16x2_avx2
Unexecuted instantiation: convolve_2d_avx2.c:round_pack_store_16x2_avx2
convolve_avx2.c:round_pack_store_16x2_avx2
Line
Count
Source
1320
468k
                                              const ptrdiff_t dst_stride) {
1321
468k
  __m256i reg[2];
1322
1323
468k
  reg[0] = round_sr_x_avx2(data[0]);
1324
468k
  reg[1] = round_sr_x_avx2(data[1]);
1325
468k
  pack_store_16x2_avx2(reg[0], reg[1], dst, dst_stride);
1326
468k
}
Unexecuted instantiation: jnt_convolve_avx2.c:round_pack_store_16x2_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:round_pack_store_16x2_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:round_pack_store_16x2_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:round_pack_store_16x2_avx2
1327
1328
static inline void convolve_x_2tap_32_avx2(const uint8_t *const src,
1329
                                           const __m256i *coeffs,
1330
124k
                                           __m256i *data) {
1331
124k
  const __m256i res0 = _mm256_loadu_si256((__m256i *)src);
1332
124k
  const __m256i res1 = _mm256_loadu_si256((__m256i *)(src + 1));
1333
124k
  const __m256i reg0 = _mm256_unpacklo_epi8(res0, res1);
1334
124k
  const __m256i reg1 = _mm256_unpackhi_epi8(res0, res1);
1335
1336
124k
  data[0] = convolve_x_2tap_avx2(&reg0, coeffs);
1337
124k
  data[1] = convolve_x_2tap_avx2(&reg1, coeffs);
1338
124k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_x_2tap_32_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_x_2tap_32_avx2
Unexecuted instantiation: convolve_2d_avx2.c:convolve_x_2tap_32_avx2
convolve_avx2.c:convolve_x_2tap_32_avx2
Line
Count
Source
1330
124k
                                           __m256i *data) {
1331
124k
  const __m256i res0 = _mm256_loadu_si256((__m256i *)src);
1332
124k
  const __m256i res1 = _mm256_loadu_si256((__m256i *)(src + 1));
1333
124k
  const __m256i reg0 = _mm256_unpacklo_epi8(res0, res1);
1334
124k
  const __m256i reg1 = _mm256_unpackhi_epi8(res0, res1);
1335
1336
124k
  data[0] = convolve_x_2tap_avx2(&reg0, coeffs);
1337
124k
  data[1] = convolve_x_2tap_avx2(&reg1, coeffs);
1338
124k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_x_2tap_32_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_x_2tap_32_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_x_2tap_32_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_x_2tap_32_avx2
1339
1340
static inline void pack_store_32_avx2(const __m256i data0, const __m256i data1,
1341
2.09M
                                      uint8_t *const dst) {
1342
2.09M
  const __m256i reg = _mm256_packus_epi16(data0, data1);
1343
2.09M
  _mm256_storeu_si256((__m256i *)dst, reg);
1344
2.09M
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:pack_store_32_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:pack_store_32_avx2
Unexecuted instantiation: convolve_2d_avx2.c:pack_store_32_avx2
convolve_avx2.c:pack_store_32_avx2
Line
Count
Source
1341
2.09M
                                      uint8_t *const dst) {
1342
2.09M
  const __m256i reg = _mm256_packus_epi16(data0, data1);
1343
2.09M
  _mm256_storeu_si256((__m256i *)dst, reg);
1344
2.09M
}
Unexecuted instantiation: jnt_convolve_avx2.c:pack_store_32_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:pack_store_32_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:pack_store_32_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:pack_store_32_avx2
1345
1346
static inline void round_pack_store_32_avx2(const __m256i *data,
1347
1.74M
                                            uint8_t *const dst) {
1348
1.74M
  __m256i reg[2];
1349
1350
1.74M
  reg[0] = round_sr_x_avx2(data[0]);
1351
1.74M
  reg[1] = round_sr_x_avx2(data[1]);
1352
1.74M
  pack_store_32_avx2(reg[0], reg[1], dst);
1353
1.74M
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:round_pack_store_32_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:round_pack_store_32_avx2
Unexecuted instantiation: convolve_2d_avx2.c:round_pack_store_32_avx2
convolve_avx2.c:round_pack_store_32_avx2
Line
Count
Source
1347
1.74M
                                            uint8_t *const dst) {
1348
1.74M
  __m256i reg[2];
1349
1350
1.74M
  reg[0] = round_sr_x_avx2(data[0]);
1351
1.74M
  reg[1] = round_sr_x_avx2(data[1]);
1352
1.74M
  pack_store_32_avx2(reg[0], reg[1], dst);
1353
1.74M
}
Unexecuted instantiation: jnt_convolve_avx2.c:round_pack_store_32_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:round_pack_store_32_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:round_pack_store_32_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:round_pack_store_32_avx2
1354
1355
static inline void convolve_round_2tap_32_avx2(const uint8_t *const src,
1356
                                               const __m256i *coeffs,
1357
124k
                                               uint8_t *const dst) {
1358
124k
  __m256i data[2];
1359
1360
124k
  convolve_x_2tap_32_avx2(src, coeffs, data);
1361
124k
  round_pack_store_32_avx2(data, dst);
1362
124k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_round_2tap_32_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_round_2tap_32_avx2
Unexecuted instantiation: convolve_2d_avx2.c:convolve_round_2tap_32_avx2
convolve_avx2.c:convolve_round_2tap_32_avx2
Line
Count
Source
1357
124k
                                               uint8_t *const dst) {
1358
124k
  __m256i data[2];
1359
1360
124k
  convolve_x_2tap_32_avx2(src, coeffs, data);
1361
124k
  round_pack_store_32_avx2(data, dst);
1362
124k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_round_2tap_32_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_round_2tap_32_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_round_2tap_32_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_round_2tap_32_avx2
1363
1364
static inline void load_avg_store_2tap_32_avx2(const uint8_t *const src,
1365
116k
                                               uint8_t *const dst) {
1366
116k
  const __m256i res0 = _mm256_loadu_si256((__m256i *)src);
1367
116k
  const __m256i res1 = _mm256_loadu_si256((__m256i *)(src + 1));
1368
116k
  const __m256i data = _mm256_avg_epu8(res0, res1);
1369
116k
  _mm256_storeu_si256((__m256i *)dst, data);
1370
116k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:load_avg_store_2tap_32_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:load_avg_store_2tap_32_avx2
Unexecuted instantiation: convolve_2d_avx2.c:load_avg_store_2tap_32_avx2
convolve_avx2.c:load_avg_store_2tap_32_avx2
Line
Count
Source
1365
116k
                                               uint8_t *const dst) {
1366
116k
  const __m256i res0 = _mm256_loadu_si256((__m256i *)src);
1367
116k
  const __m256i res1 = _mm256_loadu_si256((__m256i *)(src + 1));
1368
116k
  const __m256i data = _mm256_avg_epu8(res0, res1);
1369
116k
  _mm256_storeu_si256((__m256i *)dst, data);
1370
116k
}
Unexecuted instantiation: jnt_convolve_avx2.c:load_avg_store_2tap_32_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:load_avg_store_2tap_32_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:load_avg_store_2tap_32_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:load_avg_store_2tap_32_avx2
1371
1372
static inline __m256i load_convolve_8tap_8x2_avx2(const uint8_t *const src,
1373
                                                  const ptrdiff_t stride,
1374
                                                  const __m256i *coeffs,
1375
47.0k
                                                  const __m256i *flt) {
1376
47.0k
  const __m256i res = loadu_x_8bit_16x2_avx2(src, stride);
1377
47.0k
  return convolve_lowbd_x(res, coeffs, flt);
1378
47.0k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:load_convolve_8tap_8x2_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:load_convolve_8tap_8x2_avx2
Unexecuted instantiation: convolve_2d_avx2.c:load_convolve_8tap_8x2_avx2
convolve_avx2.c:load_convolve_8tap_8x2_avx2
Line
Count
Source
1375
47.0k
                                                  const __m256i *flt) {
1376
47.0k
  const __m256i res = loadu_x_8bit_16x2_avx2(src, stride);
1377
47.0k
  return convolve_lowbd_x(res, coeffs, flt);
1378
47.0k
}
Unexecuted instantiation: jnt_convolve_avx2.c:load_convolve_8tap_8x2_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:load_convolve_8tap_8x2_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:load_convolve_8tap_8x2_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:load_convolve_8tap_8x2_avx2
1379
1380
static inline void load_convolve_8tap_16x2_avx2(const uint8_t *const src,
1381
                                                const int32_t src_stride,
1382
                                                const __m256i *coeffs,
1383
                                                const __m256i *flt,
1384
23.5k
                                                __m256i *reg) {
1385
23.5k
  reg[0] = load_convolve_8tap_8x2_avx2(src + 0, src_stride, coeffs, flt);
1386
23.5k
  reg[1] = load_convolve_8tap_8x2_avx2(src + 8, src_stride, coeffs, flt);
1387
23.5k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:load_convolve_8tap_16x2_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:load_convolve_8tap_16x2_avx2
Unexecuted instantiation: convolve_2d_avx2.c:load_convolve_8tap_16x2_avx2
convolve_avx2.c:load_convolve_8tap_16x2_avx2
Line
Count
Source
1384
23.5k
                                                __m256i *reg) {
1385
23.5k
  reg[0] = load_convolve_8tap_8x2_avx2(src + 0, src_stride, coeffs, flt);
1386
23.5k
  reg[1] = load_convolve_8tap_8x2_avx2(src + 8, src_stride, coeffs, flt);
1387
23.5k
}
Unexecuted instantiation: jnt_convolve_avx2.c:load_convolve_8tap_16x2_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:load_convolve_8tap_16x2_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:load_convolve_8tap_16x2_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:load_convolve_8tap_16x2_avx2
1388
1389
static inline void load_convolve_8tap_32_avx2(const uint8_t *const src,
1390
                                              const __m256i *coeffs,
1391
                                              const __m256i *filt,
1392
160k
                                              __m256i *data) {
1393
160k
  const __m256i reg_0 = _mm256_loadu_si256((__m256i *)src);
1394
160k
  const __m256i reg_8 = _mm256_loadu_si256((__m256i *)(src + 8));
1395
1396
160k
  data[0] = convolve_lowbd_x(reg_0, coeffs, filt);
1397
160k
  data[1] = convolve_lowbd_x(reg_8, coeffs, filt);
1398
160k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:load_convolve_8tap_32_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:load_convolve_8tap_32_avx2
Unexecuted instantiation: convolve_2d_avx2.c:load_convolve_8tap_32_avx2
convolve_avx2.c:load_convolve_8tap_32_avx2
Line
Count
Source
1392
160k
                                              __m256i *data) {
1393
160k
  const __m256i reg_0 = _mm256_loadu_si256((__m256i *)src);
1394
160k
  const __m256i reg_8 = _mm256_loadu_si256((__m256i *)(src + 8));
1395
1396
160k
  data[0] = convolve_lowbd_x(reg_0, coeffs, filt);
1397
160k
  data[1] = convolve_lowbd_x(reg_8, coeffs, filt);
1398
160k
}
Unexecuted instantiation: jnt_convolve_avx2.c:load_convolve_8tap_32_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:load_convolve_8tap_32_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:load_convolve_8tap_32_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:load_convolve_8tap_32_avx2
1399
1400
static inline void load_convolve_round_8tap_32_avx2(const uint8_t *const src,
1401
                                                    const __m256i *coeffs,
1402
                                                    const __m256i *filt,
1403
160k
                                                    uint8_t *const dst) {
1404
160k
  __m256i data[2];
1405
1406
160k
  load_convolve_8tap_32_avx2(src, coeffs, filt, data);
1407
160k
  round_pack_store_32_avx2(data, dst);
1408
160k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:load_convolve_round_8tap_32_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:load_convolve_round_8tap_32_avx2
Unexecuted instantiation: convolve_2d_avx2.c:load_convolve_round_8tap_32_avx2
convolve_avx2.c:load_convolve_round_8tap_32_avx2
Line
Count
Source
1403
160k
                                                    uint8_t *const dst) {
1404
160k
  __m256i data[2];
1405
1406
160k
  load_convolve_8tap_32_avx2(src, coeffs, filt, data);
1407
160k
  round_pack_store_32_avx2(data, dst);
1408
160k
}
Unexecuted instantiation: jnt_convolve_avx2.c:load_convolve_round_8tap_32_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:load_convolve_round_8tap_32_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:load_convolve_round_8tap_32_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:load_convolve_round_8tap_32_avx2
1409
1410
static inline void load_convolve_6tap_32_avx2(const uint8_t *const src,
1411
                                              const __m256i *coeffs,
1412
                                              const __m256i *filt,
1413
1.46M
                                              __m256i *data) {
1414
1.46M
  const __m256i reg0 = _mm256_loadu_si256((__m256i *)src);
1415
1.46M
  const __m256i reg1 = _mm256_loadu_si256((__m256i *)(src + 8));
1416
1417
1.46M
  data[0] = convolve_lowbd_x_6tap(reg0, coeffs, filt);
1418
1.46M
  data[1] = convolve_lowbd_x_6tap(reg1, coeffs, filt);
1419
1.46M
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:load_convolve_6tap_32_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:load_convolve_6tap_32_avx2
Unexecuted instantiation: convolve_2d_avx2.c:load_convolve_6tap_32_avx2
convolve_avx2.c:load_convolve_6tap_32_avx2
Line
Count
Source
1413
1.46M
                                              __m256i *data) {
1414
1.46M
  const __m256i reg0 = _mm256_loadu_si256((__m256i *)src);
1415
1.46M
  const __m256i reg1 = _mm256_loadu_si256((__m256i *)(src + 8));
1416
1417
1.46M
  data[0] = convolve_lowbd_x_6tap(reg0, coeffs, filt);
1418
1.46M
  data[1] = convolve_lowbd_x_6tap(reg1, coeffs, filt);
1419
1.46M
}
Unexecuted instantiation: jnt_convolve_avx2.c:load_convolve_6tap_32_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:load_convolve_6tap_32_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:load_convolve_6tap_32_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:load_convolve_6tap_32_avx2
1420
1421
static inline void convolve_sr_store_6tap_32_avx2(const uint8_t *const src,
1422
                                                  const __m256i *coeffs,
1423
                                                  const __m256i *filt,
1424
1.46M
                                                  uint8_t *const dst) {
1425
1.46M
  __m256i data[2];
1426
1427
1.46M
  load_convolve_6tap_32_avx2(src, coeffs, filt, data);
1428
1.46M
  round_pack_store_32_avx2(data, dst);
1429
1.46M
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_sr_store_6tap_32_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_sr_store_6tap_32_avx2
Unexecuted instantiation: convolve_2d_avx2.c:convolve_sr_store_6tap_32_avx2
convolve_avx2.c:convolve_sr_store_6tap_32_avx2
Line
Count
Source
1424
1.46M
                                                  uint8_t *const dst) {
1425
1.46M
  __m256i data[2];
1426
1427
1.46M
  load_convolve_6tap_32_avx2(src, coeffs, filt, data);
1428
1.46M
  round_pack_store_32_avx2(data, dst);
1429
1.46M
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_sr_store_6tap_32_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_sr_store_6tap_32_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_sr_store_6tap_32_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_sr_store_6tap_32_avx2
1430
1431
static inline __m256i load_convolve_6tap_8x2_avx2(const uint8_t *const src,
1432
                                                  const ptrdiff_t stride,
1433
                                                  const __m256i *coeffs,
1434
867k
                                                  const __m256i *filt) {
1435
867k
  const __m256i data = loadu_x_8bit_16x2_avx2(src, stride);
1436
867k
  return convolve_lowbd_x_6tap(data, coeffs, filt);
1437
867k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:load_convolve_6tap_8x2_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:load_convolve_6tap_8x2_avx2
Unexecuted instantiation: convolve_2d_avx2.c:load_convolve_6tap_8x2_avx2
convolve_avx2.c:load_convolve_6tap_8x2_avx2
Line
Count
Source
1434
867k
                                                  const __m256i *filt) {
1435
867k
  const __m256i data = loadu_x_8bit_16x2_avx2(src, stride);
1436
867k
  return convolve_lowbd_x_6tap(data, coeffs, filt);
1437
867k
}
Unexecuted instantiation: jnt_convolve_avx2.c:load_convolve_6tap_8x2_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:load_convolve_6tap_8x2_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:load_convolve_6tap_8x2_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:load_convolve_6tap_8x2_avx2
1438
1439
static inline void load_convolve_6tap_16x2_avx2(const uint8_t *const src,
1440
                                                const int32_t src_stride,
1441
                                                const __m256i *coeffs,
1442
                                                const __m256i *filt,
1443
433k
                                                __m256i *data) {
1444
433k
  data[0] = load_convolve_6tap_8x2_avx2(src + 0, src_stride, coeffs, filt);
1445
433k
  data[1] = load_convolve_6tap_8x2_avx2(src + 8, src_stride, coeffs, filt);
1446
433k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:load_convolve_6tap_16x2_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:load_convolve_6tap_16x2_avx2
Unexecuted instantiation: convolve_2d_avx2.c:load_convolve_6tap_16x2_avx2
convolve_avx2.c:load_convolve_6tap_16x2_avx2
Line
Count
Source
1443
433k
                                                __m256i *data) {
1444
433k
  data[0] = load_convolve_6tap_8x2_avx2(src + 0, src_stride, coeffs, filt);
1445
433k
  data[1] = load_convolve_6tap_8x2_avx2(src + 8, src_stride, coeffs, filt);
1446
433k
}
Unexecuted instantiation: jnt_convolve_avx2.c:load_convolve_6tap_16x2_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:load_convolve_6tap_16x2_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:load_convolve_6tap_16x2_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:load_convolve_6tap_16x2_avx2
1447
1448
632k
static inline __m128i round_sr_y_ssse3(const __m128i data) {
1449
632k
  const __m128i value = _mm_set1_epi16(32);
1450
632k
  const __m128i reg = _mm_add_epi16(data, value);
1451
632k
  return _mm_srai_epi16(reg, FILTER_BITS - 1);
1452
632k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:round_sr_y_ssse3
Unexecuted instantiation: highbd_convolve_avx2.c:round_sr_y_ssse3
Unexecuted instantiation: convolve_2d_avx2.c:round_sr_y_ssse3
convolve_avx2.c:round_sr_y_ssse3
Line
Count
Source
1448
632k
static inline __m128i round_sr_y_ssse3(const __m128i data) {
1449
632k
  const __m128i value = _mm_set1_epi16(32);
1450
632k
  const __m128i reg = _mm_add_epi16(data, value);
1451
632k
  return _mm_srai_epi16(reg, FILTER_BITS - 1);
1452
632k
}
Unexecuted instantiation: jnt_convolve_avx2.c:round_sr_y_ssse3
Unexecuted instantiation: wiener_convolve_avx2.c:round_sr_y_ssse3
Unexecuted instantiation: highbd_convolve_2d_avx2.c:round_sr_y_ssse3
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:round_sr_y_ssse3
1453
1454
5.08M
static inline __m256i round_sr_y_avx2(const __m256i data) {
1455
5.08M
  const __m256i value = _mm256_set1_epi16(32);
1456
5.08M
  const __m256i reg = _mm256_add_epi16(data, value);
1457
5.08M
  return _mm256_srai_epi16(reg, FILTER_BITS - 1);
1458
5.08M
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:round_sr_y_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:round_sr_y_avx2
Unexecuted instantiation: convolve_2d_avx2.c:round_sr_y_avx2
convolve_avx2.c:round_sr_y_avx2
Line
Count
Source
1454
5.08M
static inline __m256i round_sr_y_avx2(const __m256i data) {
1455
5.08M
  const __m256i value = _mm256_set1_epi16(32);
1456
5.08M
  const __m256i reg = _mm256_add_epi16(data, value);
1457
5.08M
  return _mm256_srai_epi16(reg, FILTER_BITS - 1);
1458
5.08M
}
Unexecuted instantiation: jnt_convolve_avx2.c:round_sr_y_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:round_sr_y_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:round_sr_y_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:round_sr_y_avx2
1459
1460
static inline void round_pack_store_y_8x2_avx2(const __m256i res,
1461
                                               uint8_t *const dst,
1462
625k
                                               const ptrdiff_t dst_stride) {
1463
625k
  __m256i r;
1464
1465
625k
  r = round_sr_y_avx2(res);
1466
625k
  pack_store_8x2_avx2(r, dst, dst_stride);
1467
625k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:round_pack_store_y_8x2_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:round_pack_store_y_8x2_avx2
Unexecuted instantiation: convolve_2d_avx2.c:round_pack_store_y_8x2_avx2
convolve_avx2.c:round_pack_store_y_8x2_avx2
Line
Count
Source
1462
625k
                                               const ptrdiff_t dst_stride) {
1463
625k
  __m256i r;
1464
1465
625k
  r = round_sr_y_avx2(res);
1466
625k
  pack_store_8x2_avx2(r, dst, dst_stride);
1467
625k
}
Unexecuted instantiation: jnt_convolve_avx2.c:round_pack_store_y_8x2_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:round_pack_store_y_8x2_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:round_pack_store_y_8x2_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:round_pack_store_y_8x2_avx2
1468
1469
static inline void round_pack_store_y_16x2_avx2(const __m256i res[2],
1470
                                                uint8_t *const dst,
1471
1.88M
                                                const ptrdiff_t dst_stride) {
1472
1.88M
  __m256i r[2];
1473
1474
1.88M
  r[0] = round_sr_y_avx2(res[0]);
1475
1.88M
  r[1] = round_sr_y_avx2(res[1]);
1476
1.88M
  pack_store_16x2_avx2(r[0], r[1], dst, dst_stride);
1477
1.88M
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:round_pack_store_y_16x2_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:round_pack_store_y_16x2_avx2
Unexecuted instantiation: convolve_2d_avx2.c:round_pack_store_y_16x2_avx2
convolve_avx2.c:round_pack_store_y_16x2_avx2
Line
Count
Source
1471
1.88M
                                                const ptrdiff_t dst_stride) {
1472
1.88M
  __m256i r[2];
1473
1474
1.88M
  r[0] = round_sr_y_avx2(res[0]);
1475
1.88M
  r[1] = round_sr_y_avx2(res[1]);
1476
1.88M
  pack_store_16x2_avx2(r[0], r[1], dst, dst_stride);
1477
1.88M
}
Unexecuted instantiation: jnt_convolve_avx2.c:round_pack_store_y_16x2_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:round_pack_store_y_16x2_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:round_pack_store_y_16x2_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:round_pack_store_y_16x2_avx2
1478
1479
static inline void round_pack_store_y_32_avx2(const __m256i res[2],
1480
342k
                                              uint8_t *const dst) {
1481
342k
  __m256i r[2];
1482
1483
342k
  r[0] = round_sr_y_avx2(res[0]);
1484
342k
  r[1] = round_sr_y_avx2(res[1]);
1485
342k
  pack_store_32_avx2(r[0], r[1], dst);
1486
342k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:round_pack_store_y_32_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:round_pack_store_y_32_avx2
Unexecuted instantiation: convolve_2d_avx2.c:round_pack_store_y_32_avx2
convolve_avx2.c:round_pack_store_y_32_avx2
Line
Count
Source
1480
342k
                                              uint8_t *const dst) {
1481
342k
  __m256i r[2];
1482
1483
342k
  r[0] = round_sr_y_avx2(res[0]);
1484
342k
  r[1] = round_sr_y_avx2(res[1]);
1485
342k
  pack_store_32_avx2(r[0], r[1], dst);
1486
342k
}
Unexecuted instantiation: jnt_convolve_avx2.c:round_pack_store_y_32_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:round_pack_store_y_32_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:round_pack_store_y_32_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:round_pack_store_y_32_avx2
1487
1488
static inline void round_pack_store_y_32x2_avx2(const __m256i res[4],
1489
                                                uint8_t *const dst,
1490
171k
                                                const ptrdiff_t dst_stride) {
1491
171k
  round_pack_store_y_32_avx2(res, dst);
1492
171k
  round_pack_store_y_32_avx2(res + 2, dst + dst_stride);
1493
171k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:round_pack_store_y_32x2_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:round_pack_store_y_32x2_avx2
Unexecuted instantiation: convolve_2d_avx2.c:round_pack_store_y_32x2_avx2
convolve_avx2.c:round_pack_store_y_32x2_avx2
Line
Count
Source
1490
171k
                                                const ptrdiff_t dst_stride) {
1491
171k
  round_pack_store_y_32_avx2(res, dst);
1492
171k
  round_pack_store_y_32_avx2(res + 2, dst + dst_stride);
1493
171k
}
Unexecuted instantiation: jnt_convolve_avx2.c:round_pack_store_y_32x2_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:round_pack_store_y_32x2_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:round_pack_store_y_32x2_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:round_pack_store_y_32x2_avx2
1494
1495
static inline void convolve_y_2tap_2x2_ssse3(const uint8_t *const data,
1496
                                             const ptrdiff_t stride,
1497
                                             const __m128i *coeffs,
1498
3.21k
                                             __m128i d[2], __m128i *res) {
1499
3.21k
  d[1] = _mm_cvtsi32_si128(loadu_int16(data + 1 * stride));
1500
3.21k
  const __m128i src_01a = _mm_unpacklo_epi16(d[0], d[1]);
1501
3.21k
  d[0] = _mm_cvtsi32_si128(loadu_int16(data + 2 * stride));
1502
3.21k
  const __m128i src_12a = _mm_unpacklo_epi16(d[1], d[0]);
1503
1504
3.21k
  const __m128i s = _mm_unpacklo_epi8(src_01a, src_12a);
1505
1506
3.21k
  *res = _mm_maddubs_epi16(s, coeffs[0]);
1507
3.21k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_2tap_2x2_ssse3
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_2tap_2x2_ssse3
Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_2tap_2x2_ssse3
convolve_avx2.c:convolve_y_2tap_2x2_ssse3
Line
Count
Source
1498
3.21k
                                             __m128i d[2], __m128i *res) {
1499
3.21k
  d[1] = _mm_cvtsi32_si128(loadu_int16(data + 1 * stride));
1500
3.21k
  const __m128i src_01a = _mm_unpacklo_epi16(d[0], d[1]);
1501
3.21k
  d[0] = _mm_cvtsi32_si128(loadu_int16(data + 2 * stride));
1502
3.21k
  const __m128i src_12a = _mm_unpacklo_epi16(d[1], d[0]);
1503
1504
3.21k
  const __m128i s = _mm_unpacklo_epi8(src_01a, src_12a);
1505
1506
3.21k
  *res = _mm_maddubs_epi16(s, coeffs[0]);
1507
3.21k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_2tap_2x2_ssse3
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_2tap_2x2_ssse3
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_2tap_2x2_ssse3
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_2tap_2x2_ssse3
1508
1509
static inline void convolve_y_4tap_2x2_ssse3(const uint8_t *const data,
1510
                                             const ptrdiff_t stride,
1511
                                             const __m128i coeffs[2],
1512
                                             __m128i d[4], __m128i s[2],
1513
37.0k
                                             __m128i *res) {
1514
37.0k
  d[3] = _mm_cvtsi32_si128(loadu_int16(data + 3 * stride));
1515
37.0k
  const __m128i src_23a = _mm_unpacklo_epi16(d[2], d[3]);
1516
37.0k
  d[2] = _mm_cvtsi32_si128(loadu_int16(data + 4 * stride));
1517
37.0k
  const __m128i src_34a = _mm_unpacklo_epi16(d[3], d[2]);
1518
1519
37.0k
  s[1] = _mm_unpacklo_epi8(src_23a, src_34a);
1520
1521
37.0k
  *res = convolve_lowbd_4tap_ssse3(s, coeffs);
1522
37.0k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_4tap_2x2_ssse3
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_4tap_2x2_ssse3
Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_4tap_2x2_ssse3
convolve_avx2.c:convolve_y_4tap_2x2_ssse3
Line
Count
Source
1513
37.0k
                                             __m128i *res) {
1514
37.0k
  d[3] = _mm_cvtsi32_si128(loadu_int16(data + 3 * stride));
1515
37.0k
  const __m128i src_23a = _mm_unpacklo_epi16(d[2], d[3]);
1516
37.0k
  d[2] = _mm_cvtsi32_si128(loadu_int16(data + 4 * stride));
1517
37.0k
  const __m128i src_34a = _mm_unpacklo_epi16(d[3], d[2]);
1518
1519
37.0k
  s[1] = _mm_unpacklo_epi8(src_23a, src_34a);
1520
1521
37.0k
  *res = convolve_lowbd_4tap_ssse3(s, coeffs);
1522
37.0k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_4tap_2x2_ssse3
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_4tap_2x2_ssse3
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_4tap_2x2_ssse3
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_4tap_2x2_ssse3
1523
1524
static inline void convolve_y_6tap_2x2_ssse3(const uint8_t *const data,
1525
                                             const ptrdiff_t stride,
1526
                                             const __m128i coeffs[3],
1527
                                             __m128i d[6], __m128i s[3],
1528
49.3k
                                             __m128i *res) {
1529
49.3k
  d[5] = _mm_cvtsi32_si128(loadu_int16(data + 5 * stride));
1530
49.3k
  const __m128i src_45a = _mm_unpacklo_epi16(d[4], d[5]);
1531
49.3k
  d[4] = _mm_cvtsi32_si128(loadu_int16(data + 6 * stride));
1532
49.3k
  const __m128i src_56a = _mm_unpacklo_epi16(d[5], d[4]);
1533
1534
49.3k
  s[2] = _mm_unpacklo_epi8(src_45a, src_56a);
1535
1536
49.3k
  *res = convolve_lowbd_6tap_ssse3(s, coeffs);
1537
49.3k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_6tap_2x2_ssse3
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_6tap_2x2_ssse3
Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_6tap_2x2_ssse3
convolve_avx2.c:convolve_y_6tap_2x2_ssse3
Line
Count
Source
1528
49.3k
                                             __m128i *res) {
1529
49.3k
  d[5] = _mm_cvtsi32_si128(loadu_int16(data + 5 * stride));
1530
49.3k
  const __m128i src_45a = _mm_unpacklo_epi16(d[4], d[5]);
1531
49.3k
  d[4] = _mm_cvtsi32_si128(loadu_int16(data + 6 * stride));
1532
49.3k
  const __m128i src_56a = _mm_unpacklo_epi16(d[5], d[4]);
1533
1534
49.3k
  s[2] = _mm_unpacklo_epi8(src_45a, src_56a);
1535
1536
49.3k
  *res = convolve_lowbd_6tap_ssse3(s, coeffs);
1537
49.3k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_6tap_2x2_ssse3
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_6tap_2x2_ssse3
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_6tap_2x2_ssse3
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_6tap_2x2_ssse3
1538
1539
static inline void convolve_y_8tap_2x2_ssse3(const uint8_t *const data,
1540
                                             const ptrdiff_t stride,
1541
                                             const __m128i coeffs[4],
1542
                                             __m128i d[8], __m128i s[4],
1543
5.12k
                                             __m128i *res) {
1544
5.12k
  d[7] = _mm_cvtsi32_si128(loadu_int16(data + 7 * stride));
1545
5.12k
  const __m128i src_67a = _mm_unpacklo_epi16(d[6], d[7]);
1546
5.12k
  d[6] = _mm_cvtsi32_si128(loadu_int16(data + 8 * stride));
1547
5.12k
  const __m128i src_78a = _mm_unpacklo_epi16(d[7], d[6]);
1548
1549
5.12k
  s[3] = _mm_unpacklo_epi8(src_67a, src_78a);
1550
1551
5.12k
  *res = convolve_lowbd_ssse3(s, coeffs);
1552
5.12k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_8tap_2x2_ssse3
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_8tap_2x2_ssse3
Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_8tap_2x2_ssse3
convolve_avx2.c:convolve_y_8tap_2x2_ssse3
Line
Count
Source
1543
5.12k
                                             __m128i *res) {
1544
5.12k
  d[7] = _mm_cvtsi32_si128(loadu_int16(data + 7 * stride));
1545
5.12k
  const __m128i src_67a = _mm_unpacklo_epi16(d[6], d[7]);
1546
5.12k
  d[6] = _mm_cvtsi32_si128(loadu_int16(data + 8 * stride));
1547
5.12k
  const __m128i src_78a = _mm_unpacklo_epi16(d[7], d[6]);
1548
1549
5.12k
  s[3] = _mm_unpacklo_epi8(src_67a, src_78a);
1550
1551
5.12k
  *res = convolve_lowbd_ssse3(s, coeffs);
1552
5.12k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_8tap_2x2_ssse3
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_8tap_2x2_ssse3
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_8tap_2x2_ssse3
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_8tap_2x2_ssse3
1553
1554
// Vertical 2-tap step that fetches the samples at `data + 1 * stride` and
// `data + 2 * stride` through loadu_int32() (presumably 4 bytes per row; the
// helper is not visible here) and pairs them with the row cached in d[0].
//   data   - base pointer; only offsets of 1 and 2 strides are read here.
//   stride - step, in bytes, between the two loads.
//   coeffs - only coeffs[0] is used, as the signed-byte operand of
//            _mm_maddubs_epi16().
//   d      - row cache: d[0] is read on entry; on return d[1] holds the load
//            at 1 stride and d[0] the load at 2 strides.
//   res    - receives the 16-bit sums of adjacent byte products.
static inline void convolve_y_2tap_4x2_ssse3(const uint8_t *const data,
1555
                                             const ptrdiff_t stride,
1556
                                             const __m128i *coeffs,
1557
13.8k
                                             __m128i d[2], __m128i *res) {
1558
13.8k
  d[1] = _mm_cvtsi32_si128(loadu_int32(data + 1 * stride));
1559
13.8k
  const __m128i src_01a = _mm_unpacklo_epi32(d[0], d[1]);
1560
13.8k
  // The old d[0] was consumed just above, so its slot is reused for the next
  // load.
  d[0] = _mm_cvtsi32_si128(loadu_int32(data + 2 * stride));
1561
13.8k
  const __m128i src_12a = _mm_unpacklo_epi32(d[1], d[0]);
1562
1563
13.8k
  // Byte-interleave so each sample sits next to the one directly below it.
  const __m128i s = _mm_unpacklo_epi8(src_01a, src_12a);
1564
1565
13.8k
  *res = _mm_maddubs_epi16(s, coeffs[0]);
1566
13.8k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_2tap_4x2_ssse3
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_2tap_4x2_ssse3
Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_2tap_4x2_ssse3
convolve_avx2.c:convolve_y_2tap_4x2_ssse3
Line
Count
Source
1557
13.8k
                                             __m128i d[2], __m128i *res) {
1558
13.8k
  d[1] = _mm_cvtsi32_si128(loadu_int32(data + 1 * stride));
1559
13.8k
  const __m128i src_01a = _mm_unpacklo_epi32(d[0], d[1]);
1560
13.8k
  d[0] = _mm_cvtsi32_si128(loadu_int32(data + 2 * stride));
1561
13.8k
  const __m128i src_12a = _mm_unpacklo_epi32(d[1], d[0]);
1562
1563
13.8k
  const __m128i s = _mm_unpacklo_epi8(src_01a, src_12a);
1564
1565
13.8k
  *res = _mm_maddubs_epi16(s, coeffs[0]);
1566
13.8k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_2tap_4x2_ssse3
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_2tap_4x2_ssse3
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_2tap_4x2_ssse3
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_2tap_4x2_ssse3
1567
1568
// Vertical-filter step that fetches the samples at `data + 3 * stride` and
// `data + 4 * stride` through loadu_int32() (presumably 4 bytes per row; the
// helper is not visible here) and combines them with the row cached in d[2].
//   data   - base pointer; only offsets of 3 and 4 strides are read here.
//   stride - step, in bytes, between the two loads.
//   coeffs - forwarded unchanged to convolve_lowbd_4tap_ssse3().
//   d      - row cache: d[2] is read on entry; on return d[3] holds the load
//            at 3 strides and d[2] the load at 4 strides.
//   s      - only s[1] is written here; s[0] is presumably prepared by the
//            caller for convolve_lowbd_4tap_ssse3() -- TODO confirm.
//   res    - receives the value returned by convolve_lowbd_4tap_ssse3().
static inline void convolve_y_4tap_4x2_ssse3(const uint8_t *const data,
1569
                                             const ptrdiff_t stride,
1570
                                             const __m128i coeffs[2],
1571
                                             __m128i d[4], __m128i s[2],
1572
199k
                                             __m128i *res) {
1573
199k
  d[3] = _mm_cvtsi32_si128(loadu_int32(data + 3 * stride));
1574
199k
  const __m128i src_23a = _mm_unpacklo_epi32(d[2], d[3]);
1575
199k
  // The old d[2] was consumed just above, so its slot is reused for the next
  // load.
  d[2] = _mm_cvtsi32_si128(loadu_int32(data + 4 * stride));
1576
199k
  const __m128i src_34a = _mm_unpacklo_epi32(d[3], d[2]);
1577
1578
199k
  s[1] = _mm_unpacklo_epi8(src_23a, src_34a);
1579
1580
199k
  *res = convolve_lowbd_4tap_ssse3(s, coeffs);
1581
199k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_4tap_4x2_ssse3
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_4tap_4x2_ssse3
Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_4tap_4x2_ssse3
convolve_avx2.c:convolve_y_4tap_4x2_ssse3
Line
Count
Source
1572
199k
                                             __m128i *res) {
1573
199k
  d[3] = _mm_cvtsi32_si128(loadu_int32(data + 3 * stride));
1574
199k
  const __m128i src_23a = _mm_unpacklo_epi32(d[2], d[3]);
1575
199k
  d[2] = _mm_cvtsi32_si128(loadu_int32(data + 4 * stride));
1576
199k
  const __m128i src_34a = _mm_unpacklo_epi32(d[3], d[2]);
1577
1578
199k
  s[1] = _mm_unpacklo_epi8(src_23a, src_34a);
1579
1580
199k
  *res = convolve_lowbd_4tap_ssse3(s, coeffs);
1581
199k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_4tap_4x2_ssse3
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_4tap_4x2_ssse3
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_4tap_4x2_ssse3
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_4tap_4x2_ssse3
1582
1583
// Vertical-filter step that fetches the samples at `data + 5 * stride` and
// `data + 6 * stride` through loadu_int32() (presumably 4 bytes per row; the
// helper is not visible here) and combines them with the row cached in d[4].
//   data   - base pointer; only offsets of 5 and 6 strides are read here.
//   stride - step, in bytes, between the two loads.
//   coeffs - forwarded unchanged to convolve_lowbd_6tap_ssse3().
//   d      - row cache: d[4] is read on entry; on return d[5] holds the load
//            at 5 strides and d[4] the load at 6 strides.
//   s      - only s[2] is written here; s[0..1] are presumably prepared by
//            the caller for convolve_lowbd_6tap_ssse3() -- TODO confirm.
//   res    - receives the value returned by convolve_lowbd_6tap_ssse3().
static inline void convolve_y_6tap_4x2_ssse3(const uint8_t *const data,
1584
                                             const ptrdiff_t stride,
1585
                                             const __m128i coeffs[3],
1586
                                             __m128i d[6], __m128i s[3],
1587
299k
                                             __m128i *res) {
1588
299k
  d[5] = _mm_cvtsi32_si128(loadu_int32(data + 5 * stride));
1589
299k
  const __m128i src_45a = _mm_unpacklo_epi32(d[4], d[5]);
1590
299k
  // The old d[4] was consumed just above, so its slot is reused for the next
  // load.
  d[4] = _mm_cvtsi32_si128(loadu_int32(data + 6 * stride));
1591
299k
  const __m128i src_56a = _mm_unpacklo_epi32(d[5], d[4]);
1592
1593
299k
  s[2] = _mm_unpacklo_epi8(src_45a, src_56a);
1594
1595
299k
  *res = convolve_lowbd_6tap_ssse3(s, coeffs);
1596
299k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_6tap_4x2_ssse3
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_6tap_4x2_ssse3
Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_6tap_4x2_ssse3
convolve_avx2.c:convolve_y_6tap_4x2_ssse3
Line
Count
Source
1587
299k
                                             __m128i *res) {
1588
299k
  d[5] = _mm_cvtsi32_si128(loadu_int32(data + 5 * stride));
1589
299k
  const __m128i src_45a = _mm_unpacklo_epi32(d[4], d[5]);
1590
299k
  d[4] = _mm_cvtsi32_si128(loadu_int32(data + 6 * stride));
1591
299k
  const __m128i src_56a = _mm_unpacklo_epi32(d[5], d[4]);
1592
1593
299k
  s[2] = _mm_unpacklo_epi8(src_45a, src_56a);
1594
1595
299k
  *res = convolve_lowbd_6tap_ssse3(s, coeffs);
1596
299k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_6tap_4x2_ssse3
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_6tap_4x2_ssse3
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_6tap_4x2_ssse3
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_6tap_4x2_ssse3
1597
1598
// Vertical-filter step that fetches the samples at `data + 7 * stride` and
// `data + 8 * stride` through loadu_int32() (presumably 4 bytes per row; the
// helper is not visible here) and combines them with the row cached in d[6].
//   data   - base pointer; only offsets of 7 and 8 strides are read here.
//   stride - step, in bytes, between the two loads.
//   coeffs - forwarded unchanged to convolve_lowbd_ssse3().
//   d      - row cache: d[6] is read on entry; on return d[7] holds the load
//            at 7 strides and d[6] the load at 8 strides.
//   s      - only s[3] is written here; s[0..2] are presumably prepared by
//            the caller for convolve_lowbd_ssse3() -- TODO confirm.
//   res    - res[0] receives the value returned by convolve_lowbd_ssse3().
static inline void convolve_y_8tap_4x2_ssse3(const uint8_t *const data,
1599
                                             const ptrdiff_t stride,
1600
                                             const __m128i coeffs[4],
1601
                                             __m128i d[8], __m128i s[4],
1602
25.4k
                                             __m128i *res) {
1603
25.4k
  d[7] = _mm_cvtsi32_si128(loadu_int32(data + 7 * stride));
1604
25.4k
  const __m128i src_67a = _mm_unpacklo_epi32(d[6], d[7]);
1605
25.4k
  // The old d[6] was consumed just above, so its slot is reused for the next
  // load.
  d[6] = _mm_cvtsi32_si128(loadu_int32(data + 8 * stride));
1606
25.4k
  const __m128i src_78a = _mm_unpacklo_epi32(d[7], d[6]);
1607
1608
25.4k
  s[3] = _mm_unpacklo_epi8(src_67a, src_78a);
1609
1610
25.4k
  res[0] = convolve_lowbd_ssse3(s, coeffs);
1611
25.4k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_8tap_4x2_ssse3
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_8tap_4x2_ssse3
Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_8tap_4x2_ssse3
convolve_avx2.c:convolve_y_8tap_4x2_ssse3
Line
Count
Source
1602
25.4k
                                             __m128i *res) {
1603
25.4k
  d[7] = _mm_cvtsi32_si128(loadu_int32(data + 7 * stride));
1604
25.4k
  const __m128i src_67a = _mm_unpacklo_epi32(d[6], d[7]);
1605
25.4k
  d[6] = _mm_cvtsi32_si128(loadu_int32(data + 8 * stride));
1606
25.4k
  const __m128i src_78a = _mm_unpacklo_epi32(d[7], d[6]);
1607
1608
25.4k
  s[3] = _mm_unpacklo_epi8(src_67a, src_78a);
1609
1610
25.4k
  res[0] = convolve_lowbd_ssse3(s, coeffs);
1611
25.4k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_8tap_4x2_ssse3
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_8tap_4x2_ssse3
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_8tap_4x2_ssse3
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_8tap_4x2_ssse3
1612
1613
// Vertical 2-tap step that loads the rows at `data + 1 * stride` and
// `data + 2 * stride` and pairs them with the row cached in d[0].
// Each load is a full unaligned 128-bit load, although only the low 8 bytes
// of each 128-bit lane survive _mm256_unpacklo_epi8(); the source is
// therefore assumed to be readable for 16 bytes per row -- TODO confirm
// against callers.
//   coeffs - only coeffs[0] is used, as the signed-byte operand of
//            _mm256_maddubs_epi16().
//   d      - row cache: d[0] is read on entry; on return d[1] holds the row
//            at 1 stride and d[0] the row at 2 strides.
//   res    - receives the 16-bit sums of adjacent byte products; the low
//            128-bit lane comes from the (old d[0], d[1]) pair and the high
//            lane from the (d[1], new d[0]) pair.
static inline void convolve_y_2tap_8x2_avx2(const uint8_t *const data,
1614
                                            const ptrdiff_t stride,
1615
                                            const __m256i *coeffs, __m128i d[2],
1616
12.4k
                                            __m256i *res) {
1617
12.4k
  d[1] = _mm_loadu_si128((__m128i *)(data + 1 * stride));
1618
12.4k
  const __m256i src_01a = _mm256_setr_m128i(d[0], d[1]);
1619
12.4k
  // The old d[0] was consumed just above, so its slot is reused for the next
  // row.
  d[0] = _mm_loadu_si128((__m128i *)(data + 2 * stride));
1620
12.4k
  const __m256i src_12a = _mm256_setr_m128i(d[1], d[0]);
1621
1622
12.4k
  const __m256i s = _mm256_unpacklo_epi8(src_01a, src_12a);
1623
1624
12.4k
  *res = _mm256_maddubs_epi16(s, coeffs[0]);
1625
12.4k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_2tap_8x2_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_2tap_8x2_avx2
Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_2tap_8x2_avx2
convolve_avx2.c:convolve_y_2tap_8x2_avx2
Line
Count
Source
1616
12.4k
                                            __m256i *res) {
1617
12.4k
  d[1] = _mm_loadu_si128((__m128i *)(data + 1 * stride));
1618
12.4k
  const __m256i src_01a = _mm256_setr_m128i(d[0], d[1]);
1619
12.4k
  d[0] = _mm_loadu_si128((__m128i *)(data + 2 * stride));
1620
12.4k
  const __m256i src_12a = _mm256_setr_m128i(d[1], d[0]);
1621
1622
12.4k
  const __m256i s = _mm256_unpacklo_epi8(src_01a, src_12a);
1623
1624
12.4k
  *res = _mm256_maddubs_epi16(s, coeffs[0]);
1625
12.4k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_2tap_8x2_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_2tap_8x2_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_2tap_8x2_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_2tap_8x2_avx2
1626
1627
// Vertical-filter step that loads the rows at `data + 3 * stride` and
// `data + 4 * stride` (unaligned 128-bit loads) and combines them with the
// row cached in d[2]. Only the low 8 bytes of each 128-bit lane survive
// _mm256_unpacklo_epi8().
//   coeffs - forwarded unchanged to convolve_lowbd_4tap().
//   d      - row cache: d[2] is read on entry; on return d[3] holds the row
//            at 3 strides and d[2] the row at 4 strides.
//   s      - only s[1] is written here; s[0] is presumably prepared by the
//            caller for convolve_lowbd_4tap() -- TODO confirm.
//   res    - receives the value returned by convolve_lowbd_4tap(s, coeffs).
static inline void convolve_y_4tap_8x2_avx2(const uint8_t *const data,
1628
                                            const ptrdiff_t stride,
1629
                                            const __m256i coeffs[2],
1630
                                            __m128i d[4], __m256i s[2],
1631
180k
                                            __m256i *res) {
1632
180k
  d[3] = _mm_loadu_si128((__m128i *)(data + 3 * stride));
1633
180k
  // Low lane = old d[2], high lane = d[3].
  const __m256i src_23a = _mm256_setr_m128i(d[2], d[3]);
1634
180k
  // The old d[2] was consumed just above, so its slot is reused for the next
  // row.
  d[2] = _mm_loadu_si128((__m128i *)(data + 4 * stride));
1635
180k
  const __m256i src_34a = _mm256_setr_m128i(d[3], d[2]);
1636
1637
180k
  s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);
1638
1639
180k
  *res = convolve_lowbd_4tap(s, coeffs);
1640
180k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_4tap_8x2_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_4tap_8x2_avx2
Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_4tap_8x2_avx2
convolve_avx2.c:convolve_y_4tap_8x2_avx2
Line
Count
Source
1631
180k
                                            __m256i *res) {
1632
180k
  d[3] = _mm_loadu_si128((__m128i *)(data + 3 * stride));
1633
180k
  const __m256i src_23a = _mm256_setr_m128i(d[2], d[3]);
1634
180k
  d[2] = _mm_loadu_si128((__m128i *)(data + 4 * stride));
1635
180k
  const __m256i src_34a = _mm256_setr_m128i(d[3], d[2]);
1636
1637
180k
  s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);
1638
1639
180k
  *res = convolve_lowbd_4tap(s, coeffs);
1640
180k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_4tap_8x2_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_4tap_8x2_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_4tap_8x2_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_4tap_8x2_avx2
1641
1642
// Vertical-filter step that loads the rows at `data + 5 * stride` and
// `data + 6 * stride` (unaligned 128-bit loads) and combines them with the
// row cached in d[4]. Only the low 8 bytes of each 128-bit lane survive
// _mm256_unpacklo_epi8().
//   coeffs - forwarded unchanged to convolve_lowbd_6tap().
//   d      - row cache: d[4] is read on entry; on return d[5] holds the row
//            at 5 strides and d[4] the row at 6 strides.
//   s      - only s[2] is written here; s[0..1] are presumably prepared by
//            the caller for convolve_lowbd_6tap() -- TODO confirm.
//   res    - receives the value returned by convolve_lowbd_6tap(s, coeffs).
static inline void convolve_y_6tap_8x2_avx2(const uint8_t *const data,
1643
                                            const ptrdiff_t stride,
1644
                                            const __m256i coeffs[3],
1645
                                            __m128i d[6], __m256i s[3],
1646
407k
                                            __m256i *res) {
1647
407k
  d[5] = _mm_loadu_si128((__m128i *)(data + 5 * stride));
1648
407k
  // Low lane = old d[4], high lane = d[5].
  const __m256i src_45a = _mm256_setr_m128i(d[4], d[5]);
1649
407k
  // The old d[4] was consumed just above, so its slot is reused for the next
  // row.
  d[4] = _mm_loadu_si128((__m128i *)(data + 6 * stride));
1650
407k
  const __m256i src_56a = _mm256_setr_m128i(d[5], d[4]);
1651
1652
407k
  s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
1653
1654
407k
  *res = convolve_lowbd_6tap(s, coeffs);
1655
407k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_6tap_8x2_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_6tap_8x2_avx2
Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_6tap_8x2_avx2
convolve_avx2.c:convolve_y_6tap_8x2_avx2
Line
Count
Source
1646
407k
                                            __m256i *res) {
1647
407k
  d[5] = _mm_loadu_si128((__m128i *)(data + 5 * stride));
1648
407k
  const __m256i src_45a = _mm256_setr_m128i(d[4], d[5]);
1649
407k
  d[4] = _mm_loadu_si128((__m128i *)(data + 6 * stride));
1650
407k
  const __m256i src_56a = _mm256_setr_m128i(d[5], d[4]);
1651
1652
407k
  s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
1653
1654
407k
  *res = convolve_lowbd_6tap(s, coeffs);
1655
407k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_6tap_8x2_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_6tap_8x2_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_6tap_8x2_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_6tap_8x2_avx2
1656
1657
// Vertical-filter step that loads the rows at `data + 7 * stride` and
// `data + 8 * stride` (unaligned 128-bit loads) and combines them with the
// row cached in d[6]. Only the low 8 bytes of each 128-bit lane survive
// _mm256_unpacklo_epi8().
//   coeffs - forwarded unchanged to convolve_lowbd().
//   d      - row cache: d[6] is read on entry; on return d[7] holds the row
//            at 7 strides and d[6] the row at 8 strides.
//   s      - only s[3] is written here; s[0..2] are presumably prepared by
//            the caller for convolve_lowbd() -- TODO confirm.
//   res    - receives the value returned by convolve_lowbd(s, coeffs).
static inline void convolve_y_8tap_8x2_avx2(const uint8_t *const data,
1658
                                            const ptrdiff_t stride,
1659
                                            const __m256i coeffs[4],
1660
                                            __m128i d[8], __m256i s[4],
1661
24.1k
                                            __m256i *res) {
1662
24.1k
  d[7] = _mm_loadu_si128((__m128i *)(data + 7 * stride));
1663
24.1k
  // Low lane = old d[6], high lane = d[7].
  const __m256i src_67a = _mm256_setr_m128i(d[6], d[7]);
1664
24.1k
  // The old d[6] was consumed just above, so its slot is reused for the next
  // row.
  d[6] = _mm_loadu_si128((__m128i *)(data + 8 * stride));
1665
24.1k
  const __m256i src_78a = _mm256_setr_m128i(d[7], d[6]);
1666
1667
24.1k
  s[3] = _mm256_unpacklo_epi8(src_67a, src_78a);
1668
1669
24.1k
  *res = convolve_lowbd(s, coeffs);
1670
24.1k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_8tap_8x2_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_8tap_8x2_avx2
Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_8tap_8x2_avx2
convolve_avx2.c:convolve_y_8tap_8x2_avx2
Line
Count
Source
1661
24.1k
                                            __m256i *res) {
1662
24.1k
  d[7] = _mm_loadu_si128((__m128i *)(data + 7 * stride));
1663
24.1k
  const __m256i src_67a = _mm256_setr_m128i(d[6], d[7]);
1664
24.1k
  d[6] = _mm_loadu_si128((__m128i *)(data + 8 * stride));
1665
24.1k
  const __m256i src_78a = _mm256_setr_m128i(d[7], d[6]);
1666
1667
24.1k
  s[3] = _mm256_unpacklo_epi8(src_67a, src_78a);
1668
1669
24.1k
  *res = convolve_lowbd(s, coeffs);
1670
24.1k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_8tap_8x2_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_8tap_8x2_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_8tap_8x2_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_8tap_8x2_avx2
1671
1672
// Vertical 2-tap step that loads 16 bytes from each of the rows at
// `data + 1 * stride` and `data + 2 * stride` and pairs them with the row
// cached in d[0].
//   coeffs - only coeffs[0] is used, as the signed-byte operand of
//            _mm256_maddubs_epi16().
//   d      - row cache: d[0] is read on entry; on return d[1] holds the row
//            at 1 stride and d[0] the row at 2 strides.
//   res    - res[0] is built from the low 8 bytes of each 128-bit lane and
//            res[1] from the high 8 bytes. In both, the low lane comes from
//            the (old d[0], d[1]) pair and the high lane from the
//            (d[1], new d[0]) pair.
static inline void convolve_y_2tap_16x2_avx2(const uint8_t *const data,
1673
                                             const ptrdiff_t stride,
1674
                                             const __m256i *coeffs,
1675
12.8k
                                             __m128i d[2], __m256i res[2]) {
1676
12.8k
  d[1] = _mm_loadu_si128((__m128i *)(data + 1 * stride));
1677
12.8k
  const __m256i src_01a = _mm256_setr_m128i(d[0], d[1]);
1678
12.8k
  // The old d[0] was consumed just above, so its slot is reused for the next
  // row.
  d[0] = _mm_loadu_si128((__m128i *)(data + 2 * stride));
1679
12.8k
  const __m256i src_12a = _mm256_setr_m128i(d[1], d[0]);
1680
1681
12.8k
  const __m256i s0 = _mm256_unpacklo_epi8(src_01a, src_12a);
1682
12.8k
  const __m256i s1 = _mm256_unpackhi_epi8(src_01a, src_12a);
1683
1684
12.8k
  res[0] = _mm256_maddubs_epi16(s0, coeffs[0]);
1685
12.8k
  res[1] = _mm256_maddubs_epi16(s1, coeffs[0]);
1686
12.8k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_2tap_16x2_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_2tap_16x2_avx2
Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_2tap_16x2_avx2
convolve_avx2.c:convolve_y_2tap_16x2_avx2
Line
Count
Source
1675
12.8k
                                             __m128i d[2], __m256i res[2]) {
1676
12.8k
  d[1] = _mm_loadu_si128((__m128i *)(data + 1 * stride));
1677
12.8k
  const __m256i src_01a = _mm256_setr_m128i(d[0], d[1]);
1678
12.8k
  d[0] = _mm_loadu_si128((__m128i *)(data + 2 * stride));
1679
12.8k
  const __m256i src_12a = _mm256_setr_m128i(d[1], d[0]);
1680
1681
12.8k
  const __m256i s0 = _mm256_unpacklo_epi8(src_01a, src_12a);
1682
12.8k
  const __m256i s1 = _mm256_unpackhi_epi8(src_01a, src_12a);
1683
1684
12.8k
  res[0] = _mm256_maddubs_epi16(s0, coeffs[0]);
1685
12.8k
  res[1] = _mm256_maddubs_epi16(s1, coeffs[0]);
1686
12.8k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_2tap_16x2_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_2tap_16x2_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_2tap_16x2_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_2tap_16x2_avx2
1687
1688
// Vertical-filter step that loads 16 bytes from each of the rows at
// `data + 3 * stride` and `data + 4 * stride` and combines them with the row
// cached in d[2].
//   coeffs - forwarded unchanged to both convolve_lowbd_4tap() calls.
//   d      - row cache: d[2] is read on entry; on return d[3] holds the row
//            at 3 strides and d[2] the row at 4 strides.
//   s      - two groups of two vectors: s[0..1] feed res[0] and s[2..3] feed
//            res[1]. Only s[1] (low 8 bytes per lane) and s[3] (high 8 bytes
//            per lane) are written here; s[0] and s[2] are presumably
//            prepared by the caller -- TODO confirm.
//   res    - res[0] = convolve_lowbd_4tap(s, coeffs),
//            res[1] = convolve_lowbd_4tap(s + 2, coeffs).
static inline void convolve_y_4tap_16x2_avx2(const uint8_t *const data,
1689
                                             const ptrdiff_t stride,
1690
                                             const __m256i coeffs[2],
1691
                                             __m128i d[4], __m256i s[4],
1692
111k
                                             __m256i res[2]) {
1693
111k
  d[3] = _mm_loadu_si128((__m128i *)(data + 3 * stride));
1694
111k
  const __m256i src_23a = _mm256_setr_m128i(d[2], d[3]);
1695
111k
  // The old d[2] was consumed just above, so its slot is reused for the next
  // row.
  d[2] = _mm_loadu_si128((__m128i *)(data + 4 * stride));
1696
111k
  const __m256i src_34a = _mm256_setr_m128i(d[3], d[2]);
1697
1698
111k
  s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);
1699
111k
  s[3] = _mm256_unpackhi_epi8(src_23a, src_34a);
1700
1701
111k
  res[0] = convolve_lowbd_4tap(s, coeffs);
1702
111k
  res[1] = convolve_lowbd_4tap(s + 2, coeffs);
1703
111k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_4tap_16x2_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_4tap_16x2_avx2
Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_4tap_16x2_avx2
convolve_avx2.c:convolve_y_4tap_16x2_avx2
Line
Count
Source
1692
111k
                                             __m256i res[2]) {
1693
111k
  d[3] = _mm_loadu_si128((__m128i *)(data + 3 * stride));
1694
111k
  const __m256i src_23a = _mm256_setr_m128i(d[2], d[3]);
1695
111k
  d[2] = _mm_loadu_si128((__m128i *)(data + 4 * stride));
1696
111k
  const __m256i src_34a = _mm256_setr_m128i(d[3], d[2]);
1697
1698
111k
  s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);
1699
111k
  s[3] = _mm256_unpackhi_epi8(src_23a, src_34a);
1700
1701
111k
  res[0] = convolve_lowbd_4tap(s, coeffs);
1702
111k
  res[1] = convolve_lowbd_4tap(s + 2, coeffs);
1703
111k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_4tap_16x2_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_4tap_16x2_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_4tap_16x2_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_4tap_16x2_avx2
1704
1705
// Vertical-filter step that loads 16 bytes from each of the rows at
// `data + 5 * stride` and `data + 6 * stride` and combines them with the row
// cached in d[4].
//   coeffs - forwarded unchanged to both convolve_lowbd_6tap() calls.
//   d      - row cache: d[4] is read on entry; on return d[5] holds the row
//            at 5 strides and d[4] the row at 6 strides.
//   s      - two groups of three vectors: s[0..2] feed res[0] and s[3..5]
//            feed res[1]. Only s[2] (low 8 bytes per lane) and s[5] (high 8
//            bytes per lane) are written here; the other entries are
//            presumably prepared by the caller -- TODO confirm.
//   res    - res[0] = convolve_lowbd_6tap(s, coeffs),
//            res[1] = convolve_lowbd_6tap(s + 3, coeffs).
static inline void convolve_y_6tap_16x2_avx2(const uint8_t *const data,
1706
                                             const ptrdiff_t stride,
1707
                                             const __m256i coeffs[3],
1708
                                             __m128i d[6], __m256i s[6],
1709
1.68M
                                             __m256i res[2]) {
1710
1.68M
  d[5] = _mm_loadu_si128((__m128i *)(data + 5 * stride));
1711
1.68M
  const __m256i src_45a = _mm256_setr_m128i(d[4], d[5]);
1712
1.68M
  // The old d[4] was consumed just above, so its slot is reused for the next
  // row.
  d[4] = _mm_loadu_si128((__m128i *)(data + 6 * stride));
1713
1.68M
  const __m256i src_56a = _mm256_setr_m128i(d[5], d[4]);
1714
1715
1.68M
  s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
1716
1.68M
  s[5] = _mm256_unpackhi_epi8(src_45a, src_56a);
1717
1718
1.68M
  res[0] = convolve_lowbd_6tap(s, coeffs);
1719
1.68M
  res[1] = convolve_lowbd_6tap(s + 3, coeffs);
1720
1.68M
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_6tap_16x2_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_6tap_16x2_avx2
Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_6tap_16x2_avx2
convolve_avx2.c:convolve_y_6tap_16x2_avx2
Line
Count
Source
1709
1.68M
                                             __m256i res[2]) {
1710
1.68M
  d[5] = _mm_loadu_si128((__m128i *)(data + 5 * stride));
1711
1.68M
  const __m256i src_45a = _mm256_setr_m128i(d[4], d[5]);
1712
1.68M
  d[4] = _mm_loadu_si128((__m128i *)(data + 6 * stride));
1713
1.68M
  const __m256i src_56a = _mm256_setr_m128i(d[5], d[4]);
1714
1715
1.68M
  s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
1716
1.68M
  s[5] = _mm256_unpackhi_epi8(src_45a, src_56a);
1717
1718
1.68M
  res[0] = convolve_lowbd_6tap(s, coeffs);
1719
1.68M
  res[1] = convolve_lowbd_6tap(s + 3, coeffs);
1720
1.68M
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_6tap_16x2_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_6tap_16x2_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_6tap_16x2_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_6tap_16x2_avx2
1721
1722
// Vertical-filter step that loads 16 bytes from each of the rows at
// `data + 7 * stride` and `data + 8 * stride` and combines them with the row
// cached in d[6].
//   coeffs - forwarded unchanged to both convolve_lowbd() calls.
//   d      - row cache: d[6] is read on entry; on return d[7] holds the row
//            at 7 strides and d[6] the row at 8 strides.
//   s      - two groups of four vectors: s[0..3] feed res[0] and s[4..7]
//            feed res[1]. Only s[3] (low 8 bytes per lane) and s[7] (high 8
//            bytes per lane) are written here; the other entries are
//            presumably prepared by the caller -- TODO confirm.
//   res    - res[0] = convolve_lowbd(s, coeffs),
//            res[1] = convolve_lowbd(s + 4, coeffs).
static inline void convolve_y_8tap_16x2_avx2(const uint8_t *const data,
1723
                                             const ptrdiff_t stride,
1724
                                             const __m256i coeffs[4],
1725
                                             __m128i d[8], __m256i s[8],
1726
79.1k
                                             __m256i res[2]) {
1727
79.1k
  d[7] = _mm_loadu_si128((__m128i *)(data + 7 * stride));
1728
79.1k
  const __m256i src_67a = _mm256_setr_m128i(d[6], d[7]);
1729
79.1k
  // The old d[6] was consumed just above, so its slot is reused for the next
  // row.
  d[6] = _mm_loadu_si128((__m128i *)(data + 8 * stride));
1730
79.1k
  const __m256i src_78a = _mm256_setr_m128i(d[7], d[6]);
1731
1732
79.1k
  s[3] = _mm256_unpacklo_epi8(src_67a, src_78a);
1733
79.1k
  s[7] = _mm256_unpackhi_epi8(src_67a, src_78a);
1734
1735
79.1k
  res[0] = convolve_lowbd(s, coeffs);
1736
79.1k
  res[1] = convolve_lowbd(s + 4, coeffs);
1737
79.1k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_8tap_16x2_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_8tap_16x2_avx2
Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_8tap_16x2_avx2
convolve_avx2.c:convolve_y_8tap_16x2_avx2
Line
Count
Source
1726
79.1k
                                             __m256i res[2]) {
1727
79.1k
  d[7] = _mm_loadu_si128((__m128i *)(data + 7 * stride));
1728
79.1k
  const __m256i src_67a = _mm256_setr_m128i(d[6], d[7]);
1729
79.1k
  d[6] = _mm_loadu_si128((__m128i *)(data + 8 * stride));
1730
79.1k
  const __m256i src_78a = _mm256_setr_m128i(d[7], d[6]);
1731
1732
79.1k
  s[3] = _mm256_unpacklo_epi8(src_67a, src_78a);
1733
79.1k
  s[7] = _mm256_unpackhi_epi8(src_67a, src_78a);
1734
1735
79.1k
  res[0] = convolve_lowbd(s, coeffs);
1736
79.1k
  res[1] = convolve_lowbd(s + 4, coeffs);
1737
79.1k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_8tap_16x2_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_8tap_16x2_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_8tap_16x2_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_8tap_16x2_avx2
1738
1739
// Vertical 2-tap step that loads 32 bytes from each of the rows at
// `data + 1 * stride` and `data + 2 * stride` and pairs them with the row
// cached in d[0]. Unlike the narrower variants, each row fills a whole
// 256-bit register, so the two row pairs are kept in separate vectors.
//   coeffs - only coeffs[0] is used, as the signed-byte operand of
//            _mm256_maddubs_epi16().
//   d      - row cache: d[0] is read on entry; on return d[1] holds the row
//            at 1 stride and d[0] the row at 2 strides.
//   res    - res[0]/res[1] come from the (old d[0], d[1]) pair and
//            res[2]/res[3] from the (d[1], new d[0]) pair. Within each pair
//            the first entry uses the low 8 bytes of every 128-bit lane and
//            the second the high 8 bytes.
static inline void convolve_y_2tap_32x2_avx2(const uint8_t *const data,
1740
                                             const ptrdiff_t stride,
1741
                                             const __m256i *coeffs,
1742
38.7k
                                             __m256i d[2], __m256i res[4]) {
1743
38.7k
  d[1] = _mm256_loadu_si256((__m256i *)(data + 1 * stride));
1744
38.7k
  const __m256i s00 = _mm256_unpacklo_epi8(d[0], d[1]);
1745
38.7k
  const __m256i s01 = _mm256_unpackhi_epi8(d[0], d[1]);
1746
38.7k
  // The old d[0] was consumed by the two unpacks above, so its slot is reused
  // for the next row.
  d[0] = _mm256_loadu_si256((__m256i *)(data + 2 * stride));
1747
38.7k
  const __m256i s10 = _mm256_unpacklo_epi8(d[1], d[0]);
1748
38.7k
  const __m256i s11 = _mm256_unpackhi_epi8(d[1], d[0]);
1749
1750
38.7k
  res[0] = _mm256_maddubs_epi16(s00, coeffs[0]);
1751
38.7k
  res[1] = _mm256_maddubs_epi16(s01, coeffs[0]);
1752
38.7k
  res[2] = _mm256_maddubs_epi16(s10, coeffs[0]);
1753
38.7k
  res[3] = _mm256_maddubs_epi16(s11, coeffs[0]);
1754
38.7k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_2tap_32x2_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_2tap_32x2_avx2
Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_2tap_32x2_avx2
convolve_avx2.c:convolve_y_2tap_32x2_avx2
Line
Count
Source
1742
38.7k
                                             __m256i d[2], __m256i res[4]) {
1743
38.7k
  d[1] = _mm256_loadu_si256((__m256i *)(data + 1 * stride));
1744
38.7k
  const __m256i s00 = _mm256_unpacklo_epi8(d[0], d[1]);
1745
38.7k
  const __m256i s01 = _mm256_unpackhi_epi8(d[0], d[1]);
1746
38.7k
  d[0] = _mm256_loadu_si256((__m256i *)(data + 2 * stride));
1747
38.7k
  const __m256i s10 = _mm256_unpacklo_epi8(d[1], d[0]);
1748
38.7k
  const __m256i s11 = _mm256_unpackhi_epi8(d[1], d[0]);
1749
1750
38.7k
  res[0] = _mm256_maddubs_epi16(s00, coeffs[0]);
1751
38.7k
  res[1] = _mm256_maddubs_epi16(s01, coeffs[0]);
1752
38.7k
  res[2] = _mm256_maddubs_epi16(s10, coeffs[0]);
1753
38.7k
  res[3] = _mm256_maddubs_epi16(s11, coeffs[0]);
1754
38.7k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_2tap_32x2_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_2tap_32x2_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_2tap_32x2_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_2tap_32x2_avx2
1755
1756
// Vertical-filter step that loads 32 bytes from each of the rows at
// `data + 3 * stride` and `data + 4 * stride` and combines them with the row
// cached in d[2].
//   coeffs - forwarded unchanged to all four convolve_lowbd_4tap() calls.
//   d      - row cache: d[2] is read on entry; on return d[3] holds the row
//            at 3 strides and d[2] the row at 4 strides.
//   s1     - built from the (old d[2], d[3]) pair: s1[1] takes the low 8
//            bytes of every 128-bit lane and s1[3] the high 8 bytes. s1[0]
//            and s1[2] are not written here (presumably prepared by the
//            caller -- TODO confirm).
//   s2     - same layout as s1, built from the (d[3], new d[2]) pair.
//   res    - res[0]/res[1] are computed from s1 and s1 + 2,
//            res[2]/res[3] from s2 and s2 + 2.
static inline void convolve_y_4tap_32x2_avx2(const uint8_t *const data,
1757
                                             const ptrdiff_t stride,
1758
                                             const __m256i coeffs[2],
1759
                                             __m256i d[4], __m256i s1[4],
1760
132k
                                             __m256i s2[4], __m256i res[4]) {
1761
132k
  d[3] = _mm256_loadu_si256((__m256i *)(data + 3 * stride));
1762
132k
  s1[1] = _mm256_unpacklo_epi8(d[2], d[3]);
1763
132k
  s1[3] = _mm256_unpackhi_epi8(d[2], d[3]);
1764
132k
  // The old d[2] was consumed by the two unpacks above, so its slot is reused
  // for the next row.
  d[2] = _mm256_loadu_si256((__m256i *)(data + 4 * stride));
1765
132k
  s2[1] = _mm256_unpacklo_epi8(d[3], d[2]);
1766
132k
  s2[3] = _mm256_unpackhi_epi8(d[3], d[2]);
1767
1768
132k
  res[0] = convolve_lowbd_4tap(s1, coeffs);
1769
132k
  res[1] = convolve_lowbd_4tap(s1 + 2, coeffs);
1770
132k
  res[2] = convolve_lowbd_4tap(s2, coeffs);
1771
132k
  res[3] = convolve_lowbd_4tap(s2 + 2, coeffs);
1772
132k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_4tap_32x2_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_4tap_32x2_avx2
Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_4tap_32x2_avx2
convolve_avx2.c:convolve_y_4tap_32x2_avx2
Line
Count
Source
1760
132k
                                             __m256i s2[4], __m256i res[4]) {
1761
132k
  d[3] = _mm256_loadu_si256((__m256i *)(data + 3 * stride));
1762
132k
  s1[1] = _mm256_unpacklo_epi8(d[2], d[3]);
1763
132k
  s1[3] = _mm256_unpackhi_epi8(d[2], d[3]);
1764
132k
  d[2] = _mm256_loadu_si256((__m256i *)(data + 4 * stride));
1765
132k
  s2[1] = _mm256_unpacklo_epi8(d[3], d[2]);
1766
132k
  s2[3] = _mm256_unpackhi_epi8(d[3], d[2]);
1767
1768
132k
  res[0] = convolve_lowbd_4tap(s1, coeffs);
1769
132k
  res[1] = convolve_lowbd_4tap(s1 + 2, coeffs);
1770
132k
  res[2] = convolve_lowbd_4tap(s2, coeffs);
1771
132k
  res[3] = convolve_lowbd_4tap(s2 + 2, coeffs);
1772
132k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_4tap_32x2_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_4tap_32x2_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_4tap_32x2_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_4tap_32x2_avx2
1773
#endif  // AOM_AOM_DSP_X86_CONVOLVE_AVX2_H_