Coverage Report

Created: 2026-05-16 06:27

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/aom/aom_dsp/x86/convolve_avx2.h
Line
Count
Source
1
/*
2
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
3
 *
4
 * This source code is subject to the terms of the BSD 2 Clause License and
5
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6
 * was not distributed with this source code in the LICENSE file, you can
7
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8
 * Media Patent License 1.0 was not distributed with this source code in the
9
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10
 */
11
12
#ifndef AOM_AOM_DSP_X86_CONVOLVE_AVX2_H_
13
#define AOM_AOM_DSP_X86_CONVOLVE_AVX2_H_
14
15
#include <immintrin.h>
16
17
#include "aom_ports/mem.h"
18
19
#include "aom_dsp/x86/mem_sse2.h"
20
#include "aom_dsp/x86/synonyms.h"
21
22
#include "av1/common/convolve.h"
23
#include "av1/common/filter.h"
24
25
737k
#define SECOND_32_BLK (32)
26
660k
#define THIRD_32_BLK (32 << 1)
27
330k
#define FOURTH_32_BLK (SECOND_32_BLK + THIRD_32_BLK)
28
29
// filters for 16
30
DECLARE_ALIGNED(32, static const uint8_t, filt_global_avx2[]) = {
31
  0,  1,  1,  2,  2, 3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8,  0,  1,  1,
32
  2,  2,  3,  3,  4, 4,  5,  5,  6,  6,  7,  7,  8,  2,  3,  3,  4,  4,  5,
33
  5,  6,  6,  7,  7, 8,  8,  9,  9,  10, 2,  3,  3,  4,  4,  5,  5,  6,  6,
34
  7,  7,  8,  8,  9, 9,  10, 4,  5,  5,  6,  6,  7,  7,  8,  8,  9,  9,  10,
35
  10, 11, 11, 12, 4, 5,  5,  6,  6,  7,  7,  8,  8,  9,  9,  10, 10, 11, 11,
36
  12, 6,  7,  7,  8, 8,  9,  9,  10, 10, 11, 11, 12, 12, 13, 13, 14, 6,  7,
37
  7,  8,  8,  9,  9, 10, 10, 11, 11, 12, 12, 13, 13, 14
38
};
39
40
DECLARE_ALIGNED(32, static const uint8_t, filt_d4_global_avx2[]) = {
41
  0, 1, 2, 3,  1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 0, 1, 2, 3,  1, 2,
42
  3, 4, 2, 3,  4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7,  8, 9,
43
  7, 8, 9, 10, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10,
44
};
45
46
DECLARE_ALIGNED(32, static const uint8_t, filt4_d4_global_avx2[]) = {
47
  2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8,
48
  2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8,
49
};
50
51
DECLARE_ALIGNED(32, static const uint8_t, filt_center_global_avx2[32]) = {
52
  3, 255, 4, 255, 5, 255, 6, 255, 7, 255, 8, 255, 9, 255, 10, 255,
53
  3, 255, 4, 255, 5, 255, 6, 255, 7, 255, 8, 255, 9, 255, 10, 255
54
};
55
56
DECLARE_ALIGNED(32, static const uint8_t,
57
                filt1_global_sse2[16]) = { 0, 1, 1, 2,  2,  3,  3,  4,
58
                                           8, 9, 9, 10, 10, 11, 11, 12 };
59
60
DECLARE_ALIGNED(32, static const uint8_t,
61
                filt2_global_sse2[16]) = { 2,  3,  3,  4,  4,  5,  5,  6,
62
                                           10, 11, 11, 12, 12, 13, 13, 14 };
63
64
DECLARE_ALIGNED(32, static const uint8_t,
65
                filt3_global_sse2[16]) = { 0, 1, 1, 2, 8, 9, 9, 10,
66
                                           0, 0, 0, 0, 0, 0, 0, 0 };
67
68
DECLARE_ALIGNED(32, static const uint8_t,
69
                filt4_global_sse2[16]) = { 2, 3, 3, 4, 10, 11, 11, 12,
70
                                           0, 0, 0, 0, 0,  0,  0,  0 };
71
72
DECLARE_ALIGNED(32, static const uint8_t,
73
                filt5_global_sse2[16]) = { 0, 1, 1, 2, 4, 5, 5, 6,
74
                                           0, 0, 0, 0, 0, 0, 0, 0 };
75
76
DECLARE_ALIGNED(32, static const uint8_t,
77
                filt1_global_avx2[32]) = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5,
78
                                           6, 6, 7, 7, 8, 0, 1, 1, 2, 2, 3,
79
                                           3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
80
81
DECLARE_ALIGNED(32, static const uint8_t,
82
                filt2_global_avx2[32]) = { 2, 3, 3, 4, 4,  5, 5, 6, 6, 7, 7,
83
                                           8, 8, 9, 9, 10, 2, 3, 3, 4, 4, 5,
84
                                           5, 6, 6, 7, 7,  8, 8, 9, 9, 10 };
85
86
DECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = {
87
  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12,
88
  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
89
};
90
91
DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = {
92
  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14,
93
  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
94
};
95
96
#define CONVOLVE_SR_HOR_FILTER_W4(CONVOLVE_LOWBD)                            \
97
2.89M
  for (i = 0; i < (im_h - 2); i += 2) {                                      \
98
2.34M
    __m128i data =                                                           \
99
2.34M
        load_8bit_8x2_to_1_reg_sse2(&src_ptr[(i * src_stride)], src_stride); \
100
2.34M
    __m128i res = CONVOLVE_LOWBD(data, coeffs_h, filt);                      \
101
2.34M
    res = _mm_srai_epi16(_mm_add_epi16(res, round_const_h), 2);              \
102
2.34M
    _mm_store_si128((__m128i *)&im_block[i * 4], res);                       \
103
2.34M
  }                                                                          \
104
546k
  __m128i data_1 = _mm_loadl_epi64((__m128i *)&src_ptr[(i * src_stride)]);   \
105
546k
  __m128i res = CONVOLVE_LOWBD(data_1, coeffs_h, filt);                      \
106
546k
  res = _mm_srai_epi16(_mm_add_epi16(res, round_const_h), 2);                \
107
546k
  _mm_storel_epi64((__m128i *)&im_block[i * 4], res);
108
109
#define CONVOLVE_SR_HOR_FILTER_2TAP_W4 \
110
19.1k
  CONVOLVE_SR_HOR_FILTER_W4(convolve_lowbd_x_2tap_ssse3)
111
112
#define CONVOLVE_SR_HOR_FILTER_4TAP_W4 \
113
527k
  CONVOLVE_SR_HOR_FILTER_W4(convolve_lowbd_x_4tap_ssse3)
114
115
static inline void sr_2d_ver_round_and_store_w4(int w, __m256i res,
116
                                                uint8_t *dst, int dst_stride,
117
1.62M
                                                __m256i round_const_v) {
118
1.62M
  const __m256i res_round =
119
1.62M
      _mm256_srai_epi32(_mm256_add_epi32(res, round_const_v), 11);
120
121
1.62M
  const __m256i res_16bit = _mm256_packs_epi32(res_round, res_round);
122
1.62M
  const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit);
123
124
1.62M
  const __m128i r0 = _mm256_castsi256_si128(res_8b);
125
1.62M
  const __m128i r1 = _mm256_extracti128_si256(res_8b, 1);
126
127
1.62M
  __m128i *const p0 = (__m128i *)dst;
128
1.62M
  __m128i *const p1 = (__m128i *)(dst + dst_stride);
129
130
1.62M
  if (w == 4) {
131
1.35M
    xx_storel_32(p0, r0);
132
1.35M
    xx_storel_32(p1, r1);
133
1.35M
  } else {
134
274k
    assert(w == 2);
135
274k
    *(uint16_t *)p0 = (uint16_t)_mm_cvtsi128_si32(r0);
136
274k
    *(uint16_t *)p1 = (uint16_t)_mm_cvtsi128_si32(r1);
137
274k
  }
138
1.62M
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:sr_2d_ver_round_and_store_w4
Unexecuted instantiation: highbd_convolve_avx2.c:sr_2d_ver_round_and_store_w4
convolve_2d_avx2.c:sr_2d_ver_round_and_store_w4
Line
Count
Source
117
1.62M
                                                __m256i round_const_v) {
118
1.62M
  const __m256i res_round =
119
1.62M
      _mm256_srai_epi32(_mm256_add_epi32(res, round_const_v), 11);
120
121
1.62M
  const __m256i res_16bit = _mm256_packs_epi32(res_round, res_round);
122
1.62M
  const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit);
123
124
1.62M
  const __m128i r0 = _mm256_castsi256_si128(res_8b);
125
1.62M
  const __m128i r1 = _mm256_extracti128_si256(res_8b, 1);
126
127
1.62M
  __m128i *const p0 = (__m128i *)dst;
128
1.62M
  __m128i *const p1 = (__m128i *)(dst + dst_stride);
129
130
1.62M
  if (w == 4) {
131
1.35M
    xx_storel_32(p0, r0);
132
1.35M
    xx_storel_32(p1, r1);
133
1.35M
  } else {
134
274k
    assert(w == 2);
135
274k
    *(uint16_t *)p0 = (uint16_t)_mm_cvtsi128_si32(r0);
136
274k
    *(uint16_t *)p1 = (uint16_t)_mm_cvtsi128_si32(r1);
137
274k
  }
138
1.62M
}
Unexecuted instantiation: convolve_avx2.c:sr_2d_ver_round_and_store_w4
Unexecuted instantiation: jnt_convolve_avx2.c:sr_2d_ver_round_and_store_w4
Unexecuted instantiation: wiener_convolve_avx2.c:sr_2d_ver_round_and_store_w4
Unexecuted instantiation: highbd_convolve_2d_avx2.c:sr_2d_ver_round_and_store_w4
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:sr_2d_ver_round_and_store_w4
139
140
#define CONVOLVE_SR_VER_FILTER_2TAP_W4                                        \
141
19.1k
  __m128i s[2];                                                               \
142
19.1k
  s[0] = _mm_loadl_epi64((__m128i *)(im_block + 0 * 4));                      \
143
19.1k
                                                                              \
144
76.0k
  for (i = 0; i < h; i += 2) {                                                \
145
56.9k
    const int16_t *data = &im_block[i * 4];                                   \
146
56.9k
    s[1] = _mm_loadl_epi64((__m128i *)(data + 1 * 4));                        \
147
56.9k
    const __m256i src_0 = _mm256_setr_m128i(s[0], s[1]);                      \
148
56.9k
    s[0] = _mm_loadl_epi64((__m128i *)(data + 2 * 4));                        \
149
56.9k
    const __m256i src_1 = _mm256_setr_m128i(s[1], s[0]);                      \
150
56.9k
    const __m256i ss = _mm256_unpacklo_epi16(src_0, src_1);                   \
151
56.9k
                                                                              \
152
56.9k
    const __m256i res = _mm256_madd_epi16(ss, coeffs_v[0]);                   \
153
56.9k
                                                                              \
154
56.9k
    sr_2d_ver_round_and_store_w4(w, res, dst_ptr, dst_stride, round_const_v); \
155
56.9k
    dst_ptr += 2 * dst_stride;                                                \
156
56.9k
  }
157
158
#define CONVOLVE_SR_VER_FILTER_4TAP_W4                                        \
159
346k
  __m128i s[4];                                                               \
160
346k
  __m256i ss[2];                                                              \
161
346k
  s[0] = _mm_loadl_epi64((__m128i *)(im_block + 0 * 4));                      \
162
346k
  s[1] = _mm_loadl_epi64((__m128i *)(im_block + 1 * 4));                      \
163
346k
  s[2] = _mm_loadl_epi64((__m128i *)(im_block + 2 * 4));                      \
164
346k
                                                                              \
165
346k
  const __m256i src_0 = _mm256_setr_m128i(s[0], s[1]);                        \
166
346k
  const __m256i src_1 = _mm256_setr_m128i(s[1], s[2]);                        \
167
346k
                                                                              \
168
346k
  ss[0] = _mm256_unpacklo_epi16(src_0, src_1);                                \
169
346k
                                                                              \
170
1.00M
  for (i = 0; i < h; i += 2) {                                                \
171
658k
    const int16_t *data = &im_block[i * 4];                                   \
172
658k
    s[3] = _mm_loadl_epi64((__m128i *)(data + 3 * 4));                        \
173
658k
    const __m256i src_2 = _mm256_setr_m128i(s[2], s[3]);                      \
174
658k
    s[2] = _mm_loadl_epi64((__m128i *)(data + 4 * 4));                        \
175
658k
    const __m256i src_3 = _mm256_setr_m128i(s[3], s[2]);                      \
176
658k
    ss[1] = _mm256_unpacklo_epi16(src_2, src_3);                              \
177
658k
                                                                              \
178
658k
    const __m256i res = convolve_4tap(ss, coeffs_v);                          \
179
658k
                                                                              \
180
658k
    sr_2d_ver_round_and_store_w4(w, res, dst_ptr, dst_stride, round_const_v); \
181
658k
    dst_ptr += 2 * dst_stride;                                                \
182
658k
                                                                              \
183
658k
    ss[0] = ss[1];                                                            \
184
658k
  }
185
186
#define CONVOLVE_SR_VER_FILTER_6TAP_W4                                        \
187
168k
  __m128i s[6];                                                               \
188
168k
  __m256i ss[3];                                                              \
189
168k
  s[0] = _mm_loadl_epi64((__m128i *)(im_block + 0 * 4));                      \
190
168k
  s[1] = _mm_loadl_epi64((__m128i *)(im_block + 1 * 4));                      \
191
168k
  s[2] = _mm_loadl_epi64((__m128i *)(im_block + 2 * 4));                      \
192
168k
  s[3] = _mm_loadl_epi64((__m128i *)(im_block + 3 * 4));                      \
193
168k
  s[4] = _mm_loadl_epi64((__m128i *)(im_block + 4 * 4));                      \
194
168k
                                                                              \
195
168k
  const __m256i src_0 = _mm256_setr_m128i(s[0], s[1]);                        \
196
168k
  const __m256i src_1 = _mm256_setr_m128i(s[1], s[2]);                        \
197
168k
  const __m256i src_2 = _mm256_setr_m128i(s[2], s[3]);                        \
198
168k
  const __m256i src_3 = _mm256_setr_m128i(s[3], s[4]);                        \
199
168k
                                                                              \
200
168k
  ss[0] = _mm256_unpacklo_epi16(src_0, src_1);                                \
201
168k
  ss[1] = _mm256_unpacklo_epi16(src_2, src_3);                                \
202
168k
                                                                              \
203
1.02M
  for (i = 0; i < h; i += 2) {                                                \
204
855k
    const int16_t *data = &im_block[i * 4];                                   \
205
855k
    s[5] = _mm_loadl_epi64((__m128i *)(data + 5 * 4));                        \
206
855k
    const __m256i src_4 = _mm256_setr_m128i(s[4], s[5]);                      \
207
855k
    s[4] = _mm_loadl_epi64((__m128i *)(data + 6 * 4));                        \
208
855k
    const __m256i src_5 = _mm256_setr_m128i(s[5], s[4]);                      \
209
855k
    ss[2] = _mm256_unpacklo_epi16(src_4, src_5);                              \
210
855k
                                                                              \
211
855k
    const __m256i res = convolve_6tap(ss, coeffs_v);                          \
212
855k
                                                                              \
213
855k
    sr_2d_ver_round_and_store_w4(w, res, dst_ptr, dst_stride, round_const_v); \
214
855k
    dst_ptr += 2 * dst_stride;                                                \
215
855k
                                                                              \
216
855k
    ss[0] = ss[1];                                                            \
217
855k
    ss[1] = ss[2];                                                            \
218
855k
  }
219
220
#define CONVOLVE_SR_VER_FILTER_8TAP_W4                                        \
221
11.7k
  __m128i s[8];                                                               \
222
11.7k
  __m256i ss[4];                                                              \
223
11.7k
  s[0] = _mm_loadl_epi64((__m128i *)(im_block + 0 * 4));                      \
224
11.7k
  s[1] = _mm_loadl_epi64((__m128i *)(im_block + 1 * 4));                      \
225
11.7k
  s[2] = _mm_loadl_epi64((__m128i *)(im_block + 2 * 4));                      \
226
11.7k
  s[3] = _mm_loadl_epi64((__m128i *)(im_block + 3 * 4));                      \
227
11.7k
  s[4] = _mm_loadl_epi64((__m128i *)(im_block + 4 * 4));                      \
228
11.7k
  s[5] = _mm_loadl_epi64((__m128i *)(im_block + 5 * 4));                      \
229
11.7k
  s[6] = _mm_loadl_epi64((__m128i *)(im_block + 6 * 4));                      \
230
11.7k
                                                                              \
231
11.7k
  const __m256i src_0 = _mm256_setr_m128i(s[0], s[1]);                        \
232
11.7k
  const __m256i src_1 = _mm256_setr_m128i(s[1], s[2]);                        \
233
11.7k
  const __m256i src_2 = _mm256_setr_m128i(s[2], s[3]);                        \
234
11.7k
  const __m256i src_3 = _mm256_setr_m128i(s[3], s[4]);                        \
235
11.7k
  const __m256i src_4 = _mm256_setr_m128i(s[4], s[5]);                        \
236
11.7k
  const __m256i src_5 = _mm256_setr_m128i(s[5], s[6]);                        \
237
11.7k
                                                                              \
238
11.7k
  ss[0] = _mm256_unpacklo_epi16(src_0, src_1);                                \
239
11.7k
  ss[1] = _mm256_unpacklo_epi16(src_2, src_3);                                \
240
11.7k
  ss[2] = _mm256_unpacklo_epi16(src_4, src_5);                                \
241
11.7k
                                                                              \
242
70.8k
  for (i = 0; i < h; i += 2) {                                                \
243
59.1k
    const int16_t *data = &im_block[i * 4];                                   \
244
59.1k
    s[7] = _mm_loadl_epi64((__m128i *)(data + 7 * 4));                        \
245
59.1k
    const __m256i src_6 = _mm256_setr_m128i(s[6], s[7]);                      \
246
59.1k
    s[6] = _mm_loadl_epi64((__m128i *)(data + 8 * 4));                        \
247
59.1k
    const __m256i src_7 = _mm256_setr_m128i(s[7], s[6]);                      \
248
59.1k
    ss[3] = _mm256_unpacklo_epi16(src_6, src_7);                              \
249
59.1k
                                                                              \
250
59.1k
    const __m256i res = convolve(ss, coeffs_v);                               \
251
59.1k
                                                                              \
252
59.1k
    sr_2d_ver_round_and_store_w4(w, res, dst_ptr, dst_stride, round_const_v); \
253
59.1k
    dst_ptr += 2 * dst_stride;                                                \
254
59.1k
                                                                              \
255
59.1k
    ss[0] = ss[1];                                                            \
256
59.1k
    ss[1] = ss[2];                                                            \
257
59.1k
    ss[2] = ss[3];                                                            \
258
59.1k
  }
259
260
#define CONVOLVE_SR_HORIZONTAL_FILTER(CONVOLVE_LOWBD)                 \
261
  for (i = 0; i < (im_h - 2); i += 2) {                               \
262
    __m256i data = _mm256_castsi128_si256(                            \
263
        _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j]));  \
264
    data = _mm256_inserti128_si256(                                   \
265
        data,                                                         \
266
        _mm_loadu_si128(                                              \
267
            (__m128i *)&src_ptr[(i * src_stride) + j + src_stride]),  \
268
        1);                                                           \
269
    __m256i res = CONVOLVE_LOWBD(data, coeffs_h, filt);               \
270
    res = _mm256_srai_epi16(_mm256_add_epi16(res, round_const_h), 2); \
271
    _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);     \
272
  }                                                                   \
273
  __m256i data_1 = _mm256_castsi128_si256(                            \
274
      _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j]));    \
275
  __m256i res = CONVOLVE_LOWBD(data_1, coeffs_h, filt);               \
276
  res = _mm256_srai_epi16(_mm256_add_epi16(res, round_const_h), 2);   \
277
  _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);
278
279
#define CONVOLVE_SR_HORIZONTAL_FILTER_2TAP \
280
  CONVOLVE_SR_HORIZONTAL_FILTER(convolve_lowbd_x_2tap)
281
282
#define CONVOLVE_SR_HORIZONTAL_FILTER_4TAP \
283
  CONVOLVE_SR_HORIZONTAL_FILTER(convolve_lowbd_x_4tap)
284
285
#define CONVOLVE_SR_HORIZONTAL_FILTER_6TAP \
286
  CONVOLVE_SR_HORIZONTAL_FILTER(convolve_lowbd_x_6tap)
287
288
#define CONVOLVE_SR_HORIZONTAL_FILTER_8TAP \
289
  CONVOLVE_SR_HORIZONTAL_FILTER(convolve_lowbd_x)
290
291
static inline void sr_2d_ver_round_and_store(__m256i res_a, __m256i res_b,
292
                                             uint8_t *dst, int dst_stride,
293
11.1M
                                             __m256i round_const_v) {
294
11.1M
  const __m256i res_a_round =
295
11.1M
      _mm256_srai_epi32(_mm256_add_epi32(res_a, round_const_v), 11);
296
11.1M
  const __m256i res_b_round =
297
11.1M
      _mm256_srai_epi32(_mm256_add_epi32(res_b, round_const_v), 11);
298
11.1M
  const __m256i r16 = _mm256_packs_epi32(res_a_round, res_b_round);
299
11.1M
  const __m256i r8 = _mm256_packus_epi16(r16, r16);
300
301
11.1M
  _mm_storel_epi64((__m128i *)dst, _mm256_castsi256_si128(r8));
302
11.1M
  _mm_storel_epi64((__m128i *)(dst + dst_stride),
303
11.1M
                   _mm256_extracti128_si256(r8, 1));
304
11.1M
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:sr_2d_ver_round_and_store
Unexecuted instantiation: highbd_convolve_avx2.c:sr_2d_ver_round_and_store
convolve_2d_avx2.c:sr_2d_ver_round_and_store
Line
Count
Source
293
11.1M
                                             __m256i round_const_v) {
294
11.1M
  const __m256i res_a_round =
295
11.1M
      _mm256_srai_epi32(_mm256_add_epi32(res_a, round_const_v), 11);
296
11.1M
  const __m256i res_b_round =
297
11.1M
      _mm256_srai_epi32(_mm256_add_epi32(res_b, round_const_v), 11);
298
11.1M
  const __m256i r16 = _mm256_packs_epi32(res_a_round, res_b_round);
299
11.1M
  const __m256i r8 = _mm256_packus_epi16(r16, r16);
300
301
11.1M
  _mm_storel_epi64((__m128i *)dst, _mm256_castsi256_si128(r8));
302
11.1M
  _mm_storel_epi64((__m128i *)(dst + dst_stride),
303
                   _mm256_extracti128_si256(r8, 1));
304
11.1M
}
Unexecuted instantiation: convolve_avx2.c:sr_2d_ver_round_and_store
Unexecuted instantiation: jnt_convolve_avx2.c:sr_2d_ver_round_and_store
Unexecuted instantiation: wiener_convolve_avx2.c:sr_2d_ver_round_and_store
Unexecuted instantiation: highbd_convolve_2d_avx2.c:sr_2d_ver_round_and_store
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:sr_2d_ver_round_and_store
305
306
#define CONVOLVE_SR_VERTICAL_FILTER_2TAP                                      \
307
435k
  for (i = 0; i < h; i += 2) {                                                \
308
402k
    __m256i s[2];                                                             \
309
402k
    const int16_t *data = &im_block[i * im_stride];                           \
310
402k
    const __m256i s1 = _mm256_loadu_si256((__m256i *)(data + 0 * im_stride)); \
311
402k
    const __m256i s2 = _mm256_loadu_si256((__m256i *)(data + 1 * im_stride)); \
312
402k
    s[0] = _mm256_unpacklo_epi16(s1, s2);                                     \
313
402k
    s[1] = _mm256_unpackhi_epi16(s1, s2);                                     \
314
402k
                                                                              \
315
402k
    __m256i res_a = _mm256_madd_epi16(s[0], coeffs_v[0]);                     \
316
402k
    __m256i res_b = _mm256_madd_epi16(s[1], coeffs_v[0]);                     \
317
402k
                                                                              \
318
402k
    sr_2d_ver_round_and_store(res_a, res_b, dst_ptr, dst_stride,              \
319
402k
                              round_const_v);                                 \
320
402k
    dst_ptr += 2 * dst_stride;                                                \
321
402k
  }
322
323
#define CONVOLVE_SR_VERTICAL_FILTER_4TAP                                      \
324
469k
  __m256i s[6];                                                               \
325
469k
  __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));  \
326
469k
  __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));  \
327
469k
                                                                              \
328
469k
  s[0] = _mm256_unpacklo_epi16(src_0, src_1);                                 \
329
469k
  s[2] = _mm256_unpackhi_epi16(src_0, src_1);                                 \
330
469k
                                                                              \
331
1.81M
  for (i = 0; i < h; i += 2) {                                                \
332
1.34M
    const int16_t *data = &im_block[i * im_stride];                           \
333
1.34M
    const __m256i s4 = _mm256_loadu_si256((__m256i *)(data + 2 * im_stride)); \
334
1.34M
    const __m256i s5 = _mm256_loadu_si256((__m256i *)(data + 3 * im_stride)); \
335
1.34M
    s[1] = _mm256_unpacklo_epi16(s4, s5);                                     \
336
1.34M
    s[3] = _mm256_unpackhi_epi16(s4, s5);                                     \
337
1.34M
                                                                              \
338
1.34M
    __m256i res_a = convolve_4tap(s, coeffs_v);                               \
339
1.34M
    __m256i res_b = convolve_4tap(s + 2, coeffs_v);                           \
340
1.34M
                                                                              \
341
1.34M
    sr_2d_ver_round_and_store(res_a, res_b, dst_ptr, dst_stride,              \
342
1.34M
                              round_const_v);                                 \
343
1.34M
    dst_ptr += 2 * dst_stride;                                                \
344
1.34M
                                                                              \
345
1.34M
    s[0] = s[1];                                                              \
346
1.34M
    s[2] = s[3];                                                              \
347
1.34M
  }
348
349
#define CONVOLVE_SR_VERTICAL_FILTER_6TAP                                      \
350
679k
  __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));  \
351
679k
  __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));  \
352
679k
  __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));  \
353
679k
  __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));  \
354
679k
                                                                              \
355
679k
  __m256i s[8];                                                               \
356
679k
  s[0] = _mm256_unpacklo_epi16(src_0, src_1);                                 \
357
679k
  s[1] = _mm256_unpacklo_epi16(src_2, src_3);                                 \
358
679k
                                                                              \
359
679k
  s[3] = _mm256_unpackhi_epi16(src_0, src_1);                                 \
360
679k
  s[4] = _mm256_unpackhi_epi16(src_2, src_3);                                 \
361
679k
                                                                              \
362
8.33M
  for (i = 0; i < h; i += 2) {                                                \
363
7.65M
    const int16_t *data = &im_block[i * im_stride];                           \
364
7.65M
                                                                              \
365
7.65M
    const __m256i s6 = _mm256_loadu_si256((__m256i *)(data + 4 * im_stride)); \
366
7.65M
    const __m256i s7 = _mm256_loadu_si256((__m256i *)(data + 5 * im_stride)); \
367
7.65M
                                                                              \
368
7.65M
    s[2] = _mm256_unpacklo_epi16(s6, s7);                                     \
369
7.65M
    s[5] = _mm256_unpackhi_epi16(s6, s7);                                     \
370
7.65M
                                                                              \
371
7.65M
    __m256i res_a = convolve_6tap(s, coeffs_v);                               \
372
7.65M
    __m256i res_b = convolve_6tap(s + 3, coeffs_v);                           \
373
7.65M
                                                                              \
374
7.65M
    sr_2d_ver_round_and_store(res_a, res_b, dst_ptr, dst_stride,              \
375
7.65M
                              round_const_v);                                 \
376
7.65M
    dst_ptr += 2 * dst_stride;                                                \
377
7.65M
                                                                              \
378
7.65M
    s[0] = s[1];                                                              \
379
7.65M
    s[1] = s[2];                                                              \
380
7.65M
                                                                              \
381
7.65M
    s[3] = s[4];                                                              \
382
7.65M
    s[4] = s[5];                                                              \
383
7.65M
  }
384
385
#define CONVOLVE_SR_VERTICAL_FILTER_8TAP                                      \
386
146k
  __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));  \
387
146k
  __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));  \
388
146k
  __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));  \
389
146k
  __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));  \
390
146k
  __m256i src_4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride));  \
391
146k
  __m256i src_5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride));  \
392
146k
                                                                              \
393
146k
  __m256i s[8];                                                               \
394
146k
  s[0] = _mm256_unpacklo_epi16(src_0, src_1);                                 \
395
146k
  s[1] = _mm256_unpacklo_epi16(src_2, src_3);                                 \
396
146k
  s[2] = _mm256_unpacklo_epi16(src_4, src_5);                                 \
397
146k
                                                                              \
398
146k
  s[4] = _mm256_unpackhi_epi16(src_0, src_1);                                 \
399
146k
  s[5] = _mm256_unpackhi_epi16(src_2, src_3);                                 \
400
146k
  s[6] = _mm256_unpackhi_epi16(src_4, src_5);                                 \
401
146k
                                                                              \
402
1.92M
  for (i = 0; i < h; i += 2) {                                                \
403
1.77M
    const int16_t *data = &im_block[i * im_stride];                           \
404
1.77M
                                                                              \
405
1.77M
    const __m256i s6 = _mm256_loadu_si256((__m256i *)(data + 6 * im_stride)); \
406
1.77M
    const __m256i s7 = _mm256_loadu_si256((__m256i *)(data + 7 * im_stride)); \
407
1.77M
                                                                              \
408
1.77M
    s[3] = _mm256_unpacklo_epi16(s6, s7);                                     \
409
1.77M
    s[7] = _mm256_unpackhi_epi16(s6, s7);                                     \
410
1.77M
                                                                              \
411
1.77M
    __m256i res_a = convolve(s, coeffs_v);                                    \
412
1.77M
    __m256i res_b = convolve(s + 4, coeffs_v);                                \
413
1.77M
                                                                              \
414
1.77M
    sr_2d_ver_round_and_store(res_a, res_b, dst_ptr, dst_stride,              \
415
1.77M
                              round_const_v);                                 \
416
1.77M
    dst_ptr += 2 * dst_stride;                                                \
417
1.77M
                                                                              \
418
1.77M
    s[0] = s[1];                                                              \
419
1.77M
    s[1] = s[2];                                                              \
420
1.77M
    s[2] = s[3];                                                              \
421
1.77M
                                                                              \
422
1.77M
    s[4] = s[5];                                                              \
423
1.77M
    s[5] = s[6];                                                              \
424
1.77M
    s[6] = s[7];                                                              \
425
1.77M
  }
426
427
// Vertical 12-tap filter pass: reads 16-bit rows from im_block, filters them
// and stores the result as bytes in dst.
//
// The expansion is a bare statement sequence (it is NOT wrapped in
// do { } while (0)), and it relies on these names at the expansion site:
//   im_block, im_stride, s (entries 0..11 are written), i, j, h, w, dst,
//   dst_stride, coeffs_v, sum_round_v, sum_shift_v, round_const_v,
//   round_shift_v.
//
// Rows 0..9 are loaded and interleaved pairwise once. Each loop iteration
// loads rows i + 10 and i + 11, runs convolve_12taps() on the unpacklo set
// (s[0..5]) and on the unpackhi set (s[6..11]), applies two add-and-shift
// rounding stages, packs with saturation down to bytes, and stores the low
// 128-bit lane at row i and the high lane at row i + 1 of dst. The store is
// 8 bytes wide when w - j > 4, 4 bytes when w == 4, and 2 bytes otherwise.
// At the end of an iteration the interleaved rows slide down by one slot so
// that only the two newest rows have to be loaded next time.
// NOTE(review): a single 256-bit load yields two output rows only if
// im_stride is 8 int16_t elements -- confirm at the expansion site.
#define CONVOLVE_SR_VERTICAL_FILTER_12TAP                                      \
428
0
  __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));   \
429
0
  __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));   \
430
0
  __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));   \
431
0
  __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));   \
432
0
  __m256i src_4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride));   \
433
0
  __m256i src_5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride));   \
434
0
  __m256i src_6 = _mm256_loadu_si256((__m256i *)(im_block + 6 * im_stride));   \
435
0
  __m256i src_7 = _mm256_loadu_si256((__m256i *)(im_block + 7 * im_stride));   \
436
0
  __m256i src_8 = _mm256_loadu_si256((__m256i *)(im_block + 8 * im_stride));   \
437
0
  __m256i src_9 = _mm256_loadu_si256((__m256i *)(im_block + 9 * im_stride));   \
438
0
                                                                               \
439
0
  s[0] = _mm256_unpacklo_epi16(src_0, src_1);                                  \
440
0
  s[1] = _mm256_unpacklo_epi16(src_2, src_3);                                  \
441
0
  s[2] = _mm256_unpacklo_epi16(src_4, src_5);                                  \
442
0
  s[3] = _mm256_unpacklo_epi16(src_6, src_7);                                  \
443
0
  s[4] = _mm256_unpacklo_epi16(src_8, src_9);                                  \
444
0
                                                                               \
445
0
  s[6] = _mm256_unpackhi_epi16(src_0, src_1);                                  \
446
0
  s[7] = _mm256_unpackhi_epi16(src_2, src_3);                                  \
447
0
  s[8] = _mm256_unpackhi_epi16(src_4, src_5);                                  \
448
0
  s[9] = _mm256_unpackhi_epi16(src_6, src_7);                                  \
449
0
  s[10] = _mm256_unpackhi_epi16(src_8, src_9);                                 \
450
0
                                                                               \
451
0
  for (i = 0; i < h; i += 2) {                                                 \
452
0
    const int16_t *data = &im_block[i * im_stride];                            \
453
0
                                                                               \
454
0
    const __m256i s6 = _mm256_loadu_si256((__m256i *)(data + 10 * im_stride)); \
455
0
    const __m256i s7 = _mm256_loadu_si256((__m256i *)(data + 11 * im_stride)); \
456
0
                                                                               \
457
0
    s[5] = _mm256_unpacklo_epi16(s6, s7);                                      \
458
0
    s[11] = _mm256_unpackhi_epi16(s6, s7);                                     \
459
0
                                                                               \
460
0
    __m256i res_a = convolve_12taps(s, coeffs_v);                              \
461
0
    __m256i res_b = convolve_12taps(s + 6, coeffs_v);                          \
462
0
                                                                               \
463
0
    res_a =                                                                    \
464
0
        _mm256_sra_epi32(_mm256_add_epi32(res_a, sum_round_v), sum_shift_v);   \
465
0
    res_b =                                                                    \
466
0
        _mm256_sra_epi32(_mm256_add_epi32(res_b, sum_round_v), sum_shift_v);   \
467
0
                                                                               \
468
0
    const __m256i res_a_round = _mm256_sra_epi32(                              \
469
0
        _mm256_add_epi32(res_a, round_const_v), round_shift_v);                \
470
0
    const __m256i res_b_round = _mm256_sra_epi32(                              \
471
0
        _mm256_add_epi32(res_b, round_const_v), round_shift_v);                \
472
0
                                                                               \
473
0
    const __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round);    \
474
0
    const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit);          \
475
0
                                                                               \
476
0
    const __m128i res_0 = _mm256_castsi256_si128(res_8b);                      \
477
0
    const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);                 \
478
0
                                                                               \
479
0
    __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];                  \
480
0
    __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride];     \
481
0
    if (w - j > 4) {                                                           \
482
0
      _mm_storel_epi64(p_0, res_0);                                            \
483
0
      _mm_storel_epi64(p_1, res_1);                                            \
484
0
    } else if (w == 4) {                                                       \
485
0
      xx_storel_32(p_0, res_0);                                                \
486
0
      xx_storel_32(p_1, res_1);                                                \
487
0
    } else {                                                                   \
488
0
      *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);                   \
489
0
      *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);                   \
490
0
    }                                                                          \
491
0
                                                                               \
492
0
    s[0] = s[1];                                                               \
493
0
    s[1] = s[2];                                                               \
494
0
    s[2] = s[3];                                                               \
495
0
    s[3] = s[4];                                                               \
496
0
    s[4] = s[5];                                                               \
497
0
                                                                               \
498
0
    s[6] = s[7];                                                               \
499
0
    s[7] = s[8];                                                               \
500
0
    s[8] = s[9];                                                               \
501
0
    s[9] = s[10];                                                              \
502
0
    s[10] = s[11];                                                             \
503
0
  }
504
505
// Horizontal filter pass that fills im_block two rows per loop iteration.
//
// Names taken from the expansion site: i, im_h, src_h, src_stride, coeffs_x,
// filt, round_const_h, round_shift_h, im_block, im_stride.
//
// Each iteration loads 16 bytes at src_h into the low 128-bit lane and, when
// row i + 1 exists, 16 bytes at src_h + src_stride into the high lane; runs
// convolve_lowbd_x(); adds round_const_h and arithmetic-shifts the 16-bit
// results right by round_shift_h; then writes all 32 bytes to
// &im_block[i * im_stride].
//
// Things to keep in mind at the expansion site:
//  - src_h is advanced by 2 * src_stride per iteration, so it no longer
//    points at the first row once the macro has run.
//  - The store is _mm256_store_si256, i.e. the destination address has to be
//    32-byte aligned.
//  - When im_h is odd the last iteration never fills the high lane;
//    _mm256_castsi128_si256() leaves those bits undefined, yet the full
//    256 bits are still stored.
#define DIST_WTD_CONVOLVE_HORIZONTAL_FILTER_8TAP                        \
506
198k
  do {                                                                  \
507
3.25M
    for (i = 0; i < im_h; i += 2) {                                     \
508
3.06M
      __m256i data =                                                    \
509
3.06M
          _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)src_h));    \
510
3.06M
      if (i + 1 < im_h)                                                 \
511
3.06M
        data = _mm256_inserti128_si256(                                 \
512
3.06M
            data, _mm_loadu_si128((__m128i *)(src_h + src_stride)), 1); \
513
3.06M
      src_h += (src_stride << 1);                                       \
514
3.06M
      __m256i res = convolve_lowbd_x(data, coeffs_x, filt);             \
515
3.06M
                                                                        \
516
3.06M
      res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h),      \
517
3.06M
                             round_shift_h);                            \
518
3.06M
                                                                        \
519
3.06M
      _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);     \
520
3.06M
    }                                                                   \
521
198k
  } while (0)
522
523
// Vertical 8-tap filter pass over the 16-bit rows in im_block, with an
// optional averaging step against the values already held in dst.
//
// Names taken from the expansion site: im_block, im_stride, i, j, h, w,
// coeffs_y, round_const_v, round_shift_v, offset_const, do_average, dst,
// dst_stride, dst0, dst_stride0, wt, use_dist_wtd_comp_avg, rounding_const,
// rounding_shift. The interleave buffer s[8] is declared by the macro itself.
//
// Rows 0..5 are loaded and interleaved once; each iteration loads rows i + 6
// and i + 7, filters with convolve(), rounds, packs to 16 bits and adds
// offset_const. The low 128-bit lane of the result belongs to row i and the
// high lane to row i + 1. What is stored depends on two conditions:
//
//   w - j > 4, do_average  : comp_avg() with the two dst rows, then
//                            convolve_rounding(), pack to bytes, 8-byte store
//                            to each dst0 row.
//   w - j > 4, !do_average : aligned 128-bit store of the 16-bit values to
//                            each dst row.
//   otherwise, do_average  : as the first case, but the unpackhi half is not
//                            filtered and only 4 bytes per dst0 row are
//                            written (through an int pointer).
//   otherwise, !do_average : aligned 128-bit store to each dst row; the lane
//                            holds the four results twice because
//                            res_a_round is packed with itself.
//
// The non-averaging stores use _mm_store_si128, so those dst addresses have
// to be 16-byte aligned.
// NOTE(review): a single 256-bit load yields two output rows only if
// im_stride is 8 int16_t elements -- confirm at the expansion site.
#define DIST_WTD_CONVOLVE_VERTICAL_FILTER_8TAP                                 \
524
250k
  do {                                                                         \
525
250k
    __m256i s[8];                                                              \
526
250k
    __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));    \
527
250k
    __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));    \
528
250k
    __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));    \
529
250k
    __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));    \
530
250k
    __m256i s4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride));    \
531
250k
    __m256i s5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride));    \
532
250k
                                                                               \
533
250k
    s[0] = _mm256_unpacklo_epi16(s0, s1);                                      \
534
250k
    s[1] = _mm256_unpacklo_epi16(s2, s3);                                      \
535
250k
    s[2] = _mm256_unpacklo_epi16(s4, s5);                                      \
536
250k
                                                                               \
537
250k
    s[4] = _mm256_unpackhi_epi16(s0, s1);                                      \
538
250k
    s[5] = _mm256_unpackhi_epi16(s2, s3);                                      \
539
250k
    s[6] = _mm256_unpackhi_epi16(s4, s5);                                      \
540
250k
                                                                               \
541
2.72M
    for (i = 0; i < h; i += 2) {                                               \
542
2.47M
      const int16_t *data = &im_block[i * im_stride];                          \
543
2.47M
                                                                               \
544
2.47M
      const __m256i s6 =                                                       \
545
2.47M
          _mm256_loadu_si256((__m256i *)(data + 6 * im_stride));               \
546
2.47M
      const __m256i s7 =                                                       \
547
2.47M
          _mm256_loadu_si256((__m256i *)(data + 7 * im_stride));               \
548
2.47M
                                                                               \
549
2.47M
      s[3] = _mm256_unpacklo_epi16(s6, s7);                                    \
550
2.47M
      s[7] = _mm256_unpackhi_epi16(s6, s7);                                    \
551
2.47M
                                                                               \
552
2.47M
      const __m256i res_a = convolve(s, coeffs_y);                             \
553
2.47M
      const __m256i res_a_round = _mm256_sra_epi32(                            \
554
2.47M
          _mm256_add_epi32(res_a, round_const_v), round_shift_v);              \
555
2.47M
                                                                               \
556
2.47M
      if (w - j > 4) {                                                         \
557
2.33M
        const __m256i res_b = convolve(s + 4, coeffs_y);                       \
558
2.33M
        const __m256i res_b_round = _mm256_sra_epi32(                          \
559
2.33M
            _mm256_add_epi32(res_b, round_const_v), round_shift_v);            \
560
2.33M
        const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_b_round);  \
561
2.33M
        const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const);  \
562
2.33M
                                                                               \
563
2.33M
        if (do_average) {                                                      \
564
960k
          const __m256i data_ref_0 =                                           \
565
960k
              load_line2_avx2(&dst[i * dst_stride + j],                        \
566
960k
                              &dst[i * dst_stride + j + dst_stride]);          \
567
960k
          const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned,    \
568
960k
                                                &wt, use_dist_wtd_comp_avg);   \
569
960k
                                                                               \
570
960k
          const __m256i round_result = convolve_rounding(                      \
571
960k
              &comp_avg_res, &offset_const, &rounding_const, rounding_shift);  \
572
960k
                                                                               \
573
960k
          const __m256i res_8 =                                                \
574
960k
              _mm256_packus_epi16(round_result, round_result);                 \
575
960k
          const __m128i res_0 = _mm256_castsi256_si128(res_8);                 \
576
960k
          const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);            \
577
960k
                                                                               \
578
960k
          _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);    \
579
960k
          _mm_storel_epi64(                                                    \
580
960k
              (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); \
581
1.37M
        } else {                                                               \
582
1.37M
          const __m128i res_0 = _mm256_castsi256_si128(res_unsigned);          \
583
1.37M
          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);       \
584
1.37M
                                                                               \
585
1.37M
          const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1);     \
586
1.37M
          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),  \
587
1.37M
                          res_1);                                              \
588
1.37M
        }                                                                      \
589
2.33M
      } else {                                                                 \
590
141k
        const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_a_round);  \
591
141k
        const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const);  \
592
141k
                                                                               \
593
141k
        if (do_average) {                                                      \
594
61.9k
          const __m256i data_ref_0 =                                           \
595
61.9k
              load_line2_avx2(&dst[i * dst_stride + j],                        \
596
61.9k
                              &dst[i * dst_stride + j + dst_stride]);          \
597
61.9k
                                                                               \
598
61.9k
          const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned,    \
599
61.9k
                                                &wt, use_dist_wtd_comp_avg);   \
600
61.9k
                                                                               \
601
61.9k
          const __m256i round_result = convolve_rounding(                      \
602
61.9k
              &comp_avg_res, &offset_const, &rounding_const, rounding_shift);  \
603
61.9k
                                                                               \
604
61.9k
          const __m256i res_8 =                                                \
605
61.9k
              _mm256_packus_epi16(round_result, round_result);                 \
606
61.9k
          const __m128i res_0 = _mm256_castsi256_si128(res_8);                 \
607
61.9k
          const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);            \
608
61.9k
                                                                               \
609
61.9k
          *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0);     \
610
61.9k
          *(int *)(&dst0[i * dst_stride0 + j + dst_stride0]) =                 \
611
61.9k
              _mm_cvtsi128_si32(res_1);                                        \
612
61.9k
                                                                               \
613
79.7k
        } else {                                                               \
614
79.7k
          const __m128i res_0 = _mm256_castsi256_si128(res_unsigned);          \
615
79.7k
          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);       \
616
79.7k
                                                                               \
617
79.7k
          const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1);     \
618
79.7k
          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),  \
619
79.7k
                          res_1);                                              \
620
79.7k
        }                                                                      \
621
141k
      }                                                                        \
622
2.47M
                                                                               \
623
2.47M
      s[0] = s[1];                                                             \
624
2.47M
      s[1] = s[2];                                                             \
625
2.47M
      s[2] = s[3];                                                             \
626
2.47M
                                                                               \
627
2.47M
      s[4] = s[5];                                                             \
628
2.47M
      s[5] = s[6];                                                             \
629
2.47M
      s[6] = s[7];                                                             \
630
2.47M
    }                                                                          \
631
250k
  } while (0)
632
633
// Builds the 128-bit coefficient register used for a 2-tap filter.
//
// The kernel is looked up with av1_get_interp_filter_subpel_kernel() using
// subpel_q4 & SUBPEL_MASK, and eight int16_t taps are loaded from it. The taps
// are asserted to be even (checked only where assert() is active), halved,
// and the low bytes of taps 3 and 4 are replicated into every 16-bit lane.
// Only coeffs[0] is written; the function returns nothing.
// NOTE(review): the byte shuffle drops the high byte of each halved tap, so
// this assumes every halved tap fits in 8 bits -- confirm against the filter
// tables.
static inline void prepare_coeffs_2t_ssse3(
634
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
635
37.0k
    __m128i *const coeffs /* [4] */) {
636
37.0k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
637
37.0k
      filter_params, subpel_q4 & SUBPEL_MASK);
638
37.0k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
639
640
  // right shift all filter co-efficients by 1 to reduce the bits required.
641
  // This extra right shift will be taken care of at the end while rounding
642
  // the result.
643
  // Since all filter co-efficients are even, this change will not affect the
644
  // end result
645
37.0k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
646
37.0k
                            _mm_set1_epi16((short)0xffff)));
647
648
37.0k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
649
650
  // coeffs 3 4 3 4 3 4 3 4
651
37.0k
  coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0806u));
652
37.0k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:prepare_coeffs_2t_ssse3
Unexecuted instantiation: highbd_convolve_avx2.c:prepare_coeffs_2t_ssse3
convolve_2d_avx2.c:prepare_coeffs_2t_ssse3
Line
Count
Source
635
19.1k
    __m128i *const coeffs /* [4] */) {
636
19.1k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
637
19.1k
      filter_params, subpel_q4 & SUBPEL_MASK);
638
19.1k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
639
640
  // right shift all filter co-efficients by 1 to reduce the bits required.
641
  // This extra right shift will be taken care of at the end while rounding
642
  // the result.
643
  // Since all filter co-efficients are even, this change will not affect the
644
  // end result
645
19.1k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
646
19.1k
                            _mm_set1_epi16((short)0xffff)));
647
648
19.1k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
649
650
  // coeffs 3 4 3 4 3 4 3 4
651
19.1k
  coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0806u));
652
19.1k
}
convolve_avx2.c:prepare_coeffs_2t_ssse3
Line
Count
Source
635
17.8k
    __m128i *const coeffs /* [4] */) {
636
17.8k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
637
17.8k
      filter_params, subpel_q4 & SUBPEL_MASK);
638
17.8k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
639
640
  // right shift all filter co-efficients by 1 to reduce the bits required.
641
  // This extra right shift will be taken care of at the end while rounding
642
  // the result.
643
  // Since all filter co-efficients are even, this change will not affect the
644
  // end result
645
17.8k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
646
17.8k
                            _mm_set1_epi16((short)0xffff)));
647
648
17.8k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
649
650
  // coeffs 3 4 3 4 3 4 3 4
651
17.8k
  coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0806u));
652
17.8k
}
Unexecuted instantiation: jnt_convolve_avx2.c:prepare_coeffs_2t_ssse3
Unexecuted instantiation: wiener_convolve_avx2.c:prepare_coeffs_2t_ssse3
Unexecuted instantiation: highbd_convolve_2d_avx2.c:prepare_coeffs_2t_ssse3
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:prepare_coeffs_2t_ssse3
653
654
// Builds the two 128-bit coefficient registers used for a 4-tap filter.
//
// The kernel is looked up with av1_get_interp_filter_subpel_kernel() using
// subpel_q4 & SUBPEL_MASK, and eight int16_t taps are loaded from it. The taps
// are asserted to be even (checked only where assert() is active) and halved.
// coeffs[0] receives the low bytes of taps 2 and 3 replicated into every
// 16-bit lane, coeffs[1] those of taps 4 and 5. No other entry of coeffs is
// written; the function returns nothing.
// NOTE(review): the byte shuffle drops the high byte of each halved tap, so
// this assumes every halved tap fits in 8 bits -- confirm against the filter
// tables.
static inline void prepare_coeffs_4t_ssse3(
655
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
656
787k
    __m128i *const coeffs /* [4] */) {
657
787k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
658
787k
      filter_params, subpel_q4 & SUBPEL_MASK);
659
787k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
660
661
  // right shift all filter co-efficients by 1 to reduce the bits required.
662
  // This extra right shift will be taken care of at the end while rounding
663
  // the result.
664
  // Since all filter co-efficients are even, this change will not affect the
665
  // end result
666
787k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
667
787k
                            _mm_set1_epi16((short)0xffff)));
668
669
787k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
670
671
  // coeffs 2 3 2 3 2 3 2 3
672
787k
  coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0604u));
673
  // coeffs 4 5 4 5 4 5 4 5
674
787k
  coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0a08u));
675
787k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:prepare_coeffs_4t_ssse3
Unexecuted instantiation: highbd_convolve_avx2.c:prepare_coeffs_4t_ssse3
convolve_2d_avx2.c:prepare_coeffs_4t_ssse3
Line
Count
Source
656
527k
    __m128i *const coeffs /* [4] */) {
657
527k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
658
527k
      filter_params, subpel_q4 & SUBPEL_MASK);
659
527k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
660
661
  // right shift all filter co-efficients by 1 to reduce the bits required.
662
  // This extra right shift will be taken care of at the end while rounding
663
  // the result.
664
  // Since all filter co-efficients are even, this change will not affect the
665
  // end result
666
527k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
667
527k
                            _mm_set1_epi16((short)0xffff)));
668
669
527k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
670
671
  // coeffs 2 3 2 3 2 3 2 3
672
527k
  coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0604u));
673
  // coeffs 4 5 4 5 4 5 4 5
674
527k
  coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0a08u));
675
527k
}
convolve_avx2.c:prepare_coeffs_4t_ssse3
Line
Count
Source
656
259k
    __m128i *const coeffs /* [4] */) {
657
259k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
658
259k
      filter_params, subpel_q4 & SUBPEL_MASK);
659
259k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
660
661
  // right shift all filter co-efficients by 1 to reduce the bits required.
662
  // This extra right shift will be taken care of at the end while rounding
663
  // the result.
664
  // Since all filter co-efficients are even, this change will not affect the
665
  // end result
666
259k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
667
259k
                            _mm_set1_epi16((short)0xffff)));
668
669
259k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
670
671
  // coeffs 2 3 2 3 2 3 2 3
672
259k
  coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0604u));
673
  // coeffs 4 5 4 5 4 5 4 5
674
259k
  coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0a08u));
675
259k
}
Unexecuted instantiation: jnt_convolve_avx2.c:prepare_coeffs_4t_ssse3
Unexecuted instantiation: wiener_convolve_avx2.c:prepare_coeffs_4t_ssse3
Unexecuted instantiation: highbd_convolve_2d_avx2.c:prepare_coeffs_4t_ssse3
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:prepare_coeffs_4t_ssse3
676
677
// Builds the three 128-bit coefficient registers used for a 6-tap filter.
//
// The kernel is looked up with av1_get_interp_filter_subpel_kernel() using
// subpel_q4 & SUBPEL_MASK, and eight int16_t taps are loaded from it. The taps
// are asserted to be even (checked only where assert() is active) and halved.
// Each shuffle mask names two source bytes (low byte first); byte 2 * k is the
// low byte of tap k. So coeffs[0] holds taps 1 and 2, coeffs[1] taps 3 and 4,
// and coeffs[2] taps 5 and 6, each pair replicated into every 16-bit lane.
// No other entry of coeffs is written; the function returns nothing.
// NOTE(review): the byte shuffle drops the high byte of each halved tap, so
// this assumes every halved tap fits in 8 bits -- confirm against the filter
// tables.
static inline void prepare_coeffs_6t_ssse3(
678
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
679
63.2k
    __m128i *const coeffs /* [4] */) {
680
63.2k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
681
63.2k
      filter_params, subpel_q4 & SUBPEL_MASK);
682
63.2k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
683
684
  // right shift all filter co-efficients by 1 to reduce the bits required.
685
  // This extra right shift will be taken care of at the end while rounding
686
  // the result.
687
  // Since all filter co-efficients are even, this change will not affect the
688
  // end result
689
63.2k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
690
63.2k
                            _mm_set1_epi16((short)0xffff)));
691
692
63.2k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
693
694
  // coeffs 1 2 1 2 1 2 1 2
695
63.2k
  coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0402u));
696
  // coeffs 3 4 3 4 3 4 3 4
697
63.2k
  coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0806u));
698
  // coeffs 5 6 5 6 5 6 5 6
699
63.2k
  coeffs[2] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0c0au));
700
63.2k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:prepare_coeffs_6t_ssse3
Unexecuted instantiation: highbd_convolve_avx2.c:prepare_coeffs_6t_ssse3
Unexecuted instantiation: convolve_2d_avx2.c:prepare_coeffs_6t_ssse3
convolve_avx2.c:prepare_coeffs_6t_ssse3
Line
Count
Source
679
63.2k
    __m128i *const coeffs /* [4] */) {
680
63.2k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
681
63.2k
      filter_params, subpel_q4 & SUBPEL_MASK);
682
63.2k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
683
684
  // right shift all filter co-efficients by 1 to reduce the bits required.
685
  // This extra right shift will be taken care of at the end while rounding
686
  // the result.
687
  // Since all filter co-efficients are even, this change will not affect the
688
  // end result
689
63.2k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
690
63.2k
                            _mm_set1_epi16((short)0xffff)));
691
692
63.2k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
693
694
  // coeffs 1 2 1 2 1 2 1 2
695
63.2k
  coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0402u));
696
  // coeffs 3 4 3 4 3 4 3 4
697
63.2k
  coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0806u));
698
  // coeffs 5 6 5 6 5 6 5 6
699
63.2k
  coeffs[2] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0c0au));
700
63.2k
}
Unexecuted instantiation: jnt_convolve_avx2.c:prepare_coeffs_6t_ssse3
Unexecuted instantiation: wiener_convolve_avx2.c:prepare_coeffs_6t_ssse3
Unexecuted instantiation: highbd_convolve_2d_avx2.c:prepare_coeffs_6t_ssse3
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:prepare_coeffs_6t_ssse3
701
702
// Builds the four 128-bit coefficient registers used for an 8-tap filter.
//
// The kernel is looked up with av1_get_interp_filter_subpel_kernel() using
// subpel_q4 & SUBPEL_MASK, and eight int16_t taps are loaded from it. The taps
// are asserted to be even (checked only where assert() is active) and halved.
// coeffs[k] (k = 0..3) receives the low bytes of taps 2k and 2k + 1,
// replicated into every 16-bit lane. The function returns nothing.
// NOTE(review): the byte shuffle drops the high byte of each halved tap, so
// this assumes every halved tap fits in 8 bits -- confirm against the filter
// tables.
static inline void prepare_coeffs_ssse3(
703
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
704
6.74k
    __m128i *const coeffs /* [4] */) {
705
6.74k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
706
6.74k
      filter_params, subpel_q4 & SUBPEL_MASK);
707
6.74k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
708
709
  // right shift all filter co-efficients by 1 to reduce the bits required.
710
  // This extra right shift will be taken care of at the end while rounding
711
  // the result.
712
  // Since all filter co-efficients are even, this change will not affect the
713
  // end result
714
6.74k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
715
6.74k
                            _mm_set1_epi16((short)0xffff)));
716
717
6.74k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
718
719
  // coeffs 0 1 0 1 0 1 0 1
720
6.74k
  coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0200u));
721
  // coeffs 2 3 2 3 2 3 2 3
722
6.74k
  coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0604u));
723
  // coeffs 4 5 4 5 4 5 4 5
724
6.74k
  coeffs[2] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0a08u));
725
  // coeffs 6 7 6 7 6 7 6 7
726
6.74k
  coeffs[3] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0e0cu));
727
6.74k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:prepare_coeffs_ssse3
Unexecuted instantiation: highbd_convolve_avx2.c:prepare_coeffs_ssse3
Unexecuted instantiation: convolve_2d_avx2.c:prepare_coeffs_ssse3
convolve_avx2.c:prepare_coeffs_ssse3
Line
Count
Source
704
6.74k
    __m128i *const coeffs /* [4] */) {
705
6.74k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
706
6.74k
      filter_params, subpel_q4 & SUBPEL_MASK);
707
6.74k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
708
709
  // right shift all filter co-efficients by 1 to reduce the bits required.
710
  // This extra right shift will be taken care of at the end while rounding
711
  // the result.
712
  // Since all filter co-efficients are even, this change will not affect the
713
  // end result
714
6.74k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
715
6.74k
                            _mm_set1_epi16((short)0xffff)));
716
717
6.74k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
718
719
  // coeffs 0 1 0 1 0 1 0 1
720
6.74k
  coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0200u));
721
  // coeffs 2 3 2 3 2 3 2 3
722
6.74k
  coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0604u));
723
  // coeffs 4 5 4 5 4 5 4 5
724
6.74k
  coeffs[2] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0a08u));
725
  // coeffs 6 7 6 7 6 7 6 7
726
6.74k
  coeffs[3] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0e0cu));
727
6.74k
}
Unexecuted instantiation: jnt_convolve_avx2.c:prepare_coeffs_ssse3
Unexecuted instantiation: wiener_convolve_avx2.c:prepare_coeffs_ssse3
Unexecuted instantiation: highbd_convolve_2d_avx2.c:prepare_coeffs_ssse3
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:prepare_coeffs_ssse3
728
729
// Builds the byte-packed coefficient vector used by the 2-tap "lowbd" path.
//
// filter_params, subpel_q4: forwarded to av1_get_interp_filter_subpel_kernel()
//   (subpel_q4 is masked with SUBPEL_MASK) to obtain the 16-bit kernel.
// coeffs: output array; only coeffs[0] is written by this function.
//
// The eight 16-bit kernel values are loaded, broadcast to both 128-bit lanes,
// halved, and byte-shuffled so that every 16-bit element of coeffs[0] holds
// the low bytes of taps 3 and 4.
static inline void prepare_coeffs_2t_lowbd(
730
    const InterpFilterParams *const filter_params, const int subpel_q4,
731
28.2k
    __m256i *const coeffs /* [4] */) {
732
28.2k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
733
28.2k
      filter_params, subpel_q4 & SUBPEL_MASK);
734
28.2k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
735
28.2k
  const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);
736
737
  // Right shift all filter coefficients by 1 to reduce the bits required.
738
  // This extra right shift will be taken care of at the end while rounding
739
  // the result.
740
  // Since all filter coefficients are even, this change will not affect the
741
  // end result. The assert checks that bit 0 of every 16-bit tap is clear.
742
28.2k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
743
28.2k
                            _mm_set1_epi16((int16_t)0xffff)));
744
745
28.2k
  const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);
746
747
  // coeffs 3 4 3 4 3 4 3 4
  // (mask 0x0806 selects source bytes 6 and 8 of each 128-bit lane, i.e. the
  // low bytes of 16-bit taps 3 and 4)
748
28.2k
  coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0806u));
749
28.2k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:prepare_coeffs_2t_lowbd
Unexecuted instantiation: highbd_convolve_avx2.c:prepare_coeffs_2t_lowbd
convolve_2d_avx2.c:prepare_coeffs_2t_lowbd
Line
Count
Source
731
17.5k
    __m256i *const coeffs /* [4] */) {
732
17.5k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
733
17.5k
      filter_params, subpel_q4 & SUBPEL_MASK);
734
17.5k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
735
17.5k
  const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);
736
737
  // right shift all filter co-efficients by 1 to reduce the bits required.
738
  // This extra right shift will be taken care of at the end while rounding
739
  // the result.
740
  // Since all filter co-efficients are even, this change will not affect the
741
  // end result
742
17.5k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
743
17.5k
                            _mm_set1_epi16((int16_t)0xffff)));
744
745
17.5k
  const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);
746
747
  // coeffs 3 4 3 4 3 4 3 4
748
17.5k
  coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0806u));
749
17.5k
}
convolve_avx2.c:prepare_coeffs_2t_lowbd
Line
Count
Source
731
10.6k
    __m256i *const coeffs /* [4] */) {
732
10.6k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
733
10.6k
      filter_params, subpel_q4 & SUBPEL_MASK);
734
10.6k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
735
10.6k
  const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);
736
737
  // right shift all filter co-efficients by 1 to reduce the bits required.
738
  // This extra right shift will be taken care of at the end while rounding
739
  // the result.
740
  // Since all filter co-efficients are even, this change will not affect the
741
  // end result
742
10.6k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
743
10.6k
                            _mm_set1_epi16((int16_t)0xffff)));
744
745
10.6k
  const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);
746
747
  // coeffs 3 4 3 4 3 4 3 4
748
10.6k
  coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0806u));
749
10.6k
}
Unexecuted instantiation: jnt_convolve_avx2.c:prepare_coeffs_2t_lowbd
Unexecuted instantiation: wiener_convolve_avx2.c:prepare_coeffs_2t_lowbd
Unexecuted instantiation: highbd_convolve_2d_avx2.c:prepare_coeffs_2t_lowbd
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:prepare_coeffs_2t_lowbd
750
751
// Builds the byte-packed coefficient vectors used by the 4-tap "lowbd" path.
//
// filter_params, subpel_q4: forwarded to av1_get_interp_filter_subpel_kernel()
//   (subpel_q4 is masked with SUBPEL_MASK) to obtain the 16-bit kernel.
// coeffs: output array; coeffs[0] and coeffs[1] are written.
//
// The eight 16-bit kernel values are loaded, broadcast to both 128-bit lanes,
// halved, and byte-shuffled so that each output vector repeats the low bytes
// of one adjacent tap pair: (2,3) in coeffs[0] and (4,5) in coeffs[1].
static inline void prepare_coeffs_4t_lowbd(
752
    const InterpFilterParams *const filter_params, const int subpel_q4,
753
168k
    __m256i *const coeffs /* [4] */) {
754
168k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
755
168k
      filter_params, subpel_q4 & SUBPEL_MASK);
756
168k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
757
168k
  const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);
758
759
  // Right shift all filter coefficients by 1 to reduce the bits required.
760
  // This extra right shift will be taken care of at the end while rounding
761
  // the result.
762
  // Since all filter coefficients are even, this change will not affect the
763
  // end result. The assert checks that bit 0 of every 16-bit tap is clear.
764
168k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
765
168k
                            _mm_set1_epi16((short)0xffff)));
766
767
168k
  const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);
768
769
  // coeffs 2 3 2 3 2 3 2 3 (mask 0x0604: source bytes 4 and 6 of each lane)
770
168k
  coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0604u));
771
  // coeffs 4 5 4 5 4 5 4 5 (mask 0x0a08: source bytes 8 and 10 of each lane)
772
168k
  coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0a08u));
773
168k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:prepare_coeffs_4t_lowbd
Unexecuted instantiation: highbd_convolve_avx2.c:prepare_coeffs_4t_lowbd
convolve_2d_avx2.c:prepare_coeffs_4t_lowbd
Line
Count
Source
753
34.9k
    __m256i *const coeffs /* [4] */) {
754
34.9k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
755
34.9k
      filter_params, subpel_q4 & SUBPEL_MASK);
756
34.9k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
757
34.9k
  const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);
758
759
  // right shift all filter co-efficients by 1 to reduce the bits required.
760
  // This extra right shift will be taken care of at the end while rounding
761
  // the result.
762
  // Since all filter co-efficients are even, this change will not affect the
763
  // end result
764
34.9k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
765
34.9k
                            _mm_set1_epi16((short)0xffff)));
766
767
34.9k
  const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);
768
769
  // coeffs 2 3 2 3 2 3 2 3
770
34.9k
  coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0604u));
771
  // coeffs 4 5 4 5 4 5 4 5
772
34.9k
  coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0a08u));
773
34.9k
}
convolve_avx2.c:prepare_coeffs_4t_lowbd
Line
Count
Source
753
133k
    __m256i *const coeffs /* [4] */) {
754
133k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
755
133k
      filter_params, subpel_q4 & SUBPEL_MASK);
756
133k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
757
133k
  const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);
758
759
  // right shift all filter co-efficients by 1 to reduce the bits required.
760
  // This extra right shift will be taken care of at the end while rounding
761
  // the result.
762
  // Since all filter co-efficients are even, this change will not affect the
763
  // end result
764
133k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
765
133k
                            _mm_set1_epi16((short)0xffff)));
766
767
133k
  const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);
768
769
  // coeffs 2 3 2 3 2 3 2 3
770
133k
  coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0604u));
771
  // coeffs 4 5 4 5 4 5 4 5
772
133k
  coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0a08u));
773
133k
}
Unexecuted instantiation: jnt_convolve_avx2.c:prepare_coeffs_4t_lowbd
Unexecuted instantiation: wiener_convolve_avx2.c:prepare_coeffs_4t_lowbd
Unexecuted instantiation: highbd_convolve_2d_avx2.c:prepare_coeffs_4t_lowbd
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:prepare_coeffs_4t_lowbd
774
775
// Builds the byte-packed coefficient vectors used by the 6-tap "lowbd" path.
//
// filter_params, subpel_q4: forwarded to av1_get_interp_filter_subpel_kernel()
//   (subpel_q4 is masked with SUBPEL_MASK) to obtain the 16-bit kernel.
// coeffs: output array; coeffs[0], coeffs[1] and coeffs[2] are written.
//
// The eight 16-bit kernel values are loaded, broadcast to both 128-bit lanes,
// halved, and byte-shuffled so that each output vector repeats the low bytes
// of one adjacent tap pair: (1,2), (3,4) and (5,6) respectively.
static inline void prepare_coeffs_6t_lowbd(
776
    const InterpFilterParams *const filter_params, const int subpel_q4,
777
975k
    __m256i *const coeffs /* [4] */) {
778
975k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
779
975k
      filter_params, subpel_q4 & SUBPEL_MASK);
780
975k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
781
975k
  const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);
782
783
  // Right shift all filter coefficients by 1 to reduce the bits required.
784
  // This extra right shift will be taken care of at the end while rounding
785
  // the result.
786
  // Since all filter coefficients are even, this change will not affect the
787
  // end result. The assert checks that bit 0 of every 16-bit tap is clear.
788
975k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
789
975k
                            _mm_set1_epi16((int16_t)0xffff)));
790
791
975k
  const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);
792
793
  // coeffs 1 2 1 2 1 2 1 2 (mask 0x0402: source bytes 2 and 4 of each lane)
794
975k
  coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0402u));
795
  // coeffs 3 4 3 4 3 4 3 4 (mask 0x0806: source bytes 6 and 8 of each lane)
796
975k
  coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0806u));
797
  // coeffs 5 6 5 6 5 6 5 6 (mask 0x0c0a: source bytes 10 and 12 of each lane)
798
975k
  coeffs[2] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0c0au));
799
975k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:prepare_coeffs_6t_lowbd
Unexecuted instantiation: highbd_convolve_avx2.c:prepare_coeffs_6t_lowbd
convolve_2d_avx2.c:prepare_coeffs_6t_lowbd
Line
Count
Source
777
632k
    __m256i *const coeffs /* [4] */) {
778
632k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
779
632k
      filter_params, subpel_q4 & SUBPEL_MASK);
780
632k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
781
632k
  const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);
782
783
  // right shift all filter co-efficients by 1 to reduce the bits required.
784
  // This extra right shift will be taken care of at the end while rounding
785
  // the result.
786
  // Since all filter co-efficients are even, this change will not affect the
787
  // end result
788
632k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
789
632k
                            _mm_set1_epi16((int16_t)0xffff)));
790
791
632k
  const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);
792
793
  // coeffs 1 2 1 2 1 2 1 2
794
632k
  coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0402u));
795
  // coeffs 3 4 3 4 3 4 3 4
796
632k
  coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0806u));
797
  // coeffs 5 6 5 6 5 6 5 6
798
632k
  coeffs[2] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0c0au));
799
632k
}
convolve_avx2.c:prepare_coeffs_6t_lowbd
Line
Count
Source
777
343k
    __m256i *const coeffs /* [4] */) {
778
343k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
779
343k
      filter_params, subpel_q4 & SUBPEL_MASK);
780
343k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
781
343k
  const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);
782
783
  // right shift all filter co-efficients by 1 to reduce the bits required.
784
  // This extra right shift will be taken care of at the end while rounding
785
  // the result.
786
  // Since all filter co-efficients are even, this change will not affect the
787
  // end result
788
343k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
789
343k
                            _mm_set1_epi16((int16_t)0xffff)));
790
791
343k
  const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);
792
793
  // coeffs 1 2 1 2 1 2 1 2
794
343k
  coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0402u));
795
  // coeffs 3 4 3 4 3 4 3 4
796
343k
  coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0806u));
797
  // coeffs 5 6 5 6 5 6 5 6
798
343k
  coeffs[2] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0c0au));
799
343k
}
Unexecuted instantiation: jnt_convolve_avx2.c:prepare_coeffs_6t_lowbd
Unexecuted instantiation: wiener_convolve_avx2.c:prepare_coeffs_6t_lowbd
Unexecuted instantiation: highbd_convolve_2d_avx2.c:prepare_coeffs_6t_lowbd
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:prepare_coeffs_6t_lowbd
800
801
// Builds the byte-packed coefficient vectors used by the 8-tap "lowbd" path.
//
// filter_params, subpel_q4: forwarded to av1_get_interp_filter_subpel_kernel()
//   (subpel_q4 is masked with SUBPEL_MASK) to obtain the 16-bit kernel.
// coeffs: output array; coeffs[0] through coeffs[3] are written.
//
// The eight 16-bit kernel values are loaded, broadcast to both 128-bit lanes,
// halved, and byte-shuffled so that each output vector repeats the low bytes
// of one adjacent tap pair: (0,1), (2,3), (4,5) and (6,7) respectively.
static inline void prepare_coeffs_lowbd(
802
    const InterpFilterParams *const filter_params, const int subpel_q4,
803
400k
    __m256i *const coeffs /* [4] */) {
804
400k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
805
400k
      filter_params, subpel_q4 & SUBPEL_MASK);
806
400k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
807
400k
  const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);
808
809
  // Right shift all filter coefficients by 1 to reduce the bits required.
810
  // This extra right shift will be taken care of at the end while rounding
811
  // the result.
812
  // Since all filter coefficients are even, this change will not affect the
813
  // end result. The assert checks that bit 0 of every 16-bit tap is clear.
814
400k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
815
400k
                            _mm_set1_epi16((short)0xffff)));
816
817
400k
  const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);
818
819
  // coeffs 0 1 0 1 0 1 0 1 (mask 0x0200: source bytes 0 and 2 of each lane)
820
400k
  coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0200u));
821
  // coeffs 2 3 2 3 2 3 2 3 (mask 0x0604: source bytes 4 and 6 of each lane)
822
400k
  coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0604u));
823
  // coeffs 4 5 4 5 4 5 4 5 (mask 0x0a08: source bytes 8 and 10 of each lane)
824
400k
  coeffs[2] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0a08u));
825
  // coeffs 6 7 6 7 6 7 6 7 (mask 0x0e0c: source bytes 12 and 14 of each lane)
826
400k
  coeffs[3] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0e0cu));
827
400k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:prepare_coeffs_lowbd
Unexecuted instantiation: highbd_convolve_avx2.c:prepare_coeffs_lowbd
convolve_2d_avx2.c:prepare_coeffs_lowbd
Line
Count
Source
803
57.6k
    __m256i *const coeffs /* [4] */) {
804
57.6k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
805
57.6k
      filter_params, subpel_q4 & SUBPEL_MASK);
806
57.6k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
807
57.6k
  const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);
808
809
  // right shift all filter co-efficients by 1 to reduce the bits required.
810
  // This extra right shift will be taken care of at the end while rounding
811
  // the result.
812
  // Since all filter co-efficients are even, this change will not affect the
813
  // end result
814
57.6k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
815
57.6k
                            _mm_set1_epi16((short)0xffff)));
816
817
57.6k
  const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);
818
819
  // coeffs 0 1 0 1 0 1 0 1
820
57.6k
  coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0200u));
821
  // coeffs 2 3 2 3 2 3 2 3
822
57.6k
  coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0604u));
823
  // coeffs 4 5 4 5 4 5 4 5
824
57.6k
  coeffs[2] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0a08u));
825
  // coeffs 6 7 6 7 6 7 6 7
826
57.6k
  coeffs[3] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0e0cu));
827
57.6k
}
convolve_avx2.c:prepare_coeffs_lowbd
Line
Count
Source
803
36.1k
    __m256i *const coeffs /* [4] */) {
804
36.1k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
805
36.1k
      filter_params, subpel_q4 & SUBPEL_MASK);
806
36.1k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
807
36.1k
  const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);
808
809
  // right shift all filter co-efficients by 1 to reduce the bits required.
810
  // This extra right shift will be taken care of at the end while rounding
811
  // the result.
812
  // Since all filter co-efficients are even, this change will not affect the
813
  // end result
814
36.1k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
815
36.1k
                            _mm_set1_epi16((short)0xffff)));
816
817
36.1k
  const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);
818
819
  // coeffs 0 1 0 1 0 1 0 1
820
36.1k
  coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0200u));
821
  // coeffs 2 3 2 3 2 3 2 3
822
36.1k
  coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0604u));
823
  // coeffs 4 5 4 5 4 5 4 5
824
36.1k
  coeffs[2] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0a08u));
825
  // coeffs 6 7 6 7 6 7 6 7
826
36.1k
  coeffs[3] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0e0cu));
827
36.1k
}
jnt_convolve_avx2.c:prepare_coeffs_lowbd
Line
Count
Source
803
306k
    __m256i *const coeffs /* [4] */) {
804
306k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
805
306k
      filter_params, subpel_q4 & SUBPEL_MASK);
806
306k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
807
306k
  const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);
808
809
  // right shift all filter co-efficients by 1 to reduce the bits required.
810
  // This extra right shift will be taken care of at the end while rounding
811
  // the result.
812
  // Since all filter co-efficients are even, this change will not affect the
813
  // end result
814
306k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
815
306k
                            _mm_set1_epi16((short)0xffff)));
816
817
306k
  const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);
818
819
  // coeffs 0 1 0 1 0 1 0 1
820
306k
  coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0200u));
821
  // coeffs 2 3 2 3 2 3 2 3
822
306k
  coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0604u));
823
  // coeffs 4 5 4 5 4 5 4 5
824
306k
  coeffs[2] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0a08u));
825
  // coeffs 6 7 6 7 6 7 6 7
826
306k
  coeffs[3] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0e0cu));
827
306k
}
Unexecuted instantiation: wiener_convolve_avx2.c:prepare_coeffs_lowbd
Unexecuted instantiation: highbd_convolve_2d_avx2.c:prepare_coeffs_lowbd
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:prepare_coeffs_lowbd
828
829
// Builds the 16-bit coefficient vector for a 2-tap filter.
//
// filter_params, subpel_q4: forwarded to av1_get_interp_filter_subpel_kernel()
//   (subpel_q4 is masked with SUBPEL_MASK) to obtain the 16-bit kernel.
// coeffs: output array; only coeffs[0] is written by this function.
//
// Unlike the *_lowbd variants, the coefficients are kept as full 16-bit
// values and are not halved.
static inline void prepare_coeffs_2t(
830
    const InterpFilterParams *const filter_params, const int subpel_q4,
831
36.6k
    __m256i *const coeffs /* [4] */) {
832
36.6k
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
833
36.6k
      filter_params, subpel_q4 & SUBPEL_MASK);
834
835
  // The load starts one tap in, so 32-bit element 1 of the vector is the
  // (filter[3], filter[4]) pair.
  // NOTE(review): this 128-bit load spans filter[1]..filter[8]; confirm that
  // filter[8] is always readable for the kernels passed here.
36.6k
  const __m128i coeff_8 = _mm_loadu_si128((__m128i *)(filter + 1));
836
36.6k
  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
837
838
  // coeffs 3 4 3 4 3 4 3 4 (0x55 replicates 32-bit element 1 of each lane)
839
36.6k
  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x55);
840
36.6k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:prepare_coeffs_2t
Unexecuted instantiation: highbd_convolve_avx2.c:prepare_coeffs_2t
convolve_2d_avx2.c:prepare_coeffs_2t
Line
Count
Source
831
36.6k
    __m256i *const coeffs /* [4] */) {
832
36.6k
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
833
36.6k
      filter_params, subpel_q4 & SUBPEL_MASK);
834
835
36.6k
  const __m128i coeff_8 = _mm_loadu_si128((__m128i *)(filter + 1));
836
36.6k
  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
837
838
  // coeffs 3 4 3 4 3 4 3 4
839
  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x55);
840
36.6k
}
Unexecuted instantiation: convolve_avx2.c:prepare_coeffs_2t
Unexecuted instantiation: jnt_convolve_avx2.c:prepare_coeffs_2t
Unexecuted instantiation: wiener_convolve_avx2.c:prepare_coeffs_2t
Unexecuted instantiation: highbd_convolve_2d_avx2.c:prepare_coeffs_2t
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:prepare_coeffs_2t
841
842
// Builds the 16-bit coefficient vectors for a 4-tap filter.
//
// filter_params, subpel_q4: forwarded to av1_get_interp_filter_subpel_kernel()
//   (subpel_q4 is masked with SUBPEL_MASK) to obtain the 16-bit kernel.
// coeffs: output array; coeffs[0] and coeffs[1] are written.
//
// The eight 16-bit kernel values are loaded unmodified and broadcast to both
// 128-bit lanes; each output vector then repeats one 32-bit pair of taps.
static inline void prepare_coeffs_4t(
843
    const InterpFilterParams *const filter_params, const int subpel_q4,
844
693k
    __m256i *const coeffs /* [4] */) {
845
693k
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
846
693k
      filter_params, subpel_q4 & SUBPEL_MASK);
847
848
693k
  const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter);
849
693k
  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
850
  // coeffs 2 3 2 3 2 3 2 3 (0x55 replicates 32-bit element 1 of each lane)
851
693k
  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x55);
852
  // coeffs 4 5 4 5 4 5 4 5 (0xaa replicates 32-bit element 2 of each lane)
853
693k
  coeffs[1] = _mm256_shuffle_epi32(coeff, 0xaa);
854
693k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:prepare_coeffs_4t
Unexecuted instantiation: highbd_convolve_avx2.c:prepare_coeffs_4t
convolve_2d_avx2.c:prepare_coeffs_4t
Line
Count
Source
844
693k
    __m256i *const coeffs /* [4] */) {
845
693k
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
846
693k
      filter_params, subpel_q4 & SUBPEL_MASK);
847
848
693k
  const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter);
849
693k
  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
850
  // coeffs 2 3 2 3 2 3 2 3
851
693k
  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x55);
852
  // coeffs 4 5 4 5 4 5 4 5
853
  coeffs[1] = _mm256_shuffle_epi32(coeff, 0xaa);
854
693k
}
Unexecuted instantiation: convolve_avx2.c:prepare_coeffs_4t
Unexecuted instantiation: jnt_convolve_avx2.c:prepare_coeffs_4t
Unexecuted instantiation: wiener_convolve_avx2.c:prepare_coeffs_4t
Unexecuted instantiation: highbd_convolve_2d_avx2.c:prepare_coeffs_4t
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:prepare_coeffs_4t
855
856
// Builds the 16-bit coefficient vectors for a 6-tap filter.
//
// filter_params, subpel_q4: forwarded to av1_get_interp_filter_subpel_kernel()
//   (subpel_q4 is masked with SUBPEL_MASK) to obtain the 16-bit kernel.
// coeffs: output array; coeffs[0], coeffs[1] and coeffs[2] are written.
//
// The coefficients are kept as full 16-bit values (no halving); each output
// vector repeats one 32-bit pair of taps.
static inline void prepare_coeffs_6t(
857
    const InterpFilterParams *const filter_params, const int subpel_q4,
858
508k
    __m256i *const coeffs /* [4] */) {
859
508k
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
860
508k
      filter_params, subpel_q4 & SUBPEL_MASK);
861
862
  // The load starts one tap in, so 32-bit elements 0..2 of the vector are the
  // pairs (1,2), (3,4) and (5,6).
  // NOTE(review): this 128-bit load spans filter[1]..filter[8]; confirm that
  // filter[8] is always readable for the kernels passed here.
508k
  const __m128i coeff_8 = _mm_loadu_si128((__m128i *)(filter + 1));
863
508k
  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
864
865
  // coeffs 1 2 1 2 1 2 1 2 (0x00 replicates 32-bit element 0 of each lane)
866
508k
  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
867
  // coeffs 3 4 3 4 3 4 3 4 (0x55 replicates 32-bit element 1 of each lane)
868
508k
  coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55);
869
  // coeffs 5 6 5 6 5 6 5 6 (0xaa replicates 32-bit element 2 of each lane)
870
508k
  coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa);
871
508k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:prepare_coeffs_6t
Unexecuted instantiation: highbd_convolve_avx2.c:prepare_coeffs_6t
convolve_2d_avx2.c:prepare_coeffs_6t
Line
Count
Source
858
508k
    __m256i *const coeffs /* [4] */) {
859
508k
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
860
508k
      filter_params, subpel_q4 & SUBPEL_MASK);
861
862
508k
  const __m128i coeff_8 = _mm_loadu_si128((__m128i *)(filter + 1));
863
508k
  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
864
865
  // coeffs 1 2 1 2 1 2 1 2
866
508k
  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
867
  // coeffs 3 4 3 4 3 4 3 4
868
508k
  coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55);
869
  // coeffs 5 6 5 6 5 6 5 6
870
  coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa);
871
508k
}
Unexecuted instantiation: convolve_avx2.c:prepare_coeffs_6t
Unexecuted instantiation: jnt_convolve_avx2.c:prepare_coeffs_6t
Unexecuted instantiation: wiener_convolve_avx2.c:prepare_coeffs_6t
Unexecuted instantiation: highbd_convolve_2d_avx2.c:prepare_coeffs_6t
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:prepare_coeffs_6t
872
873
// Builds the 16-bit coefficient vectors for an 8-tap filter.
//
// filter_params, subpel_q4: forwarded to av1_get_interp_filter_subpel_kernel()
//   (subpel_q4 is masked with SUBPEL_MASK) to obtain the 16-bit kernel.
// coeffs: output array; coeffs[0] through coeffs[3] are written.
//
// The eight 16-bit kernel values are loaded unmodified and broadcast to both
// 128-bit lanes; each output vector then repeats one 32-bit pair of taps.
static inline void prepare_coeffs(const InterpFilterParams *const filter_params,
874
                                  const int subpel_q4,
875
8.46M
                                  __m256i *const coeffs /* [4] */) {
876
8.46M
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
877
8.46M
      filter_params, subpel_q4 & SUBPEL_MASK);
878
879
8.46M
  const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter);
880
8.46M
  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
881
882
  // coeffs 0 1 0 1 0 1 0 1 (0x00 replicates 32-bit element 0 of each lane)
883
8.46M
  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
884
  // coeffs 2 3 2 3 2 3 2 3 (0x55 replicates 32-bit element 1 of each lane)
885
8.46M
  coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55);
886
  // coeffs 4 5 4 5 4 5 4 5 (0xaa replicates 32-bit element 2 of each lane)
887
8.46M
  coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa);
888
  // coeffs 6 7 6 7 6 7 6 7 (0xff replicates 32-bit element 3 of each lane)
889
8.46M
  coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff);
890
8.46M
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:prepare_coeffs
highbd_convolve_avx2.c:prepare_coeffs
Line
Count
Source
875
1.61M
                                  __m256i *const coeffs /* [4] */) {
876
1.61M
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
877
1.61M
      filter_params, subpel_q4 & SUBPEL_MASK);
878
879
1.61M
  const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter);
880
1.61M
  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
881
882
  // coeffs 0 1 0 1 0 1 0 1
883
1.61M
  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
884
  // coeffs 2 3 2 3 2 3 2 3
885
1.61M
  coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55);
886
  // coeffs 4 5 4 5 4 5 4 5
887
1.61M
  coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa);
888
  // coeffs 6 7 6 7 6 7 6 7
889
  coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff);
890
1.61M
}
convolve_2d_avx2.c:prepare_coeffs
Line
Count
Source
875
50.6k
                                  __m256i *const coeffs /* [4] */) {
876
50.6k
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
877
50.6k
      filter_params, subpel_q4 & SUBPEL_MASK);
878
879
50.6k
  const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter);
880
50.6k
  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
881
882
  // coeffs 0 1 0 1 0 1 0 1
883
50.6k
  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
884
  // coeffs 2 3 2 3 2 3 2 3
885
50.6k
  coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55);
886
  // coeffs 4 5 4 5 4 5 4 5
887
50.6k
  coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa);
888
  // coeffs 6 7 6 7 6 7 6 7
889
  coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff);
890
50.6k
}
Unexecuted instantiation: convolve_avx2.c:prepare_coeffs
jnt_convolve_avx2.c:prepare_coeffs
Line
Count
Source
875
162k
                                  __m256i *const coeffs /* [4] */) {
876
162k
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
877
162k
      filter_params, subpel_q4 & SUBPEL_MASK);
878
879
162k
  const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter);
880
162k
  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
881
882
  // coeffs 0 1 0 1 0 1 0 1
883
162k
  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
884
  // coeffs 2 3 2 3 2 3 2 3
885
162k
  coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55);
886
  // coeffs 4 5 4 5 4 5 4 5
887
162k
  coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa);
888
  // coeffs 6 7 6 7 6 7 6 7
889
  coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff);
890
162k
}
Unexecuted instantiation: wiener_convolve_avx2.c:prepare_coeffs
highbd_convolve_2d_avx2.c:prepare_coeffs
Line
Count
Source
875
5.56M
                                  __m256i *const coeffs /* [4] */) {
876
5.56M
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
877
5.56M
      filter_params, subpel_q4 & SUBPEL_MASK);
878
879
5.56M
  const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter);
880
5.56M
  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
881
882
  // coeffs 0 1 0 1 0 1 0 1
883
5.56M
  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
884
  // coeffs 2 3 2 3 2 3 2 3
885
5.56M
  coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55);
886
  // coeffs 4 5 4 5 4 5 4 5
887
5.56M
  coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa);
888
  // coeffs 6 7 6 7 6 7 6 7
889
  coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff);
890
5.56M
}
highbd_jnt_convolve_avx2.c:prepare_coeffs
Line
Count
Source
875
1.06M
                                  __m256i *const coeffs /* [4] */) {
876
1.06M
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
877
1.06M
      filter_params, subpel_q4 & SUBPEL_MASK);
878
879
1.06M
  const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter);
880
1.06M
  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
881
882
  // coeffs 0 1 0 1 0 1 0 1
883
1.06M
  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
884
  // coeffs 2 3 2 3 2 3 2 3
885
1.06M
  coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55);
886
  // coeffs 4 5 4 5 4 5 4 5
887
1.06M
  coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa);
888
  // coeffs 6 7 6 7 6 7 6 7
889
  coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff);
890
1.06M
}
891
892
// Builds the 16-bit coefficient vectors for a 12-tap filter.
//
// filter_params, subpel_q4: forwarded to av1_get_interp_filter_subpel_kernel()
//   (subpel_q4 is masked with SUBPEL_MASK) to obtain the 16-bit kernel.
// coeffs: output array; coeffs[0] through coeffs[5] are written, so the
//   caller must provide at least six __m256i elements.
//
// Taps 0..7 come from one 128-bit load; taps 8..11 come from a separate
// 64-bit load at filter + 8. Each output vector repeats one 32-bit tap pair.
static inline void prepare_coeffs_12taps(
893
    const InterpFilterParams *const filter_params, const int subpel_q4,
894
0
    __m256i *const coeffs /* [6] */) {
895
0
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
896
0
      filter_params, subpel_q4 & SUBPEL_MASK);
897
898
0
  __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter);
899
0
  __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
900
901
  // coeffs 0 1 0 1 0 1 0 1
902
0
  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
903
  // coeffs 2 3 2 3 2 3 2 3
904
0
  coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55);
905
  // coeffs 4 5 4 5 4 5 4 5
906
0
  coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa);
907
  // coeffs 6 7 6 7 6 7 6 7
908
0
  coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff);
909
  // coeffs 8 9 10 11 0 0 0 0 (64-bit load; the upper 64 bits are zeroed)
910
0
  coeff_8 = _mm_loadl_epi64((__m128i *)(filter + 8));
911
  // Replicate the low 64 bits (taps 8..11) into every 64-bit element.
0
  coeff = _mm256_broadcastq_epi64(coeff_8);
912
0
  coeffs[4] = _mm256_shuffle_epi32(coeff, 0x00);  // coeffs 8 9 8 9 8 9 8 9
913
0
  coeffs[5] = _mm256_shuffle_epi32(coeff, 0x55);  // coeffs 10 11 10 11.. 10 11
914
0
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:prepare_coeffs_12taps
Unexecuted instantiation: highbd_convolve_avx2.c:prepare_coeffs_12taps
Unexecuted instantiation: convolve_2d_avx2.c:prepare_coeffs_12taps
Unexecuted instantiation: convolve_avx2.c:prepare_coeffs_12taps
Unexecuted instantiation: jnt_convolve_avx2.c:prepare_coeffs_12taps
Unexecuted instantiation: wiener_convolve_avx2.c:prepare_coeffs_12taps
Unexecuted instantiation: highbd_convolve_2d_avx2.c:prepare_coeffs_12taps
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:prepare_coeffs_12taps
915
916
// 4-tap multiply-accumulate on 128-bit vectors.
//
// ss:     two vectors of source bytes; _mm_maddubs_epi16 treats them as
//         unsigned 8-bit values.
// coeffs: two vectors of coefficient bytes, treated as signed 8-bit values.
//
// Each _mm_maddubs_epi16 multiplies byte pairs and adds adjacent products
// into 16-bit elements with signed saturation. The two partial results are
// then combined with a plain (wrapping) 16-bit add and returned.
static inline __m128i convolve_lowbd_4tap_ssse3(const __m128i ss[2],
917
3.49M
                                                const __m128i coeffs[2]) {
918
3.49M
  const __m128i res_01 = _mm_maddubs_epi16(ss[0], coeffs[0]);
919
3.49M
  const __m128i res_23 = _mm_maddubs_epi16(ss[1], coeffs[1]);
920
921
3.49M
  return _mm_add_epi16(res_01, res_23);
922
3.49M
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_lowbd_4tap_ssse3
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_lowbd_4tap_ssse3
convolve_2d_avx2.c:convolve_lowbd_4tap_ssse3
Line
Count
Source
917
2.81M
                                                const __m128i coeffs[2]) {
918
2.81M
  const __m128i res_01 = _mm_maddubs_epi16(ss[0], coeffs[0]);
919
2.81M
  const __m128i res_23 = _mm_maddubs_epi16(ss[1], coeffs[1]);
920
921
2.81M
  return _mm_add_epi16(res_01, res_23);
922
2.81M
}
convolve_avx2.c:convolve_lowbd_4tap_ssse3
Line
Count
Source
917
676k
                                                const __m128i coeffs[2]) {
918
676k
  const __m128i res_01 = _mm_maddubs_epi16(ss[0], coeffs[0]);
919
676k
  const __m128i res_23 = _mm_maddubs_epi16(ss[1], coeffs[1]);
920
921
676k
  return _mm_add_epi16(res_01, res_23);
922
676k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_lowbd_4tap_ssse3
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_lowbd_4tap_ssse3
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_lowbd_4tap_ssse3
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_lowbd_4tap_ssse3
923
924
// 6-tap multiply-accumulate on 128-bit vectors.
//
// ss:     three vectors of source bytes; _mm_maddubs_epi16 treats them as
//         unsigned 8-bit values.
// coeffs: three vectors of coefficient bytes, treated as signed 8-bit values.
//
// Each _mm_maddubs_epi16 multiplies byte pairs and adds adjacent products
// into 16-bit elements with signed saturation. The three partial results are
// then summed with plain (wrapping) 16-bit adds and returned.
static inline __m128i convolve_lowbd_6tap_ssse3(const __m128i ss[3],
925
327k
                                                const __m128i coeffs[3]) {
926
327k
  const __m128i res_01 = _mm_maddubs_epi16(ss[0], coeffs[0]);
927
327k
  const __m128i res_23 = _mm_maddubs_epi16(ss[1], coeffs[1]);
928
327k
  const __m128i res_45 = _mm_maddubs_epi16(ss[2], coeffs[2]);
929
930
  // Summation order: (res_01 + res_45) first, then + res_23.
327k
  const __m128i res = _mm_add_epi16(_mm_add_epi16(res_01, res_45), res_23);
931
932
327k
  return res;
933
327k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_lowbd_6tap_ssse3
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_lowbd_6tap_ssse3
Unexecuted instantiation: convolve_2d_avx2.c:convolve_lowbd_6tap_ssse3
convolve_avx2.c:convolve_lowbd_6tap_ssse3
Line
Count
Source
925
327k
                                                const __m128i coeffs[3]) {
926
327k
  const __m128i res_01 = _mm_maddubs_epi16(ss[0], coeffs[0]);
927
327k
  const __m128i res_23 = _mm_maddubs_epi16(ss[1], coeffs[1]);
928
327k
  const __m128i res_45 = _mm_maddubs_epi16(ss[2], coeffs[2]);
929
930
327k
  const __m128i res = _mm_add_epi16(_mm_add_epi16(res_01, res_45), res_23);
931
932
327k
  return res;
933
327k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_lowbd_6tap_ssse3
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_lowbd_6tap_ssse3
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_lowbd_6tap_ssse3
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_lowbd_6tap_ssse3
934
935
// Returns the lane-wise 16-bit sum of four _mm_maddubs_epi16 products,
// one per (ss[i], coeffs[i]) pair for i = 0..3.
//
// Each product saturates to int16 inside _mm_maddubs_epi16; the additions
// that merge the four products use _mm_add_epi16 and wrap on overflow.
static inline __m128i convolve_lowbd_ssse3(const __m128i ss[4],
                                           const __m128i coeffs[4]) {
  const __m128i prod0 = _mm_maddubs_epi16(ss[0], coeffs[0]);
  const __m128i prod1 = _mm_maddubs_epi16(ss[1], coeffs[1]);
  const __m128i prod2 = _mm_maddubs_epi16(ss[2], coeffs[2]);
  const __m128i prod3 = _mm_maddubs_epi16(ss[3], coeffs[3]);

  // Pair up even-indexed and odd-indexed products, then merge the pairs.
  const __m128i even_sum = _mm_add_epi16(prod0, prod2);
  const __m128i odd_sum = _mm_add_epi16(prod1, prod3);
  return _mm_add_epi16(even_sum, odd_sum);
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_lowbd_ssse3
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_lowbd_ssse3
Unexecuted instantiation: convolve_2d_avx2.c:convolve_lowbd_ssse3
convolve_avx2.c:convolve_lowbd_ssse3
Line
Count
Source
936
36.0k
                                           const __m128i coeffs[4]) {
937
36.0k
  const __m128i res_01 = _mm_maddubs_epi16(ss[0], coeffs[0]);
938
36.0k
  const __m128i res_23 = _mm_maddubs_epi16(ss[1], coeffs[1]);
939
36.0k
  const __m128i res_45 = _mm_maddubs_epi16(ss[2], coeffs[2]);
940
36.0k
  const __m128i res_67 = _mm_maddubs_epi16(ss[3], coeffs[3]);
941
942
36.0k
  const __m128i res = _mm_add_epi16(_mm_add_epi16(res_01, res_45),
943
36.0k
                                    _mm_add_epi16(res_23, res_67));
944
945
36.0k
  return res;
946
36.0k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_lowbd_ssse3
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_lowbd_ssse3
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_lowbd_ssse3
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_lowbd_ssse3
947
948
// Returns the lane-wise 16-bit sum of four _mm256_maddubs_epi16 products,
// one per (s[i], coeffs[i]) pair for i = 0..3.
//
// s[i] is consumed as unsigned bytes and coeffs[i] as signed bytes (operand
// order of _mm256_maddubs_epi16). The merging additions wrap on overflow.
static inline __m256i convolve_lowbd(const __m256i *const s,
                                     const __m256i *const coeffs) {
  const __m256i prod0 = _mm256_maddubs_epi16(s[0], coeffs[0]);
  const __m256i prod1 = _mm256_maddubs_epi16(s[1], coeffs[1]);
  const __m256i prod2 = _mm256_maddubs_epi16(s[2], coeffs[2]);
  const __m256i prod3 = _mm256_maddubs_epi16(s[3], coeffs[3]);

  // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
  const __m256i even_sum = _mm256_add_epi16(prod0, prod2);
  const __m256i odd_sum = _mm256_add_epi16(prod1, prod3);
  return _mm256_add_epi16(even_sum, odd_sum);
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_lowbd
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_lowbd
convolve_2d_avx2.c:convolve_lowbd
Line
Count
Source
949
2.46M
                                     const __m256i *const coeffs) {
950
2.46M
  const __m256i res_01 = _mm256_maddubs_epi16(s[0], coeffs[0]);
951
2.46M
  const __m256i res_23 = _mm256_maddubs_epi16(s[1], coeffs[1]);
952
2.46M
  const __m256i res_45 = _mm256_maddubs_epi16(s[2], coeffs[2]);
953
2.46M
  const __m256i res_67 = _mm256_maddubs_epi16(s[3], coeffs[3]);
954
955
  // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
956
2.46M
  const __m256i res = _mm256_add_epi16(_mm256_add_epi16(res_01, res_45),
957
2.46M
                                       _mm256_add_epi16(res_23, res_67));
958
959
2.46M
  return res;
960
2.46M
}
convolve_avx2.c:convolve_lowbd
Line
Count
Source
949
617k
                                     const __m256i *const coeffs) {
950
617k
  const __m256i res_01 = _mm256_maddubs_epi16(s[0], coeffs[0]);
951
617k
  const __m256i res_23 = _mm256_maddubs_epi16(s[1], coeffs[1]);
952
617k
  const __m256i res_45 = _mm256_maddubs_epi16(s[2], coeffs[2]);
953
617k
  const __m256i res_67 = _mm256_maddubs_epi16(s[3], coeffs[3]);
954
955
  // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
956
617k
  const __m256i res = _mm256_add_epi16(_mm256_add_epi16(res_01, res_45),
957
617k
                                       _mm256_add_epi16(res_23, res_67));
958
959
617k
  return res;
960
617k
}
jnt_convolve_avx2.c:convolve_lowbd
Line
Count
Source
949
5.26M
                                     const __m256i *const coeffs) {
950
5.26M
  const __m256i res_01 = _mm256_maddubs_epi16(s[0], coeffs[0]);
951
5.26M
  const __m256i res_23 = _mm256_maddubs_epi16(s[1], coeffs[1]);
952
5.26M
  const __m256i res_45 = _mm256_maddubs_epi16(s[2], coeffs[2]);
953
5.26M
  const __m256i res_67 = _mm256_maddubs_epi16(s[3], coeffs[3]);
954
955
  // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
956
5.26M
  const __m256i res = _mm256_add_epi16(_mm256_add_epi16(res_01, res_45),
957
5.26M
                                       _mm256_add_epi16(res_23, res_67));
958
959
5.26M
  return res;
960
5.26M
}
wiener_convolve_avx2.c:convolve_lowbd
Line
Count
Source
949
18.6M
                                     const __m256i *const coeffs) {
950
18.6M
  const __m256i res_01 = _mm256_maddubs_epi16(s[0], coeffs[0]);
951
18.6M
  const __m256i res_23 = _mm256_maddubs_epi16(s[1], coeffs[1]);
952
18.6M
  const __m256i res_45 = _mm256_maddubs_epi16(s[2], coeffs[2]);
953
18.6M
  const __m256i res_67 = _mm256_maddubs_epi16(s[3], coeffs[3]);
954
955
  // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
956
18.6M
  const __m256i res = _mm256_add_epi16(_mm256_add_epi16(res_01, res_45),
957
18.6M
                                       _mm256_add_epi16(res_23, res_67));
958
959
18.6M
  return res;
960
18.6M
}
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_lowbd
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_lowbd
961
962
// Returns the lane-wise 16-bit sum of three _mm256_maddubs_epi16 products,
// one per (s[i], coeffs[i]) pair for i = 0..2. Only the first three elements
// of s and coeffs are read.
static inline __m256i convolve_lowbd_6tap(const __m256i *const s,
                                          const __m256i *const coeffs) {
  const __m256i prod_a = _mm256_maddubs_epi16(s[0], coeffs[0]);
  const __m256i prod_b = _mm256_maddubs_epi16(s[1], coeffs[1]);
  const __m256i prod_c = _mm256_maddubs_epi16(s[2], coeffs[2]);

  // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
  __m256i acc = _mm256_add_epi16(prod_a, prod_c);
  acc = _mm256_add_epi16(acc, prod_b);
  return acc;
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_lowbd_6tap
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_lowbd_6tap
convolve_2d_avx2.c:convolve_lowbd_6tap
Line
Count
Source
963
10.9M
                                          const __m256i *const coeffs) {
964
10.9M
  const __m256i res_01 = _mm256_maddubs_epi16(s[0], coeffs[0]);
965
10.9M
  const __m256i res_23 = _mm256_maddubs_epi16(s[1], coeffs[1]);
966
10.9M
  const __m256i res_45 = _mm256_maddubs_epi16(s[2], coeffs[2]);
967
968
  // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
969
10.9M
  const __m256i res =
970
10.9M
      _mm256_add_epi16(_mm256_add_epi16(res_01, res_45), res_23);
971
972
10.9M
  return res;
973
10.9M
}
convolve_avx2.c:convolve_lowbd_6tap
Line
Count
Source
963
6.81M
                                          const __m256i *const coeffs) {
964
6.81M
  const __m256i res_01 = _mm256_maddubs_epi16(s[0], coeffs[0]);
965
6.81M
  const __m256i res_23 = _mm256_maddubs_epi16(s[1], coeffs[1]);
966
6.81M
  const __m256i res_45 = _mm256_maddubs_epi16(s[2], coeffs[2]);
967
968
  // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
969
6.81M
  const __m256i res =
970
6.81M
      _mm256_add_epi16(_mm256_add_epi16(res_01, res_45), res_23);
971
972
6.81M
  return res;
973
6.81M
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_lowbd_6tap
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_lowbd_6tap
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_lowbd_6tap
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_lowbd_6tap
974
975
// Returns the lane-wise 16-bit sum of two _mm256_maddubs_epi16 products:
// (s[0], coeffs[0]) and (s[1], coeffs[1]). Only the first two elements of
// each array are read.
static inline __m256i convolve_lowbd_4tap(const __m256i *const s,
                                          const __m256i *const coeffs) {
  const __m256i first = _mm256_maddubs_epi16(s[0], coeffs[0]);
  const __m256i second = _mm256_maddubs_epi16(s[1], coeffs[1]);

  // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
  return _mm256_add_epi16(second, first);
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_lowbd_4tap
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_lowbd_4tap
convolve_2d_avx2.c:convolve_lowbd_4tap
Line
Count
Source
976
861k
                                          const __m256i *const coeffs) {
977
861k
  const __m256i res_23 = _mm256_maddubs_epi16(s[0], coeffs[0]);
978
861k
  const __m256i res_45 = _mm256_maddubs_epi16(s[1], coeffs[1]);
979
980
  // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
981
861k
  const __m256i res = _mm256_add_epi16(res_45, res_23);
982
983
861k
  return res;
984
861k
}
convolve_avx2.c:convolve_lowbd_4tap
Line
Count
Source
976
1.13M
                                          const __m256i *const coeffs) {
977
1.13M
  const __m256i res_23 = _mm256_maddubs_epi16(s[0], coeffs[0]);
978
1.13M
  const __m256i res_45 = _mm256_maddubs_epi16(s[1], coeffs[1]);
979
980
  // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
981
1.13M
  const __m256i res = _mm256_add_epi16(res_45, res_23);
982
983
1.13M
  return res;
984
1.13M
}
jnt_convolve_avx2.c:convolve_lowbd_4tap
Line
Count
Source
976
1.85M
                                          const __m256i *const coeffs) {
977
1.85M
  const __m256i res_23 = _mm256_maddubs_epi16(s[0], coeffs[0]);
978
1.85M
  const __m256i res_45 = _mm256_maddubs_epi16(s[1], coeffs[1]);
979
980
  // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
981
1.85M
  const __m256i res = _mm256_add_epi16(res_45, res_23);
982
983
1.85M
  return res;
984
1.85M
}
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_lowbd_4tap
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_lowbd_4tap
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_lowbd_4tap
985
986
// Returns the lane-wise 32-bit sum of three _mm256_madd_epi16 products,
// one per (s[i], coeffs[i]) pair for i = 0..2.
//
// _mm256_madd_epi16 multiplies signed 16-bit lanes and adds adjacent pairs
// into 32-bit lanes, so the result holds eight 32-bit values.
static inline __m256i convolve_6tap(const __m256i *const s,
                                    const __m256i *const coeffs) {
  __m256i acc = _mm256_madd_epi16(s[0], coeffs[0]);
  acc = _mm256_add_epi32(acc, _mm256_madd_epi16(s[1], coeffs[1]));
  acc = _mm256_add_epi32(acc, _mm256_madd_epi16(s[2], coeffs[2]));
  return acc;
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_6tap
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_6tap
convolve_2d_avx2.c:convolve_6tap
Line
Count
Source
987
16.1M
                                    const __m256i *const coeffs) {
988
16.1M
  const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]);
989
16.1M
  const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]);
990
16.1M
  const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]);
991
992
16.1M
  const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1), res_2);
993
994
16.1M
  return res;
995
16.1M
}
Unexecuted instantiation: convolve_avx2.c:convolve_6tap
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_6tap
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_6tap
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_6tap
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_6tap
996
997
// Returns the lane-wise 32-bit sum of six _mm256_madd_epi16 products,
// one per (s[i], coeffs[i]) pair for i = 0..5.
static inline __m256i convolve_12taps(const __m256i *const s,
                                      const __m256i *const coeffs) {
  const __m256i p0 = _mm256_madd_epi16(s[0], coeffs[0]);
  const __m256i p1 = _mm256_madd_epi16(s[1], coeffs[1]);
  const __m256i p2 = _mm256_madd_epi16(s[2], coeffs[2]);
  const __m256i p3 = _mm256_madd_epi16(s[3], coeffs[3]);
  const __m256i p4 = _mm256_madd_epi16(s[4], coeffs[4]);
  const __m256i p5 = _mm256_madd_epi16(s[5], coeffs[5]);

  // Reduce as three pairs: (0+1), (2+3), (4+5), then merge them.
  const __m256i sum01 = _mm256_add_epi32(p0, p1);
  const __m256i sum23 = _mm256_add_epi32(p2, p3);
  const __m256i sum45 = _mm256_add_epi32(p4, p5);
  const __m256i sum0123 = _mm256_add_epi32(sum01, sum23);
  return _mm256_add_epi32(sum45, sum0123);
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_12taps
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_12taps
Unexecuted instantiation: convolve_2d_avx2.c:convolve_12taps
Unexecuted instantiation: convolve_avx2.c:convolve_12taps
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_12taps
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_12taps
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_12taps
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_12taps
1012
1013
// Returns the lane-wise 32-bit sum of four _mm256_madd_epi16 products,
// one per (s[i], coeffs[i]) pair for i = 0..3.
//
// Inputs are read as signed 16-bit lanes; the result holds eight 32-bit lanes.
static inline __m256i convolve(const __m256i *const s,
                               const __m256i *const coeffs) {
  const __m256i p0 = _mm256_madd_epi16(s[0], coeffs[0]);
  const __m256i p1 = _mm256_madd_epi16(s[1], coeffs[1]);
  const __m256i p2 = _mm256_madd_epi16(s[2], coeffs[2]);
  const __m256i p3 = _mm256_madd_epi16(s[3], coeffs[3]);

  const __m256i lower_pair = _mm256_add_epi32(p0, p1);
  const __m256i upper_pair = _mm256_add_epi32(p2, p3);
  return _mm256_add_epi32(lower_pair, upper_pair);
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve
highbd_convolve_avx2.c:convolve
Line
Count
Source
1014
27.3M
                               const __m256i *const coeffs) {
1015
27.3M
  const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]);
1016
27.3M
  const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]);
1017
27.3M
  const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]);
1018
27.3M
  const __m256i res_3 = _mm256_madd_epi16(s[3], coeffs[3]);
1019
1020
27.3M
  const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1),
1021
27.3M
                                       _mm256_add_epi32(res_2, res_3));
1022
1023
27.3M
  return res;
1024
27.3M
}
convolve_2d_avx2.c:convolve
Line
Count
Source
1014
3.61M
                               const __m256i *const coeffs) {
1015
3.61M
  const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]);
1016
3.61M
  const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]);
1017
3.61M
  const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]);
1018
3.61M
  const __m256i res_3 = _mm256_madd_epi16(s[3], coeffs[3]);
1019
1020
3.61M
  const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1),
1021
3.61M
                                       _mm256_add_epi32(res_2, res_3));
1022
1023
3.61M
  return res;
1024
3.61M
}
Unexecuted instantiation: convolve_avx2.c:convolve
jnt_convolve_avx2.c:convolve
Line
Count
Source
1014
4.77M
                               const __m256i *const coeffs) {
1015
4.77M
  const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]);
1016
4.77M
  const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]);
1017
4.77M
  const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]);
1018
4.77M
  const __m256i res_3 = _mm256_madd_epi16(s[3], coeffs[3]);
1019
1020
4.77M
  const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1),
1021
4.77M
                                       _mm256_add_epi32(res_2, res_3));
1022
1023
4.77M
  return res;
1024
4.77M
}
wiener_convolve_avx2.c:convolve
Line
Count
Source
1014
32.4M
                               const __m256i *const coeffs) {
1015
32.4M
  const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]);
1016
32.4M
  const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]);
1017
32.4M
  const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]);
1018
32.4M
  const __m256i res_3 = _mm256_madd_epi16(s[3], coeffs[3]);
1019
1020
32.4M
  const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1),
1021
32.4M
                                       _mm256_add_epi32(res_2, res_3));
1022
1023
32.4M
  return res;
1024
32.4M
}
highbd_convolve_2d_avx2.c:convolve
Line
Count
Source
1014
110M
                               const __m256i *const coeffs) {
1015
110M
  const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]);
1016
110M
  const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]);
1017
110M
  const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]);
1018
110M
  const __m256i res_3 = _mm256_madd_epi16(s[3], coeffs[3]);
1019
1020
110M
  const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1),
1021
110M
                                       _mm256_add_epi32(res_2, res_3));
1022
1023
110M
  return res;
1024
110M
}
highbd_jnt_convolve_avx2.c:convolve
Line
Count
Source
1014
44.1M
                               const __m256i *const coeffs) {
1015
44.1M
  const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]);
1016
44.1M
  const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]);
1017
44.1M
  const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]);
1018
44.1M
  const __m256i res_3 = _mm256_madd_epi16(s[3], coeffs[3]);
1019
1020
44.1M
  const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1),
1021
44.1M
                                       _mm256_add_epi32(res_2, res_3));
1022
1023
44.1M
  return res;
1024
44.1M
}
1025
1026
// Returns the lane-wise 32-bit sum of two _mm256_madd_epi16 products:
// (s[0], coeffs[0]) and (s[1], coeffs[1]).
static inline __m256i convolve_4tap(const __m256i *const s,
                                    const __m256i *const coeffs) {
  const __m256i first = _mm256_madd_epi16(s[0], coeffs[0]);
  const __m256i second = _mm256_madd_epi16(s[1], coeffs[1]);
  return _mm256_add_epi32(first, second);
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_4tap
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_4tap
convolve_2d_avx2.c:convolve_4tap
Line
Count
Source
1027
3.34M
                                    const __m256i *const coeffs) {
1028
3.34M
  const __m256i res_1 = _mm256_madd_epi16(s[0], coeffs[0]);
1029
3.34M
  const __m256i res_2 = _mm256_madd_epi16(s[1], coeffs[1]);
1030
1031
3.34M
  const __m256i res = _mm256_add_epi32(res_1, res_2);
1032
3.34M
  return res;
1033
3.34M
}
Unexecuted instantiation: convolve_avx2.c:convolve_4tap
jnt_convolve_avx2.c:convolve_4tap
Line
Count
Source
1027
223k
                                    const __m256i *const coeffs) {
1028
223k
  const __m256i res_1 = _mm256_madd_epi16(s[0], coeffs[0]);
1029
223k
  const __m256i res_2 = _mm256_madd_epi16(s[1], coeffs[1]);
1030
1031
223k
  const __m256i res = _mm256_add_epi32(res_1, res_2);
1032
223k
  return res;
1033
223k
}
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_4tap
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_4tap
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_4tap
1034
1035
// Rearranges the bytes of `data` with the shuffle mask filt[0], then returns
// _mm_maddubs_epi16 of the rearranged bytes (unsigned) and coeffs[0] (signed).
// Only element 0 of `coeffs` and `filt` is read.
static inline __m128i convolve_lowbd_x_2tap_ssse3(const __m128i data,
                                                  const __m128i *const coeffs,
                                                  const __m128i *const filt) {
  const __m128i shuffled = _mm_shuffle_epi8(data, filt[0]);
  return _mm_maddubs_epi16(shuffled, coeffs[0]);
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_lowbd_x_2tap_ssse3
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_lowbd_x_2tap_ssse3
convolve_2d_avx2.c:convolve_lowbd_x_2tap_ssse3
Line
Count
Source
1037
76.0k
                                                  const __m128i *const filt) {
1038
76.0k
  __m128i s;
1039
76.0k
  s = _mm_shuffle_epi8(data, filt[0]);
1040
1041
76.0k
  return _mm_maddubs_epi16(s, coeffs[0]);
1042
76.0k
}
Unexecuted instantiation: convolve_avx2.c:convolve_lowbd_x_2tap_ssse3
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_lowbd_x_2tap_ssse3
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_lowbd_x_2tap_ssse3
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_lowbd_x_2tap_ssse3
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_lowbd_x_2tap_ssse3
1043
1044
static inline __m128i convolve_lowbd_x_4tap_ssse3(const __m128i data,
1045
                                                  const __m128i *const coeffs,
1046
2.81M
                                                  const __m128i *const filt) {
1047
2.81M
  __m128i s[2];
1048
1049
2.81M
  s[0] = _mm_shuffle_epi8(data, filt[0]);
1050
2.81M
  s[1] = _mm_shuffle_epi8(data, filt[1]);
1051
1052
2.81M
  return convolve_lowbd_4tap_ssse3(s, coeffs);
1053
2.81M
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_lowbd_x_4tap_ssse3
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_lowbd_x_4tap_ssse3
convolve_2d_avx2.c:convolve_lowbd_x_4tap_ssse3
Line
Count
Source
1046
2.81M
                                                  const __m128i *const filt) {
1047
2.81M
  __m128i s[2];
1048
1049
2.81M
  s[0] = _mm_shuffle_epi8(data, filt[0]);
1050
2.81M
  s[1] = _mm_shuffle_epi8(data, filt[1]);
1051
1052
2.81M
  return convolve_lowbd_4tap_ssse3(s, coeffs);
1053
2.81M
}
Unexecuted instantiation: convolve_avx2.c:convolve_lowbd_x_4tap_ssse3
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_lowbd_x_4tap_ssse3
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_lowbd_x_4tap_ssse3
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_lowbd_x_4tap_ssse3
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_lowbd_x_4tap_ssse3
1054
1055
static inline __m256i convolve_lowbd_x(const __m256i data,
1056
                                       const __m256i *const coeffs,
1057
25.9M
                                       const __m256i *const filt) {
1058
25.9M
  __m256i s[4];
1059
1060
25.9M
  s[0] = _mm256_shuffle_epi8(data, filt[0]);
1061
25.9M
  s[1] = _mm256_shuffle_epi8(data, filt[1]);
1062
25.9M
  s[2] = _mm256_shuffle_epi8(data, filt[2]);
1063
25.9M
  s[3] = _mm256_shuffle_epi8(data, filt[3]);
1064
1065
25.9M
  return convolve_lowbd(s, coeffs);
1066
25.9M
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_lowbd_x
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_lowbd_x
convolve_2d_avx2.c:convolve_lowbd_x
Line
Count
Source
1057
2.46M
                                       const __m256i *const filt) {
1058
2.46M
  __m256i s[4];
1059
1060
2.46M
  s[0] = _mm256_shuffle_epi8(data, filt[0]);
1061
2.46M
  s[1] = _mm256_shuffle_epi8(data, filt[1]);
1062
2.46M
  s[2] = _mm256_shuffle_epi8(data, filt[2]);
1063
2.46M
  s[3] = _mm256_shuffle_epi8(data, filt[3]);
1064
1065
2.46M
  return convolve_lowbd(s, coeffs);
1066
2.46M
}
convolve_avx2.c:convolve_lowbd_x
Line
Count
Source
1057
398k
                                       const __m256i *const filt) {
1058
398k
  __m256i s[4];
1059
1060
398k
  s[0] = _mm256_shuffle_epi8(data, filt[0]);
1061
398k
  s[1] = _mm256_shuffle_epi8(data, filt[1]);
1062
398k
  s[2] = _mm256_shuffle_epi8(data, filt[2]);
1063
398k
  s[3] = _mm256_shuffle_epi8(data, filt[3]);
1064
1065
398k
  return convolve_lowbd(s, coeffs);
1066
398k
}
jnt_convolve_avx2.c:convolve_lowbd_x
Line
Count
Source
1057
4.52M
                                       const __m256i *const filt) {
1058
4.52M
  __m256i s[4];
1059
1060
4.52M
  s[0] = _mm256_shuffle_epi8(data, filt[0]);
1061
4.52M
  s[1] = _mm256_shuffle_epi8(data, filt[1]);
1062
4.52M
  s[2] = _mm256_shuffle_epi8(data, filt[2]);
1063
4.52M
  s[3] = _mm256_shuffle_epi8(data, filt[3]);
1064
1065
4.52M
  return convolve_lowbd(s, coeffs);
1066
4.52M
}
wiener_convolve_avx2.c:convolve_lowbd_x
Line
Count
Source
1057
18.5M
                                       const __m256i *const filt) {
1058
18.5M
  __m256i s[4];
1059
1060
18.5M
  s[0] = _mm256_shuffle_epi8(data, filt[0]);
1061
18.5M
  s[1] = _mm256_shuffle_epi8(data, filt[1]);
1062
18.5M
  s[2] = _mm256_shuffle_epi8(data, filt[2]);
1063
18.5M
  s[3] = _mm256_shuffle_epi8(data, filt[3]);
1064
1065
18.5M
  return convolve_lowbd(s, coeffs);
1066
18.5M
}
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_lowbd_x
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_lowbd_x
1067
1068
static inline __m256i convolve_lowbd_x_6tap(const __m256i data,
1069
                                            const __m256i *const coeffs,
1070
14.5M
                                            const __m256i *const filt) {
1071
14.5M
  __m256i s[4];
1072
1073
14.5M
  s[0] = _mm256_shuffle_epi8(data, filt[0]);
1074
14.5M
  s[1] = _mm256_shuffle_epi8(data, filt[1]);
1075
14.5M
  s[2] = _mm256_shuffle_epi8(data, filt[2]);
1076
1077
14.5M
  return convolve_lowbd_6tap(s, coeffs);
1078
14.5M
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_lowbd_x_6tap
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_lowbd_x_6tap
convolve_2d_avx2.c:convolve_lowbd_x_6tap
Line
Count
Source
1070
10.9M
                                            const __m256i *const filt) {
1071
10.9M
  __m256i s[4];
1072
1073
10.9M
  s[0] = _mm256_shuffle_epi8(data, filt[0]);
1074
10.9M
  s[1] = _mm256_shuffle_epi8(data, filt[1]);
1075
10.9M
  s[2] = _mm256_shuffle_epi8(data, filt[2]);
1076
1077
10.9M
  return convolve_lowbd_6tap(s, coeffs);
1078
10.9M
}
convolve_avx2.c:convolve_lowbd_x_6tap
Line
Count
Source
1070
3.60M
                                            const __m256i *const filt) {
1071
3.60M
  __m256i s[4];
1072
1073
3.60M
  s[0] = _mm256_shuffle_epi8(data, filt[0]);
1074
3.60M
  s[1] = _mm256_shuffle_epi8(data, filt[1]);
1075
3.60M
  s[2] = _mm256_shuffle_epi8(data, filt[2]);
1076
1077
3.60M
  return convolve_lowbd_6tap(s, coeffs);
1078
3.60M
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_lowbd_x_6tap
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_lowbd_x_6tap
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_lowbd_x_6tap
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_lowbd_x_6tap
1079
1080
static inline __m256i convolve_lowbd_x_4tap(const __m256i data,
1081
                                            const __m256i *const coeffs,
1082
2.80M
                                            const __m256i *const filt) {
1083
2.80M
  __m256i s[2];
1084
1085
2.80M
  s[0] = _mm256_shuffle_epi8(data, filt[0]);
1086
2.80M
  s[1] = _mm256_shuffle_epi8(data, filt[1]);
1087
1088
2.80M
  return convolve_lowbd_4tap(s, coeffs);
1089
2.80M
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_lowbd_x_4tap
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_lowbd_x_4tap
convolve_2d_avx2.c:convolve_lowbd_x_4tap
Line
Count
Source
1082
861k
                                            const __m256i *const filt) {
1083
861k
  __m256i s[2];
1084
1085
861k
  s[0] = _mm256_shuffle_epi8(data, filt[0]);
1086
861k
  s[1] = _mm256_shuffle_epi8(data, filt[1]);
1087
1088
861k
  return convolve_lowbd_4tap(s, coeffs);
1089
861k
}
convolve_avx2.c:convolve_lowbd_x_4tap
Line
Count
Source
1082
425k
                                            const __m256i *const filt) {
1083
425k
  __m256i s[2];
1084
1085
425k
  s[0] = _mm256_shuffle_epi8(data, filt[0]);
1086
425k
  s[1] = _mm256_shuffle_epi8(data, filt[1]);
1087
1088
425k
  return convolve_lowbd_4tap(s, coeffs);
1089
425k
}
jnt_convolve_avx2.c:convolve_lowbd_x_4tap
Line
Count
Source
1082
1.52M
                                            const __m256i *const filt) {
1083
1.52M
  __m256i s[2];
1084
1085
1.52M
  s[0] = _mm256_shuffle_epi8(data, filt[0]);
1086
1.52M
  s[1] = _mm256_shuffle_epi8(data, filt[1]);
1087
1088
1.52M
  return convolve_lowbd_4tap(s, coeffs);
1089
1.52M
}
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_lowbd_x_4tap
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_lowbd_x_4tap
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_lowbd_x_4tap
1090
1091
// Rearranges the bytes of `data` with the shuffle mask filt[0], then returns
// _mm256_maddubs_epi16 of the rearranged bytes (unsigned) and coeffs[0]
// (signed). Only element 0 of `coeffs` and `filt` is read.
static inline __m256i convolve_lowbd_x_2tap(const __m256i data,
                                            const __m256i *const coeffs,
                                            const __m256i *const filt) {
  const __m256i shuffled = _mm256_shuffle_epi8(data, filt[0]);
  return _mm256_maddubs_epi16(shuffled, coeffs[0]);
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_lowbd_x_2tap
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_lowbd_x_2tap
convolve_2d_avx2.c:convolve_lowbd_x_2tap
Line
Count
Source
1093
435k
                                            const __m256i *const filt) {
1094
435k
  __m256i s;
1095
435k
  s = _mm256_shuffle_epi8(data, filt[0]);
1096
1097
435k
  return _mm256_maddubs_epi16(s, coeffs[0]);
1098
435k
}
Unexecuted instantiation: convolve_avx2.c:convolve_lowbd_x_2tap
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_lowbd_x_2tap
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_lowbd_x_2tap
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_lowbd_x_2tap
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_lowbd_x_2tap
1099
1100
// Writes one 256-bit vector to `dst`.
//
// When `do_average` is non-zero, the vector currently at `dst` is loaded,
// added to *res as 32-bit lanes, arithmetically shifted right by 1, and the
// result is stored back. Otherwise *res is stored unchanged.
//
// The load and store are the aligned variants (_mm256_load_si256 /
// _mm256_store_si256), which require `dst` to be 32-byte aligned.
static inline void add_store_aligned_256(CONV_BUF_TYPE *const dst,
                                         const __m256i *const res,
                                         const int do_average) {
  __m256i *const out = (__m256i *)dst;
  __m256i value = *res;
  if (do_average) {
    const __m256i sum = _mm256_add_epi32(_mm256_load_si256(out), value);
    value = _mm256_srai_epi32(sum, 1);
  }
  _mm256_store_si256(out, value);
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:add_store_aligned_256
Unexecuted instantiation: highbd_convolve_avx2.c:add_store_aligned_256
Unexecuted instantiation: convolve_2d_avx2.c:add_store_aligned_256
Unexecuted instantiation: convolve_avx2.c:add_store_aligned_256
Unexecuted instantiation: jnt_convolve_avx2.c:add_store_aligned_256
Unexecuted instantiation: wiener_convolve_avx2.c:add_store_aligned_256
Unexecuted instantiation: highbd_convolve_2d_avx2.c:add_store_aligned_256
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:add_store_aligned_256
1113
1114
static inline __m256i comp_avg(const __m256i *const data_ref_0,
1115
                               const __m256i *const res_unsigned,
1116
                               const __m256i *const wt,
1117
163M
                               const int use_dist_wtd_comp_avg) {
1118
163M
  __m256i res;
1119
163M
  if (use_dist_wtd_comp_avg) {
1120
1.48M
    const __m256i data_lo = _mm256_unpacklo_epi16(*data_ref_0, *res_unsigned);
1121
1.48M
    const __m256i data_hi = _mm256_unpackhi_epi16(*data_ref_0, *res_unsigned);
1122
1123
1.48M
    const __m256i wt_res_lo = _mm256_madd_epi16(data_lo, *wt);
1124
1.48M
    const __m256i wt_res_hi = _mm256_madd_epi16(data_hi, *wt);
1125
1126
1.48M
    const __m256i res_lo = _mm256_srai_epi32(wt_res_lo, DIST_PRECISION_BITS);
1127
1.48M
    const __m256i res_hi = _mm256_srai_epi32(wt_res_hi, DIST_PRECISION_BITS);
1128
1129
1.48M
    res = _mm256_packs_epi32(res_lo, res_hi);
1130
161M
  } else {
1131
161M
    const __m256i wt_res = _mm256_add_epi16(*data_ref_0, *res_unsigned);
1132
161M
    res = _mm256_srai_epi16(wt_res, 1);
1133
161M
  }
1134
163M
  return res;
1135
163M
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:comp_avg
Unexecuted instantiation: highbd_convolve_avx2.c:comp_avg
Unexecuted instantiation: convolve_2d_avx2.c:comp_avg
Unexecuted instantiation: convolve_avx2.c:comp_avg
jnt_convolve_avx2.c:comp_avg
Line
Count
Source
1117
163M
                               const int use_dist_wtd_comp_avg) {
1118
163M
  __m256i res;
1119
163M
  if (use_dist_wtd_comp_avg) {
1120
1.48M
    const __m256i data_lo = _mm256_unpacklo_epi16(*data_ref_0, *res_unsigned);
1121
1.48M
    const __m256i data_hi = _mm256_unpackhi_epi16(*data_ref_0, *res_unsigned);
1122
1123
1.48M
    const __m256i wt_res_lo = _mm256_madd_epi16(data_lo, *wt);
1124
1.48M
    const __m256i wt_res_hi = _mm256_madd_epi16(data_hi, *wt);
1125
1126
1.48M
    const __m256i res_lo = _mm256_srai_epi32(wt_res_lo, DIST_PRECISION_BITS);
1127
1.48M
    const __m256i res_hi = _mm256_srai_epi32(wt_res_hi, DIST_PRECISION_BITS);
1128
1129
1.48M
    res = _mm256_packs_epi32(res_lo, res_hi);
1130
161M
  } else {
1131
161M
    const __m256i wt_res = _mm256_add_epi16(*data_ref_0, *res_unsigned);
1132
161M
    res = _mm256_srai_epi16(wt_res, 1);
1133
161M
  }
1134
163M
  return res;
1135
163M
}
Unexecuted instantiation: wiener_convolve_avx2.c:comp_avg
Unexecuted instantiation: highbd_convolve_2d_avx2.c:comp_avg
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:comp_avg
1136
1137
static inline __m256i convolve_rounding(const __m256i *const res_unsigned,
1138
                                        const __m256i *const offset_const,
1139
                                        const __m256i *const round_const,
1140
162M
                                        const int round_shift) {
1141
162M
  const __m256i res_signed = _mm256_sub_epi16(*res_unsigned, *offset_const);
1142
162M
  const __m256i res_round = _mm256_srai_epi16(
1143
162M
      _mm256_add_epi16(res_signed, *round_const), round_shift);
1144
162M
  return res_round;
1145
162M
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_rounding
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_rounding
Unexecuted instantiation: convolve_2d_avx2.c:convolve_rounding
Unexecuted instantiation: convolve_avx2.c:convolve_rounding
jnt_convolve_avx2.c:convolve_rounding
Line
Count
Source
1140
162M
                                        const int round_shift) {
1141
162M
  const __m256i res_signed = _mm256_sub_epi16(*res_unsigned, *offset_const);
1142
162M
  const __m256i res_round = _mm256_srai_epi16(
1143
162M
      _mm256_add_epi16(res_signed, *round_const), round_shift);
1144
162M
  return res_round;
1145
162M
}
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_rounding
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_rounding
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_rounding
1146
1147
static inline __m256i highbd_comp_avg(const __m256i *const data_ref_0,
1148
                                      const __m256i *const res_unsigned,
1149
                                      const __m256i *const wt0,
1150
                                      const __m256i *const wt1,
1151
12.3M
                                      const int use_dist_wtd_comp_avg) {
1152
12.3M
  __m256i res;
1153
12.3M
  if (use_dist_wtd_comp_avg) {
1154
2.13M
    const __m256i wt0_res = _mm256_mullo_epi32(*data_ref_0, *wt0);
1155
2.13M
    const __m256i wt1_res = _mm256_mullo_epi32(*res_unsigned, *wt1);
1156
2.13M
    const __m256i wt_res = _mm256_add_epi32(wt0_res, wt1_res);
1157
2.13M
    res = _mm256_srai_epi32(wt_res, DIST_PRECISION_BITS);
1158
10.1M
  } else {
1159
10.1M
    const __m256i wt_res = _mm256_add_epi32(*data_ref_0, *res_unsigned);
1160
10.1M
    res = _mm256_srai_epi32(wt_res, 1);
1161
10.1M
  }
1162
12.3M
  return res;
1163
12.3M
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:highbd_comp_avg
Unexecuted instantiation: highbd_convolve_avx2.c:highbd_comp_avg
Unexecuted instantiation: convolve_2d_avx2.c:highbd_comp_avg
Unexecuted instantiation: convolve_avx2.c:highbd_comp_avg
Unexecuted instantiation: jnt_convolve_avx2.c:highbd_comp_avg
Unexecuted instantiation: wiener_convolve_avx2.c:highbd_comp_avg
Unexecuted instantiation: highbd_convolve_2d_avx2.c:highbd_comp_avg
highbd_jnt_convolve_avx2.c:highbd_comp_avg
Line
Count
Source
1151
12.3M
                                      const int use_dist_wtd_comp_avg) {
1152
12.3M
  __m256i res;
1153
12.3M
  if (use_dist_wtd_comp_avg) {
1154
2.13M
    const __m256i wt0_res = _mm256_mullo_epi32(*data_ref_0, *wt0);
1155
2.13M
    const __m256i wt1_res = _mm256_mullo_epi32(*res_unsigned, *wt1);
1156
2.13M
    const __m256i wt_res = _mm256_add_epi32(wt0_res, wt1_res);
1157
2.13M
    res = _mm256_srai_epi32(wt_res, DIST_PRECISION_BITS);
1158
10.1M
  } else {
1159
10.1M
    const __m256i wt_res = _mm256_add_epi32(*data_ref_0, *res_unsigned);
1160
10.1M
    res = _mm256_srai_epi32(wt_res, 1);
1161
10.1M
  }
1162
12.3M
  return res;
1163
12.3M
}
1164
1165
static inline __m256i highbd_convolve_rounding(
1166
    const __m256i *const res_unsigned, const __m256i *const offset_const,
1167
12.3M
    const __m256i *const round_const, const int round_shift) {
1168
12.3M
  const __m256i res_signed = _mm256_sub_epi32(*res_unsigned, *offset_const);
1169
12.3M
  const __m256i res_round = _mm256_srai_epi32(
1170
12.3M
      _mm256_add_epi32(res_signed, *round_const), round_shift);
1171
1172
12.3M
  return res_round;
1173
12.3M
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:highbd_convolve_rounding
Unexecuted instantiation: highbd_convolve_avx2.c:highbd_convolve_rounding
Unexecuted instantiation: convolve_2d_avx2.c:highbd_convolve_rounding
Unexecuted instantiation: convolve_avx2.c:highbd_convolve_rounding
Unexecuted instantiation: jnt_convolve_avx2.c:highbd_convolve_rounding
Unexecuted instantiation: wiener_convolve_avx2.c:highbd_convolve_rounding
Unexecuted instantiation: highbd_convolve_2d_avx2.c:highbd_convolve_rounding
highbd_jnt_convolve_avx2.c:highbd_convolve_rounding
Line
Count
Source
1167
12.3M
    const __m256i *const round_const, const int round_shift) {
1168
12.3M
  const __m256i res_signed = _mm256_sub_epi32(*res_unsigned, *offset_const);
1169
12.3M
  const __m256i res_round = _mm256_srai_epi32(
1170
12.3M
      _mm256_add_epi32(res_signed, *round_const), round_shift);
1171
1172
12.3M
  return res_round;
1173
12.3M
}
1174
1175
4.68M
static inline __m256i round_sr_x_avx2(const __m256i data) {
1176
  // we can perform the below steps:
1177
  // data = (data + 2) >> 2
1178
  // data = (data + 8) >> 4,
1179
  // in the below form as well
1180
  // data = (data + 0x22) >> 6
1181
4.68M
  const __m256i value = _mm256_set1_epi16(34);
1182
4.68M
  const __m256i reg = _mm256_add_epi16(data, value);
1183
4.68M
  return _mm256_srai_epi16(reg, 6);
1184
4.68M
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:round_sr_x_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:round_sr_x_avx2
Unexecuted instantiation: convolve_2d_avx2.c:round_sr_x_avx2
convolve_avx2.c:round_sr_x_avx2
Line
Count
Source
1175
4.68M
static inline __m256i round_sr_x_avx2(const __m256i data) {
1176
  // we can perform the below steps:
1177
  // data = (data + 2) >> 2
1178
  // data = (data + 8) >> 4,
1179
  // in the below form as well
1180
  // data = (data + 0x22) >> 6
1181
4.68M
  const __m256i value = _mm256_set1_epi16(34);
1182
4.68M
  const __m256i reg = _mm256_add_epi16(data, value);
1183
4.68M
  return _mm256_srai_epi16(reg, 6);
1184
4.68M
}
Unexecuted instantiation: jnt_convolve_avx2.c:round_sr_x_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:round_sr_x_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:round_sr_x_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:round_sr_x_avx2
1185
1186
static inline __m128i convolve_x_4tap_4x2_ssse3(const uint8_t *const src,
1187
                                                const ptrdiff_t src_stride,
1188
383k
                                                __m128i *const coeffs) {
1189
383k
  __m128i data[2];
1190
383k
  const __m128i f_l0 = _mm_load_si128((__m128i const *)filt1_global_sse2);
1191
383k
  const __m128i f_l1 = _mm_load_si128((__m128i const *)filt2_global_sse2);
1192
383k
  const __m128i src_1 =
1193
383k
      load_8bit_8x2_to_1_reg_sse2(src, (int)(sizeof(*src) * src_stride));
1194
1195
383k
  data[0] = _mm_shuffle_epi8(src_1, f_l0);
1196
383k
  data[1] = _mm_shuffle_epi8(src_1, f_l1);
1197
383k
  return convolve_lowbd_4tap_ssse3(data, coeffs);
1198
383k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_x_4tap_4x2_ssse3
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_x_4tap_4x2_ssse3
Unexecuted instantiation: convolve_2d_avx2.c:convolve_x_4tap_4x2_ssse3
convolve_avx2.c:convolve_x_4tap_4x2_ssse3
Line
Count
Source
1188
383k
                                                __m128i *const coeffs) {
1189
383k
  __m128i data[2];
1190
383k
  const __m128i f_l0 = _mm_load_si128((__m128i const *)filt1_global_sse2);
1191
383k
  const __m128i f_l1 = _mm_load_si128((__m128i const *)filt2_global_sse2);
1192
383k
  const __m128i src_1 =
1193
383k
      load_8bit_8x2_to_1_reg_sse2(src, (int)(sizeof(*src) * src_stride));
1194
1195
383k
  data[0] = _mm_shuffle_epi8(src_1, f_l0);
1196
383k
  data[1] = _mm_shuffle_epi8(src_1, f_l1);
1197
383k
  return convolve_lowbd_4tap_ssse3(data, coeffs);
1198
383k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_x_4tap_4x2_ssse3
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_x_4tap_4x2_ssse3
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_x_4tap_4x2_ssse3
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_x_4tap_4x2_ssse3
1199
1200
501k
static inline __m128i round_sr_x_ssse3(const __m128i data) {
1201
501k
  const __m128i val = _mm_set1_epi16(34);
1202
501k
  const __m128i reg = _mm_add_epi16(data, val);
1203
501k
  return _mm_srai_epi16(reg, 6);
1204
501k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:round_sr_x_ssse3
Unexecuted instantiation: highbd_convolve_avx2.c:round_sr_x_ssse3
Unexecuted instantiation: convolve_2d_avx2.c:round_sr_x_ssse3
convolve_avx2.c:round_sr_x_ssse3
Line
Count
Source
1200
501k
static inline __m128i round_sr_x_ssse3(const __m128i data) {
1201
501k
  const __m128i val = _mm_set1_epi16(34);
1202
501k
  const __m128i reg = _mm_add_epi16(data, val);
1203
501k
  return _mm_srai_epi16(reg, 6);
1204
501k
}
Unexecuted instantiation: jnt_convolve_avx2.c:round_sr_x_ssse3
Unexecuted instantiation: wiener_convolve_avx2.c:round_sr_x_ssse3
Unexecuted instantiation: highbd_convolve_2d_avx2.c:round_sr_x_ssse3
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:round_sr_x_ssse3
1205
1206
static inline void store_8bit_4x2_sse2(const __m128i reg, uint8_t *const dst,
1207
918k
                                       const ptrdiff_t dst_stride) {
1208
918k
  xx_storel_32(dst, reg);
1209
918k
  *(uint32_t *)(dst + dst_stride) =
1210
918k
      ((uint32_t)_mm_extract_epi16(reg, 3) << 16) | _mm_extract_epi16(reg, 2);
1211
918k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:store_8bit_4x2_sse2
Unexecuted instantiation: highbd_convolve_avx2.c:store_8bit_4x2_sse2
Unexecuted instantiation: convolve_2d_avx2.c:store_8bit_4x2_sse2
convolve_avx2.c:store_8bit_4x2_sse2
Line
Count
Source
1207
918k
                                       const ptrdiff_t dst_stride) {
1208
918k
  xx_storel_32(dst, reg);
1209
918k
  *(uint32_t *)(dst + dst_stride) =
1210
918k
      ((uint32_t)_mm_extract_epi16(reg, 3) << 16) | _mm_extract_epi16(reg, 2);
1211
918k
}
Unexecuted instantiation: jnt_convolve_avx2.c:store_8bit_4x2_sse2
Unexecuted instantiation: wiener_convolve_avx2.c:store_8bit_4x2_sse2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:store_8bit_4x2_sse2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:store_8bit_4x2_sse2
1212
1213
static inline void pack_store_u8_4x2_sse2(const __m128i reg, uint8_t *const dst,
1214
918k
                                          const ptrdiff_t dst_stride) {
1215
918k
  const __m128i reg_pack = _mm_packus_epi16(reg, reg);
1216
918k
  store_8bit_4x2_sse2(reg_pack, dst, dst_stride);
1217
918k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:pack_store_u8_4x2_sse2
Unexecuted instantiation: highbd_convolve_avx2.c:pack_store_u8_4x2_sse2
Unexecuted instantiation: convolve_2d_avx2.c:pack_store_u8_4x2_sse2
convolve_avx2.c:pack_store_u8_4x2_sse2
Line
Count
Source
1214
918k
                                          const ptrdiff_t dst_stride) {
1215
918k
  const __m128i reg_pack = _mm_packus_epi16(reg, reg);
1216
918k
  store_8bit_4x2_sse2(reg_pack, dst, dst_stride);
1217
918k
}
Unexecuted instantiation: jnt_convolve_avx2.c:pack_store_u8_4x2_sse2
Unexecuted instantiation: wiener_convolve_avx2.c:pack_store_u8_4x2_sse2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:pack_store_u8_4x2_sse2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:pack_store_u8_4x2_sse2
1218
1219
static inline __m128i convolve_x_4tap_2x2_ssse3(const uint8_t *const src,
1220
                                                const ptrdiff_t src_stride,
1221
64.2k
                                                __m128i *const coeffs) {
1222
64.2k
  __m128i data[2];
1223
64.2k
  const __m128i f_0 = _mm_load_si128((__m128i const *)filt3_global_sse2);
1224
64.2k
  const __m128i f_1 = _mm_load_si128((__m128i const *)filt4_global_sse2);
1225
64.2k
  const __m128i reg =
1226
64.2k
      load_8bit_8x2_to_1_reg_sse2(src, (int)(sizeof(*src) * src_stride));
1227
1228
64.2k
  data[0] = _mm_shuffle_epi8(reg, f_0);
1229
64.2k
  data[1] = _mm_shuffle_epi8(reg, f_1);
1230
64.2k
  return convolve_lowbd_4tap_ssse3(data, coeffs);
1231
64.2k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_x_4tap_2x2_ssse3
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_x_4tap_2x2_ssse3
Unexecuted instantiation: convolve_2d_avx2.c:convolve_x_4tap_2x2_ssse3
convolve_avx2.c:convolve_x_4tap_2x2_ssse3
Line
Count
Source
1221
64.2k
                                                __m128i *const coeffs) {
1222
64.2k
  __m128i data[2];
1223
64.2k
  const __m128i f_0 = _mm_load_si128((__m128i const *)filt3_global_sse2);
1224
64.2k
  const __m128i f_1 = _mm_load_si128((__m128i const *)filt4_global_sse2);
1225
64.2k
  const __m128i reg =
1226
64.2k
      load_8bit_8x2_to_1_reg_sse2(src, (int)(sizeof(*src) * src_stride));
1227
1228
64.2k
  data[0] = _mm_shuffle_epi8(reg, f_0);
1229
64.2k
  data[1] = _mm_shuffle_epi8(reg, f_1);
1230
64.2k
  return convolve_lowbd_4tap_ssse3(data, coeffs);
1231
64.2k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_x_4tap_2x2_ssse3
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_x_4tap_2x2_ssse3
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_x_4tap_2x2_ssse3
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_x_4tap_2x2_ssse3
1232
1233
static inline void pack_store_u8_2x2_sse2(const __m128i reg, uint8_t *const dst,
1234
159k
                                          const ptrdiff_t dst_stride) {
1235
159k
  const __m128i data = _mm_packus_epi16(reg, reg);
1236
159k
  *(int16_t *)dst = (int16_t)_mm_cvtsi128_si32(data);
1237
159k
  *(int16_t *)(dst + dst_stride) = (int16_t)_mm_extract_epi16(data, 1);
1238
159k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:pack_store_u8_2x2_sse2
Unexecuted instantiation: highbd_convolve_avx2.c:pack_store_u8_2x2_sse2
Unexecuted instantiation: convolve_2d_avx2.c:pack_store_u8_2x2_sse2
convolve_avx2.c:pack_store_u8_2x2_sse2
Line
Count
Source
1234
159k
                                          const ptrdiff_t dst_stride) {
1235
159k
  const __m128i data = _mm_packus_epi16(reg, reg);
1236
159k
  *(int16_t *)dst = (int16_t)_mm_cvtsi128_si32(data);
1237
  *(int16_t *)(dst + dst_stride) = (int16_t)_mm_extract_epi16(data, 1);
1238
159k
}
Unexecuted instantiation: jnt_convolve_avx2.c:pack_store_u8_2x2_sse2
Unexecuted instantiation: wiener_convolve_avx2.c:pack_store_u8_2x2_sse2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:pack_store_u8_2x2_sse2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:pack_store_u8_2x2_sse2
1239
1240
static inline __m128i convolve_x_2tap_ssse3(const __m128i *data,
1241
53.5k
                                            const __m128i *coeff) {
1242
53.5k
  return _mm_maddubs_epi16(data[0], coeff[0]);
1243
53.5k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_x_2tap_ssse3
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_x_2tap_ssse3
Unexecuted instantiation: convolve_2d_avx2.c:convolve_x_2tap_ssse3
convolve_avx2.c:convolve_x_2tap_ssse3
Line
Count
Source
1241
53.5k
                                            const __m128i *coeff) {
1242
53.5k
  return _mm_maddubs_epi16(data[0], coeff[0]);
1243
53.5k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_x_2tap_ssse3
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_x_2tap_ssse3
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_x_2tap_ssse3
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_x_2tap_ssse3
1244
1245
static inline __m128i load8_x_4x2_sse4(const void *const src,
1246
10.8k
                                       const ptrdiff_t offset) {
1247
10.8k
  const __m128i s = _mm_cvtsi32_si128(loadu_int32(src));
1248
10.8k
  return _mm_insert_epi32(s, loadu_int32((uint8_t *)src + offset), 1);
1249
10.8k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:load8_x_4x2_sse4
Unexecuted instantiation: highbd_convolve_avx2.c:load8_x_4x2_sse4
Unexecuted instantiation: convolve_2d_avx2.c:load8_x_4x2_sse4
convolve_avx2.c:load8_x_4x2_sse4
Line
Count
Source
1246
10.8k
                                       const ptrdiff_t offset) {
1247
10.8k
  const __m128i s = _mm_cvtsi32_si128(loadu_int32(src));
1248
  return _mm_insert_epi32(s, loadu_int32((uint8_t *)src + offset), 1);
1249
10.8k
}
Unexecuted instantiation: jnt_convolve_avx2.c:load8_x_4x2_sse4
Unexecuted instantiation: wiener_convolve_avx2.c:load8_x_4x2_sse4
Unexecuted instantiation: highbd_convolve_2d_avx2.c:load8_x_4x2_sse4
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:load8_x_4x2_sse4
1250
1251
static inline __m128i load_x_u8_4x2_sse4(const uint8_t *const src,
1252
10.8k
                                         const ptrdiff_t stride) {
1253
10.8k
  return load8_x_4x2_sse4(src, sizeof(*src) * stride);
1254
10.8k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:load_x_u8_4x2_sse4
Unexecuted instantiation: highbd_convolve_avx2.c:load_x_u8_4x2_sse4
Unexecuted instantiation: convolve_2d_avx2.c:load_x_u8_4x2_sse4
convolve_avx2.c:load_x_u8_4x2_sse4
Line
Count
Source
1252
10.8k
                                         const ptrdiff_t stride) {
1253
10.8k
  return load8_x_4x2_sse4(src, sizeof(*src) * stride);
1254
10.8k
}
Unexecuted instantiation: jnt_convolve_avx2.c:load_x_u8_4x2_sse4
Unexecuted instantiation: wiener_convolve_avx2.c:load_x_u8_4x2_sse4
Unexecuted instantiation: highbd_convolve_2d_avx2.c:load_x_u8_4x2_sse4
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:load_x_u8_4x2_sse4
1255
1256
static inline __m128i convolve_x_2tap_2x2_ssse3(const uint8_t *const src,
1257
                                                const ptrdiff_t stride,
1258
3.11k
                                                const __m128i *coeffs) {
1259
3.11k
  const __m128i flt = _mm_load_si128((__m128i const *)filt5_global_sse2);
1260
3.11k
  const __m128i reg = load_x_u8_4x2_sse4(src, stride);
1261
3.11k
  const __m128i data = _mm_shuffle_epi8(reg, flt);
1262
3.11k
  return convolve_x_2tap_ssse3(&data, coeffs);
1263
3.11k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_x_2tap_2x2_ssse3
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_x_2tap_2x2_ssse3
Unexecuted instantiation: convolve_2d_avx2.c:convolve_x_2tap_2x2_ssse3
convolve_avx2.c:convolve_x_2tap_2x2_ssse3
Line
Count
Source
1258
3.11k
                                                const __m128i *coeffs) {
1259
3.11k
  const __m128i flt = _mm_load_si128((__m128i const *)filt5_global_sse2);
1260
3.11k
  const __m128i reg = load_x_u8_4x2_sse4(src, stride);
1261
3.11k
  const __m128i data = _mm_shuffle_epi8(reg, flt);
1262
3.11k
  return convolve_x_2tap_ssse3(&data, coeffs);
1263
3.11k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_x_2tap_2x2_ssse3
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_x_2tap_2x2_ssse3
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_x_2tap_2x2_ssse3
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_x_2tap_2x2_ssse3
1264
1265
static inline __m128i convolve_x_2tap_4x2_ssse3(const uint8_t *const src,
1266
                                                const ptrdiff_t stride,
1267
15.1k
                                                const __m128i *coeffs) {
1268
15.1k
  const __m128i flt = _mm_load_si128((__m128i const *)filt1_global_sse2);
1269
15.1k
  const __m128i data =
1270
15.1k
      load_8bit_8x2_to_1_reg_sse2(src, (int)(sizeof(*src) * stride));
1271
15.1k
  const __m128i res = _mm_shuffle_epi8(data, flt);
1272
15.1k
  return convolve_x_2tap_ssse3(&res, coeffs);
1273
15.1k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_x_2tap_4x2_ssse3
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_x_2tap_4x2_ssse3
Unexecuted instantiation: convolve_2d_avx2.c:convolve_x_2tap_4x2_ssse3
convolve_avx2.c:convolve_x_2tap_4x2_ssse3
Line
Count
Source
1267
15.1k
                                                const __m128i *coeffs) {
1268
15.1k
  const __m128i flt = _mm_load_si128((__m128i const *)filt1_global_sse2);
1269
15.1k
  const __m128i data =
1270
15.1k
      load_8bit_8x2_to_1_reg_sse2(src, (int)(sizeof(*src) * stride));
1271
15.1k
  const __m128i res = _mm_shuffle_epi8(data, flt);
1272
15.1k
  return convolve_x_2tap_ssse3(&res, coeffs);
1273
15.1k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_x_2tap_4x2_ssse3
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_x_2tap_4x2_ssse3
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_x_2tap_4x2_ssse3
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_x_2tap_4x2_ssse3
1274
1275
static inline void convolve_x_2tap_8x2_ssse3(const uint8_t *const src,
1276
                                             const ptrdiff_t stride,
1277
                                             const __m128i *coeffs,
1278
17.6k
                                             __m128i *data) {
1279
17.6k
  __m128i res[2];
1280
17.6k
  const __m128i reg_00 = _mm_loadu_si128((__m128i *)src);
1281
17.6k
  const __m128i reg_10 = _mm_loadu_si128((__m128i *)(src + stride));
1282
17.6k
  const __m128i reg_01 = _mm_srli_si128(reg_00, 1);
1283
17.6k
  const __m128i reg_11 = _mm_srli_si128(reg_10, 1);
1284
17.6k
  res[0] = _mm_unpacklo_epi8(reg_00, reg_01);
1285
17.6k
  res[1] = _mm_unpacklo_epi8(reg_10, reg_11);
1286
1287
17.6k
  data[0] = convolve_x_2tap_ssse3(&res[0], coeffs);
1288
17.6k
  data[1] = convolve_x_2tap_ssse3(&res[1], coeffs);
1289
17.6k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_x_2tap_8x2_ssse3
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_x_2tap_8x2_ssse3
Unexecuted instantiation: convolve_2d_avx2.c:convolve_x_2tap_8x2_ssse3
convolve_avx2.c:convolve_x_2tap_8x2_ssse3
Line
Count
Source
1278
17.6k
                                             __m128i *data) {
1279
17.6k
  __m128i res[2];
1280
17.6k
  const __m128i reg_00 = _mm_loadu_si128((__m128i *)src);
1281
17.6k
  const __m128i reg_10 = _mm_loadu_si128((__m128i *)(src + stride));
1282
17.6k
  const __m128i reg_01 = _mm_srli_si128(reg_00, 1);
1283
17.6k
  const __m128i reg_11 = _mm_srli_si128(reg_10, 1);
1284
17.6k
  res[0] = _mm_unpacklo_epi8(reg_00, reg_01);
1285
17.6k
  res[1] = _mm_unpacklo_epi8(reg_10, reg_11);
1286
1287
17.6k
  data[0] = convolve_x_2tap_ssse3(&res[0], coeffs);
1288
17.6k
  data[1] = convolve_x_2tap_ssse3(&res[1], coeffs);
1289
17.6k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_x_2tap_8x2_ssse3
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_x_2tap_8x2_ssse3
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_x_2tap_8x2_ssse3
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_x_2tap_8x2_ssse3
1290
1291
static inline __m256i loadu_x_8bit_16x2_avx2(const void *const src,
1292
776k
                                             const ptrdiff_t offset) {
1293
776k
  const __m128i reg0 = _mm_loadu_si128((__m128i *)src);
1294
776k
  const __m128i reg1 = _mm_loadu_si128((__m128i *)((uint8_t *)src + offset));
1295
776k
  return _mm256_setr_m128i(reg0, reg1);
1296
776k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:loadu_x_8bit_16x2_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:loadu_x_8bit_16x2_avx2
Unexecuted instantiation: convolve_2d_avx2.c:loadu_x_8bit_16x2_avx2
convolve_avx2.c:loadu_x_8bit_16x2_avx2
Line
Count
Source
1292
776k
                                             const ptrdiff_t offset) {
1293
776k
  const __m128i reg0 = _mm_loadu_si128((__m128i *)src);
1294
776k
  const __m128i reg1 = _mm_loadu_si128((__m128i *)((uint8_t *)src + offset));
1295
776k
  return _mm256_setr_m128i(reg0, reg1);
1296
776k
}
Unexecuted instantiation: jnt_convolve_avx2.c:loadu_x_8bit_16x2_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:loadu_x_8bit_16x2_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:loadu_x_8bit_16x2_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:loadu_x_8bit_16x2_avx2
1297
1298
static inline __m256i convolve_x_2tap_avx2(const __m256i *data,
1299
255k
                                           const __m256i *coeffs) {
1300
255k
  return _mm256_maddubs_epi16(data[0], coeffs[0]);
1301
255k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_x_2tap_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_x_2tap_avx2
Unexecuted instantiation: convolve_2d_avx2.c:convolve_x_2tap_avx2
convolve_avx2.c:convolve_x_2tap_avx2
Line
Count
Source
1299
255k
                                           const __m256i *coeffs) {
1300
255k
  return _mm256_maddubs_epi16(data[0], coeffs[0]);
1301
255k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_x_2tap_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_x_2tap_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_x_2tap_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_x_2tap_avx2
1302
1303
static inline void convolve_x_2tap_16x2_avx2(const uint8_t *const src,
1304
                                             const ptrdiff_t stride,
1305
                                             const __m256i *coeffs,
1306
13.6k
                                             __m256i *data) {
1307
13.6k
  const __m256i reg0 = loadu_x_8bit_16x2_avx2(src, stride);
1308
13.6k
  const __m256i reg1 = loadu_x_8bit_16x2_avx2(src + 1, stride);
1309
13.6k
  const __m256i res0 = _mm256_unpacklo_epi8(reg0, reg1);
1310
13.6k
  const __m256i res1 = _mm256_unpackhi_epi8(reg0, reg1);
1311
13.6k
  data[0] = convolve_x_2tap_avx2(&res0, coeffs);
1312
13.6k
  data[1] = convolve_x_2tap_avx2(&res1, coeffs);
1313
13.6k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_x_2tap_16x2_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_x_2tap_16x2_avx2
Unexecuted instantiation: convolve_2d_avx2.c:convolve_x_2tap_16x2_avx2
convolve_avx2.c:convolve_x_2tap_16x2_avx2
Line
Count
Source
1306
13.6k
                                             __m256i *data) {
1307
13.6k
  const __m256i reg0 = loadu_x_8bit_16x2_avx2(src, stride);
1308
13.6k
  const __m256i reg1 = loadu_x_8bit_16x2_avx2(src + 1, stride);
1309
13.6k
  const __m256i res0 = _mm256_unpacklo_epi8(reg0, reg1);
1310
13.6k
  const __m256i res1 = _mm256_unpackhi_epi8(reg0, reg1);
1311
13.6k
  data[0] = convolve_x_2tap_avx2(&res0, coeffs);
1312
13.6k
  data[1] = convolve_x_2tap_avx2(&res1, coeffs);
1313
13.6k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_x_2tap_16x2_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_x_2tap_16x2_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_x_2tap_16x2_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_x_2tap_16x2_avx2
1314
1315
static inline void store_u8_16x2_avx2(const __m256i src, uint8_t *const dst,
1316
2.01M
                                      const ptrdiff_t stride) {
1317
2.01M
  const __m128i reg0 = _mm256_castsi256_si128(src);
1318
2.01M
  const __m128i reg1 = _mm256_extracti128_si256(src, 1);
1319
2.01M
  _mm_storeu_si128((__m128i *)dst, reg0);
1320
2.01M
  _mm_storeu_si128((__m128i *)((uint8_t *)dst + stride), reg1);
1321
2.01M
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:store_u8_16x2_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:store_u8_16x2_avx2
Unexecuted instantiation: convolve_2d_avx2.c:store_u8_16x2_avx2
convolve_avx2.c:store_u8_16x2_avx2
Line
Count
Source
1316
2.01M
                                      const ptrdiff_t stride) {
1317
2.01M
  const __m128i reg0 = _mm256_castsi256_si128(src);
1318
  const __m128i reg1 = _mm256_extracti128_si256(src, 1);
1319
2.01M
  _mm_storeu_si128((__m128i *)dst, reg0);
1320
2.01M
  _mm_storeu_si128((__m128i *)((uint8_t *)dst + stride), reg1);
1321
2.01M
}
Unexecuted instantiation: jnt_convolve_avx2.c:store_u8_16x2_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:store_u8_16x2_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:store_u8_16x2_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:store_u8_16x2_avx2
1322
1323
static inline void store_u8_8x2_avx2(const __m256i src, uint8_t *const dst,
1324
583k
                                     const ptrdiff_t stride) {
1325
583k
  const __m128i reg0 = _mm256_castsi256_si128(src);
1326
583k
  const __m128i reg1 = _mm256_extracti128_si256(src, 1);
1327
583k
  _mm_storel_epi64((__m128i *)dst, reg0);
1328
583k
  _mm_storel_epi64((__m128i *)(dst + stride), reg1);
1329
583k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:store_u8_8x2_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:store_u8_8x2_avx2
Unexecuted instantiation: convolve_2d_avx2.c:store_u8_8x2_avx2
convolve_avx2.c:store_u8_8x2_avx2
Line
Count
Source
1324
583k
                                     const ptrdiff_t stride) {
1325
583k
  const __m128i reg0 = _mm256_castsi256_si128(src);
1326
  const __m128i reg1 = _mm256_extracti128_si256(src, 1);
1327
583k
  _mm_storel_epi64((__m128i *)dst, reg0);
1328
583k
  _mm_storel_epi64((__m128i *)(dst + stride), reg1);
1329
583k
}
Unexecuted instantiation: jnt_convolve_avx2.c:store_u8_8x2_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:store_u8_8x2_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:store_u8_8x2_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:store_u8_8x2_avx2
1330
1331
static inline void pack_store_16x2_avx2(const __m256i data0,
1332
                                        const __m256i data1, uint8_t *const dst,
1333
2.01M
                                        const ptrdiff_t stride) {
1334
2.01M
  const __m256i res = _mm256_packus_epi16(data0, data1);
1335
2.01M
  store_u8_16x2_avx2(res, dst, stride);
1336
2.01M
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:pack_store_16x2_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:pack_store_16x2_avx2
Unexecuted instantiation: convolve_2d_avx2.c:pack_store_16x2_avx2
convolve_avx2.c:pack_store_16x2_avx2
Line
Count
Source
1333
2.01M
                                        const ptrdiff_t stride) {
1334
2.01M
  const __m256i res = _mm256_packus_epi16(data0, data1);
1335
2.01M
  store_u8_16x2_avx2(res, dst, stride);
1336
2.01M
}
Unexecuted instantiation: jnt_convolve_avx2.c:pack_store_16x2_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:pack_store_16x2_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:pack_store_16x2_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:pack_store_16x2_avx2
1337
1338
static inline void pack_store_8x2_avx2(const __m256i data, uint8_t *const dst,
1339
583k
                                       const ptrdiff_t stride) {
1340
583k
  const __m256i res = _mm256_packus_epi16(data, data);
1341
583k
  store_u8_8x2_avx2(res, dst, stride);
1342
583k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:pack_store_8x2_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:pack_store_8x2_avx2
Unexecuted instantiation: convolve_2d_avx2.c:pack_store_8x2_avx2
convolve_avx2.c:pack_store_8x2_avx2
Line
Count
Source
1339
583k
                                       const ptrdiff_t stride) {
1340
583k
  const __m256i res = _mm256_packus_epi16(data, data);
1341
583k
  store_u8_8x2_avx2(res, dst, stride);
1342
583k
}
Unexecuted instantiation: jnt_convolve_avx2.c:pack_store_8x2_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:pack_store_8x2_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:pack_store_8x2_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:pack_store_8x2_avx2
1343
1344
static inline void round_pack_store_16x2_avx2(const __m256i *data,
1345
                                              uint8_t *const dst,
1346
388k
                                              const ptrdiff_t dst_stride) {
1347
388k
  __m256i reg[2];
1348
1349
388k
  reg[0] = round_sr_x_avx2(data[0]);
1350
388k
  reg[1] = round_sr_x_avx2(data[1]);
1351
388k
  pack_store_16x2_avx2(reg[0], reg[1], dst, dst_stride);
1352
388k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:round_pack_store_16x2_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:round_pack_store_16x2_avx2
Unexecuted instantiation: convolve_2d_avx2.c:round_pack_store_16x2_avx2
convolve_avx2.c:round_pack_store_16x2_avx2
Line
Count
Source
1346
388k
                                              const ptrdiff_t dst_stride) {
1347
388k
  __m256i reg[2];
1348
1349
388k
  reg[0] = round_sr_x_avx2(data[0]);
1350
388k
  reg[1] = round_sr_x_avx2(data[1]);
1351
388k
  pack_store_16x2_avx2(reg[0], reg[1], dst, dst_stride);
1352
388k
}
Unexecuted instantiation: jnt_convolve_avx2.c:round_pack_store_16x2_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:round_pack_store_16x2_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:round_pack_store_16x2_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:round_pack_store_16x2_avx2
1353
1354
static inline void convolve_x_2tap_32_avx2(const uint8_t *const src,
1355
                                           const __m256i *coeffs,
1356
114k
                                           __m256i *data) {
1357
114k
  const __m256i res0 = _mm256_loadu_si256((__m256i *)src);
1358
114k
  const __m256i res1 = _mm256_loadu_si256((__m256i *)(src + 1));
1359
114k
  const __m256i reg0 = _mm256_unpacklo_epi8(res0, res1);
1360
114k
  const __m256i reg1 = _mm256_unpackhi_epi8(res0, res1);
1361
1362
114k
  data[0] = convolve_x_2tap_avx2(&reg0, coeffs);
1363
114k
  data[1] = convolve_x_2tap_avx2(&reg1, coeffs);
1364
114k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_x_2tap_32_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_x_2tap_32_avx2
Unexecuted instantiation: convolve_2d_avx2.c:convolve_x_2tap_32_avx2
convolve_avx2.c:convolve_x_2tap_32_avx2
Line
Count
Source
1356
114k
                                           __m256i *data) {
1357
114k
  const __m256i res0 = _mm256_loadu_si256((__m256i *)src);
1358
114k
  const __m256i res1 = _mm256_loadu_si256((__m256i *)(src + 1));
1359
114k
  const __m256i reg0 = _mm256_unpacklo_epi8(res0, res1);
1360
114k
  const __m256i reg1 = _mm256_unpackhi_epi8(res0, res1);
1361
1362
114k
  data[0] = convolve_x_2tap_avx2(&reg0, coeffs);
1363
114k
  data[1] = convolve_x_2tap_avx2(&reg1, coeffs);
1364
114k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_x_2tap_32_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_x_2tap_32_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_x_2tap_32_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_x_2tap_32_avx2
1365
1366
static inline void pack_store_32_avx2(const __m256i data0, const __m256i data1,
1367
1.78M
                                      uint8_t *const dst) {
1368
1.78M
  const __m256i reg = _mm256_packus_epi16(data0, data1);
1369
1.78M
  _mm256_storeu_si256((__m256i *)dst, reg);
1370
1.78M
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:pack_store_32_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:pack_store_32_avx2
Unexecuted instantiation: convolve_2d_avx2.c:pack_store_32_avx2
convolve_avx2.c:pack_store_32_avx2
Line
Count
Source
1367
1.78M
                                      uint8_t *const dst) {
1368
1.78M
  const __m256i reg = _mm256_packus_epi16(data0, data1);
1369
1.78M
  _mm256_storeu_si256((__m256i *)dst, reg);
1370
1.78M
}
Unexecuted instantiation: jnt_convolve_avx2.c:pack_store_32_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:pack_store_32_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:pack_store_32_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:pack_store_32_avx2
1371
1372
static inline void round_pack_store_32_avx2(const __m256i *data,
1373
1.52M
                                            uint8_t *const dst) {
1374
1.52M
  __m256i reg[2];
1375
1376
1.52M
  reg[0] = round_sr_x_avx2(data[0]);
1377
1.52M
  reg[1] = round_sr_x_avx2(data[1]);
1378
1.52M
  pack_store_32_avx2(reg[0], reg[1], dst);
1379
1.52M
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:round_pack_store_32_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:round_pack_store_32_avx2
Unexecuted instantiation: convolve_2d_avx2.c:round_pack_store_32_avx2
convolve_avx2.c:round_pack_store_32_avx2
Line
Count
Source
1373
1.52M
                                            uint8_t *const dst) {
1374
1.52M
  __m256i reg[2];
1375
1376
1.52M
  reg[0] = round_sr_x_avx2(data[0]);
1377
1.52M
  reg[1] = round_sr_x_avx2(data[1]);
1378
1.52M
  pack_store_32_avx2(reg[0], reg[1], dst);
1379
1.52M
}
Unexecuted instantiation: jnt_convolve_avx2.c:round_pack_store_32_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:round_pack_store_32_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:round_pack_store_32_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:round_pack_store_32_avx2
1380
1381
static inline void convolve_round_2tap_32_avx2(const uint8_t *const src,
1382
                                               const __m256i *coeffs,
1383
114k
                                               uint8_t *const dst) {
1384
114k
  __m256i data[2];
1385
1386
114k
  convolve_x_2tap_32_avx2(src, coeffs, data);
1387
114k
  round_pack_store_32_avx2(data, dst);
1388
114k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_round_2tap_32_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_round_2tap_32_avx2
Unexecuted instantiation: convolve_2d_avx2.c:convolve_round_2tap_32_avx2
convolve_avx2.c:convolve_round_2tap_32_avx2
Line
Count
Source
1383
114k
                                               uint8_t *const dst) {
1384
114k
  __m256i data[2];
1385
1386
114k
  convolve_x_2tap_32_avx2(src, coeffs, data);
1387
114k
  round_pack_store_32_avx2(data, dst);
1388
114k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_round_2tap_32_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_round_2tap_32_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_round_2tap_32_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_round_2tap_32_avx2
1389
1390
static inline void load_avg_store_2tap_32_avx2(const uint8_t *const src,
1391
100k
                                               uint8_t *const dst) {
1392
100k
  const __m256i res0 = _mm256_loadu_si256((__m256i *)src);
1393
100k
  const __m256i res1 = _mm256_loadu_si256((__m256i *)(src + 1));
1394
100k
  const __m256i data = _mm256_avg_epu8(res0, res1);
1395
100k
  _mm256_storeu_si256((__m256i *)dst, data);
1396
100k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:load_avg_store_2tap_32_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:load_avg_store_2tap_32_avx2
Unexecuted instantiation: convolve_2d_avx2.c:load_avg_store_2tap_32_avx2
convolve_avx2.c:load_avg_store_2tap_32_avx2
Line
Count
Source
1391
100k
                                               uint8_t *const dst) {
1392
100k
  const __m256i res0 = _mm256_loadu_si256((__m256i *)src);
1393
100k
  const __m256i res1 = _mm256_loadu_si256((__m256i *)(src + 1));
1394
100k
  const __m256i data = _mm256_avg_epu8(res0, res1);
1395
100k
  _mm256_storeu_si256((__m256i *)dst, data);
1396
100k
}
Unexecuted instantiation: jnt_convolve_avx2.c:load_avg_store_2tap_32_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:load_avg_store_2tap_32_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:load_avg_store_2tap_32_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:load_avg_store_2tap_32_avx2
1397
1398
static inline __m256i load_convolve_8tap_8x2_avx2(const uint8_t *const src,
1399
                                                  const ptrdiff_t stride,
1400
                                                  const __m256i *coeffs,
1401
57.3k
                                                  const __m256i *flt) {
1402
57.3k
  const __m256i res = loadu_x_8bit_16x2_avx2(src, stride);
1403
57.3k
  return convolve_lowbd_x(res, coeffs, flt);
1404
57.3k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:load_convolve_8tap_8x2_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:load_convolve_8tap_8x2_avx2
Unexecuted instantiation: convolve_2d_avx2.c:load_convolve_8tap_8x2_avx2
convolve_avx2.c:load_convolve_8tap_8x2_avx2
Line
Count
Source
1401
57.3k
                                                  const __m256i *flt) {
1402
57.3k
  const __m256i res = loadu_x_8bit_16x2_avx2(src, stride);
1403
57.3k
  return convolve_lowbd_x(res, coeffs, flt);
1404
57.3k
}
Unexecuted instantiation: jnt_convolve_avx2.c:load_convolve_8tap_8x2_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:load_convolve_8tap_8x2_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:load_convolve_8tap_8x2_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:load_convolve_8tap_8x2_avx2
1405
1406
static inline void load_convolve_8tap_16x2_avx2(const uint8_t *const src,
1407
                                                const int32_t src_stride,
1408
                                                const __m256i *coeffs,
1409
                                                const __m256i *flt,
1410
28.6k
                                                __m256i *reg) {
1411
28.6k
  reg[0] = load_convolve_8tap_8x2_avx2(src + 0, src_stride, coeffs, flt);
1412
28.6k
  reg[1] = load_convolve_8tap_8x2_avx2(src + 8, src_stride, coeffs, flt);
1413
28.6k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:load_convolve_8tap_16x2_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:load_convolve_8tap_16x2_avx2
Unexecuted instantiation: convolve_2d_avx2.c:load_convolve_8tap_16x2_avx2
convolve_avx2.c:load_convolve_8tap_16x2_avx2
Line
Count
Source
1410
28.6k
                                                __m256i *reg) {
1411
28.6k
  reg[0] = load_convolve_8tap_8x2_avx2(src + 0, src_stride, coeffs, flt);
1412
28.6k
  reg[1] = load_convolve_8tap_8x2_avx2(src + 8, src_stride, coeffs, flt);
1413
28.6k
}
Unexecuted instantiation: jnt_convolve_avx2.c:load_convolve_8tap_16x2_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:load_convolve_8tap_16x2_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:load_convolve_8tap_16x2_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:load_convolve_8tap_16x2_avx2
1414
1415
static inline void load_convolve_8tap_32_avx2(const uint8_t *const src,
1416
                                              const __m256i *coeffs,
1417
                                              const __m256i *filt,
1418
155k
                                              __m256i *data) {
1419
155k
  const __m256i reg_0 = _mm256_loadu_si256((__m256i *)src);
1420
155k
  const __m256i reg_8 = _mm256_loadu_si256((__m256i *)(src + 8));
1421
1422
155k
  data[0] = convolve_lowbd_x(reg_0, coeffs, filt);
1423
155k
  data[1] = convolve_lowbd_x(reg_8, coeffs, filt);
1424
155k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:load_convolve_8tap_32_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:load_convolve_8tap_32_avx2
Unexecuted instantiation: convolve_2d_avx2.c:load_convolve_8tap_32_avx2
convolve_avx2.c:load_convolve_8tap_32_avx2
Line
Count
Source
1418
155k
                                              __m256i *data) {
1419
155k
  const __m256i reg_0 = _mm256_loadu_si256((__m256i *)src);
1420
155k
  const __m256i reg_8 = _mm256_loadu_si256((__m256i *)(src + 8));
1421
1422
155k
  data[0] = convolve_lowbd_x(reg_0, coeffs, filt);
1423
155k
  data[1] = convolve_lowbd_x(reg_8, coeffs, filt);
1424
155k
}
Unexecuted instantiation: jnt_convolve_avx2.c:load_convolve_8tap_32_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:load_convolve_8tap_32_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:load_convolve_8tap_32_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:load_convolve_8tap_32_avx2
1425
1426
static inline void load_convolve_round_8tap_32_avx2(const uint8_t *const src,
1427
                                                    const __m256i *coeffs,
1428
                                                    const __m256i *filt,
1429
155k
                                                    uint8_t *const dst) {
1430
155k
  __m256i data[2];
1431
1432
155k
  load_convolve_8tap_32_avx2(src, coeffs, filt, data);
1433
155k
  round_pack_store_32_avx2(data, dst);
1434
155k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:load_convolve_round_8tap_32_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:load_convolve_round_8tap_32_avx2
Unexecuted instantiation: convolve_2d_avx2.c:load_convolve_round_8tap_32_avx2
convolve_avx2.c:load_convolve_round_8tap_32_avx2
Line
Count
Source
1429
155k
                                                    uint8_t *const dst) {
1430
155k
  __m256i data[2];
1431
1432
155k
  load_convolve_8tap_32_avx2(src, coeffs, filt, data);
1433
155k
  round_pack_store_32_avx2(data, dst);
1434
155k
}
Unexecuted instantiation: jnt_convolve_avx2.c:load_convolve_round_8tap_32_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:load_convolve_round_8tap_32_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:load_convolve_round_8tap_32_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:load_convolve_round_8tap_32_avx2
1435
1436
static inline void load_convolve_6tap_32_avx2(const uint8_t *const src,
1437
                                              const __m256i *coeffs,
1438
                                              const __m256i *filt,
1439
1.25M
                                              __m256i *data) {
1440
1.25M
  const __m256i reg0 = _mm256_loadu_si256((__m256i *)src);
1441
1.25M
  const __m256i reg1 = _mm256_loadu_si256((__m256i *)(src + 8));
1442
1443
1.25M
  data[0] = convolve_lowbd_x_6tap(reg0, coeffs, filt);
1444
1.25M
  data[1] = convolve_lowbd_x_6tap(reg1, coeffs, filt);
1445
1.25M
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:load_convolve_6tap_32_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:load_convolve_6tap_32_avx2
Unexecuted instantiation: convolve_2d_avx2.c:load_convolve_6tap_32_avx2
convolve_avx2.c:load_convolve_6tap_32_avx2
Line
Count
Source
1439
1.25M
                                              __m256i *data) {
1440
1.25M
  const __m256i reg0 = _mm256_loadu_si256((__m256i *)src);
1441
1.25M
  const __m256i reg1 = _mm256_loadu_si256((__m256i *)(src + 8));
1442
1443
1.25M
  data[0] = convolve_lowbd_x_6tap(reg0, coeffs, filt);
1444
1.25M
  data[1] = convolve_lowbd_x_6tap(reg1, coeffs, filt);
1445
1.25M
}
Unexecuted instantiation: jnt_convolve_avx2.c:load_convolve_6tap_32_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:load_convolve_6tap_32_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:load_convolve_6tap_32_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:load_convolve_6tap_32_avx2
1446
1447
static inline void convolve_sr_store_6tap_32_avx2(const uint8_t *const src,
1448
                                                  const __m256i *coeffs,
1449
                                                  const __m256i *filt,
1450
1.25M
                                                  uint8_t *const dst) {
1451
1.25M
  __m256i data[2];
1452
1453
1.25M
  load_convolve_6tap_32_avx2(src, coeffs, filt, data);
1454
1.25M
  round_pack_store_32_avx2(data, dst);
1455
1.25M
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_sr_store_6tap_32_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_sr_store_6tap_32_avx2
Unexecuted instantiation: convolve_2d_avx2.c:convolve_sr_store_6tap_32_avx2
convolve_avx2.c:convolve_sr_store_6tap_32_avx2
Line
Count
Source
1450
1.25M
                                                  uint8_t *const dst) {
1451
1.25M
  __m256i data[2];
1452
1453
1.25M
  load_convolve_6tap_32_avx2(src, coeffs, filt, data);
1454
1.25M
  round_pack_store_32_avx2(data, dst);
1455
1.25M
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_sr_store_6tap_32_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_sr_store_6tap_32_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_sr_store_6tap_32_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_sr_store_6tap_32_avx2
1456
1457
static inline __m256i load_convolve_6tap_8x2_avx2(const uint8_t *const src,
1458
                                                  const ptrdiff_t stride,
1459
                                                  const __m256i *coeffs,
1460
691k
                                                  const __m256i *filt) {
1461
691k
  const __m256i data = loadu_x_8bit_16x2_avx2(src, stride);
1462
691k
  return convolve_lowbd_x_6tap(data, coeffs, filt);
1463
691k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:load_convolve_6tap_8x2_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:load_convolve_6tap_8x2_avx2
Unexecuted instantiation: convolve_2d_avx2.c:load_convolve_6tap_8x2_avx2
convolve_avx2.c:load_convolve_6tap_8x2_avx2
Line
Count
Source
1460
691k
                                                  const __m256i *filt) {
1461
691k
  const __m256i data = loadu_x_8bit_16x2_avx2(src, stride);
1462
691k
  return convolve_lowbd_x_6tap(data, coeffs, filt);
1463
691k
}
Unexecuted instantiation: jnt_convolve_avx2.c:load_convolve_6tap_8x2_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:load_convolve_6tap_8x2_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:load_convolve_6tap_8x2_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:load_convolve_6tap_8x2_avx2
1464
1465
static inline void load_convolve_6tap_16x2_avx2(const uint8_t *const src,
1466
                                                const int32_t src_stride,
1467
                                                const __m256i *coeffs,
1468
                                                const __m256i *filt,
1469
345k
                                                __m256i *data) {
1470
345k
  data[0] = load_convolve_6tap_8x2_avx2(src + 0, src_stride, coeffs, filt);
1471
345k
  data[1] = load_convolve_6tap_8x2_avx2(src + 8, src_stride, coeffs, filt);
1472
345k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:load_convolve_6tap_16x2_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:load_convolve_6tap_16x2_avx2
Unexecuted instantiation: convolve_2d_avx2.c:load_convolve_6tap_16x2_avx2
convolve_avx2.c:load_convolve_6tap_16x2_avx2
Line
Count
Source
1469
345k
                                                __m256i *data) {
1470
345k
  data[0] = load_convolve_6tap_8x2_avx2(src + 0, src_stride, coeffs, filt);
1471
345k
  data[1] = load_convolve_6tap_8x2_avx2(src + 8, src_stride, coeffs, filt);
1472
345k
}
Unexecuted instantiation: jnt_convolve_avx2.c:load_convolve_6tap_16x2_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:load_convolve_6tap_16x2_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:load_convolve_6tap_16x2_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:load_convolve_6tap_16x2_avx2
1473
1474
611k
static inline __m128i round_sr_y_ssse3(const __m128i data) {
1475
611k
  const __m128i value = _mm_set1_epi16(32);
1476
611k
  const __m128i reg = _mm_add_epi16(data, value);
1477
611k
  return _mm_srai_epi16(reg, FILTER_BITS - 1);
1478
611k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:round_sr_y_ssse3
Unexecuted instantiation: highbd_convolve_avx2.c:round_sr_y_ssse3
Unexecuted instantiation: convolve_2d_avx2.c:round_sr_y_ssse3
convolve_avx2.c:round_sr_y_ssse3
Line
Count
Source
1474
611k
static inline __m128i round_sr_y_ssse3(const __m128i data) {
1475
611k
  const __m128i value = _mm_set1_epi16(32);
1476
611k
  const __m128i reg = _mm_add_epi16(data, value);
1477
611k
  return _mm_srai_epi16(reg, FILTER_BITS - 1);
1478
611k
}
Unexecuted instantiation: jnt_convolve_avx2.c:round_sr_y_ssse3
Unexecuted instantiation: wiener_convolve_avx2.c:round_sr_y_ssse3
Unexecuted instantiation: highbd_convolve_2d_avx2.c:round_sr_y_ssse3
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:round_sr_y_ssse3
1479
1480
4.35M
static inline __m256i round_sr_y_avx2(const __m256i data) {
1481
4.35M
  const __m256i value = _mm256_set1_epi16(32);
1482
4.35M
  const __m256i reg = _mm256_add_epi16(data, value);
1483
4.35M
  return _mm256_srai_epi16(reg, FILTER_BITS - 1);
1484
4.35M
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:round_sr_y_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:round_sr_y_avx2
Unexecuted instantiation: convolve_2d_avx2.c:round_sr_y_avx2
convolve_avx2.c:round_sr_y_avx2
Line
Count
Source
1480
4.35M
static inline __m256i round_sr_y_avx2(const __m256i data) {
1481
4.35M
  const __m256i value = _mm256_set1_epi16(32);
1482
4.35M
  const __m256i reg = _mm256_add_epi16(data, value);
1483
4.35M
  return _mm256_srai_epi16(reg, FILTER_BITS - 1);
1484
4.35M
}
Unexecuted instantiation: jnt_convolve_avx2.c:round_sr_y_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:round_sr_y_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:round_sr_y_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:round_sr_y_avx2
1485
1486
static inline void round_pack_store_y_8x2_avx2(const __m256i res,
1487
                                               uint8_t *const dst,
1488
583k
                                               const ptrdiff_t dst_stride) {
1489
583k
  __m256i r;
1490
1491
583k
  r = round_sr_y_avx2(res);
1492
583k
  pack_store_8x2_avx2(r, dst, dst_stride);
1493
583k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:round_pack_store_y_8x2_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:round_pack_store_y_8x2_avx2
Unexecuted instantiation: convolve_2d_avx2.c:round_pack_store_y_8x2_avx2
convolve_avx2.c:round_pack_store_y_8x2_avx2
Line
Count
Source
1488
583k
                                               const ptrdiff_t dst_stride) {
1489
583k
  __m256i r;
1490
1491
583k
  r = round_sr_y_avx2(res);
1492
583k
  pack_store_8x2_avx2(r, dst, dst_stride);
1493
583k
}
Unexecuted instantiation: jnt_convolve_avx2.c:round_pack_store_y_8x2_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:round_pack_store_y_8x2_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:round_pack_store_y_8x2_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:round_pack_store_y_8x2_avx2
1494
1495
static inline void round_pack_store_y_16x2_avx2(const __m256i res[2],
1496
                                                uint8_t *const dst,
1497
1.62M
                                                const ptrdiff_t dst_stride) {
1498
1.62M
  __m256i r[2];
1499
1500
1.62M
  r[0] = round_sr_y_avx2(res[0]);
1501
1.62M
  r[1] = round_sr_y_avx2(res[1]);
1502
1.62M
  pack_store_16x2_avx2(r[0], r[1], dst, dst_stride);
1503
1.62M
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:round_pack_store_y_16x2_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:round_pack_store_y_16x2_avx2
Unexecuted instantiation: convolve_2d_avx2.c:round_pack_store_y_16x2_avx2
convolve_avx2.c:round_pack_store_y_16x2_avx2
Line
Count
Source
1497
1.62M
                                                const ptrdiff_t dst_stride) {
1498
1.62M
  __m256i r[2];
1499
1500
1.62M
  r[0] = round_sr_y_avx2(res[0]);
1501
1.62M
  r[1] = round_sr_y_avx2(res[1]);
1502
1.62M
  pack_store_16x2_avx2(r[0], r[1], dst, dst_stride);
1503
1.62M
}
Unexecuted instantiation: jnt_convolve_avx2.c:round_pack_store_y_16x2_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:round_pack_store_y_16x2_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:round_pack_store_y_16x2_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:round_pack_store_y_16x2_avx2
1504
1505
static inline void round_pack_store_y_32_avx2(const __m256i res[2],
1506
256k
                                              uint8_t *const dst) {
1507
256k
  __m256i r[2];
1508
1509
256k
  r[0] = round_sr_y_avx2(res[0]);
1510
256k
  r[1] = round_sr_y_avx2(res[1]);
1511
256k
  pack_store_32_avx2(r[0], r[1], dst);
1512
256k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:round_pack_store_y_32_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:round_pack_store_y_32_avx2
Unexecuted instantiation: convolve_2d_avx2.c:round_pack_store_y_32_avx2
convolve_avx2.c:round_pack_store_y_32_avx2
Line
Count
Source
1506
256k
                                              uint8_t *const dst) {
1507
256k
  __m256i r[2];
1508
1509
256k
  r[0] = round_sr_y_avx2(res[0]);
1510
256k
  r[1] = round_sr_y_avx2(res[1]);
1511
256k
  pack_store_32_avx2(r[0], r[1], dst);
1512
256k
}
Unexecuted instantiation: jnt_convolve_avx2.c:round_pack_store_y_32_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:round_pack_store_y_32_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:round_pack_store_y_32_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:round_pack_store_y_32_avx2
1513
1514
static inline void round_pack_store_y_32x2_avx2(const __m256i res[4],
1515
                                                uint8_t *const dst,
1516
128k
                                                const ptrdiff_t dst_stride) {
1517
128k
  round_pack_store_y_32_avx2(res, dst);
1518
128k
  round_pack_store_y_32_avx2(res + 2, dst + dst_stride);
1519
128k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:round_pack_store_y_32x2_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:round_pack_store_y_32x2_avx2
Unexecuted instantiation: convolve_2d_avx2.c:round_pack_store_y_32x2_avx2
convolve_avx2.c:round_pack_store_y_32x2_avx2
Line
Count
Source
1516
128k
                                                const ptrdiff_t dst_stride) {
1517
128k
  round_pack_store_y_32_avx2(res, dst);
1518
128k
  round_pack_store_y_32_avx2(res + 2, dst + dst_stride);
1519
128k
}
Unexecuted instantiation: jnt_convolve_avx2.c:round_pack_store_y_32x2_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:round_pack_store_y_32x2_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:round_pack_store_y_32x2_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:round_pack_store_y_32x2_avx2
1520
1521
static inline void convolve_y_2tap_2x2_ssse3(const uint8_t *const data,
1522
                                             const ptrdiff_t stride,
1523
                                             const __m128i *coeffs,
1524
3.55k
                                             __m128i d[2], __m128i *res) {
1525
3.55k
  d[1] = _mm_cvtsi32_si128(loadu_int16(data + 1 * stride));
1526
3.55k
  const __m128i src_01a = _mm_unpacklo_epi16(d[0], d[1]);
1527
3.55k
  d[0] = _mm_cvtsi32_si128(loadu_int16(data + 2 * stride));
1528
3.55k
  const __m128i src_12a = _mm_unpacklo_epi16(d[1], d[0]);
1529
1530
3.55k
  const __m128i s = _mm_unpacklo_epi8(src_01a, src_12a);
1531
1532
3.55k
  *res = _mm_maddubs_epi16(s, coeffs[0]);
1533
3.55k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_2tap_2x2_ssse3
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_2tap_2x2_ssse3
Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_2tap_2x2_ssse3
convolve_avx2.c:convolve_y_2tap_2x2_ssse3
Line
Count
Source
1524
3.55k
                                             __m128i d[2], __m128i *res) {
1525
3.55k
  d[1] = _mm_cvtsi32_si128(loadu_int16(data + 1 * stride));
1526
3.55k
  const __m128i src_01a = _mm_unpacklo_epi16(d[0], d[1]);
1527
3.55k
  d[0] = _mm_cvtsi32_si128(loadu_int16(data + 2 * stride));
1528
3.55k
  const __m128i src_12a = _mm_unpacklo_epi16(d[1], d[0]);
1529
1530
3.55k
  const __m128i s = _mm_unpacklo_epi8(src_01a, src_12a);
1531
1532
3.55k
  *res = _mm_maddubs_epi16(s, coeffs[0]);
1533
3.55k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_2tap_2x2_ssse3
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_2tap_2x2_ssse3
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_2tap_2x2_ssse3
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_2tap_2x2_ssse3
1534
1535
static inline void convolve_y_4tap_2x2_ssse3(const uint8_t *const data,
1536
                                             const ptrdiff_t stride,
1537
                                             const __m128i coeffs[2],
1538
                                             __m128i d[4], __m128i s[2],
1539
34.1k
                                             __m128i *res) {
1540
34.1k
  d[3] = _mm_cvtsi32_si128(loadu_int16(data + 3 * stride));
1541
34.1k
  const __m128i src_23a = _mm_unpacklo_epi16(d[2], d[3]);
1542
34.1k
  d[2] = _mm_cvtsi32_si128(loadu_int16(data + 4 * stride));
1543
34.1k
  const __m128i src_34a = _mm_unpacklo_epi16(d[3], d[2]);
1544
1545
34.1k
  s[1] = _mm_unpacklo_epi8(src_23a, src_34a);
1546
1547
34.1k
  *res = convolve_lowbd_4tap_ssse3(s, coeffs);
1548
34.1k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_4tap_2x2_ssse3
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_4tap_2x2_ssse3
Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_4tap_2x2_ssse3
convolve_avx2.c:convolve_y_4tap_2x2_ssse3
Line
Count
Source
1539
34.1k
                                             __m128i *res) {
1540
34.1k
  d[3] = _mm_cvtsi32_si128(loadu_int16(data + 3 * stride));
1541
34.1k
  const __m128i src_23a = _mm_unpacklo_epi16(d[2], d[3]);
1542
34.1k
  d[2] = _mm_cvtsi32_si128(loadu_int16(data + 4 * stride));
1543
34.1k
  const __m128i src_34a = _mm_unpacklo_epi16(d[3], d[2]);
1544
1545
34.1k
  s[1] = _mm_unpacklo_epi8(src_23a, src_34a);
1546
1547
34.1k
  *res = convolve_lowbd_4tap_ssse3(s, coeffs);
1548
34.1k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_4tap_2x2_ssse3
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_4tap_2x2_ssse3
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_4tap_2x2_ssse3
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_4tap_2x2_ssse3
1549
1550
static inline void convolve_y_6tap_2x2_ssse3(const uint8_t *const data,
1551
                                             const ptrdiff_t stride,
1552
                                             const __m128i coeffs[3],
1553
                                             __m128i d[6], __m128i s[3],
1554
48.7k
                                             __m128i *res) {
1555
48.7k
  d[5] = _mm_cvtsi32_si128(loadu_int16(data + 5 * stride));
1556
48.7k
  const __m128i src_45a = _mm_unpacklo_epi16(d[4], d[5]);
1557
48.7k
  d[4] = _mm_cvtsi32_si128(loadu_int16(data + 6 * stride));
1558
48.7k
  const __m128i src_56a = _mm_unpacklo_epi16(d[5], d[4]);
1559
1560
48.7k
  s[2] = _mm_unpacklo_epi8(src_45a, src_56a);
1561
1562
48.7k
  *res = convolve_lowbd_6tap_ssse3(s, coeffs);
1563
48.7k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_6tap_2x2_ssse3
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_6tap_2x2_ssse3
Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_6tap_2x2_ssse3
convolve_avx2.c:convolve_y_6tap_2x2_ssse3
Line
Count
Source
1554
48.7k
                                             __m128i *res) {
1555
48.7k
  d[5] = _mm_cvtsi32_si128(loadu_int16(data + 5 * stride));
1556
48.7k
  const __m128i src_45a = _mm_unpacklo_epi16(d[4], d[5]);
1557
48.7k
  d[4] = _mm_cvtsi32_si128(loadu_int16(data + 6 * stride));
1558
48.7k
  const __m128i src_56a = _mm_unpacklo_epi16(d[5], d[4]);
1559
1560
48.7k
  s[2] = _mm_unpacklo_epi8(src_45a, src_56a);
1561
1562
48.7k
  *res = convolve_lowbd_6tap_ssse3(s, coeffs);
1563
48.7k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_6tap_2x2_ssse3
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_6tap_2x2_ssse3
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_6tap_2x2_ssse3
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_6tap_2x2_ssse3
1564
1565
// One vertical 8-tap filter step for a 2-pixel-wide block, two output rows
// per call (SSSE3).
//
// data/stride : source pointer and row pitch; rows 7 and 8 relative to `data`
//               are loaded here via loadu_int16 (declared outside this chunk;
//               presumably an unaligned 2-byte load -- confirm).
// coeffs      : passed through unchanged to convolve_lowbd_ssse3.
// d           : row cache. d[6] is read on entry (must be set by the caller),
//               d[7] is written with row 7, then d[6] is overwritten with
//               row 8. Presumably the next call reuses d[6] -- confirm.
// s           : interleaved row-pair registers. Only s[3] is written here;
//               s[0..2] are handed to the helper as-is and must already be
//               filled by the caller.
// res         : receives the value returned by convolve_lowbd_ssse3.
static inline void convolve_y_8tap_2x2_ssse3(const uint8_t *const data,
1566
                                             const ptrdiff_t stride,
1567
                                             const __m128i coeffs[4],
1568
                                             __m128i d[8], __m128i s[4],
1569
5.93k
                                             __m128i *res) {
1570
5.93k
  d[7] = _mm_cvtsi32_si128(loadu_int16(data + 7 * stride));
1571
5.93k
  const __m128i src_67a = _mm_unpacklo_epi16(d[6], d[7]);
1572
5.93k
  // The old d[6] was consumed by src_67a above; reuse the slot for row 8.
  d[6] = _mm_cvtsi32_si128(loadu_int16(data + 8 * stride));
1573
5.93k
  const __m128i src_78a = _mm_unpacklo_epi16(d[7], d[6]);
1574
1575
5.93k
  // Byte interleave: (row6,row7) pairs for both pixels, then (row7,row8)
  // pairs for both pixels.
  s[3] = _mm_unpacklo_epi8(src_67a, src_78a);
1576
1577
5.93k
  *res = convolve_lowbd_ssse3(s, coeffs);
1578
5.93k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_8tap_2x2_ssse3
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_8tap_2x2_ssse3
Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_8tap_2x2_ssse3
convolve_avx2.c:convolve_y_8tap_2x2_ssse3
Line
Count
Source
1569
5.93k
                                             __m128i *res) {
1570
5.93k
  d[7] = _mm_cvtsi32_si128(loadu_int16(data + 7 * stride));
1571
5.93k
  const __m128i src_67a = _mm_unpacklo_epi16(d[6], d[7]);
1572
5.93k
  d[6] = _mm_cvtsi32_si128(loadu_int16(data + 8 * stride));
1573
5.93k
  const __m128i src_78a = _mm_unpacklo_epi16(d[7], d[6]);
1574
1575
5.93k
  s[3] = _mm_unpacklo_epi8(src_67a, src_78a);
1576
1577
5.93k
  *res = convolve_lowbd_ssse3(s, coeffs);
1578
5.93k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_8tap_2x2_ssse3
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_8tap_2x2_ssse3
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_8tap_2x2_ssse3
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_8tap_2x2_ssse3
1579
1580
// One vertical 2-tap filter step for a 4-pixel-wide block, two output rows
// per call (SSSE3).
//
// data/stride : source pointer and row pitch; rows 1 and 2 relative to `data`
//               are loaded here via loadu_int32 (declared outside this chunk;
//               presumably an unaligned 4-byte load -- confirm).
// coeffs      : only coeffs[0] is read. It is the signed-byte operand of
//               _mm_maddubs_epi16, so it presumably holds the two taps
//               repeated for every byte pair -- confirm at the call site.
// d           : row cache. d[0] is read on entry (must be set by the caller),
//               d[1] is written with row 1, then d[0] is overwritten with
//               row 2.
// res         : receives the 16-bit sums from _mm_maddubs_epi16. No rounding
//               or shifting is applied in this function.
static inline void convolve_y_2tap_4x2_ssse3(const uint8_t *const data,
1581
                                             const ptrdiff_t stride,
1582
                                             const __m128i *coeffs,
1583
15.3k
                                             __m128i d[2], __m128i *res) {
1584
15.3k
  d[1] = _mm_cvtsi32_si128(loadu_int32(data + 1 * stride));
1585
15.3k
  const __m128i src_01a = _mm_unpacklo_epi32(d[0], d[1]);
1586
15.3k
  // The old d[0] was consumed by src_01a above; reuse the slot for row 2.
  d[0] = _mm_cvtsi32_si128(loadu_int32(data + 2 * stride));
1587
15.3k
  const __m128i src_12a = _mm_unpacklo_epi32(d[1], d[0]);
1588
1589
15.3k
  // Low 8 bytes: (row0,row1) pairs for 4 pixels; high 8 bytes: (row1,row2)
  // pairs for the same 4 pixels.
  const __m128i s = _mm_unpacklo_epi8(src_01a, src_12a);
1590
1591
15.3k
  *res = _mm_maddubs_epi16(s, coeffs[0]);
1592
15.3k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_2tap_4x2_ssse3
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_2tap_4x2_ssse3
Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_2tap_4x2_ssse3
convolve_avx2.c:convolve_y_2tap_4x2_ssse3
Line
Count
Source
1583
15.3k
                                             __m128i d[2], __m128i *res) {
1584
15.3k
  d[1] = _mm_cvtsi32_si128(loadu_int32(data + 1 * stride));
1585
15.3k
  const __m128i src_01a = _mm_unpacklo_epi32(d[0], d[1]);
1586
15.3k
  d[0] = _mm_cvtsi32_si128(loadu_int32(data + 2 * stride));
1587
15.3k
  const __m128i src_12a = _mm_unpacklo_epi32(d[1], d[0]);
1588
1589
15.3k
  const __m128i s = _mm_unpacklo_epi8(src_01a, src_12a);
1590
1591
15.3k
  *res = _mm_maddubs_epi16(s, coeffs[0]);
1592
15.3k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_2tap_4x2_ssse3
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_2tap_4x2_ssse3
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_2tap_4x2_ssse3
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_2tap_4x2_ssse3
1593
1594
// One vertical 4-tap filter step for a 4-pixel-wide block, two output rows
// per call (SSSE3).
//
// data/stride : source pointer and row pitch; rows 3 and 4 relative to `data`
//               are loaded here via loadu_int32 (declared outside this chunk;
//               presumably an unaligned 4-byte load -- confirm).
// coeffs      : passed through unchanged to convolve_lowbd_4tap_ssse3.
// d           : row cache. d[2] is read on entry (must be set by the caller),
//               d[3] is written with row 3, then d[2] is overwritten with
//               row 4.
// s           : interleaved row-pair registers. Only s[1] is written here;
//               s[0] is handed to the helper as-is and must already be filled
//               by the caller.
// res         : receives the value returned by convolve_lowbd_4tap_ssse3.
static inline void convolve_y_4tap_4x2_ssse3(const uint8_t *const data,
1595
                                             const ptrdiff_t stride,
1596
                                             const __m128i coeffs[2],
1597
                                             __m128i d[4], __m128i s[2],
1598
194k
                                             __m128i *res) {
1599
194k
  d[3] = _mm_cvtsi32_si128(loadu_int32(data + 3 * stride));
1600
194k
  const __m128i src_23a = _mm_unpacklo_epi32(d[2], d[3]);
1601
194k
  // The old d[2] was consumed by src_23a above; reuse the slot for row 4.
  d[2] = _mm_cvtsi32_si128(loadu_int32(data + 4 * stride));
1602
194k
  const __m128i src_34a = _mm_unpacklo_epi32(d[3], d[2]);
1603
1604
194k
  // Low 8 bytes: (row2,row3) pairs; high 8 bytes: (row3,row4) pairs.
  s[1] = _mm_unpacklo_epi8(src_23a, src_34a);
1605
1606
194k
  *res = convolve_lowbd_4tap_ssse3(s, coeffs);
1607
194k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_4tap_4x2_ssse3
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_4tap_4x2_ssse3
Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_4tap_4x2_ssse3
convolve_avx2.c:convolve_y_4tap_4x2_ssse3
Line
Count
Source
1598
194k
                                             __m128i *res) {
1599
194k
  d[3] = _mm_cvtsi32_si128(loadu_int32(data + 3 * stride));
1600
194k
  const __m128i src_23a = _mm_unpacklo_epi32(d[2], d[3]);
1601
194k
  d[2] = _mm_cvtsi32_si128(loadu_int32(data + 4 * stride));
1602
194k
  const __m128i src_34a = _mm_unpacklo_epi32(d[3], d[2]);
1603
1604
194k
  s[1] = _mm_unpacklo_epi8(src_23a, src_34a);
1605
1606
194k
  *res = convolve_lowbd_4tap_ssse3(s, coeffs);
1607
194k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_4tap_4x2_ssse3
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_4tap_4x2_ssse3
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_4tap_4x2_ssse3
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_4tap_4x2_ssse3
1608
1609
// One vertical 6-tap filter step for a 4-pixel-wide block, two output rows
// per call (SSSE3).
//
// data/stride : source pointer and row pitch; rows 5 and 6 relative to `data`
//               are loaded here via loadu_int32 (declared outside this chunk;
//               presumably an unaligned 4-byte load -- confirm).
// coeffs      : passed through unchanged to convolve_lowbd_6tap_ssse3.
// d           : row cache. d[4] is read on entry (must be set by the caller),
//               d[5] is written with row 5, then d[4] is overwritten with
//               row 6.
// s           : interleaved row-pair registers. Only s[2] is written here;
//               s[0] and s[1] are handed to the helper as-is and must already
//               be filled by the caller.
// res         : receives the value returned by convolve_lowbd_6tap_ssse3.
static inline void convolve_y_6tap_4x2_ssse3(const uint8_t *const data,
1610
                                             const ptrdiff_t stride,
1611
                                             const __m128i coeffs[3],
1612
                                             __m128i d[6], __m128i s[3],
1613
279k
                                             __m128i *res) {
1614
279k
  d[5] = _mm_cvtsi32_si128(loadu_int32(data + 5 * stride));
1615
279k
  const __m128i src_45a = _mm_unpacklo_epi32(d[4], d[5]);
1616
279k
  // The old d[4] was consumed by src_45a above; reuse the slot for row 6.
  d[4] = _mm_cvtsi32_si128(loadu_int32(data + 6 * stride));
1617
279k
  const __m128i src_56a = _mm_unpacklo_epi32(d[5], d[4]);
1618
1619
279k
  // Low 8 bytes: (row4,row5) pairs; high 8 bytes: (row5,row6) pairs.
  s[2] = _mm_unpacklo_epi8(src_45a, src_56a);
1620
1621
279k
  *res = convolve_lowbd_6tap_ssse3(s, coeffs);
1622
279k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_6tap_4x2_ssse3
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_6tap_4x2_ssse3
Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_6tap_4x2_ssse3
convolve_avx2.c:convolve_y_6tap_4x2_ssse3
Line
Count
Source
1613
279k
                                             __m128i *res) {
1614
279k
  d[5] = _mm_cvtsi32_si128(loadu_int32(data + 5 * stride));
1615
279k
  const __m128i src_45a = _mm_unpacklo_epi32(d[4], d[5]);
1616
279k
  d[4] = _mm_cvtsi32_si128(loadu_int32(data + 6 * stride));
1617
279k
  const __m128i src_56a = _mm_unpacklo_epi32(d[5], d[4]);
1618
1619
279k
  s[2] = _mm_unpacklo_epi8(src_45a, src_56a);
1620
1621
279k
  *res = convolve_lowbd_6tap_ssse3(s, coeffs);
1622
279k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_6tap_4x2_ssse3
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_6tap_4x2_ssse3
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_6tap_4x2_ssse3
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_6tap_4x2_ssse3
1623
1624
// One vertical 8-tap filter step for a 4-pixel-wide block, two output rows
// per call (SSSE3).
//
// data/stride : source pointer and row pitch; rows 7 and 8 relative to `data`
//               are loaded here via loadu_int32 (declared outside this chunk;
//               presumably an unaligned 4-byte load -- confirm).
// coeffs      : passed through unchanged to convolve_lowbd_ssse3.
// d           : row cache. d[6] is read on entry (must be set by the caller),
//               d[7] is written with row 7, then d[6] is overwritten with
//               row 8.
// s           : interleaved row-pair registers. Only s[3] is written here;
//               s[0..2] are handed to the helper as-is and must already be
//               filled by the caller.
// res         : a single result is stored, in res[0].
static inline void convolve_y_8tap_4x2_ssse3(const uint8_t *const data,
1625
                                             const ptrdiff_t stride,
1626
                                             const __m128i coeffs[4],
1627
                                             __m128i d[8], __m128i s[4],
1628
30.0k
                                             __m128i *res) {
1629
30.0k
  d[7] = _mm_cvtsi32_si128(loadu_int32(data + 7 * stride));
1630
30.0k
  const __m128i src_67a = _mm_unpacklo_epi32(d[6], d[7]);
1631
30.0k
  // The old d[6] was consumed by src_67a above; reuse the slot for row 8.
  d[6] = _mm_cvtsi32_si128(loadu_int32(data + 8 * stride));
1632
30.0k
  const __m128i src_78a = _mm_unpacklo_epi32(d[7], d[6]);
1633
1634
30.0k
  // Low 8 bytes: (row6,row7) pairs; high 8 bytes: (row7,row8) pairs.
  s[3] = _mm_unpacklo_epi8(src_67a, src_78a);
1635
1636
30.0k
  res[0] = convolve_lowbd_ssse3(s, coeffs);
1637
30.0k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_8tap_4x2_ssse3
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_8tap_4x2_ssse3
Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_8tap_4x2_ssse3
convolve_avx2.c:convolve_y_8tap_4x2_ssse3
Line
Count
Source
1628
30.0k
                                             __m128i *res) {
1629
30.0k
  d[7] = _mm_cvtsi32_si128(loadu_int32(data + 7 * stride));
1630
30.0k
  const __m128i src_67a = _mm_unpacklo_epi32(d[6], d[7]);
1631
30.0k
  d[6] = _mm_cvtsi32_si128(loadu_int32(data + 8 * stride));
1632
30.0k
  const __m128i src_78a = _mm_unpacklo_epi32(d[7], d[6]);
1633
1634
30.0k
  s[3] = _mm_unpacklo_epi8(src_67a, src_78a);
1635
1636
30.0k
  res[0] = convolve_lowbd_ssse3(s, coeffs);
1637
30.0k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_8tap_4x2_ssse3
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_8tap_4x2_ssse3
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_8tap_4x2_ssse3
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_8tap_4x2_ssse3
1638
1639
// One vertical 2-tap filter step for an 8-pixel-wide block, two output rows
// per call (AVX2).
//
// data/stride : source pointer and row pitch; rows 1 and 2 relative to `data`
//               are loaded here. Each _mm_loadu_si128 reads 16 bytes, although
//               only the low 8 bytes of each row reach the result (unpacklo).
//               NOTE(review): confirm callers guarantee 16 readable bytes.
// coeffs      : only coeffs[0] is read. It is the signed-byte operand of
//               _mm256_maddubs_epi16, so it presumably holds the two taps
//               repeated for every byte pair -- confirm at the call site.
// d           : row cache. d[0] is read on entry (must be set by the caller),
//               d[1] is written with row 1, then d[0] is overwritten with
//               row 2.
// res         : receives the 16-bit sums. The low 128-bit lane is built from
//               the (row0,row1) pairs and the high lane from the (row1,row2)
//               pairs. No rounding or shifting is applied here.
static inline void convolve_y_2tap_8x2_avx2(const uint8_t *const data,
1640
                                            const ptrdiff_t stride,
1641
                                            const __m256i *coeffs, __m128i d[2],
1642
12.7k
                                            __m256i *res) {
1643
12.7k
  d[1] = _mm_loadu_si128((__m128i *)(data + 1 * stride));
1644
12.7k
  const __m256i src_01a = _mm256_setr_m128i(d[0], d[1]);
1645
12.7k
  // The old d[0] was consumed by src_01a above; reuse the slot for row 2.
  d[0] = _mm_loadu_si128((__m128i *)(data + 2 * stride));
1646
12.7k
  const __m256i src_12a = _mm256_setr_m128i(d[1], d[0]);
1647
1648
12.7k
  // Per-lane byte interleave of vertically adjacent rows.
  const __m256i s = _mm256_unpacklo_epi8(src_01a, src_12a);
1649
1650
12.7k
  *res = _mm256_maddubs_epi16(s, coeffs[0]);
1651
12.7k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_2tap_8x2_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_2tap_8x2_avx2
Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_2tap_8x2_avx2
convolve_avx2.c:convolve_y_2tap_8x2_avx2
Line
Count
Source
1642
12.7k
                                            __m256i *res) {
1643
12.7k
  d[1] = _mm_loadu_si128((__m128i *)(data + 1 * stride));
1644
12.7k
  const __m256i src_01a = _mm256_setr_m128i(d[0], d[1]);
1645
12.7k
  d[0] = _mm_loadu_si128((__m128i *)(data + 2 * stride));
1646
12.7k
  const __m256i src_12a = _mm256_setr_m128i(d[1], d[0]);
1647
1648
12.7k
  const __m256i s = _mm256_unpacklo_epi8(src_01a, src_12a);
1649
1650
12.7k
  *res = _mm256_maddubs_epi16(s, coeffs[0]);
1651
12.7k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_2tap_8x2_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_2tap_8x2_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_2tap_8x2_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_2tap_8x2_avx2
1652
1653
// One vertical 4-tap filter step for an 8-pixel-wide block, two output rows
// per call (AVX2).
//
// data/stride : source pointer and row pitch; rows 3 and 4 relative to `data`
//               are loaded here. Each _mm_loadu_si128 reads 16 bytes, although
//               only the low 8 bytes of each row are used (unpacklo).
//               NOTE(review): confirm callers guarantee 16 readable bytes.
// coeffs      : passed through unchanged to convolve_lowbd_4tap.
// d           : row cache. d[2] is read on entry (must be set by the caller),
//               d[3] is written with row 3, then d[2] is overwritten with
//               row 4.
// s           : interleaved row-pair registers. Only s[1] is written here
//               (low lane: rows 2/3, high lane: rows 3/4); s[0] is handed to
//               the helper as-is and must already be filled by the caller.
// res         : receives the value returned by convolve_lowbd_4tap.
static inline void convolve_y_4tap_8x2_avx2(const uint8_t *const data,
1654
                                            const ptrdiff_t stride,
1655
                                            const __m256i coeffs[2],
1656
                                            __m128i d[4], __m256i s[2],
1657
169k
                                            __m256i *res) {
1658
169k
  d[3] = _mm_loadu_si128((__m128i *)(data + 3 * stride));
1659
169k
  const __m256i src_23a = _mm256_setr_m128i(d[2], d[3]);
1660
169k
  // The old d[2] was consumed by src_23a above; reuse the slot for row 4.
  d[2] = _mm_loadu_si128((__m128i *)(data + 4 * stride));
1661
169k
  const __m256i src_34a = _mm256_setr_m128i(d[3], d[2]);
1662
1663
169k
  s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);
1664
1665
169k
  *res = convolve_lowbd_4tap(s, coeffs);
1666
169k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_4tap_8x2_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_4tap_8x2_avx2
Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_4tap_8x2_avx2
convolve_avx2.c:convolve_y_4tap_8x2_avx2
Line
Count
Source
1657
169k
                                            __m256i *res) {
1658
169k
  d[3] = _mm_loadu_si128((__m128i *)(data + 3 * stride));
1659
169k
  const __m256i src_23a = _mm256_setr_m128i(d[2], d[3]);
1660
169k
  d[2] = _mm_loadu_si128((__m128i *)(data + 4 * stride));
1661
169k
  const __m256i src_34a = _mm256_setr_m128i(d[3], d[2]);
1662
1663
169k
  s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);
1664
1665
169k
  *res = convolve_lowbd_4tap(s, coeffs);
1666
169k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_4tap_8x2_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_4tap_8x2_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_4tap_8x2_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_4tap_8x2_avx2
1667
1668
// One vertical 6-tap filter step for an 8-pixel-wide block, two output rows
// per call (AVX2).
//
// data/stride : source pointer and row pitch; rows 5 and 6 relative to `data`
//               are loaded here. Each _mm_loadu_si128 reads 16 bytes, although
//               only the low 8 bytes of each row are used (unpacklo).
//               NOTE(review): confirm callers guarantee 16 readable bytes.
// coeffs      : passed through unchanged to convolve_lowbd_6tap.
// d           : row cache. d[4] is read on entry (must be set by the caller),
//               d[5] is written with row 5, then d[4] is overwritten with
//               row 6.
// s           : interleaved row-pair registers. Only s[2] is written here
//               (low lane: rows 4/5, high lane: rows 5/6); s[0] and s[1] are
//               handed to the helper as-is and must already be filled.
// res         : receives the value returned by convolve_lowbd_6tap.
static inline void convolve_y_6tap_8x2_avx2(const uint8_t *const data,
1669
                                            const ptrdiff_t stride,
1670
                                            const __m256i coeffs[3],
1671
                                            __m128i d[6], __m256i s[3],
1672
370k
                                            __m256i *res) {
1673
370k
  d[5] = _mm_loadu_si128((__m128i *)(data + 5 * stride));
1674
370k
  const __m256i src_45a = _mm256_setr_m128i(d[4], d[5]);
1675
370k
  // The old d[4] was consumed by src_45a above; reuse the slot for row 6.
  d[4] = _mm_loadu_si128((__m128i *)(data + 6 * stride));
1676
370k
  const __m256i src_56a = _mm256_setr_m128i(d[5], d[4]);
1677
1678
370k
  s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
1679
1680
370k
  *res = convolve_lowbd_6tap(s, coeffs);
1681
370k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_6tap_8x2_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_6tap_8x2_avx2
Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_6tap_8x2_avx2
convolve_avx2.c:convolve_y_6tap_8x2_avx2
Line
Count
Source
1672
370k
                                            __m256i *res) {
1673
370k
  d[5] = _mm_loadu_si128((__m128i *)(data + 5 * stride));
1674
370k
  const __m256i src_45a = _mm256_setr_m128i(d[4], d[5]);
1675
370k
  d[4] = _mm_loadu_si128((__m128i *)(data + 6 * stride));
1676
370k
  const __m256i src_56a = _mm256_setr_m128i(d[5], d[4]);
1677
1678
370k
  s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
1679
1680
370k
  *res = convolve_lowbd_6tap(s, coeffs);
1681
370k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_6tap_8x2_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_6tap_8x2_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_6tap_8x2_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_6tap_8x2_avx2
1682
1683
// One vertical 8-tap filter step for an 8-pixel-wide block, two output rows
// per call (AVX2).
//
// data/stride : source pointer and row pitch; rows 7 and 8 relative to `data`
//               are loaded here. Each _mm_loadu_si128 reads 16 bytes, although
//               only the low 8 bytes of each row are used (unpacklo).
//               NOTE(review): confirm callers guarantee 16 readable bytes.
// coeffs      : passed through unchanged to convolve_lowbd.
// d           : row cache. d[6] is read on entry (must be set by the caller),
//               d[7] is written with row 7, then d[6] is overwritten with
//               row 8.
// s           : interleaved row-pair registers. Only s[3] is written here
//               (low lane: rows 6/7, high lane: rows 7/8); s[0..2] are handed
//               to the helper as-is and must already be filled.
// res         : receives the value returned by convolve_lowbd.
static inline void convolve_y_8tap_8x2_avx2(const uint8_t *const data,
1684
                                            const ptrdiff_t stride,
1685
                                            const __m256i coeffs[4],
1686
                                            __m128i d[8], __m256i s[4],
1687
30.5k
                                            __m256i *res) {
1688
30.5k
  d[7] = _mm_loadu_si128((__m128i *)(data + 7 * stride));
1689
30.5k
  const __m256i src_67a = _mm256_setr_m128i(d[6], d[7]);
1690
30.5k
  // The old d[6] was consumed by src_67a above; reuse the slot for row 8.
  d[6] = _mm_loadu_si128((__m128i *)(data + 8 * stride));
1691
30.5k
  const __m256i src_78a = _mm256_setr_m128i(d[7], d[6]);
1692
1693
30.5k
  s[3] = _mm256_unpacklo_epi8(src_67a, src_78a);
1694
1695
30.5k
  *res = convolve_lowbd(s, coeffs);
1696
30.5k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_8tap_8x2_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_8tap_8x2_avx2
Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_8tap_8x2_avx2
convolve_avx2.c:convolve_y_8tap_8x2_avx2
Line
Count
Source
1687
30.5k
                                            __m256i *res) {
1688
30.5k
  d[7] = _mm_loadu_si128((__m128i *)(data + 7 * stride));
1689
30.5k
  const __m256i src_67a = _mm256_setr_m128i(d[6], d[7]);
1690
30.5k
  d[6] = _mm_loadu_si128((__m128i *)(data + 8 * stride));
1691
30.5k
  const __m256i src_78a = _mm256_setr_m128i(d[7], d[6]);
1692
1693
30.5k
  s[3] = _mm256_unpacklo_epi8(src_67a, src_78a);
1694
1695
30.5k
  *res = convolve_lowbd(s, coeffs);
1696
30.5k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_8tap_8x2_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_8tap_8x2_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_8tap_8x2_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_8tap_8x2_avx2
1697
1698
// One vertical 2-tap filter step for a 16-pixel-wide block, two output rows
// per call (AVX2).
//
// data/stride : source pointer and row pitch; 16 bytes are loaded from each of
//               rows 1 and 2 relative to `data`.
// coeffs      : only coeffs[0] is read. It is the signed-byte operand of
//               _mm256_maddubs_epi16, so it presumably holds the two taps
//               repeated for every byte pair -- confirm at the call site.
// d           : row cache. d[0] is read on entry (must be set by the caller),
//               d[1] is written with row 1, then d[0] is overwritten with
//               row 2.
// res         : res[0] is built from the low 8 bytes of each row (unpacklo)
//               and res[1] from the high 8 bytes (unpackhi). In both, the low
//               128-bit lane pairs rows 0/1 and the high lane pairs rows 1/2.
//               No rounding or shifting is applied here.
static inline void convolve_y_2tap_16x2_avx2(const uint8_t *const data,
1699
                                             const ptrdiff_t stride,
1700
                                             const __m256i *coeffs,
1701
13.9k
                                             __m128i d[2], __m256i res[2]) {
1702
13.9k
  d[1] = _mm_loadu_si128((__m128i *)(data + 1 * stride));
1703
13.9k
  const __m256i src_01a = _mm256_setr_m128i(d[0], d[1]);
1704
13.9k
  // The old d[0] was consumed by src_01a above; reuse the slot for row 2.
  d[0] = _mm_loadu_si128((__m128i *)(data + 2 * stride));
1705
13.9k
  const __m256i src_12a = _mm256_setr_m128i(d[1], d[0]);
1706
1707
13.9k
  const __m256i s0 = _mm256_unpacklo_epi8(src_01a, src_12a);
1708
13.9k
  const __m256i s1 = _mm256_unpackhi_epi8(src_01a, src_12a);
1709
1710
13.9k
  res[0] = _mm256_maddubs_epi16(s0, coeffs[0]);
1711
13.9k
  res[1] = _mm256_maddubs_epi16(s1, coeffs[0]);
1712
13.9k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_2tap_16x2_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_2tap_16x2_avx2
Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_2tap_16x2_avx2
convolve_avx2.c:convolve_y_2tap_16x2_avx2
Line
Count
Source
1701
13.9k
                                             __m128i d[2], __m256i res[2]) {
1702
13.9k
  d[1] = _mm_loadu_si128((__m128i *)(data + 1 * stride));
1703
13.9k
  const __m256i src_01a = _mm256_setr_m128i(d[0], d[1]);
1704
13.9k
  d[0] = _mm_loadu_si128((__m128i *)(data + 2 * stride));
1705
13.9k
  const __m256i src_12a = _mm256_setr_m128i(d[1], d[0]);
1706
1707
13.9k
  const __m256i s0 = _mm256_unpacklo_epi8(src_01a, src_12a);
1708
13.9k
  const __m256i s1 = _mm256_unpackhi_epi8(src_01a, src_12a);
1709
1710
13.9k
  res[0] = _mm256_maddubs_epi16(s0, coeffs[0]);
1711
13.9k
  res[1] = _mm256_maddubs_epi16(s1, coeffs[0]);
1712
13.9k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_2tap_16x2_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_2tap_16x2_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_2tap_16x2_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_2tap_16x2_avx2
1713
1714
// One vertical 4-tap filter step for a 16-pixel-wide block, two output rows
// per call (AVX2).
//
// data/stride : source pointer and row pitch; 16 bytes are loaded from each of
//               rows 3 and 4 relative to `data`.
// coeffs      : passed through unchanged to convolve_lowbd_4tap.
// d           : row cache. d[2] is read on entry (must be set by the caller),
//               d[3] is written with row 3, then d[2] is overwritten with
//               row 4.
// s           : two groups of two registers. s[0..1] feed res[0] (low 8 bytes
//               of each row) and s[2..3] feed res[1] (high 8 bytes). Only the
//               newest entry of each group, s[1] and s[3], is written here;
//               s[0] and s[2] must already be filled by the caller.
// res         : res[0] and res[1] receive the two convolve_lowbd_4tap results.
static inline void convolve_y_4tap_16x2_avx2(const uint8_t *const data,
1715
                                             const ptrdiff_t stride,
1716
                                             const __m256i coeffs[2],
1717
                                             __m128i d[4], __m256i s[4],
1718
97.3k
                                             __m256i res[2]) {
1719
97.3k
  d[3] = _mm_loadu_si128((__m128i *)(data + 3 * stride));
1720
97.3k
  const __m256i src_23a = _mm256_setr_m128i(d[2], d[3]);
1721
97.3k
  // The old d[2] was consumed by src_23a above; reuse the slot for row 4.
  d[2] = _mm_loadu_si128((__m128i *)(data + 4 * stride));
1722
97.3k
  const __m256i src_34a = _mm256_setr_m128i(d[3], d[2]);
1723
1724
97.3k
  s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);
1725
97.3k
  s[3] = _mm256_unpackhi_epi8(src_23a, src_34a);
1726
1727
97.3k
  res[0] = convolve_lowbd_4tap(s, coeffs);
1728
97.3k
  res[1] = convolve_lowbd_4tap(s + 2, coeffs);
1729
97.3k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_4tap_16x2_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_4tap_16x2_avx2
Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_4tap_16x2_avx2
convolve_avx2.c:convolve_y_4tap_16x2_avx2
Line
Count
Source
1718
97.3k
                                             __m256i res[2]) {
1719
97.3k
  d[3] = _mm_loadu_si128((__m128i *)(data + 3 * stride));
1720
97.3k
  const __m256i src_23a = _mm256_setr_m128i(d[2], d[3]);
1721
97.3k
  d[2] = _mm_loadu_si128((__m128i *)(data + 4 * stride));
1722
97.3k
  const __m256i src_34a = _mm256_setr_m128i(d[3], d[2]);
1723
1724
97.3k
  s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);
1725
97.3k
  s[3] = _mm256_unpackhi_epi8(src_23a, src_34a);
1726
1727
97.3k
  res[0] = convolve_lowbd_4tap(s, coeffs);
1728
97.3k
  res[1] = convolve_lowbd_4tap(s + 2, coeffs);
1729
97.3k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_4tap_16x2_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_4tap_16x2_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_4tap_16x2_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_4tap_16x2_avx2
1730
1731
// One vertical 6-tap filter step for a 16-pixel-wide block, two output rows
// per call (AVX2).
//
// data/stride : source pointer and row pitch; 16 bytes are loaded from each of
//               rows 5 and 6 relative to `data`.
// coeffs      : passed through unchanged to convolve_lowbd_6tap.
// d           : row cache. d[4] is read on entry (must be set by the caller),
//               d[5] is written with row 5, then d[4] is overwritten with
//               row 6.
// s           : two groups of three registers. s[0..2] feed res[0] (low 8
//               bytes of each row) and s[3..5] feed res[1] (high 8 bytes).
//               Only the newest entry of each group, s[2] and s[5], is written
//               here; the others must already be filled by the caller.
// res         : res[0] and res[1] receive the two convolve_lowbd_6tap results.
static inline void convolve_y_6tap_16x2_avx2(const uint8_t *const data,
1732
                                             const ptrdiff_t stride,
1733
                                             const __m256i coeffs[3],
1734
                                             __m128i d[6], __m256i s[6],
1735
1.42M
                                             __m256i res[2]) {
1736
1.42M
  d[5] = _mm_loadu_si128((__m128i *)(data + 5 * stride));
1737
1.42M
  const __m256i src_45a = _mm256_setr_m128i(d[4], d[5]);
1738
1.42M
  // The old d[4] was consumed by src_45a above; reuse the slot for row 6.
  d[4] = _mm_loadu_si128((__m128i *)(data + 6 * stride));
1739
1.42M
  const __m256i src_56a = _mm256_setr_m128i(d[5], d[4]);
1740
1741
1.42M
  s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
1742
1.42M
  s[5] = _mm256_unpackhi_epi8(src_45a, src_56a);
1743
1744
1.42M
  res[0] = convolve_lowbd_6tap(s, coeffs);
1745
1.42M
  res[1] = convolve_lowbd_6tap(s + 3, coeffs);
1746
1.42M
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_6tap_16x2_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_6tap_16x2_avx2
Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_6tap_16x2_avx2
convolve_avx2.c:convolve_y_6tap_16x2_avx2
Line
Count
Source
1735
1.42M
                                             __m256i res[2]) {
1736
1.42M
  d[5] = _mm_loadu_si128((__m128i *)(data + 5 * stride));
1737
1.42M
  const __m256i src_45a = _mm256_setr_m128i(d[4], d[5]);
1738
1.42M
  d[4] = _mm_loadu_si128((__m128i *)(data + 6 * stride));
1739
1.42M
  const __m256i src_56a = _mm256_setr_m128i(d[5], d[4]);
1740
1741
1.42M
  s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
1742
1.42M
  s[5] = _mm256_unpackhi_epi8(src_45a, src_56a);
1743
1744
1.42M
  res[0] = convolve_lowbd_6tap(s, coeffs);
1745
1.42M
  res[1] = convolve_lowbd_6tap(s + 3, coeffs);
1746
1.42M
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_6tap_16x2_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_6tap_16x2_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_6tap_16x2_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_6tap_16x2_avx2
1747
1748
// One vertical 8-tap filter step for a 16-pixel-wide block, two output rows
// per call (AVX2).
//
// data/stride : source pointer and row pitch; 16 bytes are loaded from each of
//               rows 7 and 8 relative to `data`.
// coeffs      : passed through unchanged to convolve_lowbd.
// d           : row cache. d[6] is read on entry (must be set by the caller),
//               d[7] is written with row 7, then d[6] is overwritten with
//               row 8.
// s           : two groups of four registers. s[0..3] feed res[0] (low 8 bytes
//               of each row) and s[4..7] feed res[1] (high 8 bytes). Only the
//               newest entry of each group, s[3] and s[7], is written here;
//               the others must already be filled by the caller.
// res         : res[0] and res[1] receive the two convolve_lowbd results.
static inline void convolve_y_8tap_16x2_avx2(const uint8_t *const data,
1749
                                             const ptrdiff_t stride,
1750
                                             const __m256i coeffs[4],
1751
                                             __m128i d[8], __m256i s[8],
1752
94.4k
                                             __m256i res[2]) {
1753
94.4k
  d[7] = _mm_loadu_si128((__m128i *)(data + 7 * stride));
1754
94.4k
  const __m256i src_67a = _mm256_setr_m128i(d[6], d[7]);
1755
94.4k
  // The old d[6] was consumed by src_67a above; reuse the slot for row 8.
  d[6] = _mm_loadu_si128((__m128i *)(data + 8 * stride));
1756
94.4k
  const __m256i src_78a = _mm256_setr_m128i(d[7], d[6]);
1757
1758
94.4k
  s[3] = _mm256_unpacklo_epi8(src_67a, src_78a);
1759
94.4k
  s[7] = _mm256_unpackhi_epi8(src_67a, src_78a);
1760
1761
94.4k
  res[0] = convolve_lowbd(s, coeffs);
1762
94.4k
  res[1] = convolve_lowbd(s + 4, coeffs);
1763
94.4k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_8tap_16x2_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_8tap_16x2_avx2
Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_8tap_16x2_avx2
convolve_avx2.c:convolve_y_8tap_16x2_avx2
Line
Count
Source
1752
94.4k
                                             __m256i res[2]) {
1753
94.4k
  d[7] = _mm_loadu_si128((__m128i *)(data + 7 * stride));
1754
94.4k
  const __m256i src_67a = _mm256_setr_m128i(d[6], d[7]);
1755
94.4k
  d[6] = _mm_loadu_si128((__m128i *)(data + 8 * stride));
1756
94.4k
  const __m256i src_78a = _mm256_setr_m128i(d[7], d[6]);
1757
1758
94.4k
  s[3] = _mm256_unpacklo_epi8(src_67a, src_78a);
1759
94.4k
  s[7] = _mm256_unpackhi_epi8(src_67a, src_78a);
1760
1761
94.4k
  res[0] = convolve_lowbd(s, coeffs);
1762
94.4k
  res[1] = convolve_lowbd(s + 4, coeffs);
1763
94.4k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_8tap_16x2_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_8tap_16x2_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_8tap_16x2_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_8tap_16x2_avx2
1764
1765
// One vertical 2-tap filter step for a 32-pixel-wide block, two output rows
// per call (AVX2).
//
// data/stride : source pointer and row pitch; 32 bytes are loaded from each of
//               rows 1 and 2 relative to `data`.
// coeffs      : only coeffs[0] is read. It is the signed-byte operand of
//               _mm256_maddubs_epi16, so it presumably holds the two taps
//               repeated for every byte pair -- confirm at the call site.
// d           : row cache of full 256-bit rows. d[0] is read on entry (must be
//               set by the caller), d[1] is written with row 1, then d[0] is
//               overwritten with row 2.
// res         : res[0] and res[1] come from the rows 0/1 pairing, res[2] and
//               res[3] from the rows 1/2 pairing. The unpacks work per 128-bit
//               lane, so the "lo" results (res[0], res[2]) cover bytes 0-7 and
//               16-23 of a row, and the "hi" results (res[1], res[3]) cover
//               bytes 8-15 and 24-31. No rounding or shifting is applied here.
static inline void convolve_y_2tap_32x2_avx2(const uint8_t *const data,
1766
                                             const ptrdiff_t stride,
1767
                                             const __m256i *coeffs,
1768
41.0k
                                             __m256i d[2], __m256i res[4]) {
1769
41.0k
  d[1] = _mm256_loadu_si256((__m256i *)(data + 1 * stride));
1770
41.0k
  const __m256i s00 = _mm256_unpacklo_epi8(d[0], d[1]);
1771
41.0k
  const __m256i s01 = _mm256_unpackhi_epi8(d[0], d[1]);
1772
41.0k
  // The old d[0] was consumed by s00/s01 above; reuse the slot for row 2.
  d[0] = _mm256_loadu_si256((__m256i *)(data + 2 * stride));
1773
41.0k
  const __m256i s10 = _mm256_unpacklo_epi8(d[1], d[0]);
1774
41.0k
  const __m256i s11 = _mm256_unpackhi_epi8(d[1], d[0]);
1775
1776
41.0k
  res[0] = _mm256_maddubs_epi16(s00, coeffs[0]);
1777
41.0k
  res[1] = _mm256_maddubs_epi16(s01, coeffs[0]);
1778
41.0k
  res[2] = _mm256_maddubs_epi16(s10, coeffs[0]);
1779
41.0k
  res[3] = _mm256_maddubs_epi16(s11, coeffs[0]);
1780
41.0k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_2tap_32x2_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_2tap_32x2_avx2
Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_2tap_32x2_avx2
convolve_avx2.c:convolve_y_2tap_32x2_avx2
Line
Count
Source
1768
41.0k
                                             __m256i d[2], __m256i res[4]) {
1769
41.0k
  d[1] = _mm256_loadu_si256((__m256i *)(data + 1 * stride));
1770
41.0k
  const __m256i s00 = _mm256_unpacklo_epi8(d[0], d[1]);
1771
41.0k
  const __m256i s01 = _mm256_unpackhi_epi8(d[0], d[1]);
1772
41.0k
  d[0] = _mm256_loadu_si256((__m256i *)(data + 2 * stride));
1773
41.0k
  const __m256i s10 = _mm256_unpacklo_epi8(d[1], d[0]);
1774
41.0k
  const __m256i s11 = _mm256_unpackhi_epi8(d[1], d[0]);
1775
1776
41.0k
  res[0] = _mm256_maddubs_epi16(s00, coeffs[0]);
1777
41.0k
  res[1] = _mm256_maddubs_epi16(s01, coeffs[0]);
1778
41.0k
  res[2] = _mm256_maddubs_epi16(s10, coeffs[0]);
1779
41.0k
  res[3] = _mm256_maddubs_epi16(s11, coeffs[0]);
1780
41.0k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_2tap_32x2_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_2tap_32x2_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_2tap_32x2_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_2tap_32x2_avx2
1781
1782
// One vertical 4-tap filter step for a 32-pixel-wide block, two output rows
// per call (AVX2).
//
// data/stride : source pointer and row pitch; 32 bytes are loaded from each of
//               rows 3 and 4 relative to `data`.
// coeffs      : passed through unchanged to convolve_lowbd_4tap.
// d           : row cache of full 256-bit rows. d[2] is read on entry (must be
//               set by the caller), d[3] is written with row 3, then d[2] is
//               overwritten with row 4.
// s1          : row-pair registers feeding res[0] (s1[0..1]) and res[1]
//               (s1[2..3]). Only s1[1] and s1[3] (rows 2/3) are written here.
// s2          : row-pair registers feeding res[2] (s2[0..1]) and res[3]
//               (s2[2..3]). Only s2[1] and s2[3] (rows 3/4) are written here.
//               s1[0], s1[2], s2[0] and s2[2] must already be filled by the
//               caller.
// res         : res[0..3] receive the four convolve_lowbd_4tap results. The
//               unpacks work per 128-bit lane, so the "lo" results (res[0],
//               res[2]) cover bytes 0-7 and 16-23 of a row, and the "hi"
//               results (res[1], res[3]) cover bytes 8-15 and 24-31.
static inline void convolve_y_4tap_32x2_avx2(const uint8_t *const data,
1783
                                             const ptrdiff_t stride,
1784
                                             const __m256i coeffs[2],
1785
                                             __m256i d[4], __m256i s1[4],
1786
87.2k
                                             __m256i s2[4], __m256i res[4]) {
1787
87.2k
  d[3] = _mm256_loadu_si256((__m256i *)(data + 3 * stride));
1788
87.2k
  s1[1] = _mm256_unpacklo_epi8(d[2], d[3]);
1789
87.2k
  s1[3] = _mm256_unpackhi_epi8(d[2], d[3]);
1790
87.2k
  // The old d[2] was consumed by s1[1]/s1[3] above; reuse the slot for row 4.
  d[2] = _mm256_loadu_si256((__m256i *)(data + 4 * stride));
1791
87.2k
  s2[1] = _mm256_unpacklo_epi8(d[3], d[2]);
1792
87.2k
  s2[3] = _mm256_unpackhi_epi8(d[3], d[2]);
1793
1794
87.2k
  res[0] = convolve_lowbd_4tap(s1, coeffs);
1795
87.2k
  res[1] = convolve_lowbd_4tap(s1 + 2, coeffs);
1796
87.2k
  res[2] = convolve_lowbd_4tap(s2, coeffs);
1797
87.2k
  res[3] = convolve_lowbd_4tap(s2 + 2, coeffs);
1798
87.2k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_4tap_32x2_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_4tap_32x2_avx2
Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_4tap_32x2_avx2
convolve_avx2.c:convolve_y_4tap_32x2_avx2
Line
Count
Source
1786
87.2k
                                             __m256i s2[4], __m256i res[4]) {
1787
87.2k
  d[3] = _mm256_loadu_si256((__m256i *)(data + 3 * stride));
1788
87.2k
  s1[1] = _mm256_unpacklo_epi8(d[2], d[3]);
1789
87.2k
  s1[3] = _mm256_unpackhi_epi8(d[2], d[3]);
1790
87.2k
  d[2] = _mm256_loadu_si256((__m256i *)(data + 4 * stride));
1791
87.2k
  s2[1] = _mm256_unpacklo_epi8(d[3], d[2]);
1792
87.2k
  s2[3] = _mm256_unpackhi_epi8(d[3], d[2]);
1793
1794
87.2k
  res[0] = convolve_lowbd_4tap(s1, coeffs);
1795
87.2k
  res[1] = convolve_lowbd_4tap(s1 + 2, coeffs);
1796
87.2k
  res[2] = convolve_lowbd_4tap(s2, coeffs);
1797
87.2k
  res[3] = convolve_lowbd_4tap(s2 + 2, coeffs);
1798
87.2k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_4tap_32x2_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_4tap_32x2_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_4tap_32x2_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_4tap_32x2_avx2
1799
#endif  // AOM_AOM_DSP_X86_CONVOLVE_AVX2_H_