Coverage Report

Created: 2026-03-31 06:59

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/aom/aom_dsp/x86/convolve_avx2.h
Line
Count
Source
1
/*
2
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
3
 *
4
 * This source code is subject to the terms of the BSD 2 Clause License and
5
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6
 * was not distributed with this source code in the LICENSE file, you can
7
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8
 * Media Patent License 1.0 was not distributed with this source code in the
9
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10
 */
11
12
#ifndef AOM_AOM_DSP_X86_CONVOLVE_AVX2_H_
13
#define AOM_AOM_DSP_X86_CONVOLVE_AVX2_H_
14
15
#include <immintrin.h>
16
17
#include "aom_ports/mem.h"
18
19
#include "aom_dsp/x86/mem_sse2.h"
20
#include "aom_dsp/x86/synonyms.h"
21
22
#include "av1/common/convolve.h"
23
#include "av1/common/filter.h"
24
25
527k
#define SECOND_32_BLK (32)
26
448k
#define THIRD_32_BLK (32 << 1)
27
224k
#define FOURTH_32_BLK (SECOND_32_BLK + THIRD_32_BLK)
28
29
// filters for 16
30
DECLARE_ALIGNED(32, static const uint8_t, filt_global_avx2[]) = {
31
  0,  1,  1,  2,  2, 3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8,  0,  1,  1,
32
  2,  2,  3,  3,  4, 4,  5,  5,  6,  6,  7,  7,  8,  2,  3,  3,  4,  4,  5,
33
  5,  6,  6,  7,  7, 8,  8,  9,  9,  10, 2,  3,  3,  4,  4,  5,  5,  6,  6,
34
  7,  7,  8,  8,  9, 9,  10, 4,  5,  5,  6,  6,  7,  7,  8,  8,  9,  9,  10,
35
  10, 11, 11, 12, 4, 5,  5,  6,  6,  7,  7,  8,  8,  9,  9,  10, 10, 11, 11,
36
  12, 6,  7,  7,  8, 8,  9,  9,  10, 10, 11, 11, 12, 12, 13, 13, 14, 6,  7,
37
  7,  8,  8,  9,  9, 10, 10, 11, 11, 12, 12, 13, 13, 14
38
};
39
40
DECLARE_ALIGNED(32, static const uint8_t, filt_d4_global_avx2[]) = {
41
  0, 1, 2, 3,  1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 0, 1, 2, 3,  1, 2,
42
  3, 4, 2, 3,  4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7,  8, 9,
43
  7, 8, 9, 10, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10,
44
};
45
46
DECLARE_ALIGNED(32, static const uint8_t, filt4_d4_global_avx2[]) = {
47
  2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8,
48
  2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8,
49
};
50
51
DECLARE_ALIGNED(32, static const uint8_t, filt_center_global_avx2[32]) = {
52
  3, 255, 4, 255, 5, 255, 6, 255, 7, 255, 8, 255, 9, 255, 10, 255,
53
  3, 255, 4, 255, 5, 255, 6, 255, 7, 255, 8, 255, 9, 255, 10, 255
54
};
55
56
DECLARE_ALIGNED(32, static const uint8_t,
57
                filt1_global_sse2[16]) = { 0, 1, 1, 2,  2,  3,  3,  4,
58
                                           8, 9, 9, 10, 10, 11, 11, 12 };
59
60
DECLARE_ALIGNED(32, static const uint8_t,
61
                filt2_global_sse2[16]) = { 2,  3,  3,  4,  4,  5,  5,  6,
62
                                           10, 11, 11, 12, 12, 13, 13, 14 };
63
64
DECLARE_ALIGNED(32, static const uint8_t,
65
                filt3_global_sse2[16]) = { 0, 1, 1, 2, 8, 9, 9, 10,
66
                                           0, 0, 0, 0, 0, 0, 0, 0 };
67
68
DECLARE_ALIGNED(32, static const uint8_t,
69
                filt4_global_sse2[16]) = { 2, 3, 3, 4, 10, 11, 11, 12,
70
                                           0, 0, 0, 0, 0,  0,  0,  0 };
71
72
DECLARE_ALIGNED(32, static const uint8_t,
73
                filt5_global_sse2[16]) = { 0, 1, 1, 2, 4, 5, 5, 6,
74
                                           0, 0, 0, 0, 0, 0, 0, 0 };
75
76
DECLARE_ALIGNED(32, static const uint8_t,
77
                filt1_global_avx2[32]) = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5,
78
                                           6, 6, 7, 7, 8, 0, 1, 1, 2, 2, 3,
79
                                           3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
80
81
DECLARE_ALIGNED(32, static const uint8_t,
82
                filt2_global_avx2[32]) = { 2, 3, 3, 4, 4,  5, 5, 6, 6, 7, 7,
83
                                           8, 8, 9, 9, 10, 2, 3, 3, 4, 4, 5,
84
                                           5, 6, 6, 7, 7,  8, 8, 9, 9, 10 };
85
86
DECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = {
87
  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12,
88
  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
89
};
90
91
DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = {
92
  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14,
93
  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
94
};
95
96
#define CONVOLVE_SR_HOR_FILTER_W4(CONVOLVE_LOWBD)                            \
97
2.90M
  for (i = 0; i < (im_h - 2); i += 2) {                                      \
98
2.35M
    __m128i data =                                                           \
99
2.35M
        load_8bit_8x2_to_1_reg_sse2(&src_ptr[(i * src_stride)], src_stride); \
100
2.35M
    __m128i res = CONVOLVE_LOWBD(data, coeffs_h, filt);                      \
101
2.35M
    res = _mm_srai_epi16(_mm_add_epi16(res, round_const_h), 2);              \
102
2.35M
    _mm_store_si128((__m128i *)&im_block[i * 4], res);                       \
103
2.35M
  }                                                                          \
104
551k
  __m128i data_1 = _mm_loadl_epi64((__m128i *)&src_ptr[(i * src_stride)]);   \
105
551k
  __m128i res = CONVOLVE_LOWBD(data_1, coeffs_h, filt);                      \
106
551k
  res = _mm_srai_epi16(_mm_add_epi16(res, round_const_h), 2);                \
107
551k
  _mm_storel_epi64((__m128i *)&im_block[i * 4], res);
108
109
#define CONVOLVE_SR_HOR_FILTER_2TAP_W4 \
110
22.8k
  CONVOLVE_SR_HOR_FILTER_W4(convolve_lowbd_x_2tap_ssse3)
111
112
#define CONVOLVE_SR_HOR_FILTER_4TAP_W4 \
113
528k
  CONVOLVE_SR_HOR_FILTER_W4(convolve_lowbd_x_4tap_ssse3)
114
115
static inline void sr_2d_ver_round_and_store_w4(int w, __m256i res,
116
                                                uint8_t *dst, int dst_stride,
117
1.63M
                                                __m256i round_const_v) {
118
1.63M
  const __m256i res_round =
119
1.63M
      _mm256_srai_epi32(_mm256_add_epi32(res, round_const_v), 11);
120
121
1.63M
  const __m256i res_16bit = _mm256_packs_epi32(res_round, res_round);
122
1.63M
  const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit);
123
124
1.63M
  const __m128i r0 = _mm256_castsi256_si128(res_8b);
125
1.63M
  const __m128i r1 = _mm256_extracti128_si256(res_8b, 1);
126
127
1.63M
  __m128i *const p0 = (__m128i *)dst;
128
1.63M
  __m128i *const p1 = (__m128i *)(dst + dst_stride);
129
130
1.63M
  if (w == 4) {
131
1.35M
    xx_storel_32(p0, r0);
132
1.35M
    xx_storel_32(p1, r1);
133
1.35M
  } else {
134
283k
    assert(w == 2);
135
283k
    *(uint16_t *)p0 = (uint16_t)_mm_cvtsi128_si32(r0);
136
283k
    *(uint16_t *)p1 = (uint16_t)_mm_cvtsi128_si32(r1);
137
283k
  }
138
1.63M
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:sr_2d_ver_round_and_store_w4
Unexecuted instantiation: highbd_convolve_avx2.c:sr_2d_ver_round_and_store_w4
convolve_2d_avx2.c:sr_2d_ver_round_and_store_w4
Line
Count
Source
117
1.63M
                                                __m256i round_const_v) {
118
1.63M
  const __m256i res_round =
119
1.63M
      _mm256_srai_epi32(_mm256_add_epi32(res, round_const_v), 11);
120
121
1.63M
  const __m256i res_16bit = _mm256_packs_epi32(res_round, res_round);
122
1.63M
  const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit);
123
124
1.63M
  const __m128i r0 = _mm256_castsi256_si128(res_8b);
125
1.63M
  const __m128i r1 = _mm256_extracti128_si256(res_8b, 1);
126
127
1.63M
  __m128i *const p0 = (__m128i *)dst;
128
1.63M
  __m128i *const p1 = (__m128i *)(dst + dst_stride);
129
130
1.63M
  if (w == 4) {
131
1.35M
    xx_storel_32(p0, r0);
132
1.35M
    xx_storel_32(p1, r1);
133
1.35M
  } else {
134
283k
    assert(w == 2);
135
283k
    *(uint16_t *)p0 = (uint16_t)_mm_cvtsi128_si32(r0);
136
283k
    *(uint16_t *)p1 = (uint16_t)_mm_cvtsi128_si32(r1);
137
283k
  }
138
1.63M
}
Unexecuted instantiation: convolve_avx2.c:sr_2d_ver_round_and_store_w4
Unexecuted instantiation: jnt_convolve_avx2.c:sr_2d_ver_round_and_store_w4
Unexecuted instantiation: wiener_convolve_avx2.c:sr_2d_ver_round_and_store_w4
Unexecuted instantiation: highbd_convolve_2d_avx2.c:sr_2d_ver_round_and_store_w4
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:sr_2d_ver_round_and_store_w4
139
140
#define CONVOLVE_SR_VER_FILTER_2TAP_W4                                        \
141
22.8k
  __m128i s[2];                                                               \
142
22.8k
  s[0] = _mm_loadl_epi64((__m128i *)(im_block + 0 * 4));                      \
143
22.8k
                                                                              \
144
87.1k
  for (i = 0; i < h; i += 2) {                                                \
145
64.2k
    const int16_t *data = &im_block[i * 4];                                   \
146
64.2k
    s[1] = _mm_loadl_epi64((__m128i *)(data + 1 * 4));                        \
147
64.2k
    const __m256i src_0 = _mm256_setr_m128i(s[0], s[1]);                      \
148
64.2k
    s[0] = _mm_loadl_epi64((__m128i *)(data + 2 * 4));                        \
149
64.2k
    const __m256i src_1 = _mm256_setr_m128i(s[1], s[0]);                      \
150
64.2k
    const __m256i ss = _mm256_unpacklo_epi16(src_0, src_1);                   \
151
64.2k
                                                                              \
152
64.2k
    const __m256i res = _mm256_madd_epi16(ss, coeffs_v[0]);                   \
153
64.2k
                                                                              \
154
64.2k
    sr_2d_ver_round_and_store_w4(w, res, dst_ptr, dst_stride, round_const_v); \
155
64.2k
    dst_ptr += 2 * dst_stride;                                                \
156
64.2k
  }
157
158
#define CONVOLVE_SR_VER_FILTER_4TAP_W4                                        \
159
347k
  __m128i s[4];                                                               \
160
347k
  __m256i ss[2];                                                              \
161
347k
  s[0] = _mm_loadl_epi64((__m128i *)(im_block + 0 * 4));                      \
162
347k
  s[1] = _mm_loadl_epi64((__m128i *)(im_block + 1 * 4));                      \
163
347k
  s[2] = _mm_loadl_epi64((__m128i *)(im_block + 2 * 4));                      \
164
347k
                                                                              \
165
347k
  const __m256i src_0 = _mm256_setr_m128i(s[0], s[1]);                        \
166
347k
  const __m256i src_1 = _mm256_setr_m128i(s[1], s[2]);                        \
167
347k
                                                                              \
168
347k
  ss[0] = _mm256_unpacklo_epi16(src_0, src_1);                                \
169
347k
                                                                              \
170
1.00M
  for (i = 0; i < h; i += 2) {                                                \
171
658k
    const int16_t *data = &im_block[i * 4];                                   \
172
658k
    s[3] = _mm_loadl_epi64((__m128i *)(data + 3 * 4));                        \
173
658k
    const __m256i src_2 = _mm256_setr_m128i(s[2], s[3]);                      \
174
658k
    s[2] = _mm_loadl_epi64((__m128i *)(data + 4 * 4));                        \
175
658k
    const __m256i src_3 = _mm256_setr_m128i(s[3], s[2]);                      \
176
658k
    ss[1] = _mm256_unpacklo_epi16(src_2, src_3);                              \
177
658k
                                                                              \
178
658k
    const __m256i res = convolve_4tap(ss, coeffs_v);                          \
179
658k
                                                                              \
180
658k
    sr_2d_ver_round_and_store_w4(w, res, dst_ptr, dst_stride, round_const_v); \
181
658k
    dst_ptr += 2 * dst_stride;                                                \
182
658k
                                                                              \
183
658k
    ss[0] = ss[1];                                                            \
184
658k
  }
185
186
#define CONVOLVE_SR_VER_FILTER_6TAP_W4                                        \
187
170k
  __m128i s[6];                                                               \
188
170k
  __m256i ss[3];                                                              \
189
170k
  s[0] = _mm_loadl_epi64((__m128i *)(im_block + 0 * 4));                      \
190
170k
  s[1] = _mm_loadl_epi64((__m128i *)(im_block + 1 * 4));                      \
191
170k
  s[2] = _mm_loadl_epi64((__m128i *)(im_block + 2 * 4));                      \
192
170k
  s[3] = _mm_loadl_epi64((__m128i *)(im_block + 3 * 4));                      \
193
170k
  s[4] = _mm_loadl_epi64((__m128i *)(im_block + 4 * 4));                      \
194
170k
                                                                              \
195
170k
  const __m256i src_0 = _mm256_setr_m128i(s[0], s[1]);                        \
196
170k
  const __m256i src_1 = _mm256_setr_m128i(s[1], s[2]);                        \
197
170k
  const __m256i src_2 = _mm256_setr_m128i(s[2], s[3]);                        \
198
170k
  const __m256i src_3 = _mm256_setr_m128i(s[3], s[4]);                        \
199
170k
                                                                              \
200
170k
  ss[0] = _mm256_unpacklo_epi16(src_0, src_1);                                \
201
170k
  ss[1] = _mm256_unpacklo_epi16(src_2, src_3);                                \
202
170k
                                                                              \
203
1.03M
  for (i = 0; i < h; i += 2) {                                                \
204
860k
    const int16_t *data = &im_block[i * 4];                                   \
205
860k
    s[5] = _mm_loadl_epi64((__m128i *)(data + 5 * 4));                        \
206
860k
    const __m256i src_4 = _mm256_setr_m128i(s[4], s[5]);                      \
207
860k
    s[4] = _mm_loadl_epi64((__m128i *)(data + 6 * 4));                        \
208
860k
    const __m256i src_5 = _mm256_setr_m128i(s[5], s[4]);                      \
209
860k
    ss[2] = _mm256_unpacklo_epi16(src_4, src_5);                              \
210
860k
                                                                              \
211
860k
    const __m256i res = convolve_6tap(ss, coeffs_v);                          \
212
860k
                                                                              \
213
860k
    sr_2d_ver_round_and_store_w4(w, res, dst_ptr, dst_stride, round_const_v); \
214
860k
    dst_ptr += 2 * dst_stride;                                                \
215
860k
                                                                              \
216
860k
    ss[0] = ss[1];                                                            \
217
860k
    ss[1] = ss[2];                                                            \
218
860k
  }
219
220
#define CONVOLVE_SR_VER_FILTER_8TAP_W4                                        \
221
10.2k
  __m128i s[8];                                                               \
222
10.2k
  __m256i ss[4];                                                              \
223
10.2k
  s[0] = _mm_loadl_epi64((__m128i *)(im_block + 0 * 4));                      \
224
10.2k
  s[1] = _mm_loadl_epi64((__m128i *)(im_block + 1 * 4));                      \
225
10.2k
  s[2] = _mm_loadl_epi64((__m128i *)(im_block + 2 * 4));                      \
226
10.2k
  s[3] = _mm_loadl_epi64((__m128i *)(im_block + 3 * 4));                      \
227
10.2k
  s[4] = _mm_loadl_epi64((__m128i *)(im_block + 4 * 4));                      \
228
10.2k
  s[5] = _mm_loadl_epi64((__m128i *)(im_block + 5 * 4));                      \
229
10.2k
  s[6] = _mm_loadl_epi64((__m128i *)(im_block + 6 * 4));                      \
230
10.2k
                                                                              \
231
10.2k
  const __m256i src_0 = _mm256_setr_m128i(s[0], s[1]);                        \
232
10.2k
  const __m256i src_1 = _mm256_setr_m128i(s[1], s[2]);                        \
233
10.2k
  const __m256i src_2 = _mm256_setr_m128i(s[2], s[3]);                        \
234
10.2k
  const __m256i src_3 = _mm256_setr_m128i(s[3], s[4]);                        \
235
10.2k
  const __m256i src_4 = _mm256_setr_m128i(s[4], s[5]);                        \
236
10.2k
  const __m256i src_5 = _mm256_setr_m128i(s[5], s[6]);                        \
237
10.2k
                                                                              \
238
10.2k
  ss[0] = _mm256_unpacklo_epi16(src_0, src_1);                                \
239
10.2k
  ss[1] = _mm256_unpacklo_epi16(src_2, src_3);                                \
240
10.2k
  ss[2] = _mm256_unpacklo_epi16(src_4, src_5);                                \
241
10.2k
                                                                              \
242
62.7k
  for (i = 0; i < h; i += 2) {                                                \
243
52.5k
    const int16_t *data = &im_block[i * 4];                                   \
244
52.5k
    s[7] = _mm_loadl_epi64((__m128i *)(data + 7 * 4));                        \
245
52.5k
    const __m256i src_6 = _mm256_setr_m128i(s[6], s[7]);                      \
246
52.5k
    s[6] = _mm_loadl_epi64((__m128i *)(data + 8 * 4));                        \
247
52.5k
    const __m256i src_7 = _mm256_setr_m128i(s[7], s[6]);                      \
248
52.5k
    ss[3] = _mm256_unpacklo_epi16(src_6, src_7);                              \
249
52.5k
                                                                              \
250
52.5k
    const __m256i res = convolve(ss, coeffs_v);                               \
251
52.5k
                                                                              \
252
52.5k
    sr_2d_ver_round_and_store_w4(w, res, dst_ptr, dst_stride, round_const_v); \
253
52.5k
    dst_ptr += 2 * dst_stride;                                                \
254
52.5k
                                                                              \
255
52.5k
    ss[0] = ss[1];                                                            \
256
52.5k
    ss[1] = ss[2];                                                            \
257
52.5k
    ss[2] = ss[3];                                                            \
258
52.5k
  }
259
260
#define CONVOLVE_SR_HORIZONTAL_FILTER(CONVOLVE_LOWBD)                 \
261
13.8M
  for (i = 0; i < (im_h - 2); i += 2) {                               \
262
12.6M
    __m256i data = _mm256_castsi128_si256(                            \
263
12.6M
        _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j]));  \
264
12.6M
    data = _mm256_inserti128_si256(                                   \
265
12.6M
        data,                                                         \
266
12.6M
        _mm_loadu_si128(                                              \
267
12.6M
            (__m128i *)&src_ptr[(i * src_stride) + j + src_stride]),  \
268
12.6M
        1);                                                           \
269
12.6M
    __m256i res = CONVOLVE_LOWBD(data, coeffs_h, filt);               \
270
12.6M
    res = _mm256_srai_epi16(_mm256_add_epi16(res, round_const_h), 2); \
271
12.6M
    _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);     \
272
12.6M
  }                                                                   \
273
1.24M
  __m256i data_1 = _mm256_castsi128_si256(                            \
274
1.24M
      _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j]));    \
275
1.24M
  __m256i res = CONVOLVE_LOWBD(data_1, coeffs_h, filt);               \
276
1.24M
  res = _mm256_srai_epi16(_mm256_add_epi16(res, round_const_h), 2);   \
277
1.24M
  _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);
278
279
#define CONVOLVE_SR_HORIZONTAL_FILTER_2TAP \
280
39.8k
  CONVOLVE_SR_HORIZONTAL_FILTER(convolve_lowbd_x_2tap)
281
282
#define CONVOLVE_SR_HORIZONTAL_FILTER_4TAP \
283
73.0k
  CONVOLVE_SR_HORIZONTAL_FILTER(convolve_lowbd_x_4tap)
284
285
#define CONVOLVE_SR_HORIZONTAL_FILTER_6TAP \
286
976k
  CONVOLVE_SR_HORIZONTAL_FILTER(convolve_lowbd_x_6tap)
287
288
#define CONVOLVE_SR_HORIZONTAL_FILTER_8TAP \
289
154k
  CONVOLVE_SR_HORIZONTAL_FILTER(convolve_lowbd_x)
290
291
static inline void sr_2d_ver_round_and_store(__m256i res_a, __m256i res_b,
292
                                             uint8_t *dst, int dst_stride,
293
10.4M
                                             __m256i round_const_v) {
294
10.4M
  const __m256i res_a_round =
295
10.4M
      _mm256_srai_epi32(_mm256_add_epi32(res_a, round_const_v), 11);
296
10.4M
  const __m256i res_b_round =
297
10.4M
      _mm256_srai_epi32(_mm256_add_epi32(res_b, round_const_v), 11);
298
10.4M
  const __m256i r16 = _mm256_packs_epi32(res_a_round, res_b_round);
299
10.4M
  const __m256i r8 = _mm256_packus_epi16(r16, r16);
300
301
10.4M
  _mm_storel_epi64((__m128i *)dst, _mm256_castsi256_si128(r8));
302
10.4M
  _mm_storel_epi64((__m128i *)(dst + dst_stride),
303
10.4M
                   _mm256_extracti128_si256(r8, 1));
304
10.4M
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:sr_2d_ver_round_and_store
Unexecuted instantiation: highbd_convolve_avx2.c:sr_2d_ver_round_and_store
convolve_2d_avx2.c:sr_2d_ver_round_and_store
Line
Count
Source
293
10.4M
                                             __m256i round_const_v) {
294
10.4M
  const __m256i res_a_round =
295
10.4M
      _mm256_srai_epi32(_mm256_add_epi32(res_a, round_const_v), 11);
296
10.4M
  const __m256i res_b_round =
297
10.4M
      _mm256_srai_epi32(_mm256_add_epi32(res_b, round_const_v), 11);
298
10.4M
  const __m256i r16 = _mm256_packs_epi32(res_a_round, res_b_round);
299
10.4M
  const __m256i r8 = _mm256_packus_epi16(r16, r16);
300
301
10.4M
  _mm_storel_epi64((__m128i *)dst, _mm256_castsi256_si128(r8));
302
10.4M
  _mm_storel_epi64((__m128i *)(dst + dst_stride),
303
                   _mm256_extracti128_si256(r8, 1));
304
10.4M
}
Unexecuted instantiation: convolve_avx2.c:sr_2d_ver_round_and_store
Unexecuted instantiation: jnt_convolve_avx2.c:sr_2d_ver_round_and_store
Unexecuted instantiation: wiener_convolve_avx2.c:sr_2d_ver_round_and_store
Unexecuted instantiation: highbd_convolve_2d_avx2.c:sr_2d_ver_round_and_store
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:sr_2d_ver_round_and_store
305
306
#define CONVOLVE_SR_VERTICAL_FILTER_2TAP                                      \
307
528k
  for (i = 0; i < h; i += 2) {                                                \
308
488k
    __m256i s[2];                                                             \
309
488k
    const int16_t *data = &im_block[i * im_stride];                           \
310
488k
    const __m256i s1 = _mm256_loadu_si256((__m256i *)(data + 0 * im_stride)); \
311
488k
    const __m256i s2 = _mm256_loadu_si256((__m256i *)(data + 1 * im_stride)); \
312
488k
    s[0] = _mm256_unpacklo_epi16(s1, s2);                                     \
313
488k
    s[1] = _mm256_unpackhi_epi16(s1, s2);                                     \
314
488k
                                                                              \
315
488k
    __m256i res_a = _mm256_madd_epi16(s[0], coeffs_v[0]);                     \
316
488k
    __m256i res_b = _mm256_madd_epi16(s[1], coeffs_v[0]);                     \
317
488k
                                                                              \
318
488k
    sr_2d_ver_round_and_store(res_a, res_b, dst_ptr, dst_stride,              \
319
488k
                              round_const_v);                                 \
320
488k
    dst_ptr += 2 * dst_stride;                                                \
321
488k
  }
322
323
#define CONVOLVE_SR_VERTICAL_FILTER_4TAP                                      \
324
436k
  __m256i s[6];                                                               \
325
436k
  __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));  \
326
436k
  __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));  \
327
436k
                                                                              \
328
436k
  s[0] = _mm256_unpacklo_epi16(src_0, src_1);                                 \
329
436k
  s[2] = _mm256_unpackhi_epi16(src_0, src_1);                                 \
330
436k
                                                                              \
331
1.76M
  for (i = 0; i < h; i += 2) {                                                \
332
1.32M
    const int16_t *data = &im_block[i * im_stride];                           \
333
1.32M
    const __m256i s4 = _mm256_loadu_si256((__m256i *)(data + 2 * im_stride)); \
334
1.32M
    const __m256i s5 = _mm256_loadu_si256((__m256i *)(data + 3 * im_stride)); \
335
1.32M
    s[1] = _mm256_unpacklo_epi16(s4, s5);                                     \
336
1.32M
    s[3] = _mm256_unpackhi_epi16(s4, s5);                                     \
337
1.32M
                                                                              \
338
1.32M
    __m256i res_a = convolve_4tap(s, coeffs_v);                               \
339
1.32M
    __m256i res_b = convolve_4tap(s + 2, coeffs_v);                           \
340
1.32M
                                                                              \
341
1.32M
    sr_2d_ver_round_and_store(res_a, res_b, dst_ptr, dst_stride,              \
342
1.32M
                              round_const_v);                                 \
343
1.32M
    dst_ptr += 2 * dst_stride;                                                \
344
1.32M
                                                                              \
345
1.32M
    s[0] = s[1];                                                              \
346
1.32M
    s[2] = s[3];                                                              \
347
1.32M
  }
348
349
#define CONVOLVE_SR_VERTICAL_FILTER_6TAP                                      \
350
631k
  __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));  \
351
631k
  __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));  \
352
631k
  __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));  \
353
631k
  __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));  \
354
631k
                                                                              \
355
631k
  __m256i s[8];                                                               \
356
631k
  s[0] = _mm256_unpacklo_epi16(src_0, src_1);                                 \
357
631k
  s[1] = _mm256_unpacklo_epi16(src_2, src_3);                                 \
358
631k
                                                                              \
359
631k
  s[3] = _mm256_unpackhi_epi16(src_0, src_1);                                 \
360
631k
  s[4] = _mm256_unpackhi_epi16(src_2, src_3);                                 \
361
631k
                                                                              \
362
7.62M
  for (i = 0; i < h; i += 2) {                                                \
363
6.99M
    const int16_t *data = &im_block[i * im_stride];                           \
364
6.99M
                                                                              \
365
6.99M
    const __m256i s6 = _mm256_loadu_si256((__m256i *)(data + 4 * im_stride)); \
366
6.99M
    const __m256i s7 = _mm256_loadu_si256((__m256i *)(data + 5 * im_stride)); \
367
6.99M
                                                                              \
368
6.99M
    s[2] = _mm256_unpacklo_epi16(s6, s7);                                     \
369
6.99M
    s[5] = _mm256_unpackhi_epi16(s6, s7);                                     \
370
6.99M
                                                                              \
371
6.99M
    __m256i res_a = convolve_6tap(s, coeffs_v);                               \
372
6.99M
    __m256i res_b = convolve_6tap(s + 3, coeffs_v);                           \
373
6.99M
                                                                              \
374
6.99M
    sr_2d_ver_round_and_store(res_a, res_b, dst_ptr, dst_stride,              \
375
6.99M
                              round_const_v);                                 \
376
6.99M
    dst_ptr += 2 * dst_stride;                                                \
377
6.99M
                                                                              \
378
6.99M
    s[0] = s[1];                                                              \
379
6.99M
    s[1] = s[2];                                                              \
380
6.99M
                                                                              \
381
6.99M
    s[3] = s[4];                                                              \
382
6.99M
    s[4] = s[5];                                                              \
383
6.99M
  }
384
385
#define CONVOLVE_SR_VERTICAL_FILTER_8TAP                                      \
386
136k
  __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));  \
387
136k
  __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));  \
388
136k
  __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));  \
389
136k
  __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));  \
390
136k
  __m256i src_4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride));  \
391
136k
  __m256i src_5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride));  \
392
136k
                                                                              \
393
136k
  __m256i s[8];                                                               \
394
136k
  s[0] = _mm256_unpacklo_epi16(src_0, src_1);                                 \
395
136k
  s[1] = _mm256_unpacklo_epi16(src_2, src_3);                                 \
396
136k
  s[2] = _mm256_unpacklo_epi16(src_4, src_5);                                 \
397
136k
                                                                              \
398
136k
  s[4] = _mm256_unpackhi_epi16(src_0, src_1);                                 \
399
136k
  s[5] = _mm256_unpackhi_epi16(src_2, src_3);                                 \
400
136k
  s[6] = _mm256_unpackhi_epi16(src_4, src_5);                                 \
401
136k
                                                                              \
402
1.80M
  for (i = 0; i < h; i += 2) {                                                \
403
1.66M
    const int16_t *data = &im_block[i * im_stride];                           \
404
1.66M
                                                                              \
405
1.66M
    const __m256i s6 = _mm256_loadu_si256((__m256i *)(data + 6 * im_stride)); \
406
1.66M
    const __m256i s7 = _mm256_loadu_si256((__m256i *)(data + 7 * im_stride)); \
407
1.66M
                                                                              \
408
1.66M
    s[3] = _mm256_unpacklo_epi16(s6, s7);                                     \
409
1.66M
    s[7] = _mm256_unpackhi_epi16(s6, s7);                                     \
410
1.66M
                                                                              \
411
1.66M
    __m256i res_a = convolve(s, coeffs_v);                                    \
412
1.66M
    __m256i res_b = convolve(s + 4, coeffs_v);                                \
413
1.66M
                                                                              \
414
1.66M
    sr_2d_ver_round_and_store(res_a, res_b, dst_ptr, dst_stride,              \
415
1.66M
                              round_const_v);                                 \
416
1.66M
    dst_ptr += 2 * dst_stride;                                                \
417
1.66M
                                                                              \
418
1.66M
    s[0] = s[1];                                                              \
419
1.66M
    s[1] = s[2];                                                              \
420
1.66M
    s[2] = s[3];                                                              \
421
1.66M
                                                                              \
422
1.66M
    s[4] = s[5];                                                              \
423
1.66M
    s[5] = s[6];                                                              \
424
1.66M
    s[6] = s[7];                                                              \
425
1.66M
  }
426
427
// Horizontal pass of the 12-tap single-reference 2D convolve, written as a
// statement macro that expands directly in the caller's scope.
//
// Names taken from the expansion site: i, j, w, im_h, src_ptr, src_stride,
// im_block, im_stride, coeffs_h, round_const_h12, round_shift_h12.
// Names declared by the macro: v_zero and s[12]. The expansion is not wrapped
// in a block, so both stay visible to the code that follows it. Only s[0..5]
// are written here.
//
// Source bytes are zero-extended to 16 bits, filtered with convolve_12taps(),
// rounded (add round_const_h12, arithmetic shift by round_shift_h12) and
// packed with signed saturation to 16 bits before being written to im_block.
//
// w <= 4: two rows (i and i + 1) are handled per iteration, one per 128-bit
//         lane. Row i + 1 is loaded and stored unconditionally. Each row
//         stores four 16-bit results when w > 2, otherwise two.
// w > 4:  one row per iteration. The lanes hold the 16 bytes at column j and
//         at column j + 4, giving eight results that are written with
//         _mm_store_si128, which requires a 16-byte aligned destination.
#define CONVOLVE_SR_HORIZONTAL_FILTER_12TAP                                    \
428
0
  const __m256i v_zero = _mm256_setzero_si256();                               \
429
0
  __m256i s[12];                                                               \
430
0
  if (w <= 4) {                                                                \
431
0
    for (i = 0; i < im_h; i += 2) {                                            \
432
0
      const __m256i data = _mm256_permute2x128_si256(                          \
433
0
          _mm256_castsi128_si256(                                              \
434
0
              _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride + j]))),     \
435
0
          _mm256_castsi128_si256(_mm_loadu_si128(                              \
436
0
              (__m128i *)(&src_ptr[i * src_stride + src_stride + j]))),        \
437
0
          0x20);                                                               \
438
0
      const __m256i s_16lo = _mm256_unpacklo_epi8(data, v_zero);               \
439
0
      const __m256i s_16hi = _mm256_unpackhi_epi8(data, v_zero);               \
440
0
      const __m256i s_lolo = _mm256_unpacklo_epi16(s_16lo, s_16lo);            \
441
0
      const __m256i s_lohi = _mm256_unpackhi_epi16(s_16lo, s_16lo);            \
442
0
                                                                               \
443
0
      const __m256i s_hilo = _mm256_unpacklo_epi16(s_16hi, s_16hi);            \
444
0
      const __m256i s_hihi = _mm256_unpackhi_epi16(s_16hi, s_16hi);            \
445
0
                                                                               \
446
0
      s[0] = _mm256_alignr_epi8(s_lohi, s_lolo, 2);                            \
447
0
      s[1] = _mm256_alignr_epi8(s_lohi, s_lolo, 10);                           \
448
0
      s[2] = _mm256_alignr_epi8(s_hilo, s_lohi, 2);                            \
449
0
      s[3] = _mm256_alignr_epi8(s_hilo, s_lohi, 10);                           \
450
0
      s[4] = _mm256_alignr_epi8(s_hihi, s_hilo, 2);                            \
451
0
      s[5] = _mm256_alignr_epi8(s_hihi, s_hilo, 10);                           \
452
0
                                                                               \
453
0
      const __m256i res_lo = convolve_12taps(s, coeffs_h);                     \
454
0
                                                                               \
455
0
      __m256i res_32b_lo = _mm256_sra_epi32(                                   \
456
0
          _mm256_add_epi32(res_lo, round_const_h12), round_shift_h12);         \
457
0
      __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo);         \
458
0
      const __m128i res_0 = _mm256_extracti128_si256(res_16b_lo, 0);           \
459
0
      const __m128i res_1 = _mm256_extracti128_si256(res_16b_lo, 1);           \
460
0
      if (w > 2) {                                                             \
461
0
        _mm_storel_epi64((__m128i *)&im_block[i * im_stride], res_0);          \
462
0
        _mm_storel_epi64((__m128i *)&im_block[i * im_stride + im_stride],      \
463
0
                         res_1);                                               \
464
0
      } else {                                                                 \
465
0
        uint32_t horiz_2;                                                      \
466
0
        horiz_2 = (uint32_t)_mm_cvtsi128_si32(res_0);                          \
467
0
        im_block[i * im_stride] = (uint16_t)horiz_2;                           \
468
0
        im_block[i * im_stride + 1] = (uint16_t)(horiz_2 >> 16);               \
469
0
        horiz_2 = (uint32_t)_mm_cvtsi128_si32(res_1);                          \
470
0
        im_block[i * im_stride + im_stride] = (uint16_t)horiz_2;               \
471
0
        im_block[i * im_stride + im_stride + 1] = (uint16_t)(horiz_2 >> 16);   \
472
0
      }                                                                        \
473
0
    }                                                                          \
474
0
  } else {                                                                     \
475
0
    for (i = 0; i < im_h; i++) {                                               \
476
0
      const __m256i data = _mm256_permute2x128_si256(                          \
477
0
          _mm256_castsi128_si256(                                              \
478
0
              _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride + j]))),     \
479
0
          _mm256_castsi128_si256(                                              \
480
0
              _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride + j + 4]))), \
481
0
          0x20);                                                               \
482
0
      const __m256i s_16lo = _mm256_unpacklo_epi8(data, v_zero);               \
483
0
      const __m256i s_16hi = _mm256_unpackhi_epi8(data, v_zero);               \
484
0
                                                                               \
485
0
      const __m256i s_lolo = _mm256_unpacklo_epi16(s_16lo, s_16lo);            \
486
0
      const __m256i s_lohi = _mm256_unpackhi_epi16(s_16lo, s_16lo);            \
487
0
                                                                               \
488
0
      const __m256i s_hilo = _mm256_unpacklo_epi16(s_16hi, s_16hi);            \
489
0
      const __m256i s_hihi = _mm256_unpackhi_epi16(s_16hi, s_16hi);            \
490
0
                                                                               \
491
0
      s[0] = _mm256_alignr_epi8(s_lohi, s_lolo, 2);                            \
492
0
      s[1] = _mm256_alignr_epi8(s_lohi, s_lolo, 10);                           \
493
0
      s[2] = _mm256_alignr_epi8(s_hilo, s_lohi, 2);                            \
494
0
      s[3] = _mm256_alignr_epi8(s_hilo, s_lohi, 10);                           \
495
0
      s[4] = _mm256_alignr_epi8(s_hihi, s_hilo, 2);                            \
496
0
      s[5] = _mm256_alignr_epi8(s_hihi, s_hilo, 10);                           \
497
0
                                                                               \
498
0
      const __m256i res_lo = convolve_12taps(s, coeffs_h);                     \
499
0
                                                                               \
500
0
      __m256i res_32b_lo = _mm256_sra_epi32(                                   \
501
0
          _mm256_add_epi32(res_lo, round_const_h12), round_shift_h12);         \
502
0
                                                                               \
503
0
      __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo);         \
504
0
      _mm_store_si128((__m128i *)&im_block[i * im_stride],                     \
505
0
                      _mm256_extracti128_si256(                                \
506
0
                          _mm256_permute4x64_epi64(res_16b_lo, 0x88), 0));     \
507
0
    }                                                                          \
508
0
  }
509
510
// Vertical pass of the 12-tap single-reference 2D convolve, written as a
// statement macro that expands directly in the caller's scope.
//
// Names taken from the expansion site: i, j, w, h, s (indices 0..11 are
// used), im_block, im_stride, dst, dst_stride, coeffs_v, sum_round_v,
// sum_shift_v, round_const_v, round_shift_v.
// Names declared by the macro outside any block: src_0 .. src_9.
//
// Setup loads ten 256-bit vectors starting at rows 0..9 of im_block and
// interleaves them pairwise: low halves into s[0..4], high halves into
// s[6..10]. Each loop iteration loads the vectors at rows i + 10 and i + 11
// into s[5] / s[11], filters both halves with convolve_12taps(), applies two
// rounding stages (sum_round_v / sum_shift_v, then round_const_v /
// round_shift_v), packs to unsigned 8 bits and writes the low lane to row i
// and the high lane to row i + 1 of dst at column j. The window in s[] then
// slides down by one row pair.
// NOTE(review): treating the two lanes as two consecutive rows presumably
// relies on im_stride being 8 int16 elements - confirm at the call site.
//
// Bytes stored per row: 8 when w - j > 4, 4 when w == 4, otherwise 2.
#define CONVOLVE_SR_VERTICAL_FILTER_12TAP                                      \
511
0
  __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));   \
512
0
  __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));   \
513
0
  __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));   \
514
0
  __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));   \
515
0
  __m256i src_4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride));   \
516
0
  __m256i src_5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride));   \
517
0
  __m256i src_6 = _mm256_loadu_si256((__m256i *)(im_block + 6 * im_stride));   \
518
0
  __m256i src_7 = _mm256_loadu_si256((__m256i *)(im_block + 7 * im_stride));   \
519
0
  __m256i src_8 = _mm256_loadu_si256((__m256i *)(im_block + 8 * im_stride));   \
520
0
  __m256i src_9 = _mm256_loadu_si256((__m256i *)(im_block + 9 * im_stride));   \
521
0
                                                                               \
522
0
  s[0] = _mm256_unpacklo_epi16(src_0, src_1);                                  \
523
0
  s[1] = _mm256_unpacklo_epi16(src_2, src_3);                                  \
524
0
  s[2] = _mm256_unpacklo_epi16(src_4, src_5);                                  \
525
0
  s[3] = _mm256_unpacklo_epi16(src_6, src_7);                                  \
526
0
  s[4] = _mm256_unpacklo_epi16(src_8, src_9);                                  \
527
0
                                                                               \
528
0
  s[6] = _mm256_unpackhi_epi16(src_0, src_1);                                  \
529
0
  s[7] = _mm256_unpackhi_epi16(src_2, src_3);                                  \
530
0
  s[8] = _mm256_unpackhi_epi16(src_4, src_5);                                  \
531
0
  s[9] = _mm256_unpackhi_epi16(src_6, src_7);                                  \
532
0
  s[10] = _mm256_unpackhi_epi16(src_8, src_9);                                 \
533
0
                                                                               \
534
0
  for (i = 0; i < h; i += 2) {                                                 \
535
0
    const int16_t *data = &im_block[i * im_stride];                            \
536
0
                                                                               \
537
0
    const __m256i s6 = _mm256_loadu_si256((__m256i *)(data + 10 * im_stride)); \
538
0
    const __m256i s7 = _mm256_loadu_si256((__m256i *)(data + 11 * im_stride)); \
539
0
                                                                               \
540
0
    s[5] = _mm256_unpacklo_epi16(s6, s7);                                      \
541
0
    s[11] = _mm256_unpackhi_epi16(s6, s7);                                     \
542
0
                                                                               \
543
0
    __m256i res_a = convolve_12taps(s, coeffs_v);                              \
544
0
    __m256i res_b = convolve_12taps(s + 6, coeffs_v);                          \
545
0
                                                                               \
546
0
    res_a =                                                                    \
547
0
        _mm256_sra_epi32(_mm256_add_epi32(res_a, sum_round_v), sum_shift_v);   \
548
0
    res_b =                                                                    \
549
0
        _mm256_sra_epi32(_mm256_add_epi32(res_b, sum_round_v), sum_shift_v);   \
550
0
                                                                               \
551
0
    const __m256i res_a_round = _mm256_sra_epi32(                              \
552
0
        _mm256_add_epi32(res_a, round_const_v), round_shift_v);                \
553
0
    const __m256i res_b_round = _mm256_sra_epi32(                              \
554
0
        _mm256_add_epi32(res_b, round_const_v), round_shift_v);                \
555
0
                                                                               \
556
0
    const __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round);    \
557
0
    const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit);          \
558
0
                                                                               \
559
0
    const __m128i res_0 = _mm256_castsi256_si128(res_8b);                      \
560
0
    const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);                 \
561
0
                                                                               \
562
0
    __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];                  \
563
0
    __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride];     \
564
0
    if (w - j > 4) {                                                           \
565
0
      _mm_storel_epi64(p_0, res_0);                                            \
566
0
      _mm_storel_epi64(p_1, res_1);                                            \
567
0
    } else if (w == 4) {                                                       \
568
0
      xx_storel_32(p_0, res_0);                                                \
569
0
      xx_storel_32(p_1, res_1);                                                \
570
0
    } else {                                                                   \
571
0
      *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);                   \
572
0
      *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);                   \
573
0
    }                                                                          \
574
0
                                                                               \
575
0
    s[0] = s[1];                                                               \
576
0
    s[1] = s[2];                                                               \
577
0
    s[2] = s[3];                                                               \
578
0
    s[3] = s[4];                                                               \
579
0
    s[4] = s[5];                                                               \
580
0
                                                                               \
581
0
    s[6] = s[7];                                                               \
582
0
    s[7] = s[8];                                                               \
583
0
    s[8] = s[9];                                                               \
584
0
    s[9] = s[10];                                                              \
585
0
    s[10] = s[11];                                                             \
586
0
  }
587
588
// Horizontal pass of the DIST_WTD 8-tap convolve, wrapped in do { } while (0)
// so it can be used as a single statement.
//
// Names taken from the expansion site: i, im_h, src_h, src_stride, coeffs_x,
// filt, round_const_h, round_shift_h, im_block, im_stride.
//
// Each iteration handles two source rows: the row at src_h goes into the low
// 128-bit lane, and the row at src_h + src_stride goes into the high lane
// only when i + 1 < im_h. On a final odd row the high lane keeps whatever
// _mm256_castsi128_si256 left there (its upper bits are undefined), and that
// lane is still filtered and stored. src_h is advanced by two rows per
// iteration, so the caller's pointer is modified.
//
// The filtered result is rounded in 16-bit arithmetic and written with
// _mm256_store_si256, which requires &im_block[i * im_stride] to be 32-byte
// aligned.
#define DIST_WTD_CONVOLVE_HORIZONTAL_FILTER_8TAP                        \
589
207k
  do {                                                                  \
590
3.11M
    for (i = 0; i < im_h; i += 2) {                                     \
591
2.91M
      __m256i data =                                                    \
592
2.91M
          _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)src_h));    \
593
2.91M
      if (i + 1 < im_h)                                                 \
594
2.91M
        data = _mm256_inserti128_si256(                                 \
595
2.91M
            data, _mm_loadu_si128((__m128i *)(src_h + src_stride)), 1); \
596
2.91M
      src_h += (src_stride << 1);                                       \
597
2.91M
      __m256i res = convolve_lowbd_x(data, coeffs_x, filt);             \
598
2.91M
                                                                        \
599
2.91M
      res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h),      \
600
2.91M
                             round_shift_h);                            \
601
2.91M
                                                                        \
602
2.91M
      _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);     \
603
2.91M
    }                                                                   \
604
207k
  } while (0)
605
606
// Vertical pass of the DIST_WTD 8-tap convolve, wrapped in do { } while (0)
// so it can be used as a single statement. It declares its own s[8].
//
// Names taken from the expansion site: i, j, w, h, im_block, im_stride,
// coeffs_y, round_const_v, round_shift_v, offset_const, do_average, dst,
// dst_stride, dst0, dst_stride0, wt, use_dist_wtd_comp_avg, rounding_const,
// rounding_shift.
//
// Setup loads the vectors at rows 0..5 of im_block and interleaves them
// pairwise (low halves into s[0..2], high halves into s[4..6]). Each loop
// iteration loads rows i + 6 and i + 7 into s[3] / s[7], filters, rounds and
// adds offset_const, then slides the window down by one row pair. The low
// lane of the result belongs to row i and the high lane to row i + 1.
//
// w - j > 4 : both halves are filtered (s and s + 4).
// otherwise : only the low half is filtered; the high-half convolve is
//             skipped.
//
// do_average != 0: the two rows already in dst are combined with the new
//                  result through comp_avg() and convolve_rounding(), packed
//                  to 8 bits and written to dst0 (8 bytes per row in the
//                  wide case, 4 bytes per row through an int pointer in the
//                  narrow case).
// do_average == 0: the 16-bit result is written to dst with _mm_store_si128
//                  in both width cases, so a full 16 bytes per row is stored
//                  and each destination row must be 16-byte aligned.
#define DIST_WTD_CONVOLVE_VERTICAL_FILTER_8TAP                                 \
607
252k
  do {                                                                         \
608
252k
    __m256i s[8];                                                              \
609
252k
    __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));    \
610
252k
    __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));    \
611
252k
    __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));    \
612
252k
    __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));    \
613
252k
    __m256i s4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride));    \
614
252k
    __m256i s5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride));    \
615
252k
                                                                               \
616
252k
    s[0] = _mm256_unpacklo_epi16(s0, s1);                                      \
617
252k
    s[1] = _mm256_unpacklo_epi16(s2, s3);                                      \
618
252k
    s[2] = _mm256_unpacklo_epi16(s4, s5);                                      \
619
252k
                                                                               \
620
252k
    s[4] = _mm256_unpackhi_epi16(s0, s1);                                      \
621
252k
    s[5] = _mm256_unpackhi_epi16(s2, s3);                                      \
622
252k
    s[6] = _mm256_unpackhi_epi16(s4, s5);                                      \
623
252k
                                                                               \
624
2.63M
    for (i = 0; i < h; i += 2) {                                               \
625
2.37M
      const int16_t *data = &im_block[i * im_stride];                          \
626
2.37M
                                                                               \
627
2.37M
      const __m256i s6 =                                                       \
628
2.37M
          _mm256_loadu_si256((__m256i *)(data + 6 * im_stride));               \
629
2.37M
      const __m256i s7 =                                                       \
630
2.37M
          _mm256_loadu_si256((__m256i *)(data + 7 * im_stride));               \
631
2.37M
                                                                               \
632
2.37M
      s[3] = _mm256_unpacklo_epi16(s6, s7);                                    \
633
2.37M
      s[7] = _mm256_unpackhi_epi16(s6, s7);                                    \
634
2.37M
                                                                               \
635
2.37M
      const __m256i res_a = convolve(s, coeffs_y);                             \
636
2.37M
      const __m256i res_a_round = _mm256_sra_epi32(                            \
637
2.37M
          _mm256_add_epi32(res_a, round_const_v), round_shift_v);              \
638
2.37M
                                                                               \
639
2.37M
      if (w - j > 4) {                                                         \
640
2.22M
        const __m256i res_b = convolve(s + 4, coeffs_y);                       \
641
2.22M
        const __m256i res_b_round = _mm256_sra_epi32(                          \
642
2.22M
            _mm256_add_epi32(res_b, round_const_v), round_shift_v);            \
643
2.22M
        const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_b_round);  \
644
2.22M
        const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const);  \
645
2.22M
                                                                               \
646
2.22M
        if (do_average) {                                                      \
647
954k
          const __m256i data_ref_0 =                                           \
648
954k
              load_line2_avx2(&dst[i * dst_stride + j],                        \
649
954k
                              &dst[i * dst_stride + j + dst_stride]);          \
650
954k
          const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned,    \
651
954k
                                                &wt, use_dist_wtd_comp_avg);   \
652
954k
                                                                               \
653
954k
          const __m256i round_result = convolve_rounding(                      \
654
954k
              &comp_avg_res, &offset_const, &rounding_const, rounding_shift);  \
655
954k
                                                                               \
656
954k
          const __m256i res_8 =                                                \
657
954k
              _mm256_packus_epi16(round_result, round_result);                 \
658
954k
          const __m128i res_0 = _mm256_castsi256_si128(res_8);                 \
659
954k
          const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);            \
660
954k
                                                                               \
661
954k
          _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);    \
662
954k
          _mm_storel_epi64(                                                    \
663
954k
              (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); \
664
1.27M
        } else {                                                               \
665
1.27M
          const __m128i res_0 = _mm256_castsi256_si128(res_unsigned);          \
666
1.27M
          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);       \
667
1.27M
                                                                               \
668
1.27M
          const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1);     \
669
1.27M
          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),  \
670
1.27M
                          res_1);                                              \
671
1.27M
        }                                                                      \
672
2.22M
      } else {                                                                 \
673
150k
        const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_a_round);  \
674
150k
        const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const);  \
675
150k
                                                                               \
676
150k
        if (do_average) {                                                      \
677
66.8k
          const __m256i data_ref_0 =                                           \
678
66.8k
              load_line2_avx2(&dst[i * dst_stride + j],                        \
679
66.8k
                              &dst[i * dst_stride + j + dst_stride]);          \
680
66.8k
                                                                               \
681
66.8k
          const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned,    \
682
66.8k
                                                &wt, use_dist_wtd_comp_avg);   \
683
66.8k
                                                                               \
684
66.8k
          const __m256i round_result = convolve_rounding(                      \
685
66.8k
              &comp_avg_res, &offset_const, &rounding_const, rounding_shift);  \
686
66.8k
                                                                               \
687
66.8k
          const __m256i res_8 =                                                \
688
66.8k
              _mm256_packus_epi16(round_result, round_result);                 \
689
66.8k
          const __m128i res_0 = _mm256_castsi256_si128(res_8);                 \
690
66.8k
          const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);            \
691
66.8k
                                                                               \
692
66.8k
          *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0);     \
693
66.8k
          *(int *)(&dst0[i * dst_stride0 + j + dst_stride0]) =                 \
694
66.8k
              _mm_cvtsi128_si32(res_1);                                        \
695
66.8k
                                                                               \
696
83.3k
        } else {                                                               \
697
83.3k
          const __m128i res_0 = _mm256_castsi256_si128(res_unsigned);          \
698
83.3k
          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);       \
699
83.3k
                                                                               \
700
83.3k
          const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1);     \
701
83.3k
          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),  \
702
83.3k
                          res_1);                                              \
703
83.3k
        }                                                                      \
704
150k
      }                                                                        \
705
2.37M
                                                                               \
706
2.37M
      s[0] = s[1];                                                             \
707
2.37M
      s[1] = s[2];                                                             \
708
2.37M
      s[2] = s[3];                                                             \
709
2.37M
                                                                               \
710
2.37M
      s[4] = s[5];                                                             \
711
2.37M
      s[5] = s[6];                                                             \
712
2.37M
      s[6] = s[7];                                                             \
713
2.37M
    }                                                                          \
714
252k
  } while (0)
715
716
// Builds the packed 8-bit coefficient vector for a 2-tap filter.
//
// Looks up the kernel selected by (subpel_q4 & SUBPEL_MASK), loads eight
// 16-bit coefficients from it, halves them, and writes to coeffs[0] a vector
// in which every 16-bit lane holds the low bytes of halved coefficients 3
// and 4 (0-based). Only coeffs[0] is written.
//
// The assert checks that bit 0 of every loaded coefficient is clear, i.e.
// that the halving is exact.
static inline void prepare_coeffs_2t_ssse3(
717
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
718
37.7k
    __m128i *const coeffs /* [4] */) {
719
37.7k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
720
37.7k
      filter_params, subpel_q4 & SUBPEL_MASK);
721
37.7k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
722
723
  // right shift all filter co-efficients by 1 to reduce the bits required.
724
  // This extra right shift will be taken care of at the end while rounding
725
  // the result.
726
  // Since all filter co-efficients are even, this change will not affect the
727
  // end result
728
37.7k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
729
37.7k
                            _mm_set1_epi16((short)0xffff)));
730
731
37.7k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
732
733
  // coeffs 3 4 3 4 3 4 3 4
734
37.7k
  coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0806u));
735
37.7k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:prepare_coeffs_2t_ssse3
Unexecuted instantiation: highbd_convolve_avx2.c:prepare_coeffs_2t_ssse3
convolve_2d_avx2.c:prepare_coeffs_2t_ssse3
Line
Count
Source
718
22.8k
    __m128i *const coeffs /* [4] */) {
719
22.8k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
720
22.8k
      filter_params, subpel_q4 & SUBPEL_MASK);
721
22.8k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
722
723
  // right shift all filter co-efficients by 1 to reduce the bits required.
724
  // This extra right shift will be taken care of at the end while rounding
725
  // the result.
726
  // Since all filter co-efficients are even, this change will not affect the
727
  // end result
728
22.8k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
729
22.8k
                            _mm_set1_epi16((short)0xffff)));
730
731
22.8k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
732
733
  // coeffs 3 4 3 4 3 4 3 4
734
22.8k
  coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0806u));
735
22.8k
}
convolve_avx2.c:prepare_coeffs_2t_ssse3
Line
Count
Source
718
14.8k
    __m128i *const coeffs /* [4] */) {
719
14.8k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
720
14.8k
      filter_params, subpel_q4 & SUBPEL_MASK);
721
14.8k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
722
723
  // right shift all filter co-efficients by 1 to reduce the bits required.
724
  // This extra right shift will be taken care of at the end while rounding
725
  // the result.
726
  // Since all filter co-efficients are even, this change will not affect the
727
  // end result
728
14.8k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
729
14.8k
                            _mm_set1_epi16((short)0xffff)));
730
731
14.8k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
732
733
  // coeffs 3 4 3 4 3 4 3 4
734
14.8k
  coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0806u));
735
14.8k
}
Unexecuted instantiation: jnt_convolve_avx2.c:prepare_coeffs_2t_ssse3
Unexecuted instantiation: wiener_convolve_avx2.c:prepare_coeffs_2t_ssse3
Unexecuted instantiation: highbd_convolve_2d_avx2.c:prepare_coeffs_2t_ssse3
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:prepare_coeffs_2t_ssse3
736
737
// Builds the packed 8-bit coefficient vectors for a 4-tap filter.
//
// Looks up the kernel selected by (subpel_q4 & SUBPEL_MASK), loads eight
// 16-bit coefficients from it, halves them, and writes two vectors:
//   coeffs[0] - every 16-bit lane holds the low bytes of coefficients 2, 3
//   coeffs[1] - every 16-bit lane holds the low bytes of coefficients 4, 5
// (0-based indices into the loaded kernel). Only coeffs[0] and coeffs[1] are
// written.
//
// The assert checks that bit 0 of every loaded coefficient is clear, i.e.
// that the halving is exact.
static inline void prepare_coeffs_4t_ssse3(
738
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
739
828k
    __m128i *const coeffs /* [4] */) {
740
828k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
741
828k
      filter_params, subpel_q4 & SUBPEL_MASK);
742
828k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
743
744
  // right shift all filter co-efficients by 1 to reduce the bits required.
745
  // This extra right shift will be taken care of at the end while rounding
746
  // the result.
747
  // Since all filter co-efficients are even, this change will not affect the
748
  // end result
749
828k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
750
828k
                            _mm_set1_epi16((short)0xffff)));
751
752
828k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
753
754
  // coeffs 2 3 2 3 2 3 2 3
755
828k
  coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0604u));
756
  // coeffs 4 5 4 5 4 5 4 5
757
828k
  coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0a08u));
758
828k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:prepare_coeffs_4t_ssse3
Unexecuted instantiation: highbd_convolve_avx2.c:prepare_coeffs_4t_ssse3
convolve_2d_avx2.c:prepare_coeffs_4t_ssse3
Line
Count
Source
739
528k
    __m128i *const coeffs /* [4] */) {
740
528k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
741
528k
      filter_params, subpel_q4 & SUBPEL_MASK);
742
528k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
743
744
  // right shift all filter co-efficients by 1 to reduce the bits required.
745
  // This extra right shift will be taken care of at the end while rounding
746
  // the result.
747
  // Since all filter co-efficients are even, this change will not affect the
748
  // end result
749
528k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
750
528k
                            _mm_set1_epi16((short)0xffff)));
751
752
528k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
753
754
  // coeffs 2 3 2 3 2 3 2 3
755
528k
  coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0604u));
756
  // coeffs 4 5 4 5 4 5 4 5
757
528k
  coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0a08u));
758
528k
}
convolve_avx2.c:prepare_coeffs_4t_ssse3
Line
Count
Source
739
300k
    __m128i *const coeffs /* [4] */) {
740
300k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
741
300k
      filter_params, subpel_q4 & SUBPEL_MASK);
742
300k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
743
744
  // right shift all filter co-efficients by 1 to reduce the bits required.
745
  // This extra right shift will be taken care of at the end while rounding
746
  // the result.
747
  // Since all filter co-efficients are even, this change will not affect the
748
  // end result
749
300k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
750
300k
                            _mm_set1_epi16((short)0xffff)));
751
752
300k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
753
754
  // coeffs 2 3 2 3 2 3 2 3
755
300k
  coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0604u));
756
  // coeffs 4 5 4 5 4 5 4 5
757
300k
  coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0a08u));
758
300k
}
Unexecuted instantiation: jnt_convolve_avx2.c:prepare_coeffs_4t_ssse3
Unexecuted instantiation: wiener_convolve_avx2.c:prepare_coeffs_4t_ssse3
Unexecuted instantiation: highbd_convolve_2d_avx2.c:prepare_coeffs_4t_ssse3
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:prepare_coeffs_4t_ssse3
759
760
static inline void prepare_coeffs_6t_ssse3(
761
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
762
61.9k
    __m128i *const coeffs /* [4] */) {
763
61.9k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
764
61.9k
      filter_params, subpel_q4 & SUBPEL_MASK);
765
61.9k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
766
767
  // right shift all filter co-efficients by 1 to reduce the bits required.
768
  // This extra right shift will be taken care of at the end while rounding
769
  // the result.
770
  // Since all filter co-efficients are even, this change will not affect the
771
  // end result
772
61.9k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
773
61.9k
                            _mm_set1_epi16((short)0xffff)));
774
775
61.9k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
776
777
  // coeffs 1 2 1 2 1 2 1 2
778
61.9k
  coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0402u));
779
  // coeffs 3 4 3 4 3 4 3 4
780
61.9k
  coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0806u));
781
  // coeffs 5 6 5 6 5 6 5 6
782
61.9k
  coeffs[2] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0c0au));
783
61.9k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:prepare_coeffs_6t_ssse3
Unexecuted instantiation: highbd_convolve_avx2.c:prepare_coeffs_6t_ssse3
Unexecuted instantiation: convolve_2d_avx2.c:prepare_coeffs_6t_ssse3
convolve_avx2.c:prepare_coeffs_6t_ssse3
Line
Count
Source
762
61.9k
    __m128i *const coeffs /* [4] */) {
763
61.9k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
764
61.9k
      filter_params, subpel_q4 & SUBPEL_MASK);
765
61.9k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
766
767
  // right shift all filter co-efficients by 1 to reduce the bits required.
768
  // This extra right shift will be taken care of at the end while rounding
769
  // the result.
770
  // Since all filter co-efficients are even, this change will not affect the
771
  // end result
772
61.9k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
773
61.9k
                            _mm_set1_epi16((short)0xffff)));
774
775
61.9k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
776
777
  // coeffs 1 2 1 2 1 2 1 2
778
61.9k
  coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0402u));
779
  // coeffs 3 4 3 4 3 4 3 4
780
61.9k
  coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0806u));
781
  // coeffs 5 6 5 6 5 6 5 6
782
61.9k
  coeffs[2] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0c0au));
783
61.9k
}
Unexecuted instantiation: jnt_convolve_avx2.c:prepare_coeffs_6t_ssse3
Unexecuted instantiation: wiener_convolve_avx2.c:prepare_coeffs_6t_ssse3
Unexecuted instantiation: highbd_convolve_2d_avx2.c:prepare_coeffs_6t_ssse3
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:prepare_coeffs_6t_ssse3
784
785
static inline void prepare_coeffs_ssse3(
786
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
787
5.08k
    __m128i *const coeffs /* [4] */) {
788
5.08k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
789
5.08k
      filter_params, subpel_q4 & SUBPEL_MASK);
790
5.08k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
791
792
  // right shift all filter co-efficients by 1 to reduce the bits required.
793
  // This extra right shift will be taken care of at the end while rounding
794
  // the result.
795
  // Since all filter co-efficients are even, this change will not affect the
796
  // end result
797
5.08k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
798
5.08k
                            _mm_set1_epi16((short)0xffff)));
799
800
5.08k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
801
802
  // coeffs 0 1 0 1 0 1 0 1
803
5.08k
  coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0200u));
804
  // coeffs 2 3 2 3 2 3 2 3
805
5.08k
  coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0604u));
806
  // coeffs 4 5 4 5 4 5 4 5
807
5.08k
  coeffs[2] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0a08u));
808
  // coeffs 6 7 6 7 6 7 6 7
809
5.08k
  coeffs[3] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0e0cu));
810
5.08k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:prepare_coeffs_ssse3
Unexecuted instantiation: highbd_convolve_avx2.c:prepare_coeffs_ssse3
Unexecuted instantiation: convolve_2d_avx2.c:prepare_coeffs_ssse3
convolve_avx2.c:prepare_coeffs_ssse3
Line
Count
Source
787
5.08k
    __m128i *const coeffs /* [4] */) {
788
5.08k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
789
5.08k
      filter_params, subpel_q4 & SUBPEL_MASK);
790
5.08k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
791
792
  // right shift all filter co-efficients by 1 to reduce the bits required.
793
  // This extra right shift will be taken care of at the end while rounding
794
  // the result.
795
  // Since all filter co-efficients are even, this change will not affect the
796
  // end result
797
5.08k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
798
5.08k
                            _mm_set1_epi16((short)0xffff)));
799
800
5.08k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
801
802
  // coeffs 0 1 0 1 0 1 0 1
803
5.08k
  coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0200u));
804
  // coeffs 2 3 2 3 2 3 2 3
805
5.08k
  coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0604u));
806
  // coeffs 4 5 4 5 4 5 4 5
807
5.08k
  coeffs[2] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0a08u));
808
  // coeffs 6 7 6 7 6 7 6 7
809
5.08k
  coeffs[3] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0e0cu));
810
5.08k
}
Unexecuted instantiation: jnt_convolve_avx2.c:prepare_coeffs_ssse3
Unexecuted instantiation: wiener_convolve_avx2.c:prepare_coeffs_ssse3
Unexecuted instantiation: highbd_convolve_2d_avx2.c:prepare_coeffs_ssse3
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:prepare_coeffs_ssse3
811
812
static inline void prepare_coeffs_2t_lowbd(
813
    const InterpFilterParams *const filter_params, const int subpel_q4,
814
30.4k
    __m256i *const coeffs /* [4] */) {
815
30.4k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
816
30.4k
      filter_params, subpel_q4 & SUBPEL_MASK);
817
30.4k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
818
30.4k
  const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);
819
820
  // right shift all filter co-efficients by 1 to reduce the bits required.
821
  // This extra right shift will be taken care of at the end while rounding
822
  // the result.
823
  // Since all filter co-efficients are even, this change will not affect the
824
  // end result
825
30.4k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
826
30.4k
                            _mm_set1_epi16((int16_t)0xffff)));
827
828
30.4k
  const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);
829
830
  // coeffs 3 4 3 4 3 4 3 4
831
30.4k
  coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0806u));
832
30.4k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:prepare_coeffs_2t_lowbd
Unexecuted instantiation: highbd_convolve_avx2.c:prepare_coeffs_2t_lowbd
convolve_2d_avx2.c:prepare_coeffs_2t_lowbd
Line
Count
Source
814
20.4k
    __m256i *const coeffs /* [4] */) {
815
20.4k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
816
20.4k
      filter_params, subpel_q4 & SUBPEL_MASK);
817
20.4k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
818
20.4k
  const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);
819
820
  // right shift all filter co-efficients by 1 to reduce the bits required.
821
  // This extra right shift will be taken care of at the end while rounding
822
  // the result.
823
  // Since all filter co-efficients are even, this change will not affect the
824
  // end result
825
20.4k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
826
20.4k
                            _mm_set1_epi16((int16_t)0xffff)));
827
828
20.4k
  const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);
829
830
  // coeffs 3 4 3 4 3 4 3 4
831
20.4k
  coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0806u));
832
20.4k
}
convolve_avx2.c:prepare_coeffs_2t_lowbd
Line
Count
Source
814
9.94k
    __m256i *const coeffs /* [4] */) {
815
9.94k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
816
9.94k
      filter_params, subpel_q4 & SUBPEL_MASK);
817
9.94k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
818
9.94k
  const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);
819
820
  // right shift all filter co-efficients by 1 to reduce the bits required.
821
  // This extra right shift will be taken care of at the end while rounding
822
  // the result.
823
  // Since all filter co-efficients are even, this change will not affect the
824
  // end result
825
9.94k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
826
9.94k
                            _mm_set1_epi16((int16_t)0xffff)));
827
828
9.94k
  const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);
829
830
  // coeffs 3 4 3 4 3 4 3 4
831
9.94k
  coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0806u));
832
9.94k
}
Unexecuted instantiation: jnt_convolve_avx2.c:prepare_coeffs_2t_lowbd
Unexecuted instantiation: wiener_convolve_avx2.c:prepare_coeffs_2t_lowbd
Unexecuted instantiation: highbd_convolve_2d_avx2.c:prepare_coeffs_2t_lowbd
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:prepare_coeffs_2t_lowbd
833
834
static inline void prepare_coeffs_4t_lowbd(
835
    const InterpFilterParams *const filter_params, const int subpel_q4,
836
163k
    __m256i *const coeffs /* [4] */) {
837
163k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
838
163k
      filter_params, subpel_q4 & SUBPEL_MASK);
839
163k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
840
163k
  const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);
841
842
  // right shift all filter co-efficients by 1 to reduce the bits required.
843
  // This extra right shift will be taken care of at the end while rounding
844
  // the result.
845
  // Since all filter co-efficients are even, this change will not affect the
846
  // end result
847
163k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
848
163k
                            _mm_set1_epi16((short)0xffff)));
849
850
163k
  const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);
851
852
  // coeffs 2 3 2 3 2 3 2 3
853
163k
  coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0604u));
854
  // coeffs 4 5 4 5 4 5 4 5
855
163k
  coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0a08u));
856
163k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:prepare_coeffs_4t_lowbd
Unexecuted instantiation: highbd_convolve_avx2.c:prepare_coeffs_4t_lowbd
convolve_2d_avx2.c:prepare_coeffs_4t_lowbd
Line
Count
Source
836
39.7k
    __m256i *const coeffs /* [4] */) {
837
39.7k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
838
39.7k
      filter_params, subpel_q4 & SUBPEL_MASK);
839
39.7k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
840
39.7k
  const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);
841
842
  // right shift all filter co-efficients by 1 to reduce the bits required.
843
  // This extra right shift will be taken care of at the end while rounding
844
  // the result.
845
  // Since all filter co-efficients are even, this change will not affect the
846
  // end result
847
39.7k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
848
39.7k
                            _mm_set1_epi16((short)0xffff)));
849
850
39.7k
  const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);
851
852
  // coeffs 2 3 2 3 2 3 2 3
853
39.7k
  coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0604u));
854
  // coeffs 4 5 4 5 4 5 4 5
855
39.7k
  coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0a08u));
856
39.7k
}
convolve_avx2.c:prepare_coeffs_4t_lowbd
Line
Count
Source
836
123k
    __m256i *const coeffs /* [4] */) {
837
123k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
838
123k
      filter_params, subpel_q4 & SUBPEL_MASK);
839
123k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
840
123k
  const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);
841
842
  // right shift all filter co-efficients by 1 to reduce the bits required.
843
  // This extra right shift will be taken care of at the end while rounding
844
  // the result.
845
  // Since all filter co-efficients are even, this change will not affect the
846
  // end result
847
123k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
848
123k
                            _mm_set1_epi16((short)0xffff)));
849
850
123k
  const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);
851
852
  // coeffs 2 3 2 3 2 3 2 3
853
123k
  coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0604u));
854
  // coeffs 4 5 4 5 4 5 4 5
855
123k
  coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0a08u));
856
123k
}
Unexecuted instantiation: jnt_convolve_avx2.c:prepare_coeffs_4t_lowbd
Unexecuted instantiation: wiener_convolve_avx2.c:prepare_coeffs_4t_lowbd
Unexecuted instantiation: highbd_convolve_2d_avx2.c:prepare_coeffs_4t_lowbd
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:prepare_coeffs_4t_lowbd
857
858
static inline void prepare_coeffs_6t_lowbd(
859
    const InterpFilterParams *const filter_params, const int subpel_q4,
860
978k
    __m256i *const coeffs /* [4] */) {
861
978k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
862
978k
      filter_params, subpel_q4 & SUBPEL_MASK);
863
978k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
864
978k
  const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);
865
866
  // right shift all filter co-efficients by 1 to reduce the bits required.
867
  // This extra right shift will be taken care of at the end while rounding
868
  // the result.
869
  // Since all filter co-efficients are even, this change will not affect the
870
  // end result
871
978k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
872
978k
                            _mm_set1_epi16((int16_t)0xffff)));
873
874
978k
  const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);
875
876
  // coeffs 1 2 1 2 1 2 1 2
877
978k
  coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0402u));
878
  // coeffs 3 4 3 4 3 4 3 4
879
978k
  coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0806u));
880
  // coeffs 5 6 5 6 5 6 5 6
881
978k
  coeffs[2] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0c0au));
882
978k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:prepare_coeffs_6t_lowbd
Unexecuted instantiation: highbd_convolve_avx2.c:prepare_coeffs_6t_lowbd
convolve_2d_avx2.c:prepare_coeffs_6t_lowbd
Line
Count
Source
860
595k
    __m256i *const coeffs /* [4] */) {
861
595k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
862
595k
      filter_params, subpel_q4 & SUBPEL_MASK);
863
595k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
864
595k
  const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);
865
866
  // right shift all filter co-efficients by 1 to reduce the bits required.
867
  // This extra right shift will be taken care of at the end while rounding
868
  // the result.
869
  // Since all filter co-efficients are even, this change will not affect the
870
  // end result
871
595k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
872
595k
                            _mm_set1_epi16((int16_t)0xffff)));
873
874
594k
  const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);
875
876
  // coeffs 1 2 1 2 1 2 1 2
877
594k
  coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0402u));
878
  // coeffs 3 4 3 4 3 4 3 4
879
594k
  coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0806u));
880
  // coeffs 5 6 5 6 5 6 5 6
881
594k
  coeffs[2] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0c0au));
882
594k
}
convolve_avx2.c:prepare_coeffs_6t_lowbd
Line
Count
Source
860
383k
    __m256i *const coeffs /* [4] */) {
861
383k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
862
383k
      filter_params, subpel_q4 & SUBPEL_MASK);
863
383k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
864
383k
  const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);
865
866
  // right shift all filter co-efficients by 1 to reduce the bits required.
867
  // This extra right shift will be taken care of at the end while rounding
868
  // the result.
869
  // Since all filter co-efficients are even, this change will not affect the
870
  // end result
871
383k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
872
383k
                            _mm_set1_epi16((int16_t)0xffff)));
873
874
383k
  const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);
875
876
  // coeffs 1 2 1 2 1 2 1 2
877
383k
  coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0402u));
878
  // coeffs 3 4 3 4 3 4 3 4
879
383k
  coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0806u));
880
  // coeffs 5 6 5 6 5 6 5 6
881
383k
  coeffs[2] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0c0au));
882
383k
}
Unexecuted instantiation: jnt_convolve_avx2.c:prepare_coeffs_6t_lowbd
Unexecuted instantiation: wiener_convolve_avx2.c:prepare_coeffs_6t_lowbd
Unexecuted instantiation: highbd_convolve_2d_avx2.c:prepare_coeffs_6t_lowbd
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:prepare_coeffs_6t_lowbd
883
884
static inline void prepare_coeffs_lowbd(
885
    const InterpFilterParams *const filter_params, const int subpel_q4,
886
427k
    __m256i *const coeffs /* [4] */) {
887
427k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
888
427k
      filter_params, subpel_q4 & SUBPEL_MASK);
889
427k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
890
427k
  const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);
891
892
  // right shift all filter co-efficients by 1 to reduce the bits required.
893
  // This extra right shift will be taken care of at the end while rounding
894
  // the result.
895
  // Since all filter co-efficients are even, this change will not affect the
896
  // end result
897
427k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
898
427k
                            _mm_set1_epi16((short)0xffff)));
899
900
427k
  const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);
901
902
  // coeffs 0 1 0 1 0 1 0 1
903
427k
  coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0200u));
904
  // coeffs 2 3 2 3 2 3 2 3
905
427k
  coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0604u));
906
  // coeffs 4 5 4 5 4 5 4 5
907
427k
  coeffs[2] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0a08u));
908
  // coeffs 6 7 6 7 6 7 6 7
909
427k
  coeffs[3] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0e0cu));
910
427k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:prepare_coeffs_lowbd
Unexecuted instantiation: highbd_convolve_avx2.c:prepare_coeffs_lowbd
convolve_2d_avx2.c:prepare_coeffs_lowbd
Line
Count
Source
886
49.5k
    __m256i *const coeffs /* [4] */) {
887
49.5k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
888
49.5k
      filter_params, subpel_q4 & SUBPEL_MASK);
889
49.5k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
890
49.5k
  const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);
891
892
  // right shift all filter co-efficients by 1 to reduce the bits required.
893
  // This extra right shift will be taken care of at the end while rounding
894
  // the result.
895
  // Since all filter co-efficients are even, this change will not affect the
896
  // end result
897
49.5k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
898
49.5k
                            _mm_set1_epi16((short)0xffff)));
899
900
49.5k
  const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);
901
902
  // coeffs 0 1 0 1 0 1 0 1
903
49.5k
  coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0200u));
904
  // coeffs 2 3 2 3 2 3 2 3
905
49.5k
  coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0604u));
906
  // coeffs 4 5 4 5 4 5 4 5
907
49.5k
  coeffs[2] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0a08u));
908
  // coeffs 6 7 6 7 6 7 6 7
909
49.5k
  coeffs[3] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0e0cu));
910
49.5k
}
convolve_avx2.c:prepare_coeffs_lowbd
Line
Count
Source
886
35.4k
    __m256i *const coeffs /* [4] */) {
887
35.4k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
888
35.4k
      filter_params, subpel_q4 & SUBPEL_MASK);
889
35.4k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
890
35.4k
  const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);
891
892
  // right shift all filter co-efficients by 1 to reduce the bits required.
893
  // This extra right shift will be taken care of at the end while rounding
894
  // the result.
895
  // Since all filter co-efficients are even, this change will not affect the
896
  // end result
897
35.4k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
898
35.4k
                            _mm_set1_epi16((short)0xffff)));
899
900
35.4k
  const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);
901
902
  // coeffs 0 1 0 1 0 1 0 1
903
35.4k
  coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0200u));
904
  // coeffs 2 3 2 3 2 3 2 3
905
35.4k
  coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0604u));
906
  // coeffs 4 5 4 5 4 5 4 5
907
35.4k
  coeffs[2] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0a08u));
908
  // coeffs 6 7 6 7 6 7 6 7
909
35.4k
  coeffs[3] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0e0cu));
910
35.4k
}
jnt_convolve_avx2.c:prepare_coeffs_lowbd
Line
Count
Source
886
342k
    __m256i *const coeffs /* [4] */) {
887
342k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
888
342k
      filter_params, subpel_q4 & SUBPEL_MASK);
889
342k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
890
342k
  const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);
891
892
  // right shift all filter co-efficients by 1 to reduce the bits required.
893
  // This extra right shift will be taken care of at the end while rounding
894
  // the result.
895
  // Since all filter co-efficients are even, this change will not affect the
896
  // end result
897
342k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
898
342k
                            _mm_set1_epi16((short)0xffff)));
899
900
342k
  const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);
901
902
  // coeffs 0 1 0 1 0 1 0 1
903
342k
  coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0200u));
904
  // coeffs 2 3 2 3 2 3 2 3
905
342k
  coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0604u));
906
  // coeffs 4 5 4 5 4 5 4 5
907
342k
  coeffs[2] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0a08u));
908
  // coeffs 6 7 6 7 6 7 6 7
909
342k
  coeffs[3] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0e0cu));
910
342k
}
Unexecuted instantiation: wiener_convolve_avx2.c:prepare_coeffs_lowbd
Unexecuted instantiation: highbd_convolve_2d_avx2.c:prepare_coeffs_lowbd
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:prepare_coeffs_lowbd
911
912
static inline void prepare_coeffs_2t(
913
    const InterpFilterParams *const filter_params, const int subpel_q4,
914
43.3k
    __m256i *const coeffs /* [4] */) {
915
43.3k
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
916
43.3k
      filter_params, subpel_q4 & SUBPEL_MASK);
917
918
43.3k
  const __m128i coeff_8 = _mm_loadu_si128((__m128i *)(filter + 1));
919
43.3k
  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
920
921
  // coeffs 3 4 3 4 3 4 3 4
922
43.3k
  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x55);
923
43.3k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:prepare_coeffs_2t
Unexecuted instantiation: highbd_convolve_avx2.c:prepare_coeffs_2t
convolve_2d_avx2.c:prepare_coeffs_2t
Line
Count
Source
914
43.3k
    __m256i *const coeffs /* [4] */) {
915
43.3k
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
916
43.3k
      filter_params, subpel_q4 & SUBPEL_MASK);
917
918
43.3k
  const __m128i coeff_8 = _mm_loadu_si128((__m128i *)(filter + 1));
919
43.3k
  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
920
921
  // coeffs 3 4 3 4 3 4 3 4
922
  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x55);
923
43.3k
}
Unexecuted instantiation: convolve_avx2.c:prepare_coeffs_2t
Unexecuted instantiation: jnt_convolve_avx2.c:prepare_coeffs_2t
Unexecuted instantiation: wiener_convolve_avx2.c:prepare_coeffs_2t
Unexecuted instantiation: highbd_convolve_2d_avx2.c:prepare_coeffs_2t
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:prepare_coeffs_2t
924
925
static inline void prepare_coeffs_4t(
926
    const InterpFilterParams *const filter_params, const int subpel_q4,
927
670k
    __m256i *const coeffs /* [4] */) {
928
670k
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
929
670k
      filter_params, subpel_q4 & SUBPEL_MASK);
930
931
670k
  const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter);
932
670k
  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
933
  // coeffs 2 3 2 3 2 3 2 3
934
670k
  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x55);
935
  // coeffs 4 5 4 5 4 5 4 5
936
670k
  coeffs[1] = _mm256_shuffle_epi32(coeff, 0xaa);
937
670k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:prepare_coeffs_4t
Unexecuted instantiation: highbd_convolve_avx2.c:prepare_coeffs_4t
convolve_2d_avx2.c:prepare_coeffs_4t
Line
Count
Source
927
670k
    __m256i *const coeffs /* [4] */) {
928
670k
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
929
670k
      filter_params, subpel_q4 & SUBPEL_MASK);
930
931
670k
  const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter);
932
670k
  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
933
  // coeffs 2 3 2 3 2 3 2 3
934
670k
  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x55);
935
  // coeffs 4 5 4 5 4 5 4 5
936
  coeffs[1] = _mm256_shuffle_epi32(coeff, 0xaa);
937
670k
}
Unexecuted instantiation: convolve_avx2.c:prepare_coeffs_4t
Unexecuted instantiation: jnt_convolve_avx2.c:prepare_coeffs_4t
Unexecuted instantiation: wiener_convolve_avx2.c:prepare_coeffs_4t
Unexecuted instantiation: highbd_convolve_2d_avx2.c:prepare_coeffs_4t
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:prepare_coeffs_4t
938
939
static inline void prepare_coeffs_6t(
940
    const InterpFilterParams *const filter_params, const int subpel_q4,
941
495k
    __m256i *const coeffs /* [4] */) {
942
495k
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
943
495k
      filter_params, subpel_q4 & SUBPEL_MASK);
944
945
495k
  const __m128i coeff_8 = _mm_loadu_si128((__m128i *)(filter + 1));
946
495k
  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
947
948
  // coeffs 1 2 1 2 1 2 1 2
949
495k
  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
950
  // coeffs 3 4 3 4 3 4 3 4
951
495k
  coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55);
952
  // coeffs 5 6 5 6 5 6 5 6
953
495k
  coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa);
954
495k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:prepare_coeffs_6t
Unexecuted instantiation: highbd_convolve_avx2.c:prepare_coeffs_6t
convolve_2d_avx2.c:prepare_coeffs_6t
Line
Count
Source
941
495k
    __m256i *const coeffs /* [4] */) {
942
495k
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
943
495k
      filter_params, subpel_q4 & SUBPEL_MASK);
944
945
495k
  const __m128i coeff_8 = _mm_loadu_si128((__m128i *)(filter + 1));
946
495k
  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
947
948
  // coeffs 1 2 1 2 1 2 1 2
949
495k
  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
950
  // coeffs 3 4 3 4 3 4 3 4
951
495k
  coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55);
952
  // coeffs 5 6 5 6 5 6 5 6
953
  coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa);
954
495k
}
Unexecuted instantiation: convolve_avx2.c:prepare_coeffs_6t
Unexecuted instantiation: jnt_convolve_avx2.c:prepare_coeffs_6t
Unexecuted instantiation: wiener_convolve_avx2.c:prepare_coeffs_6t
Unexecuted instantiation: highbd_convolve_2d_avx2.c:prepare_coeffs_6t
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:prepare_coeffs_6t
955
956
static inline void prepare_coeffs(const InterpFilterParams *const filter_params,
957
                                  const int subpel_q4,
958
7.67M
                                  __m256i *const coeffs /* [4] */) {
959
7.67M
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
960
7.67M
      filter_params, subpel_q4 & SUBPEL_MASK);
961
962
7.67M
  const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter);
963
7.67M
  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
964
965
  // coeffs 0 1 0 1 0 1 0 1
966
7.67M
  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
967
  // coeffs 2 3 2 3 2 3 2 3
968
7.67M
  coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55);
969
  // coeffs 4 5 4 5 4 5 4 5
970
7.67M
  coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa);
971
  // coeffs 6 7 6 7 6 7 6 7
972
7.67M
  coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff);
973
7.67M
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:prepare_coeffs
highbd_convolve_avx2.c:prepare_coeffs
Line
Count
Source
958
1.59M
                                  __m256i *const coeffs /* [4] */) {
959
1.59M
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
960
1.59M
      filter_params, subpel_q4 & SUBPEL_MASK);
961
962
1.59M
  const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter);
963
1.59M
  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
964
965
  // coeffs 0 1 0 1 0 1 0 1
966
1.59M
  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
967
  // coeffs 2 3 2 3 2 3 2 3
968
1.59M
  coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55);
969
  // coeffs 4 5 4 5 4 5 4 5
970
1.59M
  coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa);
971
  // coeffs 6 7 6 7 6 7 6 7
972
  coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff);
973
1.59M
}
convolve_2d_avx2.c:prepare_coeffs
Line
Count
Source
958
46.4k
                                  __m256i *const coeffs /* [4] */) {
959
46.4k
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
960
46.4k
      filter_params, subpel_q4 & SUBPEL_MASK);
961
962
46.4k
  const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter);
963
46.4k
  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
964
965
  // coeffs 0 1 0 1 0 1 0 1
966
46.4k
  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
967
  // coeffs 2 3 2 3 2 3 2 3
968
46.4k
  coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55);
969
  // coeffs 4 5 4 5 4 5 4 5
970
46.4k
  coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa);
971
  // coeffs 6 7 6 7 6 7 6 7
972
  coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff);
973
46.4k
}
Unexecuted instantiation: convolve_avx2.c:prepare_coeffs
jnt_convolve_avx2.c:prepare_coeffs
Line
Count
Source
958
173k
                                  __m256i *const coeffs /* [4] */) {
959
173k
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
960
173k
      filter_params, subpel_q4 & SUBPEL_MASK);
961
962
173k
  const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter);
963
173k
  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
964
965
  // coeffs 0 1 0 1 0 1 0 1
966
173k
  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
967
  // coeffs 2 3 2 3 2 3 2 3
968
173k
  coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55);
969
  // coeffs 4 5 4 5 4 5 4 5
970
173k
  coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa);
971
  // coeffs 6 7 6 7 6 7 6 7
972
  coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff);
973
173k
}
Unexecuted instantiation: wiener_convolve_avx2.c:prepare_coeffs
highbd_convolve_2d_avx2.c:prepare_coeffs
Line
Count
Source
958
5.16M
                                  __m256i *const coeffs /* [4] */) {
959
5.16M
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
960
5.16M
      filter_params, subpel_q4 & SUBPEL_MASK);
961
962
5.16M
  const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter);
963
5.16M
  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
964
965
  // coeffs 0 1 0 1 0 1 0 1
966
5.16M
  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
967
  // coeffs 2 3 2 3 2 3 2 3
968
5.16M
  coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55);
969
  // coeffs 4 5 4 5 4 5 4 5
970
5.16M
  coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa);
971
  // coeffs 6 7 6 7 6 7 6 7
972
  coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff);
973
5.16M
}
highbd_jnt_convolve_avx2.c:prepare_coeffs
Line
Count
Source
958
693k
                                  __m256i *const coeffs /* [4] */) {
959
693k
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
960
693k
      filter_params, subpel_q4 & SUBPEL_MASK);
961
962
693k
  const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter);
963
693k
  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
964
965
  // coeffs 0 1 0 1 0 1 0 1
966
693k
  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
967
  // coeffs 2 3 2 3 2 3 2 3
968
693k
  coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55);
969
  // coeffs 4 5 4 5 4 5 4 5
970
693k
  coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa);
971
  // coeffs 6 7 6 7 6 7 6 7
972
  coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff);
973
693k
}
974
975
// Fetches the 12-tap kernel selected by subpel_q4 and expands it into six
// registers of repeated 16-bit coefficient pairs.
//
// filter_params, subpel_q4: forwarded to av1_get_interp_filter_subpel_kernel()
//   (subpel_q4 is masked with SUBPEL_MASK first) to obtain the kernel pointer.
// coeffs: output array. Elements [0]..[5] are all written, so the caller must
//   provide room for six __m256i values. coeffs[k] holds taps (2k, 2k+1)
//   repeated in every 32-bit element.
//
// Reads filter[0..11]: 16 bytes at filter plus 8 bytes at filter + 8.
static inline void prepare_coeffs_12taps(
976
    const InterpFilterParams *const filter_params, const int subpel_q4,
977
0
    __m256i *const coeffs /* [6] */) {
978
0
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
979
0
      filter_params, subpel_q4 & SUBPEL_MASK);
980
981
0
  // Unaligned load of taps 0..7, then copy them into both 128-bit lanes.
  __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter);
982
0
  __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
983
984
  // coeffs 0 1 0 1 0 1 0 1
985
0
  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
986
  // coeffs 2 3 2 3 2 3 2 3
987
0
  coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55);
988
  // coeffs 4 5 4 5 4 5 4 5
989
0
  coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa);
990
  // coeffs 6 7 6 7 6 7 6 7
991
0
  coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff);
992
  // coeffs 8 9 10 11 0 0 0 0
993
0
  coeff_8 = _mm_loadl_epi64((__m128i *)(filter + 8));
994
0
  // Replicate the 64-bit group (taps 8..11) into every 64-bit element.
  coeff = _mm256_broadcastq_epi64(coeff_8);
995
0
  coeffs[4] = _mm256_shuffle_epi32(coeff, 0x00);  // coeffs 8 9 8 9 8 9 8 9
996
0
  coeffs[5] = _mm256_shuffle_epi32(coeff, 0x55);  // coeffs 10 11 10 11.. 10 11
997
0
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:prepare_coeffs_12taps
Unexecuted instantiation: highbd_convolve_avx2.c:prepare_coeffs_12taps
Unexecuted instantiation: convolve_2d_avx2.c:prepare_coeffs_12taps
Unexecuted instantiation: convolve_avx2.c:prepare_coeffs_12taps
Unexecuted instantiation: jnt_convolve_avx2.c:prepare_coeffs_12taps
Unexecuted instantiation: wiener_convolve_avx2.c:prepare_coeffs_12taps
Unexecuted instantiation: highbd_convolve_2d_avx2.c:prepare_coeffs_12taps
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:prepare_coeffs_12taps
998
999
// 4-tap low-bitdepth multiply-accumulate on 128-bit registers.
//
// ss:     two registers of source bytes (the unsigned operand of
//         _mm_maddubs_epi16).
// coeffs: two registers of coefficient bytes (the signed operand).
// Returns the 16-bit lane-wise sum of the two pairwise multiply-add results.
// The final addition is a plain _mm_add_epi16, not a saturating add.
static inline __m128i convolve_lowbd_4tap_ssse3(const __m128i ss[2],
1000
3.63M
                                                const __m128i coeffs[2]) {
1001
3.63M
  const __m128i res_01 = _mm_maddubs_epi16(ss[0], coeffs[0]);
1002
3.63M
  const __m128i res_23 = _mm_maddubs_epi16(ss[1], coeffs[1]);
1003
1004
3.63M
  return _mm_add_epi16(res_01, res_23);
1005
3.63M
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_lowbd_4tap_ssse3
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_lowbd_4tap_ssse3
convolve_2d_avx2.c:convolve_lowbd_4tap_ssse3
Line
Count
Source
1000
2.81M
                                                const __m128i coeffs[2]) {
1001
2.81M
  const __m128i res_01 = _mm_maddubs_epi16(ss[0], coeffs[0]);
1002
2.81M
  const __m128i res_23 = _mm_maddubs_epi16(ss[1], coeffs[1]);
1003
1004
2.81M
  return _mm_add_epi16(res_01, res_23);
1005
2.81M
}
convolve_avx2.c:convolve_lowbd_4tap_ssse3
Line
Count
Source
1000
814k
                                                const __m128i coeffs[2]) {
1001
814k
  const __m128i res_01 = _mm_maddubs_epi16(ss[0], coeffs[0]);
1002
814k
  const __m128i res_23 = _mm_maddubs_epi16(ss[1], coeffs[1]);
1003
1004
814k
  return _mm_add_epi16(res_01, res_23);
1005
814k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_lowbd_4tap_ssse3
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_lowbd_4tap_ssse3
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_lowbd_4tap_ssse3
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_lowbd_4tap_ssse3
1006
1007
// 6-tap low-bitdepth multiply-accumulate on 128-bit registers.
//
// ss:     three registers of source bytes (the unsigned operand of
//         _mm_maddubs_epi16).
// coeffs: three registers of coefficient bytes (the signed operand).
// Returns the 16-bit lane-wise sum of the three pairwise multiply-add
// results, accumulated with non-saturating _mm_add_epi16.
static inline __m128i convolve_lowbd_6tap_ssse3(const __m128i ss[3],
1008
320k
                                                const __m128i coeffs[3]) {
1009
320k
  const __m128i res_01 = _mm_maddubs_epi16(ss[0], coeffs[0]);
1010
320k
  const __m128i res_23 = _mm_maddubs_epi16(ss[1], coeffs[1]);
1011
320k
  const __m128i res_45 = _mm_maddubs_epi16(ss[2], coeffs[2]);
1012
1013
320k
  // Summation order: (res_01 + res_45) first, then res_23.
  const __m128i res = _mm_add_epi16(_mm_add_epi16(res_01, res_45), res_23);
1014
1015
320k
  return res;
1016
320k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_lowbd_6tap_ssse3
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_lowbd_6tap_ssse3
Unexecuted instantiation: convolve_2d_avx2.c:convolve_lowbd_6tap_ssse3
convolve_avx2.c:convolve_lowbd_6tap_ssse3
Line
Count
Source
1008
320k
                                                const __m128i coeffs[3]) {
1009
320k
  const __m128i res_01 = _mm_maddubs_epi16(ss[0], coeffs[0]);
1010
320k
  const __m128i res_23 = _mm_maddubs_epi16(ss[1], coeffs[1]);
1011
320k
  const __m128i res_45 = _mm_maddubs_epi16(ss[2], coeffs[2]);
1012
1013
320k
  const __m128i res = _mm_add_epi16(_mm_add_epi16(res_01, res_45), res_23);
1014
1015
320k
  return res;
1016
320k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_lowbd_6tap_ssse3
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_lowbd_6tap_ssse3
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_lowbd_6tap_ssse3
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_lowbd_6tap_ssse3
1017
1018
// 8-tap low-bitdepth multiply-accumulate on 128-bit registers.
//
// ss:     four registers of source bytes (the unsigned operand of
//         _mm_maddubs_epi16).
// coeffs: four registers of coefficient bytes (the signed operand).
// Returns the 16-bit lane-wise sum of the four pairwise multiply-add
// results, accumulated with non-saturating _mm_add_epi16.
static inline __m128i convolve_lowbd_ssse3(const __m128i ss[4],
1019
26.0k
                                           const __m128i coeffs[4]) {
1020
26.0k
  const __m128i res_01 = _mm_maddubs_epi16(ss[0], coeffs[0]);
1021
26.0k
  const __m128i res_23 = _mm_maddubs_epi16(ss[1], coeffs[1]);
1022
26.0k
  const __m128i res_45 = _mm_maddubs_epi16(ss[2], coeffs[2]);
1023
26.0k
  const __m128i res_67 = _mm_maddubs_epi16(ss[3], coeffs[3]);
1024
1025
26.0k
  // Summation order: (res_01 + res_45) + (res_23 + res_67).
  const __m128i res = _mm_add_epi16(_mm_add_epi16(res_01, res_45),
1026
26.0k
                                    _mm_add_epi16(res_23, res_67));
1027
1028
26.0k
  return res;
1029
26.0k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_lowbd_ssse3
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_lowbd_ssse3
Unexecuted instantiation: convolve_2d_avx2.c:convolve_lowbd_ssse3
convolve_avx2.c:convolve_lowbd_ssse3
Line
Count
Source
1019
26.0k
                                           const __m128i coeffs[4]) {
1020
26.0k
  const __m128i res_01 = _mm_maddubs_epi16(ss[0], coeffs[0]);
1021
26.0k
  const __m128i res_23 = _mm_maddubs_epi16(ss[1], coeffs[1]);
1022
26.0k
  const __m128i res_45 = _mm_maddubs_epi16(ss[2], coeffs[2]);
1023
26.0k
  const __m128i res_67 = _mm_maddubs_epi16(ss[3], coeffs[3]);
1024
1025
26.0k
  const __m128i res = _mm_add_epi16(_mm_add_epi16(res_01, res_45),
1026
26.0k
                                    _mm_add_epi16(res_23, res_67));
1027
1028
26.0k
  return res;
1029
26.0k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_lowbd_ssse3
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_lowbd_ssse3
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_lowbd_ssse3
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_lowbd_ssse3
1030
1031
// 8-tap low-bitdepth multiply-accumulate on 256-bit registers.
//
// s:      reads s[0]..s[3], source bytes (the unsigned operand of
//         _mm256_maddubs_epi16).
// coeffs: reads coeffs[0]..coeffs[3], coefficient bytes (the signed operand).
// Returns the 16-bit lane-wise sum of the four pairwise multiply-add
// results, accumulated with non-saturating _mm256_add_epi16.
static inline __m256i convolve_lowbd(const __m256i *const s,
1032
19.0M
                                     const __m256i *const coeffs) {
1033
19.0M
  const __m256i res_01 = _mm256_maddubs_epi16(s[0], coeffs[0]);
1034
19.0M
  const __m256i res_23 = _mm256_maddubs_epi16(s[1], coeffs[1]);
1035
19.0M
  const __m256i res_45 = _mm256_maddubs_epi16(s[2], coeffs[2]);
1036
19.0M
  const __m256i res_67 = _mm256_maddubs_epi16(s[3], coeffs[3]);
1037
1038
  // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
1039
19.0M
  const __m256i res = _mm256_add_epi16(_mm256_add_epi16(res_01, res_45),
1040
19.0M
                                       _mm256_add_epi16(res_23, res_67));
1041
1042
19.0M
  return res;
1043
19.0M
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_lowbd
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_lowbd
convolve_2d_avx2.c:convolve_lowbd
Line
Count
Source
1032
2.28M
                                     const __m256i *const coeffs) {
1033
2.28M
  const __m256i res_01 = _mm256_maddubs_epi16(s[0], coeffs[0]);
1034
2.28M
  const __m256i res_23 = _mm256_maddubs_epi16(s[1], coeffs[1]);
1035
2.28M
  const __m256i res_45 = _mm256_maddubs_epi16(s[2], coeffs[2]);
1036
2.28M
  const __m256i res_67 = _mm256_maddubs_epi16(s[3], coeffs[3]);
1037
1038
  // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
1039
2.28M
  const __m256i res = _mm256_add_epi16(_mm256_add_epi16(res_01, res_45),
1040
2.28M
                                       _mm256_add_epi16(res_23, res_67));
1041
1042
2.28M
  return res;
1043
2.28M
}
convolve_avx2.c:convolve_lowbd
Line
Count
Source
1032
511k
                                     const __m256i *const coeffs) {
1033
511k
  const __m256i res_01 = _mm256_maddubs_epi16(s[0], coeffs[0]);
1034
511k
  const __m256i res_23 = _mm256_maddubs_epi16(s[1], coeffs[1]);
1035
511k
  const __m256i res_45 = _mm256_maddubs_epi16(s[2], coeffs[2]);
1036
511k
  const __m256i res_67 = _mm256_maddubs_epi16(s[3], coeffs[3]);
1037
1038
  // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
1039
511k
  const __m256i res = _mm256_add_epi16(_mm256_add_epi16(res_01, res_45),
1040
511k
                                       _mm256_add_epi16(res_23, res_67));
1041
1042
511k
  return res;
1043
511k
}
jnt_convolve_avx2.c:convolve_lowbd
Line
Count
Source
1032
5.43M
                                     const __m256i *const coeffs) {
1033
5.43M
  const __m256i res_01 = _mm256_maddubs_epi16(s[0], coeffs[0]);
1034
5.43M
  const __m256i res_23 = _mm256_maddubs_epi16(s[1], coeffs[1]);
1035
5.43M
  const __m256i res_45 = _mm256_maddubs_epi16(s[2], coeffs[2]);
1036
5.43M
  const __m256i res_67 = _mm256_maddubs_epi16(s[3], coeffs[3]);
1037
1038
  // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
1039
5.43M
  const __m256i res = _mm256_add_epi16(_mm256_add_epi16(res_01, res_45),
1040
5.43M
                                       _mm256_add_epi16(res_23, res_67));
1041
1042
5.43M
  return res;
1043
5.43M
}
wiener_convolve_avx2.c:convolve_lowbd
Line
Count
Source
1032
10.7M
                                     const __m256i *const coeffs) {
1033
10.7M
  const __m256i res_01 = _mm256_maddubs_epi16(s[0], coeffs[0]);
1034
10.7M
  const __m256i res_23 = _mm256_maddubs_epi16(s[1], coeffs[1]);
1035
10.7M
  const __m256i res_45 = _mm256_maddubs_epi16(s[2], coeffs[2]);
1036
10.7M
  const __m256i res_67 = _mm256_maddubs_epi16(s[3], coeffs[3]);
1037
1038
  // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
1039
10.7M
  const __m256i res = _mm256_add_epi16(_mm256_add_epi16(res_01, res_45),
1040
10.7M
                                       _mm256_add_epi16(res_23, res_67));
1041
1042
10.7M
  return res;
1043
10.7M
}
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_lowbd
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_lowbd
1044
1045
// 6-tap low-bitdepth multiply-accumulate on 256-bit registers.
//
// s:      reads s[0]..s[2], source bytes (the unsigned operand of
//         _mm256_maddubs_epi16).
// coeffs: reads coeffs[0]..coeffs[2], coefficient bytes (the signed operand).
// Returns the 16-bit lane-wise sum of the three pairwise multiply-add
// results, accumulated with non-saturating _mm256_add_epi16.
static inline __m256i convolve_lowbd_6tap(const __m256i *const s,
1046
16.1M
                                          const __m256i *const coeffs) {
1047
16.1M
  const __m256i res_01 = _mm256_maddubs_epi16(s[0], coeffs[0]);
1048
16.1M
  const __m256i res_23 = _mm256_maddubs_epi16(s[1], coeffs[1]);
1049
16.1M
  const __m256i res_45 = _mm256_maddubs_epi16(s[2], coeffs[2]);
1050
1051
  // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
1052
16.1M
  const __m256i res =
1053
16.1M
      _mm256_add_epi16(_mm256_add_epi16(res_01, res_45), res_23);
1054
1055
16.1M
  return res;
1056
16.1M
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_lowbd_6tap
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_lowbd_6tap
convolve_2d_avx2.c:convolve_lowbd_6tap
Line
Count
Source
1046
9.99M
                                          const __m256i *const coeffs) {
1047
9.99M
  const __m256i res_01 = _mm256_maddubs_epi16(s[0], coeffs[0]);
1048
9.99M
  const __m256i res_23 = _mm256_maddubs_epi16(s[1], coeffs[1]);
1049
9.99M
  const __m256i res_45 = _mm256_maddubs_epi16(s[2], coeffs[2]);
1050
1051
  // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
1052
9.99M
  const __m256i res =
1053
9.99M
      _mm256_add_epi16(_mm256_add_epi16(res_01, res_45), res_23);
1054
1055
9.99M
  return res;
1056
9.99M
}
convolve_avx2.c:convolve_lowbd_6tap
Line
Count
Source
1046
6.15M
                                          const __m256i *const coeffs) {
1047
6.15M
  const __m256i res_01 = _mm256_maddubs_epi16(s[0], coeffs[0]);
1048
6.15M
  const __m256i res_23 = _mm256_maddubs_epi16(s[1], coeffs[1]);
1049
6.15M
  const __m256i res_45 = _mm256_maddubs_epi16(s[2], coeffs[2]);
1050
1051
  // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
1052
6.15M
  const __m256i res =
1053
6.15M
      _mm256_add_epi16(_mm256_add_epi16(res_01, res_45), res_23);
1054
1055
6.15M
  return res;
1056
6.15M
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_lowbd_6tap
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_lowbd_6tap
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_lowbd_6tap
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_lowbd_6tap
1057
1058
// 4-tap low-bitdepth multiply-accumulate on 256-bit registers.
//
// s:      reads s[0] and s[1], source bytes (the unsigned operand of
//         _mm256_maddubs_epi16).
// coeffs: reads coeffs[0] and coeffs[1], coefficient bytes (the signed
//         operand).
// Returns the 16-bit lane-wise sum of the two pairwise multiply-add results.
//
// NOTE(review): the locals are named res_23/res_45 although they come from
// index 0 and 1; presumably callers pass the middle taps of a wider kernel.
// Confirm against the call sites.
static inline __m256i convolve_lowbd_4tap(const __m256i *const s,
1059
4.41M
                                          const __m256i *const coeffs) {
1060
4.41M
  const __m256i res_23 = _mm256_maddubs_epi16(s[0], coeffs[0]);
1061
4.41M
  const __m256i res_45 = _mm256_maddubs_epi16(s[1], coeffs[1]);
1062
1063
  // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
1064
4.41M
  const __m256i res = _mm256_add_epi16(res_45, res_23);
1065
1066
4.41M
  return res;
1067
4.41M
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_lowbd_4tap
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_lowbd_4tap
convolve_2d_avx2.c:convolve_lowbd_4tap
Line
Count
Source
1059
1.04M
                                          const __m256i *const coeffs) {
1060
1.04M
  const __m256i res_23 = _mm256_maddubs_epi16(s[0], coeffs[0]);
1061
1.04M
  const __m256i res_45 = _mm256_maddubs_epi16(s[1], coeffs[1]);
1062
1063
  // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
1064
1.04M
  const __m256i res = _mm256_add_epi16(res_45, res_23);
1065
1066
1.04M
  return res;
1067
1.04M
}
convolve_avx2.c:convolve_lowbd_4tap
Line
Count
Source
1059
1.50M
                                          const __m256i *const coeffs) {
1060
1.50M
  const __m256i res_23 = _mm256_maddubs_epi16(s[0], coeffs[0]);
1061
1.50M
  const __m256i res_45 = _mm256_maddubs_epi16(s[1], coeffs[1]);
1062
1063
  // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
1064
1.50M
  const __m256i res = _mm256_add_epi16(res_45, res_23);
1065
1066
1.50M
  return res;
1067
1.50M
}
jnt_convolve_avx2.c:convolve_lowbd_4tap
Line
Count
Source
1059
1.86M
                                          const __m256i *const coeffs) {
1060
1.86M
  const __m256i res_23 = _mm256_maddubs_epi16(s[0], coeffs[0]);
1061
1.86M
  const __m256i res_45 = _mm256_maddubs_epi16(s[1], coeffs[1]);
1062
1063
  // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
1064
1.86M
  const __m256i res = _mm256_add_epi16(res_45, res_23);
1065
1066
1.86M
  return res;
1067
1.86M
}
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_lowbd_4tap
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_lowbd_4tap
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_lowbd_4tap
1068
1069
// 6-tap multiply-accumulate on 16-bit inputs with 32-bit results.
//
// s:      reads s[0]..s[2]; signed 16-bit samples.
// coeffs: reads coeffs[0]..coeffs[2]; signed 16-bit coefficients.
// Each _mm256_madd_epi16 multiplies 16-bit elements and adds adjacent pairs
// into 32-bit elements; the three results are then summed with
// _mm256_add_epi32 and returned.
static inline __m256i convolve_6tap(const __m256i *const s,
1070
14.8M
                                    const __m256i *const coeffs) {
1071
14.8M
  const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]);
1072
14.8M
  const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]);
1073
14.8M
  const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]);
1074
1075
14.8M
  const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1), res_2);
1076
1077
14.8M
  return res;
1078
14.8M
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_6tap
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_6tap
convolve_2d_avx2.c:convolve_6tap
Line
Count
Source
1070
14.8M
                                    const __m256i *const coeffs) {
1071
14.8M
  const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]);
1072
14.8M
  const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]);
1073
14.8M
  const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]);
1074
1075
14.8M
  const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1), res_2);
1076
1077
14.8M
  return res;
1078
14.8M
}
Unexecuted instantiation: convolve_avx2.c:convolve_6tap
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_6tap
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_6tap
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_6tap
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_6tap
1079
1080
// 12-tap multiply-accumulate on 16-bit inputs with 32-bit results.
//
// s:      reads s[0]..s[5]; signed 16-bit samples.
// coeffs: reads coeffs[0]..coeffs[5]; signed 16-bit coefficients.
// Each _mm256_madd_epi16 multiplies 16-bit elements and adds adjacent pairs
// into 32-bit elements; the six results are then summed with
// _mm256_add_epi32 and returned.
static inline __m256i convolve_12taps(const __m256i *const s,
1081
0
                                      const __m256i *const coeffs) {
1082
0
  const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]);
1083
0
  const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]);
1084
0
  const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]);
1085
0
  const __m256i res_3 = _mm256_madd_epi16(s[3], coeffs[3]);
1086
0
  const __m256i res_4 = _mm256_madd_epi16(s[4], coeffs[4]);
1087
0
  const __m256i res_5 = _mm256_madd_epi16(s[5], coeffs[5]);
1088
1089
0
  // Partial sum of the first four products.
  const __m256i res1 = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1),
1090
0
                                        _mm256_add_epi32(res_2, res_3));
1091
0
  // Add the last two products to the partial sum.
  const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_4, res_5), res1);
1092
1093
0
  return res;
1094
0
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_12taps
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_12taps
Unexecuted instantiation: convolve_2d_avx2.c:convolve_12taps
Unexecuted instantiation: convolve_avx2.c:convolve_12taps
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_12taps
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_12taps
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_12taps
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_12taps
1095
1096
// 8-tap multiply-accumulate on 16-bit inputs with 32-bit results.
//
// s:      reads s[0]..s[3]; signed 16-bit samples.
// coeffs: reads coeffs[0]..coeffs[3]; signed 16-bit coefficients.
// Each _mm256_madd_epi16 multiplies 16-bit elements and adds adjacent pairs
// into 32-bit elements; the four results are then summed with
// _mm256_add_epi32 and returned.
static inline __m256i convolve(const __m256i *const s,
1097
207M
                               const __m256i *const coeffs) {
1098
207M
  const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]);
1099
207M
  const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]);
1100
207M
  const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]);
1101
207M
  const __m256i res_3 = _mm256_madd_epi16(s[3], coeffs[3]);
1102
1103
207M
  const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1),
1104
207M
                                       _mm256_add_epi32(res_2, res_3));
1105
1106
207M
  return res;
1107
207M
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve
highbd_convolve_avx2.c:convolve
Line
Count
Source
1097
30.1M
                               const __m256i *const coeffs) {
1098
30.1M
  const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]);
1099
30.1M
  const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]);
1100
30.1M
  const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]);
1101
30.1M
  const __m256i res_3 = _mm256_madd_epi16(s[3], coeffs[3]);
1102
1103
30.1M
  const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1),
1104
30.1M
                                       _mm256_add_epi32(res_2, res_3));
1105
1106
30.1M
  return res;
1107
30.1M
}
convolve_2d_avx2.c:convolve
Line
Count
Source
1097
3.38M
                               const __m256i *const coeffs) {
1098
3.38M
  const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]);
1099
3.38M
  const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]);
1100
3.38M
  const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]);
1101
3.38M
  const __m256i res_3 = _mm256_madd_epi16(s[3], coeffs[3]);
1102
1103
3.38M
  const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1),
1104
3.38M
                                       _mm256_add_epi32(res_2, res_3));
1105
1106
3.38M
  return res;
1107
3.38M
}
Unexecuted instantiation: convolve_avx2.c:convolve
jnt_convolve_avx2.c:convolve
Line
Count
Source
1097
4.59M
                               const __m256i *const coeffs) {
1098
4.59M
  const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]);
1099
4.59M
  const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]);
1100
4.59M
  const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]);
1101
4.59M
  const __m256i res_3 = _mm256_madd_epi16(s[3], coeffs[3]);
1102
1103
4.59M
  const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1),
1104
4.59M
                                       _mm256_add_epi32(res_2, res_3));
1105
1106
4.59M
  return res;
1107
4.59M
}
wiener_convolve_avx2.c:convolve
Line
Count
Source
1097
19.2M
                               const __m256i *const coeffs) {
1098
19.2M
  const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]);
1099
19.2M
  const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]);
1100
19.2M
  const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]);
1101
19.2M
  const __m256i res_3 = _mm256_madd_epi16(s[3], coeffs[3]);
1102
1103
19.2M
  const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1),
1104
19.2M
                                       _mm256_add_epi32(res_2, res_3));
1105
1106
19.2M
  return res;
1107
19.2M
}
highbd_convolve_2d_avx2.c:convolve
Line
Count
Source
1097
111M
                               const __m256i *const coeffs) {
1098
111M
  const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]);
1099
111M
  const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]);
1100
111M
  const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]);
1101
111M
  const __m256i res_3 = _mm256_madd_epi16(s[3], coeffs[3]);
1102
1103
111M
  const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1),
1104
111M
                                       _mm256_add_epi32(res_2, res_3));
1105
1106
111M
  return res;
1107
111M
}
highbd_jnt_convolve_avx2.c:convolve
Line
Count
Source
1097
39.1M
                               const __m256i *const coeffs) {
1098
39.1M
  const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]);
1099
39.1M
  const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]);
1100
39.1M
  const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]);
1101
39.1M
  const __m256i res_3 = _mm256_madd_epi16(s[3], coeffs[3]);
1102
1103
39.1M
  const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1),
1104
39.1M
                                       _mm256_add_epi32(res_2, res_3));
1105
1106
39.1M
  return res;
1107
39.1M
}
1108
1109
// 4-tap multiply-accumulate on 16-bit inputs with 32-bit results.
//
// s:      reads s[0] and s[1]; signed 16-bit samples.
// coeffs: reads coeffs[0] and coeffs[1]; signed 16-bit coefficients.
// Each _mm256_madd_epi16 multiplies 16-bit elements and adds adjacent pairs
// into 32-bit elements; the two results are summed with _mm256_add_epi32
// and returned.
static inline __m256i convolve_4tap(const __m256i *const s,
1110
3.55M
                                    const __m256i *const coeffs) {
1111
3.55M
  const __m256i res_1 = _mm256_madd_epi16(s[0], coeffs[0]);
1112
3.55M
  const __m256i res_2 = _mm256_madd_epi16(s[1], coeffs[1]);
1113
1114
3.55M
  const __m256i res = _mm256_add_epi32(res_1, res_2);
1115
3.55M
  return res;
1116
3.55M
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_4tap
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_4tap
convolve_2d_avx2.c:convolve_4tap
Line
Count
Source
1110
3.31M
                                    const __m256i *const coeffs) {
1111
3.31M
  const __m256i res_1 = _mm256_madd_epi16(s[0], coeffs[0]);
1112
3.31M
  const __m256i res_2 = _mm256_madd_epi16(s[1], coeffs[1]);
1113
1114
3.31M
  const __m256i res = _mm256_add_epi32(res_1, res_2);
1115
3.31M
  return res;
1116
3.31M
}
Unexecuted instantiation: convolve_avx2.c:convolve_4tap
jnt_convolve_avx2.c:convolve_4tap
Line
Count
Source
1110
244k
                                    const __m256i *const coeffs) {
1111
244k
  const __m256i res_1 = _mm256_madd_epi16(s[0], coeffs[0]);
1112
244k
  const __m256i res_2 = _mm256_madd_epi16(s[1], coeffs[1]);
1113
1114
244k
  const __m256i res = _mm256_add_epi32(res_1, res_2);
1115
244k
  return res;
1116
244k
}
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_4tap
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_4tap
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_4tap
1117
1118
// 2-tap low-bitdepth filter step on one 128-bit register.
//
// data:   source bytes.
// coeffs: only coeffs[0] is read (the signed operand of _mm_maddubs_epi16).
// filt:   only filt[0] is read; byte-shuffle control applied to data.
// Returns the 16-bit pairwise multiply-add of the shuffled bytes with
// coeffs[0].
static inline __m128i convolve_lowbd_x_2tap_ssse3(const __m128i data,
1119
                                                  const __m128i *const coeffs,
1120
87.1k
                                                  const __m128i *const filt) {
1121
87.1k
  __m128i s;
1122
87.1k
  // Rearrange the source bytes as directed by the shuffle mask.
  s = _mm_shuffle_epi8(data, filt[0]);
1123
1124
87.1k
  return _mm_maddubs_epi16(s, coeffs[0]);
1125
87.1k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_lowbd_x_2tap_ssse3
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_lowbd_x_2tap_ssse3
convolve_2d_avx2.c:convolve_lowbd_x_2tap_ssse3
Line
Count
Source
1120
87.1k
                                                  const __m128i *const filt) {
1121
87.1k
  __m128i s;
1122
87.1k
  s = _mm_shuffle_epi8(data, filt[0]);
1123
1124
87.1k
  return _mm_maddubs_epi16(s, coeffs[0]);
1125
87.1k
}
Unexecuted instantiation: convolve_avx2.c:convolve_lowbd_x_2tap_ssse3
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_lowbd_x_2tap_ssse3
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_lowbd_x_2tap_ssse3
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_lowbd_x_2tap_ssse3
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_lowbd_x_2tap_ssse3
1126
1127
// 4-tap low-bitdepth filter step on one 128-bit register.
//
// data:   source bytes.
// coeffs: passed through to convolve_lowbd_4tap_ssse3(), which reads
//         coeffs[0] and coeffs[1].
// filt:   filt[0] and filt[1] are read; byte-shuffle controls applied to
//         data to build the two multiply operands.
// Returns the result of convolve_lowbd_4tap_ssse3() on the shuffled data.
static inline __m128i convolve_lowbd_x_4tap_ssse3(const __m128i data,
1128
                                                  const __m128i *const coeffs,
1129
2.81M
                                                  const __m128i *const filt) {
1130
2.81M
  __m128i s[2];
1131
1132
2.81M
  s[0] = _mm_shuffle_epi8(data, filt[0]);
1133
2.81M
  s[1] = _mm_shuffle_epi8(data, filt[1]);
1134
1135
2.81M
  return convolve_lowbd_4tap_ssse3(s, coeffs);
1136
2.81M
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_lowbd_x_4tap_ssse3
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_lowbd_x_4tap_ssse3
convolve_2d_avx2.c:convolve_lowbd_x_4tap_ssse3
Line
Count
Source
1129
2.81M
                                                  const __m128i *const filt) {
1130
2.81M
  __m128i s[2];
1131
1132
2.81M
  s[0] = _mm_shuffle_epi8(data, filt[0]);
1133
2.81M
  s[1] = _mm_shuffle_epi8(data, filt[1]);
1134
1135
2.81M
  return convolve_lowbd_4tap_ssse3(s, coeffs);
1136
2.81M
}
Unexecuted instantiation: convolve_avx2.c:convolve_lowbd_x_4tap_ssse3
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_lowbd_x_4tap_ssse3
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_lowbd_x_4tap_ssse3
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_lowbd_x_4tap_ssse3
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_lowbd_x_4tap_ssse3
1137
1138
// 8-tap low-bitdepth filter step on one 256-bit register.
//
// data:   source bytes.
// coeffs: passed through to convolve_lowbd(), which reads coeffs[0..3].
// filt:   filt[0]..filt[3] are read; byte-shuffle controls applied to data
//         to build the four multiply operands.
// Returns the result of convolve_lowbd() on the shuffled data.
static inline __m256i convolve_lowbd_x(const __m256i data,
1139
                                       const __m256i *const coeffs,
1140
18.0M
                                       const __m256i *const filt) {
1141
18.0M
  __m256i s[4];
1142
1143
18.0M
  s[0] = _mm256_shuffle_epi8(data, filt[0]);
1144
18.0M
  s[1] = _mm256_shuffle_epi8(data, filt[1]);
1145
18.0M
  s[2] = _mm256_shuffle_epi8(data, filt[2]);
1146
18.0M
  s[3] = _mm256_shuffle_epi8(data, filt[3]);
1147
1148
18.0M
  return convolve_lowbd(s, coeffs);
1149
18.0M
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_lowbd_x
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_lowbd_x
convolve_2d_avx2.c:convolve_lowbd_x
Line
Count
Source
1140
2.28M
                                       const __m256i *const filt) {
1141
2.28M
  __m256i s[4];
1142
1143
2.28M
  s[0] = _mm256_shuffle_epi8(data, filt[0]);
1144
2.28M
  s[1] = _mm256_shuffle_epi8(data, filt[1]);
1145
2.28M
  s[2] = _mm256_shuffle_epi8(data, filt[2]);
1146
2.28M
  s[3] = _mm256_shuffle_epi8(data, filt[3]);
1147
1148
2.28M
  return convolve_lowbd(s, coeffs);
1149
2.28M
}
convolve_avx2.c:convolve_lowbd_x
Line
Count
Source
1140
331k
                                       const __m256i *const filt) {
1141
331k
  __m256i s[4];
1142
1143
331k
  s[0] = _mm256_shuffle_epi8(data, filt[0]);
1144
331k
  s[1] = _mm256_shuffle_epi8(data, filt[1]);
1145
331k
  s[2] = _mm256_shuffle_epi8(data, filt[2]);
1146
331k
  s[3] = _mm256_shuffle_epi8(data, filt[3]);
1147
1148
331k
  return convolve_lowbd(s, coeffs);
1149
331k
}
jnt_convolve_avx2.c:convolve_lowbd_x
Line
Count
Source
1140
4.63M
                                       const __m256i *const filt) {
1141
4.63M
  __m256i s[4];
1142
1143
4.63M
  s[0] = _mm256_shuffle_epi8(data, filt[0]);
1144
4.63M
  s[1] = _mm256_shuffle_epi8(data, filt[1]);
1145
4.63M
  s[2] = _mm256_shuffle_epi8(data, filt[2]);
1146
4.63M
  s[3] = _mm256_shuffle_epi8(data, filt[3]);
1147
1148
4.63M
  return convolve_lowbd(s, coeffs);
1149
4.63M
}
wiener_convolve_avx2.c:convolve_lowbd_x
Line
Count
Source
1140
10.7M
                                       const __m256i *const filt) {
1141
10.7M
  __m256i s[4];
1142
1143
10.7M
  s[0] = _mm256_shuffle_epi8(data, filt[0]);
1144
10.7M
  s[1] = _mm256_shuffle_epi8(data, filt[1]);
1145
10.7M
  s[2] = _mm256_shuffle_epi8(data, filt[2]);
1146
10.7M
  s[3] = _mm256_shuffle_epi8(data, filt[3]);
1147
1148
10.7M
  return convolve_lowbd(s, coeffs);
1149
10.7M
}
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_lowbd_x
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_lowbd_x
1150
1151
// 6-tap low-bitdepth filter step on one 256-bit register.
//
// data:   source bytes.
// coeffs: passed through to convolve_lowbd_6tap(), which reads coeffs[0..2].
// filt:   filt[0]..filt[2] are read; byte-shuffle controls applied to data
//         to build the three multiply operands.
// Returns the result of convolve_lowbd_6tap() on the shuffled data.
static inline __m256i convolve_lowbd_x_6tap(const __m256i data,
1152
                                            const __m256i *const coeffs,
1153
13.2M
                                            const __m256i *const filt) {
1154
13.2M
  // NOTE(review): declared with four elements but only s[0..2] are written
  // here and read by convolve_lowbd_6tap(); s[3] is never initialized.
  __m256i s[4];
1155
1156
13.2M
  s[0] = _mm256_shuffle_epi8(data, filt[0]);
1157
13.2M
  s[1] = _mm256_shuffle_epi8(data, filt[1]);
1158
13.2M
  s[2] = _mm256_shuffle_epi8(data, filt[2]);
1159
1160
13.2M
  return convolve_lowbd_6tap(s, coeffs);
1161
13.2M
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_lowbd_x_6tap
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_lowbd_x_6tap
convolve_2d_avx2.c:convolve_lowbd_x_6tap
Line
Count
Source
1153
9.99M
                                            const __m256i *const filt) {
1154
9.99M
  __m256i s[4];
1155
1156
9.99M
  s[0] = _mm256_shuffle_epi8(data, filt[0]);
1157
9.99M
  s[1] = _mm256_shuffle_epi8(data, filt[1]);
1158
9.99M
  s[2] = _mm256_shuffle_epi8(data, filt[2]);
1159
1160
9.99M
  return convolve_lowbd_6tap(s, coeffs);
1161
9.99M
}
convolve_avx2.c:convolve_lowbd_x_6tap
Line
Count
Source
1153
3.29M
                                            const __m256i *const filt) {
1154
3.29M
  __m256i s[4];
1155
1156
3.29M
  s[0] = _mm256_shuffle_epi8(data, filt[0]);
1157
3.29M
  s[1] = _mm256_shuffle_epi8(data, filt[1]);
1158
3.29M
  s[2] = _mm256_shuffle_epi8(data, filt[2]);
1159
1160
3.29M
  return convolve_lowbd_6tap(s, coeffs);
1161
3.29M
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_lowbd_x_6tap
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_lowbd_x_6tap
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_lowbd_x_6tap
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_lowbd_x_6tap
1162
1163
// 4-tap low-bitdepth filter step on one 256-bit register.
//
// data:   source bytes.
// coeffs: passed through to convolve_lowbd_4tap(), which reads coeffs[0]
//         and coeffs[1].
// filt:   filt[0] and filt[1] are read; byte-shuffle controls applied to
//         data to build the two multiply operands.
// Returns the result of convolve_lowbd_4tap() on the shuffled data.
static inline __m256i convolve_lowbd_x_4tap(const __m256i data,
1164
                                            const __m256i *const coeffs,
1165
3.21M
                                            const __m256i *const filt) {
1166
3.21M
  __m256i s[2];
1167
1168
3.21M
  s[0] = _mm256_shuffle_epi8(data, filt[0]);
1169
3.21M
  s[1] = _mm256_shuffle_epi8(data, filt[1]);
1170
1171
3.21M
  return convolve_lowbd_4tap(s, coeffs);
1172
3.21M
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_lowbd_x_4tap
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_lowbd_x_4tap
convolve_2d_avx2.c:convolve_lowbd_x_4tap
Line
Count
Source
1165
1.04M
                                            const __m256i *const filt) {
1166
1.04M
  __m256i s[2];
1167
1168
1.04M
  s[0] = _mm256_shuffle_epi8(data, filt[0]);
1169
1.04M
  s[1] = _mm256_shuffle_epi8(data, filt[1]);
1170
1171
1.04M
  return convolve_lowbd_4tap(s, coeffs);
1172
1.04M
}
convolve_avx2.c:convolve_lowbd_x_4tap
Line
Count
Source
1165
589k
                                            const __m256i *const filt) {
1166
589k
  __m256i s[2];
1167
1168
589k
  s[0] = _mm256_shuffle_epi8(data, filt[0]);
1169
589k
  s[1] = _mm256_shuffle_epi8(data, filt[1]);
1170
1171
589k
  return convolve_lowbd_4tap(s, coeffs);
1172
589k
}
jnt_convolve_avx2.c:convolve_lowbd_x_4tap
Line
Count
Source
1165
1.58M
                                            const __m256i *const filt) {
1166
1.58M
  __m256i s[2];
1167
1168
1.58M
  s[0] = _mm256_shuffle_epi8(data, filt[0]);
1169
1.58M
  s[1] = _mm256_shuffle_epi8(data, filt[1]);
1170
1171
1.58M
  return convolve_lowbd_4tap(s, coeffs);
1172
1.58M
}
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_lowbd_x_4tap
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_lowbd_x_4tap
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_lowbd_x_4tap
1173
1174
static inline __m256i convolve_lowbd_x_2tap(const __m256i data,
1175
                                            const __m256i *const coeffs,
1176
528k
                                            const __m256i *const filt) {
1177
528k
  __m256i s;
1178
528k
  s = _mm256_shuffle_epi8(data, filt[0]);
1179
1180
528k
  return _mm256_maddubs_epi16(s, coeffs[0]);
1181
528k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_lowbd_x_2tap
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_lowbd_x_2tap
convolve_2d_avx2.c:convolve_lowbd_x_2tap
Line
Count
Source
1176
528k
                                            const __m256i *const filt) {
1177
528k
  __m256i s;
1178
528k
  s = _mm256_shuffle_epi8(data, filt[0]);
1179
1180
528k
  return _mm256_maddubs_epi16(s, coeffs[0]);
1181
528k
}
Unexecuted instantiation: convolve_avx2.c:convolve_lowbd_x_2tap
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_lowbd_x_2tap
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_lowbd_x_2tap
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_lowbd_x_2tap
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_lowbd_x_2tap
1182
1183
static inline void add_store_aligned_256(CONV_BUF_TYPE *const dst,
1184
                                         const __m256i *const res,
1185
0
                                         const int do_average) {
1186
0
  __m256i d;
1187
0
  if (do_average) {
1188
0
    d = _mm256_load_si256((__m256i *)dst);
1189
0
    d = _mm256_add_epi32(d, *res);
1190
0
    d = _mm256_srai_epi32(d, 1);
1191
0
  } else {
1192
0
    d = *res;
1193
0
  }
1194
0
  _mm256_store_si256((__m256i *)dst, d);
1195
0
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:add_store_aligned_256
Unexecuted instantiation: highbd_convolve_avx2.c:add_store_aligned_256
Unexecuted instantiation: convolve_2d_avx2.c:add_store_aligned_256
Unexecuted instantiation: convolve_avx2.c:add_store_aligned_256
Unexecuted instantiation: jnt_convolve_avx2.c:add_store_aligned_256
Unexecuted instantiation: wiener_convolve_avx2.c:add_store_aligned_256
Unexecuted instantiation: highbd_convolve_2d_avx2.c:add_store_aligned_256
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:add_store_aligned_256
1196
1197
static inline __m256i comp_avg(const __m256i *const data_ref_0,
1198
                               const __m256i *const res_unsigned,
1199
                               const __m256i *const wt,
1200
124M
                               const int use_dist_wtd_comp_avg) {
1201
124M
  __m256i res;
1202
124M
  if (use_dist_wtd_comp_avg) {
1203
2.06M
    const __m256i data_lo = _mm256_unpacklo_epi16(*data_ref_0, *res_unsigned);
1204
2.06M
    const __m256i data_hi = _mm256_unpackhi_epi16(*data_ref_0, *res_unsigned);
1205
1206
2.06M
    const __m256i wt_res_lo = _mm256_madd_epi16(data_lo, *wt);
1207
2.06M
    const __m256i wt_res_hi = _mm256_madd_epi16(data_hi, *wt);
1208
1209
2.06M
    const __m256i res_lo = _mm256_srai_epi32(wt_res_lo, DIST_PRECISION_BITS);
1210
2.06M
    const __m256i res_hi = _mm256_srai_epi32(wt_res_hi, DIST_PRECISION_BITS);
1211
1212
2.06M
    res = _mm256_packs_epi32(res_lo, res_hi);
1213
122M
  } else {
1214
122M
    const __m256i wt_res = _mm256_add_epi16(*data_ref_0, *res_unsigned);
1215
122M
    res = _mm256_srai_epi16(wt_res, 1);
1216
122M
  }
1217
124M
  return res;
1218
124M
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:comp_avg
Unexecuted instantiation: highbd_convolve_avx2.c:comp_avg
Unexecuted instantiation: convolve_2d_avx2.c:comp_avg
Unexecuted instantiation: convolve_avx2.c:comp_avg
jnt_convolve_avx2.c:comp_avg
Line
Count
Source
1200
124M
                               const int use_dist_wtd_comp_avg) {
1201
124M
  __m256i res;
1202
124M
  if (use_dist_wtd_comp_avg) {
1203
2.06M
    const __m256i data_lo = _mm256_unpacklo_epi16(*data_ref_0, *res_unsigned);
1204
2.06M
    const __m256i data_hi = _mm256_unpackhi_epi16(*data_ref_0, *res_unsigned);
1205
1206
2.06M
    const __m256i wt_res_lo = _mm256_madd_epi16(data_lo, *wt);
1207
2.06M
    const __m256i wt_res_hi = _mm256_madd_epi16(data_hi, *wt);
1208
1209
2.06M
    const __m256i res_lo = _mm256_srai_epi32(wt_res_lo, DIST_PRECISION_BITS);
1210
2.06M
    const __m256i res_hi = _mm256_srai_epi32(wt_res_hi, DIST_PRECISION_BITS);
1211
1212
2.06M
    res = _mm256_packs_epi32(res_lo, res_hi);
1213
122M
  } else {
1214
122M
    const __m256i wt_res = _mm256_add_epi16(*data_ref_0, *res_unsigned);
1215
122M
    res = _mm256_srai_epi16(wt_res, 1);
1216
122M
  }
1217
124M
  return res;
1218
124M
}
Unexecuted instantiation: wiener_convolve_avx2.c:comp_avg
Unexecuted instantiation: highbd_convolve_2d_avx2.c:comp_avg
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:comp_avg
1219
1220
static inline __m256i convolve_rounding(const __m256i *const res_unsigned,
1221
                                        const __m256i *const offset_const,
1222
                                        const __m256i *const round_const,
1223
124M
                                        const int round_shift) {
1224
124M
  const __m256i res_signed = _mm256_sub_epi16(*res_unsigned, *offset_const);
1225
124M
  const __m256i res_round = _mm256_srai_epi16(
1226
124M
      _mm256_add_epi16(res_signed, *round_const), round_shift);
1227
124M
  return res_round;
1228
124M
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_rounding
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_rounding
Unexecuted instantiation: convolve_2d_avx2.c:convolve_rounding
Unexecuted instantiation: convolve_avx2.c:convolve_rounding
jnt_convolve_avx2.c:convolve_rounding
Line
Count
Source
1223
124M
                                        const int round_shift) {
1224
124M
  const __m256i res_signed = _mm256_sub_epi16(*res_unsigned, *offset_const);
1225
124M
  const __m256i res_round = _mm256_srai_epi16(
1226
124M
      _mm256_add_epi16(res_signed, *round_const), round_shift);
1227
124M
  return res_round;
1228
124M
}
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_rounding
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_rounding
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_rounding
1229
1230
static inline __m256i highbd_comp_avg(const __m256i *const data_ref_0,
1231
                                      const __m256i *const res_unsigned,
1232
                                      const __m256i *const wt0,
1233
                                      const __m256i *const wt1,
1234
12.5M
                                      const int use_dist_wtd_comp_avg) {
1235
12.5M
  __m256i res;
1236
12.5M
  if (use_dist_wtd_comp_avg) {
1237
1.29M
    const __m256i wt0_res = _mm256_mullo_epi32(*data_ref_0, *wt0);
1238
1.29M
    const __m256i wt1_res = _mm256_mullo_epi32(*res_unsigned, *wt1);
1239
1.29M
    const __m256i wt_res = _mm256_add_epi32(wt0_res, wt1_res);
1240
1.29M
    res = _mm256_srai_epi32(wt_res, DIST_PRECISION_BITS);
1241
11.2M
  } else {
1242
11.2M
    const __m256i wt_res = _mm256_add_epi32(*data_ref_0, *res_unsigned);
1243
11.2M
    res = _mm256_srai_epi32(wt_res, 1);
1244
11.2M
  }
1245
12.5M
  return res;
1246
12.5M
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:highbd_comp_avg
Unexecuted instantiation: highbd_convolve_avx2.c:highbd_comp_avg
Unexecuted instantiation: convolve_2d_avx2.c:highbd_comp_avg
Unexecuted instantiation: convolve_avx2.c:highbd_comp_avg
Unexecuted instantiation: jnt_convolve_avx2.c:highbd_comp_avg
Unexecuted instantiation: wiener_convolve_avx2.c:highbd_comp_avg
Unexecuted instantiation: highbd_convolve_2d_avx2.c:highbd_comp_avg
highbd_jnt_convolve_avx2.c:highbd_comp_avg
Line
Count
Source
1234
12.5M
                                      const int use_dist_wtd_comp_avg) {
1235
12.5M
  __m256i res;
1236
12.5M
  if (use_dist_wtd_comp_avg) {
1237
1.29M
    const __m256i wt0_res = _mm256_mullo_epi32(*data_ref_0, *wt0);
1238
1.29M
    const __m256i wt1_res = _mm256_mullo_epi32(*res_unsigned, *wt1);
1239
1.29M
    const __m256i wt_res = _mm256_add_epi32(wt0_res, wt1_res);
1240
1.29M
    res = _mm256_srai_epi32(wt_res, DIST_PRECISION_BITS);
1241
11.2M
  } else {
1242
11.2M
    const __m256i wt_res = _mm256_add_epi32(*data_ref_0, *res_unsigned);
1243
11.2M
    res = _mm256_srai_epi32(wt_res, 1);
1244
11.2M
  }
1245
12.5M
  return res;
1246
12.5M
}
1247
1248
static inline __m256i highbd_convolve_rounding(
1249
    const __m256i *const res_unsigned, const __m256i *const offset_const,
1250
12.5M
    const __m256i *const round_const, const int round_shift) {
1251
12.5M
  const __m256i res_signed = _mm256_sub_epi32(*res_unsigned, *offset_const);
1252
12.5M
  const __m256i res_round = _mm256_srai_epi32(
1253
12.5M
      _mm256_add_epi32(res_signed, *round_const), round_shift);
1254
1255
12.5M
  return res_round;
1256
12.5M
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:highbd_convolve_rounding
Unexecuted instantiation: highbd_convolve_avx2.c:highbd_convolve_rounding
Unexecuted instantiation: convolve_2d_avx2.c:highbd_convolve_rounding
Unexecuted instantiation: convolve_avx2.c:highbd_convolve_rounding
Unexecuted instantiation: jnt_convolve_avx2.c:highbd_convolve_rounding
Unexecuted instantiation: wiener_convolve_avx2.c:highbd_convolve_rounding
Unexecuted instantiation: highbd_convolve_2d_avx2.c:highbd_convolve_rounding
highbd_jnt_convolve_avx2.c:highbd_convolve_rounding
Line
Count
Source
1250
12.5M
    const __m256i *const round_const, const int round_shift) {
1251
12.5M
  const __m256i res_signed = _mm256_sub_epi32(*res_unsigned, *offset_const);
1252
12.5M
  const __m256i res_round = _mm256_srai_epi32(
1253
12.5M
      _mm256_add_epi32(res_signed, *round_const), round_shift);
1254
1255
12.5M
  return res_round;
1256
12.5M
}
1257
1258
4.44M
static inline __m256i round_sr_x_avx2(const __m256i data) {
1259
  // we can perform the below steps:
1260
  // data = (data + 2) >> 2
1261
  // data = (data + 8) >> 4,
1262
  // in the below form as well
1263
  // data = (data + 0x22) >> 6
1264
4.44M
  const __m256i value = _mm256_set1_epi16(34);
1265
4.44M
  const __m256i reg = _mm256_add_epi16(data, value);
1266
4.44M
  return _mm256_srai_epi16(reg, 6);
1267
4.44M
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:round_sr_x_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:round_sr_x_avx2
Unexecuted instantiation: convolve_2d_avx2.c:round_sr_x_avx2
convolve_avx2.c:round_sr_x_avx2
Line
Count
Source
1258
4.44M
static inline __m256i round_sr_x_avx2(const __m256i data) {
1259
  // we can perform the below steps:
1260
  // data = (data + 2) >> 2
1261
  // data = (data + 8) >> 4,
1262
  // in the below form as well
1263
  // data = (data + 0x22) >> 6
1264
4.44M
  const __m256i value = _mm256_set1_epi16(34);
1265
4.44M
  const __m256i reg = _mm256_add_epi16(data, value);
1266
4.44M
  return _mm256_srai_epi16(reg, 6);
1267
4.44M
}
Unexecuted instantiation: jnt_convolve_avx2.c:round_sr_x_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:round_sr_x_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:round_sr_x_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:round_sr_x_avx2
1268
1269
static inline __m128i convolve_x_4tap_4x2_ssse3(const uint8_t *const src,
1270
                                                const ptrdiff_t src_stride,
1271
507k
                                                __m128i *const coeffs) {
1272
507k
  __m128i data[2];
1273
507k
  const __m128i f_l0 = _mm_load_si128((__m128i const *)filt1_global_sse2);
1274
507k
  const __m128i f_l1 = _mm_load_si128((__m128i const *)filt2_global_sse2);
1275
507k
  const __m128i src_1 =
1276
507k
      load_8bit_8x2_to_1_reg_sse2(src, (int)(sizeof(*src) * src_stride));
1277
1278
507k
  data[0] = _mm_shuffle_epi8(src_1, f_l0);
1279
507k
  data[1] = _mm_shuffle_epi8(src_1, f_l1);
1280
507k
  return convolve_lowbd_4tap_ssse3(data, coeffs);
1281
507k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_x_4tap_4x2_ssse3
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_x_4tap_4x2_ssse3
Unexecuted instantiation: convolve_2d_avx2.c:convolve_x_4tap_4x2_ssse3
convolve_avx2.c:convolve_x_4tap_4x2_ssse3
Line
Count
Source
1271
507k
                                                __m128i *const coeffs) {
1272
507k
  __m128i data[2];
1273
507k
  const __m128i f_l0 = _mm_load_si128((__m128i const *)filt1_global_sse2);
1274
507k
  const __m128i f_l1 = _mm_load_si128((__m128i const *)filt2_global_sse2);
1275
507k
  const __m128i src_1 =
1276
507k
      load_8bit_8x2_to_1_reg_sse2(src, (int)(sizeof(*src) * src_stride));
1277
1278
507k
  data[0] = _mm_shuffle_epi8(src_1, f_l0);
1279
507k
  data[1] = _mm_shuffle_epi8(src_1, f_l1);
1280
507k
  return convolve_lowbd_4tap_ssse3(data, coeffs);
1281
507k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_x_4tap_4x2_ssse3
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_x_4tap_4x2_ssse3
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_x_4tap_4x2_ssse3
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_x_4tap_4x2_ssse3
1282
1283
630k
static inline __m128i round_sr_x_ssse3(const __m128i data) {
1284
630k
  const __m128i val = _mm_set1_epi16(34);
1285
630k
  const __m128i reg = _mm_add_epi16(data, val);
1286
630k
  return _mm_srai_epi16(reg, 6);
1287
630k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:round_sr_x_ssse3
Unexecuted instantiation: highbd_convolve_avx2.c:round_sr_x_ssse3
Unexecuted instantiation: convolve_2d_avx2.c:round_sr_x_ssse3
convolve_avx2.c:round_sr_x_ssse3
Line
Count
Source
1283
630k
static inline __m128i round_sr_x_ssse3(const __m128i data) {
1284
630k
  const __m128i val = _mm_set1_epi16(34);
1285
630k
  const __m128i reg = _mm_add_epi16(data, val);
1286
630k
  return _mm_srai_epi16(reg, 6);
1287
630k
}
Unexecuted instantiation: jnt_convolve_avx2.c:round_sr_x_ssse3
Unexecuted instantiation: wiener_convolve_avx2.c:round_sr_x_ssse3
Unexecuted instantiation: highbd_convolve_2d_avx2.c:round_sr_x_ssse3
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:round_sr_x_ssse3
1288
1289
static inline void store_8bit_4x2_sse2(const __m128i reg, uint8_t *const dst,
1290
1.01M
                                       const ptrdiff_t dst_stride) {
1291
1.01M
  xx_storel_32(dst, reg);
1292
1.01M
  *(uint32_t *)(dst + dst_stride) =
1293
1.01M
      ((uint32_t)_mm_extract_epi16(reg, 3) << 16) | _mm_extract_epi16(reg, 2);
1294
1.01M
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:store_8bit_4x2_sse2
Unexecuted instantiation: highbd_convolve_avx2.c:store_8bit_4x2_sse2
Unexecuted instantiation: convolve_2d_avx2.c:store_8bit_4x2_sse2
convolve_avx2.c:store_8bit_4x2_sse2
Line
Count
Source
1290
1.01M
                                       const ptrdiff_t dst_stride) {
1291
1.01M
  xx_storel_32(dst, reg);
1292
1.01M
  *(uint32_t *)(dst + dst_stride) =
1293
1.01M
      ((uint32_t)_mm_extract_epi16(reg, 3) << 16) | _mm_extract_epi16(reg, 2);
1294
1.01M
}
Unexecuted instantiation: jnt_convolve_avx2.c:store_8bit_4x2_sse2
Unexecuted instantiation: wiener_convolve_avx2.c:store_8bit_4x2_sse2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:store_8bit_4x2_sse2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:store_8bit_4x2_sse2
1295
1296
static inline void pack_store_u8_4x2_sse2(const __m128i reg, uint8_t *const dst,
1297
1.01M
                                          const ptrdiff_t dst_stride) {
1298
1.01M
  const __m128i reg_pack = _mm_packus_epi16(reg, reg);
1299
1.01M
  store_8bit_4x2_sse2(reg_pack, dst, dst_stride);
1300
1.01M
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:pack_store_u8_4x2_sse2
Unexecuted instantiation: highbd_convolve_avx2.c:pack_store_u8_4x2_sse2
Unexecuted instantiation: convolve_2d_avx2.c:pack_store_u8_4x2_sse2
convolve_avx2.c:pack_store_u8_4x2_sse2
Line
Count
Source
1297
1.01M
                                          const ptrdiff_t dst_stride) {
1298
1.01M
  const __m128i reg_pack = _mm_packus_epi16(reg, reg);
1299
1.01M
  store_8bit_4x2_sse2(reg_pack, dst, dst_stride);
1300
1.01M
}
Unexecuted instantiation: jnt_convolve_avx2.c:pack_store_u8_4x2_sse2
Unexecuted instantiation: wiener_convolve_avx2.c:pack_store_u8_4x2_sse2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:pack_store_u8_4x2_sse2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:pack_store_u8_4x2_sse2
1301
1302
static inline __m128i convolve_x_4tap_2x2_ssse3(const uint8_t *const src,
1303
                                                const ptrdiff_t src_stride,
1304
88.0k
                                                __m128i *const coeffs) {
1305
88.0k
  __m128i data[2];
1306
88.0k
  const __m128i f_0 = _mm_load_si128((__m128i const *)filt3_global_sse2);
1307
88.0k
  const __m128i f_1 = _mm_load_si128((__m128i const *)filt4_global_sse2);
1308
88.0k
  const __m128i reg =
1309
88.0k
      load_8bit_8x2_to_1_reg_sse2(src, (int)(sizeof(*src) * src_stride));
1310
1311
88.0k
  data[0] = _mm_shuffle_epi8(reg, f_0);
1312
88.0k
  data[1] = _mm_shuffle_epi8(reg, f_1);
1313
88.0k
  return convolve_lowbd_4tap_ssse3(data, coeffs);
1314
88.0k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_x_4tap_2x2_ssse3
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_x_4tap_2x2_ssse3
Unexecuted instantiation: convolve_2d_avx2.c:convolve_x_4tap_2x2_ssse3
convolve_avx2.c:convolve_x_4tap_2x2_ssse3
Line
Count
Source
1304
88.0k
                                                __m128i *const coeffs) {
1305
88.0k
  __m128i data[2];
1306
88.0k
  const __m128i f_0 = _mm_load_si128((__m128i const *)filt3_global_sse2);
1307
88.0k
  const __m128i f_1 = _mm_load_si128((__m128i const *)filt4_global_sse2);
1308
88.0k
  const __m128i reg =
1309
88.0k
      load_8bit_8x2_to_1_reg_sse2(src, (int)(sizeof(*src) * src_stride));
1310
1311
88.0k
  data[0] = _mm_shuffle_epi8(reg, f_0);
1312
88.0k
  data[1] = _mm_shuffle_epi8(reg, f_1);
1313
88.0k
  return convolve_lowbd_4tap_ssse3(data, coeffs);
1314
88.0k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_x_4tap_2x2_ssse3
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_x_4tap_2x2_ssse3
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_x_4tap_2x2_ssse3
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_x_4tap_2x2_ssse3
1315
1316
static inline void pack_store_u8_2x2_sse2(const __m128i reg, uint8_t *const dst,
1317
178k
                                          const ptrdiff_t dst_stride) {
1318
178k
  const __m128i data = _mm_packus_epi16(reg, reg);
1319
178k
  *(int16_t *)dst = (int16_t)_mm_cvtsi128_si32(data);
1320
178k
  *(int16_t *)(dst + dst_stride) = (int16_t)_mm_extract_epi16(data, 1);
1321
178k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:pack_store_u8_2x2_sse2
Unexecuted instantiation: highbd_convolve_avx2.c:pack_store_u8_2x2_sse2
Unexecuted instantiation: convolve_2d_avx2.c:pack_store_u8_2x2_sse2
convolve_avx2.c:pack_store_u8_2x2_sse2
Line
Count
Source
1317
178k
                                          const ptrdiff_t dst_stride) {
1318
178k
  const __m128i data = _mm_packus_epi16(reg, reg);
1319
178k
  *(int16_t *)dst = (int16_t)_mm_cvtsi128_si32(data);
1320
  *(int16_t *)(dst + dst_stride) = (int16_t)_mm_extract_epi16(data, 1);
1321
178k
}
Unexecuted instantiation: jnt_convolve_avx2.c:pack_store_u8_2x2_sse2
Unexecuted instantiation: wiener_convolve_avx2.c:pack_store_u8_2x2_sse2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:pack_store_u8_2x2_sse2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:pack_store_u8_2x2_sse2
1322
1323
static inline __m128i convolve_x_2tap_ssse3(const __m128i *data,
1324
35.1k
                                            const __m128i *coeff) {
1325
35.1k
  return _mm_maddubs_epi16(data[0], coeff[0]);
1326
35.1k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_x_2tap_ssse3
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_x_2tap_ssse3
Unexecuted instantiation: convolve_2d_avx2.c:convolve_x_2tap_ssse3
convolve_avx2.c:convolve_x_2tap_ssse3
Line
Count
Source
1324
35.1k
                                            const __m128i *coeff) {
1325
35.1k
  return _mm_maddubs_epi16(data[0], coeff[0]);
1326
35.1k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_x_2tap_ssse3
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_x_2tap_ssse3
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_x_2tap_ssse3
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_x_2tap_ssse3
1327
1328
static inline __m128i load8_x_4x2_sse4(const void *const src,
1329
11.5k
                                       const ptrdiff_t offset) {
1330
11.5k
  const __m128i s = _mm_cvtsi32_si128(loadu_int32(src));
1331
11.5k
  return _mm_insert_epi32(s, loadu_int32((uint8_t *)src + offset), 1);
1332
11.5k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:load8_x_4x2_sse4
Unexecuted instantiation: highbd_convolve_avx2.c:load8_x_4x2_sse4
Unexecuted instantiation: convolve_2d_avx2.c:load8_x_4x2_sse4
convolve_avx2.c:load8_x_4x2_sse4
Line
Count
Source
1329
11.5k
                                       const ptrdiff_t offset) {
1330
11.5k
  const __m128i s = _mm_cvtsi32_si128(loadu_int32(src));
1331
  return _mm_insert_epi32(s, loadu_int32((uint8_t *)src + offset), 1);
1332
11.5k
}
Unexecuted instantiation: jnt_convolve_avx2.c:load8_x_4x2_sse4
Unexecuted instantiation: wiener_convolve_avx2.c:load8_x_4x2_sse4
Unexecuted instantiation: highbd_convolve_2d_avx2.c:load8_x_4x2_sse4
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:load8_x_4x2_sse4
1333
1334
static inline __m128i load_x_u8_4x2_sse4(const uint8_t *const src,
1335
11.5k
                                         const ptrdiff_t stride) {
1336
11.5k
  return load8_x_4x2_sse4(src, sizeof(*src) * stride);
1337
11.5k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:load_x_u8_4x2_sse4
Unexecuted instantiation: highbd_convolve_avx2.c:load_x_u8_4x2_sse4
Unexecuted instantiation: convolve_2d_avx2.c:load_x_u8_4x2_sse4
convolve_avx2.c:load_x_u8_4x2_sse4
Line
Count
Source
1335
11.5k
                                         const ptrdiff_t stride) {
1336
11.5k
  return load8_x_4x2_sse4(src, sizeof(*src) * stride);
1337
11.5k
}
Unexecuted instantiation: jnt_convolve_avx2.c:load_x_u8_4x2_sse4
Unexecuted instantiation: wiener_convolve_avx2.c:load_x_u8_4x2_sse4
Unexecuted instantiation: highbd_convolve_2d_avx2.c:load_x_u8_4x2_sse4
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:load_x_u8_4x2_sse4
1338
1339
static inline __m128i convolve_x_2tap_2x2_ssse3(const uint8_t *const src,
1340
                                                const ptrdiff_t stride,
1341
2.69k
                                                const __m128i *coeffs) {
1342
2.69k
  const __m128i flt = _mm_load_si128((__m128i const *)filt5_global_sse2);
1343
2.69k
  const __m128i reg = load_x_u8_4x2_sse4(src, stride);
1344
2.69k
  const __m128i data = _mm_shuffle_epi8(reg, flt);
1345
2.69k
  return convolve_x_2tap_ssse3(&data, coeffs);
1346
2.69k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_x_2tap_2x2_ssse3
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_x_2tap_2x2_ssse3
Unexecuted instantiation: convolve_2d_avx2.c:convolve_x_2tap_2x2_ssse3
convolve_avx2.c:convolve_x_2tap_2x2_ssse3
Line
Count
Source
1341
2.69k
                                                const __m128i *coeffs) {
1342
2.69k
  const __m128i flt = _mm_load_si128((__m128i const *)filt5_global_sse2);
1343
2.69k
  const __m128i reg = load_x_u8_4x2_sse4(src, stride);
1344
2.69k
  const __m128i data = _mm_shuffle_epi8(reg, flt);
1345
2.69k
  return convolve_x_2tap_ssse3(&data, coeffs);
1346
2.69k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_x_2tap_2x2_ssse3
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_x_2tap_2x2_ssse3
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_x_2tap_2x2_ssse3
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_x_2tap_2x2_ssse3
1347
1348
static inline __m128i convolve_x_2tap_4x2_ssse3(const uint8_t *const src,
1349
                                                const ptrdiff_t stride,
1350
10.7k
                                                const __m128i *coeffs) {
1351
10.7k
  const __m128i flt = _mm_load_si128((__m128i const *)filt1_global_sse2);
1352
10.7k
  const __m128i data =
1353
10.7k
      load_8bit_8x2_to_1_reg_sse2(src, (int)(sizeof(*src) * stride));
1354
10.7k
  const __m128i res = _mm_shuffle_epi8(data, flt);
1355
10.7k
  return convolve_x_2tap_ssse3(&res, coeffs);
1356
10.7k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_x_2tap_4x2_ssse3
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_x_2tap_4x2_ssse3
Unexecuted instantiation: convolve_2d_avx2.c:convolve_x_2tap_4x2_ssse3
convolve_avx2.c:convolve_x_2tap_4x2_ssse3
Line
Count
Source
1350
10.7k
                                                const __m128i *coeffs) {
1351
10.7k
  const __m128i flt = _mm_load_si128((__m128i const *)filt1_global_sse2);
1352
10.7k
  const __m128i data =
1353
10.7k
      load_8bit_8x2_to_1_reg_sse2(src, (int)(sizeof(*src) * stride));
1354
10.7k
  const __m128i res = _mm_shuffle_epi8(data, flt);
1355
10.7k
  return convolve_x_2tap_ssse3(&res, coeffs);
1356
10.7k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_x_2tap_4x2_ssse3
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_x_2tap_4x2_ssse3
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_x_2tap_4x2_ssse3
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_x_2tap_4x2_ssse3
1357
1358
static inline void convolve_x_2tap_8x2_ssse3(const uint8_t *const src,
1359
                                             const ptrdiff_t stride,
1360
                                             const __m128i *coeffs,
1361
10.8k
                                             __m128i *data) {
1362
10.8k
  __m128i res[2];
1363
10.8k
  const __m128i reg_00 = _mm_loadu_si128((__m128i *)src);
1364
10.8k
  const __m128i reg_10 = _mm_loadu_si128((__m128i *)(src + stride));
1365
10.8k
  const __m128i reg_01 = _mm_srli_si128(reg_00, 1);
1366
10.8k
  const __m128i reg_11 = _mm_srli_si128(reg_10, 1);
1367
10.8k
  res[0] = _mm_unpacklo_epi8(reg_00, reg_01);
1368
10.8k
  res[1] = _mm_unpacklo_epi8(reg_10, reg_11);
1369
1370
10.8k
  data[0] = convolve_x_2tap_ssse3(&res[0], coeffs);
1371
10.8k
  data[1] = convolve_x_2tap_ssse3(&res[1], coeffs);
1372
10.8k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_x_2tap_8x2_ssse3
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_x_2tap_8x2_ssse3
Unexecuted instantiation: convolve_2d_avx2.c:convolve_x_2tap_8x2_ssse3
convolve_avx2.c:convolve_x_2tap_8x2_ssse3
Line
Count
Source
1361
10.8k
                                             __m128i *data) {
1362
10.8k
  __m128i res[2];
1363
10.8k
  const __m128i reg_00 = _mm_loadu_si128((__m128i *)src);
1364
10.8k
  const __m128i reg_10 = _mm_loadu_si128((__m128i *)(src + stride));
1365
10.8k
  const __m128i reg_01 = _mm_srli_si128(reg_00, 1);
1366
10.8k
  const __m128i reg_11 = _mm_srli_si128(reg_10, 1);
1367
10.8k
  res[0] = _mm_unpacklo_epi8(reg_00, reg_01);
1368
10.8k
  res[1] = _mm_unpacklo_epi8(reg_10, reg_11);
1369
1370
10.8k
  data[0] = convolve_x_2tap_ssse3(&res[0], coeffs);
1371
10.8k
  data[1] = convolve_x_2tap_ssse3(&res[1], coeffs);
1372
10.8k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_x_2tap_8x2_ssse3
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_x_2tap_8x2_ssse3
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_x_2tap_8x2_ssse3
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_x_2tap_8x2_ssse3
1373
1374
static inline __m256i loadu_x_8bit_16x2_avx2(const void *const src,
1375
896k
                                             const ptrdiff_t offset) {
1376
896k
  const __m128i reg0 = _mm_loadu_si128((__m128i *)src);
1377
896k
  const __m128i reg1 = _mm_loadu_si128((__m128i *)((uint8_t *)src + offset));
1378
896k
  return _mm256_setr_m128i(reg0, reg1);
1379
896k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:loadu_x_8bit_16x2_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:loadu_x_8bit_16x2_avx2
Unexecuted instantiation: convolve_2d_avx2.c:loadu_x_8bit_16x2_avx2
convolve_avx2.c:loadu_x_8bit_16x2_avx2
Line
Count
Source
1375
896k
                                             const ptrdiff_t offset) {
1376
896k
  const __m128i reg0 = _mm_loadu_si128((__m128i *)src);
1377
896k
  const __m128i reg1 = _mm_loadu_si128((__m128i *)((uint8_t *)src + offset));
1378
896k
  return _mm256_setr_m128i(reg0, reg1);
1379
896k
}
Unexecuted instantiation: jnt_convolve_avx2.c:loadu_x_8bit_16x2_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:loadu_x_8bit_16x2_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:loadu_x_8bit_16x2_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:loadu_x_8bit_16x2_avx2
1380
1381
static inline __m256i convolve_x_2tap_avx2(const __m256i *data,
1382
228k
                                           const __m256i *coeffs) {
1383
228k
  return _mm256_maddubs_epi16(data[0], coeffs[0]);
1384
228k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_x_2tap_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_x_2tap_avx2
Unexecuted instantiation: convolve_2d_avx2.c:convolve_x_2tap_avx2
convolve_avx2.c:convolve_x_2tap_avx2
Line
Count
Source
1382
228k
                                           const __m256i *coeffs) {
1383
228k
  return _mm256_maddubs_epi16(data[0], coeffs[0]);
1384
228k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_x_2tap_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_x_2tap_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_x_2tap_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_x_2tap_avx2
1385
1386
static inline void convolve_x_2tap_16x2_avx2(const uint8_t *const src,
1387
                                             const ptrdiff_t stride,
1388
                                             const __m256i *coeffs,
1389
9.00k
                                             __m256i *data) {
1390
9.00k
  const __m256i reg0 = loadu_x_8bit_16x2_avx2(src, stride);
1391
9.00k
  const __m256i reg1 = loadu_x_8bit_16x2_avx2(src + 1, stride);
1392
9.00k
  const __m256i res0 = _mm256_unpacklo_epi8(reg0, reg1);
1393
9.00k
  const __m256i res1 = _mm256_unpackhi_epi8(reg0, reg1);
1394
9.00k
  data[0] = convolve_x_2tap_avx2(&res0, coeffs);
1395
9.00k
  data[1] = convolve_x_2tap_avx2(&res1, coeffs);
1396
9.00k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_x_2tap_16x2_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_x_2tap_16x2_avx2
Unexecuted instantiation: convolve_2d_avx2.c:convolve_x_2tap_16x2_avx2
convolve_avx2.c:convolve_x_2tap_16x2_avx2
Line
Count
Source
1389
9.00k
                                             __m256i *data) {
1390
9.00k
  const __m256i reg0 = loadu_x_8bit_16x2_avx2(src, stride);
1391
9.00k
  const __m256i reg1 = loadu_x_8bit_16x2_avx2(src + 1, stride);
1392
9.00k
  const __m256i res0 = _mm256_unpacklo_epi8(reg0, reg1);
1393
9.00k
  const __m256i res1 = _mm256_unpackhi_epi8(reg0, reg1);
1394
9.00k
  data[0] = convolve_x_2tap_avx2(&res0, coeffs);
1395
9.00k
  data[1] = convolve_x_2tap_avx2(&res1, coeffs);
1396
9.00k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_x_2tap_16x2_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_x_2tap_16x2_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_x_2tap_16x2_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_x_2tap_16x2_avx2
1397
1398
static inline void store_u8_16x2_avx2(const __m256i src, uint8_t *const dst,
1399
1.87M
                                      const ptrdiff_t stride) {
1400
1.87M
  const __m128i reg0 = _mm256_castsi256_si128(src);
1401
1.87M
  const __m128i reg1 = _mm256_extracti128_si256(src, 1);
1402
1.87M
  _mm_storeu_si128((__m128i *)dst, reg0);
1403
1.87M
  _mm_storeu_si128((__m128i *)((uint8_t *)dst + stride), reg1);
1404
1.87M
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:store_u8_16x2_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:store_u8_16x2_avx2
Unexecuted instantiation: convolve_2d_avx2.c:store_u8_16x2_avx2
convolve_avx2.c:store_u8_16x2_avx2
Line
Count
Source
1399
1.87M
                                      const ptrdiff_t stride) {
1400
1.87M
  const __m128i reg0 = _mm256_castsi256_si128(src);
1401
  const __m128i reg1 = _mm256_extracti128_si256(src, 1);
1402
1.87M
  _mm_storeu_si128((__m128i *)dst, reg0);
1403
1.87M
  _mm_storeu_si128((__m128i *)((uint8_t *)dst + stride), reg1);
1404
1.87M
}
Unexecuted instantiation: jnt_convolve_avx2.c:store_u8_16x2_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:store_u8_16x2_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:store_u8_16x2_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:store_u8_16x2_avx2
1405
1406
static inline void store_u8_8x2_avx2(const __m256i src, uint8_t *const dst,
1407
576k
                                     const ptrdiff_t stride) {
1408
576k
  const __m128i reg0 = _mm256_castsi256_si128(src);
1409
576k
  const __m128i reg1 = _mm256_extracti128_si256(src, 1);
1410
576k
  _mm_storel_epi64((__m128i *)dst, reg0);
1411
576k
  _mm_storel_epi64((__m128i *)(dst + stride), reg1);
1412
576k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:store_u8_8x2_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:store_u8_8x2_avx2
Unexecuted instantiation: convolve_2d_avx2.c:store_u8_8x2_avx2
convolve_avx2.c:store_u8_8x2_avx2
Line
Count
Source
1407
576k
                                     const ptrdiff_t stride) {
1408
576k
  const __m128i reg0 = _mm256_castsi256_si128(src);
1409
  const __m128i reg1 = _mm256_extracti128_si256(src, 1);
1410
576k
  _mm_storel_epi64((__m128i *)dst, reg0);
1411
576k
  _mm_storel_epi64((__m128i *)(dst + stride), reg1);
1412
576k
}
Unexecuted instantiation: jnt_convolve_avx2.c:store_u8_8x2_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:store_u8_8x2_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:store_u8_8x2_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:store_u8_8x2_avx2
1413
1414
static inline void pack_store_16x2_avx2(const __m256i data0,
1415
                                        const __m256i data1, uint8_t *const dst,
1416
1.87M
                                        const ptrdiff_t stride) {
1417
1.87M
  const __m256i res = _mm256_packus_epi16(data0, data1);
1418
1.87M
  store_u8_16x2_avx2(res, dst, stride);
1419
1.87M
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:pack_store_16x2_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:pack_store_16x2_avx2
Unexecuted instantiation: convolve_2d_avx2.c:pack_store_16x2_avx2
convolve_avx2.c:pack_store_16x2_avx2
Line
Count
Source
1416
1.87M
                                        const ptrdiff_t stride) {
1417
1.87M
  const __m256i res = _mm256_packus_epi16(data0, data1);
1418
1.87M
  store_u8_16x2_avx2(res, dst, stride);
1419
1.87M
}
Unexecuted instantiation: jnt_convolve_avx2.c:pack_store_16x2_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:pack_store_16x2_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:pack_store_16x2_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:pack_store_16x2_avx2
1420
1421
static inline void pack_store_8x2_avx2(const __m256i data, uint8_t *const dst,
1422
576k
                                       const ptrdiff_t stride) {
1423
576k
  const __m256i res = _mm256_packus_epi16(data, data);
1424
576k
  store_u8_8x2_avx2(res, dst, stride);
1425
576k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:pack_store_8x2_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:pack_store_8x2_avx2
Unexecuted instantiation: convolve_2d_avx2.c:pack_store_8x2_avx2
convolve_avx2.c:pack_store_8x2_avx2
Line
Count
Source
1422
576k
                                       const ptrdiff_t stride) {
1423
576k
  const __m256i res = _mm256_packus_epi16(data, data);
1424
576k
  store_u8_8x2_avx2(res, dst, stride);
1425
576k
}
Unexecuted instantiation: jnt_convolve_avx2.c:pack_store_8x2_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:pack_store_8x2_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:pack_store_8x2_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:pack_store_8x2_avx2
1426
1427
static inline void round_pack_store_16x2_avx2(const __m256i *data,
1428
                                              uint8_t *const dst,
1429
448k
                                              const ptrdiff_t dst_stride) {
1430
448k
  __m256i reg[2];
1431
1432
448k
  reg[0] = round_sr_x_avx2(data[0]);
1433
448k
  reg[1] = round_sr_x_avx2(data[1]);
1434
448k
  pack_store_16x2_avx2(reg[0], reg[1], dst, dst_stride);
1435
448k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:round_pack_store_16x2_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:round_pack_store_16x2_avx2
Unexecuted instantiation: convolve_2d_avx2.c:round_pack_store_16x2_avx2
convolve_avx2.c:round_pack_store_16x2_avx2
Line
Count
Source
1429
448k
                                              const ptrdiff_t dst_stride) {
1430
448k
  __m256i reg[2];
1431
1432
448k
  reg[0] = round_sr_x_avx2(data[0]);
1433
448k
  reg[1] = round_sr_x_avx2(data[1]);
1434
448k
  pack_store_16x2_avx2(reg[0], reg[1], dst, dst_stride);
1435
448k
}
Unexecuted instantiation: jnt_convolve_avx2.c:round_pack_store_16x2_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:round_pack_store_16x2_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:round_pack_store_16x2_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:round_pack_store_16x2_avx2
1436
1437
static inline void convolve_x_2tap_32_avx2(const uint8_t *const src,
1438
                                           const __m256i *coeffs,
1439
105k
                                           __m256i *data) {
1440
105k
  const __m256i res0 = _mm256_loadu_si256((__m256i *)src);
1441
105k
  const __m256i res1 = _mm256_loadu_si256((__m256i *)(src + 1));
1442
105k
  const __m256i reg0 = _mm256_unpacklo_epi8(res0, res1);
1443
105k
  const __m256i reg1 = _mm256_unpackhi_epi8(res0, res1);
1444
1445
105k
  data[0] = convolve_x_2tap_avx2(&reg0, coeffs);
1446
105k
  data[1] = convolve_x_2tap_avx2(&reg1, coeffs);
1447
105k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_x_2tap_32_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_x_2tap_32_avx2
Unexecuted instantiation: convolve_2d_avx2.c:convolve_x_2tap_32_avx2
convolve_avx2.c:convolve_x_2tap_32_avx2
Line
Count
Source
1439
105k
                                           __m256i *data) {
1440
105k
  const __m256i res0 = _mm256_loadu_si256((__m256i *)src);
1441
105k
  const __m256i res1 = _mm256_loadu_si256((__m256i *)(src + 1));
1442
105k
  const __m256i reg0 = _mm256_unpacklo_epi8(res0, res1);
1443
105k
  const __m256i reg1 = _mm256_unpackhi_epi8(res0, res1);
1444
1445
105k
  data[0] = convolve_x_2tap_avx2(&reg0, coeffs);
1446
105k
  data[1] = convolve_x_2tap_avx2(&reg1, coeffs);
1447
105k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_x_2tap_32_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_x_2tap_32_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_x_2tap_32_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_x_2tap_32_avx2
1448
1449
static inline void pack_store_32_avx2(const __m256i data0, const __m256i data1,
1450
1.56M
                                      uint8_t *const dst) {
1451
1.56M
  const __m256i reg = _mm256_packus_epi16(data0, data1);
1452
1.56M
  _mm256_storeu_si256((__m256i *)dst, reg);
1453
1.56M
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:pack_store_32_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:pack_store_32_avx2
Unexecuted instantiation: convolve_2d_avx2.c:pack_store_32_avx2
convolve_avx2.c:pack_store_32_avx2
Line
Count
Source
1450
1.56M
                                      uint8_t *const dst) {
1451
1.56M
  const __m256i reg = _mm256_packus_epi16(data0, data1);
1452
1.56M
  _mm256_storeu_si256((__m256i *)dst, reg);
1453
1.56M
}
Unexecuted instantiation: jnt_convolve_avx2.c:pack_store_32_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:pack_store_32_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:pack_store_32_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:pack_store_32_avx2
1454
1455
static inline void round_pack_store_32_avx2(const __m256i *data,
1456
1.20M
                                            uint8_t *const dst) {
1457
1.20M
  __m256i reg[2];
1458
1459
1.20M
  reg[0] = round_sr_x_avx2(data[0]);
1460
1.20M
  reg[1] = round_sr_x_avx2(data[1]);
1461
1.20M
  pack_store_32_avx2(reg[0], reg[1], dst);
1462
1.20M
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:round_pack_store_32_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:round_pack_store_32_avx2
Unexecuted instantiation: convolve_2d_avx2.c:round_pack_store_32_avx2
convolve_avx2.c:round_pack_store_32_avx2
Line
Count
Source
1456
1.20M
                                            uint8_t *const dst) {
1457
1.20M
  __m256i reg[2];
1458
1459
1.20M
  reg[0] = round_sr_x_avx2(data[0]);
1460
1.20M
  reg[1] = round_sr_x_avx2(data[1]);
1461
1.20M
  pack_store_32_avx2(reg[0], reg[1], dst);
1462
1.20M
}
Unexecuted instantiation: jnt_convolve_avx2.c:round_pack_store_32_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:round_pack_store_32_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:round_pack_store_32_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:round_pack_store_32_avx2
1463
1464
static inline void convolve_round_2tap_32_avx2(const uint8_t *const src,
1465
                                               const __m256i *coeffs,
1466
105k
                                               uint8_t *const dst) {
1467
105k
  __m256i data[2];
1468
1469
105k
  convolve_x_2tap_32_avx2(src, coeffs, data);
1470
105k
  round_pack_store_32_avx2(data, dst);
1471
105k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_round_2tap_32_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_round_2tap_32_avx2
Unexecuted instantiation: convolve_2d_avx2.c:convolve_round_2tap_32_avx2
convolve_avx2.c:convolve_round_2tap_32_avx2
Line
Count
Source
1466
105k
                                               uint8_t *const dst) {
1467
105k
  __m256i data[2];
1468
1469
105k
  convolve_x_2tap_32_avx2(src, coeffs, data);
1470
105k
  round_pack_store_32_avx2(data, dst);
1471
105k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_round_2tap_32_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_round_2tap_32_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_round_2tap_32_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_round_2tap_32_avx2
1472
1473
static inline void load_avg_store_2tap_32_avx2(const uint8_t *const src,
1474
116k
                                               uint8_t *const dst) {
1475
116k
  const __m256i res0 = _mm256_loadu_si256((__m256i *)src);
1476
116k
  const __m256i res1 = _mm256_loadu_si256((__m256i *)(src + 1));
1477
116k
  const __m256i data = _mm256_avg_epu8(res0, res1);
1478
116k
  _mm256_storeu_si256((__m256i *)dst, data);
1479
116k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:load_avg_store_2tap_32_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:load_avg_store_2tap_32_avx2
Unexecuted instantiation: convolve_2d_avx2.c:load_avg_store_2tap_32_avx2
convolve_avx2.c:load_avg_store_2tap_32_avx2
Line
Count
Source
1474
116k
                                               uint8_t *const dst) {
1475
116k
  const __m256i res0 = _mm256_loadu_si256((__m256i *)src);
1476
116k
  const __m256i res1 = _mm256_loadu_si256((__m256i *)(src + 1));
1477
116k
  const __m256i data = _mm256_avg_epu8(res0, res1);
1478
116k
  _mm256_storeu_si256((__m256i *)dst, data);
1479
116k
}
Unexecuted instantiation: jnt_convolve_avx2.c:load_avg_store_2tap_32_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:load_avg_store_2tap_32_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:load_avg_store_2tap_32_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:load_avg_store_2tap_32_avx2
1480
1481
static inline __m256i load_convolve_8tap_8x2_avx2(const uint8_t *const src,
1482
                                                  const ptrdiff_t stride,
1483
                                                  const __m256i *coeffs,
1484
46.4k
                                                  const __m256i *flt) {
1485
46.4k
  const __m256i res = loadu_x_8bit_16x2_avx2(src, stride);
1486
46.4k
  return convolve_lowbd_x(res, coeffs, flt);
1487
46.4k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:load_convolve_8tap_8x2_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:load_convolve_8tap_8x2_avx2
Unexecuted instantiation: convolve_2d_avx2.c:load_convolve_8tap_8x2_avx2
convolve_avx2.c:load_convolve_8tap_8x2_avx2
Line
Count
Source
1484
46.4k
                                                  const __m256i *flt) {
1485
46.4k
  const __m256i res = loadu_x_8bit_16x2_avx2(src, stride);
1486
46.4k
  return convolve_lowbd_x(res, coeffs, flt);
1487
46.4k
}
Unexecuted instantiation: jnt_convolve_avx2.c:load_convolve_8tap_8x2_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:load_convolve_8tap_8x2_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:load_convolve_8tap_8x2_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:load_convolve_8tap_8x2_avx2
1488
1489
static inline void load_convolve_8tap_16x2_avx2(const uint8_t *const src,
1490
                                                const int32_t src_stride,
1491
                                                const __m256i *coeffs,
1492
                                                const __m256i *flt,
1493
23.2k
                                                __m256i *reg) {
1494
23.2k
  reg[0] = load_convolve_8tap_8x2_avx2(src + 0, src_stride, coeffs, flt);
1495
23.2k
  reg[1] = load_convolve_8tap_8x2_avx2(src + 8, src_stride, coeffs, flt);
1496
23.2k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:load_convolve_8tap_16x2_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:load_convolve_8tap_16x2_avx2
Unexecuted instantiation: convolve_2d_avx2.c:load_convolve_8tap_16x2_avx2
convolve_avx2.c:load_convolve_8tap_16x2_avx2
Line
Count
Source
1493
23.2k
                                                __m256i *reg) {
1494
23.2k
  reg[0] = load_convolve_8tap_8x2_avx2(src + 0, src_stride, coeffs, flt);
1495
23.2k
  reg[1] = load_convolve_8tap_8x2_avx2(src + 8, src_stride, coeffs, flt);
1496
23.2k
}
Unexecuted instantiation: jnt_convolve_avx2.c:load_convolve_8tap_16x2_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:load_convolve_8tap_16x2_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:load_convolve_8tap_16x2_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:load_convolve_8tap_16x2_avx2
1497
1498
static inline void load_convolve_8tap_32_avx2(const uint8_t *const src,
1499
                                              const __m256i *coeffs,
1500
                                              const __m256i *filt,
1501
128k
                                              __m256i *data) {
1502
128k
  const __m256i reg_0 = _mm256_loadu_si256((__m256i *)src);
1503
128k
  const __m256i reg_8 = _mm256_loadu_si256((__m256i *)(src + 8));
1504
1505
128k
  data[0] = convolve_lowbd_x(reg_0, coeffs, filt);
1506
128k
  data[1] = convolve_lowbd_x(reg_8, coeffs, filt);
1507
128k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:load_convolve_8tap_32_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:load_convolve_8tap_32_avx2
Unexecuted instantiation: convolve_2d_avx2.c:load_convolve_8tap_32_avx2
convolve_avx2.c:load_convolve_8tap_32_avx2
Line
Count
Source
1501
128k
                                              __m256i *data) {
1502
128k
  const __m256i reg_0 = _mm256_loadu_si256((__m256i *)src);
1503
128k
  const __m256i reg_8 = _mm256_loadu_si256((__m256i *)(src + 8));
1504
1505
128k
  data[0] = convolve_lowbd_x(reg_0, coeffs, filt);
1506
128k
  data[1] = convolve_lowbd_x(reg_8, coeffs, filt);
1507
128k
}
Unexecuted instantiation: jnt_convolve_avx2.c:load_convolve_8tap_32_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:load_convolve_8tap_32_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:load_convolve_8tap_32_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:load_convolve_8tap_32_avx2
1508
1509
static inline void load_convolve_round_8tap_32_avx2(const uint8_t *const src,
1510
                                                    const __m256i *coeffs,
1511
                                                    const __m256i *filt,
1512
128k
                                                    uint8_t *const dst) {
1513
128k
  __m256i data[2];
1514
1515
128k
  load_convolve_8tap_32_avx2(src, coeffs, filt, data);
1516
128k
  round_pack_store_32_avx2(data, dst);
1517
128k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:load_convolve_round_8tap_32_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:load_convolve_round_8tap_32_avx2
Unexecuted instantiation: convolve_2d_avx2.c:load_convolve_round_8tap_32_avx2
convolve_avx2.c:load_convolve_round_8tap_32_avx2
Line
Count
Source
1512
128k
                                                    uint8_t *const dst) {
1513
128k
  __m256i data[2];
1514
1515
128k
  load_convolve_8tap_32_avx2(src, coeffs, filt, data);
1516
128k
  round_pack_store_32_avx2(data, dst);
1517
128k
}
Unexecuted instantiation: jnt_convolve_avx2.c:load_convolve_round_8tap_32_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:load_convolve_round_8tap_32_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:load_convolve_round_8tap_32_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:load_convolve_round_8tap_32_avx2
1518
1519
static inline void load_convolve_6tap_32_avx2(const uint8_t *const src,
1520
                                              const __m256i *coeffs,
1521
                                              const __m256i *filt,
1522
968k
                                              __m256i *data) {
1523
968k
  const __m256i reg0 = _mm256_loadu_si256((__m256i *)src);
1524
968k
  const __m256i reg1 = _mm256_loadu_si256((__m256i *)(src + 8));
1525
1526
968k
  data[0] = convolve_lowbd_x_6tap(reg0, coeffs, filt);
1527
968k
  data[1] = convolve_lowbd_x_6tap(reg1, coeffs, filt);
1528
968k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:load_convolve_6tap_32_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:load_convolve_6tap_32_avx2
Unexecuted instantiation: convolve_2d_avx2.c:load_convolve_6tap_32_avx2
convolve_avx2.c:load_convolve_6tap_32_avx2
Line
Count
Source
1522
968k
                                              __m256i *data) {
1523
968k
  const __m256i reg0 = _mm256_loadu_si256((__m256i *)src);
1524
968k
  const __m256i reg1 = _mm256_loadu_si256((__m256i *)(src + 8));
1525
1526
968k
  data[0] = convolve_lowbd_x_6tap(reg0, coeffs, filt);
1527
968k
  data[1] = convolve_lowbd_x_6tap(reg1, coeffs, filt);
1528
968k
}
Unexecuted instantiation: jnt_convolve_avx2.c:load_convolve_6tap_32_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:load_convolve_6tap_32_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:load_convolve_6tap_32_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:load_convolve_6tap_32_avx2
1529
1530
static inline void convolve_sr_store_6tap_32_avx2(const uint8_t *const src,
1531
                                                  const __m256i *coeffs,
1532
                                                  const __m256i *filt,
1533
968k
                                                  uint8_t *const dst) {
1534
968k
  __m256i data[2];
1535
1536
968k
  load_convolve_6tap_32_avx2(src, coeffs, filt, data);
1537
968k
  round_pack_store_32_avx2(data, dst);
1538
968k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_sr_store_6tap_32_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_sr_store_6tap_32_avx2
Unexecuted instantiation: convolve_2d_avx2.c:convolve_sr_store_6tap_32_avx2
convolve_avx2.c:convolve_sr_store_6tap_32_avx2
Line
Count
Source
1533
968k
                                                  uint8_t *const dst) {
1534
968k
  __m256i data[2];
1535
1536
968k
  load_convolve_6tap_32_avx2(src, coeffs, filt, data);
1537
968k
  round_pack_store_32_avx2(data, dst);
1538
968k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_sr_store_6tap_32_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_sr_store_6tap_32_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_sr_store_6tap_32_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_sr_store_6tap_32_avx2
1539
1540
static inline __m256i load_convolve_6tap_8x2_avx2(const uint8_t *const src,
1541
                                                  const ptrdiff_t stride,
1542
                                                  const __m256i *coeffs,
1543
831k
                                                  const __m256i *filt) {
1544
831k
  const __m256i data = loadu_x_8bit_16x2_avx2(src, stride);
1545
831k
  return convolve_lowbd_x_6tap(data, coeffs, filt);
1546
831k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:load_convolve_6tap_8x2_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:load_convolve_6tap_8x2_avx2
Unexecuted instantiation: convolve_2d_avx2.c:load_convolve_6tap_8x2_avx2
convolve_avx2.c:load_convolve_6tap_8x2_avx2
Line
Count
Source
1543
831k
                                                  const __m256i *filt) {
1544
831k
  const __m256i data = loadu_x_8bit_16x2_avx2(src, stride);
1545
831k
  return convolve_lowbd_x_6tap(data, coeffs, filt);
1546
831k
}
Unexecuted instantiation: jnt_convolve_avx2.c:load_convolve_6tap_8x2_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:load_convolve_6tap_8x2_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:load_convolve_6tap_8x2_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:load_convolve_6tap_8x2_avx2
1547
1548
static inline void load_convolve_6tap_16x2_avx2(const uint8_t *const src,
1549
                                                const int32_t src_stride,
1550
                                                const __m256i *coeffs,
1551
                                                const __m256i *filt,
1552
415k
                                                __m256i *data) {
1553
415k
  data[0] = load_convolve_6tap_8x2_avx2(src + 0, src_stride, coeffs, filt);
1554
415k
  data[1] = load_convolve_6tap_8x2_avx2(src + 8, src_stride, coeffs, filt);
1555
415k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:load_convolve_6tap_16x2_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:load_convolve_6tap_16x2_avx2
Unexecuted instantiation: convolve_2d_avx2.c:load_convolve_6tap_16x2_avx2
convolve_avx2.c:load_convolve_6tap_16x2_avx2
Line
Count
Source
1552
415k
                                                __m256i *data) {
1553
415k
  data[0] = load_convolve_6tap_8x2_avx2(src + 0, src_stride, coeffs, filt);
1554
415k
  data[1] = load_convolve_6tap_8x2_avx2(src + 8, src_stride, coeffs, filt);
1555
415k
}
Unexecuted instantiation: jnt_convolve_avx2.c:load_convolve_6tap_16x2_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:load_convolve_6tap_16x2_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:load_convolve_6tap_16x2_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:load_convolve_6tap_16x2_avx2
1556
1557
584k
static inline __m128i round_sr_y_ssse3(const __m128i data) {
1558
584k
  const __m128i value = _mm_set1_epi16(32);
1559
584k
  const __m128i reg = _mm_add_epi16(data, value);
1560
584k
  return _mm_srai_epi16(reg, FILTER_BITS - 1);
1561
584k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:round_sr_y_ssse3
Unexecuted instantiation: highbd_convolve_avx2.c:round_sr_y_ssse3
Unexecuted instantiation: convolve_2d_avx2.c:round_sr_y_ssse3
convolve_avx2.c:round_sr_y_ssse3
Line
Count
Source
1557
584k
static inline __m128i round_sr_y_ssse3(const __m128i data) {
1558
584k
  const __m128i value = _mm_set1_epi16(32);
1559
584k
  const __m128i reg = _mm_add_epi16(data, value);
1560
584k
  return _mm_srai_epi16(reg, FILTER_BITS - 1);
1561
584k
}
Unexecuted instantiation: jnt_convolve_avx2.c:round_sr_y_ssse3
Unexecuted instantiation: wiener_convolve_avx2.c:round_sr_y_ssse3
Unexecuted instantiation: highbd_convolve_2d_avx2.c:round_sr_y_ssse3
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:round_sr_y_ssse3
1562
1563
4.15M
static inline __m256i round_sr_y_avx2(const __m256i data) {
1564
4.15M
  const __m256i value = _mm256_set1_epi16(32);
1565
4.15M
  const __m256i reg = _mm256_add_epi16(data, value);
1566
4.15M
  return _mm256_srai_epi16(reg, FILTER_BITS - 1);
1567
4.15M
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:round_sr_y_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:round_sr_y_avx2
Unexecuted instantiation: convolve_2d_avx2.c:round_sr_y_avx2
convolve_avx2.c:round_sr_y_avx2
Line
Count
Source
1563
4.15M
static inline __m256i round_sr_y_avx2(const __m256i data) {
1564
4.15M
  const __m256i value = _mm256_set1_epi16(32);
1565
4.15M
  const __m256i reg = _mm256_add_epi16(data, value);
1566
4.15M
  return _mm256_srai_epi16(reg, FILTER_BITS - 1);
1567
4.15M
}
Unexecuted instantiation: jnt_convolve_avx2.c:round_sr_y_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:round_sr_y_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:round_sr_y_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:round_sr_y_avx2
1568
1569
static inline void round_pack_store_y_8x2_avx2(const __m256i res,
1570
                                               uint8_t *const dst,
1571
576k
                                               const ptrdiff_t dst_stride) {
1572
576k
  __m256i r;
1573
1574
576k
  r = round_sr_y_avx2(res);
1575
576k
  pack_store_8x2_avx2(r, dst, dst_stride);
1576
576k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:round_pack_store_y_8x2_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:round_pack_store_y_8x2_avx2
Unexecuted instantiation: convolve_2d_avx2.c:round_pack_store_y_8x2_avx2
convolve_avx2.c:round_pack_store_y_8x2_avx2
Line
Count
Source
1571
576k
                                               const ptrdiff_t dst_stride) {
1572
576k
  __m256i r;
1573
1574
576k
  r = round_sr_y_avx2(res);
1575
576k
  pack_store_8x2_avx2(r, dst, dst_stride);
1576
576k
}
Unexecuted instantiation: jnt_convolve_avx2.c:round_pack_store_y_8x2_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:round_pack_store_y_8x2_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:round_pack_store_y_8x2_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:round_pack_store_y_8x2_avx2
1577
1578
static inline void round_pack_store_y_16x2_avx2(const __m256i res[2],
1579
                                                uint8_t *const dst,
1580
1.42M
                                                const ptrdiff_t dst_stride) {
1581
1.42M
  __m256i r[2];
1582
1583
1.42M
  r[0] = round_sr_y_avx2(res[0]);
1584
1.42M
  r[1] = round_sr_y_avx2(res[1]);
1585
1.42M
  pack_store_16x2_avx2(r[0], r[1], dst, dst_stride);
1586
1.42M
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:round_pack_store_y_16x2_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:round_pack_store_y_16x2_avx2
Unexecuted instantiation: convolve_2d_avx2.c:round_pack_store_y_16x2_avx2
convolve_avx2.c:round_pack_store_y_16x2_avx2
Line
Count
Source
1580
1.42M
                                                const ptrdiff_t dst_stride) {
1581
1.42M
  __m256i r[2];
1582
1583
1.42M
  r[0] = round_sr_y_avx2(res[0]);
1584
1.42M
  r[1] = round_sr_y_avx2(res[1]);
1585
1.42M
  pack_store_16x2_avx2(r[0], r[1], dst, dst_stride);
1586
1.42M
}
Unexecuted instantiation: jnt_convolve_avx2.c:round_pack_store_y_16x2_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:round_pack_store_y_16x2_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:round_pack_store_y_16x2_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:round_pack_store_y_16x2_avx2
1587
1588
static inline void round_pack_store_y_32_avx2(const __m256i res[2],
1589
362k
                                              uint8_t *const dst) {
1590
362k
  __m256i r[2];
1591
1592
362k
  r[0] = round_sr_y_avx2(res[0]);
1593
362k
  r[1] = round_sr_y_avx2(res[1]);
1594
362k
  pack_store_32_avx2(r[0], r[1], dst);
1595
362k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:round_pack_store_y_32_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:round_pack_store_y_32_avx2
Unexecuted instantiation: convolve_2d_avx2.c:round_pack_store_y_32_avx2
convolve_avx2.c:round_pack_store_y_32_avx2
Line
Count
Source
1589
362k
                                              uint8_t *const dst) {
1590
362k
  __m256i r[2];
1591
1592
362k
  r[0] = round_sr_y_avx2(res[0]);
1593
362k
  r[1] = round_sr_y_avx2(res[1]);
1594
362k
  pack_store_32_avx2(r[0], r[1], dst);
1595
362k
}
Unexecuted instantiation: jnt_convolve_avx2.c:round_pack_store_y_32_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:round_pack_store_y_32_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:round_pack_store_y_32_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:round_pack_store_y_32_avx2
1596
1597
static inline void round_pack_store_y_32x2_avx2(const __m256i res[4],
1598
                                                uint8_t *const dst,
1599
181k
                                                const ptrdiff_t dst_stride) {
1600
181k
  round_pack_store_y_32_avx2(res, dst);
1601
181k
  round_pack_store_y_32_avx2(res + 2, dst + dst_stride);
1602
181k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:round_pack_store_y_32x2_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:round_pack_store_y_32x2_avx2
Unexecuted instantiation: convolve_2d_avx2.c:round_pack_store_y_32x2_avx2
convolve_avx2.c:round_pack_store_y_32x2_avx2
Line
Count
Source
1599
181k
                                                const ptrdiff_t dst_stride) {
1600
181k
  round_pack_store_y_32_avx2(res, dst);
1601
181k
  round_pack_store_y_32_avx2(res + 2, dst + dst_stride);
1602
181k
}
Unexecuted instantiation: jnt_convolve_avx2.c:round_pack_store_y_32x2_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:round_pack_store_y_32x2_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:round_pack_store_y_32x2_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:round_pack_store_y_32x2_avx2
1603
1604
static inline void convolve_y_2tap_2x2_ssse3(const uint8_t *const data,
1605
                                             const ptrdiff_t stride,
1606
                                             const __m128i *coeffs,
1607
3.20k
                                             __m128i d[2], __m128i *res) {
1608
3.20k
  d[1] = _mm_cvtsi32_si128(loadu_int16(data + 1 * stride));
1609
3.20k
  const __m128i src_01a = _mm_unpacklo_epi16(d[0], d[1]);
1610
3.20k
  d[0] = _mm_cvtsi32_si128(loadu_int16(data + 2 * stride));
1611
3.20k
  const __m128i src_12a = _mm_unpacklo_epi16(d[1], d[0]);
1612
1613
3.20k
  const __m128i s = _mm_unpacklo_epi8(src_01a, src_12a);
1614
1615
3.20k
  *res = _mm_maddubs_epi16(s, coeffs[0]);
1616
3.20k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_2tap_2x2_ssse3
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_2tap_2x2_ssse3
Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_2tap_2x2_ssse3
convolve_avx2.c:convolve_y_2tap_2x2_ssse3
Line
Count
Source
1607
3.20k
                                             __m128i d[2], __m128i *res) {
1608
3.20k
  d[1] = _mm_cvtsi32_si128(loadu_int16(data + 1 * stride));
1609
3.20k
  const __m128i src_01a = _mm_unpacklo_epi16(d[0], d[1]);
1610
3.20k
  d[0] = _mm_cvtsi32_si128(loadu_int16(data + 2 * stride));
1611
3.20k
  const __m128i src_12a = _mm_unpacklo_epi16(d[1], d[0]);
1612
1613
3.20k
  const __m128i s = _mm_unpacklo_epi8(src_01a, src_12a);
1614
1615
3.20k
  *res = _mm_maddubs_epi16(s, coeffs[0]);
1616
3.20k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_2tap_2x2_ssse3
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_2tap_2x2_ssse3
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_2tap_2x2_ssse3
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_2tap_2x2_ssse3
1617
1618
static inline void convolve_y_4tap_2x2_ssse3(const uint8_t *const data,
1619
                                             const ptrdiff_t stride,
1620
                                             const __m128i coeffs[2],
1621
                                             __m128i d[4], __m128i s[2],
1622
33.6k
                                             __m128i *res) {
1623
33.6k
  d[3] = _mm_cvtsi32_si128(loadu_int16(data + 3 * stride));
1624
33.6k
  const __m128i src_23a = _mm_unpacklo_epi16(d[2], d[3]);
1625
33.6k
  d[2] = _mm_cvtsi32_si128(loadu_int16(data + 4 * stride));
1626
33.6k
  const __m128i src_34a = _mm_unpacklo_epi16(d[3], d[2]);
1627
1628
33.6k
  s[1] = _mm_unpacklo_epi8(src_23a, src_34a);
1629
1630
33.6k
  *res = convolve_lowbd_4tap_ssse3(s, coeffs);
1631
33.6k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_4tap_2x2_ssse3
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_4tap_2x2_ssse3
Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_4tap_2x2_ssse3
convolve_avx2.c:convolve_y_4tap_2x2_ssse3
Line
Count
Source
1622
33.6k
                                             __m128i *res) {
1623
33.6k
  d[3] = _mm_cvtsi32_si128(loadu_int16(data + 3 * stride));
1624
33.6k
  const __m128i src_23a = _mm_unpacklo_epi16(d[2], d[3]);
1625
33.6k
  d[2] = _mm_cvtsi32_si128(loadu_int16(data + 4 * stride));
1626
33.6k
  const __m128i src_34a = _mm_unpacklo_epi16(d[3], d[2]);
1627
1628
33.6k
  s[1] = _mm_unpacklo_epi8(src_23a, src_34a);
1629
1630
33.6k
  *res = convolve_lowbd_4tap_ssse3(s, coeffs);
1631
33.6k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_4tap_2x2_ssse3
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_4tap_2x2_ssse3
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_4tap_2x2_ssse3
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_4tap_2x2_ssse3
1632
1633
static inline void convolve_y_6tap_2x2_ssse3(const uint8_t *const data,
1634
                                             const ptrdiff_t stride,
1635
                                             const __m128i coeffs[3],
1636
                                             __m128i d[6], __m128i s[3],
1637
46.5k
                                             __m128i *res) {
1638
46.5k
  d[5] = _mm_cvtsi32_si128(loadu_int16(data + 5 * stride));
1639
46.5k
  const __m128i src_45a = _mm_unpacklo_epi16(d[4], d[5]);
1640
46.5k
  d[4] = _mm_cvtsi32_si128(loadu_int16(data + 6 * stride));
1641
46.5k
  const __m128i src_56a = _mm_unpacklo_epi16(d[5], d[4]);
1642
1643
46.5k
  s[2] = _mm_unpacklo_epi8(src_45a, src_56a);
1644
1645
46.5k
  *res = convolve_lowbd_6tap_ssse3(s, coeffs);
1646
46.5k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_6tap_2x2_ssse3
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_6tap_2x2_ssse3
Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_6tap_2x2_ssse3
convolve_avx2.c:convolve_y_6tap_2x2_ssse3
Line
Count
Source
1637
46.5k
                                             __m128i *res) {
1638
46.5k
  d[5] = _mm_cvtsi32_si128(loadu_int16(data + 5 * stride));
1639
46.5k
  const __m128i src_45a = _mm_unpacklo_epi16(d[4], d[5]);
1640
46.5k
  d[4] = _mm_cvtsi32_si128(loadu_int16(data + 6 * stride));
1641
46.5k
  const __m128i src_56a = _mm_unpacklo_epi16(d[5], d[4]);
1642
1643
46.5k
  s[2] = _mm_unpacklo_epi8(src_45a, src_56a);
1644
1645
46.5k
  *res = convolve_lowbd_6tap_ssse3(s, coeffs);
1646
46.5k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_6tap_2x2_ssse3
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_6tap_2x2_ssse3
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_6tap_2x2_ssse3
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_6tap_2x2_ssse3
1647
1648
static inline void convolve_y_8tap_2x2_ssse3(const uint8_t *const data,
1649
                                             const ptrdiff_t stride,
1650
                                             const __m128i coeffs[4],
1651
                                             __m128i d[8], __m128i s[4],
1652
4.46k
                                             __m128i *res) {
1653
4.46k
  d[7] = _mm_cvtsi32_si128(loadu_int16(data + 7 * stride));
1654
4.46k
  const __m128i src_67a = _mm_unpacklo_epi16(d[6], d[7]);
1655
4.46k
  d[6] = _mm_cvtsi32_si128(loadu_int16(data + 8 * stride));
1656
4.46k
  const __m128i src_78a = _mm_unpacklo_epi16(d[7], d[6]);
1657
1658
4.46k
  s[3] = _mm_unpacklo_epi8(src_67a, src_78a);
1659
1660
4.46k
  *res = convolve_lowbd_ssse3(s, coeffs);
1661
4.46k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_8tap_2x2_ssse3
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_8tap_2x2_ssse3
Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_8tap_2x2_ssse3
convolve_avx2.c:convolve_y_8tap_2x2_ssse3
Line
Count
Source
1652
4.46k
                                             __m128i *res) {
1653
4.46k
  d[7] = _mm_cvtsi32_si128(loadu_int16(data + 7 * stride));
1654
4.46k
  const __m128i src_67a = _mm_unpacklo_epi16(d[6], d[7]);
1655
4.46k
  d[6] = _mm_cvtsi32_si128(loadu_int16(data + 8 * stride));
1656
4.46k
  const __m128i src_78a = _mm_unpacklo_epi16(d[7], d[6]);
1657
1658
4.46k
  s[3] = _mm_unpacklo_epi8(src_67a, src_78a);
1659
1660
4.46k
  *res = convolve_lowbd_ssse3(s, coeffs);
1661
4.46k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_8tap_2x2_ssse3
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_8tap_2x2_ssse3
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_8tap_2x2_ssse3
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_8tap_2x2_ssse3
1662
1663
static inline void convolve_y_2tap_4x2_ssse3(const uint8_t *const data,
1664
                                             const ptrdiff_t stride,
1665
                                             const __m128i *coeffs,
1666
14.8k
                                             __m128i d[2], __m128i *res) {
1667
14.8k
  d[1] = _mm_cvtsi32_si128(loadu_int32(data + 1 * stride));
1668
14.8k
  const __m128i src_01a = _mm_unpacklo_epi32(d[0], d[1]);
1669
14.8k
  d[0] = _mm_cvtsi32_si128(loadu_int32(data + 2 * stride));
1670
14.8k
  const __m128i src_12a = _mm_unpacklo_epi32(d[1], d[0]);
1671
1672
14.8k
  const __m128i s = _mm_unpacklo_epi8(src_01a, src_12a);
1673
1674
14.8k
  *res = _mm_maddubs_epi16(s, coeffs[0]);
1675
14.8k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_2tap_4x2_ssse3
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_2tap_4x2_ssse3
Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_2tap_4x2_ssse3
convolve_avx2.c:convolve_y_2tap_4x2_ssse3
Line
Count
Source
1666
14.8k
                                             __m128i d[2], __m128i *res) {
1667
14.8k
  d[1] = _mm_cvtsi32_si128(loadu_int32(data + 1 * stride));
1668
14.8k
  const __m128i src_01a = _mm_unpacklo_epi32(d[0], d[1]);
1669
14.8k
  d[0] = _mm_cvtsi32_si128(loadu_int32(data + 2 * stride));
1670
14.8k
  const __m128i src_12a = _mm_unpacklo_epi32(d[1], d[0]);
1671
1672
14.8k
  const __m128i s = _mm_unpacklo_epi8(src_01a, src_12a);
1673
1674
14.8k
  *res = _mm_maddubs_epi16(s, coeffs[0]);
1675
14.8k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_2tap_4x2_ssse3
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_2tap_4x2_ssse3
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_2tap_4x2_ssse3
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_2tap_4x2_ssse3
1676
1677
static inline void convolve_y_4tap_4x2_ssse3(const uint8_t *const data,
1678
                                             const ptrdiff_t stride,
1679
                                             const __m128i coeffs[2],
1680
                                             __m128i d[4], __m128i s[2],
1681
185k
                                             __m128i *res) {
1682
185k
  d[3] = _mm_cvtsi32_si128(loadu_int32(data + 3 * stride));
1683
185k
  const __m128i src_23a = _mm_unpacklo_epi32(d[2], d[3]);
1684
185k
  d[2] = _mm_cvtsi32_si128(loadu_int32(data + 4 * stride));
1685
185k
  const __m128i src_34a = _mm_unpacklo_epi32(d[3], d[2]);
1686
1687
185k
  s[1] = _mm_unpacklo_epi8(src_23a, src_34a);
1688
1689
185k
  *res = convolve_lowbd_4tap_ssse3(s, coeffs);
1690
185k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_4tap_4x2_ssse3
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_4tap_4x2_ssse3
Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_4tap_4x2_ssse3
convolve_avx2.c:convolve_y_4tap_4x2_ssse3
Line
Count
Source
1681
185k
                                             __m128i *res) {
1682
185k
  d[3] = _mm_cvtsi32_si128(loadu_int32(data + 3 * stride));
1683
185k
  const __m128i src_23a = _mm_unpacklo_epi32(d[2], d[3]);
1684
185k
  d[2] = _mm_cvtsi32_si128(loadu_int32(data + 4 * stride));
1685
185k
  const __m128i src_34a = _mm_unpacklo_epi32(d[3], d[2]);
1686
1687
185k
  s[1] = _mm_unpacklo_epi8(src_23a, src_34a);
1688
1689
185k
  *res = convolve_lowbd_4tap_ssse3(s, coeffs);
1690
185k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_4tap_4x2_ssse3
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_4tap_4x2_ssse3
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_4tap_4x2_ssse3
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_4tap_4x2_ssse3
1691
1692
static inline void convolve_y_6tap_4x2_ssse3(const uint8_t *const data,
1693
                                             const ptrdiff_t stride,
1694
                                             const __m128i coeffs[3],
1695
                                             __m128i d[6], __m128i s[3],
1696
274k
                                             __m128i *res) {
1697
274k
  d[5] = _mm_cvtsi32_si128(loadu_int32(data + 5 * stride));
1698
274k
  const __m128i src_45a = _mm_unpacklo_epi32(d[4], d[5]);
1699
274k
  d[4] = _mm_cvtsi32_si128(loadu_int32(data + 6 * stride));
1700
274k
  const __m128i src_56a = _mm_unpacklo_epi32(d[5], d[4]);
1701
1702
274k
  s[2] = _mm_unpacklo_epi8(src_45a, src_56a);
1703
1704
274k
  *res = convolve_lowbd_6tap_ssse3(s, coeffs);
1705
274k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_6tap_4x2_ssse3
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_6tap_4x2_ssse3
Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_6tap_4x2_ssse3
convolve_avx2.c:convolve_y_6tap_4x2_ssse3
Line
Count
Source
1696
274k
                                             __m128i *res) {
1697
274k
  d[5] = _mm_cvtsi32_si128(loadu_int32(data + 5 * stride));
1698
274k
  const __m128i src_45a = _mm_unpacklo_epi32(d[4], d[5]);
1699
274k
  d[4] = _mm_cvtsi32_si128(loadu_int32(data + 6 * stride));
1700
274k
  const __m128i src_56a = _mm_unpacklo_epi32(d[5], d[4]);
1701
1702
274k
  s[2] = _mm_unpacklo_epi8(src_45a, src_56a);
1703
1704
274k
  *res = convolve_lowbd_6tap_ssse3(s, coeffs);
1705
274k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_6tap_4x2_ssse3
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_6tap_4x2_ssse3
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_6tap_4x2_ssse3
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_6tap_4x2_ssse3
1706
1707
static inline void convolve_y_8tap_4x2_ssse3(const uint8_t *const data,
1708
                                             const ptrdiff_t stride,
1709
                                             const __m128i coeffs[4],
1710
                                             __m128i d[8], __m128i s[4],
1711
21.6k
                                             __m128i *res) {
1712
21.6k
  d[7] = _mm_cvtsi32_si128(loadu_int32(data + 7 * stride));
1713
21.6k
  const __m128i src_67a = _mm_unpacklo_epi32(d[6], d[7]);
1714
21.6k
  d[6] = _mm_cvtsi32_si128(loadu_int32(data + 8 * stride));
1715
21.6k
  const __m128i src_78a = _mm_unpacklo_epi32(d[7], d[6]);
1716
1717
21.6k
  s[3] = _mm_unpacklo_epi8(src_67a, src_78a);
1718
1719
21.6k
  res[0] = convolve_lowbd_ssse3(s, coeffs);
1720
21.6k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_8tap_4x2_ssse3
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_8tap_4x2_ssse3
Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_8tap_4x2_ssse3
convolve_avx2.c:convolve_y_8tap_4x2_ssse3
Line
Count
Source
1711
21.6k
                                             __m128i *res) {
1712
21.6k
  d[7] = _mm_cvtsi32_si128(loadu_int32(data + 7 * stride));
1713
21.6k
  const __m128i src_67a = _mm_unpacklo_epi32(d[6], d[7]);
1714
21.6k
  d[6] = _mm_cvtsi32_si128(loadu_int32(data + 8 * stride));
1715
21.6k
  const __m128i src_78a = _mm_unpacklo_epi32(d[7], d[6]);
1716
1717
21.6k
  s[3] = _mm_unpacklo_epi8(src_67a, src_78a);
1718
1719
21.6k
  res[0] = convolve_lowbd_ssse3(s, coeffs);
1720
21.6k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_8tap_4x2_ssse3
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_8tap_4x2_ssse3
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_8tap_4x2_ssse3
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_8tap_4x2_ssse3
1721
1722
static inline void convolve_y_2tap_8x2_avx2(const uint8_t *const data,
1723
                                            const ptrdiff_t stride,
1724
                                            const __m256i *coeffs, __m128i d[2],
1725
12.5k
                                            __m256i *res) {
1726
12.5k
  d[1] = _mm_loadu_si128((__m128i *)(data + 1 * stride));
1727
12.5k
  const __m256i src_01a = _mm256_setr_m128i(d[0], d[1]);
1728
12.5k
  d[0] = _mm_loadu_si128((__m128i *)(data + 2 * stride));
1729
12.5k
  const __m256i src_12a = _mm256_setr_m128i(d[1], d[0]);
1730
1731
12.5k
  const __m256i s = _mm256_unpacklo_epi8(src_01a, src_12a);
1732
1733
12.5k
  *res = _mm256_maddubs_epi16(s, coeffs[0]);
1734
12.5k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_2tap_8x2_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_2tap_8x2_avx2
Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_2tap_8x2_avx2
convolve_avx2.c:convolve_y_2tap_8x2_avx2
Line
Count
Source
1725
12.5k
                                            __m256i *res) {
1726
12.5k
  d[1] = _mm_loadu_si128((__m128i *)(data + 1 * stride));
1727
12.5k
  const __m256i src_01a = _mm256_setr_m128i(d[0], d[1]);
1728
12.5k
  d[0] = _mm_loadu_si128((__m128i *)(data + 2 * stride));
1729
12.5k
  const __m256i src_12a = _mm256_setr_m128i(d[1], d[0]);
1730
1731
12.5k
  const __m256i s = _mm256_unpacklo_epi8(src_01a, src_12a);
1732
1733
12.5k
  *res = _mm256_maddubs_epi16(s, coeffs[0]);
1734
12.5k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_2tap_8x2_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_2tap_8x2_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_2tap_8x2_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_2tap_8x2_avx2
1735
1736
static inline void convolve_y_4tap_8x2_avx2(const uint8_t *const data,
1737
                                            const ptrdiff_t stride,
1738
                                            const __m256i coeffs[2],
1739
                                            __m128i d[4], __m256i s[2],
1740
160k
                                            __m256i *res) {
1741
160k
  d[3] = _mm_loadu_si128((__m128i *)(data + 3 * stride));
1742
160k
  const __m256i src_23a = _mm256_setr_m128i(d[2], d[3]);
1743
160k
  d[2] = _mm_loadu_si128((__m128i *)(data + 4 * stride));
1744
160k
  const __m256i src_34a = _mm256_setr_m128i(d[3], d[2]);
1745
1746
160k
  s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);
1747
1748
160k
  *res = convolve_lowbd_4tap(s, coeffs);
1749
160k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_4tap_8x2_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_4tap_8x2_avx2
Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_4tap_8x2_avx2
convolve_avx2.c:convolve_y_4tap_8x2_avx2
Line
Count
Source
1740
160k
                                            __m256i *res) {
1741
160k
  d[3] = _mm_loadu_si128((__m128i *)(data + 3 * stride));
1742
160k
  const __m256i src_23a = _mm256_setr_m128i(d[2], d[3]);
1743
160k
  d[2] = _mm_loadu_si128((__m128i *)(data + 4 * stride));
1744
160k
  const __m256i src_34a = _mm256_setr_m128i(d[3], d[2]);
1745
1746
160k
  s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);
1747
1748
160k
  *res = convolve_lowbd_4tap(s, coeffs);
1749
160k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_4tap_8x2_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_4tap_8x2_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_4tap_8x2_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_4tap_8x2_avx2
1750
1751
static inline void convolve_y_6tap_8x2_avx2(const uint8_t *const data,
1752
                                            const ptrdiff_t stride,
1753
                                            const __m256i coeffs[3],
1754
                                            __m128i d[6], __m256i s[3],
1755
382k
                                            __m256i *res) {
1756
382k
  d[5] = _mm_loadu_si128((__m128i *)(data + 5 * stride));
1757
382k
  const __m256i src_45a = _mm256_setr_m128i(d[4], d[5]);
1758
382k
  d[4] = _mm_loadu_si128((__m128i *)(data + 6 * stride));
1759
382k
  const __m256i src_56a = _mm256_setr_m128i(d[5], d[4]);
1760
1761
382k
  s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
1762
1763
382k
  *res = convolve_lowbd_6tap(s, coeffs);
1764
382k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_6tap_8x2_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_6tap_8x2_avx2
Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_6tap_8x2_avx2
convolve_avx2.c:convolve_y_6tap_8x2_avx2
Line
Count
Source
1755
382k
                                            __m256i *res) {
1756
382k
  d[5] = _mm_loadu_si128((__m128i *)(data + 5 * stride));
1757
382k
  const __m256i src_45a = _mm256_setr_m128i(d[4], d[5]);
1758
382k
  d[4] = _mm_loadu_si128((__m128i *)(data + 6 * stride));
1759
382k
  const __m256i src_56a = _mm256_setr_m128i(d[5], d[4]);
1760
1761
382k
  s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
1762
1763
382k
  *res = convolve_lowbd_6tap(s, coeffs);
1764
382k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_6tap_8x2_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_6tap_8x2_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_6tap_8x2_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_6tap_8x2_avx2
1765
1766
static inline void convolve_y_8tap_8x2_avx2(const uint8_t *const data,
1767
                                            const ptrdiff_t stride,
1768
                                            const __m256i coeffs[4],
1769
                                            __m128i d[8], __m256i s[4],
1770
20.9k
                                            __m256i *res) {
1771
20.9k
  d[7] = _mm_loadu_si128((__m128i *)(data + 7 * stride));
1772
20.9k
  const __m256i src_67a = _mm256_setr_m128i(d[6], d[7]);
1773
20.9k
  d[6] = _mm_loadu_si128((__m128i *)(data + 8 * stride));
1774
20.9k
  const __m256i src_78a = _mm256_setr_m128i(d[7], d[6]);
1775
1776
20.9k
  s[3] = _mm256_unpacklo_epi8(src_67a, src_78a);
1777
1778
20.9k
  *res = convolve_lowbd(s, coeffs);
1779
20.9k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_8tap_8x2_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_8tap_8x2_avx2
Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_8tap_8x2_avx2
convolve_avx2.c:convolve_y_8tap_8x2_avx2
Line
Count
Source
1770
20.9k
                                            __m256i *res) {
1771
20.9k
  d[7] = _mm_loadu_si128((__m128i *)(data + 7 * stride));
1772
20.9k
  const __m256i src_67a = _mm256_setr_m128i(d[6], d[7]);
1773
20.9k
  d[6] = _mm_loadu_si128((__m128i *)(data + 8 * stride));
1774
20.9k
  const __m256i src_78a = _mm256_setr_m128i(d[7], d[6]);
1775
1776
20.9k
  s[3] = _mm256_unpacklo_epi8(src_67a, src_78a);
1777
1778
20.9k
  *res = convolve_lowbd(s, coeffs);
1779
20.9k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_8tap_8x2_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_8tap_8x2_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_8tap_8x2_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_8tap_8x2_avx2
1780
1781
static inline void convolve_y_2tap_16x2_avx2(const uint8_t *const data,
1782
                                             const ptrdiff_t stride,
1783
                                             const __m256i *coeffs,
1784
12.8k
                                             __m128i d[2], __m256i res[2]) {
1785
12.8k
  d[1] = _mm_loadu_si128((__m128i *)(data + 1 * stride));
1786
12.8k
  const __m256i src_01a = _mm256_setr_m128i(d[0], d[1]);
1787
12.8k
  d[0] = _mm_loadu_si128((__m128i *)(data + 2 * stride));
1788
12.8k
  const __m256i src_12a = _mm256_setr_m128i(d[1], d[0]);
1789
1790
12.8k
  const __m256i s0 = _mm256_unpacklo_epi8(src_01a, src_12a);
1791
12.8k
  const __m256i s1 = _mm256_unpackhi_epi8(src_01a, src_12a);
1792
1793
12.8k
  res[0] = _mm256_maddubs_epi16(s0, coeffs[0]);
1794
12.8k
  res[1] = _mm256_maddubs_epi16(s1, coeffs[0]);
1795
12.8k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_2tap_16x2_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_2tap_16x2_avx2
Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_2tap_16x2_avx2
convolve_avx2.c:convolve_y_2tap_16x2_avx2
Line
Count
Source
1784
12.8k
                                             __m128i d[2], __m256i res[2]) {
1785
12.8k
  d[1] = _mm_loadu_si128((__m128i *)(data + 1 * stride));
1786
12.8k
  const __m256i src_01a = _mm256_setr_m128i(d[0], d[1]);
1787
12.8k
  d[0] = _mm_loadu_si128((__m128i *)(data + 2 * stride));
1788
12.8k
  const __m256i src_12a = _mm256_setr_m128i(d[1], d[0]);
1789
1790
12.8k
  const __m256i s0 = _mm256_unpacklo_epi8(src_01a, src_12a);
1791
12.8k
  const __m256i s1 = _mm256_unpackhi_epi8(src_01a, src_12a);
1792
1793
12.8k
  res[0] = _mm256_maddubs_epi16(s0, coeffs[0]);
1794
12.8k
  res[1] = _mm256_maddubs_epi16(s1, coeffs[0]);
1795
12.8k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_2tap_16x2_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_2tap_16x2_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_2tap_16x2_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_2tap_16x2_avx2
1796
1797
static inline void convolve_y_4tap_16x2_avx2(const uint8_t *const data,
1798
                                             const ptrdiff_t stride,
1799
                                             const __m256i coeffs[2],
1800
                                             __m128i d[4], __m256i s[4],
1801
92.8k
                                             __m256i res[2]) {
1802
92.8k
  d[3] = _mm_loadu_si128((__m128i *)(data + 3 * stride));
1803
92.8k
  const __m256i src_23a = _mm256_setr_m128i(d[2], d[3]);
1804
92.8k
  d[2] = _mm_loadu_si128((__m128i *)(data + 4 * stride));
1805
92.8k
  const __m256i src_34a = _mm256_setr_m128i(d[3], d[2]);
1806
1807
92.8k
  s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);
1808
92.8k
  s[3] = _mm256_unpackhi_epi8(src_23a, src_34a);
1809
1810
92.8k
  res[0] = convolve_lowbd_4tap(s, coeffs);
1811
92.8k
  res[1] = convolve_lowbd_4tap(s + 2, coeffs);
1812
92.8k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_4tap_16x2_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_4tap_16x2_avx2
Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_4tap_16x2_avx2
convolve_avx2.c:convolve_y_4tap_16x2_avx2
Line
Count
Source
1801
92.8k
                                             __m256i res[2]) {
1802
92.8k
  d[3] = _mm_loadu_si128((__m128i *)(data + 3 * stride));
1803
92.8k
  const __m256i src_23a = _mm256_setr_m128i(d[2], d[3]);
1804
92.8k
  d[2] = _mm_loadu_si128((__m128i *)(data + 4 * stride));
1805
92.8k
  const __m256i src_34a = _mm256_setr_m128i(d[3], d[2]);
1806
1807
92.8k
  s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);
1808
92.8k
  s[3] = _mm256_unpackhi_epi8(src_23a, src_34a);
1809
1810
92.8k
  res[0] = convolve_lowbd_4tap(s, coeffs);
1811
92.8k
  res[1] = convolve_lowbd_4tap(s + 2, coeffs);
1812
92.8k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_4tap_16x2_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_4tap_16x2_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_4tap_16x2_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_4tap_16x2_avx2
1813
1814
static inline void convolve_y_6tap_16x2_avx2(const uint8_t *const data,
1815
                                             const ptrdiff_t stride,
1816
                                             const __m256i coeffs[3],
1817
                                             __m128i d[6], __m256i s[6],
1818
1.24M
                                             __m256i res[2]) {
1819
1.24M
  d[5] = _mm_loadu_si128((__m128i *)(data + 5 * stride));
1820
1.24M
  const __m256i src_45a = _mm256_setr_m128i(d[4], d[5]);
1821
1.24M
  d[4] = _mm_loadu_si128((__m128i *)(data + 6 * stride));
1822
1.24M
  const __m256i src_56a = _mm256_setr_m128i(d[5], d[4]);
1823
1824
1.24M
  s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
1825
1.24M
  s[5] = _mm256_unpackhi_epi8(src_45a, src_56a);
1826
1827
1.24M
  res[0] = convolve_lowbd_6tap(s, coeffs);
1828
1.24M
  res[1] = convolve_lowbd_6tap(s + 3, coeffs);
1829
1.24M
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_6tap_16x2_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_6tap_16x2_avx2
Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_6tap_16x2_avx2
convolve_avx2.c:convolve_y_6tap_16x2_avx2
Line
Count
Source
1818
1.24M
                                             __m256i res[2]) {
1819
1.24M
  d[5] = _mm_loadu_si128((__m128i *)(data + 5 * stride));
1820
1.24M
  const __m256i src_45a = _mm256_setr_m128i(d[4], d[5]);
1821
1.24M
  d[4] = _mm_loadu_si128((__m128i *)(data + 6 * stride));
1822
1.24M
  const __m256i src_56a = _mm256_setr_m128i(d[5], d[4]);
1823
1824
1.24M
  s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
1825
1.24M
  s[5] = _mm256_unpackhi_epi8(src_45a, src_56a);
1826
1827
1.24M
  res[0] = convolve_lowbd_6tap(s, coeffs);
1828
1.24M
  res[1] = convolve_lowbd_6tap(s + 3, coeffs);
1829
1.24M
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_6tap_16x2_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_6tap_16x2_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_6tap_16x2_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_6tap_16x2_avx2
1830
1831
static inline void convolve_y_8tap_16x2_avx2(const uint8_t *const data,
1832
                                             const ptrdiff_t stride,
1833
                                             const __m256i coeffs[4],
1834
                                             __m128i d[8], __m256i s[8],
1835
79.4k
                                             __m256i res[2]) {
1836
79.4k
  d[7] = _mm_loadu_si128((__m128i *)(data + 7 * stride));
1837
79.4k
  const __m256i src_67a = _mm256_setr_m128i(d[6], d[7]);
1838
79.4k
  d[6] = _mm_loadu_si128((__m128i *)(data + 8 * stride));
1839
79.4k
  const __m256i src_78a = _mm256_setr_m128i(d[7], d[6]);
1840
1841
79.4k
  s[3] = _mm256_unpacklo_epi8(src_67a, src_78a);
1842
79.4k
  s[7] = _mm256_unpackhi_epi8(src_67a, src_78a);
1843
1844
79.4k
  res[0] = convolve_lowbd(s, coeffs);
1845
79.4k
  res[1] = convolve_lowbd(s + 4, coeffs);
1846
79.4k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_8tap_16x2_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_8tap_16x2_avx2
Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_8tap_16x2_avx2
convolve_avx2.c:convolve_y_8tap_16x2_avx2
Line
Count
Source
1835
79.4k
                                             __m256i res[2]) {
1836
79.4k
  d[7] = _mm_loadu_si128((__m128i *)(data + 7 * stride));
1837
79.4k
  const __m256i src_67a = _mm256_setr_m128i(d[6], d[7]);
1838
79.4k
  d[6] = _mm_loadu_si128((__m128i *)(data + 8 * stride));
1839
79.4k
  const __m256i src_78a = _mm256_setr_m128i(d[7], d[6]);
1840
1841
79.4k
  s[3] = _mm256_unpacklo_epi8(src_67a, src_78a);
1842
79.4k
  s[7] = _mm256_unpackhi_epi8(src_67a, src_78a);
1843
1844
79.4k
  res[0] = convolve_lowbd(s, coeffs);
1845
79.4k
  res[1] = convolve_lowbd(s + 4, coeffs);
1846
79.4k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_8tap_16x2_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_8tap_16x2_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_8tap_16x2_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_8tap_16x2_avx2
1847
1848
static inline void convolve_y_2tap_32x2_avx2(const uint8_t *const data,
1849
                                             const ptrdiff_t stride,
1850
                                             const __m256i *coeffs,
1851
39.6k
                                             __m256i d[2], __m256i res[4]) {
1852
39.6k
  d[1] = _mm256_loadu_si256((__m256i *)(data + 1 * stride));
1853
39.6k
  const __m256i s00 = _mm256_unpacklo_epi8(d[0], d[1]);
1854
39.6k
  const __m256i s01 = _mm256_unpackhi_epi8(d[0], d[1]);
1855
39.6k
  d[0] = _mm256_loadu_si256((__m256i *)(data + 2 * stride));
1856
39.6k
  const __m256i s10 = _mm256_unpacklo_epi8(d[1], d[0]);
1857
39.6k
  const __m256i s11 = _mm256_unpackhi_epi8(d[1], d[0]);
1858
1859
39.6k
  res[0] = _mm256_maddubs_epi16(s00, coeffs[0]);
1860
39.6k
  res[1] = _mm256_maddubs_epi16(s01, coeffs[0]);
1861
39.6k
  res[2] = _mm256_maddubs_epi16(s10, coeffs[0]);
1862
39.6k
  res[3] = _mm256_maddubs_epi16(s11, coeffs[0]);
1863
39.6k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_2tap_32x2_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_2tap_32x2_avx2
Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_2tap_32x2_avx2
convolve_avx2.c:convolve_y_2tap_32x2_avx2
Line
Count
Source
1851
39.6k
                                             __m256i d[2], __m256i res[4]) {
1852
39.6k
  d[1] = _mm256_loadu_si256((__m256i *)(data + 1 * stride));
1853
39.6k
  const __m256i s00 = _mm256_unpacklo_epi8(d[0], d[1]);
1854
39.6k
  const __m256i s01 = _mm256_unpackhi_epi8(d[0], d[1]);
1855
39.6k
  d[0] = _mm256_loadu_si256((__m256i *)(data + 2 * stride));
1856
39.6k
  const __m256i s10 = _mm256_unpacklo_epi8(d[1], d[0]);
1857
39.6k
  const __m256i s11 = _mm256_unpackhi_epi8(d[1], d[0]);
1858
1859
39.6k
  res[0] = _mm256_maddubs_epi16(s00, coeffs[0]);
1860
39.6k
  res[1] = _mm256_maddubs_epi16(s01, coeffs[0]);
1861
39.6k
  res[2] = _mm256_maddubs_epi16(s10, coeffs[0]);
1862
39.6k
  res[3] = _mm256_maddubs_epi16(s11, coeffs[0]);
1863
39.6k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_2tap_32x2_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_2tap_32x2_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_2tap_32x2_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_2tap_32x2_avx2
1864
1865
// One step of a 4-tap filter along the stride direction: loads two new
// 32-byte rows (data + 3 * stride and data + 4 * stride), interleaves each
// with the row before it, and writes four filtered vectors to res[0..3].
//
//   data    base pointer; only data + 3 * stride and data + 4 * stride are
//           read here (unaligned 256-bit loads).
//   stride  distance in bytes between consecutive rows.
//   coeffs  filter coefficients, forwarded unchanged to convolve_lowbd_4tap().
//   d       row cache. d[2] is read before it is written, so the caller must
//           have filled it (presumably with the row at data + 2 * stride --
//           confirm against the caller). On return d[3] holds the row at
//           data + 3 * stride and d[2] holds the row at data + 4 * stride.
//   s1, s2  interleaved-row state. Only elements [1] and [3] are written
//           here; elements [0] and [2] are expected to be set up by the
//           caller (assumes convolve_lowbd_4tap() reads s[0] and s[1] of the
//           pointer it is given -- TODO confirm).
//   res     output: res[0]/res[1] come from s1 (low/high interleave),
//           res[2]/res[3] come from s2 (low/high interleave).
static inline void convolve_y_4tap_32x2_avx2(const uint8_t *const data,
1866
                                             const ptrdiff_t stride,
1867
                                             const __m256i coeffs[2],
1868
                                             __m256i d[4], __m256i s1[4],
1869
141k
                                             __m256i s2[4], __m256i res[4]) {
1870
141k
  // Load the row at data + 3 * stride and pair it byte-by-byte with the
  // previously cached row d[2] (low and high halves of each 128-bit lane).
  d[3] = _mm256_loadu_si256((__m256i *)(data + 3 * stride));
1871
141k
  s1[1] = _mm256_unpacklo_epi8(d[2], d[3]);
1872
141k
  s1[3] = _mm256_unpackhi_epi8(d[2], d[3]);
1873
141k
  // Load the row at data + 4 * stride into d[2] (overwriting the old cached
  // row, which is no longer needed) and pair it with d[3] in the same way.
  d[2] = _mm256_loadu_si256((__m256i *)(data + 4 * stride));
1874
141k
  s2[1] = _mm256_unpacklo_epi8(d[3], d[2]);
1875
141k
  s2[3] = _mm256_unpackhi_epi8(d[3], d[2]);
1876
1877
141k
  // s1 / s1 + 2 select the low / high interleaves for the first result pair;
  // s2 / s2 + 2 do the same for the second result pair.
  res[0] = convolve_lowbd_4tap(s1, coeffs);
1878
141k
  res[1] = convolve_lowbd_4tap(s1 + 2, coeffs);
1879
141k
  res[2] = convolve_lowbd_4tap(s2, coeffs);
1880
141k
  res[3] = convolve_lowbd_4tap(s2 + 2, coeffs);
1881
141k
}
Unexecuted instantiation: aom_subpixel_8t_intrin_avx2.c:convolve_y_4tap_32x2_avx2
Unexecuted instantiation: highbd_convolve_avx2.c:convolve_y_4tap_32x2_avx2
Unexecuted instantiation: convolve_2d_avx2.c:convolve_y_4tap_32x2_avx2
convolve_avx2.c:convolve_y_4tap_32x2_avx2
Line
Count
Source
1869
141k
                                             __m256i s2[4], __m256i res[4]) {
1870
141k
  d[3] = _mm256_loadu_si256((__m256i *)(data + 3 * stride));
1871
141k
  s1[1] = _mm256_unpacklo_epi8(d[2], d[3]);
1872
141k
  s1[3] = _mm256_unpackhi_epi8(d[2], d[3]);
1873
141k
  d[2] = _mm256_loadu_si256((__m256i *)(data + 4 * stride));
1874
141k
  s2[1] = _mm256_unpacklo_epi8(d[3], d[2]);
1875
141k
  s2[3] = _mm256_unpackhi_epi8(d[3], d[2]);
1876
1877
141k
  res[0] = convolve_lowbd_4tap(s1, coeffs);
1878
141k
  res[1] = convolve_lowbd_4tap(s1 + 2, coeffs);
1879
141k
  res[2] = convolve_lowbd_4tap(s2, coeffs);
1880
141k
  res[3] = convolve_lowbd_4tap(s2 + 2, coeffs);
1881
141k
}
Unexecuted instantiation: jnt_convolve_avx2.c:convolve_y_4tap_32x2_avx2
Unexecuted instantiation: wiener_convolve_avx2.c:convolve_y_4tap_32x2_avx2
Unexecuted instantiation: highbd_convolve_2d_avx2.c:convolve_y_4tap_32x2_avx2
Unexecuted instantiation: highbd_jnt_convolve_avx2.c:convolve_y_4tap_32x2_avx2
1882
#endif  // AOM_AOM_DSP_X86_CONVOLVE_AVX2_H_