Coverage Report

Created: 2023-06-07 06:31

/src/aom/third_party/SVT-AV1/convolve_avx2.h
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved
3
 *
4
 * This source code is subject to the terms of the BSD 2 Clause License and
5
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6
 * was not distributed with this source code in the LICENSE file, you can
7
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8
 * Media Patent License 1.0 was not distributed with this source code in the
9
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10
 */
11
12
#ifndef THIRD_PARTY_SVT_AV1_CONVOLVE_AVX2_H_
13
#define THIRD_PARTY_SVT_AV1_CONVOLVE_AVX2_H_
14
15
#include "EbMemory_AVX2.h"
16
#include "EbMemory_SSE4_1.h"
17
#include "synonyms.h"
18
19
#include "aom_dsp/aom_filter.h"
20
#include "aom_dsp/x86/convolve_avx2.h"
21
#include "aom_dsp/x86/mem_sse2.h"
22
23
static INLINE void populate_coeffs_4tap_avx2(const __m128i coeffs_128,
24
432k
                                             __m256i coeffs[2]) {
25
432k
  const __m256i coeffs_256 = _mm256_broadcastsi128_si256(coeffs_128);
26
27
  // coeffs 2 3 2 3 2 3 2 3
28
432k
  coeffs[0] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0604u));
29
  // coeffs 4 5 4 5 4 5 4 5
30
432k
  coeffs[1] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0a08u));
31
432k
}
convolve_2d_avx2.c:populate_coeffs_4tap_avx2
Line
Count
Source
24
122k
                                             __m256i coeffs[2]) {
25
122k
  const __m256i coeffs_256 = _mm256_broadcastsi128_si256(coeffs_128);
26
27
  // coeffs 2 3 2 3 2 3 2 3
28
122k
  coeffs[0] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0604u));
29
  // coeffs 4 5 4 5 4 5 4 5
30
122k
  coeffs[1] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0a08u));
31
122k
}
convolve_avx2.c:populate_coeffs_4tap_avx2
Line
Count
Source
24
309k
                                             __m256i coeffs[2]) {
25
309k
  const __m256i coeffs_256 = _mm256_broadcastsi128_si256(coeffs_128);
26
27
  // coeffs 2 3 2 3 2 3 2 3
28
309k
  coeffs[0] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0604u));
29
  // coeffs 4 5 4 5 4 5 4 5
30
309k
  coeffs[1] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0a08u));
31
309k
}
32
33
static INLINE void populate_coeffs_6tap_avx2(const __m128i coeffs_128,
34
2.95M
                                             __m256i coeffs[3]) {
35
2.95M
  const __m256i coeffs_256 = _mm256_broadcastsi128_si256(coeffs_128);
36
37
  // coeffs 1 2 1 2 1 2 1 2
38
2.95M
  coeffs[0] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0402u));
39
  // coeffs 3 4 3 4 3 4 3 4
40
2.95M
  coeffs[1] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0806u));
41
  // coeffs 5 6 5 6 5 6 5 6
42
2.95M
  coeffs[2] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0C0Au));
43
2.95M
}
convolve_2d_avx2.c:populate_coeffs_6tap_avx2
Line
Count
Source
34
1.81M
                                             __m256i coeffs[3]) {
35
1.81M
  const __m256i coeffs_256 = _mm256_broadcastsi128_si256(coeffs_128);
36
37
  // coeffs 1 2 1 2 1 2 1 2
38
1.81M
  coeffs[0] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0402u));
39
  // coeffs 3 4 3 4 3 4 3 4
40
1.81M
  coeffs[1] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0806u));
41
  // coeffs 5 6 5 6 5 6 5 6
42
1.81M
  coeffs[2] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0C0Au));
43
1.81M
}
convolve_avx2.c:populate_coeffs_6tap_avx2
Line
Count
Source
34
1.13M
                                             __m256i coeffs[3]) {
35
1.13M
  const __m256i coeffs_256 = _mm256_broadcastsi128_si256(coeffs_128);
36
37
  // coeffs 1 2 1 2 1 2 1 2
38
1.13M
  coeffs[0] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0402u));
39
  // coeffs 3 4 3 4 3 4 3 4
40
1.13M
  coeffs[1] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0806u));
41
  // coeffs 5 6 5 6 5 6 5 6
42
1.13M
  coeffs[2] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0C0Au));
43
1.13M
}
44
45
static INLINE void populate_coeffs_8tap_avx2(const __m128i coeffs_128,
46
165k
                                             __m256i coeffs[4]) {
47
165k
  const __m256i coeffs_256 = _mm256_broadcastsi128_si256(coeffs_128);
48
49
  // coeffs 0 1 0 1 0 1 0 1
50
165k
  coeffs[0] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0200u));
51
  // coeffs 2 3 2 3 2 3 2 3
52
165k
  coeffs[1] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0604u));
53
  // coeffs 4 5 4 5 4 5 4 5
54
165k
  coeffs[2] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0a08u));
55
  // coeffs 6 7 6 7 6 7 6 7
56
165k
  coeffs[3] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0e0cu));
57
165k
}
convolve_2d_avx2.c:populate_coeffs_8tap_avx2
Line
Count
Source
46
94.7k
                                             __m256i coeffs[4]) {
47
94.7k
  const __m256i coeffs_256 = _mm256_broadcastsi128_si256(coeffs_128);
48
49
  // coeffs 0 1 0 1 0 1 0 1
50
94.7k
  coeffs[0] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0200u));
51
  // coeffs 2 3 2 3 2 3 2 3
52
94.7k
  coeffs[1] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0604u));
53
  // coeffs 4 5 4 5 4 5 4 5
54
94.7k
  coeffs[2] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0a08u));
55
  // coeffs 6 7 6 7 6 7 6 7
56
94.7k
  coeffs[3] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0e0cu));
57
94.7k
}
convolve_avx2.c:populate_coeffs_8tap_avx2
Line
Count
Source
46
70.5k
                                             __m256i coeffs[4]) {
47
70.5k
  const __m256i coeffs_256 = _mm256_broadcastsi128_si256(coeffs_128);
48
49
  // coeffs 0 1 0 1 0 1 0 1
50
70.5k
  coeffs[0] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0200u));
51
  // coeffs 2 3 2 3 2 3 2 3
52
70.5k
  coeffs[1] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0604u));
53
  // coeffs 4 5 4 5 4 5 4 5
54
70.5k
  coeffs[2] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0a08u));
55
  // coeffs 6 7 6 7 6 7 6 7
56
70.5k
  coeffs[3] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0e0cu));
57
70.5k
}
58
59
static INLINE void prepare_half_coeffs_2tap_ssse3(
60
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
61
103k
    __m128i *const coeffs /* [1] */) {
62
103k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
63
103k
      filter_params, subpel_q4 & SUBPEL_MASK);
64
103k
  const __m128i coeffs_8 = _mm_cvtsi32_si128(*(const int32_t *)(filter + 3));
65
66
  // right shift all filter co-efficients by 1 to reduce the bits required.
67
  // This extra right shift will be taken care of at the end while rounding
68
  // the result.
69
  // Since all filter co-efficients are even, this change will not affect the
70
  // end result
71
103k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
72
103k
                            _mm_set1_epi16((short)0xffff)));
73
74
0
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
75
76
  // coeffs 3 4 3 4 3 4 3 4
77
103k
  *coeffs = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0200u));
78
103k
}
convolve_2d_avx2.c:prepare_half_coeffs_2tap_ssse3
Line
Count
Source
61
70.1k
    __m128i *const coeffs /* [1] */) {
62
70.1k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
63
70.1k
      filter_params, subpel_q4 & SUBPEL_MASK);
64
70.1k
  const __m128i coeffs_8 = _mm_cvtsi32_si128(*(const int32_t *)(filter + 3));
65
66
  // right shift all filter co-efficients by 1 to reduce the bits required.
67
  // This extra right shift will be taken care of at the end while rounding
68
  // the result.
69
  // Since all filter co-efficients are even, this change will not affect the
70
  // end result
71
70.1k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
72
70.1k
                            _mm_set1_epi16((short)0xffff)));
73
74
0
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
75
76
  // coeffs 3 4 3 4 3 4 3 4
77
70.1k
  *coeffs = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0200u));
78
70.1k
}
convolve_avx2.c:prepare_half_coeffs_2tap_ssse3
Line
Count
Source
61
33.5k
    __m128i *const coeffs /* [1] */) {
62
33.5k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
63
33.5k
      filter_params, subpel_q4 & SUBPEL_MASK);
64
33.5k
  const __m128i coeffs_8 = _mm_cvtsi32_si128(*(const int32_t *)(filter + 3));
65
66
  // right shift all filter co-efficients by 1 to reduce the bits required.
67
  // This extra right shift will be taken care of at the end while rounding
68
  // the result.
69
  // Since all filter co-efficients are even, this change will not affect the
70
  // end result
71
33.5k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
72
33.5k
                            _mm_set1_epi16((short)0xffff)));
73
74
0
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
75
76
  // coeffs 3 4 3 4 3 4 3 4
77
33.5k
  *coeffs = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0200u));
78
33.5k
}
79
80
static INLINE void prepare_half_coeffs_4tap_ssse3(
81
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
82
2.79M
    __m128i *const coeffs /* [2] */) {
83
2.79M
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
84
2.79M
      filter_params, subpel_q4 & SUBPEL_MASK);
85
2.79M
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
86
87
  // right shift all filter co-efficients by 1 to reduce the bits required.
88
  // This extra right shift will be taken care of at the end while rounding
89
  // the result.
90
  // Since all filter co-efficients are even, this change will not affect the
91
  // end result
92
2.79M
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
93
2.79M
                            _mm_set1_epi16((short)0xffff)));
94
95
0
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
96
97
  // coeffs 2 3 2 3 2 3 2 3
98
2.79M
  coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0604u));
99
  // coeffs 4 5 4 5 4 5 4 5
100
2.79M
  coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0a08u));
101
2.79M
}
convolve_2d_avx2.c:prepare_half_coeffs_4tap_ssse3
Line
Count
Source
82
1.75M
    __m128i *const coeffs /* [2] */) {
83
1.75M
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
84
1.75M
      filter_params, subpel_q4 & SUBPEL_MASK);
85
1.75M
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
86
87
  // right shift all filter co-efficients by 1 to reduce the bits required.
88
  // This extra right shift will be taken care of at the end while rounding
89
  // the result.
90
  // Since all filter co-efficients are even, this change will not affect the
91
  // end result
92
1.75M
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
93
1.75M
                            _mm_set1_epi16((short)0xffff)));
94
95
0
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
96
97
  // coeffs 2 3 2 3 2 3 2 3
98
1.75M
  coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0604u));
99
  // coeffs 4 5 4 5 4 5 4 5
100
1.75M
  coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0a08u));
101
1.75M
}
convolve_avx2.c:prepare_half_coeffs_4tap_ssse3
Line
Count
Source
82
1.03M
    __m128i *const coeffs /* [2] */) {
83
1.03M
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
84
1.03M
      filter_params, subpel_q4 & SUBPEL_MASK);
85
1.03M
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
86
87
  // right shift all filter co-efficients by 1 to reduce the bits required.
88
  // This extra right shift will be taken care of at the end while rounding
89
  // the result.
90
  // Since all filter co-efficients are even, this change will not affect the
91
  // end result
92
1.03M
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
93
1.03M
                            _mm_set1_epi16((short)0xffff)));
94
95
0
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
96
97
  // coeffs 2 3 2 3 2 3 2 3
98
1.03M
  coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0604u));
99
  // coeffs 4 5 4 5 4 5 4 5
100
1.03M
  coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0a08u));
101
1.03M
}
102
103
static INLINE void prepare_half_coeffs_6tap_ssse3(
104
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
105
227k
    __m128i *const coeffs /* [3] */) {
106
227k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
107
227k
      filter_params, subpel_q4 & SUBPEL_MASK);
108
227k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
109
110
  // right shift all filter co-efficients by 1 to reduce the bits required.
111
  // This extra right shift will be taken care of at the end while rounding
112
  // the result.
113
  // Since all filter co-efficients are even, this change will not affect the
114
  // end result
115
227k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
116
227k
                            _mm_set1_epi16((short)0xffff)));
117
118
0
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
119
120
  // coeffs 1 2 1 2 1 2 1 2
121
227k
  coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0402u));
122
  // coeffs 3 4 3 4 3 4 3 4
123
227k
  coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0806u));
124
  // coeffs 5 6 5 6 5 6 5 6
125
227k
  coeffs[2] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0C0Au));
126
227k
}
Unexecuted instantiation: convolve_2d_avx2.c:prepare_half_coeffs_6tap_ssse3
convolve_avx2.c:prepare_half_coeffs_6tap_ssse3
Line
Count
Source
105
227k
    __m128i *const coeffs /* [3] */) {
106
227k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
107
227k
      filter_params, subpel_q4 & SUBPEL_MASK);
108
227k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
109
110
  // right shift all filter co-efficients by 1 to reduce the bits required.
111
  // This extra right shift will be taken care of at the end while rounding
112
  // the result.
113
  // Since all filter co-efficients are even, this change will not affect the
114
  // end result
115
227k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
116
227k
                            _mm_set1_epi16((short)0xffff)));
117
118
0
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
119
120
  // coeffs 1 2 1 2 1 2 1 2
121
227k
  coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0402u));
122
  // coeffs 3 4 3 4 3 4 3 4
123
227k
  coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0806u));
124
  // coeffs 5 6 5 6 5 6 5 6
125
227k
  coeffs[2] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0C0Au));
126
227k
}
127
128
static INLINE void prepare_half_coeffs_8tap_ssse3(
129
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
130
11.8k
    __m128i *const coeffs /* [4] */) {
131
11.8k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
132
11.8k
      filter_params, subpel_q4 & SUBPEL_MASK);
133
11.8k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
134
135
  // right shift all filter co-efficients by 1 to reduce the bits required.
136
  // This extra right shift will be taken care of at the end while rounding
137
  // the result.
138
  // Since all filter co-efficients are even, this change will not affect the
139
  // end result
140
11.8k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
141
11.8k
                            _mm_set1_epi16((short)0xffff)));
142
143
0
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
144
145
  // coeffs 0 1 0 1 0 1 0 1
146
11.8k
  coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0200u));
147
  // coeffs 2 3 2 3 2 3 2 3
148
11.8k
  coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0604u));
149
  // coeffs 4 5 4 5 4 5 4 5
150
11.8k
  coeffs[2] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0a08u));
151
  // coeffs 6 7 6 7 6 7 6 7
152
11.8k
  coeffs[3] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0e0cu));
153
11.8k
}
Unexecuted instantiation: convolve_2d_avx2.c:prepare_half_coeffs_8tap_ssse3
convolve_avx2.c:prepare_half_coeffs_8tap_ssse3
Line
Count
Source
130
11.8k
    __m128i *const coeffs /* [4] */) {
131
11.8k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
132
11.8k
      filter_params, subpel_q4 & SUBPEL_MASK);
133
11.8k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
134
135
  // right shift all filter co-efficients by 1 to reduce the bits required.
136
  // This extra right shift will be taken care of at the end while rounding
137
  // the result.
138
  // Since all filter co-efficients are even, this change will not affect the
139
  // end result
140
11.8k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
141
11.8k
                            _mm_set1_epi16((short)0xffff)));
142
143
0
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
144
145
  // coeffs 0 1 0 1 0 1 0 1
146
11.8k
  coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0200u));
147
  // coeffs 2 3 2 3 2 3 2 3
148
11.8k
  coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0604u));
149
  // coeffs 4 5 4 5 4 5 4 5
150
11.8k
  coeffs[2] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0a08u));
151
  // coeffs 6 7 6 7 6 7 6 7
152
11.8k
  coeffs[3] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0e0cu));
153
11.8k
}
154
155
static INLINE void prepare_half_coeffs_2tap_avx2(
156
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
157
24.3k
    __m256i *const coeffs /* [1] */) {
158
24.3k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
159
24.3k
      filter_params, subpel_q4 & SUBPEL_MASK);
160
24.3k
  const __m128i coeffs_8 = _mm_cvtsi32_si128(loadu_int32(filter + 3));
161
24.3k
  const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);
162
163
  // right shift all filter co-efficients by 1 to reduce the bits required.
164
  // This extra right shift will be taken care of at the end while rounding
165
  // the result.
166
  // Since all filter co-efficients are even, this change will not affect the
167
  // end result
168
24.3k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
169
24.3k
                            _mm_set1_epi16((short)0xffff)));
170
171
0
  const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);
172
173
  // coeffs 3 4 3 4 3 4 3 4
174
24.3k
  *coeffs = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0200u));
175
24.3k
}
convolve_2d_avx2.c:prepare_half_coeffs_2tap_avx2
Line
Count
Source
157
15.4k
    __m256i *const coeffs /* [1] */) {
158
15.4k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
159
15.4k
      filter_params, subpel_q4 & SUBPEL_MASK);
160
15.4k
  const __m128i coeffs_8 = _mm_cvtsi32_si128(loadu_int32(filter + 3));
161
15.4k
  const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);
162
163
  // right shift all filter co-efficients by 1 to reduce the bits required.
164
  // This extra right shift will be taken care of at the end while rounding
165
  // the result.
166
  // Since all filter co-efficients are even, this change will not affect the
167
  // end result
168
15.4k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
169
15.4k
                            _mm_set1_epi16((short)0xffff)));
170
171
0
  const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);
172
173
  // coeffs 3 4 3 4 3 4 3 4
174
15.4k
  *coeffs = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0200u));
175
15.4k
}
convolve_avx2.c:prepare_half_coeffs_2tap_avx2
Line
Count
Source
157
8.82k
    __m256i *const coeffs /* [1] */) {
158
8.82k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
159
8.82k
      filter_params, subpel_q4 & SUBPEL_MASK);
160
8.82k
  const __m128i coeffs_8 = _mm_cvtsi32_si128(loadu_int32(filter + 3));
161
8.82k
  const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);
162
163
  // right shift all filter co-efficients by 1 to reduce the bits required.
164
  // This extra right shift will be taken care of at the end while rounding
165
  // the result.
166
  // Since all filter co-efficients are even, this change will not affect the
167
  // end result
168
8.82k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
169
8.82k
                            _mm_set1_epi16((short)0xffff)));
170
171
0
  const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);
172
173
  // coeffs 3 4 3 4 3 4 3 4
174
8.82k
  *coeffs = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0200u));
175
8.82k
}
176
177
static INLINE void prepare_half_coeffs_4tap_avx2(
178
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
179
432k
    __m256i *const coeffs /* [2] */) {
180
432k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
181
432k
      filter_params, subpel_q4 & SUBPEL_MASK);
182
432k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
183
184
  // right shift all filter co-efficients by 1 to reduce the bits required.
185
  // This extra right shift will be taken care of at the end while rounding
186
  // the result.
187
  // Since all filter co-efficients are even, this change will not affect the
188
  // end result
189
432k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
190
432k
                            _mm_set1_epi16((short)0xffff)));
191
0
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
192
432k
  populate_coeffs_4tap_avx2(coeffs_1, coeffs);
193
432k
}
convolve_2d_avx2.c:prepare_half_coeffs_4tap_avx2
Line
Count
Source
179
122k
    __m256i *const coeffs /* [2] */) {
180
122k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
181
122k
      filter_params, subpel_q4 & SUBPEL_MASK);
182
122k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
183
184
  // right shift all filter co-efficients by 1 to reduce the bits required.
185
  // This extra right shift will be taken care of at the end while rounding
186
  // the result.
187
  // Since all filter co-efficients are even, this change will not affect the
188
  // end result
189
122k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
190
122k
                            _mm_set1_epi16((short)0xffff)));
191
0
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
192
122k
  populate_coeffs_4tap_avx2(coeffs_1, coeffs);
193
122k
}
convolve_avx2.c:prepare_half_coeffs_4tap_avx2
Line
Count
Source
179
309k
    __m256i *const coeffs /* [2] */) {
180
309k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
181
309k
      filter_params, subpel_q4 & SUBPEL_MASK);
182
309k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
183
184
  // right shift all filter co-efficients by 1 to reduce the bits required.
185
  // This extra right shift will be taken care of at the end while rounding
186
  // the result.
187
  // Since all filter co-efficients are even, this change will not affect the
188
  // end result
189
309k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
190
309k
                            _mm_set1_epi16((short)0xffff)));
191
0
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
192
309k
  populate_coeffs_4tap_avx2(coeffs_1, coeffs);
193
309k
}
194
195
static INLINE void prepare_half_coeffs_6tap_avx2(
196
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
197
2.95M
    __m256i *const coeffs /* [3] */) {
198
2.95M
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
199
2.95M
      filter_params, subpel_q4 & SUBPEL_MASK);
200
2.95M
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
201
202
  // right shift all filter co-efficients by 1 to reduce the bits required.
203
  // This extra right shift will be taken care of at the end while rounding
204
  // the result.
205
  // Since all filter co-efficients are even, this change will not affect the
206
  // end result
207
2.95M
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
208
2.95M
                            _mm_set1_epi16((short)0xffff)));
209
0
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
210
2.95M
  populate_coeffs_6tap_avx2(coeffs_1, coeffs);
211
2.95M
}
convolve_2d_avx2.c:prepare_half_coeffs_6tap_avx2
Line
Count
Source
197
1.81M
    __m256i *const coeffs /* [3] */) {
198
1.81M
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
199
1.81M
      filter_params, subpel_q4 & SUBPEL_MASK);
200
1.81M
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
201
202
  // right shift all filter co-efficients by 1 to reduce the bits required.
203
  // This extra right shift will be taken care of at the end while rounding
204
  // the result.
205
  // Since all filter co-efficients are even, this change will not affect the
206
  // end result
207
1.81M
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
208
1.81M
                            _mm_set1_epi16((short)0xffff)));
209
0
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
210
1.81M
  populate_coeffs_6tap_avx2(coeffs_1, coeffs);
211
1.81M
}
convolve_avx2.c:prepare_half_coeffs_6tap_avx2
Line
Count
Source
197
1.13M
    __m256i *const coeffs /* [3] */) {
198
1.13M
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
199
1.13M
      filter_params, subpel_q4 & SUBPEL_MASK);
200
1.13M
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
201
202
  // right shift all filter co-efficients by 1 to reduce the bits required.
203
  // This extra right shift will be taken care of at the end while rounding
204
  // the result.
205
  // Since all filter co-efficients are even, this change will not affect the
206
  // end result
207
1.13M
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
208
1.13M
                            _mm_set1_epi16((short)0xffff)));
209
0
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
210
1.13M
  populate_coeffs_6tap_avx2(coeffs_1, coeffs);
211
1.13M
}
212
213
static INLINE void prepare_half_coeffs_8tap_avx2(
214
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
215
165k
    __m256i *const coeffs /* [4] */) {
216
165k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
217
165k
      filter_params, subpel_q4 & SUBPEL_MASK);
218
165k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
219
220
  // right shift all filter co-efficients by 1 to reduce the bits required.
221
  // This extra right shift will be taken care of at the end while rounding
222
  // the result.
223
  // Since all filter co-efficients are even, this change will not affect the
224
  // end result
225
165k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
226
165k
                            _mm_set1_epi16((short)0xffff)));
227
0
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
228
165k
  populate_coeffs_8tap_avx2(coeffs_1, coeffs);
229
165k
}
convolve_2d_avx2.c:prepare_half_coeffs_8tap_avx2
Line
Count
Source
215
94.7k
    __m256i *const coeffs /* [4] */) {
216
94.7k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
217
94.7k
      filter_params, subpel_q4 & SUBPEL_MASK);
218
94.7k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
219
220
  // right shift all filter co-efficients by 1 to reduce the bits required.
221
  // This extra right shift will be taken care of at the end while rounding
222
  // the result.
223
  // Since all filter co-efficients are even, this change will not affect the
224
  // end result
225
94.7k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
226
94.7k
                            _mm_set1_epi16((short)0xffff)));
227
0
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
228
94.7k
  populate_coeffs_8tap_avx2(coeffs_1, coeffs);
229
94.7k
}
convolve_avx2.c:prepare_half_coeffs_8tap_avx2
Line
Count
Source
215
70.5k
    __m256i *const coeffs /* [4] */) {
216
70.5k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
217
70.5k
      filter_params, subpel_q4 & SUBPEL_MASK);
218
70.5k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
219
220
  // right shift all filter co-efficients by 1 to reduce the bits required.
221
  // This extra right shift will be taken care of at the end while rounding
222
  // the result.
223
  // Since all filter co-efficients are even, this change will not affect the
224
  // end result
225
70.5k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
226
70.5k
                            _mm_set1_epi16((short)0xffff)));
227
0
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
228
70.5k
  populate_coeffs_8tap_avx2(coeffs_1, coeffs);
229
70.5k
}
230
231
static INLINE void prepare_coeffs_2tap_sse2(
232
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
233
36.2k
    __m128i *const coeffs /* [1] */) {
234
36.2k
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
235
36.2k
      filter_params, subpel_q4 & SUBPEL_MASK);
236
237
36.2k
  const __m128i coeff = _mm_cvtsi32_si128(*(const int32_t *)(filter + 3));
238
239
  // coeffs 3 4 3 4 3 4 3 4
240
36.2k
  coeffs[0] = _mm_shuffle_epi32(coeff, 0x00);
241
36.2k
}
convolve_2d_avx2.c:prepare_coeffs_2tap_sse2
Line
Count
Source
233
36.2k
    __m128i *const coeffs /* [1] */) {
234
36.2k
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
235
36.2k
      filter_params, subpel_q4 & SUBPEL_MASK);
236
237
36.2k
  const __m128i coeff = _mm_cvtsi32_si128(*(const int32_t *)(filter + 3));
238
239
  // coeffs 3 4 3 4 3 4 3 4
240
36.2k
  coeffs[0] = _mm_shuffle_epi32(coeff, 0x00);
241
36.2k
}
Unexecuted instantiation: convolve_avx2.c:prepare_coeffs_2tap_sse2
242
243
static INLINE void prepare_coeffs_4tap_sse2(
244
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
245
251k
    __m128i *const coeffs /* [2] */) {
246
251k
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
247
251k
      filter_params, subpel_q4 & SUBPEL_MASK);
248
249
251k
  const __m128i coeff = _mm_loadu_si128((__m128i *)filter);
250
251
  // coeffs 2 3 2 3 2 3 2 3
252
251k
  coeffs[0] = _mm_shuffle_epi32(coeff, 0x55);
253
  // coeffs 4 5 4 5 4 5 4 5
254
251k
  coeffs[1] = _mm_shuffle_epi32(coeff, 0xaa);
255
251k
}
convolve_2d_avx2.c:prepare_coeffs_4tap_sse2
Line
Count
Source
245
251k
    __m128i *const coeffs /* [2] */) {
246
251k
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
247
251k
      filter_params, subpel_q4 & SUBPEL_MASK);
248
249
251k
  const __m128i coeff = _mm_loadu_si128((__m128i *)filter);
250
251
  // coeffs 2 3 2 3 2 3 2 3
252
251k
  coeffs[0] = _mm_shuffle_epi32(coeff, 0x55);
253
  // coeffs 4 5 4 5 4 5 4 5
254
251k
  coeffs[1] = _mm_shuffle_epi32(coeff, 0xaa);
255
251k
}
Unexecuted instantiation: convolve_avx2.c:prepare_coeffs_4tap_sse2
256
257
static INLINE void prepare_coeffs_6tap_ssse3(
258
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
259
128k
    __m128i *const coeffs /* [3] */) {
260
128k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
261
128k
      filter_params, subpel_q4 & SUBPEL_MASK);
262
128k
  const __m128i coeff = _mm_loadu_si128((__m128i *)filter);
263
264
  // coeffs 1 2 1 2 1 2 1 2
265
128k
  coeffs[0] = _mm_shuffle_epi8(coeff, _mm_set1_epi32(0x05040302u));
266
  // coeffs 3 4 3 4 3 4 3 4
267
128k
  coeffs[1] = _mm_shuffle_epi8(coeff, _mm_set1_epi32(0x09080706u));
268
  // coeffs 5 6 5 6 5 6 5 6
269
128k
  coeffs[2] = _mm_shuffle_epi8(coeff, _mm_set1_epi32(0x0D0C0B0Au));
270
128k
}
convolve_2d_avx2.c:prepare_coeffs_6tap_ssse3
Line
Count
Source
259
128k
    __m128i *const coeffs /* [3] */) {
260
128k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
261
128k
      filter_params, subpel_q4 & SUBPEL_MASK);
262
128k
  const __m128i coeff = _mm_loadu_si128((__m128i *)filter);
263
264
  // coeffs 1 2 1 2 1 2 1 2
265
128k
  coeffs[0] = _mm_shuffle_epi8(coeff, _mm_set1_epi32(0x05040302u));
266
  // coeffs 3 4 3 4 3 4 3 4
267
128k
  coeffs[1] = _mm_shuffle_epi8(coeff, _mm_set1_epi32(0x09080706u));
268
  // coeffs 5 6 5 6 5 6 5 6
269
128k
  coeffs[2] = _mm_shuffle_epi8(coeff, _mm_set1_epi32(0x0D0C0B0Au));
270
128k
}
Unexecuted instantiation: convolve_avx2.c:prepare_coeffs_6tap_ssse3
271
272
static INLINE void prepare_coeffs_8tap_sse2(
273
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
274
6.97k
    __m128i *const coeffs /* [4] */) {
275
6.97k
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
276
6.97k
      filter_params, subpel_q4 & SUBPEL_MASK);
277
278
6.97k
  const __m128i coeff = _mm_loadu_si128((__m128i *)filter);
279
280
  // coeffs 0 1 0 1 0 1 0 1
281
6.97k
  coeffs[0] = _mm_shuffle_epi32(coeff, 0x00);
282
  // coeffs 2 3 2 3 2 3 2 3
283
6.97k
  coeffs[1] = _mm_shuffle_epi32(coeff, 0x55);
284
  // coeffs 4 5 4 5 4 5 4 5
285
6.97k
  coeffs[2] = _mm_shuffle_epi32(coeff, 0xaa);
286
  // coeffs 6 7 6 7 6 7 6 7
287
6.97k
  coeffs[3] = _mm_shuffle_epi32(coeff, 0xff);
288
6.97k
}
convolve_2d_avx2.c:prepare_coeffs_8tap_sse2
Line
Count
Source
274
6.97k
    __m128i *const coeffs /* [4] */) {
275
6.97k
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
276
6.97k
      filter_params, subpel_q4 & SUBPEL_MASK);
277
278
6.97k
  const __m128i coeff = _mm_loadu_si128((__m128i *)filter);
279
280
  // coeffs 0 1 0 1 0 1 0 1
281
6.97k
  coeffs[0] = _mm_shuffle_epi32(coeff, 0x00);
282
  // coeffs 2 3 2 3 2 3 2 3
283
6.97k
  coeffs[1] = _mm_shuffle_epi32(coeff, 0x55);
284
  // coeffs 4 5 4 5 4 5 4 5
285
6.97k
  coeffs[2] = _mm_shuffle_epi32(coeff, 0xaa);
286
  // coeffs 6 7 6 7 6 7 6 7
287
6.97k
  coeffs[3] = _mm_shuffle_epi32(coeff, 0xff);
288
6.97k
}
Unexecuted instantiation: convolve_avx2.c:prepare_coeffs_8tap_sse2
289
290
static INLINE void prepare_coeffs_2tap_avx2(
291
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
292
33.4k
    __m256i *const coeffs /* [1] */) {
293
33.4k
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
294
33.4k
      filter_params, subpel_q4 & SUBPEL_MASK);
295
296
33.4k
  const __m128i coeff_8 = _mm_cvtsi32_si128(*(const int32_t *)(filter + 3));
297
33.4k
  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
298
299
  // coeffs 3 4 3 4 3 4 3 4
300
33.4k
  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
301
33.4k
}
convolve_2d_avx2.c:prepare_coeffs_2tap_avx2
Line
Count
Source
292
33.4k
    __m256i *const coeffs /* [1] */) {
293
33.4k
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
294
33.4k
      filter_params, subpel_q4 & SUBPEL_MASK);
295
296
33.4k
  const __m128i coeff_8 = _mm_cvtsi32_si128(*(const int32_t *)(filter + 3));
297
33.4k
  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
298
299
  // coeffs 3 4 3 4 3 4 3 4
300
33.4k
  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
301
33.4k
}
Unexecuted instantiation: convolve_avx2.c:prepare_coeffs_2tap_avx2
302
303
static INLINE void prepare_coeffs_4tap_avx2(
304
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
305
1.68M
    __m256i *const coeffs /* [2] */) {
306
1.68M
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
307
1.68M
      filter_params, subpel_q4 & SUBPEL_MASK);
308
309
1.68M
  const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter);
310
1.68M
  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
311
312
  // coeffs 2 3 2 3 2 3 2 3
313
1.68M
  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x55);
314
  // coeffs 4 5 4 5 4 5 4 5
315
1.68M
  coeffs[1] = _mm256_shuffle_epi32(coeff, 0xaa);
316
1.68M
}
convolve_2d_avx2.c:prepare_coeffs_4tap_avx2
Line
Count
Source
305
1.68M
    __m256i *const coeffs /* [2] */) {
306
1.68M
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
307
1.68M
      filter_params, subpel_q4 & SUBPEL_MASK);
308
309
1.68M
  const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter);
310
1.68M
  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
311
312
  // coeffs 2 3 2 3 2 3 2 3
313
1.68M
  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x55);
314
  // coeffs 4 5 4 5 4 5 4 5
315
1.68M
  coeffs[1] = _mm256_shuffle_epi32(coeff, 0xaa);
316
1.68M
}
Unexecuted instantiation: convolve_avx2.c:prepare_coeffs_4tap_avx2
317
318
static INLINE void prepare_coeffs_6tap_avx2(
319
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
320
1.63M
    __m256i *const coeffs /* [3]*/) {
321
1.63M
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
322
1.63M
      filter_params, subpel_q4 & SUBPEL_MASK);
323
1.63M
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
324
1.63M
  const __m256i coeff = _mm256_broadcastsi128_si256(coeffs_8);
325
326
  // coeffs 1 2 1 2 1 2 1 2
327
1.63M
  coeffs[0] = _mm256_shuffle_epi8(coeff, _mm256_set1_epi32(0x05040302u));
328
  // coeffs 3 4 3 4 3 4 3 4
329
1.63M
  coeffs[1] = _mm256_shuffle_epi8(coeff, _mm256_set1_epi32(0x09080706u));
330
  // coeffs 5 6 5 6 5 6 5 6
331
1.63M
  coeffs[2] = _mm256_shuffle_epi8(coeff, _mm256_set1_epi32(0x0D0C0B0Au));
332
1.63M
}
convolve_2d_avx2.c:prepare_coeffs_6tap_avx2
Line
Count
Source
320
1.63M
    __m256i *const coeffs /* [3]*/) {
321
1.63M
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
322
1.63M
      filter_params, subpel_q4 & SUBPEL_MASK);
323
1.63M
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
324
1.63M
  const __m256i coeff = _mm256_broadcastsi128_si256(coeffs_8);
325
326
  // coeffs 1 2 1 2 1 2 1 2
327
1.63M
  coeffs[0] = _mm256_shuffle_epi8(coeff, _mm256_set1_epi32(0x05040302u));
328
  // coeffs 3 4 3 4 3 4 3 4
329
1.63M
  coeffs[1] = _mm256_shuffle_epi8(coeff, _mm256_set1_epi32(0x09080706u));
330
  // coeffs 5 6 5 6 5 6 5 6
331
1.63M
  coeffs[2] = _mm256_shuffle_epi8(coeff, _mm256_set1_epi32(0x0D0C0B0Au));
332
1.63M
}
Unexecuted instantiation: convolve_avx2.c:prepare_coeffs_6tap_avx2
333
334
static INLINE void prepare_coeffs_8tap_avx2(
335
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
336
78.7k
    __m256i *const coeffs /* [4] */) {
337
78.7k
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
338
78.7k
      filter_params, subpel_q4 & SUBPEL_MASK);
339
340
78.7k
  const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter);
341
78.7k
  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
342
343
  // coeffs 0 1 0 1 0 1 0 1
344
78.7k
  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
345
  // coeffs 2 3 2 3 2 3 2 3
346
78.7k
  coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55);
347
  // coeffs 4 5 4 5 4 5 4 5
348
78.7k
  coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa);
349
  // coeffs 6 7 6 7 6 7 6 7
350
78.7k
  coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff);
351
78.7k
}
convolve_2d_avx2.c:prepare_coeffs_8tap_avx2
Line
Count
Source
336
78.7k
    __m256i *const coeffs /* [4] */) {
337
78.7k
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
338
78.7k
      filter_params, subpel_q4 & SUBPEL_MASK);
339
340
78.7k
  const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter);
341
78.7k
  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
342
343
  // coeffs 0 1 0 1 0 1 0 1
344
78.7k
  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
345
  // coeffs 2 3 2 3 2 3 2 3
346
78.7k
  coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55);
347
  // coeffs 4 5 4 5 4 5 4 5
348
78.7k
  coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa);
349
  // coeffs 6 7 6 7 6 7 6 7
350
78.7k
  coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff);
351
78.7k
}
Unexecuted instantiation: convolve_avx2.c:prepare_coeffs_8tap_avx2
352
353
static INLINE void load_16bit_5rows_avx2(const int16_t *const src,
354
                                         const ptrdiff_t stride,
355
0
                                         __m256i dst[5]) {
356
0
  dst[0] = _mm256_loadu_si256((__m256i *)(src + 0 * stride));
357
0
  dst[1] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
358
0
  dst[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
359
0
  dst[3] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
360
0
  dst[4] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
361
0
}
Unexecuted instantiation: convolve_2d_avx2.c:load_16bit_5rows_avx2
Unexecuted instantiation: convolve_avx2.c:load_16bit_5rows_avx2
362
363
static INLINE void load_16bit_7rows_avx2(const int16_t *const src,
364
                                         const ptrdiff_t stride,
365
64.1k
                                         __m256i dst[7]) {
366
64.1k
  dst[0] = _mm256_loadu_si256((__m256i *)(src + 0 * stride));
367
64.1k
  dst[1] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
368
64.1k
  dst[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
369
64.1k
  dst[3] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
370
64.1k
  dst[4] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
371
64.1k
  dst[5] = _mm256_loadu_si256((__m256i *)(src + 5 * stride));
372
64.1k
  dst[6] = _mm256_loadu_si256((__m256i *)(src + 6 * stride));
373
64.1k
}
convolve_2d_avx2.c:load_16bit_7rows_avx2
Line
Count
Source
365
64.1k
                                         __m256i dst[7]) {
366
64.1k
  dst[0] = _mm256_loadu_si256((__m256i *)(src + 0 * stride));
367
64.1k
  dst[1] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
368
64.1k
  dst[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
369
64.1k
  dst[3] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
370
64.1k
  dst[4] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
371
64.1k
  dst[5] = _mm256_loadu_si256((__m256i *)(src + 5 * stride));
372
64.1k
  dst[6] = _mm256_loadu_si256((__m256i *)(src + 6 * stride));
373
64.1k
}
Unexecuted instantiation: convolve_avx2.c:load_16bit_7rows_avx2
374
375
static AOM_FORCE_INLINE void load_16bit_8rows_avx2(const int16_t *const src,
376
                                                   const ptrdiff_t stride,
377
392
                                                   __m256i dst[8]) {
378
392
  dst[0] = _mm256_loadu_si256((__m256i *)(src + 0 * stride));
379
392
  dst[1] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
380
392
  dst[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
381
392
  dst[3] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
382
392
  dst[4] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
383
392
  dst[5] = _mm256_loadu_si256((__m256i *)(src + 5 * stride));
384
392
  dst[6] = _mm256_loadu_si256((__m256i *)(src + 6 * stride));
385
392
  dst[7] = _mm256_loadu_si256((__m256i *)(src + 7 * stride));
386
392
}
convolve_2d_avx2.c:load_16bit_8rows_avx2
Line
Count
Source
377
392
                                                   __m256i dst[8]) {
378
392
  dst[0] = _mm256_loadu_si256((__m256i *)(src + 0 * stride));
379
392
  dst[1] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
380
392
  dst[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
381
392
  dst[3] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
382
392
  dst[4] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
383
392
  dst[5] = _mm256_loadu_si256((__m256i *)(src + 5 * stride));
384
392
  dst[6] = _mm256_loadu_si256((__m256i *)(src + 6 * stride));
385
392
  dst[7] = _mm256_loadu_si256((__m256i *)(src + 7 * stride));
386
392
}
Unexecuted instantiation: convolve_avx2.c:load_16bit_8rows_avx2
387
388
static AOM_FORCE_INLINE void loadu_unpack_16bit_5rows_avx2(
389
    const int16_t *const src, const ptrdiff_t stride, __m256i s_256[5],
390
354k
    __m256i ss_256[5], __m256i tt_256[5]) {
391
354k
  s_256[0] = _mm256_loadu_si256((__m256i *)(src + 0 * stride));
392
354k
  s_256[1] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
393
354k
  s_256[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
394
354k
  s_256[3] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
395
354k
  s_256[4] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
396
397
354k
  ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
398
354k
  ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
399
354k
  ss_256[3] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
400
354k
  ss_256[4] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
401
402
354k
  tt_256[0] = _mm256_unpacklo_epi16(s_256[1], s_256[2]);
403
354k
  tt_256[1] = _mm256_unpacklo_epi16(s_256[3], s_256[4]);
404
354k
  tt_256[3] = _mm256_unpackhi_epi16(s_256[1], s_256[2]);
405
354k
  tt_256[4] = _mm256_unpackhi_epi16(s_256[3], s_256[4]);
406
354k
}
convolve_2d_avx2.c:loadu_unpack_16bit_5rows_avx2
Line
Count
Source
390
354k
    __m256i ss_256[5], __m256i tt_256[5]) {
391
354k
  s_256[0] = _mm256_loadu_si256((__m256i *)(src + 0 * stride));
392
354k
  s_256[1] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
393
354k
  s_256[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
394
354k
  s_256[3] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
395
354k
  s_256[4] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
396
397
354k
  ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
398
354k
  ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
399
354k
  ss_256[3] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
400
354k
  ss_256[4] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
401
402
354k
  tt_256[0] = _mm256_unpacklo_epi16(s_256[1], s_256[2]);
403
354k
  tt_256[1] = _mm256_unpacklo_epi16(s_256[3], s_256[4]);
404
354k
  tt_256[3] = _mm256_unpackhi_epi16(s_256[1], s_256[2]);
405
354k
  tt_256[4] = _mm256_unpackhi_epi16(s_256[3], s_256[4]);
406
354k
}
Unexecuted instantiation: convolve_avx2.c:loadu_unpack_16bit_5rows_avx2
407
408
static AOM_FORCE_INLINE void loadu_unpack_16bit_3rows_avx2(
409
    const int16_t *const src, const ptrdiff_t stride, __m256i s_256[3],
410
38.4k
    __m256i ss_256[3], __m256i tt_256[3]) {
411
38.4k
  s_256[0] = _mm256_loadu_si256((__m256i *)(src + 0 * stride));
412
38.4k
  s_256[1] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
413
38.4k
  s_256[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
414
415
38.4k
  ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
416
38.4k
  ss_256[2] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
417
418
38.4k
  tt_256[0] = _mm256_unpacklo_epi16(s_256[1], s_256[2]);
419
38.4k
  tt_256[2] = _mm256_unpackhi_epi16(s_256[1], s_256[2]);
420
38.4k
}
convolve_2d_avx2.c:loadu_unpack_16bit_3rows_avx2
Line
Count
Source
410
38.4k
    __m256i ss_256[3], __m256i tt_256[3]) {
411
38.4k
  s_256[0] = _mm256_loadu_si256((__m256i *)(src + 0 * stride));
412
38.4k
  s_256[1] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
413
38.4k
  s_256[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
414
415
38.4k
  ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
416
38.4k
  ss_256[2] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
417
418
38.4k
  tt_256[0] = _mm256_unpacklo_epi16(s_256[1], s_256[2]);
419
38.4k
  tt_256[2] = _mm256_unpackhi_epi16(s_256[1], s_256[2]);
420
38.4k
}
Unexecuted instantiation: convolve_avx2.c:loadu_unpack_16bit_3rows_avx2
421
422
static INLINE void convolve_8tap_unpack_avx2(const __m256i s[6],
423
138k
                                             __m256i ss[7]) {
424
138k
  ss[0] = _mm256_unpacklo_epi16(s[0], s[1]);
425
138k
  ss[1] = _mm256_unpacklo_epi16(s[2], s[3]);
426
138k
  ss[2] = _mm256_unpacklo_epi16(s[4], s[5]);
427
138k
  ss[4] = _mm256_unpackhi_epi16(s[0], s[1]);
428
138k
  ss[5] = _mm256_unpackhi_epi16(s[2], s[3]);
429
138k
  ss[6] = _mm256_unpackhi_epi16(s[4], s[5]);
430
138k
}
convolve_2d_avx2.c:convolve_8tap_unpack_avx2
Line
Count
Source
423
138k
                                             __m256i ss[7]) {
424
138k
  ss[0] = _mm256_unpacklo_epi16(s[0], s[1]);
425
138k
  ss[1] = _mm256_unpacklo_epi16(s[2], s[3]);
426
138k
  ss[2] = _mm256_unpacklo_epi16(s[4], s[5]);
427
138k
  ss[4] = _mm256_unpackhi_epi16(s[0], s[1]);
428
138k
  ss[5] = _mm256_unpackhi_epi16(s[2], s[3]);
429
138k
  ss[6] = _mm256_unpackhi_epi16(s[4], s[5]);
430
138k
}
Unexecuted instantiation: convolve_avx2.c:convolve_8tap_unpack_avx2
431
432
static INLINE __m128i convolve_2tap_ssse3(const __m128i ss[1],
433
641k
                                          const __m128i coeffs[1]) {
434
641k
  return _mm_maddubs_epi16(ss[0], coeffs[0]);
435
641k
}
convolve_2d_avx2.c:convolve_2tap_ssse3
Line
Count
Source
433
457k
                                          const __m128i coeffs[1]) {
434
457k
  return _mm_maddubs_epi16(ss[0], coeffs[0]);
435
457k
}
convolve_avx2.c:convolve_2tap_ssse3
Line
Count
Source
433
184k
                                          const __m128i coeffs[1]) {
434
184k
  return _mm_maddubs_epi16(ss[0], coeffs[0]);
435
184k
}
436
437
static INLINE __m128i convolve_4tap_ssse3(const __m128i ss[2],
438
12.4M
                                          const __m128i coeffs[2]) {
439
12.4M
  const __m128i res_23 = _mm_maddubs_epi16(ss[0], coeffs[0]);
440
12.4M
  const __m128i res_45 = _mm_maddubs_epi16(ss[1], coeffs[1]);
441
12.4M
  return _mm_add_epi16(res_23, res_45);
442
12.4M
}
convolve_2d_avx2.c:convolve_4tap_ssse3
Line
Count
Source
438
9.72M
                                          const __m128i coeffs[2]) {
439
9.72M
  const __m128i res_23 = _mm_maddubs_epi16(ss[0], coeffs[0]);
440
9.72M
  const __m128i res_45 = _mm_maddubs_epi16(ss[1], coeffs[1]);
441
9.72M
  return _mm_add_epi16(res_23, res_45);
442
9.72M
}
convolve_avx2.c:convolve_4tap_ssse3
Line
Count
Source
438
2.68M
                                          const __m128i coeffs[2]) {
439
2.68M
  const __m128i res_23 = _mm_maddubs_epi16(ss[0], coeffs[0]);
440
2.68M
  const __m128i res_45 = _mm_maddubs_epi16(ss[1], coeffs[1]);
441
2.68M
  return _mm_add_epi16(res_23, res_45);
442
2.68M
}
443
444
static INLINE __m128i convolve_6tap_ssse3(const __m128i ss[3],
445
1.16M
                                          const __m128i coeffs[3]) {
446
1.16M
  const __m128i res_12 = _mm_maddubs_epi16(ss[0], coeffs[0]);
447
1.16M
  const __m128i res_34 = _mm_maddubs_epi16(ss[1], coeffs[1]);
448
1.16M
  const __m128i res_56 = _mm_maddubs_epi16(ss[2], coeffs[2]);
449
1.16M
  const __m128i res_1256 = _mm_add_epi16(res_12, res_56);
450
1.16M
  return _mm_add_epi16(res_1256, res_34);
451
1.16M
}
Unexecuted instantiation: convolve_2d_avx2.c:convolve_6tap_ssse3
convolve_avx2.c:convolve_6tap_ssse3
Line
Count
Source
445
1.16M
                                          const __m128i coeffs[3]) {
446
1.16M
  const __m128i res_12 = _mm_maddubs_epi16(ss[0], coeffs[0]);
447
1.16M
  const __m128i res_34 = _mm_maddubs_epi16(ss[1], coeffs[1]);
448
1.16M
  const __m128i res_56 = _mm_maddubs_epi16(ss[2], coeffs[2]);
449
1.16M
  const __m128i res_1256 = _mm_add_epi16(res_12, res_56);
450
1.16M
  return _mm_add_epi16(res_1256, res_34);
451
1.16M
}
452
453
static INLINE __m128i convolve_8tap_ssse3(const __m128i ss[4],
454
60.3k
                                          const __m128i coeffs[4]) {
455
60.3k
  const __m128i res_01 = _mm_maddubs_epi16(ss[0], coeffs[0]);
456
60.3k
  const __m128i res_23 = _mm_maddubs_epi16(ss[1], coeffs[1]);
457
60.3k
  const __m128i res_45 = _mm_maddubs_epi16(ss[2], coeffs[2]);
458
60.3k
  const __m128i res_67 = _mm_maddubs_epi16(ss[3], coeffs[3]);
459
60.3k
  const __m128i res_0145 = _mm_add_epi16(res_01, res_45);
460
60.3k
  const __m128i res_2367 = _mm_add_epi16(res_23, res_67);
461
60.3k
  return _mm_add_epi16(res_0145, res_2367);
462
60.3k
}
Unexecuted instantiation: convolve_2d_avx2.c:convolve_8tap_ssse3
convolve_avx2.c:convolve_8tap_ssse3
Line
Count
Source
454
60.3k
                                          const __m128i coeffs[4]) {
455
60.3k
  const __m128i res_01 = _mm_maddubs_epi16(ss[0], coeffs[0]);
456
60.3k
  const __m128i res_23 = _mm_maddubs_epi16(ss[1], coeffs[1]);
457
60.3k
  const __m128i res_45 = _mm_maddubs_epi16(ss[2], coeffs[2]);
458
60.3k
  const __m128i res_67 = _mm_maddubs_epi16(ss[3], coeffs[3]);
459
60.3k
  const __m128i res_0145 = _mm_add_epi16(res_01, res_45);
460
60.3k
  const __m128i res_2367 = _mm_add_epi16(res_23, res_67);
461
60.3k
  return _mm_add_epi16(res_0145, res_2367);
462
60.3k
}
463
464
static INLINE __m256i convolve_2tap_avx2(const __m256i ss[1],
465
1.36M
                                         const __m256i coeffs[1]) {
466
1.36M
  return _mm256_maddubs_epi16(ss[0], coeffs[0]);
467
1.36M
}
convolve_2d_avx2.c:convolve_2tap_avx2
Line
Count
Source
465
837k
                                         const __m256i coeffs[1]) {
466
837k
  return _mm256_maddubs_epi16(ss[0], coeffs[0]);
467
837k
}
convolve_avx2.c:convolve_2tap_avx2
Line
Count
Source
465
526k
                                         const __m256i coeffs[1]) {
466
526k
  return _mm256_maddubs_epi16(ss[0], coeffs[0]);
467
526k
}
468
469
static INLINE __m256i convolve_4tap_avx2(const __m256i ss[2],
470
3.78M
                                         const __m256i coeffs[2]) {
471
3.78M
  const __m256i res_23 = _mm256_maddubs_epi16(ss[0], coeffs[0]);
472
3.78M
  const __m256i res_45 = _mm256_maddubs_epi16(ss[1], coeffs[1]);
473
3.78M
  return _mm256_add_epi16(res_23, res_45);
474
3.78M
}
convolve_2d_avx2.c:convolve_4tap_avx2
Line
Count
Source
470
2.36M
                                         const __m256i coeffs[2]) {
471
2.36M
  const __m256i res_23 = _mm256_maddubs_epi16(ss[0], coeffs[0]);
472
2.36M
  const __m256i res_45 = _mm256_maddubs_epi16(ss[1], coeffs[1]);
473
2.36M
  return _mm256_add_epi16(res_23, res_45);
474
2.36M
}
convolve_avx2.c:convolve_4tap_avx2
Line
Count
Source
470
1.42M
                                         const __m256i coeffs[2]) {
471
1.42M
  const __m256i res_23 = _mm256_maddubs_epi16(ss[0], coeffs[0]);
472
1.42M
  const __m256i res_45 = _mm256_maddubs_epi16(ss[1], coeffs[1]);
473
1.42M
  return _mm256_add_epi16(res_23, res_45);
474
1.42M
}
475
476
static INLINE __m256i convolve_6tap_avx2(const __m256i ss[3],
477
47.0M
                                         const __m256i coeffs[3]) {
478
47.0M
  const __m256i res_01 = _mm256_maddubs_epi16(ss[0], coeffs[0]);
479
47.0M
  const __m256i res_23 = _mm256_maddubs_epi16(ss[1], coeffs[1]);
480
47.0M
  const __m256i res_45 = _mm256_maddubs_epi16(ss[2], coeffs[2]);
481
47.0M
  const __m256i res_0145 = _mm256_add_epi16(res_01, res_45);
482
47.0M
  return _mm256_add_epi16(res_0145, res_23);
483
47.0M
}
convolve_2d_avx2.c:convolve_6tap_avx2
Line
Count
Source
477
29.8M
                                         const __m256i coeffs[3]) {
478
29.8M
  const __m256i res_01 = _mm256_maddubs_epi16(ss[0], coeffs[0]);
479
29.8M
  const __m256i res_23 = _mm256_maddubs_epi16(ss[1], coeffs[1]);
480
29.8M
  const __m256i res_45 = _mm256_maddubs_epi16(ss[2], coeffs[2]);
481
29.8M
  const __m256i res_0145 = _mm256_add_epi16(res_01, res_45);
482
29.8M
  return _mm256_add_epi16(res_0145, res_23);
483
29.8M
}
convolve_avx2.c:convolve_6tap_avx2
Line
Count
Source
477
17.1M
                                         const __m256i coeffs[3]) {
478
17.1M
  const __m256i res_01 = _mm256_maddubs_epi16(ss[0], coeffs[0]);
479
17.1M
  const __m256i res_23 = _mm256_maddubs_epi16(ss[1], coeffs[1]);
480
17.1M
  const __m256i res_45 = _mm256_maddubs_epi16(ss[2], coeffs[2]);
481
17.1M
  const __m256i res_0145 = _mm256_add_epi16(res_01, res_45);
482
17.1M
  return _mm256_add_epi16(res_0145, res_23);
483
17.1M
}
484
485
static INLINE __m256i convolve_8tap_avx2(const __m256i ss[4],
486
4.16M
                                         const __m256i coeffs[4]) {
487
4.16M
  const __m256i res_01 = _mm256_maddubs_epi16(ss[0], coeffs[0]);
488
4.16M
  const __m256i res_23 = _mm256_maddubs_epi16(ss[1], coeffs[1]);
489
4.16M
  const __m256i res_45 = _mm256_maddubs_epi16(ss[2], coeffs[2]);
490
4.16M
  const __m256i res_67 = _mm256_maddubs_epi16(ss[3], coeffs[3]);
491
4.16M
  const __m256i res_0145 = _mm256_add_epi16(res_01, res_45);
492
4.16M
  const __m256i res_2367 = _mm256_add_epi16(res_23, res_67);
493
4.16M
  return _mm256_add_epi16(res_0145, res_2367);
494
4.16M
}
convolve_2d_avx2.c:convolve_8tap_avx2
Line
Count
Source
486
2.79M
                                         const __m256i coeffs[4]) {
487
2.79M
  const __m256i res_01 = _mm256_maddubs_epi16(ss[0], coeffs[0]);
488
2.79M
  const __m256i res_23 = _mm256_maddubs_epi16(ss[1], coeffs[1]);
489
2.79M
  const __m256i res_45 = _mm256_maddubs_epi16(ss[2], coeffs[2]);
490
2.79M
  const __m256i res_67 = _mm256_maddubs_epi16(ss[3], coeffs[3]);
491
2.79M
  const __m256i res_0145 = _mm256_add_epi16(res_01, res_45);
492
2.79M
  const __m256i res_2367 = _mm256_add_epi16(res_23, res_67);
493
2.79M
  return _mm256_add_epi16(res_0145, res_2367);
494
2.79M
}
convolve_avx2.c:convolve_8tap_avx2
Line
Count
Source
486
1.36M
                                         const __m256i coeffs[4]) {
487
1.36M
  const __m256i res_01 = _mm256_maddubs_epi16(ss[0], coeffs[0]);
488
1.36M
  const __m256i res_23 = _mm256_maddubs_epi16(ss[1], coeffs[1]);
489
1.36M
  const __m256i res_45 = _mm256_maddubs_epi16(ss[2], coeffs[2]);
490
1.36M
  const __m256i res_67 = _mm256_maddubs_epi16(ss[3], coeffs[3]);
491
1.36M
  const __m256i res_0145 = _mm256_add_epi16(res_01, res_45);
492
1.36M
  const __m256i res_2367 = _mm256_add_epi16(res_23, res_67);
493
1.36M
  return _mm256_add_epi16(res_0145, res_2367);
494
1.36M
}
495
496
static INLINE __m128i convolve16_2tap_sse2(const __m128i ss[1],
497
203k
                                           const __m128i coeffs[1]) {
498
203k
  return _mm_madd_epi16(ss[0], coeffs[0]);
499
203k
}
convolve_2d_avx2.c:convolve16_2tap_sse2
Line
Count
Source
497
203k
                                           const __m128i coeffs[1]) {
498
203k
  return _mm_madd_epi16(ss[0], coeffs[0]);
499
203k
}
Unexecuted instantiation: convolve_avx2.c:convolve16_2tap_sse2
500
501
static INLINE __m128i convolve16_4tap_sse2(const __m128i ss[2],
502
419k
                                           const __m128i coeffs[2]) {
503
419k
  const __m128i res_01 = _mm_madd_epi16(ss[0], coeffs[0]);
504
419k
  const __m128i res_23 = _mm_madd_epi16(ss[1], coeffs[1]);
505
419k
  return _mm_add_epi32(res_01, res_23);
506
419k
}
convolve_2d_avx2.c:convolve16_4tap_sse2
Line
Count
Source
502
419k
                                           const __m128i coeffs[2]) {
503
419k
  const __m128i res_01 = _mm_madd_epi16(ss[0], coeffs[0]);
504
419k
  const __m128i res_23 = _mm_madd_epi16(ss[1], coeffs[1]);
505
419k
  return _mm_add_epi32(res_01, res_23);
506
419k
}
Unexecuted instantiation: convolve_avx2.c:convolve16_4tap_sse2
507
508
static INLINE __m128i convolve16_6tap_sse2(const __m128i ss[3],
509
514k
                                           const __m128i coeffs[3]) {
510
514k
  const __m128i res_01 = _mm_madd_epi16(ss[0], coeffs[0]);
511
514k
  const __m128i res_23 = _mm_madd_epi16(ss[1], coeffs[1]);
512
514k
  const __m128i res_45 = _mm_madd_epi16(ss[2], coeffs[2]);
513
514k
  const __m128i res_0123 = _mm_add_epi32(res_01, res_23);
514
514k
  return _mm_add_epi32(res_0123, res_45);
515
514k
}
convolve_2d_avx2.c:convolve16_6tap_sse2
Line
Count
Source
509
514k
                                           const __m128i coeffs[3]) {
510
514k
  const __m128i res_01 = _mm_madd_epi16(ss[0], coeffs[0]);
511
514k
  const __m128i res_23 = _mm_madd_epi16(ss[1], coeffs[1]);
512
514k
  const __m128i res_45 = _mm_madd_epi16(ss[2], coeffs[2]);
513
514k
  const __m128i res_0123 = _mm_add_epi32(res_01, res_23);
514
514k
  return _mm_add_epi32(res_0123, res_45);
515
514k
}
Unexecuted instantiation: convolve_avx2.c:convolve16_6tap_sse2
516
517
static INLINE __m128i convolve16_8tap_sse2(const __m128i ss[4],
518
27.9k
                                           const __m128i coeffs[4]) {
519
27.9k
  const __m128i res_01 = _mm_madd_epi16(ss[0], coeffs[0]);
520
27.9k
  const __m128i res_23 = _mm_madd_epi16(ss[1], coeffs[1]);
521
27.9k
  const __m128i res_45 = _mm_madd_epi16(ss[2], coeffs[2]);
522
27.9k
  const __m128i res_67 = _mm_madd_epi16(ss[3], coeffs[3]);
523
27.9k
  const __m128i res_0123 = _mm_add_epi32(res_01, res_23);
524
27.9k
  const __m128i res_4567 = _mm_add_epi32(res_45, res_67);
525
27.9k
  return _mm_add_epi32(res_0123, res_4567);
526
27.9k
}
convolve_2d_avx2.c:convolve16_8tap_sse2
Line
Count
Source
518
27.9k
                                           const __m128i coeffs[4]) {
519
27.9k
  const __m128i res_01 = _mm_madd_epi16(ss[0], coeffs[0]);
520
27.9k
  const __m128i res_23 = _mm_madd_epi16(ss[1], coeffs[1]);
521
27.9k
  const __m128i res_45 = _mm_madd_epi16(ss[2], coeffs[2]);
522
27.9k
  const __m128i res_67 = _mm_madd_epi16(ss[3], coeffs[3]);
523
27.9k
  const __m128i res_0123 = _mm_add_epi32(res_01, res_23);
524
27.9k
  const __m128i res_4567 = _mm_add_epi32(res_45, res_67);
525
27.9k
  return _mm_add_epi32(res_0123, res_4567);
526
27.9k
}
Unexecuted instantiation: convolve_avx2.c:convolve16_8tap_sse2
527
528
static INLINE __m256i convolve16_2tap_avx2(const __m256i ss[1],
529
1.91M
                                           const __m256i coeffs[1]) {
530
1.91M
  return _mm256_madd_epi16(ss[0], coeffs[0]);
531
1.91M
}
convolve_2d_avx2.c:convolve16_2tap_avx2
Line
Count
Source
529
1.91M
                                           const __m256i coeffs[1]) {
530
1.91M
  return _mm256_madd_epi16(ss[0], coeffs[0]);
531
1.91M
}
Unexecuted instantiation: convolve_avx2.c:convolve16_2tap_avx2
532
533
static INLINE __m256i convolve16_4tap_avx2(const __m256i ss[2],
534
11.8M
                                           const __m256i coeffs[2]) {
535
11.8M
  const __m256i res_1 = _mm256_madd_epi16(ss[0], coeffs[0]);
536
11.8M
  const __m256i res_2 = _mm256_madd_epi16(ss[1], coeffs[1]);
537
11.8M
  return _mm256_add_epi32(res_1, res_2);
538
11.8M
}
convolve_2d_avx2.c:convolve16_4tap_avx2
Line
Count
Source
534
11.8M
                                           const __m256i coeffs[2]) {
535
11.8M
  const __m256i res_1 = _mm256_madd_epi16(ss[0], coeffs[0]);
536
11.8M
  const __m256i res_2 = _mm256_madd_epi16(ss[1], coeffs[1]);
537
11.8M
  return _mm256_add_epi32(res_1, res_2);
538
11.8M
}
Unexecuted instantiation: convolve_avx2.c:convolve16_4tap_avx2
539
540
static INLINE __m256i convolve16_6tap_avx2(const __m256i ss[3],
541
40.1M
                                           const __m256i coeffs[3]) {
542
40.1M
  const __m256i res_01 = _mm256_madd_epi16(ss[0], coeffs[0]);
543
40.1M
  const __m256i res_23 = _mm256_madd_epi16(ss[1], coeffs[1]);
544
40.1M
  const __m256i res_45 = _mm256_madd_epi16(ss[2], coeffs[2]);
545
40.1M
  const __m256i res_0123 = _mm256_add_epi32(res_01, res_23);
546
40.1M
  return _mm256_add_epi32(res_0123, res_45);
547
40.1M
}
convolve_2d_avx2.c:convolve16_6tap_avx2
Line
Count
Source
541
40.1M
                                           const __m256i coeffs[3]) {
542
40.1M
  const __m256i res_01 = _mm256_madd_epi16(ss[0], coeffs[0]);
543
40.1M
  const __m256i res_23 = _mm256_madd_epi16(ss[1], coeffs[1]);
544
40.1M
  const __m256i res_45 = _mm256_madd_epi16(ss[2], coeffs[2]);
545
40.1M
  const __m256i res_0123 = _mm256_add_epi32(res_01, res_23);
546
40.1M
  return _mm256_add_epi32(res_0123, res_45);
547
40.1M
}
Unexecuted instantiation: convolve_avx2.c:convolve16_6tap_avx2
548
549
static INLINE __m256i convolve16_8tap_avx2(const __m256i ss[4],
550
3.86M
                                           const __m256i coeffs[4]) {
551
3.86M
  const __m256i res_01 = _mm256_madd_epi16(ss[0], coeffs[0]);
552
3.86M
  const __m256i res_23 = _mm256_madd_epi16(ss[1], coeffs[1]);
553
3.86M
  const __m256i res_45 = _mm256_madd_epi16(ss[2], coeffs[2]);
554
3.86M
  const __m256i res_67 = _mm256_madd_epi16(ss[3], coeffs[3]);
555
3.86M
  const __m256i res_0123 = _mm256_add_epi32(res_01, res_23);
556
3.86M
  const __m256i res_4567 = _mm256_add_epi32(res_45, res_67);
557
3.86M
  return _mm256_add_epi32(res_0123, res_4567);
558
3.86M
}
convolve_2d_avx2.c:convolve16_8tap_avx2
Line
Count
Source
550
3.86M
                                           const __m256i coeffs[4]) {
551
3.86M
  const __m256i res_01 = _mm256_madd_epi16(ss[0], coeffs[0]);
552
3.86M
  const __m256i res_23 = _mm256_madd_epi16(ss[1], coeffs[1]);
553
3.86M
  const __m256i res_45 = _mm256_madd_epi16(ss[2], coeffs[2]);
554
3.86M
  const __m256i res_67 = _mm256_madd_epi16(ss[3], coeffs[3]);
555
3.86M
  const __m256i res_0123 = _mm256_add_epi32(res_01, res_23);
556
3.86M
  const __m256i res_4567 = _mm256_add_epi32(res_45, res_67);
557
3.86M
  return _mm256_add_epi32(res_0123, res_4567);
558
3.86M
}
Unexecuted instantiation: convolve_avx2.c:convolve16_8tap_avx2
559
560
static INLINE __m256i x_convolve_4tap_avx2(const __m256i data,
561
                                           const __m256i coeffs[2],
562
2.36M
                                           const __m256i filt[2]) {
563
2.36M
  __m256i ss[2];
564
565
2.36M
  ss[0] = _mm256_shuffle_epi8(data, filt[0]);
566
2.36M
  ss[1] = _mm256_shuffle_epi8(data, filt[1]);
567
568
2.36M
  return convolve_4tap_avx2(ss, coeffs);
569
2.36M
}
convolve_2d_avx2.c:x_convolve_4tap_avx2
Line
Count
Source
562
2.36M
                                           const __m256i filt[2]) {
563
2.36M
  __m256i ss[2];
564
565
2.36M
  ss[0] = _mm256_shuffle_epi8(data, filt[0]);
566
2.36M
  ss[1] = _mm256_shuffle_epi8(data, filt[1]);
567
568
2.36M
  return convolve_4tap_avx2(ss, coeffs);
569
2.36M
}
Unexecuted instantiation: convolve_avx2.c:x_convolve_4tap_avx2
570
571
static INLINE __m256i x_convolve_6tap_avx2(const __m256i data,
572
                                           const __m256i coeffs[3],
573
38.7M
                                           const __m256i filt[3]) {
574
38.7M
  __m256i ss[3];
575
576
38.7M
  ss[0] = _mm256_shuffle_epi8(data, filt[0]);
577
38.7M
  ss[1] = _mm256_shuffle_epi8(data, filt[1]);
578
38.7M
  ss[2] = _mm256_shuffle_epi8(data, filt[2]);
579
580
38.7M
  return convolve_6tap_avx2(ss, coeffs);
581
38.7M
}
convolve_2d_avx2.c:x_convolve_6tap_avx2
Line
Count
Source
573
29.8M
                                           const __m256i filt[3]) {
574
29.8M
  __m256i ss[3];
575
576
29.8M
  ss[0] = _mm256_shuffle_epi8(data, filt[0]);
577
29.8M
  ss[1] = _mm256_shuffle_epi8(data, filt[1]);
578
29.8M
  ss[2] = _mm256_shuffle_epi8(data, filt[2]);
579
580
29.8M
  return convolve_6tap_avx2(ss, coeffs);
581
29.8M
}
convolve_avx2.c:x_convolve_6tap_avx2
Line
Count
Source
573
8.88M
                                           const __m256i filt[3]) {
574
8.88M
  __m256i ss[3];
575
576
8.88M
  ss[0] = _mm256_shuffle_epi8(data, filt[0]);
577
8.88M
  ss[1] = _mm256_shuffle_epi8(data, filt[1]);
578
8.88M
  ss[2] = _mm256_shuffle_epi8(data, filt[2]);
579
580
8.88M
  return convolve_6tap_avx2(ss, coeffs);
581
8.88M
}
582
583
static INLINE __m256i x_convolve_8tap_avx2(const __m256i data,
584
                                           const __m256i coeffs[4],
585
3.63M
                                           const __m256i filt[4]) {
586
3.63M
  __m256i ss[4];
587
588
3.63M
  ss[0] = _mm256_shuffle_epi8(data, filt[0]);
589
3.63M
  ss[1] = _mm256_shuffle_epi8(data, filt[1]);
590
3.63M
  ss[2] = _mm256_shuffle_epi8(data, filt[2]);
591
3.63M
  ss[3] = _mm256_shuffle_epi8(data, filt[3]);
592
593
3.63M
  return convolve_8tap_avx2(ss, coeffs);
594
3.63M
}
convolve_2d_avx2.c:x_convolve_8tap_avx2
Line
Count
Source
585
2.79M
                                           const __m256i filt[4]) {
586
2.79M
  __m256i ss[4];
587
588
2.79M
  ss[0] = _mm256_shuffle_epi8(data, filt[0]);
589
2.79M
  ss[1] = _mm256_shuffle_epi8(data, filt[1]);
590
2.79M
  ss[2] = _mm256_shuffle_epi8(data, filt[2]);
591
2.79M
  ss[3] = _mm256_shuffle_epi8(data, filt[3]);
592
593
2.79M
  return convolve_8tap_avx2(ss, coeffs);
594
2.79M
}
convolve_avx2.c:x_convolve_8tap_avx2
Line
Count
Source
585
837k
                                           const __m256i filt[4]) {
586
837k
  __m256i ss[4];
587
588
837k
  ss[0] = _mm256_shuffle_epi8(data, filt[0]);
589
837k
  ss[1] = _mm256_shuffle_epi8(data, filt[1]);
590
837k
  ss[2] = _mm256_shuffle_epi8(data, filt[2]);
591
837k
  ss[3] = _mm256_shuffle_epi8(data, filt[3]);
592
593
837k
  return convolve_8tap_avx2(ss, coeffs);
594
837k
}
595
596
10.5M
static INLINE __m256i sr_y_round_avx2(const __m256i src) {
597
10.5M
  const __m256i round = _mm256_set1_epi16(32);
598
10.5M
  const __m256i dst = _mm256_add_epi16(src, round);
599
10.5M
  return _mm256_srai_epi16(dst, FILTER_BITS - 1);
600
10.5M
}
Unexecuted instantiation: convolve_2d_avx2.c:sr_y_round_avx2
convolve_avx2.c:sr_y_round_avx2
Line
Count
Source
596
10.5M
static INLINE __m256i sr_y_round_avx2(const __m256i src) {
597
10.5M
  const __m256i round = _mm256_set1_epi16(32);
598
10.5M
  const __m256i dst = _mm256_add_epi16(src, round);
599
10.5M
  return _mm256_srai_epi16(dst, FILTER_BITS - 1);
600
10.5M
}
601
602
10.1M
static INLINE __m128i xy_x_round_sse2(const __m128i src) {
603
10.1M
  const __m128i round = _mm_set1_epi16(2);
604
10.1M
  const __m128i dst = _mm_add_epi16(src, round);
605
10.1M
  return _mm_srai_epi16(dst, 2);
606
10.1M
}
convolve_2d_avx2.c:xy_x_round_sse2
Line
Count
Source
602
10.1M
static INLINE __m128i xy_x_round_sse2(const __m128i src) {
603
10.1M
  const __m128i round = _mm_set1_epi16(2);
604
10.1M
  const __m128i dst = _mm_add_epi16(src, round);
605
10.1M
  return _mm_srai_epi16(dst, 2);
606
10.1M
}
Unexecuted instantiation: convolve_avx2.c:xy_x_round_sse2
607
608
35.8M
static INLINE __m256i xy_x_round_avx2(const __m256i src) {
609
35.8M
  const __m256i round = _mm256_set1_epi16(2);
610
35.8M
  const __m256i dst = _mm256_add_epi16(src, round);
611
35.8M
  return _mm256_srai_epi16(dst, 2);
612
35.8M
}
convolve_2d_avx2.c:xy_x_round_avx2
Line
Count
Source
608
35.8M
static INLINE __m256i xy_x_round_avx2(const __m256i src) {
609
35.8M
  const __m256i round = _mm256_set1_epi16(2);
610
35.8M
  const __m256i dst = _mm256_add_epi16(src, round);
611
35.8M
  return _mm256_srai_epi16(dst, 2);
612
35.8M
}
Unexecuted instantiation: convolve_avx2.c:xy_x_round_avx2
613
614
static INLINE void xy_x_round_store_2x2_sse2(const __m128i res,
615
1.91M
                                             int16_t *const dst) {
616
1.91M
  const __m128i d = xy_x_round_sse2(res);
617
1.91M
  _mm_storel_epi64((__m128i *)dst, d);
618
1.91M
}
convolve_2d_avx2.c:xy_x_round_store_2x2_sse2
Line
Count
Source
615
1.91M
                                             int16_t *const dst) {
616
1.91M
  const __m128i d = xy_x_round_sse2(res);
617
1.91M
  _mm_storel_epi64((__m128i *)dst, d);
618
1.91M
}
Unexecuted instantiation: convolve_avx2.c:xy_x_round_store_2x2_sse2
619
620
static INLINE void xy_x_round_store_4x2_sse2(const __m128i res,
621
7.98M
                                             int16_t *const dst) {
622
7.98M
  const __m128i d = xy_x_round_sse2(res);
623
7.98M
  _mm_storeu_si128((__m128i *)dst, d);
624
7.98M
}
convolve_2d_avx2.c:xy_x_round_store_4x2_sse2
Line
Count
Source
621
7.98M
                                             int16_t *const dst) {
622
7.98M
  const __m128i d = xy_x_round_sse2(res);
623
7.98M
  _mm_storeu_si128((__m128i *)dst, d);
624
7.98M
}
Unexecuted instantiation: convolve_avx2.c:xy_x_round_store_4x2_sse2
625
626
static INLINE void xy_x_round_store_8x2_sse2(const __m128i res[2],
627
141k
                                             int16_t *const dst) {
628
141k
  __m128i r[2];
629
630
141k
  r[0] = xy_x_round_sse2(res[0]);
631
141k
  r[1] = xy_x_round_sse2(res[1]);
632
141k
  _mm_storeu_si128((__m128i *)dst, r[0]);
633
141k
  _mm_storeu_si128((__m128i *)(dst + 8), r[1]);
634
141k
}
convolve_2d_avx2.c:xy_x_round_store_8x2_sse2
Line
Count
Source
627
141k
                                             int16_t *const dst) {
628
141k
  __m128i r[2];
629
630
141k
  r[0] = xy_x_round_sse2(res[0]);
631
141k
  r[1] = xy_x_round_sse2(res[1]);
632
141k
  _mm_storeu_si128((__m128i *)dst, r[0]);
633
141k
  _mm_storeu_si128((__m128i *)(dst + 8), r[1]);
634
141k
}
Unexecuted instantiation: convolve_avx2.c:xy_x_round_store_8x2_sse2
635
636
static INLINE void xy_x_round_store_8x2_avx2(const __m256i res,
637
8.11M
                                             int16_t *const dst) {
638
8.11M
  const __m256i d = xy_x_round_avx2(res);
639
8.11M
  _mm256_storeu_si256((__m256i *)dst, d);
640
8.11M
}
convolve_2d_avx2.c:xy_x_round_store_8x2_avx2
Line
Count
Source
637
8.11M
                                             int16_t *const dst) {
638
8.11M
  const __m256i d = xy_x_round_avx2(res);
639
8.11M
  _mm256_storeu_si256((__m256i *)dst, d);
640
8.11M
}
Unexecuted instantiation: convolve_avx2.c:xy_x_round_store_8x2_avx2
641
642
static INLINE void xy_x_round_store_32_avx2(const __m256i res[2],
643
4.87M
                                            int16_t *const dst) {
644
4.87M
  __m256i r[2];
645
646
4.87M
  r[0] = xy_x_round_avx2(res[0]);
647
4.87M
  r[1] = xy_x_round_avx2(res[1]);
648
4.87M
  const __m256i d0 =
649
4.87M
      _mm256_inserti128_si256(r[0], _mm256_castsi256_si128(r[1]), 1);
650
4.87M
  const __m256i d1 =
651
4.87M
      _mm256_inserti128_si256(r[1], _mm256_extracti128_si256(r[0], 1), 0);
652
4.87M
  _mm256_storeu_si256((__m256i *)dst, d0);
653
4.87M
  _mm256_storeu_si256((__m256i *)(dst + 16), d1);
654
4.87M
}
convolve_2d_avx2.c:xy_x_round_store_32_avx2
Line
Count
Source
643
4.87M
                                            int16_t *const dst) {
644
4.87M
  __m256i r[2];
645
646
4.87M
  r[0] = xy_x_round_avx2(res[0]);
647
4.87M
  r[1] = xy_x_round_avx2(res[1]);
648
4.87M
  const __m256i d0 =
649
4.87M
      _mm256_inserti128_si256(r[0], _mm256_castsi256_si128(r[1]), 1);
650
4.87M
  const __m256i d1 =
651
4.87M
      _mm256_inserti128_si256(r[1], _mm256_extracti128_si256(r[0], 1), 0);
652
4.87M
  _mm256_storeu_si256((__m256i *)dst, d0);
653
4.87M
  _mm256_storeu_si256((__m256i *)(dst + 16), d1);
654
4.87M
}
Unexecuted instantiation: convolve_avx2.c:xy_x_round_store_32_avx2
655
656
1.16M
static INLINE __m128i xy_y_round_sse2(const __m128i src) {
657
1.16M
  const __m128i round = _mm_set1_epi32(1024);
658
1.16M
  const __m128i dst = _mm_add_epi32(src, round);
659
1.16M
  return _mm_srai_epi32(dst, 11);
660
1.16M
}
convolve_2d_avx2.c:xy_y_round_sse2
Line
Count
Source
656
1.16M
static INLINE __m128i xy_y_round_sse2(const __m128i src) {
657
1.16M
  const __m128i round = _mm_set1_epi32(1024);
658
1.16M
  const __m128i dst = _mm_add_epi32(src, round);
659
1.16M
  return _mm_srai_epi32(dst, 11);
660
1.16M
}
Unexecuted instantiation: convolve_avx2.c:xy_y_round_sse2
661
662
20.5k
static INLINE __m128i xy_y_round_half_pel_sse2(const __m128i src) {
663
20.5k
  const __m128i round = _mm_set1_epi16(16);
664
20.5k
  const __m128i dst = _mm_add_epi16(src, round);
665
20.5k
  return _mm_srai_epi16(dst, 5);
666
20.5k
}
convolve_2d_avx2.c:xy_y_round_half_pel_sse2
Line
Count
Source
662
20.5k
static INLINE __m128i xy_y_round_half_pel_sse2(const __m128i src) {
663
20.5k
  const __m128i round = _mm_set1_epi16(16);
664
20.5k
  const __m128i dst = _mm_add_epi16(src, round);
665
20.5k
  return _mm_srai_epi16(dst, 5);
666
20.5k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_round_half_pel_sse2
667
668
57.5M
static INLINE __m256i xy_y_round_avx2(const __m256i src) {
669
57.5M
  const __m256i round = _mm256_set1_epi32(1024);
670
57.5M
  const __m256i dst = _mm256_add_epi32(src, round);
671
57.5M
  return _mm256_srai_epi32(dst, 11);
672
57.5M
}
convolve_2d_avx2.c:xy_y_round_avx2
Line
Count
Source
668
57.5M
static INLINE __m256i xy_y_round_avx2(const __m256i src) {
669
57.5M
  const __m256i round = _mm256_set1_epi32(1024);
670
57.5M
  const __m256i dst = _mm256_add_epi32(src, round);
671
57.5M
  return _mm256_srai_epi32(dst, 11);
672
57.5M
}
Unexecuted instantiation: convolve_avx2.c:xy_y_round_avx2
673
674
26.5M
static INLINE __m256i xy_y_round_16_avx2(const __m256i r[2]) {
675
26.5M
  const __m256i r0 = xy_y_round_avx2(r[0]);
676
26.5M
  const __m256i r1 = xy_y_round_avx2(r[1]);
677
26.5M
  return _mm256_packs_epi32(r0, r1);
678
26.5M
}
convolve_2d_avx2.c:xy_y_round_16_avx2
Line
Count
Source
674
26.5M
static INLINE __m256i xy_y_round_16_avx2(const __m256i r[2]) {
675
26.5M
  const __m256i r0 = xy_y_round_avx2(r[0]);
676
26.5M
  const __m256i r1 = xy_y_round_avx2(r[1]);
677
26.5M
  return _mm256_packs_epi32(r0, r1);
678
26.5M
}
Unexecuted instantiation: convolve_avx2.c:xy_y_round_16_avx2
679
680
299k
static INLINE __m256i xy_y_round_half_pel_avx2(const __m256i src) {
681
299k
  const __m256i round = _mm256_set1_epi16(16);
682
299k
  const __m256i dst = _mm256_add_epi16(src, round);
683
299k
  return _mm256_srai_epi16(dst, 5);
684
299k
}
convolve_2d_avx2.c:xy_y_round_half_pel_avx2
Line
Count
Source
680
299k
static INLINE __m256i xy_y_round_half_pel_avx2(const __m256i src) {
681
299k
  const __m256i round = _mm256_set1_epi16(16);
682
299k
  const __m256i dst = _mm256_add_epi16(src, round);
683
299k
  return _mm256_srai_epi16(dst, 5);
684
299k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_round_half_pel_avx2
685
686
static INLINE void pack_store_2x2_sse2(const __m128i res, uint8_t *const dst,
687
1.58M
                                       const ptrdiff_t stride) {
688
1.58M
  const __m128i d = _mm_packus_epi16(res, res);
689
1.58M
  *(int16_t *)dst = (int16_t)_mm_cvtsi128_si32(d);
690
1.58M
  *(int16_t *)(dst + stride) = (int16_t)_mm_extract_epi16(d, 1);
691
1.58M
}
convolve_2d_avx2.c:pack_store_2x2_sse2
Line
Count
Source
687
985k
                                       const ptrdiff_t stride) {
688
985k
  const __m128i d = _mm_packus_epi16(res, res);
689
985k
  *(int16_t *)dst = (int16_t)_mm_cvtsi128_si32(d);
690
985k
  *(int16_t *)(dst + stride) = (int16_t)_mm_extract_epi16(d, 1);
691
985k
}
convolve_avx2.c:pack_store_2x2_sse2
Line
Count
Source
687
604k
                                       const ptrdiff_t stride) {
688
604k
  const __m128i d = _mm_packus_epi16(res, res);
689
604k
  *(int16_t *)dst = (int16_t)_mm_cvtsi128_si32(d);
690
604k
  *(int16_t *)(dst + stride) = (int16_t)_mm_extract_epi16(d, 1);
691
604k
}
692
693
static INLINE void pack_store_4x2_sse2(const __m128i res, uint8_t *const dst,
694
3.49M
                                       const ptrdiff_t stride) {
695
3.49M
  const __m128i d = _mm_packus_epi16(res, res);
696
3.49M
  store_u8_4x2_sse2(d, dst, stride);
697
3.49M
}
convolve_2d_avx2.c:pack_store_4x2_sse2
Line
Count
Source
694
108k
                                       const ptrdiff_t stride) {
695
108k
  const __m128i d = _mm_packus_epi16(res, res);
696
108k
  store_u8_4x2_sse2(d, dst, stride);
697
108k
}
convolve_avx2.c:pack_store_4x2_sse2
Line
Count
Source
694
3.38M
                                       const ptrdiff_t stride) {
695
3.38M
  const __m128i d = _mm_packus_epi16(res, res);
696
3.38M
  store_u8_4x2_sse2(d, dst, stride);
697
3.38M
}
698
699
static INLINE void pack_store_4x2_avx2(const __m256i res, uint8_t *const dst,
700
4.52M
                                       const ptrdiff_t stride) {
701
4.52M
  const __m256i d = _mm256_packus_epi16(res, res);
702
4.52M
  const __m128i d0 = _mm256_castsi256_si128(d);
703
4.52M
  const __m128i d1 = _mm256_extracti128_si256(d, 1);
704
705
4.52M
  xx_storel_32(dst, d0);
706
4.52M
  xx_storel_32(dst + stride, d1);
707
4.52M
}
convolve_2d_avx2.c:pack_store_4x2_avx2
Line
Count
Source
700
4.52M
                                       const ptrdiff_t stride) {
701
4.52M
  const __m256i d = _mm256_packus_epi16(res, res);
702
4.52M
  const __m128i d0 = _mm256_castsi256_si128(d);
703
4.52M
  const __m128i d1 = _mm256_extracti128_si256(d, 1);
704
705
4.52M
  xx_storel_32(dst, d0);
706
4.52M
  xx_storel_32(dst + stride, d1);
707
4.52M
}
Unexecuted instantiation: convolve_avx2.c:pack_store_4x2_avx2
708
709
static INLINE void pack_store_8x2_avx2(const __m256i res, uint8_t *const dst,
710
8.60M
                                       const ptrdiff_t stride) {
711
8.60M
  const __m256i d = _mm256_packus_epi16(res, res);
712
8.60M
  const __m128i d0 = _mm256_castsi256_si128(d);
713
8.60M
  const __m128i d1 = _mm256_extracti128_si256(d, 1);
714
8.60M
  _mm_storel_epi64((__m128i *)dst, d0);
715
8.60M
  _mm_storel_epi64((__m128i *)(dst + stride), d1);
716
8.60M
}
convolve_2d_avx2.c:pack_store_8x2_avx2
Line
Count
Source
710
5.04M
                                       const ptrdiff_t stride) {
711
5.04M
  const __m256i d = _mm256_packus_epi16(res, res);
712
5.04M
  const __m128i d0 = _mm256_castsi256_si128(d);
713
5.04M
  const __m128i d1 = _mm256_extracti128_si256(d, 1);
714
5.04M
  _mm_storel_epi64((__m128i *)dst, d0);
715
5.04M
  _mm_storel_epi64((__m128i *)(dst + stride), d1);
716
5.04M
}
convolve_avx2.c:pack_store_8x2_avx2
Line
Count
Source
710
3.55M
                                       const ptrdiff_t stride) {
711
3.55M
  const __m256i d = _mm256_packus_epi16(res, res);
712
3.55M
  const __m128i d0 = _mm256_castsi256_si128(d);
713
3.55M
  const __m128i d1 = _mm256_extracti128_si256(d, 1);
714
3.55M
  _mm_storel_epi64((__m128i *)dst, d0);
715
3.55M
  _mm_storel_epi64((__m128i *)(dst + stride), d1);
716
3.55M
}
717
718
static INLINE void pack_store_16x2_avx2(const __m256i res0, const __m256i res1,
719
                                        uint8_t *const dst,
720
2.61M
                                        const ptrdiff_t stride) {
721
2.61M
  const __m256i d = _mm256_packus_epi16(res0, res1);
722
2.61M
  storeu_u8_16x2_avx2(d, dst, stride);
723
2.61M
}
Unexecuted instantiation: convolve_2d_avx2.c:pack_store_16x2_avx2
convolve_avx2.c:pack_store_16x2_avx2
Line
Count
Source
720
2.61M
                                        const ptrdiff_t stride) {
721
2.61M
  const __m256i d = _mm256_packus_epi16(res0, res1);
722
2.61M
  storeu_u8_16x2_avx2(d, dst, stride);
723
2.61M
}
724
725
static INLINE void xy_y_pack_store_16x2_avx2(const __m256i res0,
726
                                             const __m256i res1,
727
                                             uint8_t *const dst,
728
3.30M
                                             const ptrdiff_t stride) {
729
3.30M
  const __m256i t = _mm256_packus_epi16(res0, res1);
730
3.30M
  const __m256i d = _mm256_permute4x64_epi64(t, 0xD8);
731
3.30M
  storeu_u8_16x2_avx2(d, dst, stride);
732
3.30M
}
convolve_2d_avx2.c:xy_y_pack_store_16x2_avx2
Line
Count
Source
728
3.30M
                                             const ptrdiff_t stride) {
729
3.30M
  const __m256i t = _mm256_packus_epi16(res0, res1);
730
3.30M
  const __m256i d = _mm256_permute4x64_epi64(t, 0xD8);
731
3.30M
  storeu_u8_16x2_avx2(d, dst, stride);
732
3.30M
}
Unexecuted instantiation: convolve_avx2.c:xy_y_pack_store_16x2_avx2
733
734
static INLINE void pack_store_32_avx2(const __m256i res0, const __m256i res1,
735
0
                                      uint8_t *const dst) {
736
0
  const __m256i t = _mm256_packus_epi16(res0, res1);
737
0
  const __m256i d = _mm256_permute4x64_epi64(t, 0xD8);
738
0
  _mm256_storeu_si256((__m256i *)dst, d);
739
0
}
Unexecuted instantiation: convolve_2d_avx2.c:pack_store_32_avx2
Unexecuted instantiation: convolve_avx2.c:pack_store_32_avx2
740
741
static INLINE void xy_y_round_store_2x2_sse2(const __m128i res,
742
                                             uint8_t *const dst,
743
980k
                                             const ptrdiff_t stride) {
744
980k
  const __m128i r = xy_y_round_sse2(res);
745
980k
  const __m128i rr = _mm_packs_epi32(r, r);
746
980k
  pack_store_2x2_sse2(rr, dst, stride);
747
980k
}
convolve_2d_avx2.c:xy_y_round_store_2x2_sse2
Line
Count
Source
743
980k
                                             const ptrdiff_t stride) {
744
980k
  const __m128i r = xy_y_round_sse2(res);
745
980k
  const __m128i rr = _mm_packs_epi32(r, r);
746
980k
  pack_store_2x2_sse2(rr, dst, stride);
747
980k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_round_store_2x2_sse2
748
749
static INLINE void xy_y_round_store_4x2_avx2(const __m256i res,
750
                                             uint8_t *const dst,
751
4.52M
                                             const ptrdiff_t stride) {
752
4.52M
  const __m256i r = xy_y_round_avx2(res);
753
4.52M
  const __m256i rr = _mm256_packs_epi32(r, r);
754
4.52M
  pack_store_4x2_avx2(rr, dst, stride);
755
4.52M
}
convolve_2d_avx2.c:xy_y_round_store_4x2_avx2
Line
Count
Source
751
4.52M
                                             const ptrdiff_t stride) {
752
4.52M
  const __m256i r = xy_y_round_avx2(res);
753
4.52M
  const __m256i rr = _mm256_packs_epi32(r, r);
754
4.52M
  pack_store_4x2_avx2(rr, dst, stride);
755
4.52M
}
Unexecuted instantiation: convolve_avx2.c:xy_y_round_store_4x2_avx2
756
757
static INLINE void xy_y_pack_store_32_avx2(const __m256i res0,
758
                                           const __m256i res1,
759
7.65M
                                           uint8_t *const dst) {
760
7.65M
  const __m256i d = _mm256_packus_epi16(res0, res1);
761
  // d = _mm256_permute4x64_epi64(d, 0xD8);
762
7.65M
  _mm256_storeu_si256((__m256i *)dst, d);
763
7.65M
}
convolve_2d_avx2.c:xy_y_pack_store_32_avx2
Line
Count
Source
759
7.65M
                                           uint8_t *const dst) {
760
7.65M
  const __m256i d = _mm256_packus_epi16(res0, res1);
761
  // d = _mm256_permute4x64_epi64(d, 0xD8);
762
7.65M
  _mm256_storeu_si256((__m256i *)dst, d);
763
7.65M
}
Unexecuted instantiation: convolve_avx2.c:xy_y_pack_store_32_avx2
764
765
static INLINE void xy_y_round_store_32_avx2(const __m256i r0[2],
766
                                            const __m256i r1[2],
767
7.53M
                                            uint8_t *const dst) {
768
7.53M
  const __m256i ra = xy_y_round_16_avx2(r0);
769
7.53M
  const __m256i rb = xy_y_round_16_avx2(r1);
770
7.53M
  xy_y_pack_store_32_avx2(ra, rb, dst);
771
7.53M
}
convolve_2d_avx2.c:xy_y_round_store_32_avx2
Line
Count
Source
767
7.53M
                                            uint8_t *const dst) {
768
7.53M
  const __m256i ra = xy_y_round_16_avx2(r0);
769
7.53M
  const __m256i rb = xy_y_round_16_avx2(r1);
770
7.53M
  xy_y_pack_store_32_avx2(ra, rb, dst);
771
7.53M
}
Unexecuted instantiation: convolve_avx2.c:xy_y_round_store_32_avx2
772
773
static INLINE void convolve_store_32_avx2(const __m256i res0,
774
                                          const __m256i res1,
775
5.87M
                                          uint8_t *const dst) {
776
5.87M
  const __m256i d = _mm256_packus_epi16(res0, res1);
777
5.87M
  _mm256_storeu_si256((__m256i *)dst, d);
778
5.87M
}
Unexecuted instantiation: convolve_2d_avx2.c:convolve_store_32_avx2
convolve_avx2.c:convolve_store_32_avx2
Line
Count
Source
775
5.87M
                                          uint8_t *const dst) {
776
5.87M
  const __m256i d = _mm256_packus_epi16(res0, res1);
777
5.87M
  _mm256_storeu_si256((__m256i *)dst, d);
778
5.87M
}
779
780
2.13M
static INLINE __m128i sr_x_round_sse2(const __m128i src) {
781
2.13M
  const __m128i round = _mm_set1_epi16(34);
782
2.13M
  const __m128i dst = _mm_add_epi16(src, round);
783
2.13M
  return _mm_srai_epi16(dst, 6);
784
2.13M
}
Unexecuted instantiation: convolve_2d_avx2.c:sr_x_round_sse2
convolve_avx2.c:sr_x_round_sse2
Line
Count
Source
780
2.13M
static INLINE __m128i sr_x_round_sse2(const __m128i src) {
781
2.13M
  const __m128i round = _mm_set1_epi16(34);
782
2.13M
  const __m128i dst = _mm_add_epi16(src, round);
783
2.13M
  return _mm_srai_epi16(dst, 6);
784
2.13M
}
785
786
10.5M
static INLINE __m256i sr_x_round_avx2(const __m256i src) {
787
10.5M
  const __m256i round = _mm256_set1_epi16(34);
788
10.5M
  const __m256i dst = _mm256_add_epi16(src, round);
789
10.5M
  return _mm256_srai_epi16(dst, 6);
790
10.5M
}
Unexecuted instantiation: convolve_2d_avx2.c:sr_x_round_avx2
convolve_avx2.c:sr_x_round_avx2
Line
Count
Source
786
10.5M
static INLINE __m256i sr_x_round_avx2(const __m256i src) {
787
10.5M
  const __m256i round = _mm256_set1_epi16(34);
788
10.5M
  const __m256i dst = _mm256_add_epi16(src, round);
789
10.5M
  return _mm256_srai_epi16(dst, 6);
790
10.5M
}
791
792
1.96M
static INLINE __m128i sr_y_round_sse2(const __m128i src) {
793
1.96M
  const __m128i round = _mm_set1_epi16(32);
794
1.96M
  const __m128i dst = _mm_add_epi16(src, round);
795
1.96M
  return _mm_srai_epi16(dst, FILTER_BITS - 1);
796
1.96M
}
Unexecuted instantiation: convolve_2d_avx2.c:sr_y_round_sse2
convolve_avx2.c:sr_y_round_sse2
Line
Count
Source
792
1.96M
static INLINE __m128i sr_y_round_sse2(const __m128i src) {
793
1.96M
  const __m128i round = _mm_set1_epi16(32);
794
1.96M
  const __m128i dst = _mm_add_epi16(src, round);
795
1.96M
  return _mm_srai_epi16(dst, FILTER_BITS - 1);
796
1.96M
}
797
798
static INLINE void sr_x_round_store_8x2_avx2(const __m256i res,
799
                                             uint8_t *const dst,
800
1.72M
                                             const ptrdiff_t dst_stride) {
801
1.72M
  const __m256i r = sr_x_round_avx2(res);
802
1.72M
  pack_store_8x2_avx2(r, dst, dst_stride);
803
1.72M
}
Unexecuted instantiation: convolve_2d_avx2.c:sr_x_round_store_8x2_avx2
convolve_avx2.c:sr_x_round_store_8x2_avx2
Line
Count
Source
800
1.72M
                                             const ptrdiff_t dst_stride) {
801
1.72M
  const __m256i r = sr_x_round_avx2(res);
802
1.72M
  pack_store_8x2_avx2(r, dst, dst_stride);
803
1.72M
}
804
805
static INLINE void sr_x_round_store_16x2_avx2(const __m256i res[2],
806
                                              uint8_t *const dst,
807
1.25M
                                              const ptrdiff_t dst_stride) {
808
1.25M
  __m256i r[2];
809
810
1.25M
  r[0] = sr_x_round_avx2(res[0]);
811
1.25M
  r[1] = sr_x_round_avx2(res[1]);
812
1.25M
  pack_store_16x2_avx2(r[0], r[1], dst, dst_stride);
813
1.25M
}
Unexecuted instantiation: convolve_2d_avx2.c:sr_x_round_store_16x2_avx2
convolve_avx2.c:sr_x_round_store_16x2_avx2
Line
Count
Source
807
1.25M
                                              const ptrdiff_t dst_stride) {
808
1.25M
  __m256i r[2];
809
810
1.25M
  r[0] = sr_x_round_avx2(res[0]);
811
1.25M
  r[1] = sr_x_round_avx2(res[1]);
812
1.25M
  pack_store_16x2_avx2(r[0], r[1], dst, dst_stride);
813
1.25M
}
814
815
static INLINE void sr_x_round_store_32_avx2(const __m256i res[2],
816
2.86M
                                            uint8_t *const dst) {
817
2.86M
  __m256i r[2];
818
819
2.86M
  r[0] = sr_x_round_avx2(res[0]);
820
2.86M
  r[1] = sr_x_round_avx2(res[1]);
821
2.86M
  convolve_store_32_avx2(r[0], r[1], dst);
822
2.86M
}
Unexecuted instantiation: convolve_2d_avx2.c:sr_x_round_store_32_avx2
convolve_avx2.c:sr_x_round_store_32_avx2
Line
Count
Source
816
2.86M
                                            uint8_t *const dst) {
817
2.86M
  __m256i r[2];
818
819
2.86M
  r[0] = sr_x_round_avx2(res[0]);
820
2.86M
  r[1] = sr_x_round_avx2(res[1]);
821
2.86M
  convolve_store_32_avx2(r[0], r[1], dst);
822
2.86M
}
823
824
static INLINE void sr_y_round_store_8x2_avx2(const __m256i res,
825
                                             uint8_t *const dst,
826
1.83M
                                             const ptrdiff_t dst_stride) {
827
1.83M
  const __m256i r = sr_y_round_avx2(res);
828
1.83M
  pack_store_8x2_avx2(r, dst, dst_stride);
829
1.83M
}
Unexecuted instantiation: convolve_2d_avx2.c:sr_y_round_store_8x2_avx2
convolve_avx2.c:sr_y_round_store_8x2_avx2
Line
Count
Source
826
1.83M
                                             const ptrdiff_t dst_stride) {
827
1.83M
  const __m256i r = sr_y_round_avx2(res);
828
1.83M
  pack_store_8x2_avx2(r, dst, dst_stride);
829
1.83M
}
830
831
static INLINE void sr_y_round_store_16x2_avx2(const __m256i res[2],
832
                                              uint8_t *const dst,
833
1.35M
                                              const ptrdiff_t dst_stride) {
834
1.35M
  __m256i r[2];
835
836
1.35M
  r[0] = sr_y_round_avx2(res[0]);
837
1.35M
  r[1] = sr_y_round_avx2(res[1]);
838
1.35M
  pack_store_16x2_avx2(r[0], r[1], dst, dst_stride);
839
1.35M
}
Unexecuted instantiation: convolve_2d_avx2.c:sr_y_round_store_16x2_avx2
convolve_avx2.c:sr_y_round_store_16x2_avx2
Line
Count
Source
833
1.35M
                                              const ptrdiff_t dst_stride) {
834
1.35M
  __m256i r[2];
835
836
1.35M
  r[0] = sr_y_round_avx2(res[0]);
837
1.35M
  r[1] = sr_y_round_avx2(res[1]);
838
1.35M
  pack_store_16x2_avx2(r[0], r[1], dst, dst_stride);
839
1.35M
}
840
841
static INLINE void sr_y_2tap_32_avg_avx2(const uint8_t *const src,
842
                                         const __m256i s0, __m256i *const s1,
843
98.7k
                                         uint8_t *const dst) {
844
98.7k
  *s1 = _mm256_loadu_si256((__m256i *)src);
845
98.7k
  const __m256i d = _mm256_avg_epu8(s0, *s1);
846
98.7k
  _mm256_storeu_si256((__m256i *)dst, d);
847
98.7k
}
Unexecuted instantiation: convolve_2d_avx2.c:sr_y_2tap_32_avg_avx2
convolve_avx2.c:sr_y_2tap_32_avg_avx2
Line
Count
Source
843
98.7k
                                         uint8_t *const dst) {
844
98.7k
  *s1 = _mm256_loadu_si256((__m256i *)src);
845
98.7k
  const __m256i d = _mm256_avg_epu8(s0, *s1);
846
98.7k
  _mm256_storeu_si256((__m256i *)dst, d);
847
98.7k
}
848
849
static INLINE void sr_x_2tap_32_avg_avx2(const uint8_t *const src,
850
79.1k
                                         uint8_t *const dst) {
851
79.1k
  const __m256i s0 = _mm256_loadu_si256((__m256i *)src);
852
79.1k
  const __m256i s1 = _mm256_loadu_si256((__m256i *)(src + 1));
853
79.1k
  const __m256i d = _mm256_avg_epu8(s0, s1);
854
79.1k
  _mm256_storeu_si256((__m256i *)dst, d);
855
79.1k
}
Unexecuted instantiation: convolve_2d_avx2.c:sr_x_2tap_32_avg_avx2
convolve_avx2.c:sr_x_2tap_32_avg_avx2
Line
Count
Source
850
79.1k
                                         uint8_t *const dst) {
851
79.1k
  const __m256i s0 = _mm256_loadu_si256((__m256i *)src);
852
79.1k
  const __m256i s1 = _mm256_loadu_si256((__m256i *)(src + 1));
853
79.1k
  const __m256i d = _mm256_avg_epu8(s0, s1);
854
79.1k
  _mm256_storeu_si256((__m256i *)dst, d);
855
79.1k
}
856
857
static INLINE __m128i x_convolve_2tap_2x2_sse4_1(const uint8_t *const src,
858
                                                 const ptrdiff_t stride,
859
39.3k
                                                 const __m128i coeffs[1]) {
860
39.3k
  const __m128i sfl =
861
39.3k
      _mm_setr_epi8(0, 1, 1, 2, 4, 5, 5, 6, 0, 0, 0, 0, 0, 0, 0, 0);
862
39.3k
  const __m128i s_128 = load_u8_4x2_sse4_1(src, stride);
863
39.3k
  const __m128i ss = _mm_shuffle_epi8(s_128, sfl);
864
39.3k
  return convolve_2tap_ssse3(&ss, coeffs);
865
39.3k
}
convolve_2d_avx2.c:x_convolve_2tap_2x2_sse4_1
Line
Count
Source
859
33.6k
                                                 const __m128i coeffs[1]) {
860
33.6k
  const __m128i sfl =
861
33.6k
      _mm_setr_epi8(0, 1, 1, 2, 4, 5, 5, 6, 0, 0, 0, 0, 0, 0, 0, 0);
862
33.6k
  const __m128i s_128 = load_u8_4x2_sse4_1(src, stride);
863
33.6k
  const __m128i ss = _mm_shuffle_epi8(s_128, sfl);
864
33.6k
  return convolve_2tap_ssse3(&ss, coeffs);
865
33.6k
}
convolve_avx2.c:x_convolve_2tap_2x2_sse4_1
Line
Count
Source
859
5.69k
                                                 const __m128i coeffs[1]) {
860
5.69k
  const __m128i sfl =
861
5.69k
      _mm_setr_epi8(0, 1, 1, 2, 4, 5, 5, 6, 0, 0, 0, 0, 0, 0, 0, 0);
862
5.69k
  const __m128i s_128 = load_u8_4x2_sse4_1(src, stride);
863
5.69k
  const __m128i ss = _mm_shuffle_epi8(s_128, sfl);
864
5.69k
  return convolve_2tap_ssse3(&ss, coeffs);
865
5.69k
}
866
867
static INLINE __m128i x_convolve_2tap_4x2_ssse3(const uint8_t *const src,
868
                                                const ptrdiff_t stride,
869
174k
                                                const __m128i coeffs[1]) {
870
174k
  const __m128i sfl =
871
174k
      _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12);
872
174k
  const __m128i s_128 = load_u8_8x2_sse2(src, stride);
873
174k
  const __m128i ss = _mm_shuffle_epi8(s_128, sfl);
874
174k
  return convolve_2tap_ssse3(&ss, coeffs);
875
174k
}
convolve_2d_avx2.c:x_convolve_2tap_4x2_ssse3
Line
Count
Source
869
140k
                                                const __m128i coeffs[1]) {
870
140k
  const __m128i sfl =
871
140k
      _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12);
872
140k
  const __m128i s_128 = load_u8_8x2_sse2(src, stride);
873
140k
  const __m128i ss = _mm_shuffle_epi8(s_128, sfl);
874
140k
  return convolve_2tap_ssse3(&ss, coeffs);
875
140k
}
convolve_avx2.c:x_convolve_2tap_4x2_ssse3
Line
Count
Source
869
33.3k
                                                const __m128i coeffs[1]) {
870
33.3k
  const __m128i sfl =
871
33.3k
      _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12);
872
33.3k
  const __m128i s_128 = load_u8_8x2_sse2(src, stride);
873
33.3k
  const __m128i ss = _mm_shuffle_epi8(s_128, sfl);
874
33.3k
  return convolve_2tap_ssse3(&ss, coeffs);
875
33.3k
}
876
877
static INLINE void x_convolve_2tap_8x2_ssse3(const uint8_t *const src,
878
                                             const ptrdiff_t stride,
879
                                             const __m128i coeffs[1],
880
175k
                                             __m128i r[2]) {
881
175k
  __m128i ss[2];
882
175k
  const __m128i s00 = _mm_loadu_si128((__m128i *)src);
883
175k
  const __m128i s10 = _mm_loadu_si128((__m128i *)(src + stride));
884
175k
  const __m128i s01 = _mm_srli_si128(s00, 1);
885
175k
  const __m128i s11 = _mm_srli_si128(s10, 1);
886
175k
  ss[0] = _mm_unpacklo_epi8(s00, s01);
887
175k
  ss[1] = _mm_unpacklo_epi8(s10, s11);
888
889
175k
  r[0] = convolve_2tap_ssse3(&ss[0], coeffs);
890
175k
  r[1] = convolve_2tap_ssse3(&ss[1], coeffs);
891
175k
}
convolve_2d_avx2.c:x_convolve_2tap_8x2_ssse3
Line
Count
Source
880
141k
                                             __m128i r[2]) {
881
141k
  __m128i ss[2];
882
141k
  const __m128i s00 = _mm_loadu_si128((__m128i *)src);
883
141k
  const __m128i s10 = _mm_loadu_si128((__m128i *)(src + stride));
884
141k
  const __m128i s01 = _mm_srli_si128(s00, 1);
885
141k
  const __m128i s11 = _mm_srli_si128(s10, 1);
886
141k
  ss[0] = _mm_unpacklo_epi8(s00, s01);
887
141k
  ss[1] = _mm_unpacklo_epi8(s10, s11);
888
889
141k
  r[0] = convolve_2tap_ssse3(&ss[0], coeffs);
890
141k
  r[1] = convolve_2tap_ssse3(&ss[1], coeffs);
891
141k
}
convolve_avx2.c:x_convolve_2tap_8x2_ssse3
Line
Count
Source
880
34.5k
                                             __m128i r[2]) {
881
34.5k
  __m128i ss[2];
882
34.5k
  const __m128i s00 = _mm_loadu_si128((__m128i *)src);
883
34.5k
  const __m128i s10 = _mm_loadu_si128((__m128i *)(src + stride));
884
34.5k
  const __m128i s01 = _mm_srli_si128(s00, 1);
885
34.5k
  const __m128i s11 = _mm_srli_si128(s10, 1);
886
34.5k
  ss[0] = _mm_unpacklo_epi8(s00, s01);
887
34.5k
  ss[1] = _mm_unpacklo_epi8(s10, s11);
888
889
34.5k
  r[0] = convolve_2tap_ssse3(&ss[0], coeffs);
890
34.5k
  r[1] = convolve_2tap_ssse3(&ss[1], coeffs);
891
34.5k
}
892
893
static INLINE __m256i x_convolve_2tap_8x2_avx2(const uint8_t *const src,
894
                                               const ptrdiff_t stride,
895
0
                                               const __m256i coeffs[1]) {
896
0
  __m128i s_128[2][2];
897
0
  __m256i s_256[2];
898
0
899
0
  s_128[0][0] = _mm_loadu_si128((__m128i *)src);
900
0
  s_128[1][0] = _mm_loadu_si128((__m128i *)(src + stride));
901
0
  s_128[0][1] = _mm_srli_si128(s_128[0][0], 1);
902
0
  s_128[1][1] = _mm_srli_si128(s_128[1][0], 1);
903
0
  s_256[0] = _mm256_setr_m128i(s_128[0][0], s_128[1][0]);
904
0
  s_256[1] = _mm256_setr_m128i(s_128[0][1], s_128[1][1]);
905
0
  const __m256i ss = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
906
0
  return convolve_2tap_avx2(&ss, coeffs);
907
0
}
Unexecuted instantiation: convolve_2d_avx2.c:x_convolve_2tap_8x2_avx2
Unexecuted instantiation: convolve_avx2.c:x_convolve_2tap_8x2_avx2
908
909
static INLINE void x_convolve_2tap_16x2_avx2(const uint8_t *const src,
910
                                             const ptrdiff_t stride,
911
                                             const __m256i coeffs[1],
912
93.9k
                                             __m256i r[2]) {
913
93.9k
  const __m256i s0_256 = loadu_8bit_16x2_avx2(src, stride);
914
93.9k
  const __m256i s1_256 = loadu_8bit_16x2_avx2(src + 1, stride);
915
93.9k
  const __m256i s0 = _mm256_unpacklo_epi8(s0_256, s1_256);
916
93.9k
  const __m256i s1 = _mm256_unpackhi_epi8(s0_256, s1_256);
917
93.9k
  r[0] = convolve_2tap_avx2(&s0, coeffs);
918
93.9k
  r[1] = convolve_2tap_avx2(&s1, coeffs);
919
93.9k
}
convolve_2d_avx2.c:x_convolve_2tap_16x2_avx2
Line
Count
Source
912
79.8k
                                             __m256i r[2]) {
913
79.8k
  const __m256i s0_256 = loadu_8bit_16x2_avx2(src, stride);
914
79.8k
  const __m256i s1_256 = loadu_8bit_16x2_avx2(src + 1, stride);
915
79.8k
  const __m256i s0 = _mm256_unpacklo_epi8(s0_256, s1_256);
916
79.8k
  const __m256i s1 = _mm256_unpackhi_epi8(s0_256, s1_256);
917
79.8k
  r[0] = convolve_2tap_avx2(&s0, coeffs);
918
79.8k
  r[1] = convolve_2tap_avx2(&s1, coeffs);
919
79.8k
}
convolve_avx2.c:x_convolve_2tap_16x2_avx2
Line
Count
Source
912
14.1k
                                             __m256i r[2]) {
913
14.1k
  const __m256i s0_256 = loadu_8bit_16x2_avx2(src, stride);
914
14.1k
  const __m256i s1_256 = loadu_8bit_16x2_avx2(src + 1, stride);
915
14.1k
  const __m256i s0 = _mm256_unpacklo_epi8(s0_256, s1_256);
916
14.1k
  const __m256i s1 = _mm256_unpackhi_epi8(s0_256, s1_256);
917
14.1k
  r[0] = convolve_2tap_avx2(&s0, coeffs);
918
14.1k
  r[1] = convolve_2tap_avx2(&s1, coeffs);
919
14.1k
}
920
921
static INLINE void x_convolve_2tap_32_avx2(const uint8_t *const src,
922
                                           const __m256i coeffs[1],
923
104k
                                           __m256i r[2]) {
924
104k
  const __m256i s0 = _mm256_loadu_si256((__m256i *)src);
925
104k
  const __m256i s1 = _mm256_loadu_si256((__m256i *)(src + 1));
926
104k
  const __m256i ss0 = _mm256_unpacklo_epi8(s0, s1);
927
104k
  const __m256i ss1 = _mm256_unpackhi_epi8(s0, s1);
928
929
104k
  r[0] = convolve_2tap_avx2(&ss0, coeffs);
930
104k
  r[1] = convolve_2tap_avx2(&ss1, coeffs);
931
104k
}
Unexecuted instantiation: convolve_2d_avx2.c:x_convolve_2tap_32_avx2
convolve_avx2.c:x_convolve_2tap_32_avx2
Line
Count
Source
923
104k
                                           __m256i r[2]) {
924
104k
  const __m256i s0 = _mm256_loadu_si256((__m256i *)src);
925
104k
  const __m256i s1 = _mm256_loadu_si256((__m256i *)(src + 1));
926
104k
  const __m256i ss0 = _mm256_unpacklo_epi8(s0, s1);
927
104k
  const __m256i ss1 = _mm256_unpackhi_epi8(s0, s1);
928
929
104k
  r[0] = convolve_2tap_avx2(&ss0, coeffs);
930
104k
  r[1] = convolve_2tap_avx2(&ss1, coeffs);
931
104k
}
932
933
static INLINE __m128i x_convolve_4tap_2x2_ssse3(const uint8_t *const src,
934
                                                const ptrdiff_t stride,
935
2.21M
                                                const __m128i coeffs[2]) {
936
2.21M
  const __m128i sfl0 =
937
2.21M
      _mm_setr_epi8(0, 1, 1, 2, 8, 9, 9, 10, 0, 0, 0, 0, 0, 0, 0, 0);
938
2.21M
  const __m128i sfl1 =
939
2.21M
      _mm_setr_epi8(2, 3, 3, 4, 10, 11, 11, 12, 0, 0, 0, 0, 0, 0, 0, 0);
940
2.21M
  const __m128i s = load_u8_8x2_sse2(src, stride);
941
2.21M
  __m128i ss[2];
942
943
2.21M
  ss[0] = _mm_shuffle_epi8(s, sfl0);
944
2.21M
  ss[1] = _mm_shuffle_epi8(s, sfl1);
945
2.21M
  return convolve_4tap_ssse3(ss, coeffs);
946
2.21M
}
convolve_2d_avx2.c:x_convolve_4tap_2x2_ssse3
Line
Count
Source
935
1.87M
                                                const __m128i coeffs[2]) {
936
1.87M
  const __m128i sfl0 =
937
1.87M
      _mm_setr_epi8(0, 1, 1, 2, 8, 9, 9, 10, 0, 0, 0, 0, 0, 0, 0, 0);
938
1.87M
  const __m128i sfl1 =
939
1.87M
      _mm_setr_epi8(2, 3, 3, 4, 10, 11, 11, 12, 0, 0, 0, 0, 0, 0, 0, 0);
940
1.87M
  const __m128i s = load_u8_8x2_sse2(src, stride);
941
1.87M
  __m128i ss[2];
942
943
1.87M
  ss[0] = _mm_shuffle_epi8(s, sfl0);
944
1.87M
  ss[1] = _mm_shuffle_epi8(s, sfl1);
945
1.87M
  return convolve_4tap_ssse3(ss, coeffs);
946
1.87M
}
convolve_avx2.c:x_convolve_4tap_2x2_ssse3
Line
Count
Source
935
334k
                                                const __m128i coeffs[2]) {
936
334k
  const __m128i sfl0 =
937
334k
      _mm_setr_epi8(0, 1, 1, 2, 8, 9, 9, 10, 0, 0, 0, 0, 0, 0, 0, 0);
938
334k
  const __m128i sfl1 =
939
334k
      _mm_setr_epi8(2, 3, 3, 4, 10, 11, 11, 12, 0, 0, 0, 0, 0, 0, 0, 0);
940
334k
  const __m128i s = load_u8_8x2_sse2(src, stride);
941
334k
  __m128i ss[2];
942
943
334k
  ss[0] = _mm_shuffle_epi8(s, sfl0);
944
334k
  ss[1] = _mm_shuffle_epi8(s, sfl1);
945
334k
  return convolve_4tap_ssse3(ss, coeffs);
946
334k
}
947
948
static INLINE __m128i x_convolve_4tap_4x2_ssse3(const uint8_t *const src,
949
                                                const ptrdiff_t stride,
950
9.53M
                                                const __m128i coeffs[2]) {
951
9.53M
  const __m128i s = load_u8_8x2_sse2(src, stride);
952
9.53M
  const __m128i sfl0 =
953
9.53M
      _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12);
954
9.53M
  const __m128i sfl1 =
955
9.53M
      _mm_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 10, 11, 11, 12, 12, 13, 13, 14);
956
9.53M
  __m128i ss[2];
957
958
9.53M
  ss[0] = _mm_shuffle_epi8(s, sfl0);
959
9.53M
  ss[1] = _mm_shuffle_epi8(s, sfl1);
960
9.53M
  return convolve_4tap_ssse3(ss, coeffs);
961
9.53M
}
convolve_2d_avx2.c:x_convolve_4tap_4x2_ssse3
Line
Count
Source
950
7.84M
                                                const __m128i coeffs[2]) {
951
7.84M
  const __m128i s = load_u8_8x2_sse2(src, stride);
952
7.84M
  const __m128i sfl0 =
953
7.84M
      _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12);
954
7.84M
  const __m128i sfl1 =
955
7.84M
      _mm_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 10, 11, 11, 12, 12, 13, 13, 14);
956
7.84M
  __m128i ss[2];
957
958
7.84M
  ss[0] = _mm_shuffle_epi8(s, sfl0);
959
7.84M
  ss[1] = _mm_shuffle_epi8(s, sfl1);
960
7.84M
  return convolve_4tap_ssse3(ss, coeffs);
961
7.84M
}
convolve_avx2.c:x_convolve_4tap_4x2_ssse3
Line
Count
Source
950
1.68M
                                                const __m128i coeffs[2]) {
951
1.68M
  const __m128i s = load_u8_8x2_sse2(src, stride);
952
1.68M
  const __m128i sfl0 =
953
1.68M
      _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12);
954
1.68M
  const __m128i sfl1 =
955
1.68M
      _mm_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 10, 11, 11, 12, 12, 13, 13, 14);
956
1.68M
  __m128i ss[2];
957
958
1.68M
  ss[0] = _mm_shuffle_epi8(s, sfl0);
959
1.68M
  ss[1] = _mm_shuffle_epi8(s, sfl1);
960
1.68M
  return convolve_4tap_ssse3(ss, coeffs);
961
1.68M
}
962
963
static INLINE __m256i x_convolve_4tap_8x2_avx2(const uint8_t *const src,
964
                                               const ptrdiff_t stride,
965
                                               const __m256i coeffs[2],
966
1.01M
                                               const __m256i filt[2]) {
967
1.01M
  const __m256i s_256 = loadu_8bit_16x2_avx2(src, stride);
968
1.01M
  return x_convolve_4tap_avx2(s_256, coeffs, filt);
969
1.01M
}
convolve_2d_avx2.c:x_convolve_4tap_8x2_avx2
Line
Count
Source
966
1.01M
                                               const __m256i filt[2]) {
967
1.01M
  const __m256i s_256 = loadu_8bit_16x2_avx2(src, stride);
968
1.01M
  return x_convolve_4tap_avx2(s_256, coeffs, filt);
969
1.01M
}
Unexecuted instantiation: convolve_avx2.c:x_convolve_4tap_8x2_avx2
970
971
static INLINE void x_convolve_4tap_16x2_avx2(const uint8_t *const src,
972
                                             const int32_t src_stride,
973
                                             const __m256i coeffs[2],
974
                                             const __m256i filt[2],
975
286k
                                             __m256i r[2]) {
976
286k
  r[0] = x_convolve_4tap_8x2_avx2(src + 0, src_stride, coeffs, filt);
977
286k
  r[1] = x_convolve_4tap_8x2_avx2(src + 8, src_stride, coeffs, filt);
978
286k
}
convolve_2d_avx2.c:x_convolve_4tap_16x2_avx2
Line
Count
Source
975
286k
                                             __m256i r[2]) {
976
286k
  r[0] = x_convolve_4tap_8x2_avx2(src + 0, src_stride, coeffs, filt);
977
286k
  r[1] = x_convolve_4tap_8x2_avx2(src + 8, src_stride, coeffs, filt);
978
286k
}
Unexecuted instantiation: convolve_avx2.c:x_convolve_4tap_16x2_avx2
979
980
static INLINE void x_convolve_4tap_32_avx2(const uint8_t *const src,
981
                                           const __m256i coeffs[2],
982
                                           const __m256i filt[2],
983
675k
                                           __m256i r[2]) {
984
675k
  const __m256i s0_256 = _mm256_loadu_si256((__m256i *)src);
985
675k
  const __m256i s1_256 = _mm256_loadu_si256((__m256i *)(src + 8));
986
987
675k
  r[0] = x_convolve_4tap_avx2(s0_256, coeffs, filt);
988
675k
  r[1] = x_convolve_4tap_avx2(s1_256, coeffs, filt);
989
675k
}
convolve_2d_avx2.c:x_convolve_4tap_32_avx2
Line
Count
Source
983
675k
                                           __m256i r[2]) {
984
675k
  const __m256i s0_256 = _mm256_loadu_si256((__m256i *)src);
985
675k
  const __m256i s1_256 = _mm256_loadu_si256((__m256i *)(src + 8));
986
987
675k
  r[0] = x_convolve_4tap_avx2(s0_256, coeffs, filt);
988
675k
  r[1] = x_convolve_4tap_avx2(s1_256, coeffs, filt);
989
675k
}
Unexecuted instantiation: convolve_avx2.c:x_convolve_4tap_32_avx2
990
991
static INLINE __m128i x_convolve_6tap_2x2_ssse3(const uint8_t *const src,
992
                                                const ptrdiff_t stride,
993
0
                                                const __m128i coeffs[3]) {
994
0
  const __m128i sfl0 =
995
0
      _mm_setr_epi8(0, 1, 1, 2, 8, 9, 9, 10, 0, 0, 0, 0, 0, 0, 0, 0);
996
0
  const __m128i sfl1 =
997
0
      _mm_setr_epi8(2, 3, 3, 4, 10, 11, 11, 12, 0, 0, 0, 0, 0, 0, 0, 0);
998
0
  const __m128i sfl2 =
999
0
      _mm_setr_epi8(4, 5, 5, 6, 12, 13, 13, 14, 0, 0, 0, 0, 0, 0, 0, 0);
1000
1001
0
  const __m128i s = load_u8_8x2_sse2(src, stride);
1002
0
  __m128i ss[3];
1003
1004
0
  ss[0] = _mm_shuffle_epi8(s, sfl0);
1005
0
  ss[1] = _mm_shuffle_epi8(s, sfl1);
1006
0
  ss[2] = _mm_shuffle_epi8(s, sfl2);
1007
0
  return convolve_6tap_ssse3(ss, coeffs);
1008
0
}
Unexecuted instantiation: convolve_2d_avx2.c:x_convolve_6tap_2x2_ssse3
Unexecuted instantiation: convolve_avx2.c:x_convolve_6tap_2x2_ssse3
1009
1010
static INLINE __m128i x_convolve_6tap_4x2_ssse3(const uint8_t *const src,
1011
                                                const ptrdiff_t stride,
1012
0
                                                const __m128i coeffs[3]) {
1013
0
  const __m128i s = load_u8_8x2_sse2(src, stride);
1014
0
  const __m128i sfl0 =
1015
0
      _mm_setr_epi8(0, 1, 1, 2, 8, 9, 9, 10, 0, 0, 0, 0, 0, 0, 0, 0);
1016
0
  const __m128i sfl1 =
1017
0
      _mm_setr_epi8(2, 3, 3, 4, 10, 11, 11, 12, 0, 0, 0, 0, 0, 0, 0, 0);
1018
0
  const __m128i sfl2 =
1019
0
      _mm_setr_epi8(4, 5, 5, 6, 12, 13, 13, 14, 0, 0, 0, 0, 0, 0, 0, 0);
1020
0
  __m128i ss[3];
1021
1022
0
  ss[0] = _mm_shuffle_epi8(s, sfl0);
1023
0
  ss[1] = _mm_shuffle_epi8(s, sfl1);
1024
0
  ss[2] = _mm_shuffle_epi8(s, sfl2);
1025
0
  return convolve_6tap_ssse3(ss, coeffs);
1026
0
}
Unexecuted instantiation: convolve_2d_avx2.c:x_convolve_6tap_4x2_ssse3
Unexecuted instantiation: convolve_avx2.c:x_convolve_6tap_4x2_ssse3
1027
1028
static INLINE __m256i x_convolve_6tap_8x2_avx2(const uint8_t *const src,
1029
                                               const ptrdiff_t stride,
1030
                                               const __m256i coeffs[3],
1031
19.8M
                                               const __m256i filt[3]) {
1032
19.8M
  const __m256i s_256 = loadu_8bit_16x2_avx2(src, stride);
1033
19.8M
  return x_convolve_6tap_avx2(s_256, coeffs, filt);
1034
19.8M
}
convolve_2d_avx2.c:x_convolve_6tap_8x2_avx2
Line
Count
Source
1031
15.8M
                                               const __m256i filt[3]) {
1032
15.8M
  const __m256i s_256 = loadu_8bit_16x2_avx2(src, stride);
1033
15.8M
  return x_convolve_6tap_avx2(s_256, coeffs, filt);
1034
15.8M
}
convolve_avx2.c:x_convolve_6tap_8x2_avx2
Line
Count
Source
1031
3.93M
                                               const __m256i filt[3]) {
1032
3.93M
  const __m256i s_256 = loadu_8bit_16x2_avx2(src, stride);
1033
3.93M
  return x_convolve_6tap_avx2(s_256, coeffs, filt);
1034
3.93M
}
1035
1036
static INLINE void x_convolve_6tap_16x2_avx2(const uint8_t *const src,
1037
                                             const int32_t src_stride,
1038
                                             const __m256i coeffs[3],
1039
                                             const __m256i filt[3],
1040
5.46M
                                             __m256i r[2]) {
1041
5.46M
  r[0] = x_convolve_6tap_8x2_avx2(src + 0, src_stride, coeffs, filt);
1042
5.46M
  r[1] = x_convolve_6tap_8x2_avx2(src + 8, src_stride, coeffs, filt);
1043
5.46M
}
convolve_2d_avx2.c:x_convolve_6tap_16x2_avx2
Line
Count
Source
1040
4.29M
                                             __m256i r[2]) {
1041
4.29M
  r[0] = x_convolve_6tap_8x2_avx2(src + 0, src_stride, coeffs, filt);
1042
4.29M
  r[1] = x_convolve_6tap_8x2_avx2(src + 8, src_stride, coeffs, filt);
1043
4.29M
}
convolve_avx2.c:x_convolve_6tap_16x2_avx2
Line
Count
Source
1040
1.16M
                                             __m256i r[2]) {
1041
1.16M
  r[0] = x_convolve_6tap_8x2_avx2(src + 0, src_stride, coeffs, filt);
1042
1.16M
  r[1] = x_convolve_6tap_8x2_avx2(src + 8, src_stride, coeffs, filt);
1043
1.16M
}
1044
1045
static INLINE void x_convolve_6tap_32_avx2(const uint8_t *const src,
1046
                                           const __m256i coeffs[3],
1047
                                           const __m256i filt[3],
1048
9.48M
                                           __m256i r[2]) {
1049
9.48M
  const __m256i s0_256 = _mm256_loadu_si256((__m256i *)src);
1050
9.48M
  const __m256i s1_256 = _mm256_loadu_si256((__m256i *)(src + 8));
1051
1052
9.48M
  r[0] = x_convolve_6tap_avx2(s0_256, coeffs, filt);
1053
9.48M
  r[1] = x_convolve_6tap_avx2(s1_256, coeffs, filt);
1054
9.48M
}
convolve_2d_avx2.c:x_convolve_6tap_32_avx2
Line
Count
Source
1048
7.00M
                                           __m256i r[2]) {
1049
7.00M
  const __m256i s0_256 = _mm256_loadu_si256((__m256i *)src);
1050
7.00M
  const __m256i s1_256 = _mm256_loadu_si256((__m256i *)(src + 8));
1051
1052
7.00M
  r[0] = x_convolve_6tap_avx2(s0_256, coeffs, filt);
1053
7.00M
  r[1] = x_convolve_6tap_avx2(s1_256, coeffs, filt);
1054
7.00M
}
convolve_avx2.c:x_convolve_6tap_32_avx2
Line
Count
Source
1048
2.47M
                                           __m256i r[2]) {
1049
2.47M
  const __m256i s0_256 = _mm256_loadu_si256((__m256i *)src);
1050
2.47M
  const __m256i s1_256 = _mm256_loadu_si256((__m256i *)(src + 8));
1051
1052
2.47M
  r[0] = x_convolve_6tap_avx2(s0_256, coeffs, filt);
1053
2.47M
  r[1] = x_convolve_6tap_avx2(s1_256, coeffs, filt);
1054
2.47M
}
1055
1056
static INLINE __m256i x_convolve_8tap_8x2_avx2(const uint8_t *const src,
1057
                                               const ptrdiff_t stride,
1058
                                               const __m256i coeffs[4],
1059
1.05M
                                               const __m256i filt[4]) {
1060
1.05M
  const __m256i s_256 = loadu_8bit_16x2_avx2(src, stride);
1061
1.05M
  return x_convolve_8tap_avx2(s_256, coeffs, filt);
1062
1.05M
}
convolve_2d_avx2.c:x_convolve_8tap_8x2_avx2
Line
Count
Source
1059
788k
                                               const __m256i filt[4]) {
1060
788k
  const __m256i s_256 = loadu_8bit_16x2_avx2(src, stride);
1061
788k
  return x_convolve_8tap_avx2(s_256, coeffs, filt);
1062
788k
}
convolve_avx2.c:x_convolve_8tap_8x2_avx2
Line
Count
Source
1059
269k
                                               const __m256i filt[4]) {
1060
269k
  const __m256i s_256 = loadu_8bit_16x2_avx2(src, stride);
1061
269k
  return x_convolve_8tap_avx2(s_256, coeffs, filt);
1062
269k
}
1063
1064
static AOM_FORCE_INLINE void x_convolve_8tap_16x2_avx2(const uint8_t *const src,
1065
                                                       const int32_t src_stride,
1066
                                                       const __m256i coeffs[4],
1067
                                                       const __m256i filt[4],
1068
289k
                                                       __m256i r[2]) {
1069
289k
  r[0] = x_convolve_8tap_8x2_avx2(src + 0, src_stride, coeffs, filt);
1070
289k
  r[1] = x_convolve_8tap_8x2_avx2(src + 8, src_stride, coeffs, filt);
1071
289k
}
convolve_2d_avx2.c:x_convolve_8tap_16x2_avx2
Line
Count
Source
1068
213k
                                                       __m256i r[2]) {
1069
213k
  r[0] = x_convolve_8tap_8x2_avx2(src + 0, src_stride, coeffs, filt);
1070
213k
  r[1] = x_convolve_8tap_8x2_avx2(src + 8, src_stride, coeffs, filt);
1071
213k
}
convolve_avx2.c:x_convolve_8tap_16x2_avx2
Line
Count
Source
1068
76.1k
                                                       __m256i r[2]) {
1069
76.1k
  r[0] = x_convolve_8tap_8x2_avx2(src + 0, src_stride, coeffs, filt);
1070
76.1k
  r[1] = x_convolve_8tap_8x2_avx2(src + 8, src_stride, coeffs, filt);
1071
76.1k
}
1072
1073
static AOM_FORCE_INLINE void x_convolve_8tap_32_avx2(const uint8_t *const src,
1074
                                                     const __m256i coeffs[4],
1075
                                                     const __m256i filt[4],
1076
1.28M
                                                     __m256i r[2]) {
1077
1.28M
  const __m256i s0_256 = _mm256_loadu_si256((__m256i *)src);
1078
1.28M
  const __m256i s1_256 = _mm256_loadu_si256((__m256i *)(src + 8));
1079
1080
1.28M
  r[0] = x_convolve_8tap_avx2(s0_256, coeffs, filt);
1081
1.28M
  r[1] = x_convolve_8tap_avx2(s1_256, coeffs, filt);
1082
1.28M
}
convolve_2d_avx2.c:x_convolve_8tap_32_avx2
Line
Count
Source
1076
1.00M
                                                     __m256i r[2]) {
1077
1.00M
  const __m256i s0_256 = _mm256_loadu_si256((__m256i *)src);
1078
1.00M
  const __m256i s1_256 = _mm256_loadu_si256((__m256i *)(src + 8));
1079
1080
1.00M
  r[0] = x_convolve_8tap_avx2(s0_256, coeffs, filt);
1081
1.00M
  r[1] = x_convolve_8tap_avx2(s1_256, coeffs, filt);
1082
1.00M
}
convolve_avx2.c:x_convolve_8tap_32_avx2
Line
Count
Source
1076
284k
                                                     __m256i r[2]) {
1077
284k
  const __m256i s0_256 = _mm256_loadu_si256((__m256i *)src);
1078
284k
  const __m256i s1_256 = _mm256_loadu_si256((__m256i *)(src + 8));
1079
1080
284k
  r[0] = x_convolve_8tap_avx2(s0_256, coeffs, filt);
1081
284k
  r[1] = x_convolve_8tap_avx2(s1_256, coeffs, filt);
1082
284k
}
1083
1084
static INLINE __m128i y_convolve_2tap_2x2_ssse3(const uint8_t *const src,
1085
                                                const ptrdiff_t stride,
1086
                                                const __m128i coeffs[1],
1087
6.89k
                                                __m128i s_16[2]) {
1088
6.89k
  __m128i s_128[2];
1089
1090
6.89k
  s_16[1] = _mm_cvtsi32_si128(*(int16_t *)(src + stride));
1091
6.89k
  s_128[0] = _mm_unpacklo_epi16(s_16[0], s_16[1]);
1092
6.89k
  s_16[0] = _mm_cvtsi32_si128(*(int16_t *)(src + 2 * stride));
1093
6.89k
  s_128[1] = _mm_unpacklo_epi16(s_16[1], s_16[0]);
1094
6.89k
  const __m128i ss = _mm_unpacklo_epi8(s_128[0], s_128[1]);
1095
6.89k
  return convolve_2tap_ssse3(&ss, coeffs);
1096
6.89k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_2tap_2x2_ssse3
convolve_avx2.c:y_convolve_2tap_2x2_ssse3
Line
Count
Source
1087
6.89k
                                                __m128i s_16[2]) {
1088
6.89k
  __m128i s_128[2];
1089
1090
6.89k
  s_16[1] = _mm_cvtsi32_si128(*(int16_t *)(src + stride));
1091
6.89k
  s_128[0] = _mm_unpacklo_epi16(s_16[0], s_16[1]);
1092
6.89k
  s_16[0] = _mm_cvtsi32_si128(*(int16_t *)(src + 2 * stride));
1093
6.89k
  s_128[1] = _mm_unpacklo_epi16(s_16[1], s_16[0]);
1094
6.89k
  const __m128i ss = _mm_unpacklo_epi8(s_128[0], s_128[1]);
1095
6.89k
  return convolve_2tap_ssse3(&ss, coeffs);
1096
6.89k
}
1097
1098
static INLINE __m128i y_convolve_2tap_4x2_ssse3(const uint8_t *const src,
1099
                                                const ptrdiff_t stride,
1100
                                                const __m128i coeffs[1],
1101
26.1k
                                                __m128i s_32[2]) {
1102
26.1k
  __m128i s_128[2];
1103
1104
26.1k
  s_32[1] = _mm_cvtsi32_si128(*(int32_t *)(src + stride));
1105
26.1k
  s_128[0] = _mm_unpacklo_epi32(s_32[0], s_32[1]);
1106
26.1k
  s_32[0] = _mm_cvtsi32_si128(*(int32_t *)(src + 2 * stride));
1107
26.1k
  s_128[1] = _mm_unpacklo_epi32(s_32[1], s_32[0]);
1108
26.1k
  const __m128i ss = _mm_unpacklo_epi8(s_128[0], s_128[1]);
1109
26.1k
  return convolve_2tap_ssse3(&ss, coeffs);
1110
26.1k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_2tap_4x2_ssse3
convolve_avx2.c:y_convolve_2tap_4x2_ssse3
Line
Count
Source
1101
26.1k
                                                __m128i s_32[2]) {
1102
26.1k
  __m128i s_128[2];
1103
1104
26.1k
  s_32[1] = _mm_cvtsi32_si128(*(int32_t *)(src + stride));
1105
26.1k
  s_128[0] = _mm_unpacklo_epi32(s_32[0], s_32[1]);
1106
26.1k
  s_32[0] = _mm_cvtsi32_si128(*(int32_t *)(src + 2 * stride));
1107
26.1k
  s_128[1] = _mm_unpacklo_epi32(s_32[1], s_32[0]);
1108
26.1k
  const __m128i ss = _mm_unpacklo_epi8(s_128[0], s_128[1]);
1109
26.1k
  return convolve_2tap_ssse3(&ss, coeffs);
1110
26.1k
}
1111
1112
static INLINE __m256i y_convolve_2tap_8x2_avx2(const uint8_t *const src,
1113
                                               const ptrdiff_t stride,
1114
                                               const __m256i coeffs[1],
1115
0
                                               __m128i s_64[2]) {
1116
0
  __m256i s_256[2];
1117
0
1118
0
  s_64[1] = _mm_loadl_epi64((__m128i *)(src + stride));
1119
0
  s_256[0] = _mm256_setr_m128i(s_64[0], s_64[1]);
1120
0
  s_64[0] = _mm_loadl_epi64((__m128i *)(src + 2 * stride));
1121
0
  s_256[1] = _mm256_setr_m128i(s_64[1], s_64[0]);
1122
0
  const __m256i ss = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
1123
0
  return convolve_2tap_avx2(&ss, coeffs);
1124
0
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_2tap_8x2_avx2
Unexecuted instantiation: convolve_avx2.c:y_convolve_2tap_8x2_avx2
1125
1126
static INLINE void y_convolve_2tap_16x2_avx2(const uint8_t *const src,
1127
                                             const ptrdiff_t stride,
1128
                                             const __m256i coeffs[1],
1129
15.6k
                                             __m128i s_128[2], __m256i r[2]) {
1130
15.6k
  __m256i s_256[2];
1131
1132
15.6k
  s_128[1] = _mm_loadu_si128((__m128i *)(src + stride));
1133
15.6k
  s_256[0] = _mm256_setr_m128i(s_128[0], s_128[1]);
1134
15.6k
  s_128[0] = _mm_loadu_si128((__m128i *)(src + 2 * stride));
1135
15.6k
  s_256[1] = _mm256_setr_m128i(s_128[1], s_128[0]);
1136
15.6k
  const __m256i ss0 = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
1137
15.6k
  const __m256i ss1 = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
1138
15.6k
  r[0] = convolve_2tap_avx2(&ss0, coeffs);
1139
15.6k
  r[1] = convolve_2tap_avx2(&ss1, coeffs);
1140
15.6k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_2tap_16x2_avx2
convolve_avx2.c:y_convolve_2tap_16x2_avx2
Line
Count
Source
1129
15.6k
                                             __m128i s_128[2], __m256i r[2]) {
1130
15.6k
  __m256i s_256[2];
1131
1132
15.6k
  s_128[1] = _mm_loadu_si128((__m128i *)(src + stride));
1133
15.6k
  s_256[0] = _mm256_setr_m128i(s_128[0], s_128[1]);
1134
15.6k
  s_128[0] = _mm_loadu_si128((__m128i *)(src + 2 * stride));
1135
15.6k
  s_256[1] = _mm256_setr_m128i(s_128[1], s_128[0]);
1136
15.6k
  const __m256i ss0 = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
1137
15.6k
  const __m256i ss1 = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
1138
15.6k
  r[0] = convolve_2tap_avx2(&ss0, coeffs);
1139
15.6k
  r[1] = convolve_2tap_avx2(&ss1, coeffs);
1140
15.6k
}
1141
1142
static INLINE void y_convolve_2tap_32_avx2(const uint8_t *const src,
1143
                                           const __m256i coeffs[1],
1144
                                           const __m256i s0, __m256i *const s1,
1145
129k
                                           __m256i r[2]) {
1146
129k
  *s1 = _mm256_loadu_si256((__m256i *)src);
1147
129k
  const __m256i ss0 = _mm256_unpacklo_epi8(s0, *s1);
1148
129k
  const __m256i ss1 = _mm256_unpackhi_epi8(s0, *s1);
1149
129k
  r[0] = convolve_2tap_avx2(&ss0, coeffs);
1150
129k
  r[1] = convolve_2tap_avx2(&ss1, coeffs);
1151
129k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_2tap_32_avx2
convolve_avx2.c:y_convolve_2tap_32_avx2
Line
Count
Source
1145
129k
                                           __m256i r[2]) {
1146
129k
  *s1 = _mm256_loadu_si256((__m256i *)src);
1147
129k
  const __m256i ss0 = _mm256_unpacklo_epi8(s0, *s1);
1148
129k
  const __m256i ss1 = _mm256_unpackhi_epi8(s0, *s1);
1149
129k
  r[0] = convolve_2tap_avx2(&ss0, coeffs);
1150
129k
  r[1] = convolve_2tap_avx2(&ss1, coeffs);
1151
129k
}
1152
1153
static INLINE __m128i y_convolve_4tap_2x2_ssse3(const uint8_t *const src,
1154
                                                const ptrdiff_t stride,
1155
                                                const __m128i coeffs[2],
1156
                                                __m128i s_16[4],
1157
112k
                                                __m128i ss_128[2]) {
1158
112k
  s_16[3] = _mm_cvtsi32_si128(loadu_int16(src + stride));
1159
112k
  const __m128i src23 = _mm_unpacklo_epi16(s_16[2], s_16[3]);
1160
112k
  s_16[2] = _mm_cvtsi32_si128(loadu_int16(src + 2 * stride));
1161
112k
  const __m128i src34 = _mm_unpacklo_epi16(s_16[3], s_16[2]);
1162
112k
  ss_128[1] = _mm_unpacklo_epi8(src23, src34);
1163
112k
  return convolve_4tap_ssse3(ss_128, coeffs);
1164
112k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_4tap_2x2_ssse3
convolve_avx2.c:y_convolve_4tap_2x2_ssse3
Line
Count
Source
1157
112k
                                                __m128i ss_128[2]) {
1158
112k
  s_16[3] = _mm_cvtsi32_si128(loadu_int16(src + stride));
1159
112k
  const __m128i src23 = _mm_unpacklo_epi16(s_16[2], s_16[3]);
1160
112k
  s_16[2] = _mm_cvtsi32_si128(loadu_int16(src + 2 * stride));
1161
112k
  const __m128i src34 = _mm_unpacklo_epi16(s_16[3], s_16[2]);
1162
112k
  ss_128[1] = _mm_unpacklo_epi8(src23, src34);
1163
112k
  return convolve_4tap_ssse3(ss_128, coeffs);
1164
112k
}
1165
1166
static INLINE __m128i y_convolve_4tap_4x2_ssse3(const uint8_t *const src,
1167
                                                const ptrdiff_t stride,
1168
                                                const __m128i coeffs[2],
1169
                                                __m128i s_32[4],
1170
550k
                                                __m128i ss_128[2]) {
1171
550k
  s_32[3] = _mm_cvtsi32_si128(loadu_int32(src + stride));
1172
550k
  const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
1173
550k
  s_32[2] = _mm_cvtsi32_si128(loadu_int32(src + 2 * stride));
1174
550k
  const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[2]);
1175
550k
  ss_128[1] = _mm_unpacklo_epi8(src23, src34);
1176
550k
  return convolve_4tap_ssse3(ss_128, coeffs);
1177
550k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_4tap_4x2_ssse3
convolve_avx2.c:y_convolve_4tap_4x2_ssse3
Line
Count
Source
1170
550k
                                                __m128i ss_128[2]) {
1171
550k
  s_32[3] = _mm_cvtsi32_si128(loadu_int32(src + stride));
1172
550k
  const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
1173
550k
  s_32[2] = _mm_cvtsi32_si128(loadu_int32(src + 2 * stride));
1174
550k
  const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[2]);
1175
550k
  ss_128[1] = _mm_unpacklo_epi8(src23, src34);
1176
550k
  return convolve_4tap_ssse3(ss_128, coeffs);
1177
550k
}
1178
1179
static INLINE __m256i y_convolve_4tap_8x2_avx2(const uint8_t *const src,
1180
                                               const ptrdiff_t stride,
1181
                                               const __m256i coeffs[2],
1182
                                               __m128i s_64[4],
1183
438k
                                               __m256i ss_256[2]) {
1184
438k
  s_64[3] = _mm_loadl_epi64((__m128i *)(src + stride));
1185
438k
  const __m256i src23 = _mm256_setr_m128i(s_64[2], s_64[3]);
1186
438k
  s_64[2] = _mm_loadl_epi64((__m128i *)(src + 2 * stride));
1187
438k
  const __m256i src34 = _mm256_setr_m128i(s_64[3], s_64[2]);
1188
438k
  ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
1189
438k
  return convolve_4tap_avx2(ss_256, coeffs);
1190
438k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_4tap_8x2_avx2
convolve_avx2.c:y_convolve_4tap_8x2_avx2
Line
Count
Source
1183
438k
                                               __m256i ss_256[2]) {
1184
438k
  s_64[3] = _mm_loadl_epi64((__m128i *)(src + stride));
1185
438k
  const __m256i src23 = _mm256_setr_m128i(s_64[2], s_64[3]);
1186
438k
  s_64[2] = _mm_loadl_epi64((__m128i *)(src + 2 * stride));
1187
438k
  const __m256i src34 = _mm256_setr_m128i(s_64[3], s_64[2]);
1188
438k
  ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
1189
438k
  return convolve_4tap_avx2(ss_256, coeffs);
1190
438k
}
1191
1192
static INLINE void y_convolve_4tap_16x2_avx2(const uint8_t *const src,
1193
                                             const ptrdiff_t stride,
1194
                                             const __m256i coeffs[2],
1195
                                             __m128i s_128[4],
1196
212k
                                             __m256i ss_256[4], __m256i r[2]) {
1197
212k
  s_128[3] = _mm_loadu_si128((__m128i *)(src + stride));
1198
212k
  const __m256i src23 = _mm256_setr_m128i(s_128[2], s_128[3]);
1199
212k
  s_128[2] = _mm_loadu_si128((__m128i *)(src + 2 * stride));
1200
212k
  const __m256i src34 = _mm256_setr_m128i(s_128[3], s_128[2]);
1201
212k
  ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
1202
212k
  ss_256[3] = _mm256_unpackhi_epi8(src23, src34);
1203
212k
  r[0] = convolve_4tap_avx2(ss_256, coeffs);
1204
212k
  r[1] = convolve_4tap_avx2(ss_256 + 2, coeffs);
1205
212k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_4tap_16x2_avx2
convolve_avx2.c:y_convolve_4tap_16x2_avx2
Line
Count
Source
1196
212k
                                             __m256i ss_256[4], __m256i r[2]) {
1197
212k
  s_128[3] = _mm_loadu_si128((__m128i *)(src + stride));
1198
212k
  const __m256i src23 = _mm256_setr_m128i(s_128[2], s_128[3]);
1199
212k
  s_128[2] = _mm_loadu_si128((__m128i *)(src + 2 * stride));
1200
212k
  const __m256i src34 = _mm256_setr_m128i(s_128[3], s_128[2]);
1201
212k
  ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
1202
212k
  ss_256[3] = _mm256_unpackhi_epi8(src23, src34);
1203
212k
  r[0] = convolve_4tap_avx2(ss_256, coeffs);
1204
212k
  r[1] = convolve_4tap_avx2(ss_256 + 2, coeffs);
1205
212k
}
1206
1207
static INLINE __m128i y_convolve_6tap_2x2_ssse3(const uint8_t *const src,
1208
                                                const ptrdiff_t stride,
1209
                                                const __m128i coeffs[3],
1210
                                                __m128i s_16[6],
1211
136k
                                                __m128i ss_128[3]) {
1212
136k
  s_16[5] = _mm_cvtsi32_si128(loadu_int16(src + 3 * stride));
1213
136k
  const __m128i src45 = _mm_unpacklo_epi16(s_16[4], s_16[5]);
1214
136k
  s_16[4] = _mm_cvtsi32_si128(loadu_int16(src + 4 * stride));
1215
136k
  const __m128i src56 = _mm_unpacklo_epi16(s_16[5], s_16[4]);
1216
136k
  ss_128[2] = _mm_unpacklo_epi8(src45, src56);
1217
136k
  return convolve_6tap_ssse3(ss_128, coeffs);
1218
136k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_6tap_2x2_ssse3
convolve_avx2.c:y_convolve_6tap_2x2_ssse3
Line
Count
Source
1211
136k
                                                __m128i ss_128[3]) {
1212
136k
  s_16[5] = _mm_cvtsi32_si128(loadu_int16(src + 3 * stride));
1213
136k
  const __m128i src45 = _mm_unpacklo_epi16(s_16[4], s_16[5]);
1214
136k
  s_16[4] = _mm_cvtsi32_si128(loadu_int16(src + 4 * stride));
1215
136k
  const __m128i src56 = _mm_unpacklo_epi16(s_16[5], s_16[4]);
1216
136k
  ss_128[2] = _mm_unpacklo_epi8(src45, src56);
1217
136k
  return convolve_6tap_ssse3(ss_128, coeffs);
1218
136k
}
1219
1220
static INLINE void y_convolve_4tap_32x2_avx2(
1221
    const uint8_t *const src, const ptrdiff_t stride, const __m256i coeffs[2],
1222
141k
    __m256i s_256[4], __m256i ss_256[4], __m256i tt_256[4], __m256i r[4]) {
1223
141k
  s_256[3] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
1224
141k
  ss_256[1] = _mm256_unpacklo_epi8(s_256[2], s_256[3]);
1225
141k
  ss_256[3] = _mm256_unpackhi_epi8(s_256[2], s_256[3]);
1226
141k
  s_256[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
1227
141k
  tt_256[1] = _mm256_unpacklo_epi8(s_256[3], s_256[2]);
1228
141k
  tt_256[3] = _mm256_unpackhi_epi8(s_256[3], s_256[2]);
1229
141k
  r[0] = convolve_4tap_avx2(ss_256 + 0, coeffs);
1230
141k
  r[1] = convolve_4tap_avx2(ss_256 + 2, coeffs);
1231
141k
  r[2] = convolve_4tap_avx2(tt_256 + 0, coeffs);
1232
141k
  r[3] = convolve_4tap_avx2(tt_256 + 2, coeffs);
1233
141k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_4tap_32x2_avx2
convolve_avx2.c:y_convolve_4tap_32x2_avx2
Line
Count
Source
1222
141k
    __m256i s_256[4], __m256i ss_256[4], __m256i tt_256[4], __m256i r[4]) {
1223
141k
  s_256[3] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
1224
141k
  ss_256[1] = _mm256_unpacklo_epi8(s_256[2], s_256[3]);
1225
141k
  ss_256[3] = _mm256_unpackhi_epi8(s_256[2], s_256[3]);
1226
141k
  s_256[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
1227
141k
  tt_256[1] = _mm256_unpacklo_epi8(s_256[3], s_256[2]);
1228
141k
  tt_256[3] = _mm256_unpackhi_epi8(s_256[3], s_256[2]);
1229
141k
  r[0] = convolve_4tap_avx2(ss_256 + 0, coeffs);
1230
141k
  r[1] = convolve_4tap_avx2(ss_256 + 2, coeffs);
1231
141k
  r[2] = convolve_4tap_avx2(tt_256 + 0, coeffs);
1232
141k
  r[3] = convolve_4tap_avx2(tt_256 + 2, coeffs);
1233
141k
}
1234
1235
static INLINE __m128i y_convolve_6tap_4x2_ssse3(const uint8_t *const src,
1236
                                                const ptrdiff_t stride,
1237
                                                const __m128i coeffs[3],
1238
                                                __m128i s_32[6],
1239
1.03M
                                                __m128i ss_128[3]) {
1240
1.03M
  s_32[5] = _mm_cvtsi32_si128(loadu_int32(src + 3 * stride));
1241
1.03M
  const __m128i src45 = _mm_unpacklo_epi32(s_32[4], s_32[5]);
1242
1.03M
  s_32[4] = _mm_cvtsi32_si128(loadu_int32(src + 4 * stride));
1243
1.03M
  const __m128i src56 = _mm_unpacklo_epi32(s_32[5], s_32[4]);
1244
1.03M
  ss_128[2] = _mm_unpacklo_epi8(src45, src56);
1245
1.03M
  return convolve_6tap_ssse3(ss_128, coeffs);
1246
1.03M
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_6tap_4x2_ssse3
convolve_avx2.c:y_convolve_6tap_4x2_ssse3
Line
Count
Source
1239
1.03M
                                                __m128i ss_128[3]) {
1240
1.03M
  s_32[5] = _mm_cvtsi32_si128(loadu_int32(src + 3 * stride));
1241
1.03M
  const __m128i src45 = _mm_unpacklo_epi32(s_32[4], s_32[5]);
1242
1.03M
  s_32[4] = _mm_cvtsi32_si128(loadu_int32(src + 4 * stride));
1243
1.03M
  const __m128i src56 = _mm_unpacklo_epi32(s_32[5], s_32[4]);
1244
1.03M
  ss_128[2] = _mm_unpacklo_epi8(src45, src56);
1245
1.03M
  return convolve_6tap_ssse3(ss_128, coeffs);
1246
1.03M
}
1247
1248
static INLINE __m256i y_convolve_6tap_8x2_avx2(const uint8_t *const src,
1249
                                               const ptrdiff_t stride,
1250
                                               const __m256i coeffs[3],
1251
                                               __m128i s_64[6],
1252
1.33M
                                               __m256i ss_256[3]) {
1253
1.33M
  s_64[5] = _mm_loadl_epi64((__m128i *)(src + 3 * stride));
1254
1.33M
  const __m256i src45 = _mm256_setr_m128i(s_64[4], s_64[5]);
1255
1.33M
  s_64[4] = _mm_loadl_epi64((__m128i *)(src + 4 * stride));
1256
1.33M
  const __m256i src56 = _mm256_setr_m128i(s_64[5], s_64[4]);
1257
1.33M
  ss_256[2] = _mm256_unpacklo_epi8(src45, src56);
1258
1.33M
  return convolve_6tap_avx2(ss_256, coeffs);
1259
1.33M
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_6tap_8x2_avx2
convolve_avx2.c:y_convolve_6tap_8x2_avx2
Line
Count
Source
1252
1.33M
                                               __m256i ss_256[3]) {
1253
1.33M
  s_64[5] = _mm_loadl_epi64((__m128i *)(src + 3 * stride));
1254
1.33M
  const __m256i src45 = _mm256_setr_m128i(s_64[4], s_64[5]);
1255
1.33M
  s_64[4] = _mm_loadl_epi64((__m128i *)(src + 4 * stride));
1256
1.33M
  const __m256i src56 = _mm256_setr_m128i(s_64[5], s_64[4]);
1257
1.33M
  ss_256[2] = _mm256_unpacklo_epi8(src45, src56);
1258
1.33M
  return convolve_6tap_avx2(ss_256, coeffs);
1259
1.33M
}
1260
1261
static INLINE void y_convolve_6tap_16x2_avx2(const uint8_t *const src,
1262
                                             const ptrdiff_t stride,
1263
                                             const __m256i coeffs[3],
1264
                                             __m128i s_128[6],
1265
1.08M
                                             __m256i ss_256[6], __m256i r[2]) {
1266
1.08M
  s_128[5] = _mm_loadu_si128((__m128i *)(src + 3 * stride));
1267
1.08M
  const __m256i src45 = _mm256_setr_m128i(s_128[4], s_128[5]);
1268
1.08M
  s_128[4] = _mm_loadu_si128((__m128i *)(src + 4 * stride));
1269
1.08M
  const __m256i src56 = _mm256_setr_m128i(s_128[5], s_128[4]);
1270
1.08M
  ss_256[2] = _mm256_unpacklo_epi8(src45, src56);
1271
1.08M
  ss_256[5] = _mm256_unpackhi_epi8(src45, src56);
1272
1.08M
  r[0] = convolve_6tap_avx2(ss_256, coeffs);
1273
1.08M
  r[1] = convolve_6tap_avx2(ss_256 + 3, coeffs);
1274
1.08M
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_6tap_16x2_avx2
convolve_avx2.c:y_convolve_6tap_16x2_avx2
Line
Count
Source
1265
1.08M
                                             __m256i ss_256[6], __m256i r[2]) {
1266
1.08M
  s_128[5] = _mm_loadu_si128((__m128i *)(src + 3 * stride));
1267
1.08M
  const __m256i src45 = _mm256_setr_m128i(s_128[4], s_128[5]);
1268
1.08M
  s_128[4] = _mm_loadu_si128((__m128i *)(src + 4 * stride));
1269
1.08M
  const __m256i src56 = _mm256_setr_m128i(s_128[5], s_128[4]);
1270
1.08M
  ss_256[2] = _mm256_unpacklo_epi8(src45, src56);
1271
1.08M
  ss_256[5] = _mm256_unpackhi_epi8(src45, src56);
1272
1.08M
  r[0] = convolve_6tap_avx2(ss_256, coeffs);
1273
1.08M
  r[1] = convolve_6tap_avx2(ss_256 + 3, coeffs);
1274
1.08M
}
1275
1276
static INLINE void y_convolve_6tap_32x2_avx2(
1277
    const uint8_t *const src, const ptrdiff_t stride, const __m256i coeffs[3],
1278
1.20M
    __m256i s_256[6], __m256i ss_256[6], __m256i tt_256[6], __m256i r[4]) {
1279
1.20M
  s_256[5] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
1280
1.20M
  ss_256[2] = _mm256_unpacklo_epi8(s_256[4], s_256[5]);
1281
1.20M
  ss_256[5] = _mm256_unpackhi_epi8(s_256[4], s_256[5]);
1282
1.20M
  s_256[4] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
1283
1.20M
  tt_256[2] = _mm256_unpacklo_epi8(s_256[5], s_256[4]);
1284
1.20M
  tt_256[5] = _mm256_unpackhi_epi8(s_256[5], s_256[4]);
1285
1.20M
  r[0] = convolve_6tap_avx2(ss_256 + 0, coeffs);
1286
1.20M
  r[1] = convolve_6tap_avx2(ss_256 + 3, coeffs);
1287
1.20M
  r[2] = convolve_6tap_avx2(tt_256 + 0, coeffs);
1288
1.20M
  r[3] = convolve_6tap_avx2(tt_256 + 3, coeffs);
1289
1.20M
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_6tap_32x2_avx2
convolve_avx2.c:y_convolve_6tap_32x2_avx2
Line
Count
Source
1278
1.20M
    __m256i s_256[6], __m256i ss_256[6], __m256i tt_256[6], __m256i r[4]) {
1279
1.20M
  s_256[5] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
1280
1.20M
  ss_256[2] = _mm256_unpacklo_epi8(s_256[4], s_256[5]);
1281
1.20M
  ss_256[5] = _mm256_unpackhi_epi8(s_256[4], s_256[5]);
1282
1.20M
  s_256[4] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
1283
1.20M
  tt_256[2] = _mm256_unpacklo_epi8(s_256[5], s_256[4]);
1284
1.20M
  tt_256[5] = _mm256_unpackhi_epi8(s_256[5], s_256[4]);
1285
1.20M
  r[0] = convolve_6tap_avx2(ss_256 + 0, coeffs);
1286
1.20M
  r[1] = convolve_6tap_avx2(ss_256 + 3, coeffs);
1287
1.20M
  r[2] = convolve_6tap_avx2(tt_256 + 0, coeffs);
1288
1.20M
  r[3] = convolve_6tap_avx2(tt_256 + 3, coeffs);
1289
1.20M
}
1290
1291
static INLINE __m128i y_convolve_8tap_2x2_ssse3(const uint8_t *const src,
1292
                                                const ptrdiff_t stride,
1293
                                                const __m128i coeffs[4],
1294
                                                __m128i s_16[8],
1295
8.29k
                                                __m128i ss_128[4]) {
1296
8.29k
  s_16[7] = _mm_cvtsi32_si128(loadu_int16(src + 7 * stride));
1297
8.29k
  const __m128i src67 = _mm_unpacklo_epi16(s_16[6], s_16[7]);
1298
8.29k
  s_16[6] = _mm_cvtsi32_si128(loadu_int16(src + 8 * stride));
1299
8.29k
  const __m128i src78 = _mm_unpacklo_epi16(s_16[7], s_16[6]);
1300
8.29k
  ss_128[3] = _mm_unpacklo_epi8(src67, src78);
1301
8.29k
  return convolve_8tap_ssse3(ss_128, coeffs);
1302
8.29k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_8tap_2x2_ssse3
convolve_avx2.c:y_convolve_8tap_2x2_ssse3
Line
Count
Source
1295
8.29k
                                                __m128i ss_128[4]) {
1296
8.29k
  s_16[7] = _mm_cvtsi32_si128(loadu_int16(src + 7 * stride));
1297
8.29k
  const __m128i src67 = _mm_unpacklo_epi16(s_16[6], s_16[7]);
1298
8.29k
  s_16[6] = _mm_cvtsi32_si128(loadu_int16(src + 8 * stride));
1299
8.29k
  const __m128i src78 = _mm_unpacklo_epi16(s_16[7], s_16[6]);
1300
8.29k
  ss_128[3] = _mm_unpacklo_epi8(src67, src78);
1301
8.29k
  return convolve_8tap_ssse3(ss_128, coeffs);
1302
8.29k
}
1303
1304
static INLINE __m128i y_convolve_8tap_4x2_ssse3(const uint8_t *const src,
1305
                                                const ptrdiff_t stride,
1306
                                                const __m128i coeffs[4],
1307
                                                __m128i s_32[8],
1308
52.0k
                                                __m128i ss_128[4]) {
1309
52.0k
  s_32[7] = _mm_cvtsi32_si128(loadu_int32(src + 7 * stride));
1310
52.0k
  const __m128i src67 = _mm_unpacklo_epi32(s_32[6], s_32[7]);
1311
52.0k
  s_32[6] = _mm_cvtsi32_si128(loadu_int32(src + 8 * stride));
1312
52.0k
  const __m128i src78 = _mm_unpacklo_epi32(s_32[7], s_32[6]);
1313
52.0k
  ss_128[3] = _mm_unpacklo_epi8(src67, src78);
1314
52.0k
  return convolve_8tap_ssse3(ss_128, coeffs);
1315
52.0k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_8tap_4x2_ssse3
convolve_avx2.c:y_convolve_8tap_4x2_ssse3
Line
Count
Source
1308
52.0k
                                                __m128i ss_128[4]) {
1309
52.0k
  s_32[7] = _mm_cvtsi32_si128(loadu_int32(src + 7 * stride));
1310
52.0k
  const __m128i src67 = _mm_unpacklo_epi32(s_32[6], s_32[7]);
1311
52.0k
  s_32[6] = _mm_cvtsi32_si128(loadu_int32(src + 8 * stride));
1312
52.0k
  const __m128i src78 = _mm_unpacklo_epi32(s_32[7], s_32[6]);
1313
52.0k
  ss_128[3] = _mm_unpacklo_epi8(src67, src78);
1314
52.0k
  return convolve_8tap_ssse3(ss_128, coeffs);
1315
52.0k
}
1316
1317
static INLINE __m256i y_convolve_8tap_8x2_avx2(const uint8_t *const src,
1318
                                               const ptrdiff_t stride,
1319
                                               const __m256i coeffs[4],
1320
                                               __m128i s_64[8],
1321
61.3k
                                               __m256i ss_256[4]) {
1322
61.3k
  s_64[7] = _mm_loadl_epi64((__m128i *)(src + 7 * stride));
1323
61.3k
  const __m256i src67 = _mm256_setr_m128i(s_64[6], s_64[7]);
1324
61.3k
  s_64[6] = _mm_loadl_epi64((__m128i *)(src + 8 * stride));
1325
61.3k
  const __m256i src78 = _mm256_setr_m128i(s_64[7], s_64[6]);
1326
61.3k
  ss_256[3] = _mm256_unpacklo_epi8(src67, src78);
1327
61.3k
  return convolve_8tap_avx2(ss_256, coeffs);
1328
61.3k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_8tap_8x2_avx2
convolve_avx2.c:y_convolve_8tap_8x2_avx2
Line
Count
Source
1321
61.3k
                                               __m256i ss_256[4]) {
1322
61.3k
  s_64[7] = _mm_loadl_epi64((__m128i *)(src + 7 * stride));
1323
61.3k
  const __m256i src67 = _mm256_setr_m128i(s_64[6], s_64[7]);
1324
61.3k
  s_64[6] = _mm_loadl_epi64((__m128i *)(src + 8 * stride));
1325
61.3k
  const __m256i src78 = _mm256_setr_m128i(s_64[7], s_64[6]);
1326
61.3k
  ss_256[3] = _mm256_unpacklo_epi8(src67, src78);
1327
61.3k
  return convolve_8tap_avx2(ss_256, coeffs);
1328
61.3k
}
1329
1330
static INLINE void y_convolve_8tap_16x2_avx2(const uint8_t *const src,
1331
                                             const ptrdiff_t stride,
1332
                                             const __m256i coeffs[4],
1333
                                             __m128i s_128[8],
1334
46.9k
                                             __m256i ss_256[8], __m256i r[2]) {
1335
46.9k
  s_128[7] = _mm_loadu_si128((__m128i *)(src + 7 * stride));
1336
46.9k
  const __m256i src67 = _mm256_setr_m128i(s_128[6], s_128[7]);
1337
46.9k
  s_128[6] = _mm_loadu_si128((__m128i *)(src + 8 * stride));
1338
46.9k
  const __m256i src78 = _mm256_setr_m128i(s_128[7], s_128[6]);
1339
46.9k
  ss_256[3] = _mm256_unpacklo_epi8(src67, src78);
1340
46.9k
  ss_256[7] = _mm256_unpackhi_epi8(src67, src78);
1341
46.9k
  r[0] = convolve_8tap_avx2(ss_256, coeffs);
1342
46.9k
  r[1] = convolve_8tap_avx2(ss_256 + 4, coeffs);
1343
46.9k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_8tap_16x2_avx2
convolve_avx2.c:y_convolve_8tap_16x2_avx2
Line
Count
Source
1334
46.9k
                                             __m256i ss_256[8], __m256i r[2]) {
1335
46.9k
  s_128[7] = _mm_loadu_si128((__m128i *)(src + 7 * stride));
1336
46.9k
  const __m256i src67 = _mm256_setr_m128i(s_128[6], s_128[7]);
1337
46.9k
  s_128[6] = _mm_loadu_si128((__m128i *)(src + 8 * stride));
1338
46.9k
  const __m256i src78 = _mm256_setr_m128i(s_128[7], s_128[6]);
1339
46.9k
  ss_256[3] = _mm256_unpacklo_epi8(src67, src78);
1340
46.9k
  ss_256[7] = _mm256_unpackhi_epi8(src67, src78);
1341
46.9k
  r[0] = convolve_8tap_avx2(ss_256, coeffs);
1342
46.9k
  r[1] = convolve_8tap_avx2(ss_256 + 4, coeffs);
1343
46.9k
}
1344
1345
static INLINE void y_convolve_8tap_32x2_avx2(
1346
    const uint8_t *const src, const ptrdiff_t stride, const __m256i coeffs[4],
1347
92.8k
    __m256i s_256[8], __m256i ss_256[8], __m256i tt_256[8], __m256i r[4]) {
1348
92.8k
  s_256[7] = _mm256_loadu_si256((__m256i *)(src + 7 * stride));
1349
92.8k
  ss_256[3] = _mm256_unpacklo_epi8(s_256[6], s_256[7]);
1350
92.8k
  ss_256[7] = _mm256_unpackhi_epi8(s_256[6], s_256[7]);
1351
92.8k
  s_256[6] = _mm256_loadu_si256((__m256i *)(src + 8 * stride));
1352
92.8k
  tt_256[3] = _mm256_unpacklo_epi8(s_256[7], s_256[6]);
1353
92.8k
  tt_256[7] = _mm256_unpackhi_epi8(s_256[7], s_256[6]);
1354
92.8k
  r[0] = convolve_8tap_avx2(ss_256 + 0, coeffs);
1355
92.8k
  r[1] = convolve_8tap_avx2(ss_256 + 4, coeffs);
1356
92.8k
  r[2] = convolve_8tap_avx2(tt_256 + 0, coeffs);
1357
92.8k
  r[3] = convolve_8tap_avx2(tt_256 + 4, coeffs);
1358
92.8k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_8tap_32x2_avx2
convolve_avx2.c:y_convolve_8tap_32x2_avx2
Line
Count
Source
1347
92.8k
    __m256i s_256[8], __m256i ss_256[8], __m256i tt_256[8], __m256i r[4]) {
1348
92.8k
  s_256[7] = _mm256_loadu_si256((__m256i *)(src + 7 * stride));
1349
92.8k
  ss_256[3] = _mm256_unpacklo_epi8(s_256[6], s_256[7]);
1350
92.8k
  ss_256[7] = _mm256_unpackhi_epi8(s_256[6], s_256[7]);
1351
92.8k
  s_256[6] = _mm256_loadu_si256((__m256i *)(src + 8 * stride));
1352
92.8k
  tt_256[3] = _mm256_unpacklo_epi8(s_256[7], s_256[6]);
1353
92.8k
  tt_256[7] = _mm256_unpackhi_epi8(s_256[7], s_256[6]);
1354
92.8k
  r[0] = convolve_8tap_avx2(ss_256 + 0, coeffs);
1355
92.8k
  r[1] = convolve_8tap_avx2(ss_256 + 4, coeffs);
1356
92.8k
  r[2] = convolve_8tap_avx2(tt_256 + 0, coeffs);
1357
92.8k
  r[3] = convolve_8tap_avx2(tt_256 + 4, coeffs);
1358
92.8k
}
1359
1360
static INLINE void xy_x_convolve_2tap_32_avx2(const uint8_t *const src,
1361
                                              const __m256i coeffs[1],
1362
339k
                                              __m256i r[2]) {
1363
339k
  const __m256i s0 = _mm256_loadu_si256((__m256i *)src);
1364
339k
  const __m256i s1 = _mm256_loadu_si256((__m256i *)(src + 1));
1365
339k
  const __m256i ss0 = _mm256_unpacklo_epi8(s0, s1);
1366
339k
  const __m256i ss1 = _mm256_unpackhi_epi8(s0, s1);
1367
1368
339k
  r[0] = convolve_2tap_avx2(&ss0, coeffs);
1369
339k
  r[1] = convolve_2tap_avx2(&ss1, coeffs);
1370
339k
}
convolve_2d_avx2.c:xy_x_convolve_2tap_32_avx2
Line
Count
Source
1362
339k
                                              __m256i r[2]) {
1363
339k
  const __m256i s0 = _mm256_loadu_si256((__m256i *)src);
1364
339k
  const __m256i s1 = _mm256_loadu_si256((__m256i *)(src + 1));
1365
339k
  const __m256i ss0 = _mm256_unpacklo_epi8(s0, s1);
1366
339k
  const __m256i ss1 = _mm256_unpackhi_epi8(s0, s1);
1367
1368
339k
  r[0] = convolve_2tap_avx2(&ss0, coeffs);
1369
339k
  r[1] = convolve_2tap_avx2(&ss1, coeffs);
1370
339k
}
Unexecuted instantiation: convolve_avx2.c:xy_x_convolve_2tap_32_avx2
1371
1372
static INLINE void xy_x_2tap_32_avx2(const uint8_t *const src,
1373
                                     const __m256i coeffs[1],
1374
339k
                                     int16_t *const dst) {
1375
339k
  __m256i r[2];
1376
1377
339k
  xy_x_convolve_2tap_32_avx2(src, coeffs, r);
1378
339k
  const __m256i d0 = xy_x_round_avx2(r[0]);
1379
339k
  const __m256i d1 = xy_x_round_avx2(r[1]);
1380
339k
  _mm256_storeu_si256((__m256i *)dst, d0);
1381
339k
  _mm256_storeu_si256((__m256i *)(dst + 16), d1);
1382
339k
}
convolve_2d_avx2.c:xy_x_2tap_32_avx2
Line
Count
Source
1374
339k
                                     int16_t *const dst) {
1375
339k
  __m256i r[2];
1376
1377
339k
  xy_x_convolve_2tap_32_avx2(src, coeffs, r);
1378
339k
  const __m256i d0 = xy_x_round_avx2(r[0]);
1379
339k
  const __m256i d1 = xy_x_round_avx2(r[1]);
1380
339k
  _mm256_storeu_si256((__m256i *)dst, d0);
1381
339k
  _mm256_storeu_si256((__m256i *)(dst + 16), d1);
1382
339k
}
Unexecuted instantiation: convolve_avx2.c:xy_x_2tap_32_avx2
1383
1384
static INLINE void xy_x_4tap_32_avx2(const uint8_t *const src,
1385
                                     const __m256i coeffs[2],
1386
                                     const __m256i filt[2],
1387
675k
                                     int16_t *const dst) {
1388
675k
  __m256i r[2];
1389
1390
675k
  x_convolve_4tap_32_avx2(src, coeffs, filt, r);
1391
675k
  const __m256i d0 = xy_x_round_avx2(r[0]);
1392
675k
  const __m256i d1 = xy_x_round_avx2(r[1]);
1393
675k
  _mm256_storeu_si256((__m256i *)dst, d0);
1394
675k
  _mm256_storeu_si256((__m256i *)(dst + 16), d1);
1395
675k
}
convolve_2d_avx2.c:xy_x_4tap_32_avx2
Line
Count
Source
1387
675k
                                     int16_t *const dst) {
1388
675k
  __m256i r[2];
1389
1390
675k
  x_convolve_4tap_32_avx2(src, coeffs, filt, r);
1391
675k
  const __m256i d0 = xy_x_round_avx2(r[0]);
1392
675k
  const __m256i d1 = xy_x_round_avx2(r[1]);
1393
675k
  _mm256_storeu_si256((__m256i *)dst, d0);
1394
675k
  _mm256_storeu_si256((__m256i *)(dst + 16), d1);
1395
675k
}
Unexecuted instantiation: convolve_avx2.c:xy_x_4tap_32_avx2
1396
1397
static INLINE void xy_x_6tap_32_avx2(const uint8_t *const src,
1398
                                     const __m256i coeffs[3],
1399
                                     const __m256i filt[3],
1400
7.00M
                                     int16_t *const dst) {
1401
7.00M
  __m256i r[2];
1402
1403
7.00M
  x_convolve_6tap_32_avx2(src, coeffs, filt, r);
1404
7.00M
  const __m256i d0 = xy_x_round_avx2(r[0]);
1405
7.00M
  const __m256i d1 = xy_x_round_avx2(r[1]);
1406
7.00M
  _mm256_storeu_si256((__m256i *)dst, d0);
1407
7.00M
  _mm256_storeu_si256((__m256i *)(dst + 16), d1);
1408
7.00M
}
convolve_2d_avx2.c:xy_x_6tap_32_avx2
Line
Count
Source
1400
7.00M
                                     int16_t *const dst) {
1401
7.00M
  __m256i r[2];
1402
1403
7.00M
  x_convolve_6tap_32_avx2(src, coeffs, filt, r);
1404
7.00M
  const __m256i d0 = xy_x_round_avx2(r[0]);
1405
7.00M
  const __m256i d1 = xy_x_round_avx2(r[1]);
1406
7.00M
  _mm256_storeu_si256((__m256i *)dst, d0);
1407
7.00M
  _mm256_storeu_si256((__m256i *)(dst + 16), d1);
1408
7.00M
}
Unexecuted instantiation: convolve_avx2.c:xy_x_6tap_32_avx2
1409
1410
static INLINE void xy_x_8tap_32_avx2(const uint8_t *const src,
1411
                                     const __m256i coeffs[4],
1412
                                     const __m256i filt[4],
1413
1.00M
                                     int16_t *const dst) {
1414
1.00M
  __m256i r[2];
1415
1416
1.00M
  x_convolve_8tap_32_avx2(src, coeffs, filt, r);
1417
1.00M
  const __m256i d0 = xy_x_round_avx2(r[0]);
1418
1.00M
  const __m256i d1 = xy_x_round_avx2(r[1]);
1419
1.00M
  _mm256_storeu_si256((__m256i *)dst, d0);
1420
1.00M
  _mm256_storeu_si256((__m256i *)(dst + 16), d1);
1421
1.00M
}
convolve_2d_avx2.c:xy_x_8tap_32_avx2
Line
Count
Source
1413
1.00M
                                     int16_t *const dst) {
1414
1.00M
  __m256i r[2];
1415
1416
1.00M
  x_convolve_8tap_32_avx2(src, coeffs, filt, r);
1417
1.00M
  const __m256i d0 = xy_x_round_avx2(r[0]);
1418
1.00M
  const __m256i d1 = xy_x_round_avx2(r[1]);
1419
1.00M
  _mm256_storeu_si256((__m256i *)dst, d0);
1420
1.00M
  _mm256_storeu_si256((__m256i *)(dst + 16), d1);
1421
1.00M
}
Unexecuted instantiation: convolve_avx2.c:xy_x_8tap_32_avx2
1422
1423
static INLINE __m128i xy_y_convolve_2tap_2x2_sse2(const int16_t *const src,
1424
                                                  __m128i s_32[2],
1425
18.1k
                                                  const __m128i coeffs[1]) {
1426
18.1k
  __m128i s_128[2];
1427
1428
18.1k
  s_32[1] = _mm_cvtsi32_si128(loadu_int32(src + 2));
1429
18.1k
  s_128[0] = _mm_unpacklo_epi32(s_32[0], s_32[1]);
1430
18.1k
  s_32[0] = _mm_cvtsi32_si128(loadu_int32(src + 2 * 2));
1431
18.1k
  s_128[1] = _mm_unpacklo_epi32(s_32[1], s_32[0]);
1432
18.1k
  const __m128i ss = _mm_unpacklo_epi16(s_128[0], s_128[1]);
1433
18.1k
  return convolve16_2tap_sse2(&ss, coeffs);
1434
18.1k
}
convolve_2d_avx2.c:xy_y_convolve_2tap_2x2_sse2
Line
Count
Source
1425
18.1k
                                                  const __m128i coeffs[1]) {
1426
18.1k
  __m128i s_128[2];
1427
1428
18.1k
  s_32[1] = _mm_cvtsi32_si128(loadu_int32(src + 2));
1429
18.1k
  s_128[0] = _mm_unpacklo_epi32(s_32[0], s_32[1]);
1430
18.1k
  s_32[0] = _mm_cvtsi32_si128(loadu_int32(src + 2 * 2));
1431
18.1k
  s_128[1] = _mm_unpacklo_epi32(s_32[1], s_32[0]);
1432
18.1k
  const __m128i ss = _mm_unpacklo_epi16(s_128[0], s_128[1]);
1433
18.1k
  return convolve16_2tap_sse2(&ss, coeffs);
1434
18.1k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_2tap_2x2_sse2
1435
1436
static INLINE __m128i xy_y_convolve_2tap_2x2_half_pel_sse2(
1437
4.84k
    const int16_t *const src, __m128i s_32[2]) {
1438
4.84k
  __m128i s_128[2];
1439
1440
4.84k
  s_32[1] = _mm_cvtsi32_si128(loadu_int32(src + 2));
1441
4.84k
  s_128[0] = _mm_unpacklo_epi32(s_32[0], s_32[1]);
1442
4.84k
  s_32[0] = _mm_cvtsi32_si128(loadu_int32(src + 2 * 2));
1443
4.84k
  s_128[1] = _mm_unpacklo_epi32(s_32[1], s_32[0]);
1444
4.84k
  return _mm_add_epi16(s_128[0], s_128[1]);
1445
4.84k
}
convolve_2d_avx2.c:xy_y_convolve_2tap_2x2_half_pel_sse2
Line
Count
Source
1437
4.84k
    const int16_t *const src, __m128i s_32[2]) {
1438
4.84k
  __m128i s_128[2];
1439
1440
4.84k
  s_32[1] = _mm_cvtsi32_si128(loadu_int32(src + 2));
1441
4.84k
  s_128[0] = _mm_unpacklo_epi32(s_32[0], s_32[1]);
1442
4.84k
  s_32[0] = _mm_cvtsi32_si128(loadu_int32(src + 2 * 2));
1443
4.84k
  s_128[1] = _mm_unpacklo_epi32(s_32[1], s_32[0]);
1444
4.84k
  return _mm_add_epi16(s_128[0], s_128[1]);
1445
4.84k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_2tap_2x2_half_pel_sse2
1446
1447
static INLINE void xy_y_convolve_2tap_4x2_sse2(const int16_t *const src,
1448
                                               __m128i s_64[2],
1449
                                               const __m128i coeffs[1],
1450
92.4k
                                               __m128i r[2]) {
1451
92.4k
  __m128i s_128[2];
1452
1453
92.4k
  s_64[1] = _mm_loadl_epi64((__m128i *)(src + 4));
1454
92.4k
  s_128[0] = _mm_unpacklo_epi64(s_64[0], s_64[1]);
1455
92.4k
  s_64[0] = _mm_loadl_epi64((__m128i *)(src + 2 * 4));
1456
92.4k
  s_128[1] = _mm_unpacklo_epi64(s_64[1], s_64[0]);
1457
92.4k
  const __m128i ss0 = _mm_unpacklo_epi16(s_128[0], s_128[1]);
1458
92.4k
  const __m128i ss1 = _mm_unpackhi_epi16(s_128[0], s_128[1]);
1459
92.4k
  r[0] = convolve16_2tap_sse2(&ss0, coeffs);
1460
92.4k
  r[1] = convolve16_2tap_sse2(&ss1, coeffs);
1461
92.4k
}
convolve_2d_avx2.c:xy_y_convolve_2tap_4x2_sse2
Line
Count
Source
1450
92.4k
                                               __m128i r[2]) {
1451
92.4k
  __m128i s_128[2];
1452
1453
92.4k
  s_64[1] = _mm_loadl_epi64((__m128i *)(src + 4));
1454
92.4k
  s_128[0] = _mm_unpacklo_epi64(s_64[0], s_64[1]);
1455
92.4k
  s_64[0] = _mm_loadl_epi64((__m128i *)(src + 2 * 4));
1456
92.4k
  s_128[1] = _mm_unpacklo_epi64(s_64[1], s_64[0]);
1457
92.4k
  const __m128i ss0 = _mm_unpacklo_epi16(s_128[0], s_128[1]);
1458
92.4k
  const __m128i ss1 = _mm_unpackhi_epi16(s_128[0], s_128[1]);
1459
92.4k
  r[0] = convolve16_2tap_sse2(&ss0, coeffs);
1460
92.4k
  r[1] = convolve16_2tap_sse2(&ss1, coeffs);
1461
92.4k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_2tap_4x2_sse2
1462
1463
static INLINE __m128i xy_y_convolve_2tap_4x2_half_pel_sse2(
1464
15.6k
    const int16_t *const src, __m128i s_64[2]) {
1465
15.6k
  __m128i s_128[2];
1466
1467
15.6k
  s_64[1] = _mm_loadl_epi64((__m128i *)(src + 4));
1468
15.6k
  s_128[0] = _mm_unpacklo_epi64(s_64[0], s_64[1]);
1469
15.6k
  s_64[0] = _mm_loadl_epi64((__m128i *)(src + 2 * 4));
1470
15.6k
  s_128[1] = _mm_unpacklo_epi64(s_64[1], s_64[0]);
1471
15.6k
  return _mm_add_epi16(s_128[0], s_128[1]);
1472
15.6k
}
convolve_2d_avx2.c:xy_y_convolve_2tap_4x2_half_pel_sse2
Line
Count
Source
1464
15.6k
    const int16_t *const src, __m128i s_64[2]) {
1465
15.6k
  __m128i s_128[2];
1466
1467
15.6k
  s_64[1] = _mm_loadl_epi64((__m128i *)(src + 4));
1468
15.6k
  s_128[0] = _mm_unpacklo_epi64(s_64[0], s_64[1]);
1469
15.6k
  s_64[0] = _mm_loadl_epi64((__m128i *)(src + 2 * 4));
1470
15.6k
  s_128[1] = _mm_unpacklo_epi64(s_64[1], s_64[0]);
1471
15.6k
  return _mm_add_epi16(s_128[0], s_128[1]);
1472
15.6k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_2tap_4x2_half_pel_sse2
1473
1474
static INLINE void xy_y_convolve_2tap_16_avx2(const __m256i s0,
1475
                                              const __m256i s1,
1476
                                              const __m256i coeffs[1],
1477
955k
                                              __m256i r[2]) {
1478
955k
  const __m256i ss0 = _mm256_unpacklo_epi16(s0, s1);
1479
955k
  const __m256i ss1 = _mm256_unpackhi_epi16(s0, s1);
1480
955k
  r[0] = convolve16_2tap_avx2(&ss0, coeffs);
1481
955k
  r[1] = convolve16_2tap_avx2(&ss1, coeffs);
1482
955k
}
convolve_2d_avx2.c:xy_y_convolve_2tap_16_avx2
Line
Count
Source
1477
955k
                                              __m256i r[2]) {
1478
955k
  const __m256i ss0 = _mm256_unpacklo_epi16(s0, s1);
1479
955k
  const __m256i ss1 = _mm256_unpackhi_epi16(s0, s1);
1480
955k
  r[0] = convolve16_2tap_avx2(&ss0, coeffs);
1481
955k
  r[1] = convolve16_2tap_avx2(&ss1, coeffs);
1482
955k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_2tap_16_avx2
1483
1484
static INLINE void xy_y_convolve_2tap_8x2_avx2(const int16_t *const src,
1485
                                               __m128i s_128[2],
1486
                                               const __m256i coeffs[1],
1487
91.1k
                                               __m256i r[2]) {
1488
91.1k
  __m256i s_256[2];
1489
91.1k
  s_128[1] = _mm_loadu_si128((__m128i *)(src + 8));
1490
91.1k
  s_256[0] = _mm256_setr_m128i(s_128[0], s_128[1]);
1491
91.1k
  s_128[0] = _mm_loadu_si128((__m128i *)(src + 2 * 8));
1492
91.1k
  s_256[1] = _mm256_setr_m128i(s_128[1], s_128[0]);
1493
91.1k
  xy_y_convolve_2tap_16_avx2(s_256[0], s_256[1], coeffs, r);
1494
91.1k
}
convolve_2d_avx2.c:xy_y_convolve_2tap_8x2_avx2
Line
Count
Source
1487
91.1k
                                               __m256i r[2]) {
1488
91.1k
  __m256i s_256[2];
1489
91.1k
  s_128[1] = _mm_loadu_si128((__m128i *)(src + 8));
1490
91.1k
  s_256[0] = _mm256_setr_m128i(s_128[0], s_128[1]);
1491
91.1k
  s_128[0] = _mm_loadu_si128((__m128i *)(src + 2 * 8));
1492
91.1k
  s_256[1] = _mm256_setr_m128i(s_128[1], s_128[0]);
1493
91.1k
  xy_y_convolve_2tap_16_avx2(s_256[0], s_256[1], coeffs, r);
1494
91.1k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_2tap_8x2_avx2
1495
1496
static INLINE __m256i xy_y_convolve_2tap_8x2_half_pel_avx2(
1497
23.3k
    const int16_t *const src, __m128i s_128[2]) {
1498
23.3k
  __m256i s_256[2];
1499
23.3k
  s_128[1] = _mm_loadu_si128((__m128i *)(src + 8));
1500
23.3k
  s_256[0] = _mm256_setr_m128i(s_128[0], s_128[1]);
1501
23.3k
  s_128[0] = _mm_loadu_si128((__m128i *)(src + 2 * 8));
1502
23.3k
  s_256[1] = _mm256_setr_m128i(s_128[1], s_128[0]);
1503
23.3k
  return _mm256_add_epi16(s_256[0], s_256[1]);
1504
23.3k
}
convolve_2d_avx2.c:xy_y_convolve_2tap_8x2_half_pel_avx2
Line
Count
Source
1497
23.3k
    const int16_t *const src, __m128i s_128[2]) {
1498
23.3k
  __m256i s_256[2];
1499
23.3k
  s_128[1] = _mm_loadu_si128((__m128i *)(src + 8));
1500
23.3k
  s_256[0] = _mm256_setr_m128i(s_128[0], s_128[1]);
1501
23.3k
  s_128[0] = _mm_loadu_si128((__m128i *)(src + 2 * 8));
1502
23.3k
  s_256[1] = _mm256_setr_m128i(s_128[1], s_128[0]);
1503
23.3k
  return _mm256_add_epi16(s_256[0], s_256[1]);
1504
23.3k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_2tap_8x2_half_pel_avx2
1505
1506
static INLINE void xy_y_convolve_2tap_16x2_half_pel_avx2(
1507
20.8k
    const int16_t *const src, __m256i s_256[2], __m256i r[2]) {
1508
20.8k
  s_256[1] = _mm256_loadu_si256((__m256i *)(src + 16));
1509
20.8k
  r[0] = _mm256_add_epi16(s_256[0], s_256[1]);
1510
20.8k
  s_256[0] = _mm256_loadu_si256((__m256i *)(src + 2 * 16));
1511
20.8k
  r[1] = _mm256_add_epi16(s_256[1], s_256[0]);
1512
20.8k
}
convolve_2d_avx2.c:xy_y_convolve_2tap_16x2_half_pel_avx2
Line
Count
Source
1507
20.8k
    const int16_t *const src, __m256i s_256[2], __m256i r[2]) {
1508
20.8k
  s_256[1] = _mm256_loadu_si256((__m256i *)(src + 16));
1509
20.8k
  r[0] = _mm256_add_epi16(s_256[0], s_256[1]);
1510
20.8k
  s_256[0] = _mm256_loadu_si256((__m256i *)(src + 2 * 16));
1511
20.8k
  r[1] = _mm256_add_epi16(s_256[1], s_256[0]);
1512
20.8k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_2tap_16x2_half_pel_avx2
1513
1514
static INLINE void xy_y_store_16x2_avx2(const __m256i r[2], uint8_t *const dst,
1515
0
                                        const ptrdiff_t stride) {
1516
0
  const __m256i t = _mm256_packus_epi16(r[0], r[1]);
1517
0
  const __m256i d = _mm256_permute4x64_epi64(t, 0xD8);
1518
0
  storeu_u8_16x2_avx2(d, dst, stride);
1519
0
}
Unexecuted instantiation: convolve_2d_avx2.c:xy_y_store_16x2_avx2
Unexecuted instantiation: convolve_avx2.c:xy_y_store_16x2_avx2
1520
1521
static INLINE void xy_y_convolve_2tap_16x2_avx2(const int16_t *const src,
1522
                                                __m256i s[2],
1523
                                                const __m256i coeffs[1],
1524
48.2k
                                                __m256i r[4]) {
1525
48.2k
  s[1] = _mm256_loadu_si256((__m256i *)(src + 16));
1526
48.2k
  xy_y_convolve_2tap_16_avx2(s[0], s[1], coeffs, r + 0);
1527
48.2k
  s[0] = _mm256_loadu_si256((__m256i *)(src + 2 * 16));
1528
48.2k
  xy_y_convolve_2tap_16_avx2(s[1], s[0], coeffs, r + 2);
1529
48.2k
}
convolve_2d_avx2.c:xy_y_convolve_2tap_16x2_avx2
Line
Count
Source
1524
48.2k
                                                __m256i r[4]) {
1525
48.2k
  s[1] = _mm256_loadu_si256((__m256i *)(src + 16));
1526
48.2k
  xy_y_convolve_2tap_16_avx2(s[0], s[1], coeffs, r + 0);
1527
48.2k
  s[0] = _mm256_loadu_si256((__m256i *)(src + 2 * 16));
1528
48.2k
  xy_y_convolve_2tap_16_avx2(s[1], s[0], coeffs, r + 2);
1529
48.2k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_2tap_16x2_avx2
1530
1531
static INLINE void xy_y_convolve_2tap_32_avx2(const int16_t *const src,
1532
                                              const __m256i s0[2],
1533
                                              __m256i s1[2],
1534
                                              const __m256i coeffs[1],
1535
207k
                                              __m256i r[4]) {
1536
207k
  s1[0] = _mm256_loadu_si256((__m256i *)src);
1537
207k
  s1[1] = _mm256_loadu_si256((__m256i *)(src + 16));
1538
207k
  xy_y_convolve_2tap_16_avx2(s0[0], s1[0], coeffs, r + 0);
1539
207k
  xy_y_convolve_2tap_16_avx2(s0[1], s1[1], coeffs, r + 2);
1540
207k
}
convolve_2d_avx2.c:xy_y_convolve_2tap_32_avx2
Line
Count
Source
1535
207k
                                              __m256i r[4]) {
1536
207k
  s1[0] = _mm256_loadu_si256((__m256i *)src);
1537
207k
  s1[1] = _mm256_loadu_si256((__m256i *)(src + 16));
1538
207k
  xy_y_convolve_2tap_16_avx2(s0[0], s1[0], coeffs, r + 0);
1539
207k
  xy_y_convolve_2tap_16_avx2(s0[1], s1[1], coeffs, r + 2);
1540
207k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_2tap_32_avx2
1541
1542
static INLINE void xy_y_convolve_2tap_32_all_avx2(const int16_t *const src,
1543
                                                  const __m256i s0[2],
1544
                                                  __m256i s1[2],
1545
                                                  const __m256i coeffs[1],
1546
207k
                                                  uint8_t *const dst) {
1547
207k
  __m256i r[4];
1548
1549
207k
  xy_y_convolve_2tap_32_avx2(src, s0, s1, coeffs, r);
1550
207k
  xy_y_round_store_32_avx2(r + 0, r + 2, dst);
1551
207k
}
convolve_2d_avx2.c:xy_y_convolve_2tap_32_all_avx2
Line
Count
Source
1546
207k
                                                  uint8_t *const dst) {
1547
207k
  __m256i r[4];
1548
1549
207k
  xy_y_convolve_2tap_32_avx2(src, s0, s1, coeffs, r);
1550
207k
  xy_y_round_store_32_avx2(r + 0, r + 2, dst);
1551
207k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_2tap_32_all_avx2
1552
1553
static INLINE void xy_y_convolve_2tap_half_pel_32_avx2(const int16_t *const src,
1554
                                                       const __m256i s0[2],
1555
                                                       __m256i s1[2],
1556
117k
                                                       __m256i r[2]) {
1557
117k
  s1[0] = _mm256_loadu_si256((__m256i *)src);
1558
117k
  s1[1] = _mm256_loadu_si256((__m256i *)(src + 16));
1559
117k
  r[0] = _mm256_add_epi16(s0[0], s1[0]);
1560
117k
  r[1] = _mm256_add_epi16(s0[1], s1[1]);
1561
117k
}
convolve_2d_avx2.c:xy_y_convolve_2tap_half_pel_32_avx2
Line
Count
Source
1556
117k
                                                       __m256i r[2]) {
1557
117k
  s1[0] = _mm256_loadu_si256((__m256i *)src);
1558
117k
  s1[1] = _mm256_loadu_si256((__m256i *)(src + 16));
1559
117k
  r[0] = _mm256_add_epi16(s0[0], s1[0]);
1560
117k
  r[1] = _mm256_add_epi16(s0[1], s1[1]);
1561
117k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_2tap_half_pel_32_avx2
1562
1563
static INLINE void xy_y_convolve_2tap_half_pel_32_all_avx2(
1564
    const int16_t *const src, const __m256i s0[2], __m256i s1[2],
1565
117k
    uint8_t *const dst) {
1566
117k
  __m256i r[2];
1567
1568
117k
  xy_y_convolve_2tap_half_pel_32_avx2(src, s0, s1, r);
1569
117k
  r[0] = xy_y_round_half_pel_avx2(r[0]);
1570
117k
  r[1] = xy_y_round_half_pel_avx2(r[1]);
1571
117k
  xy_y_pack_store_32_avx2(r[0], r[1], dst);
1572
117k
}
convolve_2d_avx2.c:xy_y_convolve_2tap_half_pel_32_all_avx2
Line
Count
Source
1565
117k
    uint8_t *const dst) {
1566
117k
  __m256i r[2];
1567
1568
117k
  xy_y_convolve_2tap_half_pel_32_avx2(src, s0, s1, r);
1569
117k
  r[0] = xy_y_round_half_pel_avx2(r[0]);
1570
117k
  r[1] = xy_y_round_half_pel_avx2(r[1]);
1571
117k
  xy_y_pack_store_32_avx2(r[0], r[1], dst);
1572
117k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_2tap_half_pel_32_all_avx2
1573
1574
static INLINE __m128i xy_y_convolve_4tap_2x2_sse2(const int16_t *const src,
1575
                                                  __m128i s_32[4],
1576
                                                  __m128i ss_128[2],
1577
419k
                                                  const __m128i coeffs[2]) {
1578
419k
  s_32[3] = _mm_cvtsi32_si128(*(int32_t *)(src + 3 * 2));
1579
419k
  const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
1580
419k
  s_32[2] = _mm_cvtsi32_si128(*(int32_t *)(src + 4 * 2));
1581
419k
  const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[2]);
1582
419k
  ss_128[1] = _mm_unpacklo_epi16(src23, src34);
1583
419k
  const __m128i r = convolve16_4tap_sse2(ss_128, coeffs);
1584
419k
  ss_128[0] = ss_128[1];
1585
419k
  return r;
1586
419k
}
convolve_2d_avx2.c:xy_y_convolve_4tap_2x2_sse2
Line
Count
Source
1577
419k
                                                  const __m128i coeffs[2]) {
1578
419k
  s_32[3] = _mm_cvtsi32_si128(*(int32_t *)(src + 3 * 2));
1579
419k
  const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
1580
419k
  s_32[2] = _mm_cvtsi32_si128(*(int32_t *)(src + 4 * 2));
1581
419k
  const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[2]);
1582
419k
  ss_128[1] = _mm_unpacklo_epi16(src23, src34);
1583
419k
  const __m128i r = convolve16_4tap_sse2(ss_128, coeffs);
1584
419k
  ss_128[0] = ss_128[1];
1585
419k
  return r;
1586
419k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_4tap_2x2_sse2
1587
1588
static INLINE __m256i xy_y_convolve_4tap_4x2_avx2(const int16_t *const src,
1589
                                                  __m128i s_64[4],
1590
                                                  __m256i ss_256[2],
1591
1.60M
                                                  const __m256i coeffs[2]) {
1592
1.60M
  __m256i s_256[2];
1593
1.60M
  s_64[3] = _mm_loadl_epi64((__m128i *)(src + 3 * 4));
1594
1.60M
  s_256[0] = _mm256_setr_m128i(s_64[2], s_64[3]);
1595
1.60M
  s_64[2] = _mm_loadl_epi64((__m128i *)(src + 4 * 4));
1596
1.60M
  s_256[1] = _mm256_setr_m128i(s_64[3], s_64[2]);
1597
1.60M
  ss_256[1] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1598
1.60M
  const __m256i r = convolve16_4tap_avx2(ss_256, coeffs);
1599
1.60M
  ss_256[0] = ss_256[1];
1600
1.60M
  return r;
1601
1.60M
}
convolve_2d_avx2.c:xy_y_convolve_4tap_4x2_avx2
Line
Count
Source
1591
1.60M
                                                  const __m256i coeffs[2]) {
1592
1.60M
  __m256i s_256[2];
1593
1.60M
  s_64[3] = _mm_loadl_epi64((__m128i *)(src + 3 * 4));
1594
1.60M
  s_256[0] = _mm256_setr_m128i(s_64[2], s_64[3]);
1595
1.60M
  s_64[2] = _mm_loadl_epi64((__m128i *)(src + 4 * 4));
1596
1.60M
  s_256[1] = _mm256_setr_m128i(s_64[3], s_64[2]);
1597
1.60M
  ss_256[1] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1598
1.60M
  const __m256i r = convolve16_4tap_avx2(ss_256, coeffs);
1599
1.60M
  ss_256[0] = ss_256[1];
1600
1.60M
  return r;
1601
1.60M
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_4tap_4x2_avx2
1602
1603
static INLINE void xy_y_convolve_4tap_16_avx2(const __m256i *const ss,
1604
                                              const __m256i coeffs[2],
1605
5.11M
                                              __m256i r[2]) {
1606
5.11M
  r[0] = convolve16_4tap_avx2(ss, coeffs);
1607
5.11M
  r[1] = convolve16_4tap_avx2(ss + 2, coeffs);
1608
5.11M
}
convolve_2d_avx2.c:xy_y_convolve_4tap_16_avx2
Line
Count
Source
1605
5.11M
                                              __m256i r[2]) {
1606
5.11M
  r[0] = convolve16_4tap_avx2(ss, coeffs);
1607
5.11M
  r[1] = convolve16_4tap_avx2(ss + 2, coeffs);
1608
5.11M
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_4tap_16_avx2
1609
1610
static INLINE void xy_y_convolve_4tap_8x2_avx2(const int16_t *const src,
1611
                                               __m256i ss_256[4],
1612
                                               const __m256i coeffs[2],
1613
1.07M
                                               __m256i r[2]) {
1614
1.07M
  __m256i s_256[2];
1615
1.07M
  s_256[0] = _mm256_loadu_si256((__m256i *)(src + 2 * 8));
1616
1.07M
  s_256[1] = _mm256_loadu_si256((__m256i *)(src + 3 * 8));
1617
1.07M
  ss_256[1] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1618
1.07M
  ss_256[3] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
1619
1.07M
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r);
1620
1.07M
  ss_256[0] = ss_256[1];
1621
1.07M
  ss_256[2] = ss_256[3];
1622
1.07M
}
convolve_2d_avx2.c:xy_y_convolve_4tap_8x2_avx2
Line
Count
Source
1613
1.07M
                                               __m256i r[2]) {
1614
1.07M
  __m256i s_256[2];
1615
1.07M
  s_256[0] = _mm256_loadu_si256((__m256i *)(src + 2 * 8));
1616
1.07M
  s_256[1] = _mm256_loadu_si256((__m256i *)(src + 3 * 8));
1617
1.07M
  ss_256[1] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1618
1.07M
  ss_256[3] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
1619
1.07M
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r);
1620
1.07M
  ss_256[0] = ss_256[1];
1621
1.07M
  ss_256[2] = ss_256[3];
1622
1.07M
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_4tap_8x2_avx2
1623
1624
static INLINE void xy_y_convolve_4tap_8x2_half_pel_avx2(
1625
    const int16_t *const src, const __m256i coeffs[1], __m256i s_256[4],
1626
190k
    __m256i r[2]) {
1627
190k
  __m256i a_256[2];
1628
190k
  s_256[2] = _mm256_loadu_si256((__m256i *)(src + 2 * 8));
1629
190k
  s_256[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 8));
1630
190k
  a_256[0] = _mm256_add_epi16(s_256[0], s_256[3]);
1631
190k
  a_256[1] = _mm256_add_epi16(s_256[1], s_256[2]);
1632
190k
  xy_y_convolve_2tap_16_avx2(a_256[0], a_256[1], coeffs, r);
1633
190k
  s_256[0] = s_256[2];
1634
190k
  s_256[1] = s_256[3];
1635
190k
}
convolve_2d_avx2.c:xy_y_convolve_4tap_8x2_half_pel_avx2
Line
Count
Source
1626
190k
    __m256i r[2]) {
1627
190k
  __m256i a_256[2];
1628
190k
  s_256[2] = _mm256_loadu_si256((__m256i *)(src + 2 * 8));
1629
190k
  s_256[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 8));
1630
190k
  a_256[0] = _mm256_add_epi16(s_256[0], s_256[3]);
1631
190k
  a_256[1] = _mm256_add_epi16(s_256[1], s_256[2]);
1632
190k
  xy_y_convolve_2tap_16_avx2(a_256[0], a_256[1], coeffs, r);
1633
190k
  s_256[0] = s_256[2];
1634
190k
  s_256[1] = s_256[3];
1635
190k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_4tap_8x2_half_pel_avx2
1636
1637
static INLINE void xy_y_convolve_4tap_16x2_avx2(
1638
    const int16_t *const src, __m256i s_256[4], __m256i ss_256[4],
1639
478k
    __m256i tt_256[4], const __m256i coeffs[2], __m256i r[4]) {
1640
478k
  s_256[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 16));
1641
478k
  ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
1642
478k
  ss_256[3] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
1643
478k
  s_256[2] = _mm256_loadu_si256((__m256i *)(src + 4 * 16));
1644
478k
  tt_256[1] = _mm256_unpacklo_epi16(s_256[3], s_256[2]);
1645
478k
  tt_256[3] = _mm256_unpackhi_epi16(s_256[3], s_256[2]);
1646
478k
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 0);
1647
478k
  xy_y_convolve_4tap_16_avx2(tt_256, coeffs, r + 2);
1648
478k
  ss_256[0] = ss_256[1];
1649
478k
  ss_256[2] = ss_256[3];
1650
478k
  tt_256[0] = tt_256[1];
1651
478k
  tt_256[2] = tt_256[3];
1652
478k
}
convolve_2d_avx2.c:xy_y_convolve_4tap_16x2_avx2
Line
Count
Source
1639
478k
    __m256i tt_256[4], const __m256i coeffs[2], __m256i r[4]) {
1640
478k
  s_256[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 16));
1641
478k
  ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
1642
478k
  ss_256[3] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
1643
478k
  s_256[2] = _mm256_loadu_si256((__m256i *)(src + 4 * 16));
1644
478k
  tt_256[1] = _mm256_unpacklo_epi16(s_256[3], s_256[2]);
1645
478k
  tt_256[3] = _mm256_unpackhi_epi16(s_256[3], s_256[2]);
1646
478k
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 0);
1647
478k
  xy_y_convolve_4tap_16_avx2(tt_256, coeffs, r + 2);
1648
478k
  ss_256[0] = ss_256[1];
1649
478k
  ss_256[2] = ss_256[3];
1650
478k
  tt_256[0] = tt_256[1];
1651
478k
  tt_256[2] = tt_256[3];
1652
478k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_4tap_16x2_avx2
1653
1654
static INLINE void xy_y_convolve_4tap_32x2_avx2(
1655
    const int16_t *const src, const ptrdiff_t stride, __m256i s_256[4],
1656
    __m256i ss_256[4], __m256i tt_256[4], const __m256i coeffs[2],
1657
567k
    __m256i r[4]) {
1658
567k
  s_256[3] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
1659
567k
  ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
1660
567k
  ss_256[3] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
1661
567k
  s_256[2] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
1662
567k
  tt_256[1] = _mm256_unpacklo_epi16(s_256[3], s_256[2]);
1663
567k
  tt_256[3] = _mm256_unpackhi_epi16(s_256[3], s_256[2]);
1664
567k
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 0);
1665
567k
  xy_y_convolve_4tap_16_avx2(tt_256, coeffs, r + 2);
1666
567k
  ss_256[0] = ss_256[1];
1667
567k
  ss_256[2] = ss_256[3];
1668
567k
  tt_256[0] = tt_256[1];
1669
567k
  tt_256[2] = tt_256[3];
1670
567k
}
convolve_2d_avx2.c:xy_y_convolve_4tap_32x2_avx2
Line
Count
Source
1657
567k
    __m256i r[4]) {
1658
567k
  s_256[3] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
1659
567k
  ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
1660
567k
  ss_256[3] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
1661
567k
  s_256[2] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
1662
567k
  tt_256[1] = _mm256_unpacklo_epi16(s_256[3], s_256[2]);
1663
567k
  tt_256[3] = _mm256_unpackhi_epi16(s_256[3], s_256[2]);
1664
567k
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 0);
1665
567k
  xy_y_convolve_4tap_16_avx2(tt_256, coeffs, r + 2);
1666
567k
  ss_256[0] = ss_256[1];
1667
567k
  ss_256[2] = ss_256[3];
1668
567k
  tt_256[0] = tt_256[1];
1669
567k
  tt_256[2] = tt_256[3];
1670
567k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_4tap_32x2_avx2
1671
1672
static INLINE void xy_y_convolve_4tap_16x2_half_pelavx2(
1673
    const int16_t *const src, __m256i s_256[5], const __m256i coeffs[1],
1674
81.4k
    __m256i r[4]) {
1675
81.4k
  __m256i a_256[2];
1676
1677
81.4k
  s_256[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 16));
1678
81.4k
  s_256[4] = _mm256_loadu_si256((__m256i *)(src + 4 * 16));
1679
1680
81.4k
  a_256[0] = _mm256_add_epi16(s_256[0], s_256[3]);
1681
81.4k
  a_256[1] = _mm256_add_epi16(s_256[1], s_256[2]);
1682
81.4k
  xy_y_convolve_2tap_16_avx2(a_256[0], a_256[1], coeffs, r + 0);
1683
1684
81.4k
  a_256[0] = _mm256_add_epi16(s_256[1], s_256[4]);
1685
81.4k
  a_256[1] = _mm256_add_epi16(s_256[2], s_256[3]);
1686
81.4k
  xy_y_convolve_2tap_16_avx2(a_256[0], a_256[1], coeffs, r + 2);
1687
1688
81.4k
  s_256[0] = s_256[2];
1689
81.4k
  s_256[1] = s_256[3];
1690
81.4k
  s_256[2] = s_256[4];
1691
81.4k
}
convolve_2d_avx2.c:xy_y_convolve_4tap_16x2_half_pelavx2
Line
Count
Source
1674
81.4k
    __m256i r[4]) {
1675
81.4k
  __m256i a_256[2];
1676
1677
81.4k
  s_256[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 16));
1678
81.4k
  s_256[4] = _mm256_loadu_si256((__m256i *)(src + 4 * 16));
1679
1680
81.4k
  a_256[0] = _mm256_add_epi16(s_256[0], s_256[3]);
1681
81.4k
  a_256[1] = _mm256_add_epi16(s_256[1], s_256[2]);
1682
81.4k
  xy_y_convolve_2tap_16_avx2(a_256[0], a_256[1], coeffs, r + 0);
1683
1684
81.4k
  a_256[0] = _mm256_add_epi16(s_256[1], s_256[4]);
1685
81.4k
  a_256[1] = _mm256_add_epi16(s_256[2], s_256[3]);
1686
81.4k
  xy_y_convolve_2tap_16_avx2(a_256[0], a_256[1], coeffs, r + 2);
1687
1688
81.4k
  s_256[0] = s_256[2];
1689
81.4k
  s_256[1] = s_256[3];
1690
81.4k
  s_256[2] = s_256[4];
1691
81.4k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_4tap_16x2_half_pelavx2
1692
1693
static INLINE __m128i xy_y_convolve_6tap_2x2_sse2(const int16_t *const src,
1694
                                                  __m128i s_32[6],
1695
                                                  __m128i ss_128[3],
1696
514k
                                                  const __m128i coeffs[3]) {
1697
514k
  s_32[5] = _mm_cvtsi32_si128(*(int32_t *)(src + 5 * 2));
1698
514k
  const __m128i src45 = _mm_unpacklo_epi32(s_32[4], s_32[5]);
1699
514k
  s_32[4] = _mm_cvtsi32_si128(*(int32_t *)(src + 6 * 2));
1700
514k
  const __m128i src56 = _mm_unpacklo_epi32(s_32[5], s_32[4]);
1701
514k
  ss_128[2] = _mm_unpacklo_epi16(src45, src56);
1702
514k
  const __m128i r = convolve16_6tap_sse2(ss_128, coeffs);
1703
514k
  ss_128[0] = ss_128[1];
1704
514k
  ss_128[1] = ss_128[2];
1705
514k
  return r;
1706
514k
}
convolve_2d_avx2.c:xy_y_convolve_6tap_2x2_sse2
Line
Count
Source
1696
514k
                                                  const __m128i coeffs[3]) {
1697
514k
  s_32[5] = _mm_cvtsi32_si128(*(int32_t *)(src + 5 * 2));
1698
514k
  const __m128i src45 = _mm_unpacklo_epi32(s_32[4], s_32[5]);
1699
514k
  s_32[4] = _mm_cvtsi32_si128(*(int32_t *)(src + 6 * 2));
1700
514k
  const __m128i src56 = _mm_unpacklo_epi32(s_32[5], s_32[4]);
1701
514k
  ss_128[2] = _mm_unpacklo_epi16(src45, src56);
1702
514k
  const __m128i r = convolve16_6tap_sse2(ss_128, coeffs);
1703
514k
  ss_128[0] = ss_128[1];
1704
514k
  ss_128[1] = ss_128[2];
1705
514k
  return r;
1706
514k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_6tap_2x2_sse2
1707
1708
static INLINE __m256i xy_y_convolve_6tap_4x2_avx2(const int16_t *const src,
1709
                                                  __m128i s_64[6],
1710
                                                  __m256i ss_256[3],
1711
2.80M
                                                  const __m256i coeffs[3]) {
1712
2.80M
  __m256i s_256[2];
1713
2.80M
  s_64[5] = _mm_loadl_epi64((__m128i *)(src + 5 * 4));
1714
2.80M
  s_256[0] = _mm256_setr_m128i(s_64[4], s_64[5]);
1715
2.80M
  s_64[4] = _mm_loadl_epi64((__m128i *)(src + 6 * 4));
1716
2.80M
  s_256[1] = _mm256_setr_m128i(s_64[5], s_64[4]);
1717
2.80M
  ss_256[2] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1718
2.80M
  const __m256i r = convolve16_6tap_avx2(ss_256, coeffs);
1719
2.80M
  ss_256[0] = ss_256[1];
1720
2.80M
  ss_256[1] = ss_256[2];
1721
2.80M
  return r;
1722
2.80M
}
convolve_2d_avx2.c:xy_y_convolve_6tap_4x2_avx2
Line
Count
Source
1711
2.80M
                                                  const __m256i coeffs[3]) {
1712
2.80M
  __m256i s_256[2];
1713
2.80M
  s_64[5] = _mm_loadl_epi64((__m128i *)(src + 5 * 4));
1714
2.80M
  s_256[0] = _mm256_setr_m128i(s_64[4], s_64[5]);
1715
2.80M
  s_64[4] = _mm_loadl_epi64((__m128i *)(src + 6 * 4));
1716
2.80M
  s_256[1] = _mm256_setr_m128i(s_64[5], s_64[4]);
1717
2.80M
  ss_256[2] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1718
2.80M
  const __m256i r = convolve16_6tap_avx2(ss_256, coeffs);
1719
2.80M
  ss_256[0] = ss_256[1];
1720
2.80M
  ss_256[1] = ss_256[2];
1721
2.80M
  return r;
1722
2.80M
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_6tap_4x2_avx2
1723
1724
static INLINE void xy_y_convolve_6tap_16_avx2(const __m256i ss[6],
1725
                                              const __m256i coeffs[3],
1726
18.6M
                                              __m256i r[2]) {
1727
18.6M
  r[0] = convolve16_6tap_avx2(ss, coeffs);
1728
18.6M
  r[1] = convolve16_6tap_avx2(ss + 3, coeffs);
1729
18.6M
}
convolve_2d_avx2.c:xy_y_convolve_6tap_16_avx2
Line
Count
Source
1726
18.6M
                                              __m256i r[2]) {
1727
18.6M
  r[0] = convolve16_6tap_avx2(ss, coeffs);
1728
18.6M
  r[1] = convolve16_6tap_avx2(ss + 3, coeffs);
1729
18.6M
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_6tap_16_avx2
1730
1731
static INLINE void xy_y_convolve_6tap_8x2_avx2(const int16_t *const src,
1732
                                               __m256i ss_256[6],
1733
                                               const __m256i coeffs[3],
1734
2.81M
                                               __m256i r[2]) {
1735
2.81M
  __m256i s_256[2];
1736
2.81M
  s_256[0] = _mm256_loadu_si256((__m256i *)(src + 4 * 8));
1737
2.81M
  s_256[1] = _mm256_loadu_si256((__m256i *)(src + 5 * 8));
1738
2.81M
  ss_256[2] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1739
2.81M
  ss_256[5] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
1740
2.81M
  xy_y_convolve_6tap_16_avx2(ss_256, coeffs, r);
1741
2.81M
  ss_256[0] = ss_256[1];
1742
2.81M
  ss_256[1] = ss_256[2];
1743
2.81M
  ss_256[3] = ss_256[4];
1744
2.81M
  ss_256[4] = ss_256[5];
1745
2.81M
}
convolve_2d_avx2.c:xy_y_convolve_6tap_8x2_avx2
Line
Count
Source
1734
2.81M
                                               __m256i r[2]) {
1735
2.81M
  __m256i s_256[2];
1736
2.81M
  s_256[0] = _mm256_loadu_si256((__m256i *)(src + 4 * 8));
1737
2.81M
  s_256[1] = _mm256_loadu_si256((__m256i *)(src + 5 * 8));
1738
2.81M
  ss_256[2] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1739
2.81M
  ss_256[5] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
1740
2.81M
  xy_y_convolve_6tap_16_avx2(ss_256, coeffs, r);
1741
2.81M
  ss_256[0] = ss_256[1];
1742
2.81M
  ss_256[1] = ss_256[2];
1743
2.81M
  ss_256[3] = ss_256[4];
1744
2.81M
  ss_256[4] = ss_256[5];
1745
2.81M
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_6tap_8x2_avx2
1746
1747
static INLINE void xy_y_convolve_6tap_8x2_half_pel_avx2(
1748
    const int16_t *const src, const __m256i coeffs[2], __m256i s_256[6],
1749
701k
    __m256i r[2]) {
1750
701k
  __m256i a_256[2], ss_256[4];
1751
701k
  s_256[4] = _mm256_loadu_si256((__m256i *)(src + 4 * 8));
1752
701k
  s_256[5] = _mm256_loadu_si256((__m256i *)(src + 5 * 8));
1753
701k
  a_256[0] = _mm256_add_epi16(s_256[0], s_256[5]);
1754
701k
  a_256[1] = _mm256_add_epi16(s_256[1], s_256[4]);
1755
701k
  ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
1756
701k
  ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
1757
701k
  ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
1758
701k
  ss_256[3] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
1759
701k
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r);
1760
701k
  s_256[0] = s_256[2];
1761
701k
  s_256[1] = s_256[3];
1762
701k
  s_256[2] = s_256[4];
1763
701k
  s_256[3] = s_256[5];
1764
701k
}
convolve_2d_avx2.c:xy_y_convolve_6tap_8x2_half_pel_avx2
Line
Count
Source
1749
701k
    __m256i r[2]) {
1750
701k
  __m256i a_256[2], ss_256[4];
1751
701k
  s_256[4] = _mm256_loadu_si256((__m256i *)(src + 4 * 8));
1752
701k
  s_256[5] = _mm256_loadu_si256((__m256i *)(src + 5 * 8));
1753
701k
  a_256[0] = _mm256_add_epi16(s_256[0], s_256[5]);
1754
701k
  a_256[1] = _mm256_add_epi16(s_256[1], s_256[4]);
1755
701k
  ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
1756
701k
  ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
1757
701k
  ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
1758
701k
  ss_256[3] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
1759
701k
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r);
1760
701k
  s_256[0] = s_256[2];
1761
701k
  s_256[1] = s_256[3];
1762
701k
  s_256[2] = s_256[4];
1763
701k
  s_256[3] = s_256[5];
1764
701k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_6tap_8x2_half_pel_avx2
1765
1766
static INLINE void xy_y_convolve_6tap_16x2_avx2(
1767
    const int16_t *const src, const ptrdiff_t stride, __m256i s_256[6],
1768
    __m256i ss_256[6], __m256i tt_256[6], const __m256i coeffs[3],
1769
7.95M
    __m256i r[4]) {
1770
7.95M
  s_256[5] = _mm256_loadu_si256((__m256i *)(src + 5 * stride));
1771
7.95M
  ss_256[2] = _mm256_unpacklo_epi16(s_256[4], s_256[5]);
1772
7.95M
  ss_256[5] = _mm256_unpackhi_epi16(s_256[4], s_256[5]);
1773
7.95M
  s_256[4] = _mm256_loadu_si256((__m256i *)(src + 6 * stride));
1774
7.95M
  tt_256[2] = _mm256_unpacklo_epi16(s_256[5], s_256[4]);
1775
7.95M
  tt_256[5] = _mm256_unpackhi_epi16(s_256[5], s_256[4]);
1776
1777
7.95M
  xy_y_convolve_6tap_16_avx2(ss_256, coeffs, r + 0);
1778
7.95M
  xy_y_convolve_6tap_16_avx2(tt_256, coeffs, r + 2);
1779
1780
7.95M
  ss_256[0] = ss_256[1];
1781
7.95M
  ss_256[1] = ss_256[2];
1782
7.95M
  ss_256[3] = ss_256[4];
1783
7.95M
  ss_256[4] = ss_256[5];
1784
1785
7.95M
  tt_256[0] = tt_256[1];
1786
7.95M
  tt_256[1] = tt_256[2];
1787
7.95M
  tt_256[3] = tt_256[4];
1788
7.95M
  tt_256[4] = tt_256[5];
1789
7.95M
}
convolve_2d_avx2.c:xy_y_convolve_6tap_16x2_avx2
Line
Count
Source
1769
7.95M
    __m256i r[4]) {
1770
7.95M
  s_256[5] = _mm256_loadu_si256((__m256i *)(src + 5 * stride));
1771
7.95M
  ss_256[2] = _mm256_unpacklo_epi16(s_256[4], s_256[5]);
1772
7.95M
  ss_256[5] = _mm256_unpackhi_epi16(s_256[4], s_256[5]);
1773
7.95M
  s_256[4] = _mm256_loadu_si256((__m256i *)(src + 6 * stride));
1774
7.95M
  tt_256[2] = _mm256_unpacklo_epi16(s_256[5], s_256[4]);
1775
7.95M
  tt_256[5] = _mm256_unpackhi_epi16(s_256[5], s_256[4]);
1776
1777
7.95M
  xy_y_convolve_6tap_16_avx2(ss_256, coeffs, r + 0);
1778
7.95M
  xy_y_convolve_6tap_16_avx2(tt_256, coeffs, r + 2);
1779
1780
7.95M
  ss_256[0] = ss_256[1];
1781
7.95M
  ss_256[1] = ss_256[2];
1782
7.95M
  ss_256[3] = ss_256[4];
1783
7.95M
  ss_256[4] = ss_256[5];
1784
1785
7.95M
  tt_256[0] = tt_256[1];
1786
7.95M
  tt_256[1] = tt_256[2];
1787
7.95M
  tt_256[3] = tt_256[4];
1788
7.95M
  tt_256[4] = tt_256[5];
1789
7.95M
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_6tap_16x2_avx2
1790
1791
static INLINE void xy_y_convolve_6tap_16x2_half_pel_avx2(
1792
    const int16_t *const src, const ptrdiff_t stride, __m256i s_256[6],
1793
567k
    __m256i ss_256[4], const __m256i coeffs[2], __m256i r[4]) {
1794
567k
  __m256i a_256[2];
1795
1796
567k
  s_256[5] = _mm256_loadu_si256((__m256i *)(src + 5 * stride));
1797
567k
  a_256[0] = _mm256_add_epi16(s_256[0], s_256[5]);
1798
567k
  a_256[1] = _mm256_add_epi16(s_256[1], s_256[4]);
1799
567k
  ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
1800
567k
  ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
1801
567k
  ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
1802
567k
  ss_256[3] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
1803
567k
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 0);
1804
1805
567k
  a_256[1] = _mm256_add_epi16(s_256[2], s_256[5]);
1806
567k
  s_256[0] = s_256[2];
1807
567k
  s_256[2] = s_256[4];
1808
567k
  s_256[4] = _mm256_loadu_si256((__m256i *)(src + 6 * stride));
1809
567k
  a_256[0] = _mm256_add_epi16(s_256[1], s_256[4]);
1810
567k
  s_256[1] = s_256[3];
1811
567k
  s_256[3] = s_256[5];
1812
567k
  ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
1813
567k
  ss_256[1] = _mm256_unpacklo_epi16(s_256[1], s_256[2]);
1814
567k
  ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
1815
567k
  ss_256[3] = _mm256_unpackhi_epi16(s_256[1], s_256[2]);
1816
567k
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 2);
1817
567k
}
convolve_2d_avx2.c:xy_y_convolve_6tap_16x2_half_pel_avx2
Line
Count
Source
1793
567k
    __m256i ss_256[4], const __m256i coeffs[2], __m256i r[4]) {
1794
567k
  __m256i a_256[2];
1795
1796
567k
  s_256[5] = _mm256_loadu_si256((__m256i *)(src + 5 * stride));
1797
567k
  a_256[0] = _mm256_add_epi16(s_256[0], s_256[5]);
1798
567k
  a_256[1] = _mm256_add_epi16(s_256[1], s_256[4]);
1799
567k
  ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
1800
567k
  ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
1801
567k
  ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
1802
567k
  ss_256[3] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
1803
567k
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 0);
1804
1805
567k
  a_256[1] = _mm256_add_epi16(s_256[2], s_256[5]);
1806
567k
  s_256[0] = s_256[2];
1807
567k
  s_256[2] = s_256[4];
1808
567k
  s_256[4] = _mm256_loadu_si256((__m256i *)(src + 6 * stride));
1809
567k
  a_256[0] = _mm256_add_epi16(s_256[1], s_256[4]);
1810
567k
  s_256[1] = s_256[3];
1811
567k
  s_256[3] = s_256[5];
1812
567k
  ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
1813
567k
  ss_256[1] = _mm256_unpacklo_epi16(s_256[1], s_256[2]);
1814
567k
  ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
1815
567k
  ss_256[3] = _mm256_unpackhi_epi16(s_256[1], s_256[2]);
1816
567k
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 2);
1817
567k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_6tap_16x2_half_pel_avx2
1818
1819
static INLINE __m128i xy_y_convolve_8tap_2x2_sse2(const int16_t *const src,
1820
                                                  __m128i s_32[8],
1821
                                                  __m128i ss_128[4],
1822
27.9k
                                                  const __m128i coeffs[4]) {
1823
27.9k
  s_32[7] = _mm_cvtsi32_si128(*(int32_t *)(src + 7 * 2));
1824
27.9k
  const __m128i src67 = _mm_unpacklo_epi32(s_32[6], s_32[7]);
1825
27.9k
  s_32[6] = _mm_cvtsi32_si128(*(int32_t *)(src + 8 * 2));
1826
27.9k
  const __m128i src78 = _mm_unpacklo_epi32(s_32[7], s_32[6]);
1827
27.9k
  ss_128[3] = _mm_unpacklo_epi16(src67, src78);
1828
27.9k
  const __m128i r = convolve16_8tap_sse2(ss_128, coeffs);
1829
27.9k
  ss_128[0] = ss_128[1];
1830
27.9k
  ss_128[1] = ss_128[2];
1831
27.9k
  ss_128[2] = ss_128[3];
1832
27.9k
  return r;
1833
27.9k
}
convolve_2d_avx2.c:xy_y_convolve_8tap_2x2_sse2
Line
Count
Source
1822
27.9k
                                                  const __m128i coeffs[4]) {
1823
27.9k
  s_32[7] = _mm_cvtsi32_si128(*(int32_t *)(src + 7 * 2));
1824
27.9k
  const __m128i src67 = _mm_unpacklo_epi32(s_32[6], s_32[7]);
1825
27.9k
  s_32[6] = _mm_cvtsi32_si128(*(int32_t *)(src + 8 * 2));
1826
27.9k
  const __m128i src78 = _mm_unpacklo_epi32(s_32[7], s_32[6]);
1827
27.9k
  ss_128[3] = _mm_unpacklo_epi16(src67, src78);
1828
27.9k
  const __m128i r = convolve16_8tap_sse2(ss_128, coeffs);
1829
27.9k
  ss_128[0] = ss_128[1];
1830
27.9k
  ss_128[1] = ss_128[2];
1831
27.9k
  ss_128[2] = ss_128[3];
1832
27.9k
  return r;
1833
27.9k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_8tap_2x2_sse2
1834
1835
static INLINE __m256i xy_y_convolve_8tap_4x2_avx2(const int16_t *const src,
1836
                                                  __m128i s_64[8],
1837
                                                  __m256i ss_256[4],
1838
126k
                                                  const __m256i coeffs[4]) {
1839
126k
  __m256i s_256[2];
1840
126k
  s_64[7] = _mm_loadl_epi64((__m128i *)(src + 7 * 4));
1841
126k
  s_256[0] = _mm256_setr_m128i(s_64[6], s_64[7]);
1842
126k
  s_64[6] = _mm_loadl_epi64((__m128i *)(src + 8 * 4));
1843
126k
  s_256[1] = _mm256_setr_m128i(s_64[7], s_64[6]);
1844
126k
  ss_256[3] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1845
126k
  const __m256i r = convolve16_8tap_avx2(ss_256, coeffs);
1846
126k
  ss_256[0] = ss_256[1];
1847
126k
  ss_256[1] = ss_256[2];
1848
126k
  ss_256[2] = ss_256[3];
1849
126k
  return r;
1850
126k
}
convolve_2d_avx2.c:xy_y_convolve_8tap_4x2_avx2
Line
Count
Source
1838
126k
                                                  const __m256i coeffs[4]) {
1839
126k
  __m256i s_256[2];
1840
126k
  s_64[7] = _mm_loadl_epi64((__m128i *)(src + 7 * 4));
1841
126k
  s_256[0] = _mm256_setr_m128i(s_64[6], s_64[7]);
1842
126k
  s_64[6] = _mm_loadl_epi64((__m128i *)(src + 8 * 4));
1843
126k
  s_256[1] = _mm256_setr_m128i(s_64[7], s_64[6]);
1844
126k
  ss_256[3] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1845
126k
  const __m256i r = convolve16_8tap_avx2(ss_256, coeffs);
1846
126k
  ss_256[0] = ss_256[1];
1847
126k
  ss_256[1] = ss_256[2];
1848
126k
  ss_256[2] = ss_256[3];
1849
126k
  return r;
1850
126k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_8tap_4x2_avx2
1851
1852
static INLINE void xy_y_convolve_8tap_16_avx2(const __m256i *const ss,
1853
                                              const __m256i coeffs[4],
1854
1.86M
                                              __m256i r[2]) {
1855
1.86M
  r[0] = convolve16_8tap_avx2(ss, coeffs);
1856
1.86M
  r[1] = convolve16_8tap_avx2(ss + 4, coeffs);
1857
1.86M
}
convolve_2d_avx2.c:xy_y_convolve_8tap_16_avx2
Line
Count
Source
1854
1.86M
                                              __m256i r[2]) {
1855
1.86M
  r[0] = convolve16_8tap_avx2(ss, coeffs);
1856
1.86M
  r[1] = convolve16_8tap_avx2(ss + 4, coeffs);
1857
1.86M
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_8tap_16_avx2
1858
1859
static INLINE void xy_y_convolve_8tap_8x2_avx2(const int16_t *const src,
1860
                                               __m256i ss_256[8],
1861
                                               const __m256i coeffs[4],
1862
112k
                                               __m256i r[2]) {
1863
112k
  __m256i s_256[2];
1864
112k
  s_256[0] = _mm256_loadu_si256((__m256i *)(src + 6 * 8));
1865
112k
  s_256[1] = _mm256_loadu_si256((__m256i *)(src + 7 * 8));
1866
112k
  ss_256[3] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1867
112k
  ss_256[7] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
1868
112k
  xy_y_convolve_8tap_16_avx2(ss_256, coeffs, r);
1869
112k
  ss_256[0] = ss_256[1];
1870
112k
  ss_256[1] = ss_256[2];
1871
112k
  ss_256[2] = ss_256[3];
1872
112k
  ss_256[4] = ss_256[5];
1873
112k
  ss_256[5] = ss_256[6];
1874
112k
  ss_256[6] = ss_256[7];
1875
112k
}
convolve_2d_avx2.c:xy_y_convolve_8tap_8x2_avx2
Line
Count
Source
1862
112k
                                               __m256i r[2]) {
1863
112k
  __m256i s_256[2];
1864
112k
  s_256[0] = _mm256_loadu_si256((__m256i *)(src + 6 * 8));
1865
112k
  s_256[1] = _mm256_loadu_si256((__m256i *)(src + 7 * 8));
1866
112k
  ss_256[3] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1867
112k
  ss_256[7] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
1868
112k
  xy_y_convolve_8tap_16_avx2(ss_256, coeffs, r);
1869
112k
  ss_256[0] = ss_256[1];
1870
112k
  ss_256[1] = ss_256[2];
1871
112k
  ss_256[2] = ss_256[3];
1872
112k
  ss_256[4] = ss_256[5];
1873
112k
  ss_256[5] = ss_256[6];
1874
112k
  ss_256[6] = ss_256[7];
1875
112k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_8tap_8x2_avx2
1876
1877
static INLINE void xy_y_convolve_8tap_8x2_half_pel_avx2(
1878
    const int16_t *const src, const __m256i coeffs[2], __m256i s_256[8],
1879
44.0k
    __m256i r[2]) {
1880
44.0k
  __m256i a_256[4], ss_256[4];
1881
1882
44.0k
  s_256[6] = _mm256_loadu_si256((__m256i *)(src + 6 * 8));
1883
44.0k
  s_256[7] = _mm256_loadu_si256((__m256i *)(src + 7 * 8));
1884
44.0k
  a_256[0] = _mm256_add_epi16(s_256[0], s_256[7]);
1885
44.0k
  a_256[1] = _mm256_add_epi16(s_256[1], s_256[6]);
1886
44.0k
  a_256[2] = _mm256_add_epi16(s_256[2], s_256[5]);
1887
44.0k
  a_256[3] = _mm256_add_epi16(s_256[3], s_256[4]);
1888
44.0k
  ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
1889
44.0k
  ss_256[1] = _mm256_unpacklo_epi16(a_256[2], a_256[3]);
1890
44.0k
  ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
1891
44.0k
  ss_256[3] = _mm256_unpackhi_epi16(a_256[2], a_256[3]);
1892
44.0k
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r);
1893
44.0k
  s_256[0] = s_256[2];
1894
44.0k
  s_256[1] = s_256[3];
1895
44.0k
  s_256[2] = s_256[4];
1896
44.0k
  s_256[3] = s_256[5];
1897
44.0k
  s_256[4] = s_256[6];
1898
44.0k
  s_256[5] = s_256[7];
1899
44.0k
}
convolve_2d_avx2.c:xy_y_convolve_8tap_8x2_half_pel_avx2
Line
Count
Source
1879
44.0k
    __m256i r[2]) {
1880
44.0k
  __m256i a_256[4], ss_256[4];
1881
1882
44.0k
  s_256[6] = _mm256_loadu_si256((__m256i *)(src + 6 * 8));
1883
44.0k
  s_256[7] = _mm256_loadu_si256((__m256i *)(src + 7 * 8));
1884
44.0k
  a_256[0] = _mm256_add_epi16(s_256[0], s_256[7]);
1885
44.0k
  a_256[1] = _mm256_add_epi16(s_256[1], s_256[6]);
1886
44.0k
  a_256[2] = _mm256_add_epi16(s_256[2], s_256[5]);
1887
44.0k
  a_256[3] = _mm256_add_epi16(s_256[3], s_256[4]);
1888
44.0k
  ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
1889
44.0k
  ss_256[1] = _mm256_unpacklo_epi16(a_256[2], a_256[3]);
1890
44.0k
  ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
1891
44.0k
  ss_256[3] = _mm256_unpackhi_epi16(a_256[2], a_256[3]);
1892
44.0k
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r);
1893
44.0k
  s_256[0] = s_256[2];
1894
44.0k
  s_256[1] = s_256[3];
1895
44.0k
  s_256[2] = s_256[4];
1896
44.0k
  s_256[3] = s_256[5];
1897
44.0k
  s_256[4] = s_256[6];
1898
44.0k
  s_256[5] = s_256[7];
1899
44.0k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_8tap_8x2_half_pel_avx2
1900
1901
static AOM_FORCE_INLINE void xy_y_convolve_8tap_16x2_avx2(
1902
    const int16_t *const src, const ptrdiff_t stride, const __m256i coeffs[4],
1903
878k
    __m256i s_256[8], __m256i ss_256[8], __m256i tt_256[8], __m256i r[4]) {
1904
878k
  s_256[7] = _mm256_loadu_si256((__m256i *)(src + 7 * stride));
1905
878k
  ss_256[3] = _mm256_unpacklo_epi16(s_256[6], s_256[7]);
1906
878k
  ss_256[7] = _mm256_unpackhi_epi16(s_256[6], s_256[7]);
1907
878k
  s_256[6] = _mm256_loadu_si256((__m256i *)(src + 8 * stride));
1908
878k
  tt_256[3] = _mm256_unpacklo_epi16(s_256[7], s_256[6]);
1909
878k
  tt_256[7] = _mm256_unpackhi_epi16(s_256[7], s_256[6]);
1910
1911
878k
  xy_y_convolve_8tap_16_avx2(ss_256, coeffs, r + 0);
1912
878k
  xy_y_convolve_8tap_16_avx2(tt_256, coeffs, r + 2);
1913
1914
878k
  ss_256[0] = ss_256[1];
1915
878k
  ss_256[1] = ss_256[2];
1916
878k
  ss_256[2] = ss_256[3];
1917
878k
  ss_256[4] = ss_256[5];
1918
878k
  ss_256[5] = ss_256[6];
1919
878k
  ss_256[6] = ss_256[7];
1920
1921
878k
  tt_256[0] = tt_256[1];
1922
878k
  tt_256[1] = tt_256[2];
1923
878k
  tt_256[2] = tt_256[3];
1924
878k
  tt_256[4] = tt_256[5];
1925
878k
  tt_256[5] = tt_256[6];
1926
878k
  tt_256[6] = tt_256[7];
1927
878k
}
convolve_2d_avx2.c:xy_y_convolve_8tap_16x2_avx2
Line
Count
Source
1903
878k
    __m256i s_256[8], __m256i ss_256[8], __m256i tt_256[8], __m256i r[4]) {
1904
878k
  s_256[7] = _mm256_loadu_si256((__m256i *)(src + 7 * stride));
1905
878k
  ss_256[3] = _mm256_unpacklo_epi16(s_256[6], s_256[7]);
1906
878k
  ss_256[7] = _mm256_unpackhi_epi16(s_256[6], s_256[7]);
1907
878k
  s_256[6] = _mm256_loadu_si256((__m256i *)(src + 8 * stride));
1908
878k
  tt_256[3] = _mm256_unpacklo_epi16(s_256[7], s_256[6]);
1909
878k
  tt_256[7] = _mm256_unpackhi_epi16(s_256[7], s_256[6]);
1910
1911
878k
  xy_y_convolve_8tap_16_avx2(ss_256, coeffs, r + 0);
1912
878k
  xy_y_convolve_8tap_16_avx2(tt_256, coeffs, r + 2);
1913
1914
878k
  ss_256[0] = ss_256[1];
1915
878k
  ss_256[1] = ss_256[2];
1916
878k
  ss_256[2] = ss_256[3];
1917
878k
  ss_256[4] = ss_256[5];
1918
878k
  ss_256[5] = ss_256[6];
1919
878k
  ss_256[6] = ss_256[7];
1920
1921
878k
  tt_256[0] = tt_256[1];
1922
878k
  tt_256[1] = tt_256[2];
1923
878k
  tt_256[2] = tt_256[3];
1924
878k
  tt_256[4] = tt_256[5];
1925
878k
  tt_256[5] = tt_256[6];
1926
878k
  tt_256[6] = tt_256[7];
1927
878k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_8tap_16x2_avx2
1928
1929
static INLINE void xy_y_convolve_8tap_16x2_half_pel_avx2(
1930
    const int16_t *const src, const ptrdiff_t stride, const __m256i coeffs[4],
1931
35.5k
    __m256i s_256[8], __m256i r[4]) {
1932
35.5k
  __m256i a_256[4], ss_256[4];
1933
35.5k
  s_256[7] = _mm256_loadu_si256((__m256i *)(src + 7 * stride));
1934
1935
35.5k
  a_256[0] = _mm256_add_epi16(s_256[0], s_256[7]);
1936
35.5k
  a_256[1] = _mm256_add_epi16(s_256[1], s_256[6]);
1937
35.5k
  a_256[2] = _mm256_add_epi16(s_256[2], s_256[5]);
1938
35.5k
  a_256[3] = _mm256_add_epi16(s_256[3], s_256[4]);
1939
35.5k
  ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
1940
35.5k
  ss_256[1] = _mm256_unpacklo_epi16(a_256[2], a_256[3]);
1941
35.5k
  ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
1942
35.5k
  ss_256[3] = _mm256_unpackhi_epi16(a_256[2], a_256[3]);
1943
1944
35.5k
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 0);
1945
1946
35.5k
  a_256[1] = _mm256_add_epi16(s_256[2], s_256[7]);
1947
35.5k
  a_256[2] = _mm256_add_epi16(s_256[3], s_256[6]);
1948
35.5k
  a_256[3] = _mm256_add_epi16(s_256[4], s_256[5]);
1949
35.5k
  s_256[0] = s_256[2];
1950
35.5k
  s_256[2] = s_256[4];
1951
35.5k
  s_256[4] = s_256[6];
1952
35.5k
  s_256[6] = _mm256_loadu_si256((__m256i *)(src + 8 * stride));
1953
1954
35.5k
  a_256[0] = _mm256_add_epi16(s_256[1], s_256[6]);
1955
35.5k
  s_256[1] = s_256[3];
1956
35.5k
  s_256[3] = s_256[5];
1957
35.5k
  s_256[5] = s_256[7];
1958
35.5k
  ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
1959
35.5k
  ss_256[1] = _mm256_unpacklo_epi16(a_256[2], a_256[3]);
1960
35.5k
  ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
1961
35.5k
  ss_256[3] = _mm256_unpackhi_epi16(a_256[2], a_256[3]);
1962
1963
35.5k
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 2);
1964
35.5k
}
convolve_2d_avx2.c:xy_y_convolve_8tap_16x2_half_pel_avx2
Line
Count
Source
1931
35.5k
    __m256i s_256[8], __m256i r[4]) {
1932
35.5k
  __m256i a_256[4], ss_256[4];
1933
35.5k
  s_256[7] = _mm256_loadu_si256((__m256i *)(src + 7 * stride));
1934
1935
35.5k
  a_256[0] = _mm256_add_epi16(s_256[0], s_256[7]);
1936
35.5k
  a_256[1] = _mm256_add_epi16(s_256[1], s_256[6]);
1937
35.5k
  a_256[2] = _mm256_add_epi16(s_256[2], s_256[5]);
1938
35.5k
  a_256[3] = _mm256_add_epi16(s_256[3], s_256[4]);
1939
35.5k
  ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
1940
35.5k
  ss_256[1] = _mm256_unpacklo_epi16(a_256[2], a_256[3]);
1941
35.5k
  ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
1942
35.5k
  ss_256[3] = _mm256_unpackhi_epi16(a_256[2], a_256[3]);
1943
1944
35.5k
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 0);
1945
1946
35.5k
  a_256[1] = _mm256_add_epi16(s_256[2], s_256[7]);
1947
35.5k
  a_256[2] = _mm256_add_epi16(s_256[3], s_256[6]);
1948
35.5k
  a_256[3] = _mm256_add_epi16(s_256[4], s_256[5]);
1949
35.5k
  s_256[0] = s_256[2];
1950
35.5k
  s_256[2] = s_256[4];
1951
35.5k
  s_256[4] = s_256[6];
1952
35.5k
  s_256[6] = _mm256_loadu_si256((__m256i *)(src + 8 * stride));
1953
1954
35.5k
  a_256[0] = _mm256_add_epi16(s_256[1], s_256[6]);
1955
35.5k
  s_256[1] = s_256[3];
1956
35.5k
  s_256[3] = s_256[5];
1957
35.5k
  s_256[5] = s_256[7];
1958
35.5k
  ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
1959
35.5k
  ss_256[1] = _mm256_unpacklo_epi16(a_256[2], a_256[3]);
1960
35.5k
  ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
1961
35.5k
  ss_256[3] = _mm256_unpackhi_epi16(a_256[2], a_256[3]);
1962
1963
35.5k
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 2);
1964
35.5k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_8tap_16x2_half_pel_avx2
1965
1966
static INLINE void xy_y_round_store_8x2_avx2(const __m256i res[2],
1967
                                             uint8_t *const dst,
1968
5.02M
                                             const ptrdiff_t stride) {
1969
5.02M
  const __m256i r = xy_y_round_16_avx2(res);
1970
5.02M
  pack_store_8x2_avx2(r, dst, stride);
1971
5.02M
}
convolve_2d_avx2.c:xy_y_round_store_8x2_avx2
Line
Count
Source
1968
5.02M
                                             const ptrdiff_t stride) {
1969
5.02M
  const __m256i r = xy_y_round_16_avx2(res);
1970
5.02M
  pack_store_8x2_avx2(r, dst, stride);
1971
5.02M
}
Unexecuted instantiation: convolve_avx2.c:xy_y_round_store_8x2_avx2
1972
1973
static INLINE void xy_y_round_store_16x2_avx2(const __m256i res[4],
1974
                                              uint8_t *const dst,
1975
3.28M
                                              const ptrdiff_t stride) {
1976
3.28M
  const __m256i r0 = xy_y_round_16_avx2(res + 0);
1977
3.28M
  const __m256i r1 = xy_y_round_16_avx2(res + 2);
1978
3.28M
  xy_y_pack_store_16x2_avx2(r0, r1, dst, stride);
1979
3.28M
}
convolve_2d_avx2.c:xy_y_round_store_16x2_avx2
Line
Count
Source
1975
3.28M
                                              const ptrdiff_t stride) {
1976
3.28M
  const __m256i r0 = xy_y_round_16_avx2(res + 0);
1977
3.28M
  const __m256i r1 = xy_y_round_16_avx2(res + 2);
1978
3.28M
  xy_y_pack_store_16x2_avx2(r0, r1, dst, stride);
1979
3.28M
}
Unexecuted instantiation: convolve_avx2.c:xy_y_round_store_16x2_avx2
1980
1981
static INLINE void sr_y_round_store_32_avx2(const __m256i res[2],
1982
3.01M
                                            uint8_t *const dst) {
1983
3.01M
  __m256i r[2];
1984
1985
3.01M
  r[0] = sr_y_round_avx2(res[0]);
1986
3.01M
  r[1] = sr_y_round_avx2(res[1]);
1987
3.01M
  convolve_store_32_avx2(r[0], r[1], dst);
1988
3.01M
}
Unexecuted instantiation: convolve_2d_avx2.c:sr_y_round_store_32_avx2
convolve_avx2.c:sr_y_round_store_32_avx2
Line
Count
Source
1982
3.01M
                                            uint8_t *const dst) {
1983
3.01M
  __m256i r[2];
1984
1985
3.01M
  r[0] = sr_y_round_avx2(res[0]);
1986
3.01M
  r[1] = sr_y_round_avx2(res[1]);
1987
3.01M
  convolve_store_32_avx2(r[0], r[1], dst);
1988
3.01M
}
1989
1990
static INLINE void sr_y_round_store_32x2_avx2(const __m256i res[4],
1991
                                              uint8_t *const dst,
1992
1.44M
                                              const int32_t dst_stride) {
1993
1.44M
  sr_y_round_store_32_avx2(res, dst);
1994
1.44M
  sr_y_round_store_32_avx2(res + 2, dst + dst_stride);
1995
1.44M
}
Unexecuted instantiation: convolve_2d_avx2.c:sr_y_round_store_32x2_avx2
convolve_avx2.c:sr_y_round_store_32x2_avx2
Line
Count
Source
1992
1.44M
                                              const int32_t dst_stride) {
1993
1.44M
  sr_y_round_store_32_avx2(res, dst);
1994
1.44M
  sr_y_round_store_32_avx2(res + 2, dst + dst_stride);
1995
1.44M
}
1996
1997
static INLINE void sr_y_2tap_32_avx2(const uint8_t *const src,
1998
                                     const __m256i coeffs[1], const __m256i s0,
1999
129k
                                     __m256i *const s1, uint8_t *const dst) {
2000
129k
  __m256i r[2];
2001
129k
  y_convolve_2tap_32_avx2(src, coeffs, s0, s1, r);
2002
129k
  sr_y_round_store_32_avx2(r, dst);
2003
129k
}
Unexecuted instantiation: convolve_2d_avx2.c:sr_y_2tap_32_avx2
convolve_avx2.c:sr_y_2tap_32_avx2
Line
Count
Source
1999
129k
                                     __m256i *const s1, uint8_t *const dst) {
2000
129k
  __m256i r[2];
2001
129k
  y_convolve_2tap_32_avx2(src, coeffs, s0, s1, r);
2002
129k
  sr_y_round_store_32_avx2(r, dst);
2003
129k
}
2004
2005
static AOM_FORCE_INLINE void av1_convolve_y_sr_specialized_avx2(
2006
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
2007
    int32_t w, int32_t h, const InterpFilterParams *filter_params_y,
2008
1.39M
    const int32_t subpel_y_q4) {
2009
1.39M
  int32_t x, y;
2010
1.39M
  __m128i coeffs_128[4];
2011
1.39M
  __m256i coeffs_256[4];
2012
2013
1.39M
  int vert_tap = get_filter_tap(filter_params_y, subpel_y_q4);
2014
2015
1.39M
  if (vert_tap == 2) {
2016
    // vert_filt as 2 tap
2017
45.6k
    const uint8_t *src_ptr = src;
2018
2019
45.6k
    y = h;
2020
2021
45.6k
    if (subpel_y_q4 != 8) {
2022
22.8k
      if (w <= 8) {
2023
18.1k
        prepare_half_coeffs_2tap_ssse3(filter_params_y, subpel_y_q4,
2024
18.1k
                                       coeffs_128);
2025
2026
18.1k
        if (w == 2) {
2027
3.00k
          __m128i s_16[2];
2028
2029
3.00k
          s_16[0] = _mm_cvtsi32_si128(*(int16_t *)src_ptr);
2030
2031
6.89k
          do {
2032
6.89k
            const __m128i res = y_convolve_2tap_2x2_ssse3(src_ptr, src_stride,
2033
6.89k
                                                          coeffs_128, s_16);
2034
6.89k
            const __m128i r = sr_y_round_sse2(res);
2035
6.89k
            pack_store_2x2_sse2(r, dst, dst_stride);
2036
6.89k
            src_ptr += 2 * src_stride;
2037
6.89k
            dst += 2 * dst_stride;
2038
6.89k
            y -= 2;
2039
6.89k
          } while (y);
2040
15.0k
        } else if (w == 4) {
2041
8.50k
          __m128i s_32[2];
2042
2043
8.50k
          s_32[0] = _mm_cvtsi32_si128(*(int32_t *)src_ptr);
2044
2045
26.1k
          do {
2046
26.1k
            const __m128i res = y_convolve_2tap_4x2_ssse3(src_ptr, src_stride,
2047
26.1k
                                                          coeffs_128, s_32);
2048
26.1k
            const __m128i r = sr_y_round_sse2(res);
2049
26.1k
            pack_store_4x2_sse2(r, dst, dst_stride);
2050
26.1k
            src_ptr += 2 * src_stride;
2051
26.1k
            dst += 2 * dst_stride;
2052
26.1k
            y -= 2;
2053
26.1k
          } while (y);
2054
8.50k
        } else {
2055
6.58k
          __m128i s_64[2], s_128[2];
2056
2057
6.58k
          assert(w == 8);
2058
2059
0
          s_64[0] = _mm_loadl_epi64((__m128i *)src_ptr);
2060
2061
21.5k
          do {
2062
            // Note: Faster than binding to AVX2 registers.
2063
21.5k
            s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + src_stride));
2064
21.5k
            s_128[0] = _mm_unpacklo_epi64(s_64[0], s_64[1]);
2065
21.5k
            s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
2066
21.5k
            s_128[1] = _mm_unpacklo_epi64(s_64[1], s_64[0]);
2067
21.5k
            const __m128i ss0 = _mm_unpacklo_epi8(s_128[0], s_128[1]);
2068
21.5k
            const __m128i ss1 = _mm_unpackhi_epi8(s_128[0], s_128[1]);
2069
21.5k
            const __m128i res0 = convolve_2tap_ssse3(&ss0, coeffs_128);
2070
21.5k
            const __m128i res1 = convolve_2tap_ssse3(&ss1, coeffs_128);
2071
21.5k
            const __m128i r0 = sr_y_round_sse2(res0);
2072
21.5k
            const __m128i r1 = sr_y_round_sse2(res1);
2073
21.5k
            const __m128i d = _mm_packus_epi16(r0, r1);
2074
21.5k
            _mm_storel_epi64((__m128i *)dst, d);
2075
21.5k
            _mm_storeh_epi64((__m128i *)(dst + dst_stride), d);
2076
21.5k
            src_ptr += 2 * src_stride;
2077
21.5k
            dst += 2 * dst_stride;
2078
21.5k
            y -= 2;
2079
21.5k
          } while (y);
2080
6.58k
        }
2081
18.1k
      } else {
2082
4.77k
        prepare_half_coeffs_2tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
2083
2084
4.77k
        if (w == 16) {
2085
3.03k
          __m128i s_128[2];
2086
2087
3.03k
          s_128[0] = _mm_loadu_si128((__m128i *)src_ptr);
2088
2089
15.6k
          do {
2090
15.6k
            __m256i r[2];
2091
2092
15.6k
            y_convolve_2tap_16x2_avx2(src_ptr, src_stride, coeffs_256, s_128,
2093
15.6k
                                      r);
2094
15.6k
            sr_y_round_store_16x2_avx2(r, dst, dst_stride);
2095
15.6k
            src_ptr += 2 * src_stride;
2096
15.6k
            dst += 2 * dst_stride;
2097
15.6k
            y -= 2;
2098
15.6k
          } while (y);
2099
3.03k
        } else if (w == 32) {
2100
954
          __m256i s_256[2];
2101
2102
954
          s_256[0] = _mm256_loadu_si256((__m256i *)src_ptr);
2103
2104
9.45k
          do {
2105
9.45k
            sr_y_2tap_32_avx2(src_ptr + src_stride, coeffs_256, s_256[0],
2106
9.45k
                              &s_256[1], dst);
2107
9.45k
            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride, coeffs_256, s_256[1],
2108
9.45k
                              &s_256[0], dst + dst_stride);
2109
9.45k
            src_ptr += 2 * src_stride;
2110
9.45k
            dst += 2 * dst_stride;
2111
9.45k
            y -= 2;
2112
9.45k
          } while (y);
2113
954
        } else if (w == 64) {
2114
645
          __m256i s_256[2][2];
2115
2116
645
          s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
2117
645
          s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));
2118
2119
15.0k
          do {
2120
15.0k
            sr_y_2tap_32_avx2(src_ptr + src_stride, coeffs_256, s_256[0][0],
2121
15.0k
                              &s_256[1][0], dst);
2122
15.0k
            sr_y_2tap_32_avx2(src_ptr + src_stride + 32, coeffs_256,
2123
15.0k
                              s_256[0][1], &s_256[1][1], dst + 32);
2124
15.0k
            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride, coeffs_256, s_256[1][0],
2125
15.0k
                              &s_256[0][0], dst + dst_stride);
2126
15.0k
            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride + 32, coeffs_256,
2127
15.0k
                              s_256[1][1], &s_256[0][1], dst + dst_stride + 32);
2128
2129
15.0k
            src_ptr += 2 * src_stride;
2130
15.0k
            dst += 2 * dst_stride;
2131
15.0k
            y -= 2;
2132
15.0k
          } while (y);
2133
645
        } else {
2134
138
          __m256i s_256[2][4];
2135
2136
138
          assert(w == 128);
2137
2138
0
          s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
2139
138
          s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));
2140
138
          s_256[0][2] = _mm256_loadu_si256((__m256i *)(src_ptr + 2 * 32));
2141
138
          s_256[0][3] = _mm256_loadu_si256((__m256i *)(src_ptr + 3 * 32));
2142
2143
6.27k
          do {
2144
6.27k
            sr_y_2tap_32_avx2(src_ptr + src_stride, coeffs_256, s_256[0][0],
2145
6.27k
                              &s_256[1][0], dst);
2146
6.27k
            sr_y_2tap_32_avx2(src_ptr + src_stride + 1 * 32, coeffs_256,
2147
6.27k
                              s_256[0][1], &s_256[1][1], dst + 1 * 32);
2148
6.27k
            sr_y_2tap_32_avx2(src_ptr + src_stride + 2 * 32, coeffs_256,
2149
6.27k
                              s_256[0][2], &s_256[1][2], dst + 2 * 32);
2150
6.27k
            sr_y_2tap_32_avx2(src_ptr + src_stride + 3 * 32, coeffs_256,
2151
6.27k
                              s_256[0][3], &s_256[1][3], dst + 3 * 32);
2152
2153
6.27k
            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride, coeffs_256, s_256[1][0],
2154
6.27k
                              &s_256[0][0], dst + dst_stride);
2155
6.27k
            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride + 1 * 32, coeffs_256,
2156
6.27k
                              s_256[1][1], &s_256[0][1],
2157
6.27k
                              dst + dst_stride + 1 * 32);
2158
6.27k
            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride + 2 * 32, coeffs_256,
2159
6.27k
                              s_256[1][2], &s_256[0][2],
2160
6.27k
                              dst + dst_stride + 2 * 32);
2161
6.27k
            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride + 3 * 32, coeffs_256,
2162
6.27k
                              s_256[1][3], &s_256[0][3],
2163
6.27k
                              dst + dst_stride + 3 * 32);
2164
2165
6.27k
            src_ptr += 2 * src_stride;
2166
6.27k
            dst += 2 * dst_stride;
2167
6.27k
            y -= 2;
2168
6.27k
          } while (y);
2169
138
        }
2170
4.77k
      }
2171
22.8k
    } else {
2172
      // average to get half pel
2173
22.7k
      if (w <= 8) {
2174
19.3k
        if (w == 2) {
2175
3.94k
          __m128i s_16[2];
2176
2177
3.94k
          s_16[0] = _mm_cvtsi32_si128(*(int16_t *)src_ptr);
2178
2179
10.4k
          do {
2180
10.4k
            s_16[1] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + src_stride));
2181
10.4k
            const __m128i d0 = _mm_avg_epu8(s_16[0], s_16[1]);
2182
10.4k
            *(int16_t *)dst = (int16_t)_mm_cvtsi128_si32(d0);
2183
10.4k
            s_16[0] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 2 * src_stride));
2184
10.4k
            const __m128i d1 = _mm_avg_epu8(s_16[1], s_16[0]);
2185
10.4k
            *(int16_t *)(dst + dst_stride) = (int16_t)_mm_cvtsi128_si32(d1);
2186
10.4k
            src_ptr += 2 * src_stride;
2187
10.4k
            dst += 2 * dst_stride;
2188
10.4k
            y -= 2;
2189
10.4k
          } while (y);
2190
15.3k
        } else if (w == 4) {
2191
9.12k
          __m128i s_32[2];
2192
2193
9.12k
          s_32[0] = _mm_cvtsi32_si128(*(int32_t *)src_ptr);
2194
2195
25.8k
          do {
2196
25.8k
            s_32[1] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + src_stride));
2197
25.8k
            const __m128i d0 = _mm_avg_epu8(s_32[0], s_32[1]);
2198
25.8k
            xx_storel_32(dst, d0);
2199
25.8k
            s_32[0] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 2 * src_stride));
2200
25.8k
            const __m128i d1 = _mm_avg_epu8(s_32[1], s_32[0]);
2201
25.8k
            xx_storel_32(dst + dst_stride, d1);
2202
25.8k
            src_ptr += 2 * src_stride;
2203
25.8k
            dst += 2 * dst_stride;
2204
25.8k
            y -= 2;
2205
25.8k
          } while (y);
2206
9.12k
        } else {
2207
6.23k
          __m128i s_64[2];
2208
2209
6.23k
          assert(w == 8);
2210
2211
0
          s_64[0] = _mm_loadl_epi64((__m128i *)src_ptr);
2212
2213
21.5k
          do {
2214
            // Note: Faster than binding to AVX2 registers.
2215
21.5k
            s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + src_stride));
2216
21.5k
            const __m128i d0 = _mm_avg_epu8(s_64[0], s_64[1]);
2217
21.5k
            _mm_storel_epi64((__m128i *)dst, d0);
2218
21.5k
            s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
2219
21.5k
            const __m128i d1 = _mm_avg_epu8(s_64[1], s_64[0]);
2220
21.5k
            _mm_storel_epi64((__m128i *)(dst + dst_stride), d1);
2221
21.5k
            src_ptr += 2 * src_stride;
2222
21.5k
            dst += 2 * dst_stride;
2223
21.5k
            y -= 2;
2224
21.5k
          } while (y);
2225
6.23k
        }
2226
19.3k
      } else if (w == 16) {
2227
2.42k
        __m128i s_128[2];
2228
2229
2.42k
        s_128[0] = _mm_loadu_si128((__m128i *)src_ptr);
2230
2231
12.0k
        do {
2232
12.0k
          s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + src_stride));
2233
12.0k
          const __m128i d0 = _mm_avg_epu8(s_128[0], s_128[1]);
2234
12.0k
          _mm_storeu_si128((__m128i *)dst, d0);
2235
12.0k
          s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
2236
12.0k
          const __m128i d1 = _mm_avg_epu8(s_128[1], s_128[0]);
2237
12.0k
          _mm_storeu_si128((__m128i *)(dst + dst_stride), d1);
2238
12.0k
          src_ptr += 2 * src_stride;
2239
12.0k
          dst += 2 * dst_stride;
2240
12.0k
          y -= 2;
2241
12.0k
        } while (y);
2242
2.42k
      } else if (w == 32) {
2243
623
        __m256i s_256[2];
2244
2245
623
        s_256[0] = _mm256_loadu_si256((__m256i *)src_ptr);
2246
2247
6.20k
        do {
2248
6.20k
          sr_y_2tap_32_avg_avx2(src_ptr + src_stride, s_256[0], &s_256[1], dst);
2249
6.20k
          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride, s_256[1], &s_256[0],
2250
6.20k
                                dst + dst_stride);
2251
6.20k
          src_ptr += 2 * src_stride;
2252
6.20k
          dst += 2 * dst_stride;
2253
6.20k
          y -= 2;
2254
6.20k
        } while (y);
2255
623
      } else if (w == 64) {
2256
295
        __m256i s_256[2][2];
2257
2258
295
        s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
2259
295
        s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));
2260
2261
6.85k
        do {
2262
6.85k
          sr_y_2tap_32_avg_avx2(src_ptr + src_stride, s_256[0][0], &s_256[1][0],
2263
6.85k
                                dst);
2264
6.85k
          sr_y_2tap_32_avg_avx2(src_ptr + src_stride + 32, s_256[0][1],
2265
6.85k
                                &s_256[1][1], dst + 32);
2266
2267
6.85k
          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride, s_256[1][0],
2268
6.85k
                                &s_256[0][0], dst + dst_stride);
2269
6.85k
          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride + 32, s_256[1][1],
2270
6.85k
                                &s_256[0][1], dst + dst_stride + 32);
2271
2272
6.85k
          src_ptr += 2 * src_stride;
2273
6.85k
          dst += 2 * dst_stride;
2274
6.85k
          y -= 2;
2275
6.85k
        } while (y);
2276
295
      } else {
2277
136
        __m256i s_256[2][4];
2278
2279
136
        assert(w == 128);
2280
2281
0
        s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
2282
136
        s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));
2283
136
        s_256[0][2] = _mm256_loadu_si256((__m256i *)(src_ptr + 2 * 32));
2284
136
        s_256[0][3] = _mm256_loadu_si256((__m256i *)(src_ptr + 3 * 32));
2285
2286
7.36k
        do {
2287
7.36k
          sr_y_2tap_32_avg_avx2(src_ptr + src_stride, s_256[0][0], &s_256[1][0],
2288
7.36k
                                dst);
2289
7.36k
          sr_y_2tap_32_avg_avx2(src_ptr + src_stride + 1 * 32, s_256[0][1],
2290
7.36k
                                &s_256[1][1], dst + 1 * 32);
2291
7.36k
          sr_y_2tap_32_avg_avx2(src_ptr + src_stride + 2 * 32, s_256[0][2],
2292
7.36k
                                &s_256[1][2], dst + 2 * 32);
2293
7.36k
          sr_y_2tap_32_avg_avx2(src_ptr + src_stride + 3 * 32, s_256[0][3],
2294
7.36k
                                &s_256[1][3], dst + 3 * 32);
2295
2296
7.36k
          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride, s_256[1][0],
2297
7.36k
                                &s_256[0][0], dst + dst_stride);
2298
7.36k
          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride + 1 * 32, s_256[1][1],
2299
7.36k
                                &s_256[0][1], dst + dst_stride + 1 * 32);
2300
7.36k
          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride + 2 * 32, s_256[1][2],
2301
7.36k
                                &s_256[0][2], dst + dst_stride + 2 * 32);
2302
7.36k
          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride + 3 * 32, s_256[1][3],
2303
7.36k
                                &s_256[0][3], dst + dst_stride + 3 * 32);
2304
2305
7.36k
          src_ptr += 2 * src_stride;
2306
7.36k
          dst += 2 * dst_stride;
2307
7.36k
          y -= 2;
2308
7.36k
        } while (y);
2309
136
      }
2310
22.7k
    }
2311
1.35M
  } else if (vert_tap == 4) {
2312
    // vert_filt as 4 tap
2313
654k
    const uint8_t *src_ptr = src - src_stride;
2314
2315
654k
    y = h;
2316
2317
654k
    if (w <= 4) {
2318
344k
      prepare_half_coeffs_4tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);
2319
2320
344k
      if (w == 2) {
2321
67.6k
        __m128i s_16[4], ss_128[2];
2322
2323
67.6k
        s_16[0] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 0 * src_stride));
2324
67.6k
        s_16[1] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 1 * src_stride));
2325
67.6k
        s_16[2] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 2 * src_stride));
2326
2327
67.6k
        const __m128i src01 = _mm_unpacklo_epi16(s_16[0], s_16[1]);
2328
67.6k
        const __m128i src12 = _mm_unpacklo_epi16(s_16[1], s_16[2]);
2329
2330
67.6k
        ss_128[0] = _mm_unpacklo_epi8(src01, src12);
2331
2332
112k
        do {
2333
112k
          src_ptr += 2 * src_stride;
2334
112k
          const __m128i res = y_convolve_4tap_2x2_ssse3(
2335
112k
              src_ptr, src_stride, coeffs_128, s_16, ss_128);
2336
112k
          const __m128i r = sr_y_round_sse2(res);
2337
112k
          pack_store_2x2_sse2(r, dst, dst_stride);
2338
2339
112k
          ss_128[0] = ss_128[1];
2340
112k
          dst += 2 * dst_stride;
2341
112k
          y -= 2;
2342
112k
        } while (y);
2343
276k
      } else {
2344
276k
        __m128i s_32[4], ss_128[2];
2345
2346
276k
        assert(w == 4);
2347
2348
0
        s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 0 * src_stride));
2349
276k
        s_32[1] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 1 * src_stride));
2350
276k
        s_32[2] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 2 * src_stride));
2351
2352
276k
        const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
2353
276k
        const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
2354
2355
276k
        ss_128[0] = _mm_unpacklo_epi8(src01, src12);
2356
2357
550k
        do {
2358
550k
          src_ptr += 2 * src_stride;
2359
550k
          const __m128i res = y_convolve_4tap_4x2_ssse3(
2360
550k
              src_ptr, src_stride, coeffs_128, s_32, ss_128);
2361
550k
          const __m128i r = sr_y_round_sse2(res);
2362
550k
          pack_store_4x2_sse2(r, dst, dst_stride);
2363
2364
550k
          ss_128[0] = ss_128[1];
2365
550k
          dst += 2 * dst_stride;
2366
550k
          y -= 2;
2367
550k
        } while (y);
2368
276k
      }
2369
344k
    } else {
2370
309k
      prepare_half_coeffs_4tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
2371
2372
309k
      if (w == 8) {
2373
218k
        __m128i s_64[4];
2374
218k
        __m256i ss_256[2];
2375
2376
218k
        s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 0 * src_stride));
2377
218k
        s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + 1 * src_stride));
2378
218k
        s_64[2] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
2379
2380
        // Load lines a and b. Line a to lower 128, line b to upper 128
2381
218k
        const __m256i src01 = _mm256_setr_m128i(s_64[0], s_64[1]);
2382
218k
        const __m256i src12 = _mm256_setr_m128i(s_64[1], s_64[2]);
2383
2384
218k
        ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
2385
2386
438k
        do {
2387
438k
          src_ptr += 2 * src_stride;
2388
438k
          const __m256i res = y_convolve_4tap_8x2_avx2(
2389
438k
              src_ptr, src_stride, coeffs_256, s_64, ss_256);
2390
438k
          sr_y_round_store_8x2_avx2(res, dst, dst_stride);
2391
2392
438k
          ss_256[0] = ss_256[1];
2393
438k
          dst += 2 * dst_stride;
2394
438k
          y -= 2;
2395
438k
        } while (y);
2396
218k
      } else if (w == 16) {
2397
84.3k
        __m128i s_128[4];
2398
84.3k
        __m256i ss_256[4], r[2];
2399
2400
84.3k
        s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 0 * src_stride));
2401
84.3k
        s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + 1 * src_stride));
2402
84.3k
        s_128[2] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
2403
2404
        // Load lines a and b. Line a to lower 128, line b to upper 128
2405
84.3k
        const __m256i src01 = _mm256_setr_m128i(s_128[0], s_128[1]);
2406
84.3k
        const __m256i src12 = _mm256_setr_m128i(s_128[1], s_128[2]);
2407
2408
84.3k
        ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
2409
84.3k
        ss_256[2] = _mm256_unpackhi_epi8(src01, src12);
2410
2411
212k
        do {
2412
212k
          src_ptr += 2 * src_stride;
2413
212k
          y_convolve_4tap_16x2_avx2(src_ptr, src_stride, coeffs_256, s_128,
2414
212k
                                    ss_256, r);
2415
212k
          sr_y_round_store_16x2_avx2(r, dst, dst_stride);
2416
2417
212k
          ss_256[0] = ss_256[1];
2418
212k
          ss_256[2] = ss_256[3];
2419
212k
          dst += 2 * dst_stride;
2420
212k
          y -= 2;
2421
212k
        } while (y);
2422
84.3k
      } else if (w == 32) {
2423
        // AV1 standard won't have 32x4 case.
2424
        // This only favors some optimization feature which
2425
        // subsamples 32x8 to 32x4 and triggers 4-tap filter.
2426
2427
5.13k
        __m256i s_256[4], ss_256[4], tt_256[4], r[4];
2428
2429
5.13k
        s_256[0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * src_stride));
2430
5.13k
        s_256[1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * src_stride));
2431
5.13k
        s_256[2] = _mm256_loadu_si256((__m256i *)(src_ptr + 2 * src_stride));
2432
2433
5.13k
        ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
2434
5.13k
        ss_256[2] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
2435
2436
5.13k
        tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
2437
5.13k
        tt_256[2] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
2438
2439
47.2k
        do {
2440
47.2k
          src_ptr += 2 * src_stride;
2441
47.2k
          y_convolve_4tap_32x2_avx2(src_ptr, src_stride, coeffs_256, s_256,
2442
47.2k
                                    ss_256, tt_256, r);
2443
47.2k
          sr_y_round_store_32x2_avx2(r, dst, dst_stride);
2444
2445
47.2k
          ss_256[0] = ss_256[1];
2446
47.2k
          ss_256[2] = ss_256[3];
2447
2448
47.2k
          tt_256[0] = tt_256[1];
2449
47.2k
          tt_256[2] = tt_256[3];
2450
47.2k
          dst += 2 * dst_stride;
2451
47.2k
          y -= 2;
2452
47.2k
        } while (y);
2453
5.13k
      } else {
2454
1.32k
        assert(!(w % 32));
2455
2456
0
        __m256i s_256[4], ss_256[4], tt_256[4], r[4];
2457
1.32k
        x = 0;
2458
2.96k
        do {
2459
2.96k
          const uint8_t *s = src_ptr + x;
2460
2.96k
          uint8_t *d = dst + x;
2461
2.96k
          s_256[0] = _mm256_loadu_si256((__m256i *)(s + 0 * src_stride));
2462
2.96k
          s_256[1] = _mm256_loadu_si256((__m256i *)(s + 1 * src_stride));
2463
2.96k
          s_256[2] = _mm256_loadu_si256((__m256i *)(s + 2 * src_stride));
2464
2465
2.96k
          ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
2466
2.96k
          ss_256[2] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
2467
2468
2.96k
          tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
2469
2.96k
          tt_256[2] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
2470
2471
2.96k
          y = h;
2472
94.0k
          do {
2473
94.0k
            s += 2 * src_stride;
2474
94.0k
            y_convolve_4tap_32x2_avx2(s, src_stride, coeffs_256, s_256, ss_256,
2475
94.0k
                                      tt_256, r);
2476
94.0k
            sr_y_round_store_32x2_avx2(r, d, dst_stride);
2477
2478
94.0k
            ss_256[0] = ss_256[1];
2479
94.0k
            ss_256[2] = ss_256[3];
2480
2481
94.0k
            tt_256[0] = tt_256[1];
2482
94.0k
            tt_256[2] = tt_256[3];
2483
94.0k
            d += 2 * dst_stride;
2484
94.0k
            y -= 2;
2485
94.0k
          } while (y);
2486
2.96k
          x += 32;
2487
2.96k
        } while (x < w);
2488
1.32k
      }
2489
309k
    }
2490
698k
  } else if (vert_tap == 6) {
2491
    // vert_filt as 6 tap
2492
667k
    const uint8_t *src_ptr = src - 2 * src_stride;
2493
2494
667k
    if (w <= 4) {
2495
227k
      prepare_half_coeffs_6tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);
2496
2497
227k
      y = h;
2498
2499
227k
      if (w == 2) {
2500
34.0k
        __m128i s_16[6], ss_128[3];
2501
2502
34.0k
        s_16[0] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 0 * src_stride));
2503
34.0k
        s_16[1] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 1 * src_stride));
2504
34.0k
        s_16[2] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 2 * src_stride));
2505
34.0k
        s_16[3] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 3 * src_stride));
2506
34.0k
        s_16[4] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 4 * src_stride));
2507
2508
34.0k
        const __m128i src01 = _mm_unpacklo_epi16(s_16[0], s_16[1]);
2509
34.0k
        const __m128i src12 = _mm_unpacklo_epi16(s_16[1], s_16[2]);
2510
34.0k
        const __m128i src23 = _mm_unpacklo_epi16(s_16[2], s_16[3]);
2511
34.0k
        const __m128i src34 = _mm_unpacklo_epi16(s_16[3], s_16[4]);
2512
2513
34.0k
        ss_128[0] = _mm_unpacklo_epi8(src01, src12);
2514
34.0k
        ss_128[1] = _mm_unpacklo_epi8(src23, src34);
2515
2516
136k
        do {
2517
136k
          src_ptr += 2 * src_stride;
2518
136k
          const __m128i res = y_convolve_6tap_2x2_ssse3(
2519
136k
              src_ptr, src_stride, coeffs_128, s_16, ss_128);
2520
136k
          const __m128i r = sr_y_round_sse2(res);
2521
136k
          pack_store_2x2_sse2(r, dst, dst_stride);
2522
2523
136k
          ss_128[0] = ss_128[1];
2524
136k
          ss_128[1] = ss_128[2];
2525
136k
          dst += 2 * dst_stride;
2526
136k
          y -= 2;
2527
136k
        } while (y);
2528
193k
      } else {
2529
193k
        __m128i s_32[6], ss_128[3];
2530
2531
193k
        assert(w == 4);
2532
2533
0
        s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 0 * src_stride));
2534
193k
        s_32[1] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 1 * src_stride));
2535
193k
        s_32[2] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 2 * src_stride));
2536
193k
        s_32[3] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 3 * src_stride));
2537
193k
        s_32[4] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 4 * src_stride));
2538
2539
193k
        const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
2540
193k
        const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
2541
193k
        const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
2542
193k
        const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[4]);
2543
2544
193k
        ss_128[0] = _mm_unpacklo_epi8(src01, src12);
2545
193k
        ss_128[1] = _mm_unpacklo_epi8(src23, src34);
2546
2547
1.03M
        do {
2548
1.03M
          src_ptr += 2 * src_stride;
2549
1.03M
          const __m128i res = y_convolve_6tap_4x2_ssse3(
2550
1.03M
              src_ptr, src_stride, coeffs_128, s_32, ss_128);
2551
1.03M
          const __m128i r = sr_y_round_sse2(res);
2552
1.03M
          pack_store_4x2_sse2(r, dst, dst_stride);
2553
2554
1.03M
          ss_128[0] = ss_128[1];
2555
1.03M
          ss_128[1] = ss_128[2];
2556
1.03M
          dst += 2 * dst_stride;
2557
1.03M
          y -= 2;
2558
1.03M
        } while (y);
2559
193k
      }
2560
440k
    } else {
2561
440k
      prepare_half_coeffs_6tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
2562
2563
440k
      if (w == 8) {
2564
231k
        __m128i s_64[6];
2565
231k
        __m256i ss_256[3];
2566
2567
231k
        s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 0 * src_stride));
2568
231k
        s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + 1 * src_stride));
2569
231k
        s_64[2] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
2570
231k
        s_64[3] = _mm_loadl_epi64((__m128i *)(src_ptr + 3 * src_stride));
2571
231k
        s_64[4] = _mm_loadl_epi64((__m128i *)(src_ptr + 4 * src_stride));
2572
2573
        // Load lines a and b. Line a to lower 128, line b to upper 128
2574
231k
        const __m256i src01 = _mm256_setr_m128i(s_64[0], s_64[1]);
2575
231k
        const __m256i src12 = _mm256_setr_m128i(s_64[1], s_64[2]);
2576
231k
        const __m256i src23 = _mm256_setr_m128i(s_64[2], s_64[3]);
2577
231k
        const __m256i src34 = _mm256_setr_m128i(s_64[3], s_64[4]);
2578
2579
231k
        ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
2580
231k
        ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
2581
2582
231k
        y = h;
2583
1.33M
        do {
2584
1.33M
          src_ptr += 2 * src_stride;
2585
1.33M
          const __m256i res = y_convolve_6tap_8x2_avx2(
2586
1.33M
              src_ptr, src_stride, coeffs_256, s_64, ss_256);
2587
1.33M
          sr_y_round_store_8x2_avx2(res, dst, dst_stride);
2588
2589
1.33M
          ss_256[0] = ss_256[1];
2590
1.33M
          ss_256[1] = ss_256[2];
2591
1.33M
          dst += 2 * dst_stride;
2592
1.33M
          y -= 2;
2593
1.33M
        } while (y);
2594
231k
      } else if (w == 16) {
2595
150k
        __m128i s_128[6];
2596
150k
        __m256i ss_256[6], r[2];
2597
2598
150k
        s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 0 * src_stride));
2599
150k
        s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + 1 * src_stride));
2600
150k
        s_128[2] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
2601
150k
        s_128[3] = _mm_loadu_si128((__m128i *)(src_ptr + 3 * src_stride));
2602
150k
        s_128[4] = _mm_loadu_si128((__m128i *)(src_ptr + 4 * src_stride));
2603
2604
        // Load lines a and b. Line a to lower 128, line b to upper 128
2605
150k
        const __m256i src01 = _mm256_setr_m128i(s_128[0], s_128[1]);
2606
150k
        const __m256i src12 = _mm256_setr_m128i(s_128[1], s_128[2]);
2607
150k
        const __m256i src23 = _mm256_setr_m128i(s_128[2], s_128[3]);
2608
150k
        const __m256i src34 = _mm256_setr_m128i(s_128[3], s_128[4]);
2609
2610
150k
        ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
2611
150k
        ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
2612
2613
150k
        ss_256[3] = _mm256_unpackhi_epi8(src01, src12);
2614
150k
        ss_256[4] = _mm256_unpackhi_epi8(src23, src34);
2615
2616
150k
        y = h;
2617
1.08M
        do {
2618
1.08M
          src_ptr += 2 * src_stride;
2619
1.08M
          y_convolve_6tap_16x2_avx2(src_ptr, src_stride, coeffs_256, s_128,
2620
1.08M
                                    ss_256, r);
2621
1.08M
          sr_y_round_store_16x2_avx2(r, dst, dst_stride);
2622
2623
1.08M
          ss_256[0] = ss_256[1];
2624
1.08M
          ss_256[1] = ss_256[2];
2625
2626
1.08M
          ss_256[3] = ss_256[4];
2627
1.08M
          ss_256[4] = ss_256[5];
2628
1.08M
          dst += 2 * dst_stride;
2629
1.08M
          y -= 2;
2630
1.08M
        } while (y);
2631
150k
      } else {
2632
59.5k
        __m256i s_256[6], ss_256[6], tt_256[6], r[4];
2633
2634
59.5k
        assert(!(w % 32));
2635
2636
0
        x = 0;
2637
71.6k
        do {
2638
71.6k
          const uint8_t *s = src_ptr + x;
2639
71.6k
          uint8_t *d = dst + x;
2640
2641
71.6k
          s_256[0] = _mm256_loadu_si256((__m256i *)(s + 0 * src_stride));
2642
71.6k
          s_256[1] = _mm256_loadu_si256((__m256i *)(s + 1 * src_stride));
2643
71.6k
          s_256[2] = _mm256_loadu_si256((__m256i *)(s + 2 * src_stride));
2644
71.6k
          s_256[3] = _mm256_loadu_si256((__m256i *)(s + 3 * src_stride));
2645
71.6k
          s_256[4] = _mm256_loadu_si256((__m256i *)(s + 4 * src_stride));
2646
2647
71.6k
          ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
2648
71.6k
          ss_256[1] = _mm256_unpacklo_epi8(s_256[2], s_256[3]);
2649
71.6k
          ss_256[3] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
2650
71.6k
          ss_256[4] = _mm256_unpackhi_epi8(s_256[2], s_256[3]);
2651
2652
71.6k
          tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
2653
71.6k
          tt_256[1] = _mm256_unpacklo_epi8(s_256[3], s_256[4]);
2654
71.6k
          tt_256[3] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
2655
71.6k
          tt_256[4] = _mm256_unpackhi_epi8(s_256[3], s_256[4]);
2656
2657
71.6k
          y = h;
2658
1.20M
          do {
2659
1.20M
            s += 2 * src_stride;
2660
1.20M
            y_convolve_6tap_32x2_avx2(s, src_stride, coeffs_256, s_256, ss_256,
2661
1.20M
                                      tt_256, r);
2662
1.20M
            sr_y_round_store_32x2_avx2(r, d, dst_stride);
2663
2664
1.20M
            ss_256[0] = ss_256[1];
2665
1.20M
            ss_256[1] = ss_256[2];
2666
1.20M
            ss_256[3] = ss_256[4];
2667
1.20M
            ss_256[4] = ss_256[5];
2668
2669
1.20M
            tt_256[0] = tt_256[1];
2670
1.20M
            tt_256[1] = tt_256[2];
2671
1.20M
            tt_256[3] = tt_256[4];
2672
1.20M
            tt_256[4] = tt_256[5];
2673
1.20M
            d += 2 * dst_stride;
2674
1.20M
            y -= 2;
2675
1.20M
          } while (y);
2676
2677
71.6k
          x += 32;
2678
71.6k
        } while (x < w);
2679
59.5k
      }
2680
440k
    }
2681
667k
  } else if (vert_tap == 8) {
2682
    // vert_filt as 8 tap
2683
31.1k
    const uint8_t *src_ptr = src - 3 * src_stride;
2684
2685
31.1k
    if (w <= 4) {
2686
11.8k
      prepare_half_coeffs_8tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);
2687
2688
11.8k
      y = h;
2689
2690
11.8k
      if (w == 2) {
2691
2.07k
        __m128i s_16[8], ss_128[4];
2692
2693
2.07k
        s_16[0] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 0 * src_stride));
2694
2.07k
        s_16[1] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 1 * src_stride));
2695
2.07k
        s_16[2] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 2 * src_stride));
2696
2.07k
        s_16[3] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 3 * src_stride));
2697
2.07k
        s_16[4] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 4 * src_stride));
2698
2.07k
        s_16[5] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 5 * src_stride));
2699
2.07k
        s_16[6] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 6 * src_stride));
2700
2701
2.07k
        const __m128i src01 = _mm_unpacklo_epi16(s_16[0], s_16[1]);
2702
2.07k
        const __m128i src12 = _mm_unpacklo_epi16(s_16[1], s_16[2]);
2703
2.07k
        const __m128i src23 = _mm_unpacklo_epi16(s_16[2], s_16[3]);
2704
2.07k
        const __m128i src34 = _mm_unpacklo_epi16(s_16[3], s_16[4]);
2705
2.07k
        const __m128i src45 = _mm_unpacklo_epi16(s_16[4], s_16[5]);
2706
2.07k
        const __m128i src56 = _mm_unpacklo_epi16(s_16[5], s_16[6]);
2707
2708
2.07k
        ss_128[0] = _mm_unpacklo_epi8(src01, src12);
2709
2.07k
        ss_128[1] = _mm_unpacklo_epi8(src23, src34);
2710
2.07k
        ss_128[2] = _mm_unpacklo_epi8(src45, src56);
2711
2712
8.29k
        do {
2713
8.29k
          const __m128i res = y_convolve_8tap_2x2_ssse3(
2714
8.29k
              src_ptr, src_stride, coeffs_128, s_16, ss_128);
2715
8.29k
          const __m128i r = sr_y_round_sse2(res);
2716
8.29k
          pack_store_2x2_sse2(r, dst, dst_stride);
2717
8.29k
          ss_128[0] = ss_128[1];
2718
8.29k
          ss_128[1] = ss_128[2];
2719
8.29k
          ss_128[2] = ss_128[3];
2720
8.29k
          src_ptr += 2 * src_stride;
2721
8.29k
          dst += 2 * dst_stride;
2722
8.29k
          y -= 2;
2723
8.29k
        } while (y);
2724
9.76k
      } else {
2725
9.76k
        __m128i s_32[8], ss_128[4];
2726
2727
9.76k
        assert(w == 4);
2728
2729
0
        s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 0 * src_stride));
2730
9.76k
        s_32[1] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 1 * src_stride));
2731
9.76k
        s_32[2] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 2 * src_stride));
2732
9.76k
        s_32[3] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 3 * src_stride));
2733
9.76k
        s_32[4] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 4 * src_stride));
2734
9.76k
        s_32[5] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 5 * src_stride));
2735
9.76k
        s_32[6] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 6 * src_stride));
2736
2737
9.76k
        const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
2738
9.76k
        const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
2739
9.76k
        const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
2740
9.76k
        const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[4]);
2741
9.76k
        const __m128i src45 = _mm_unpacklo_epi32(s_32[4], s_32[5]);
2742
9.76k
        const __m128i src56 = _mm_unpacklo_epi32(s_32[5], s_32[6]);
2743
2744
9.76k
        ss_128[0] = _mm_unpacklo_epi8(src01, src12);
2745
9.76k
        ss_128[1] = _mm_unpacklo_epi8(src23, src34);
2746
9.76k
        ss_128[2] = _mm_unpacklo_epi8(src45, src56);
2747
2748
52.0k
        do {
2749
52.0k
          const __m128i res = y_convolve_8tap_4x2_ssse3(
2750
52.0k
              src_ptr, src_stride, coeffs_128, s_32, ss_128);
2751
52.0k
          const __m128i r = sr_y_round_sse2(res);
2752
52.0k
          pack_store_4x2_sse2(r, dst, dst_stride);
2753
52.0k
          ss_128[0] = ss_128[1];
2754
52.0k
          ss_128[1] = ss_128[2];
2755
52.0k
          ss_128[2] = ss_128[3];
2756
52.0k
          src_ptr += 2 * src_stride;
2757
52.0k
          dst += 2 * dst_stride;
2758
52.0k
          y -= 2;
2759
52.0k
        } while (y);
2760
9.76k
      }
2761
19.2k
    } else {
2762
19.2k
      prepare_half_coeffs_8tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
2763
2764
19.2k
      if (w == 8) {
2765
10.1k
        __m128i s_64[8];
2766
10.1k
        __m256i ss_256[4];
2767
2768
10.1k
        s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 0 * src_stride));
2769
10.1k
        s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + 1 * src_stride));
2770
10.1k
        s_64[2] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
2771
10.1k
        s_64[3] = _mm_loadl_epi64((__m128i *)(src_ptr + 3 * src_stride));
2772
10.1k
        s_64[4] = _mm_loadl_epi64((__m128i *)(src_ptr + 4 * src_stride));
2773
10.1k
        s_64[5] = _mm_loadl_epi64((__m128i *)(src_ptr + 5 * src_stride));
2774
10.1k
        s_64[6] = _mm_loadl_epi64((__m128i *)(src_ptr + 6 * src_stride));
2775
2776
        // Load lines a and b. Line a to lower 128, line b to upper 128
2777
10.1k
        const __m256i src01 = _mm256_setr_m128i(s_64[0], s_64[1]);
2778
10.1k
        const __m256i src12 = _mm256_setr_m128i(s_64[1], s_64[2]);
2779
10.1k
        const __m256i src23 = _mm256_setr_m128i(s_64[2], s_64[3]);
2780
10.1k
        const __m256i src34 = _mm256_setr_m128i(s_64[3], s_64[4]);
2781
10.1k
        const __m256i src45 = _mm256_setr_m128i(s_64[4], s_64[5]);
2782
10.1k
        const __m256i src56 = _mm256_setr_m128i(s_64[5], s_64[6]);
2783
2784
10.1k
        ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
2785
10.1k
        ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
2786
10.1k
        ss_256[2] = _mm256_unpacklo_epi8(src45, src56);
2787
2788
10.1k
        y = h;
2789
61.3k
        do {
2790
61.3k
          const __m256i res = y_convolve_8tap_8x2_avx2(
2791
61.3k
              src_ptr, src_stride, coeffs_256, s_64, ss_256);
2792
61.3k
          sr_y_round_store_8x2_avx2(res, dst, dst_stride);
2793
61.3k
          ss_256[0] = ss_256[1];
2794
61.3k
          ss_256[1] = ss_256[2];
2795
61.3k
          ss_256[2] = ss_256[3];
2796
61.3k
          src_ptr += 2 * src_stride;
2797
61.3k
          dst += 2 * dst_stride;
2798
61.3k
          y -= 2;
2799
61.3k
        } while (y);
2800
10.1k
      } else if (w == 16) {
2801
6.01k
        __m128i s_128[8];
2802
6.01k
        __m256i ss_256[8], r[2];
2803
2804
6.01k
        s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 0 * src_stride));
2805
6.01k
        s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + 1 * src_stride));
2806
6.01k
        s_128[2] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
2807
6.01k
        s_128[3] = _mm_loadu_si128((__m128i *)(src_ptr + 3 * src_stride));
2808
6.01k
        s_128[4] = _mm_loadu_si128((__m128i *)(src_ptr + 4 * src_stride));
2809
6.01k
        s_128[5] = _mm_loadu_si128((__m128i *)(src_ptr + 5 * src_stride));
2810
6.01k
        s_128[6] = _mm_loadu_si128((__m128i *)(src_ptr + 6 * src_stride));
2811
2812
        // Load lines a and b. Line a to lower 128, line b to upper 128
2813
6.01k
        const __m256i src01 = _mm256_setr_m128i(s_128[0], s_128[1]);
2814
6.01k
        const __m256i src12 = _mm256_setr_m128i(s_128[1], s_128[2]);
2815
6.01k
        const __m256i src23 = _mm256_setr_m128i(s_128[2], s_128[3]);
2816
6.01k
        const __m256i src34 = _mm256_setr_m128i(s_128[3], s_128[4]);
2817
6.01k
        const __m256i src45 = _mm256_setr_m128i(s_128[4], s_128[5]);
2818
6.01k
        const __m256i src56 = _mm256_setr_m128i(s_128[5], s_128[6]);
2819
2820
6.01k
        ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
2821
6.01k
        ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
2822
6.01k
        ss_256[2] = _mm256_unpacklo_epi8(src45, src56);
2823
2824
6.01k
        ss_256[4] = _mm256_unpackhi_epi8(src01, src12);
2825
6.01k
        ss_256[5] = _mm256_unpackhi_epi8(src23, src34);
2826
6.01k
        ss_256[6] = _mm256_unpackhi_epi8(src45, src56);
2827
2828
6.01k
        y = h;
2829
46.9k
        do {
2830
46.9k
          y_convolve_8tap_16x2_avx2(src_ptr, src_stride, coeffs_256, s_128,
2831
46.9k
                                    ss_256, r);
2832
46.9k
          sr_y_round_store_16x2_avx2(r, dst, dst_stride);
2833
2834
46.9k
          ss_256[0] = ss_256[1];
2835
46.9k
          ss_256[1] = ss_256[2];
2836
46.9k
          ss_256[2] = ss_256[3];
2837
2838
46.9k
          ss_256[4] = ss_256[5];
2839
46.9k
          ss_256[5] = ss_256[6];
2840
46.9k
          ss_256[6] = ss_256[7];
2841
46.9k
          src_ptr += 2 * src_stride;
2842
46.9k
          dst += 2 * dst_stride;
2843
46.9k
          y -= 2;
2844
46.9k
        } while (y);
2845
6.01k
      } else {
2846
3.14k
        __m256i s_256[8], ss_256[8], tt_256[8], r[4];
2847
2848
3.14k
        assert(!(w % 32));
2849
2850
0
        x = 0;
2851
4.30k
        do {
2852
4.30k
          const uint8_t *s = src_ptr + x;
2853
4.30k
          uint8_t *d = dst + x;
2854
2855
4.30k
          s_256[0] = _mm256_loadu_si256((__m256i *)(s + 0 * src_stride));
2856
4.30k
          s_256[1] = _mm256_loadu_si256((__m256i *)(s + 1 * src_stride));
2857
4.30k
          s_256[2] = _mm256_loadu_si256((__m256i *)(s + 2 * src_stride));
2858
4.30k
          s_256[3] = _mm256_loadu_si256((__m256i *)(s + 3 * src_stride));
2859
4.30k
          s_256[4] = _mm256_loadu_si256((__m256i *)(s + 4 * src_stride));
2860
4.30k
          s_256[5] = _mm256_loadu_si256((__m256i *)(s + 5 * src_stride));
2861
4.30k
          s_256[6] = _mm256_loadu_si256((__m256i *)(s + 6 * src_stride));
2862
2863
4.30k
          ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
2864
4.30k
          ss_256[1] = _mm256_unpacklo_epi8(s_256[2], s_256[3]);
2865
4.30k
          ss_256[2] = _mm256_unpacklo_epi8(s_256[4], s_256[5]);
2866
4.30k
          ss_256[4] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
2867
4.30k
          ss_256[5] = _mm256_unpackhi_epi8(s_256[2], s_256[3]);
2868
4.30k
          ss_256[6] = _mm256_unpackhi_epi8(s_256[4], s_256[5]);
2869
2870
4.30k
          tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
2871
4.30k
          tt_256[1] = _mm256_unpacklo_epi8(s_256[3], s_256[4]);
2872
4.30k
          tt_256[2] = _mm256_unpacklo_epi8(s_256[5], s_256[6]);
2873
4.30k
          tt_256[4] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
2874
4.30k
          tt_256[5] = _mm256_unpackhi_epi8(s_256[3], s_256[4]);
2875
4.30k
          tt_256[6] = _mm256_unpackhi_epi8(s_256[5], s_256[6]);
2876
2877
4.30k
          y = h;
2878
92.8k
          do {
2879
92.8k
            y_convolve_8tap_32x2_avx2(s, src_stride, coeffs_256, s_256, ss_256,
2880
92.8k
                                      tt_256, r);
2881
92.8k
            sr_y_round_store_32x2_avx2(r, d, dst_stride);
2882
2883
92.8k
            ss_256[0] = ss_256[1];
2884
92.8k
            ss_256[1] = ss_256[2];
2885
92.8k
            ss_256[2] = ss_256[3];
2886
92.8k
            ss_256[4] = ss_256[5];
2887
92.8k
            ss_256[5] = ss_256[6];
2888
92.8k
            ss_256[6] = ss_256[7];
2889
2890
92.8k
            tt_256[0] = tt_256[1];
2891
92.8k
            tt_256[1] = tt_256[2];
2892
92.8k
            tt_256[2] = tt_256[3];
2893
92.8k
            tt_256[4] = tt_256[5];
2894
92.8k
            tt_256[5] = tt_256[6];
2895
92.8k
            tt_256[6] = tt_256[7];
2896
92.8k
            s += 2 * src_stride;
2897
92.8k
            d += 2 * dst_stride;
2898
92.8k
            y -= 2;
2899
92.8k
          } while (y);
2900
2901
4.30k
          x += 32;
2902
4.30k
        } while (x < w);
2903
3.14k
      }
2904
19.2k
    }
2905
31.1k
  }
2906
1.39M
}
Unexecuted instantiation: convolve_2d_avx2.c:av1_convolve_y_sr_specialized_avx2
convolve_avx2.c:av1_convolve_y_sr_specialized_avx2
Line
Count
Source
2008
1.39M
    const int32_t subpel_y_q4) {
2009
1.39M
  int32_t x, y;
2010
1.39M
  __m128i coeffs_128[4];
2011
1.39M
  __m256i coeffs_256[4];
2012
2013
1.39M
  int vert_tap = get_filter_tap(filter_params_y, subpel_y_q4);
2014
2015
1.39M
  if (vert_tap == 2) {
2016
    // vert_filt as 2 tap
2017
45.6k
    const uint8_t *src_ptr = src;
2018
2019
45.6k
    y = h;
2020
2021
45.6k
    if (subpel_y_q4 != 8) {
2022
22.8k
      if (w <= 8) {
2023
18.1k
        prepare_half_coeffs_2tap_ssse3(filter_params_y, subpel_y_q4,
2024
18.1k
                                       coeffs_128);
2025
2026
18.1k
        if (w == 2) {
2027
3.00k
          __m128i s_16[2];
2028
2029
3.00k
          s_16[0] = _mm_cvtsi32_si128(*(int16_t *)src_ptr);
2030
2031
6.89k
          do {
2032
6.89k
            const __m128i res = y_convolve_2tap_2x2_ssse3(src_ptr, src_stride,
2033
6.89k
                                                          coeffs_128, s_16);
2034
6.89k
            const __m128i r = sr_y_round_sse2(res);
2035
6.89k
            pack_store_2x2_sse2(r, dst, dst_stride);
2036
6.89k
            src_ptr += 2 * src_stride;
2037
6.89k
            dst += 2 * dst_stride;
2038
6.89k
            y -= 2;
2039
6.89k
          } while (y);
2040
15.0k
        } else if (w == 4) {
2041
8.50k
          __m128i s_32[2];
2042
2043
8.50k
          s_32[0] = _mm_cvtsi32_si128(*(int32_t *)src_ptr);
2044
2045
26.1k
          do {
2046
26.1k
            const __m128i res = y_convolve_2tap_4x2_ssse3(src_ptr, src_stride,
2047
26.1k
                                                          coeffs_128, s_32);
2048
26.1k
            const __m128i r = sr_y_round_sse2(res);
2049
26.1k
            pack_store_4x2_sse2(r, dst, dst_stride);
2050
26.1k
            src_ptr += 2 * src_stride;
2051
26.1k
            dst += 2 * dst_stride;
2052
26.1k
            y -= 2;
2053
26.1k
          } while (y);
2054
8.50k
        } else {
2055
6.58k
          __m128i s_64[2], s_128[2];
2056
2057
6.58k
          assert(w == 8);
2058
2059
0
          s_64[0] = _mm_loadl_epi64((__m128i *)src_ptr);
2060
2061
21.5k
          do {
2062
            // Note: Faster than binding to AVX2 registers.
2063
21.5k
            s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + src_stride));
2064
21.5k
            s_128[0] = _mm_unpacklo_epi64(s_64[0], s_64[1]);
2065
21.5k
            s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
2066
21.5k
            s_128[1] = _mm_unpacklo_epi64(s_64[1], s_64[0]);
2067
21.5k
            const __m128i ss0 = _mm_unpacklo_epi8(s_128[0], s_128[1]);
2068
21.5k
            const __m128i ss1 = _mm_unpackhi_epi8(s_128[0], s_128[1]);
2069
21.5k
            const __m128i res0 = convolve_2tap_ssse3(&ss0, coeffs_128);
2070
21.5k
            const __m128i res1 = convolve_2tap_ssse3(&ss1, coeffs_128);
2071
21.5k
            const __m128i r0 = sr_y_round_sse2(res0);
2072
21.5k
            const __m128i r1 = sr_y_round_sse2(res1);
2073
21.5k
            const __m128i d = _mm_packus_epi16(r0, r1);
2074
21.5k
            _mm_storel_epi64((__m128i *)dst, d);
2075
21.5k
            _mm_storeh_epi64((__m128i *)(dst + dst_stride), d);
2076
21.5k
            src_ptr += 2 * src_stride;
2077
21.5k
            dst += 2 * dst_stride;
2078
21.5k
            y -= 2;
2079
21.5k
          } while (y);
2080
6.58k
        }
2081
18.1k
      } else {
2082
4.77k
        prepare_half_coeffs_2tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
2083
2084
4.77k
        if (w == 16) {
2085
3.03k
          __m128i s_128[2];
2086
2087
3.03k
          s_128[0] = _mm_loadu_si128((__m128i *)src_ptr);
2088
2089
15.6k
          do {
2090
15.6k
            __m256i r[2];
2091
2092
15.6k
            y_convolve_2tap_16x2_avx2(src_ptr, src_stride, coeffs_256, s_128,
2093
15.6k
                                      r);
2094
15.6k
            sr_y_round_store_16x2_avx2(r, dst, dst_stride);
2095
15.6k
            src_ptr += 2 * src_stride;
2096
15.6k
            dst += 2 * dst_stride;
2097
15.6k
            y -= 2;
2098
15.6k
          } while (y);
2099
3.03k
        } else if (w == 32) {
2100
954
          __m256i s_256[2];
2101
2102
954
          s_256[0] = _mm256_loadu_si256((__m256i *)src_ptr);
2103
2104
9.45k
          do {
2105
9.45k
            sr_y_2tap_32_avx2(src_ptr + src_stride, coeffs_256, s_256[0],
2106
9.45k
                              &s_256[1], dst);
2107
9.45k
            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride, coeffs_256, s_256[1],
2108
9.45k
                              &s_256[0], dst + dst_stride);
2109
9.45k
            src_ptr += 2 * src_stride;
2110
9.45k
            dst += 2 * dst_stride;
2111
9.45k
            y -= 2;
2112
9.45k
          } while (y);
2113
954
        } else if (w == 64) {
2114
645
          __m256i s_256[2][2];
2115
2116
645
          s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
2117
645
          s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));
2118
2119
15.0k
          do {
2120
15.0k
            sr_y_2tap_32_avx2(src_ptr + src_stride, coeffs_256, s_256[0][0],
2121
15.0k
                              &s_256[1][0], dst);
2122
15.0k
            sr_y_2tap_32_avx2(src_ptr + src_stride + 32, coeffs_256,
2123
15.0k
                              s_256[0][1], &s_256[1][1], dst + 32);
2124
15.0k
            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride, coeffs_256, s_256[1][0],
2125
15.0k
                              &s_256[0][0], dst + dst_stride);
2126
15.0k
            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride + 32, coeffs_256,
2127
15.0k
                              s_256[1][1], &s_256[0][1], dst + dst_stride + 32);
2128
2129
15.0k
            src_ptr += 2 * src_stride;
2130
15.0k
            dst += 2 * dst_stride;
2131
15.0k
            y -= 2;
2132
15.0k
          } while (y);
2133
645
        } else {
2134
138
          __m256i s_256[2][4];
2135
2136
138
          assert(w == 128);
2137
2138
0
          s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
2139
138
          s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));
2140
138
          s_256[0][2] = _mm256_loadu_si256((__m256i *)(src_ptr + 2 * 32));
2141
138
          s_256[0][3] = _mm256_loadu_si256((__m256i *)(src_ptr + 3 * 32));
2142
2143
6.27k
          do {
2144
6.27k
            sr_y_2tap_32_avx2(src_ptr + src_stride, coeffs_256, s_256[0][0],
2145
6.27k
                              &s_256[1][0], dst);
2146
6.27k
            sr_y_2tap_32_avx2(src_ptr + src_stride + 1 * 32, coeffs_256,
2147
6.27k
                              s_256[0][1], &s_256[1][1], dst + 1 * 32);
2148
6.27k
            sr_y_2tap_32_avx2(src_ptr + src_stride + 2 * 32, coeffs_256,
2149
6.27k
                              s_256[0][2], &s_256[1][2], dst + 2 * 32);
2150
6.27k
            sr_y_2tap_32_avx2(src_ptr + src_stride + 3 * 32, coeffs_256,
2151
6.27k
                              s_256[0][3], &s_256[1][3], dst + 3 * 32);
2152
2153
6.27k
            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride, coeffs_256, s_256[1][0],
2154
6.27k
                              &s_256[0][0], dst + dst_stride);
2155
6.27k
            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride + 1 * 32, coeffs_256,
2156
6.27k
                              s_256[1][1], &s_256[0][1],
2157
6.27k
                              dst + dst_stride + 1 * 32);
2158
6.27k
            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride + 2 * 32, coeffs_256,
2159
6.27k
                              s_256[1][2], &s_256[0][2],
2160
6.27k
                              dst + dst_stride + 2 * 32);
2161
6.27k
            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride + 3 * 32, coeffs_256,
2162
6.27k
                              s_256[1][3], &s_256[0][3],
2163
6.27k
                              dst + dst_stride + 3 * 32);
2164
2165
6.27k
            src_ptr += 2 * src_stride;
2166
6.27k
            dst += 2 * dst_stride;
2167
6.27k
            y -= 2;
2168
6.27k
          } while (y);
2169
138
        }
2170
4.77k
      }
2171
22.8k
    } else {
2172
      // average to get half pel
2173
22.7k
      if (w <= 8) {
2174
19.3k
        if (w == 2) {
2175
3.94k
          __m128i s_16[2];
2176
2177
3.94k
          s_16[0] = _mm_cvtsi32_si128(*(int16_t *)src_ptr);
2178
2179
10.4k
          do {
2180
10.4k
            s_16[1] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + src_stride));
2181
10.4k
            const __m128i d0 = _mm_avg_epu8(s_16[0], s_16[1]);
2182
10.4k
            *(int16_t *)dst = (int16_t)_mm_cvtsi128_si32(d0);
2183
10.4k
            s_16[0] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 2 * src_stride));
2184
10.4k
            const __m128i d1 = _mm_avg_epu8(s_16[1], s_16[0]);
2185
10.4k
            *(int16_t *)(dst + dst_stride) = (int16_t)_mm_cvtsi128_si32(d1);
2186
10.4k
            src_ptr += 2 * src_stride;
2187
10.4k
            dst += 2 * dst_stride;
2188
10.4k
            y -= 2;
2189
10.4k
          } while (y);
2190
15.3k
        } else if (w == 4) {
2191
9.12k
          __m128i s_32[2];
2192
2193
9.12k
          s_32[0] = _mm_cvtsi32_si128(*(int32_t *)src_ptr);
2194
2195
25.8k
          do {
2196
25.8k
            s_32[1] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + src_stride));
2197
25.8k
            const __m128i d0 = _mm_avg_epu8(s_32[0], s_32[1]);
2198
25.8k
            xx_storel_32(dst, d0);
2199
25.8k
            s_32[0] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 2 * src_stride));
2200
25.8k
            const __m128i d1 = _mm_avg_epu8(s_32[1], s_32[0]);
2201
25.8k
            xx_storel_32(dst + dst_stride, d1);
2202
25.8k
            src_ptr += 2 * src_stride;
2203
25.8k
            dst += 2 * dst_stride;
2204
25.8k
            y -= 2;
2205
25.8k
          } while (y);
2206
9.12k
        } else {
2207
6.23k
          __m128i s_64[2];
2208
2209
6.23k
          assert(w == 8);
2210
2211
0
          s_64[0] = _mm_loadl_epi64((__m128i *)src_ptr);
2212
2213
21.5k
          do {
2214
            // Note: Faster than binding to AVX2 registers.
2215
21.5k
            s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + src_stride));
2216
21.5k
            const __m128i d0 = _mm_avg_epu8(s_64[0], s_64[1]);
2217
21.5k
            _mm_storel_epi64((__m128i *)dst, d0);
2218
21.5k
            s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
2219
21.5k
            const __m128i d1 = _mm_avg_epu8(s_64[1], s_64[0]);
2220
21.5k
            _mm_storel_epi64((__m128i *)(dst + dst_stride), d1);
2221
21.5k
            src_ptr += 2 * src_stride;
2222
21.5k
            dst += 2 * dst_stride;
2223
21.5k
            y -= 2;
2224
21.5k
          } while (y);
2225
6.23k
        }
2226
19.3k
      } else if (w == 16) {
2227
2.42k
        __m128i s_128[2];
2228
2229
2.42k
        s_128[0] = _mm_loadu_si128((__m128i *)src_ptr);
2230
2231
12.0k
        do {
2232
12.0k
          s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + src_stride));
2233
12.0k
          const __m128i d0 = _mm_avg_epu8(s_128[0], s_128[1]);
2234
12.0k
          _mm_storeu_si128((__m128i *)dst, d0);
2235
12.0k
          s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
2236
12.0k
          const __m128i d1 = _mm_avg_epu8(s_128[1], s_128[0]);
2237
12.0k
          _mm_storeu_si128((__m128i *)(dst + dst_stride), d1);
2238
12.0k
          src_ptr += 2 * src_stride;
2239
12.0k
          dst += 2 * dst_stride;
2240
12.0k
          y -= 2;
2241
12.0k
        } while (y);
2242
2.42k
      } else if (w == 32) {
2243
623
        __m256i s_256[2];
2244
2245
623
        s_256[0] = _mm256_loadu_si256((__m256i *)src_ptr);
2246
2247
6.20k
        do {
2248
6.20k
          sr_y_2tap_32_avg_avx2(src_ptr + src_stride, s_256[0], &s_256[1], dst);
2249
6.20k
          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride, s_256[1], &s_256[0],
2250
6.20k
                                dst + dst_stride);
2251
6.20k
          src_ptr += 2 * src_stride;
2252
6.20k
          dst += 2 * dst_stride;
2253
6.20k
          y -= 2;
2254
6.20k
        } while (y);
2255
623
      } else if (w == 64) {
2256
295
        __m256i s_256[2][2];
2257
2258
295
        s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
2259
295
        s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));
2260
2261
6.85k
        do {
2262
6.85k
          sr_y_2tap_32_avg_avx2(src_ptr + src_stride, s_256[0][0], &s_256[1][0],
2263
6.85k
                                dst);
2264
6.85k
          sr_y_2tap_32_avg_avx2(src_ptr + src_stride + 32, s_256[0][1],
2265
6.85k
                                &s_256[1][1], dst + 32);
2266
2267
6.85k
          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride, s_256[1][0],
2268
6.85k
                                &s_256[0][0], dst + dst_stride);
2269
6.85k
          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride + 32, s_256[1][1],
2270
6.85k
                                &s_256[0][1], dst + dst_stride + 32);
2271
2272
6.85k
          src_ptr += 2 * src_stride;
2273
6.85k
          dst += 2 * dst_stride;
2274
6.85k
          y -= 2;
2275
6.85k
        } while (y);
2276
295
      } else {
2277
136
        __m256i s_256[2][4];
2278
2279
136
        assert(w == 128);
2280
2281
0
        s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
2282
136
        s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));
2283
136
        s_256[0][2] = _mm256_loadu_si256((__m256i *)(src_ptr + 2 * 32));
2284
136
        s_256[0][3] = _mm256_loadu_si256((__m256i *)(src_ptr + 3 * 32));
2285
2286
7.36k
        do {
2287
7.36k
          sr_y_2tap_32_avg_avx2(src_ptr + src_stride, s_256[0][0], &s_256[1][0],
2288
7.36k
                                dst);
2289
7.36k
          sr_y_2tap_32_avg_avx2(src_ptr + src_stride + 1 * 32, s_256[0][1],
2290
7.36k
                                &s_256[1][1], dst + 1 * 32);
2291
7.36k
          sr_y_2tap_32_avg_avx2(src_ptr + src_stride + 2 * 32, s_256[0][2],
2292
7.36k
                                &s_256[1][2], dst + 2 * 32);
2293
7.36k
          sr_y_2tap_32_avg_avx2(src_ptr + src_stride + 3 * 32, s_256[0][3],
2294
7.36k
                                &s_256[1][3], dst + 3 * 32);
2295
2296
7.36k
          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride, s_256[1][0],
2297
7.36k
                                &s_256[0][0], dst + dst_stride);
2298
7.36k
          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride + 1 * 32, s_256[1][1],
2299
7.36k
                                &s_256[0][1], dst + dst_stride + 1 * 32);
2300
7.36k
          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride + 2 * 32, s_256[1][2],
2301
7.36k
                                &s_256[0][2], dst + dst_stride + 2 * 32);
2302
7.36k
          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride + 3 * 32, s_256[1][3],
2303
7.36k
                                &s_256[0][3], dst + dst_stride + 3 * 32);
2304
2305
7.36k
          src_ptr += 2 * src_stride;
2306
7.36k
          dst += 2 * dst_stride;
2307
7.36k
          y -= 2;
2308
7.36k
        } while (y);
2309
136
      }
2310
22.7k
    }
2311
1.35M
  } else if (vert_tap == 4) {
2312
    // vert_filt as 4 tap
2313
654k
    const uint8_t *src_ptr = src - src_stride;
2314
2315
654k
    y = h;
2316
2317
654k
    if (w <= 4) {
2318
344k
      prepare_half_coeffs_4tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);
2319
2320
344k
      if (w == 2) {
2321
67.6k
        __m128i s_16[4], ss_128[2];
2322
2323
67.6k
        s_16[0] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 0 * src_stride));
2324
67.6k
        s_16[1] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 1 * src_stride));
2325
67.6k
        s_16[2] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 2 * src_stride));
2326
2327
67.6k
        const __m128i src01 = _mm_unpacklo_epi16(s_16[0], s_16[1]);
2328
67.6k
        const __m128i src12 = _mm_unpacklo_epi16(s_16[1], s_16[2]);
2329
2330
67.6k
        ss_128[0] = _mm_unpacklo_epi8(src01, src12);
2331
2332
112k
        do {
2333
112k
          src_ptr += 2 * src_stride;
2334
112k
          const __m128i res = y_convolve_4tap_2x2_ssse3(
2335
112k
              src_ptr, src_stride, coeffs_128, s_16, ss_128);
2336
112k
          const __m128i r = sr_y_round_sse2(res);
2337
112k
          pack_store_2x2_sse2(r, dst, dst_stride);
2338
2339
112k
          ss_128[0] = ss_128[1];
2340
112k
          dst += 2 * dst_stride;
2341
112k
          y -= 2;
2342
112k
        } while (y);
2343
276k
      } else {
2344
276k
        __m128i s_32[4], ss_128[2];
2345
2346
276k
        assert(w == 4);
2347
2348
0
        s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 0 * src_stride));
2349
276k
        s_32[1] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 1 * src_stride));
2350
276k
        s_32[2] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 2 * src_stride));
2351
2352
276k
        const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
2353
276k
        const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
2354
2355
276k
        ss_128[0] = _mm_unpacklo_epi8(src01, src12);
2356
2357
550k
        do {
2358
550k
          src_ptr += 2 * src_stride;
2359
550k
          const __m128i res = y_convolve_4tap_4x2_ssse3(
2360
550k
              src_ptr, src_stride, coeffs_128, s_32, ss_128);
2361
550k
          const __m128i r = sr_y_round_sse2(res);
2362
550k
          pack_store_4x2_sse2(r, dst, dst_stride);
2363
2364
550k
          ss_128[0] = ss_128[1];
2365
550k
          dst += 2 * dst_stride;
2366
550k
          y -= 2;
2367
550k
        } while (y);
2368
276k
      }
2369
344k
    } else {
2370
309k
      prepare_half_coeffs_4tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
2371
2372
309k
      if (w == 8) {
2373
218k
        __m128i s_64[4];
2374
218k
        __m256i ss_256[2];
2375
2376
218k
        s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 0 * src_stride));
2377
218k
        s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + 1 * src_stride));
2378
218k
        s_64[2] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
2379
2380
        // Load lines a and b. Line a to lower 128, line b to upper 128
2381
218k
        const __m256i src01 = _mm256_setr_m128i(s_64[0], s_64[1]);
2382
218k
        const __m256i src12 = _mm256_setr_m128i(s_64[1], s_64[2]);
2383
2384
218k
        ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
2385
2386
438k
        do {
2387
438k
          src_ptr += 2 * src_stride;
2388
438k
          const __m256i res = y_convolve_4tap_8x2_avx2(
2389
438k
              src_ptr, src_stride, coeffs_256, s_64, ss_256);
2390
438k
          sr_y_round_store_8x2_avx2(res, dst, dst_stride);
2391
2392
438k
          ss_256[0] = ss_256[1];
2393
438k
          dst += 2 * dst_stride;
2394
438k
          y -= 2;
2395
438k
        } while (y);
2396
218k
      } else if (w == 16) {
2397
84.3k
        __m128i s_128[4];
2398
84.3k
        __m256i ss_256[4], r[2];
2399
2400
84.3k
        s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 0 * src_stride));
2401
84.3k
        s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + 1 * src_stride));
2402
84.3k
        s_128[2] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
2403
2404
        // Load lines a and b. Line a to lower 128, line b to upper 128
2405
84.3k
        const __m256i src01 = _mm256_setr_m128i(s_128[0], s_128[1]);
2406
84.3k
        const __m256i src12 = _mm256_setr_m128i(s_128[1], s_128[2]);
2407
2408
84.3k
        ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
2409
84.3k
        ss_256[2] = _mm256_unpackhi_epi8(src01, src12);
2410
2411
212k
        do {
2412
212k
          src_ptr += 2 * src_stride;
2413
212k
          y_convolve_4tap_16x2_avx2(src_ptr, src_stride, coeffs_256, s_128,
2414
212k
                                    ss_256, r);
2415
212k
          sr_y_round_store_16x2_avx2(r, dst, dst_stride);
2416
2417
212k
          ss_256[0] = ss_256[1];
2418
212k
          ss_256[2] = ss_256[3];
2419
212k
          dst += 2 * dst_stride;
2420
212k
          y -= 2;
2421
212k
        } while (y);
2422
84.3k
      } else if (w == 32) {
2423
        // The AV1 standard has no 32x4 case.
2424
        // This branch exists only for an optimization feature that
2425
        // subsamples 32x8 to 32x4 and thereby triggers the 4-tap filter.
2426
2427
5.13k
        __m256i s_256[4], ss_256[4], tt_256[4], r[4];
2428
2429
5.13k
        s_256[0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * src_stride));
2430
5.13k
        s_256[1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * src_stride));
2431
5.13k
        s_256[2] = _mm256_loadu_si256((__m256i *)(src_ptr + 2 * src_stride));
2432
2433
5.13k
        ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
2434
5.13k
        ss_256[2] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
2435
2436
5.13k
        tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
2437
5.13k
        tt_256[2] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
2438
2439
47.2k
        do {
2440
47.2k
          src_ptr += 2 * src_stride;
2441
47.2k
          y_convolve_4tap_32x2_avx2(src_ptr, src_stride, coeffs_256, s_256,
2442
47.2k
                                    ss_256, tt_256, r);
2443
47.2k
          sr_y_round_store_32x2_avx2(r, dst, dst_stride);
2444
2445
47.2k
          ss_256[0] = ss_256[1];
2446
47.2k
          ss_256[2] = ss_256[3];
2447
2448
47.2k
          tt_256[0] = tt_256[1];
2449
47.2k
          tt_256[2] = tt_256[3];
2450
47.2k
          dst += 2 * dst_stride;
2451
47.2k
          y -= 2;
2452
47.2k
        } while (y);
2453
5.13k
      } else {
2454
1.32k
        assert(!(w % 32));
2455
2456
0
        __m256i s_256[4], ss_256[4], tt_256[4], r[4];
2457
1.32k
        x = 0;
2458
2.96k
        do {
2459
2.96k
          const uint8_t *s = src_ptr + x;
2460
2.96k
          uint8_t *d = dst + x;
2461
2.96k
          s_256[0] = _mm256_loadu_si256((__m256i *)(s + 0 * src_stride));
2462
2.96k
          s_256[1] = _mm256_loadu_si256((__m256i *)(s + 1 * src_stride));
2463
2.96k
          s_256[2] = _mm256_loadu_si256((__m256i *)(s + 2 * src_stride));
2464
2465
2.96k
          ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
2466
2.96k
          ss_256[2] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
2467
2468
2.96k
          tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
2469
2.96k
          tt_256[2] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
2470
2471
2.96k
          y = h;
2472
94.0k
          do {
2473
94.0k
            s += 2 * src_stride;
2474
94.0k
            y_convolve_4tap_32x2_avx2(s, src_stride, coeffs_256, s_256, ss_256,
2475
94.0k
                                      tt_256, r);
2476
94.0k
            sr_y_round_store_32x2_avx2(r, d, dst_stride);
2477
2478
94.0k
            ss_256[0] = ss_256[1];
2479
94.0k
            ss_256[2] = ss_256[3];
2480
2481
94.0k
            tt_256[0] = tt_256[1];
2482
94.0k
            tt_256[2] = tt_256[3];
2483
94.0k
            d += 2 * dst_stride;
2484
94.0k
            y -= 2;
2485
94.0k
          } while (y);
2486
2.96k
          x += 32;
2487
2.96k
        } while (x < w);
2488
1.32k
      }
2489
309k
    }
2490
698k
  } else if (vert_tap == 6) {
2491
    // vert_filt as 6 tap
2492
667k
    const uint8_t *src_ptr = src - 2 * src_stride;
2493
2494
667k
    if (w <= 4) {
2495
227k
      prepare_half_coeffs_6tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);
2496
2497
227k
      y = h;
2498
2499
227k
      if (w == 2) {
2500
34.0k
        __m128i s_16[6], ss_128[3];
2501
2502
34.0k
        s_16[0] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 0 * src_stride));
2503
34.0k
        s_16[1] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 1 * src_stride));
2504
34.0k
        s_16[2] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 2 * src_stride));
2505
34.0k
        s_16[3] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 3 * src_stride));
2506
34.0k
        s_16[4] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 4 * src_stride));
2507
2508
34.0k
        const __m128i src01 = _mm_unpacklo_epi16(s_16[0], s_16[1]);
2509
34.0k
        const __m128i src12 = _mm_unpacklo_epi16(s_16[1], s_16[2]);
2510
34.0k
        const __m128i src23 = _mm_unpacklo_epi16(s_16[2], s_16[3]);
2511
34.0k
        const __m128i src34 = _mm_unpacklo_epi16(s_16[3], s_16[4]);
2512
2513
34.0k
        ss_128[0] = _mm_unpacklo_epi8(src01, src12);
2514
34.0k
        ss_128[1] = _mm_unpacklo_epi8(src23, src34);
2515
2516
136k
        do {
2517
136k
          src_ptr += 2 * src_stride;
2518
136k
          const __m128i res = y_convolve_6tap_2x2_ssse3(
2519
136k
              src_ptr, src_stride, coeffs_128, s_16, ss_128);
2520
136k
          const __m128i r = sr_y_round_sse2(res);
2521
136k
          pack_store_2x2_sse2(r, dst, dst_stride);
2522
2523
136k
          ss_128[0] = ss_128[1];
2524
136k
          ss_128[1] = ss_128[2];
2525
136k
          dst += 2 * dst_stride;
2526
136k
          y -= 2;
2527
136k
        } while (y);
2528
193k
      } else {
2529
193k
        __m128i s_32[6], ss_128[3];
2530
2531
193k
        assert(w == 4);
2532
2533
0
        s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 0 * src_stride));
2534
193k
        s_32[1] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 1 * src_stride));
2535
193k
        s_32[2] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 2 * src_stride));
2536
193k
        s_32[3] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 3 * src_stride));
2537
193k
        s_32[4] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 4 * src_stride));
2538
2539
193k
        const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
2540
193k
        const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
2541
193k
        const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
2542
193k
        const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[4]);
2543
2544
193k
        ss_128[0] = _mm_unpacklo_epi8(src01, src12);
2545
193k
        ss_128[1] = _mm_unpacklo_epi8(src23, src34);
2546
2547
1.03M
        do {
2548
1.03M
          src_ptr += 2 * src_stride;
2549
1.03M
          const __m128i res = y_convolve_6tap_4x2_ssse3(
2550
1.03M
              src_ptr, src_stride, coeffs_128, s_32, ss_128);
2551
1.03M
          const __m128i r = sr_y_round_sse2(res);
2552
1.03M
          pack_store_4x2_sse2(r, dst, dst_stride);
2553
2554
1.03M
          ss_128[0] = ss_128[1];
2555
1.03M
          ss_128[1] = ss_128[2];
2556
1.03M
          dst += 2 * dst_stride;
2557
1.03M
          y -= 2;
2558
1.03M
        } while (y);
2559
193k
      }
2560
440k
    } else {
2561
440k
      prepare_half_coeffs_6tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
2562
2563
440k
      if (w == 8) {
2564
231k
        __m128i s_64[6];
2565
231k
        __m256i ss_256[3];
2566
2567
231k
        s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 0 * src_stride));
2568
231k
        s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + 1 * src_stride));
2569
231k
        s_64[2] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
2570
231k
        s_64[3] = _mm_loadl_epi64((__m128i *)(src_ptr + 3 * src_stride));
2571
231k
        s_64[4] = _mm_loadl_epi64((__m128i *)(src_ptr + 4 * src_stride));
2572
2573
        // Load lines a and b. Line a to lower 128, line b to upper 128
2574
231k
        const __m256i src01 = _mm256_setr_m128i(s_64[0], s_64[1]);
2575
231k
        const __m256i src12 = _mm256_setr_m128i(s_64[1], s_64[2]);
2576
231k
        const __m256i src23 = _mm256_setr_m128i(s_64[2], s_64[3]);
2577
231k
        const __m256i src34 = _mm256_setr_m128i(s_64[3], s_64[4]);
2578
2579
231k
        ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
2580
231k
        ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
2581
2582
231k
        y = h;
2583
1.33M
        do {
2584
1.33M
          src_ptr += 2 * src_stride;
2585
1.33M
          const __m256i res = y_convolve_6tap_8x2_avx2(
2586
1.33M
              src_ptr, src_stride, coeffs_256, s_64, ss_256);
2587
1.33M
          sr_y_round_store_8x2_avx2(res, dst, dst_stride);
2588
2589
1.33M
          ss_256[0] = ss_256[1];
2590
1.33M
          ss_256[1] = ss_256[2];
2591
1.33M
          dst += 2 * dst_stride;
2592
1.33M
          y -= 2;
2593
1.33M
        } while (y);
2594
231k
      } else if (w == 16) {
2595
150k
        __m128i s_128[6];
2596
150k
        __m256i ss_256[6], r[2];
2597
2598
150k
        s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 0 * src_stride));
2599
150k
        s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + 1 * src_stride));
2600
150k
        s_128[2] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
2601
150k
        s_128[3] = _mm_loadu_si128((__m128i *)(src_ptr + 3 * src_stride));
2602
150k
        s_128[4] = _mm_loadu_si128((__m128i *)(src_ptr + 4 * src_stride));
2603
2604
        // Load lines a and b. Line a to lower 128, line b to upper 128
2605
150k
        const __m256i src01 = _mm256_setr_m128i(s_128[0], s_128[1]);
2606
150k
        const __m256i src12 = _mm256_setr_m128i(s_128[1], s_128[2]);
2607
150k
        const __m256i src23 = _mm256_setr_m128i(s_128[2], s_128[3]);
2608
150k
        const __m256i src34 = _mm256_setr_m128i(s_128[3], s_128[4]);
2609
2610
150k
        ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
2611
150k
        ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
2612
2613
150k
        ss_256[3] = _mm256_unpackhi_epi8(src01, src12);
2614
150k
        ss_256[4] = _mm256_unpackhi_epi8(src23, src34);
2615
2616
150k
        y = h;
2617
1.08M
        do {
2618
1.08M
          src_ptr += 2 * src_stride;
2619
1.08M
          y_convolve_6tap_16x2_avx2(src_ptr, src_stride, coeffs_256, s_128,
2620
1.08M
                                    ss_256, r);
2621
1.08M
          sr_y_round_store_16x2_avx2(r, dst, dst_stride);
2622
2623
1.08M
          ss_256[0] = ss_256[1];
2624
1.08M
          ss_256[1] = ss_256[2];
2625
2626
1.08M
          ss_256[3] = ss_256[4];
2627
1.08M
          ss_256[4] = ss_256[5];
2628
1.08M
          dst += 2 * dst_stride;
2629
1.08M
          y -= 2;
2630
1.08M
        } while (y);
2631
150k
      } else {
2632
59.5k
        __m256i s_256[6], ss_256[6], tt_256[6], r[4];
2633
2634
59.5k
        assert(!(w % 32));
2635
2636
0
        x = 0;
2637
71.6k
        do {
2638
71.6k
          const uint8_t *s = src_ptr + x;
2639
71.6k
          uint8_t *d = dst + x;
2640
2641
71.6k
          s_256[0] = _mm256_loadu_si256((__m256i *)(s + 0 * src_stride));
2642
71.6k
          s_256[1] = _mm256_loadu_si256((__m256i *)(s + 1 * src_stride));
2643
71.6k
          s_256[2] = _mm256_loadu_si256((__m256i *)(s + 2 * src_stride));
2644
71.6k
          s_256[3] = _mm256_loadu_si256((__m256i *)(s + 3 * src_stride));
2645
71.6k
          s_256[4] = _mm256_loadu_si256((__m256i *)(s + 4 * src_stride));
2646
2647
71.6k
          ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
2648
71.6k
          ss_256[1] = _mm256_unpacklo_epi8(s_256[2], s_256[3]);
2649
71.6k
          ss_256[3] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
2650
71.6k
          ss_256[4] = _mm256_unpackhi_epi8(s_256[2], s_256[3]);
2651
2652
71.6k
          tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
2653
71.6k
          tt_256[1] = _mm256_unpacklo_epi8(s_256[3], s_256[4]);
2654
71.6k
          tt_256[3] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
2655
71.6k
          tt_256[4] = _mm256_unpackhi_epi8(s_256[3], s_256[4]);
2656
2657
71.6k
          y = h;
2658
1.20M
          do {
2659
1.20M
            s += 2 * src_stride;
2660
1.20M
            y_convolve_6tap_32x2_avx2(s, src_stride, coeffs_256, s_256, ss_256,
2661
1.20M
                                      tt_256, r);
2662
1.20M
            sr_y_round_store_32x2_avx2(r, d, dst_stride);
2663
2664
1.20M
            ss_256[0] = ss_256[1];
2665
1.20M
            ss_256[1] = ss_256[2];
2666
1.20M
            ss_256[3] = ss_256[4];
2667
1.20M
            ss_256[4] = ss_256[5];
2668
2669
1.20M
            tt_256[0] = tt_256[1];
2670
1.20M
            tt_256[1] = tt_256[2];
2671
1.20M
            tt_256[3] = tt_256[4];
2672
1.20M
            tt_256[4] = tt_256[5];
2673
1.20M
            d += 2 * dst_stride;
2674
1.20M
            y -= 2;
2675
1.20M
          } while (y);
2676
2677
71.6k
          x += 32;
2678
71.6k
        } while (x < w);
2679
59.5k
      }
2680
440k
    }
2681
667k
  } else if (vert_tap == 8) {
2682
    // vert_filt as 8 tap
2683
31.1k
    const uint8_t *src_ptr = src - 3 * src_stride;
2684
2685
31.1k
    if (w <= 4) {
2686
11.8k
      prepare_half_coeffs_8tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);
2687
2688
11.8k
      y = h;
2689
2690
11.8k
      if (w == 2) {
2691
2.07k
        __m128i s_16[8], ss_128[4];
2692
2693
2.07k
        s_16[0] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 0 * src_stride));
2694
2.07k
        s_16[1] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 1 * src_stride));
2695
2.07k
        s_16[2] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 2 * src_stride));
2696
2.07k
        s_16[3] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 3 * src_stride));
2697
2.07k
        s_16[4] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 4 * src_stride));
2698
2.07k
        s_16[5] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 5 * src_stride));
2699
2.07k
        s_16[6] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 6 * src_stride));
2700
2701
2.07k
        const __m128i src01 = _mm_unpacklo_epi16(s_16[0], s_16[1]);
2702
2.07k
        const __m128i src12 = _mm_unpacklo_epi16(s_16[1], s_16[2]);
2703
2.07k
        const __m128i src23 = _mm_unpacklo_epi16(s_16[2], s_16[3]);
2704
2.07k
        const __m128i src34 = _mm_unpacklo_epi16(s_16[3], s_16[4]);
2705
2.07k
        const __m128i src45 = _mm_unpacklo_epi16(s_16[4], s_16[5]);
2706
2.07k
        const __m128i src56 = _mm_unpacklo_epi16(s_16[5], s_16[6]);
2707
2708
2.07k
        ss_128[0] = _mm_unpacklo_epi8(src01, src12);
2709
2.07k
        ss_128[1] = _mm_unpacklo_epi8(src23, src34);
2710
2.07k
        ss_128[2] = _mm_unpacklo_epi8(src45, src56);
2711
2712
8.29k
        do {
2713
8.29k
          const __m128i res = y_convolve_8tap_2x2_ssse3(
2714
8.29k
              src_ptr, src_stride, coeffs_128, s_16, ss_128);
2715
8.29k
          const __m128i r = sr_y_round_sse2(res);
2716
8.29k
          pack_store_2x2_sse2(r, dst, dst_stride);
2717
8.29k
          ss_128[0] = ss_128[1];
2718
8.29k
          ss_128[1] = ss_128[2];
2719
8.29k
          ss_128[2] = ss_128[3];
2720
8.29k
          src_ptr += 2 * src_stride;
2721
8.29k
          dst += 2 * dst_stride;
2722
8.29k
          y -= 2;
2723
8.29k
        } while (y);
2724
9.76k
      } else {
2725
9.76k
        __m128i s_32[8], ss_128[4];
2726
2727
9.76k
        assert(w == 4);
2728
2729
0
        s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 0 * src_stride));
2730
9.76k
        s_32[1] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 1 * src_stride));
2731
9.76k
        s_32[2] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 2 * src_stride));
2732
9.76k
        s_32[3] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 3 * src_stride));
2733
9.76k
        s_32[4] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 4 * src_stride));
2734
9.76k
        s_32[5] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 5 * src_stride));
2735
9.76k
        s_32[6] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 6 * src_stride));
2736
2737
9.76k
        const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
2738
9.76k
        const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
2739
9.76k
        const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
2740
9.76k
        const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[4]);
2741
9.76k
        const __m128i src45 = _mm_unpacklo_epi32(s_32[4], s_32[5]);
2742
9.76k
        const __m128i src56 = _mm_unpacklo_epi32(s_32[5], s_32[6]);
2743
2744
9.76k
        ss_128[0] = _mm_unpacklo_epi8(src01, src12);
2745
9.76k
        ss_128[1] = _mm_unpacklo_epi8(src23, src34);
2746
9.76k
        ss_128[2] = _mm_unpacklo_epi8(src45, src56);
2747
2748
52.0k
        do {
2749
52.0k
          const __m128i res = y_convolve_8tap_4x2_ssse3(
2750
52.0k
              src_ptr, src_stride, coeffs_128, s_32, ss_128);
2751
52.0k
          const __m128i r = sr_y_round_sse2(res);
2752
52.0k
          pack_store_4x2_sse2(r, dst, dst_stride);
2753
52.0k
          ss_128[0] = ss_128[1];
2754
52.0k
          ss_128[1] = ss_128[2];
2755
52.0k
          ss_128[2] = ss_128[3];
2756
52.0k
          src_ptr += 2 * src_stride;
2757
52.0k
          dst += 2 * dst_stride;
2758
52.0k
          y -= 2;
2759
52.0k
        } while (y);
2760
9.76k
      }
2761
19.2k
    } else {
2762
19.2k
      prepare_half_coeffs_8tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
2763
2764
19.2k
      if (w == 8) {
2765
10.1k
        __m128i s_64[8];
2766
10.1k
        __m256i ss_256[4];
2767
2768
10.1k
        s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 0 * src_stride));
2769
10.1k
        s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + 1 * src_stride));
2770
10.1k
        s_64[2] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
2771
10.1k
        s_64[3] = _mm_loadl_epi64((__m128i *)(src_ptr + 3 * src_stride));
2772
10.1k
        s_64[4] = _mm_loadl_epi64((__m128i *)(src_ptr + 4 * src_stride));
2773
10.1k
        s_64[5] = _mm_loadl_epi64((__m128i *)(src_ptr + 5 * src_stride));
2774
10.1k
        s_64[6] = _mm_loadl_epi64((__m128i *)(src_ptr + 6 * src_stride));
2775
2776
        // Load lines a and b. Line a to lower 128, line b to upper 128
2777
10.1k
        const __m256i src01 = _mm256_setr_m128i(s_64[0], s_64[1]);
2778
10.1k
        const __m256i src12 = _mm256_setr_m128i(s_64[1], s_64[2]);
2779
10.1k
        const __m256i src23 = _mm256_setr_m128i(s_64[2], s_64[3]);
2780
10.1k
        const __m256i src34 = _mm256_setr_m128i(s_64[3], s_64[4]);
2781
10.1k
        const __m256i src45 = _mm256_setr_m128i(s_64[4], s_64[5]);
2782
10.1k
        const __m256i src56 = _mm256_setr_m128i(s_64[5], s_64[6]);
2783
2784
10.1k
        ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
2785
10.1k
        ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
2786
10.1k
        ss_256[2] = _mm256_unpacklo_epi8(src45, src56);
2787
2788
10.1k
        y = h;
2789
61.3k
        do {
2790
61.3k
          const __m256i res = y_convolve_8tap_8x2_avx2(
2791
61.3k
              src_ptr, src_stride, coeffs_256, s_64, ss_256);
2792
61.3k
          sr_y_round_store_8x2_avx2(res, dst, dst_stride);
2793
61.3k
          ss_256[0] = ss_256[1];
2794
61.3k
          ss_256[1] = ss_256[2];
2795
61.3k
          ss_256[2] = ss_256[3];
2796
61.3k
          src_ptr += 2 * src_stride;
2797
61.3k
          dst += 2 * dst_stride;
2798
61.3k
          y -= 2;
2799
61.3k
        } while (y);
2800
10.1k
      } else if (w == 16) {
2801
6.01k
        __m128i s_128[8];
2802
6.01k
        __m256i ss_256[8], r[2];
2803
2804
6.01k
        s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 0 * src_stride));
2805
6.01k
        s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + 1 * src_stride));
2806
6.01k
        s_128[2] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
2807
6.01k
        s_128[3] = _mm_loadu_si128((__m128i *)(src_ptr + 3 * src_stride));
2808
6.01k
        s_128[4] = _mm_loadu_si128((__m128i *)(src_ptr + 4 * src_stride));
2809
6.01k
        s_128[5] = _mm_loadu_si128((__m128i *)(src_ptr + 5 * src_stride));
2810
6.01k
        s_128[6] = _mm_loadu_si128((__m128i *)(src_ptr + 6 * src_stride));
2811
2812
        // Load lines a and b. Line a to lower 128, line b to upper 128
2813
6.01k
        const __m256i src01 = _mm256_setr_m128i(s_128[0], s_128[1]);
2814
6.01k
        const __m256i src12 = _mm256_setr_m128i(s_128[1], s_128[2]);
2815
6.01k
        const __m256i src23 = _mm256_setr_m128i(s_128[2], s_128[3]);
2816
6.01k
        const __m256i src34 = _mm256_setr_m128i(s_128[3], s_128[4]);
2817
6.01k
        const __m256i src45 = _mm256_setr_m128i(s_128[4], s_128[5]);
2818
6.01k
        const __m256i src56 = _mm256_setr_m128i(s_128[5], s_128[6]);
2819
2820
6.01k
        ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
2821
6.01k
        ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
2822
6.01k
        ss_256[2] = _mm256_unpacklo_epi8(src45, src56);
2823
2824
6.01k
        ss_256[4] = _mm256_unpackhi_epi8(src01, src12);
2825
6.01k
        ss_256[5] = _mm256_unpackhi_epi8(src23, src34);
2826
6.01k
        ss_256[6] = _mm256_unpackhi_epi8(src45, src56);
2827
2828
6.01k
        y = h;
2829
46.9k
        do {
2830
46.9k
          y_convolve_8tap_16x2_avx2(src_ptr, src_stride, coeffs_256, s_128,
2831
46.9k
                                    ss_256, r);
2832
46.9k
          sr_y_round_store_16x2_avx2(r, dst, dst_stride);
2833
2834
46.9k
          ss_256[0] = ss_256[1];
2835
46.9k
          ss_256[1] = ss_256[2];
2836
46.9k
          ss_256[2] = ss_256[3];
2837
2838
46.9k
          ss_256[4] = ss_256[5];
2839
46.9k
          ss_256[5] = ss_256[6];
2840
46.9k
          ss_256[6] = ss_256[7];
2841
46.9k
          src_ptr += 2 * src_stride;
2842
46.9k
          dst += 2 * dst_stride;
2843
46.9k
          y -= 2;
2844
46.9k
        } while (y);
2845
6.01k
      } else {
2846
3.14k
        __m256i s_256[8], ss_256[8], tt_256[8], r[4];
2847
2848
3.14k
        assert(!(w % 32));
2849
2850
0
        x = 0;
2851
4.30k
        do {
2852
4.30k
          const uint8_t *s = src_ptr + x;
2853
4.30k
          uint8_t *d = dst + x;
2854
2855
4.30k
          s_256[0] = _mm256_loadu_si256((__m256i *)(s + 0 * src_stride));
2856
4.30k
          s_256[1] = _mm256_loadu_si256((__m256i *)(s + 1 * src_stride));
2857
4.30k
          s_256[2] = _mm256_loadu_si256((__m256i *)(s + 2 * src_stride));
2858
4.30k
          s_256[3] = _mm256_loadu_si256((__m256i *)(s + 3 * src_stride));
2859
4.30k
          s_256[4] = _mm256_loadu_si256((__m256i *)(s + 4 * src_stride));
2860
4.30k
          s_256[5] = _mm256_loadu_si256((__m256i *)(s + 5 * src_stride));
2861
4.30k
          s_256[6] = _mm256_loadu_si256((__m256i *)(s + 6 * src_stride));
2862
2863
4.30k
          ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
2864
4.30k
          ss_256[1] = _mm256_unpacklo_epi8(s_256[2], s_256[3]);
2865
4.30k
          ss_256[2] = _mm256_unpacklo_epi8(s_256[4], s_256[5]);
2866
4.30k
          ss_256[4] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
2867
4.30k
          ss_256[5] = _mm256_unpackhi_epi8(s_256[2], s_256[3]);
2868
4.30k
          ss_256[6] = _mm256_unpackhi_epi8(s_256[4], s_256[5]);
2869
2870
4.30k
          tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
2871
4.30k
          tt_256[1] = _mm256_unpacklo_epi8(s_256[3], s_256[4]);
2872
4.30k
          tt_256[2] = _mm256_unpacklo_epi8(s_256[5], s_256[6]);
2873
4.30k
          tt_256[4] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
2874
4.30k
          tt_256[5] = _mm256_unpackhi_epi8(s_256[3], s_256[4]);
2875
4.30k
          tt_256[6] = _mm256_unpackhi_epi8(s_256[5], s_256[6]);
2876
2877
4.30k
          y = h;
2878
92.8k
          do {
2879
92.8k
            y_convolve_8tap_32x2_avx2(s, src_stride, coeffs_256, s_256, ss_256,
2880
92.8k
                                      tt_256, r);
2881
92.8k
            sr_y_round_store_32x2_avx2(r, d, dst_stride);
2882
2883
92.8k
            ss_256[0] = ss_256[1];
2884
92.8k
            ss_256[1] = ss_256[2];
2885
92.8k
            ss_256[2] = ss_256[3];
2886
92.8k
            ss_256[4] = ss_256[5];
2887
92.8k
            ss_256[5] = ss_256[6];
2888
92.8k
            ss_256[6] = ss_256[7];
2889
2890
92.8k
            tt_256[0] = tt_256[1];
2891
92.8k
            tt_256[1] = tt_256[2];
2892
92.8k
            tt_256[2] = tt_256[3];
2893
92.8k
            tt_256[4] = tt_256[5];
2894
92.8k
            tt_256[5] = tt_256[6];
2895
92.8k
            tt_256[6] = tt_256[7];
2896
92.8k
            s += 2 * src_stride;
2897
92.8k
            d += 2 * dst_stride;
2898
92.8k
            y -= 2;
2899
92.8k
          } while (y);
2900
2901
4.30k
          x += 32;
2902
4.30k
        } while (x < w);
2903
3.14k
      }
2904
19.2k
    }
2905
31.1k
  }
2906
1.39M
}
2907
2908
// Horizontal 2-tap single-reference filter for one 32-wide chunk of a row.
//
// Chains x_convolve_2tap_32_avx2() (filter `src` with `coeffs` into two
// 256-bit result vectors) with sr_x_round_store_32_avx2() (write those
// results to `dst`). Callers in this file step `src`/`dst` by 32 between
// calls, so each call presumably covers 32 output pixels -- confirm against
// the two helpers.
static INLINE void sr_x_2tap_32_avx2(const uint8_t *const src,
                                     const __m256i coeffs[1],
                                     uint8_t *const dst) {
  __m256i filtered[2];  // output of the convolve step, input of the store step

  x_convolve_2tap_32_avx2(src, coeffs, filtered);
  sr_x_round_store_32_avx2(filtered, dst);
}
Unexecuted instantiation: convolve_2d_avx2.c:sr_x_2tap_32_avx2
convolve_avx2.c:sr_x_2tap_32_avx2
Line
Count
Source
2910
104k
                                     uint8_t *const dst) {
2911
104k
  __m256i r[2];
2912
2913
104k
  x_convolve_2tap_32_avx2(src, coeffs, r);
2914
104k
  sr_x_round_store_32_avx2(r, dst);
2915
104k
}
2916
2917
// Horizontal 6-tap single-reference filter for one 32-wide chunk of a row.
//
// Chains x_convolve_6tap_32_avx2() (filter `src` using `coeffs` and the
// shuffle masks in `filt`, producing two 256-bit result vectors) with
// sr_x_round_store_32_avx2() (write those results to `dst`). Callers in this
// file step `src`/`dst` by 32 between calls, so each call presumably covers
// 32 output pixels -- confirm against the two helpers.
static INLINE void sr_x_6tap_32_avx2(const uint8_t *const src,
                                     const __m256i coeffs[3],
                                     const __m256i filt[3],
                                     uint8_t *const dst) {
  __m256i filtered[2];  // output of the convolve step, input of the store step

  x_convolve_6tap_32_avx2(src, coeffs, filt, filtered);
  sr_x_round_store_32_avx2(filtered, dst);
}
Unexecuted instantiation: convolve_2d_avx2.c:sr_x_6tap_32_avx2
convolve_avx2.c:sr_x_6tap_32_avx2
Line
Count
Source
2920
2.47M
                                     uint8_t *const dst) {
2921
2.47M
  __m256i r[2];
2922
2923
2.47M
  x_convolve_6tap_32_avx2(src, coeffs, filt, r);
2924
2.47M
  sr_x_round_store_32_avx2(r, dst);
2925
2.47M
}
2926
2927
// Horizontal 8-tap single-reference filter for one 32-wide chunk of a row.
//
// Chains x_convolve_8tap_32_avx2() (filter `src` using `coeffs` and the
// shuffle masks in `filt`, producing two 256-bit result vectors) with
// sr_x_round_store_32_avx2() (write those results to `dst`). Callers in this
// file step `src`/`dst` by 32 between calls, so each call presumably covers
// 32 output pixels -- confirm against the two helpers.
static AOM_FORCE_INLINE void sr_x_8tap_32_avx2(const uint8_t *const src,
                                               const __m256i coeffs[4],
                                               const __m256i filt[4],
                                               uint8_t *const dst) {
  __m256i filtered[2];  // output of the convolve step, input of the store step

  x_convolve_8tap_32_avx2(src, coeffs, filt, filtered);
  sr_x_round_store_32_avx2(filtered, dst);
}
Unexecuted instantiation: convolve_2d_avx2.c:sr_x_8tap_32_avx2
convolve_avx2.c:sr_x_8tap_32_avx2
Line
Count
Source
2930
284k
                                               uint8_t *const dst) {
2931
284k
  __m256i r[2];
2932
2933
284k
  x_convolve_8tap_32_avx2(src, coeffs, filt, r);
2934
284k
  sr_x_round_store_32_avx2(r, dst);
2935
284k
}
2936
2937
// Horizontal-only single-reference convolution, specialised per filter length
// and block width.
//
// The tap count is obtained from get_filter_tap(filter_params_x, subpel_x_q4)
// and selects one of four paths:
//   * 2 tap: reads start at `src`. When subpel_x_q4 == 8 the filter is
//            replaced by a rounding average of horizontally adjacent pixels
//            (_mm_avg_epu8). Widths handled: 2, 4, 8, 16, 32, 64, 128.
//   * 4 tap: reads start at `src - 1`. Widths handled: 2, 4, 8 and any
//            multiple of 16.
//   * 6 tap: reads start at `src - 2`. Widths handled: 8, 16, 32, 64, 128.
//   * 8 tap: reads start at `src - 3`. Widths handled: 8, 16, 32, 64, 128.
// Any other tap count falls through without writing `dst`.
//
// Preconditions visible in the code:
//   * conv_params->round_0 must be 3 (asserted); conv_params is otherwise
//     unused.
//   * `h` must be positive: most row loops are do/while and run at least
//     once. Paths for w <= 16 consume two rows per iteration, so `h` must
//     also be even there.
//   * Several paths load a full 16- or 32-byte vector per row regardless of
//     `w`, so `src` must be readable past the nominal block width.
//
// src / src_stride : source pixels and the distance in bytes between rows.
// dst / dst_stride : destination pixels and the distance in bytes between
//                    rows.
// w, h             : block width and height in pixels.
// filter_params_x  : horizontal interpolation filter description.
// subpel_x_q4      : horizontal sub-pixel position used to pick the kernel.
static AOM_FORCE_INLINE void av1_convolve_x_sr_specialized_avx2(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int32_t w, int32_t h, const InterpFilterParams *filter_params_x,
    const int32_t subpel_x_q4, ConvolveParams *conv_params) {
  int32_t y = h;
  __m128i coeffs_128[4];
  __m256i coeffs_256[4];

  assert(conv_params->round_0 == 3);
  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
  (void)conv_params;  // only referenced by the asserts above

  const int horz_tap = get_filter_tap(filter_params_x, subpel_x_q4);

  if (horz_tap == 2) {
    // horz_filt as 2 tap
    const uint8_t *src_ptr = src;

    if (subpel_x_q4 != 8) {
      if (w <= 8) {
        prepare_half_coeffs_2tap_ssse3(filter_params_x, subpel_x_q4,
                                       coeffs_128);

        if (w == 2) {
          do {
            const __m128i res =
                x_convolve_2tap_2x2_sse4_1(src_ptr, src_stride, coeffs_128);
            const __m128i r = sr_x_round_sse2(res);
            pack_store_2x2_sse2(r, dst, dst_stride);
            src_ptr += 2 * src_stride;
            dst += 2 * dst_stride;
            y -= 2;
          } while (y);
        } else if (w == 4) {
          do {
            const __m128i res =
                x_convolve_2tap_4x2_ssse3(src_ptr, src_stride, coeffs_128);
            const __m128i r = sr_x_round_sse2(res);
            pack_store_4x2_sse2(r, dst, dst_stride);
            src_ptr += 2 * src_stride;
            dst += 2 * dst_stride;
            y -= 2;
          } while (y);
        } else {
          assert(w == 8);

          do {
            __m128i res[2];

            x_convolve_2tap_8x2_ssse3(src_ptr, src_stride, coeffs_128, res);
            res[0] = sr_x_round_sse2(res[0]);
            res[1] = sr_x_round_sse2(res[1]);
            // Low 8 bytes of d are row 0, high 8 bytes are row 1.
            const __m128i d = _mm_packus_epi16(res[0], res[1]);
            _mm_storel_epi64((__m128i *)dst, d);
            _mm_storeh_epi64((__m128i *)(dst + dst_stride), d);

            src_ptr += 2 * src_stride;
            dst += 2 * dst_stride;
            y -= 2;
          } while (y);
        }
      } else {
        prepare_half_coeffs_2tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);

        if (w == 16) {
          do {
            __m256i r[2];

            x_convolve_2tap_16x2_avx2(src_ptr, src_stride, coeffs_256, r);
            sr_x_round_store_16x2_avx2(r, dst, dst_stride);
            src_ptr += 2 * src_stride;
            dst += 2 * dst_stride;
            y -= 2;
          } while (y);
        } else if (w == 32) {
          do {
            sr_x_2tap_32_avx2(src_ptr, coeffs_256, dst);
            src_ptr += src_stride;
            dst += dst_stride;
          } while (--y);
        } else if (w == 64) {
          do {
            sr_x_2tap_32_avx2(src_ptr + 0 * 32, coeffs_256, dst + 0 * 32);
            sr_x_2tap_32_avx2(src_ptr + 1 * 32, coeffs_256, dst + 1 * 32);
            src_ptr += src_stride;
            dst += dst_stride;
          } while (--y);
        } else {
          assert(w == 128);

          do {
            sr_x_2tap_32_avx2(src_ptr + 0 * 32, coeffs_256, dst + 0 * 32);
            sr_x_2tap_32_avx2(src_ptr + 1 * 32, coeffs_256, dst + 1 * 32);
            sr_x_2tap_32_avx2(src_ptr + 2 * 32, coeffs_256, dst + 2 * 32);
            sr_x_2tap_32_avx2(src_ptr + 3 * 32, coeffs_256, dst + 3 * 32);
            src_ptr += src_stride;
            dst += dst_stride;
          } while (--y);
        }
      }
    } else {
      // average to get half pel
      if (w == 2) {
        do {
          __m128i s_128;

          s_128 = load_u8_4x2_sse4_1(src_ptr, src_stride);
          const __m128i s1 = _mm_srli_si128(s_128, 1);
          const __m128i d = _mm_avg_epu8(s_128, s1);
          // Row 0 is 16-bit lane 0 of d, row 1 is 16-bit lane 2. Write them
          // byte by byte: dst is a uint8_t buffer with no 2-byte alignment
          // guarantee, so a store through a uint16_t * cast would be
          // undefined behaviour. Low byte first matches the little-endian
          // layout the previous 16-bit stores produced on x86.
          const uint32_t row0 = (uint32_t)_mm_cvtsi128_si32(d);
          const uint32_t row1 = (uint32_t)_mm_extract_epi16(d, 2);
          dst[0] = (uint8_t)row0;
          dst[1] = (uint8_t)(row0 >> 8);
          dst[dst_stride] = (uint8_t)row1;
          dst[dst_stride + 1] = (uint8_t)(row1 >> 8);

          src_ptr += 2 * src_stride;
          dst += 2 * dst_stride;
          y -= 2;
        } while (y);
      } else if (w == 4) {
        do {
          __m128i s_128;

          s_128 = load_u8_8x2_sse2(src_ptr, src_stride);
          const __m128i s1 = _mm_srli_si128(s_128, 1);
          const __m128i d = _mm_avg_epu8(s_128, s1);
          // Row 0 is 32-bit lane 0 of d, row 1 is 32-bit lane 2. Shift lane 2
          // down and store both rows through the same helper rather than
          // through an int32_t * cast of the byte buffer.
          xx_storel_32(dst, d);
          xx_storel_32(dst + dst_stride, _mm_srli_si128(d, 8));

          src_ptr += 2 * src_stride;
          dst += 2 * dst_stride;
          y -= 2;
        } while (y);
      } else if (w == 8) {
        do {
          const __m128i s00 = _mm_loadu_si128((const __m128i *)src_ptr);
          const __m128i s10 =
              _mm_loadu_si128((const __m128i *)(src_ptr + src_stride));
          const __m128i s01 = _mm_srli_si128(s00, 1);
          const __m128i s11 = _mm_srli_si128(s10, 1);
          const __m128i d0 = _mm_avg_epu8(s00, s01);
          const __m128i d1 = _mm_avg_epu8(s10, s11);
          _mm_storel_epi64((__m128i *)dst, d0);
          _mm_storel_epi64((__m128i *)(dst + dst_stride), d1);

          src_ptr += 2 * src_stride;
          dst += 2 * dst_stride;
          y -= 2;
        } while (y);
      } else if (w == 16) {
        do {
          const __m128i s00 = _mm_loadu_si128((const __m128i *)src_ptr);
          const __m128i s01 = _mm_loadu_si128((const __m128i *)(src_ptr + 1));
          const __m128i s10 =
              _mm_loadu_si128((const __m128i *)(src_ptr + src_stride));
          const __m128i s11 =
              _mm_loadu_si128((const __m128i *)(src_ptr + src_stride + 1));
          const __m128i d0 = _mm_avg_epu8(s00, s01);
          const __m128i d1 = _mm_avg_epu8(s10, s11);
          _mm_storeu_si128((__m128i *)dst, d0);
          _mm_storeu_si128((__m128i *)(dst + dst_stride), d1);

          src_ptr += 2 * src_stride;
          dst += 2 * dst_stride;
          y -= 2;
        } while (y);
      } else if (w == 32) {
        do {
          sr_x_2tap_32_avg_avx2(src_ptr, dst);
          src_ptr += src_stride;
          dst += dst_stride;
        } while (--y);
      } else if (w == 64) {
        do {
          sr_x_2tap_32_avg_avx2(src_ptr + 0 * 32, dst + 0 * 32);
          sr_x_2tap_32_avg_avx2(src_ptr + 1 * 32, dst + 1 * 32);
          src_ptr += src_stride;
          dst += dst_stride;
        } while (--y);
      } else {
        assert(w == 128);

        do {
          sr_x_2tap_32_avg_avx2(src_ptr + 0 * 32, dst + 0 * 32);
          sr_x_2tap_32_avg_avx2(src_ptr + 1 * 32, dst + 1 * 32);
          sr_x_2tap_32_avg_avx2(src_ptr + 2 * 32, dst + 2 * 32);
          sr_x_2tap_32_avg_avx2(src_ptr + 3 * 32, dst + 3 * 32);
          src_ptr += src_stride;
          dst += dst_stride;
        } while (--y);
      }
    }
  } else if (horz_tap == 4) {
    // horz_filt as 4 tap
    const uint8_t *src_ptr = src - 1;

    prepare_half_coeffs_4tap_ssse3(filter_params_x, subpel_x_q4, coeffs_128);

    if (w == 2) {
      do {
        const __m128i res =
            x_convolve_4tap_2x2_ssse3(src_ptr, src_stride, coeffs_128);
        const __m128i r = sr_x_round_sse2(res);
        pack_store_2x2_sse2(r, dst, dst_stride);
        src_ptr += 2 * src_stride;
        dst += 2 * dst_stride;
        y -= 2;
      } while (y);
    } else if (w == 4) {
      do {
        const __m128i res =
            x_convolve_4tap_4x2_ssse3(src_ptr, src_stride, coeffs_128);
        const __m128i r = sr_x_round_sse2(res);
        pack_store_4x2_sse2(r, dst, dst_stride);
        src_ptr += 2 * src_stride;
        dst += 2 * dst_stride;
        y -= 2;
      } while (y);
    } else if (w == 8) {
      // TODO(chiyotsai@google.com): Reuse the old SIMD code here. Need to
      // rewrite this for better performance later.
      __m256i filt_256[2];
      prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs_256);

      filt_256[0] = _mm256_loadu_si256((__m256i const *)filt1_global_avx2);
      filt_256[1] = _mm256_loadu_si256((__m256i const *)filt2_global_avx2);
      for (int i = 0; i < h; i += 2) {
        // Row i goes to the lower 128 bits, row i + 1 to the upper 128 bits.
        const __m256i data = _mm256_permute2x128_si256(
            _mm256_castsi128_si256(_mm_loadu_si128(
                (const __m128i *)(&src_ptr[i * src_stride]))),
            _mm256_castsi128_si256(_mm_loadu_si128(
                (const __m128i *)(&src_ptr[i * src_stride + src_stride]))),
            0x20);

        // coeffs_256 + 1 skips the first coefficient pair filled in by
        // prepare_coeffs_lowbd().
        __m256i res_16b = convolve_lowbd_x_4tap(data, coeffs_256 + 1, filt_256);
        res_16b = sr_x_round_avx2(res_16b);

        __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);

        const __m128i res_0 = _mm256_castsi256_si128(res_8b);
        const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);

        _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0);
        _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1);
      }
    } else {
      assert(!(w % 16));
      // TODO(chiyotsai@google.com): Reuse the old SIMD code here. Need to
      // rewrite this for better performance later.
      __m256i filt_256[2];
      prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs_256);
      filt_256[0] = _mm256_loadu_si256((__m256i const *)filt1_global_avx2);
      filt_256[1] = _mm256_loadu_si256((__m256i const *)filt2_global_avx2);

      for (int i = 0; i < h; ++i) {
        for (int j = 0; j < w; j += 16) {
          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17
          // 18 19 20 21 22 23
          const __m256i data = _mm256_inserti128_si256(
              _mm256_loadu_si256(
                  (const __m256i *)&src_ptr[(i * src_stride) + j]),
              _mm_loadu_si128(
                  (const __m128i *)&src_ptr[(i * src_stride) + (j + 8)]),
              1);

          __m256i res_16b =
              convolve_lowbd_x_4tap(data, coeffs_256 + 1, filt_256);
          res_16b = sr_x_round_avx2(res_16b);

          /* rounding code */
          // 8 bit conversion and saturation to uint8
          __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);

          // Store values into the destination buffer
          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
          // 216 == 0xD8 selects 64-bit lanes 0, 2, 1, 3, gathering the two
          // packed halves into the low 128 bits.
          res_8b = _mm256_permute4x64_epi64(res_8b, 216);
          __m128i res = _mm256_castsi256_si128(res_8b);
          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
        }
      }
    }
  } else {
    __m256i filt_256[4];

    filt_256[0] = _mm256_loadu_si256((__m256i const *)filt1_global_avx2);
    filt_256[1] = _mm256_loadu_si256((__m256i const *)filt2_global_avx2);
    filt_256[2] = _mm256_loadu_si256((__m256i const *)filt3_global_avx2);

    if (horz_tap == 6) {
      // horz_filt as 6 tap
      const uint8_t *src_ptr = src - 2;

      prepare_half_coeffs_6tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);

      if (w == 8) {
        do {
          const __m256i res = x_convolve_6tap_8x2_avx2(src_ptr, src_stride,
                                                       coeffs_256, filt_256);
          sr_x_round_store_8x2_avx2(res, dst, dst_stride);
          src_ptr += 2 * src_stride;
          dst += 2 * dst_stride;
          y -= 2;
        } while (y);
      } else if (w == 16) {
        do {
          __m256i r[2];

          x_convolve_6tap_16x2_avx2(src_ptr, src_stride, coeffs_256, filt_256,
                                    r);
          sr_x_round_store_16x2_avx2(r, dst, dst_stride);
          src_ptr += 2 * src_stride;
          dst += 2 * dst_stride;
          y -= 2;
        } while (y);
      } else if (w == 32) {
        do {
          sr_x_6tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
          src_ptr += src_stride;
          dst += dst_stride;
        } while (--y);
      } else if (w == 64) {
        do {
          sr_x_6tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
          sr_x_6tap_32_avx2(src_ptr + 32, coeffs_256, filt_256, dst + 32);
          src_ptr += src_stride;
          dst += dst_stride;
        } while (--y);
      } else {
        assert(w == 128);

        do {
          sr_x_6tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
          sr_x_6tap_32_avx2(src_ptr + 1 * 32, coeffs_256, filt_256,
                            dst + 1 * 32);
          sr_x_6tap_32_avx2(src_ptr + 2 * 32, coeffs_256, filt_256,
                            dst + 2 * 32);
          sr_x_6tap_32_avx2(src_ptr + 3 * 32, coeffs_256, filt_256,
                            dst + 3 * 32);
          src_ptr += src_stride;
          dst += dst_stride;
        } while (--y);
      }
    } else if (horz_tap == 8) {
      // horz_filt as 8 tap
      const uint8_t *src_ptr = src - 3;

      // The fourth shuffle mask is only needed by the 8-tap kernels.
      filt_256[3] = _mm256_loadu_si256((__m256i const *)filt4_global_avx2);

      prepare_half_coeffs_8tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);

      if (w == 8) {
        do {
          const __m256i res = x_convolve_8tap_8x2_avx2(src_ptr, src_stride,
                                                       coeffs_256, filt_256);
          sr_x_round_store_8x2_avx2(res, dst, dst_stride);
          src_ptr += 2 * src_stride;
          dst += 2 * dst_stride;
          y -= 2;
        } while (y);
      } else if (w == 16) {
        do {
          __m256i r[2];

          x_convolve_8tap_16x2_avx2(src_ptr, src_stride, coeffs_256, filt_256,
                                    r);
          sr_x_round_store_16x2_avx2(r, dst, dst_stride);
          src_ptr += 2 * src_stride;
          dst += 2 * dst_stride;
          y -= 2;
        } while (y);
      } else if (w == 32) {
        do {
          sr_x_8tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
          src_ptr += src_stride;
          dst += dst_stride;
        } while (--y);
      } else if (w == 64) {
        do {
          sr_x_8tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
          sr_x_8tap_32_avx2(src_ptr + 32, coeffs_256, filt_256, dst + 32);
          src_ptr += src_stride;
          dst += dst_stride;
        } while (--y);
      } else {
        assert(w == 128);

        do {
          sr_x_8tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
          sr_x_8tap_32_avx2(src_ptr + 1 * 32, coeffs_256, filt_256,
                            dst + 1 * 32);
          sr_x_8tap_32_avx2(src_ptr + 2 * 32, coeffs_256, filt_256,
                            dst + 2 * 32);
          sr_x_8tap_32_avx2(src_ptr + 3 * 32, coeffs_256, filt_256,
                            dst + 3 * 32);
          src_ptr += src_stride;
          dst += dst_stride;
        } while (--y);
      }
    }
  }
}
Unexecuted instantiation: convolve_2d_avx2.c:av1_convolve_x_sr_specialized_avx2
convolve_avx2.c:av1_convolve_x_sr_specialized_avx2
Line
Count
Source
2940
1.47M
    const int32_t subpel_x_q4, ConvolveParams *conv_params) {
2941
1.47M
  int32_t y = h;
2942
1.47M
  __m128i coeffs_128[4];
2943
1.47M
  __m256i coeffs_256[4];
2944
2945
1.47M
  assert(conv_params->round_0 == 3);
2946
0
  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
2947
1.47M
         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
2948
0
  (void)conv_params;
2949
2950
1.47M
  const int horz_tap = get_filter_tap(filter_params_x, subpel_x_q4);
2951
2952
1.47M
  if (horz_tap == 2) {
2953
    // horz_filt as 2 tap
2954
35.5k
    const uint8_t *src_ptr = src;
2955
2956
35.5k
    if (subpel_x_q4 != 8) {
2957
19.4k
      if (w <= 8) {
2958
15.4k
        prepare_half_coeffs_2tap_ssse3(filter_params_x, subpel_x_q4,
2959
15.4k
                                       coeffs_128);
2960
2961
15.4k
        if (w == 2) {
2962
5.69k
          do {
2963
5.69k
            const __m128i res =
2964
5.69k
                x_convolve_2tap_2x2_sse4_1(src_ptr, src_stride, coeffs_128);
2965
5.69k
            const __m128i r = sr_x_round_sse2(res);
2966
5.69k
            pack_store_2x2_sse2(r, dst, dst_stride);
2967
5.69k
            src_ptr += 2 * src_stride;
2968
5.69k
            dst += 2 * dst_stride;
2969
5.69k
            y -= 2;
2970
5.69k
          } while (y);
2971
13.2k
        } else if (w == 4) {
2972
33.3k
          do {
2973
33.3k
            const __m128i res =
2974
33.3k
                x_convolve_2tap_4x2_ssse3(src_ptr, src_stride, coeffs_128);
2975
33.3k
            const __m128i r = sr_x_round_sse2(res);
2976
33.3k
            pack_store_4x2_sse2(r, dst, dst_stride);
2977
33.3k
            src_ptr += 2 * src_stride;
2978
33.3k
            dst += 2 * dst_stride;
2979
33.3k
            y -= 2;
2980
33.3k
          } while (y);
2981
7.04k
        } else {
2982
6.23k
          assert(w == 8);
2983
2984
34.5k
          do {
2985
34.5k
            __m128i res[2];
2986
2987
34.5k
            x_convolve_2tap_8x2_ssse3(src_ptr, src_stride, coeffs_128, res);
2988
34.5k
            res[0] = sr_x_round_sse2(res[0]);
2989
34.5k
            res[1] = sr_x_round_sse2(res[1]);
2990
34.5k
            const __m128i d = _mm_packus_epi16(res[0], res[1]);
2991
34.5k
            _mm_storel_epi64((__m128i *)dst, d);
2992
34.5k
            _mm_storeh_epi64((__m128i *)(dst + dst_stride), d);
2993
2994
34.5k
            src_ptr += 2 * src_stride;
2995
34.5k
            dst += 2 * dst_stride;
2996
34.5k
            y -= 2;
2997
34.5k
          } while (y);
2998
6.23k
        }
2999
15.4k
      } else {
3000
4.04k
        prepare_half_coeffs_2tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);
3001
3002
4.04k
        if (w == 16) {
3003
14.1k
          do {
3004
14.1k
            __m256i r[2];
3005
3006
14.1k
            x_convolve_2tap_16x2_avx2(src_ptr, src_stride, coeffs_256, r);
3007
14.1k
            sr_x_round_store_16x2_avx2(r, dst, dst_stride);
3008
14.1k
            src_ptr += 2 * src_stride;
3009
14.1k
            dst += 2 * dst_stride;
3010
14.1k
            y -= 2;
3011
14.1k
          } while (y);
3012
2.68k
        } else if (w == 32) {
3013
15.0k
          do {
3014
15.0k
            sr_x_2tap_32_avx2(src_ptr, coeffs_256, dst);
3015
15.0k
            src_ptr += src_stride;
3016
15.0k
            dst += dst_stride;
3017
15.0k
          } while (--y);
3018
762
        } else if (w == 64) {
3019
20.1k
          do {
3020
20.1k
            sr_x_2tap_32_avx2(src_ptr + 0 * 32, coeffs_256, dst + 0 * 32);
3021
20.1k
            sr_x_2tap_32_avx2(src_ptr + 1 * 32, coeffs_256, dst + 1 * 32);
3022
20.1k
            src_ptr += src_stride;
3023
20.1k
            dst += dst_stride;
3024
20.1k
          } while (--y);
3025
477
        } else {
3026
127
          assert(w == 128);
3027
3028
12.2k
          do {
3029
12.2k
            sr_x_2tap_32_avx2(src_ptr + 0 * 32, coeffs_256, dst + 0 * 32);
3030
12.2k
            sr_x_2tap_32_avx2(src_ptr + 1 * 32, coeffs_256, dst + 1 * 32);
3031
12.2k
            sr_x_2tap_32_avx2(src_ptr + 2 * 32, coeffs_256, dst + 2 * 32);
3032
12.2k
            sr_x_2tap_32_avx2(src_ptr + 3 * 32, coeffs_256, dst + 3 * 32);
3033
12.2k
            src_ptr += src_stride;
3034
12.2k
            dst += dst_stride;
3035
12.2k
          } while (--y);
3036
127
        }
3037
4.04k
      }
3038
19.4k
    } else {
3039
      // average to get half pel
3040
16.0k
      if (w == 2) {
3041
4.99k
        do {
3042
4.99k
          __m128i s_128;
3043
3044
4.99k
          s_128 = load_u8_4x2_sse4_1(src_ptr, src_stride);
3045
4.99k
          const __m128i s1 = _mm_srli_si128(s_128, 1);
3046
4.99k
          const __m128i d = _mm_avg_epu8(s_128, s1);
3047
4.99k
          *(uint16_t *)dst = (uint16_t)_mm_cvtsi128_si32(d);
3048
4.99k
          *(uint16_t *)(dst + dst_stride) = _mm_extract_epi16(d, 2);
3049
3050
4.99k
          src_ptr += 2 * src_stride;
3051
4.99k
          dst += 2 * dst_stride;
3052
4.99k
          y -= 2;
3053
4.99k
        } while (y);
3054
13.8k
      } else if (w == 4) {
3055
37.8k
        do {
3056
37.8k
          __m128i s_128;
3057
3058
37.8k
          s_128 = load_u8_8x2_sse2(src_ptr, src_stride);
3059
37.8k
          const __m128i s1 = _mm_srli_si128(s_128, 1);
3060
37.8k
          const __m128i d = _mm_avg_epu8(s_128, s1);
3061
37.8k
          xx_storel_32(dst, d);
3062
37.8k
          *(int32_t *)(dst + dst_stride) = _mm_extract_epi32(d, 2);
3063
3064
37.8k
          src_ptr += 2 * src_stride;
3065
37.8k
          dst += 2 * dst_stride;
3066
37.8k
          y -= 2;
3067
37.8k
        } while (y);
3068
9.09k
      } else if (w == 8) {
3069
59.1k
        do {
3070
59.1k
          const __m128i s00 = _mm_loadu_si128((__m128i *)src_ptr);
3071
59.1k
          const __m128i s10 =
3072
59.1k
              _mm_loadu_si128((__m128i *)(src_ptr + src_stride));
3073
59.1k
          const __m128i s01 = _mm_srli_si128(s00, 1);
3074
59.1k
          const __m128i s11 = _mm_srli_si128(s10, 1);
3075
59.1k
          const __m128i d0 = _mm_avg_epu8(s00, s01);
3076
59.1k
          const __m128i d1 = _mm_avg_epu8(s10, s11);
3077
59.1k
          _mm_storel_epi64((__m128i *)dst, d0);
3078
59.1k
          _mm_storel_epi64((__m128i *)(dst + dst_stride), d1);
3079
3080
59.1k
          src_ptr += 2 * src_stride;
3081
59.1k
          dst += 2 * dst_stride;
3082
59.1k
          y -= 2;
3083
59.1k
        } while (y);
3084
6.28k
      } else if (w == 16) {
3085
22.6k
        do {
3086
22.6k
          const __m128i s00 = _mm_loadu_si128((__m128i *)src_ptr);
3087
22.6k
          const __m128i s01 = _mm_loadu_si128((__m128i *)(src_ptr + 1));
3088
22.6k
          const __m128i s10 =
3089
22.6k
              _mm_loadu_si128((__m128i *)(src_ptr + src_stride));
3090
22.6k
          const __m128i s11 =
3091
22.6k
              _mm_loadu_si128((__m128i *)(src_ptr + src_stride + 1));
3092
22.6k
          const __m128i d0 = _mm_avg_epu8(s00, s01);
3093
22.6k
          const __m128i d1 = _mm_avg_epu8(s10, s11);
3094
22.6k
          _mm_storeu_si128((__m128i *)dst, d0);
3095
22.6k
          _mm_storeu_si128((__m128i *)(dst + dst_stride), d1);
3096
3097
22.6k
          src_ptr += 2 * src_stride;
3098
22.6k
          dst += 2 * dst_stride;
3099
22.6k
          y -= 2;
3100
22.6k
        } while (y);
3101
1.81k
      } else if (w == 32) {
3102
11.8k
        do {
3103
11.8k
          sr_x_2tap_32_avg_avx2(src_ptr, dst);
3104
11.8k
          src_ptr += src_stride;
3105
11.8k
          dst += dst_stride;
3106
11.8k
        } while (--y);
3107
541
      } else if (w == 64) {
3108
12.2k
        do {
3109
12.2k
          sr_x_2tap_32_avg_avx2(src_ptr + 0 * 32, dst + 0 * 32);
3110
12.2k
          sr_x_2tap_32_avg_avx2(src_ptr + 1 * 32, dst + 1 * 32);
3111
12.2k
          src_ptr += src_stride;
3112
12.2k
          dst += dst_stride;
3113
12.2k
        } while (--y);
3114
330
      } else {
3115
125
        assert(w == 128);
3116
3117
10.6k
        do {
3118
10.6k
          sr_x_2tap_32_avg_avx2(src_ptr + 0 * 32, dst + 0 * 32);
3119
10.6k
          sr_x_2tap_32_avg_avx2(src_ptr + 1 * 32, dst + 1 * 32);
3120
10.6k
          sr_x_2tap_32_avg_avx2(src_ptr + 2 * 32, dst + 2 * 32);
3121
10.6k
          sr_x_2tap_32_avg_avx2(src_ptr + 3 * 32, dst + 3 * 32);
3122
10.6k
          src_ptr += src_stride;
3123
10.6k
          dst += dst_stride;
3124
10.6k
        } while (--y);
3125
125
      }
3126
16.0k
    }
3127
1.43M
  } else if (horz_tap == 4) {
3128
    // horz_filt as 4 tap
3129
686k
    const uint8_t *src_ptr = src - 1;
3130
3131
686k
    prepare_half_coeffs_4tap_ssse3(filter_params_x, subpel_x_q4, coeffs_128);
3132
3133
686k
    if (w == 2) {
3134
334k
      do {
3135
334k
        const __m128i res =
3136
334k
            x_convolve_4tap_2x2_ssse3(src_ptr, src_stride, coeffs_128);
3137
334k
        const __m128i r = sr_x_round_sse2(res);
3138
334k
        pack_store_2x2_sse2(r, dst, dst_stride);
3139
334k
        src_ptr += 2 * src_stride;
3140
334k
        dst += 2 * dst_stride;
3141
334k
        y -= 2;
3142
334k
      } while (y);
3143
544k
    } else if (w == 4) {
3144
1.68M
      do {
3145
1.68M
        const __m128i res =
3146
1.68M
            x_convolve_4tap_4x2_ssse3(src_ptr, src_stride, coeffs_128);
3147
1.68M
        const __m128i r = sr_x_round_sse2(res);
3148
1.68M
        pack_store_4x2_sse2(r, dst, dst_stride);
3149
1.68M
        src_ptr += 2 * src_stride;
3150
1.68M
        dst += 2 * dst_stride;
3151
1.68M
        y -= 2;
3152
1.68M
      } while (y);
3153
507k
    } else if (w == 8) {
3154
      // TODO(chiyotsai@google.com): Reuse the old SIMD code here. Need to
3155
      // rewrite this for better performance later.
3156
22.6k
      __m256i filt_256[2];
3157
22.6k
      prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs_256);
3158
3159
22.6k
      filt_256[0] = _mm256_loadu_si256((__m256i const *)filt1_global_avx2);
3160
22.6k
      filt_256[1] = _mm256_loadu_si256((__m256i const *)filt2_global_avx2);
3161
104k
      for (int i = 0; i < h; i += 2) {
3162
82.2k
        const __m256i data = _mm256_permute2x128_si256(
3163
82.2k
            _mm256_castsi128_si256(
3164
82.2k
                _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))),
3165
82.2k
            _mm256_castsi128_si256(_mm_loadu_si128(
3166
82.2k
                (__m128i *)(&src_ptr[i * src_stride + src_stride]))),
3167
82.2k
            0x20);
3168
3169
82.2k
        __m256i res_16b = convolve_lowbd_x_4tap(data, coeffs_256 + 1, filt_256);
3170
82.2k
        res_16b = sr_x_round_avx2(res_16b);
3171
3172
82.2k
        __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
3173
3174
82.2k
        const __m128i res_0 = _mm256_castsi256_si128(res_8b);
3175
82.2k
        const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);
3176
3177
82.2k
        _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0);
3178
82.2k
        _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1);
3179
82.2k
      }
3180
22.6k
    } else {
3181
14.9k
      assert(!(w % 16));
3182
      // TODO(chiyotsai@google.com): Reuse the old SIMD code here. Need to
3183
      // rewrite this for better performance later.
3184
0
      __m256i filt_256[2];
3185
14.9k
      prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs_256);
3186
14.9k
      filt_256[0] = _mm256_loadu_si256((__m256i const *)filt1_global_avx2);
3187
14.9k
      filt_256[1] = _mm256_loadu_si256((__m256i const *)filt2_global_avx2);
3188
3189
244k
      for (int i = 0; i < h; ++i) {
3190
699k
        for (int j = 0; j < w; j += 16) {
3191
          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17
3192
          // 18 19 20 21 22 23
3193
469k
          const __m256i data = _mm256_inserti128_si256(
3194
469k
              _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]),
3195
469k
              _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]),
3196
469k
              1);
3197
3198
469k
          __m256i res_16b =
3199
469k
              convolve_lowbd_x_4tap(data, coeffs_256 + 1, filt_256);
3200
469k
          res_16b = sr_x_round_avx2(res_16b);
3201
3202
          /* rounding code */
3203
          // 8 bit conversion and saturation to uint8
3204
469k
          __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
3205
3206
          // Store values into the destination buffer
3207
          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
3208
469k
          res_8b = _mm256_permute4x64_epi64(res_8b, 216);
3209
469k
          __m128i res = _mm256_castsi256_si128(res_8b);
3210
469k
          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
3211
469k
        }
3212
229k
      }
3213
14.9k
    }
3214
748k
  } else {
3215
748k
    __m256i filt_256[4];
3216
3217
748k
    filt_256[0] = _mm256_loadu_si256((__m256i const *)filt1_global_avx2);
3218
748k
    filt_256[1] = _mm256_loadu_si256((__m256i const *)filt2_global_avx2);
3219
748k
    filt_256[2] = _mm256_loadu_si256((__m256i const *)filt3_global_avx2);
3220
3221
748k
    if (horz_tap == 6) {
3222
      // horz_filt as 6 tap
3223
697k
      const uint8_t *src_ptr = src - 2;
3224
3225
697k
      prepare_half_coeffs_6tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);
3226
3227
697k
      if (w == 8) {
3228
1.60M
        do {
3229
1.60M
          const __m256i res = x_convolve_6tap_8x2_avx2(src_ptr, src_stride,
3230
1.60M
                                                       coeffs_256, filt_256);
3231
1.60M
          sr_x_round_store_8x2_avx2(res, dst, dst_stride);
3232
1.60M
          src_ptr += 2 * src_stride;
3233
1.60M
          dst += 2 * dst_stride;
3234
1.60M
          y -= 2;
3235
1.60M
        } while (y);
3236
416k
      } else if (w == 16) {
3237
1.16M
        do {
3238
1.16M
          __m256i r[2];
3239
3240
1.16M
          x_convolve_6tap_16x2_avx2(src_ptr, src_stride, coeffs_256, filt_256,
3241
1.16M
                                    r);
3242
1.16M
          sr_x_round_store_16x2_avx2(r, dst, dst_stride);
3243
1.16M
          src_ptr += 2 * src_stride;
3244
1.16M
          dst += 2 * dst_stride;
3245
1.16M
          y -= 2;
3246
1.16M
        } while (y);
3247
216k
      } else if (w == 32) {
3248
1.10M
        do {
3249
1.10M
          sr_x_6tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
3250
1.10M
          src_ptr += src_stride;
3251
1.10M
          dst += dst_stride;
3252
1.10M
        } while (--y);
3253
53.6k
      } else if (w == 64) {
3254
451k
        do {
3255
451k
          sr_x_6tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
3256
451k
          sr_x_6tap_32_avx2(src_ptr + 32, coeffs_256, filt_256, dst + 32);
3257
451k
          src_ptr += src_stride;
3258
451k
          dst += dst_stride;
3259
451k
        } while (--y);
3260
10.2k
      } else {
3261
1.09k
        assert(w == 128);
3262
3263
117k
        do {
3264
117k
          sr_x_6tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
3265
117k
          sr_x_6tap_32_avx2(src_ptr + 1 * 32, coeffs_256, filt_256,
3266
117k
                            dst + 1 * 32);
3267
117k
          sr_x_6tap_32_avx2(src_ptr + 2 * 32, coeffs_256, filt_256,
3268
117k
                            dst + 2 * 32);
3269
117k
          sr_x_6tap_32_avx2(src_ptr + 3 * 32, coeffs_256, filt_256,
3270
117k
                            dst + 3 * 32);
3271
117k
          src_ptr += src_stride;
3272
117k
          dst += dst_stride;
3273
117k
        } while (--y);
3274
1.09k
      }
3275
697k
    } else if (horz_tap == 8) {
3276
      // horz_filt as 8 tap
3277
51.2k
      const uint8_t *src_ptr = src - 3;
3278
3279
51.2k
      filt_256[3] = _mm256_loadu_si256((__m256i const *)filt4_global_avx2);
3280
3281
51.2k
      prepare_half_coeffs_8tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);
3282
3283
51.2k
      if (w == 8) {
3284
116k
        do {
3285
116k
          const __m256i res = x_convolve_8tap_8x2_avx2(src_ptr, src_stride,
3286
116k
                                                       coeffs_256, filt_256);
3287
116k
          sr_x_round_store_8x2_avx2(res, dst, dst_stride);
3288
116k
          src_ptr += 2 * src_stride;
3289
116k
          dst += 2 * dst_stride;
3290
116k
          y -= 2;
3291
116k
        } while (y);
3292
32.1k
      } else if (w == 16) {
3293
76.1k
        do {
3294
76.1k
          __m256i r[2];
3295
3296
76.1k
          x_convolve_8tap_16x2_avx2(src_ptr, src_stride, coeffs_256, filt_256,
3297
76.1k
                                    r);
3298
76.1k
          sr_x_round_store_16x2_avx2(r, dst, dst_stride);
3299
76.1k
          src_ptr += 2 * src_stride;
3300
76.1k
          dst += 2 * dst_stride;
3301
76.1k
          y -= 2;
3302
76.1k
        } while (y);
3303
14.0k
      } else if (w == 32) {
3304
82.2k
        do {
3305
82.2k
          sr_x_8tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
3306
82.2k
          src_ptr += src_stride;
3307
82.2k
          dst += dst_stride;
3308
82.2k
        } while (--y);
3309
3.53k
      } else if (w == 64) {
3310
58.9k
        do {
3311
58.9k
          sr_x_8tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
3312
58.9k
          sr_x_8tap_32_avx2(src_ptr + 32, coeffs_256, filt_256, dst + 32);
3313
58.9k
          src_ptr += src_stride;
3314
58.9k
          dst += dst_stride;
3315
58.9k
        } while (--y);
3316
1.29k
      } else {
3317
262
        assert(w == 128);
3318
3319
20.9k
        do {
3320
20.9k
          sr_x_8tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
3321
20.9k
          sr_x_8tap_32_avx2(src_ptr + 1 * 32, coeffs_256, filt_256,
3322
20.9k
                            dst + 1 * 32);
3323
20.9k
          sr_x_8tap_32_avx2(src_ptr + 2 * 32, coeffs_256, filt_256,
3324
20.9k
                            dst + 2 * 32);
3325
20.9k
          sr_x_8tap_32_avx2(src_ptr + 3 * 32, coeffs_256, filt_256,
3326
20.9k
                            dst + 3 * 32);
3327
20.9k
          src_ptr += src_stride;
3328
20.9k
          dst += dst_stride;
3329
20.9k
        } while (--y);
3330
262
      }
3331
51.2k
    }
3332
748k
  }
3333
1.47M
}
3334
3335
#endif  // THIRD_PARTY_SVT_AV1_CONVOLVE_AVX2_H_