Coverage Report

Created: 2025-11-16 07:09

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/aom/third_party/SVT-AV1/convolve_avx2.h
Line
Count
Source
1
/*
2
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved
3
 *
4
 * This source code is subject to the terms of the BSD 2 Clause License and
5
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6
 * was not distributed with this source code in the LICENSE file, you can
7
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8
 * Media Patent License 1.0 was not distributed with this source code in the
9
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10
 */
11
12
#ifndef THIRD_PARTY_SVT_AV1_CONVOLVE_AVX2_H_
13
#define THIRD_PARTY_SVT_AV1_CONVOLVE_AVX2_H_
14
15
#include "EbMemory_AVX2.h"
16
#include "EbMemory_SSE4_1.h"
17
#include "synonyms.h"
18
19
#include "aom_dsp/aom_filter.h"
20
#include "aom_dsp/x86/convolve_avx2.h"
21
#include "aom_dsp/x86/mem_sse2.h"
22
23
static inline void populate_coeffs_4tap_avx2(const __m128i coeffs_128,
24
254k
                                             __m256i coeffs[2]) {
25
254k
  const __m256i coeffs_256 = _mm256_broadcastsi128_si256(coeffs_128);
26
27
  // coeffs 2 3 2 3 2 3 2 3
28
254k
  coeffs[0] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0604u));
29
  // coeffs 4 5 4 5 4 5 4 5
30
254k
  coeffs[1] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0a08u));
31
254k
}
convolve_2d_avx2.c:populate_coeffs_4tap_avx2
Line
Count
Source
24
52.5k
                                             __m256i coeffs[2]) {
25
52.5k
  const __m256i coeffs_256 = _mm256_broadcastsi128_si256(coeffs_128);
26
27
  // coeffs 2 3 2 3 2 3 2 3
28
52.5k
  coeffs[0] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0604u));
29
  // coeffs 4 5 4 5 4 5 4 5
30
52.5k
  coeffs[1] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0a08u));
31
52.5k
}
convolve_avx2.c:populate_coeffs_4tap_avx2
Line
Count
Source
24
201k
                                             __m256i coeffs[2]) {
25
201k
  const __m256i coeffs_256 = _mm256_broadcastsi128_si256(coeffs_128);
26
27
  // coeffs 2 3 2 3 2 3 2 3
28
201k
  coeffs[0] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0604u));
29
  // coeffs 4 5 4 5 4 5 4 5
30
201k
  coeffs[1] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0a08u));
31
201k
}
32
33
static inline void populate_coeffs_6tap_avx2(const __m128i coeffs_128,
34
1.45M
                                             __m256i coeffs[3]) {
35
1.45M
  const __m256i coeffs_256 = _mm256_broadcastsi128_si256(coeffs_128);
36
37
  // coeffs 1 2 1 2 1 2 1 2
38
1.45M
  coeffs[0] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0402u));
39
  // coeffs 3 4 3 4 3 4 3 4
40
1.45M
  coeffs[1] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0806u));
41
  // coeffs 5 6 5 6 5 6 5 6
42
1.45M
  coeffs[2] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0C0Au));
43
1.45M
}
convolve_2d_avx2.c:populate_coeffs_6tap_avx2
Line
Count
Source
34
922k
                                             __m256i coeffs[3]) {
35
922k
  const __m256i coeffs_256 = _mm256_broadcastsi128_si256(coeffs_128);
36
37
  // coeffs 1 2 1 2 1 2 1 2
38
922k
  coeffs[0] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0402u));
39
  // coeffs 3 4 3 4 3 4 3 4
40
922k
  coeffs[1] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0806u));
41
  // coeffs 5 6 5 6 5 6 5 6
42
922k
  coeffs[2] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0C0Au));
43
922k
}
convolve_avx2.c:populate_coeffs_6tap_avx2
Line
Count
Source
34
532k
                                             __m256i coeffs[3]) {
35
532k
  const __m256i coeffs_256 = _mm256_broadcastsi128_si256(coeffs_128);
36
37
  // coeffs 1 2 1 2 1 2 1 2
38
532k
  coeffs[0] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0402u));
39
  // coeffs 3 4 3 4 3 4 3 4
40
532k
  coeffs[1] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0806u));
41
  // coeffs 5 6 5 6 5 6 5 6
42
532k
  coeffs[2] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0C0Au));
43
532k
}
44
45
static inline void populate_coeffs_8tap_avx2(const __m128i coeffs_128,
46
101k
                                             __m256i coeffs[4]) {
47
101k
  const __m256i coeffs_256 = _mm256_broadcastsi128_si256(coeffs_128);
48
49
  // coeffs 0 1 0 1 0 1 0 1
50
101k
  coeffs[0] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0200u));
51
  // coeffs 2 3 2 3 2 3 2 3
52
101k
  coeffs[1] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0604u));
53
  // coeffs 4 5 4 5 4 5 4 5
54
101k
  coeffs[2] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0a08u));
55
  // coeffs 6 7 6 7 6 7 6 7
56
101k
  coeffs[3] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0e0cu));
57
101k
}
convolve_2d_avx2.c:populate_coeffs_8tap_avx2
Line
Count
Source
46
71.0k
                                             __m256i coeffs[4]) {
47
71.0k
  const __m256i coeffs_256 = _mm256_broadcastsi128_si256(coeffs_128);
48
49
  // coeffs 0 1 0 1 0 1 0 1
50
71.0k
  coeffs[0] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0200u));
51
  // coeffs 2 3 2 3 2 3 2 3
52
71.0k
  coeffs[1] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0604u));
53
  // coeffs 4 5 4 5 4 5 4 5
54
71.0k
  coeffs[2] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0a08u));
55
  // coeffs 6 7 6 7 6 7 6 7
56
71.0k
  coeffs[3] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0e0cu));
57
71.0k
}
convolve_avx2.c:populate_coeffs_8tap_avx2
Line
Count
Source
46
30.6k
                                             __m256i coeffs[4]) {
47
30.6k
  const __m256i coeffs_256 = _mm256_broadcastsi128_si256(coeffs_128);
48
49
  // coeffs 0 1 0 1 0 1 0 1
50
30.6k
  coeffs[0] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0200u));
51
  // coeffs 2 3 2 3 2 3 2 3
52
30.6k
  coeffs[1] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0604u));
53
  // coeffs 4 5 4 5 4 5 4 5
54
30.6k
  coeffs[2] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0a08u));
55
  // coeffs 6 7 6 7 6 7 6 7
56
30.6k
  coeffs[3] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0e0cu));
57
30.6k
}
58
59
static inline void prepare_half_coeffs_2tap_ssse3(
60
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
61
105k
    __m128i *const coeffs /* [1] */) {
62
105k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
63
105k
      filter_params, subpel_q4 & SUBPEL_MASK);
64
105k
  const __m128i coeffs_8 = _mm_cvtsi32_si128(loadu_int32(filter + 3));
65
66
  // right shift all filter co-efficients by 1 to reduce the bits required.
67
  // This extra right shift will be taken care of at the end while rounding
68
  // the result.
69
  // Since all filter co-efficients are even, this change will not affect the
70
  // end result
71
105k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
72
105k
                            _mm_set1_epi16((short)0xffff)));
73
74
105k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
75
76
  // coeffs 3 4 3 4 3 4 3 4
77
105k
  *coeffs = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0200u));
78
105k
}
convolve_2d_avx2.c:prepare_half_coeffs_2tap_ssse3
Line
Count
Source
61
63.3k
    __m128i *const coeffs /* [1] */) {
62
63.3k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
63
63.3k
      filter_params, subpel_q4 & SUBPEL_MASK);
64
63.3k
  const __m128i coeffs_8 = _mm_cvtsi32_si128(loadu_int32(filter + 3));
65
66
  // right shift all filter co-efficients by 1 to reduce the bits required.
67
  // This extra right shift will be taken care of at the end while rounding
68
  // the result.
69
  // Since all filter co-efficients are even, this change will not affect the
70
  // end result
71
63.3k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
72
63.3k
                            _mm_set1_epi16((short)0xffff)));
73
74
63.3k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
75
76
  // coeffs 3 4 3 4 3 4 3 4
77
63.3k
  *coeffs = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0200u));
78
63.3k
}
convolve_avx2.c:prepare_half_coeffs_2tap_ssse3
Line
Count
Source
61
41.7k
    __m128i *const coeffs /* [1] */) {
62
41.7k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
63
41.7k
      filter_params, subpel_q4 & SUBPEL_MASK);
64
41.7k
  const __m128i coeffs_8 = _mm_cvtsi32_si128(loadu_int32(filter + 3));
65
66
  // right shift all filter co-efficients by 1 to reduce the bits required.
67
  // This extra right shift will be taken care of at the end while rounding
68
  // the result.
69
  // Since all filter co-efficients are even, this change will not affect the
70
  // end result
71
41.7k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
72
41.7k
                            _mm_set1_epi16((short)0xffff)));
73
74
41.7k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
75
76
  // coeffs 3 4 3 4 3 4 3 4
77
41.7k
  *coeffs = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0200u));
78
41.7k
}
79
80
static inline void prepare_half_coeffs_4tap_ssse3(
81
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
82
1.26M
    __m128i *const coeffs /* [2] */) {
83
1.26M
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
84
1.26M
      filter_params, subpel_q4 & SUBPEL_MASK);
85
1.26M
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
86
87
  // right shift all filter co-efficients by 1 to reduce the bits required.
88
  // This extra right shift will be taken care of at the end while rounding
89
  // the result.
90
  // Since all filter co-efficients are even, this change will not affect the
91
  // end result
92
1.26M
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
93
1.26M
                            _mm_set1_epi16((short)0xffff)));
94
95
1.26M
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
96
97
  // coeffs 2 3 2 3 2 3 2 3
98
1.26M
  coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0604u));
99
  // coeffs 4 5 4 5 4 5 4 5
100
1.26M
  coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0a08u));
101
1.26M
}
convolve_2d_avx2.c:prepare_half_coeffs_4tap_ssse3
Line
Count
Source
82
822k
    __m128i *const coeffs /* [2] */) {
83
822k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
84
822k
      filter_params, subpel_q4 & SUBPEL_MASK);
85
822k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
86
87
  // right shift all filter co-efficients by 1 to reduce the bits required.
88
  // This extra right shift will be taken care of at the end while rounding
89
  // the result.
90
  // Since all filter co-efficients are even, this change will not affect the
91
  // end result
92
822k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
93
822k
                            _mm_set1_epi16((short)0xffff)));
94
95
822k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
96
97
  // coeffs 2 3 2 3 2 3 2 3
98
822k
  coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0604u));
99
  // coeffs 4 5 4 5 4 5 4 5
100
822k
  coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0a08u));
101
822k
}
convolve_avx2.c:prepare_half_coeffs_4tap_ssse3
Line
Count
Source
82
444k
    __m128i *const coeffs /* [2] */) {
83
444k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
84
444k
      filter_params, subpel_q4 & SUBPEL_MASK);
85
444k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
86
87
  // right shift all filter co-efficients by 1 to reduce the bits required.
88
  // This extra right shift will be taken care of at the end while rounding
89
  // the result.
90
  // Since all filter co-efficients are even, this change will not affect the
91
  // end result
92
444k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
93
444k
                            _mm_set1_epi16((short)0xffff)));
94
95
444k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
96
97
  // coeffs 2 3 2 3 2 3 2 3
98
444k
  coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0604u));
99
  // coeffs 4 5 4 5 4 5 4 5
100
444k
  coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0a08u));
101
444k
}
102
103
static inline void prepare_half_coeffs_6tap_ssse3(
104
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
105
111k
    __m128i *const coeffs /* [3] */) {
106
111k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
107
111k
      filter_params, subpel_q4 & SUBPEL_MASK);
108
111k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
109
110
  // right shift all filter co-efficients by 1 to reduce the bits required.
111
  // This extra right shift will be taken care of at the end while rounding
112
  // the result.
113
  // Since all filter co-efficients are even, this change will not affect the
114
  // end result
115
111k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
116
111k
                            _mm_set1_epi16((short)0xffff)));
117
118
111k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
119
120
  // coeffs 1 2 1 2 1 2 1 2
121
111k
  coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0402u));
122
  // coeffs 3 4 3 4 3 4 3 4
123
111k
  coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0806u));
124
  // coeffs 5 6 5 6 5 6 5 6
125
111k
  coeffs[2] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0C0Au));
126
111k
}
Unexecuted instantiation: convolve_2d_avx2.c:prepare_half_coeffs_6tap_ssse3
convolve_avx2.c:prepare_half_coeffs_6tap_ssse3
Line
Count
Source
105
111k
    __m128i *const coeffs /* [3] */) {
106
111k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
107
111k
      filter_params, subpel_q4 & SUBPEL_MASK);
108
111k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
109
110
  // right shift all filter co-efficients by 1 to reduce the bits required.
111
  // This extra right shift will be taken care of at the end while rounding
112
  // the result.
113
  // Since all filter co-efficients are even, this change will not affect the
114
  // end result
115
111k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
116
111k
                            _mm_set1_epi16((short)0xffff)));
117
118
111k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
119
120
  // coeffs 1 2 1 2 1 2 1 2
121
111k
  coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0402u));
122
  // coeffs 3 4 3 4 3 4 3 4
123
111k
  coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0806u));
124
  // coeffs 5 6 5 6 5 6 5 6
125
111k
  coeffs[2] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0C0Au));
126
111k
}
127
128
static inline void prepare_half_coeffs_8tap_ssse3(
129
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
130
6.88k
    __m128i *const coeffs /* [4] */) {
131
6.88k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
132
6.88k
      filter_params, subpel_q4 & SUBPEL_MASK);
133
6.88k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
134
135
  // right shift all filter co-efficients by 1 to reduce the bits required.
136
  // This extra right shift will be taken care of at the end while rounding
137
  // the result.
138
  // Since all filter co-efficients are even, this change will not affect the
139
  // end result
140
6.88k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
141
6.88k
                            _mm_set1_epi16((short)0xffff)));
142
143
6.88k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
144
145
  // coeffs 0 1 0 1 0 1 0 1
146
6.88k
  coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0200u));
147
  // coeffs 2 3 2 3 2 3 2 3
148
6.88k
  coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0604u));
149
  // coeffs 4 5 4 5 4 5 4 5
150
6.88k
  coeffs[2] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0a08u));
151
  // coeffs 6 7 6 7 6 7 6 7
152
6.88k
  coeffs[3] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0e0cu));
153
6.88k
}
Unexecuted instantiation: convolve_2d_avx2.c:prepare_half_coeffs_8tap_ssse3
convolve_avx2.c:prepare_half_coeffs_8tap_ssse3
Line
Count
Source
130
6.88k
    __m128i *const coeffs /* [4] */) {
131
6.88k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
132
6.88k
      filter_params, subpel_q4 & SUBPEL_MASK);
133
6.88k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
134
135
  // right shift all filter co-efficients by 1 to reduce the bits required.
136
  // This extra right shift will be taken care of at the end while rounding
137
  // the result.
138
  // Since all filter co-efficients are even, this change will not affect the
139
  // end result
140
6.88k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
141
6.88k
                            _mm_set1_epi16((short)0xffff)));
142
143
6.88k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
144
145
  // coeffs 0 1 0 1 0 1 0 1
146
6.88k
  coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0200u));
147
  // coeffs 2 3 2 3 2 3 2 3
148
6.88k
  coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0604u));
149
  // coeffs 4 5 4 5 4 5 4 5
150
6.88k
  coeffs[2] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0a08u));
151
  // coeffs 6 7 6 7 6 7 6 7
152
6.88k
  coeffs[3] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0e0cu));
153
6.88k
}
154
155
static inline void prepare_half_coeffs_2tap_avx2(
156
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
157
28.9k
    __m256i *const coeffs /* [1] */) {
158
28.9k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
159
28.9k
      filter_params, subpel_q4 & SUBPEL_MASK);
160
28.9k
  const __m128i coeffs_8 = _mm_cvtsi32_si128(loadu_int32(filter + 3));
161
28.9k
  const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);
162
163
  // right shift all filter co-efficients by 1 to reduce the bits required.
164
  // This extra right shift will be taken care of at the end while rounding
165
  // the result.
166
  // Since all filter co-efficients are even, this change will not affect the
167
  // end result
168
28.9k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
169
28.9k
                            _mm_set1_epi16((short)0xffff)));
170
171
28.9k
  const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);
172
173
  // coeffs 3 4 3 4 3 4 3 4
174
28.9k
  *coeffs = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0200u));
175
28.9k
}
convolve_2d_avx2.c:prepare_half_coeffs_2tap_avx2
Line
Count
Source
157
16.3k
    __m256i *const coeffs /* [1] */) {
158
16.3k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
159
16.3k
      filter_params, subpel_q4 & SUBPEL_MASK);
160
16.3k
  const __m128i coeffs_8 = _mm_cvtsi32_si128(loadu_int32(filter + 3));
161
16.3k
  const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);
162
163
  // right shift all filter co-efficients by 1 to reduce the bits required.
164
  // This extra right shift will be taken care of at the end while rounding
165
  // the result.
166
  // Since all filter co-efficients are even, this change will not affect the
167
  // end result
168
16.3k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
169
16.3k
                            _mm_set1_epi16((short)0xffff)));
170
171
16.3k
  const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);
172
173
  // coeffs 3 4 3 4 3 4 3 4
174
16.3k
  *coeffs = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0200u));
175
16.3k
}
convolve_avx2.c:prepare_half_coeffs_2tap_avx2
Line
Count
Source
157
12.6k
    __m256i *const coeffs /* [1] */) {
158
12.6k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
159
12.6k
      filter_params, subpel_q4 & SUBPEL_MASK);
160
12.6k
  const __m128i coeffs_8 = _mm_cvtsi32_si128(loadu_int32(filter + 3));
161
12.6k
  const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);
162
163
  // right shift all filter co-efficients by 1 to reduce the bits required.
164
  // This extra right shift will be taken care of at the end while rounding
165
  // the result.
166
  // Since all filter co-efficients are even, this change will not affect the
167
  // end result
168
12.6k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
169
12.6k
                            _mm_set1_epi16((short)0xffff)));
170
171
12.6k
  const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);
172
173
  // coeffs 3 4 3 4 3 4 3 4
174
12.6k
  *coeffs = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0200u));
175
12.6k
}
176
177
static inline void prepare_half_coeffs_4tap_avx2(
178
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
179
254k
    __m256i *const coeffs /* [2] */) {
180
254k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
181
254k
      filter_params, subpel_q4 & SUBPEL_MASK);
182
254k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
183
184
  // right shift all filter co-efficients by 1 to reduce the bits required.
185
  // This extra right shift will be taken care of at the end while rounding
186
  // the result.
187
  // Since all filter co-efficients are even, this change will not affect the
188
  // end result
189
254k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
190
254k
                            _mm_set1_epi16((short)0xffff)));
191
254k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
192
254k
  populate_coeffs_4tap_avx2(coeffs_1, coeffs);
193
254k
}
convolve_2d_avx2.c:prepare_half_coeffs_4tap_avx2
Line
Count
Source
179
52.5k
    __m256i *const coeffs /* [2] */) {
180
52.5k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
181
52.5k
      filter_params, subpel_q4 & SUBPEL_MASK);
182
52.5k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
183
184
  // right shift all filter co-efficients by 1 to reduce the bits required.
185
  // This extra right shift will be taken care of at the end while rounding
186
  // the result.
187
  // Since all filter co-efficients are even, this change will not affect the
188
  // end result
189
52.5k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
190
52.5k
                            _mm_set1_epi16((short)0xffff)));
191
52.5k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
192
52.5k
  populate_coeffs_4tap_avx2(coeffs_1, coeffs);
193
52.5k
}
convolve_avx2.c:prepare_half_coeffs_4tap_avx2
Line
Count
Source
179
201k
    __m256i *const coeffs /* [2] */) {
180
201k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
181
201k
      filter_params, subpel_q4 & SUBPEL_MASK);
182
201k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
183
184
  // right shift all filter co-efficients by 1 to reduce the bits required.
185
  // This extra right shift will be taken care of at the end while rounding
186
  // the result.
187
  // Since all filter co-efficients are even, this change will not affect the
188
  // end result
189
201k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
190
201k
                            _mm_set1_epi16((short)0xffff)));
191
201k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
192
201k
  populate_coeffs_4tap_avx2(coeffs_1, coeffs);
193
201k
}
194
195
static inline void prepare_half_coeffs_6tap_avx2(
196
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
197
1.45M
    __m256i *const coeffs /* [3] */) {
198
1.45M
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
199
1.45M
      filter_params, subpel_q4 & SUBPEL_MASK);
200
1.45M
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
201
202
  // right shift all filter co-efficients by 1 to reduce the bits required.
203
  // This extra right shift will be taken care of at the end while rounding
204
  // the result.
205
  // Since all filter co-efficients are even, this change will not affect the
206
  // end result
207
1.45M
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
208
1.45M
                            _mm_set1_epi16((short)0xffff)));
209
1.45M
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
210
1.45M
  populate_coeffs_6tap_avx2(coeffs_1, coeffs);
211
1.45M
}
convolve_2d_avx2.c:prepare_half_coeffs_6tap_avx2
Line
Count
Source
197
922k
    __m256i *const coeffs /* [3] */) {
198
922k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
199
922k
      filter_params, subpel_q4 & SUBPEL_MASK);
200
922k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
201
202
  // right shift all filter co-efficients by 1 to reduce the bits required.
203
  // This extra right shift will be taken care of at the end while rounding
204
  // the result.
205
  // Since all filter co-efficients are even, this change will not affect the
206
  // end result
207
922k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
208
922k
                            _mm_set1_epi16((short)0xffff)));
209
922k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
210
922k
  populate_coeffs_6tap_avx2(coeffs_1, coeffs);
211
922k
}
convolve_avx2.c:prepare_half_coeffs_6tap_avx2
Line
Count
Source
197
532k
    __m256i *const coeffs /* [3] */) {
198
532k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
199
532k
      filter_params, subpel_q4 & SUBPEL_MASK);
200
532k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
201
202
  // right shift all filter co-efficients by 1 to reduce the bits required.
203
  // This extra right shift will be taken care of at the end while rounding
204
  // the result.
205
  // Since all filter co-efficients are even, this change will not affect the
206
  // end result
207
532k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
208
532k
                            _mm_set1_epi16((short)0xffff)));
209
532k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
210
532k
  populate_coeffs_6tap_avx2(coeffs_1, coeffs);
211
532k
}
212
213
static inline void prepare_half_coeffs_8tap_avx2(
214
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
215
101k
    __m256i *const coeffs /* [4] */) {
216
101k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
217
101k
      filter_params, subpel_q4 & SUBPEL_MASK);
218
101k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
219
220
  // right shift all filter co-efficients by 1 to reduce the bits required.
221
  // This extra right shift will be taken care of at the end while rounding
222
  // the result.
223
  // Since all filter co-efficients are even, this change will not affect the
224
  // end result
225
101k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
226
101k
                            _mm_set1_epi16((short)0xffff)));
227
101k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
228
101k
  populate_coeffs_8tap_avx2(coeffs_1, coeffs);
229
101k
}
convolve_2d_avx2.c:prepare_half_coeffs_8tap_avx2
Line
Count
Source
215
71.0k
    __m256i *const coeffs /* [4] */) {
216
71.0k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
217
71.0k
      filter_params, subpel_q4 & SUBPEL_MASK);
218
71.0k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
219
220
  // right shift all filter co-efficients by 1 to reduce the bits required.
221
  // This extra right shift will be taken care of at the end while rounding
222
  // the result.
223
  // Since all filter co-efficients are even, this change will not affect the
224
  // end result
225
71.0k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
226
71.0k
                            _mm_set1_epi16((short)0xffff)));
227
71.0k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
228
71.0k
  populate_coeffs_8tap_avx2(coeffs_1, coeffs);
229
71.0k
}
convolve_avx2.c:prepare_half_coeffs_8tap_avx2
Line
Count
Source
215
30.6k
    __m256i *const coeffs /* [4] */) {
216
30.6k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
217
30.6k
      filter_params, subpel_q4 & SUBPEL_MASK);
218
30.6k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
219
220
  // right shift all filter co-efficients by 1 to reduce the bits required.
221
  // This extra right shift will be taken care of at the end while rounding
222
  // the result.
223
  // Since all filter co-efficients are even, this change will not affect the
224
  // end result
225
30.6k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
226
30.6k
                            _mm_set1_epi16((short)0xffff)));
227
30.6k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
228
30.6k
  populate_coeffs_8tap_avx2(coeffs_1, coeffs);
229
30.6k
}
230
231
static inline void prepare_coeffs_2tap_sse2(
232
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
233
30.6k
    __m128i *const coeffs /* [1] */) {
234
30.6k
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
235
30.6k
      filter_params, subpel_q4 & SUBPEL_MASK);
236
237
30.6k
  const __m128i coeff = _mm_cvtsi32_si128(loadu_int32(filter + 3));
238
239
  // coeffs 3 4 3 4 3 4 3 4
240
30.6k
  coeffs[0] = _mm_shuffle_epi32(coeff, 0x00);
241
30.6k
}
convolve_2d_avx2.c:prepare_coeffs_2tap_sse2
Line
Count
Source
233
30.6k
    __m128i *const coeffs /* [1] */) {
234
30.6k
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
235
30.6k
      filter_params, subpel_q4 & SUBPEL_MASK);
236
237
30.6k
  const __m128i coeff = _mm_cvtsi32_si128(loadu_int32(filter + 3));
238
239
  // coeffs 3 4 3 4 3 4 3 4
240
  coeffs[0] = _mm_shuffle_epi32(coeff, 0x00);
241
30.6k
}
Unexecuted instantiation: convolve_avx2.c:prepare_coeffs_2tap_sse2
242
243
static inline void prepare_coeffs_4tap_sse2(
244
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
245
94.8k
    __m128i *const coeffs /* [2] */) {
246
94.8k
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
247
94.8k
      filter_params, subpel_q4 & SUBPEL_MASK);
248
249
94.8k
  const __m128i coeff = _mm_loadu_si128((__m128i *)filter);
250
251
  // coeffs 2 3 2 3 2 3 2 3
252
94.8k
  coeffs[0] = _mm_shuffle_epi32(coeff, 0x55);
253
  // coeffs 4 5 4 5 4 5 4 5
254
94.8k
  coeffs[1] = _mm_shuffle_epi32(coeff, 0xaa);
255
94.8k
}
convolve_2d_avx2.c:prepare_coeffs_4tap_sse2
Line
Count
Source
245
94.8k
    __m128i *const coeffs /* [2] */) {
246
94.8k
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
247
94.8k
      filter_params, subpel_q4 & SUBPEL_MASK);
248
249
94.8k
  const __m128i coeff = _mm_loadu_si128((__m128i *)filter);
250
251
  // coeffs 2 3 2 3 2 3 2 3
252
94.8k
  coeffs[0] = _mm_shuffle_epi32(coeff, 0x55);
253
  // coeffs 4 5 4 5 4 5 4 5
254
  coeffs[1] = _mm_shuffle_epi32(coeff, 0xaa);
255
94.8k
}
Unexecuted instantiation: convolve_avx2.c:prepare_coeffs_4tap_sse2
256
257
static inline void prepare_coeffs_6tap_ssse3(
258
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
259
65.4k
    __m128i *const coeffs /* [3] */) {
260
65.4k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
261
65.4k
      filter_params, subpel_q4 & SUBPEL_MASK);
262
65.4k
  const __m128i coeff = _mm_loadu_si128((__m128i *)filter);
263
264
  // coeffs 1 2 1 2 1 2 1 2
265
65.4k
  coeffs[0] = _mm_shuffle_epi8(coeff, _mm_set1_epi32(0x05040302u));
266
  // coeffs 3 4 3 4 3 4 3 4
267
65.4k
  coeffs[1] = _mm_shuffle_epi8(coeff, _mm_set1_epi32(0x09080706u));
268
  // coeffs 5 6 5 6 5 6 5 6
269
65.4k
  coeffs[2] = _mm_shuffle_epi8(coeff, _mm_set1_epi32(0x0D0C0B0Au));
270
65.4k
}
convolve_2d_avx2.c:prepare_coeffs_6tap_ssse3
Line
Count
Source
259
65.4k
    __m128i *const coeffs /* [3] */) {
260
65.4k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
261
65.4k
      filter_params, subpel_q4 & SUBPEL_MASK);
262
65.4k
  const __m128i coeff = _mm_loadu_si128((__m128i *)filter);
263
264
  // coeffs 1 2 1 2 1 2 1 2
265
65.4k
  coeffs[0] = _mm_shuffle_epi8(coeff, _mm_set1_epi32(0x05040302u));
266
  // coeffs 3 4 3 4 3 4 3 4
267
65.4k
  coeffs[1] = _mm_shuffle_epi8(coeff, _mm_set1_epi32(0x09080706u));
268
  // coeffs 5 6 5 6 5 6 5 6
269
65.4k
  coeffs[2] = _mm_shuffle_epi8(coeff, _mm_set1_epi32(0x0D0C0B0Au));
270
65.4k
}
Unexecuted instantiation: convolve_avx2.c:prepare_coeffs_6tap_ssse3
271
272
static inline void prepare_coeffs_8tap_sse2(
273
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
274
4.50k
    __m128i *const coeffs /* [4] */) {
275
4.50k
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
276
4.50k
      filter_params, subpel_q4 & SUBPEL_MASK);
277
278
4.50k
  const __m128i coeff = _mm_loadu_si128((__m128i *)filter);
279
280
  // coeffs 0 1 0 1 0 1 0 1
281
4.50k
  coeffs[0] = _mm_shuffle_epi32(coeff, 0x00);
282
  // coeffs 2 3 2 3 2 3 2 3
283
4.50k
  coeffs[1] = _mm_shuffle_epi32(coeff, 0x55);
284
  // coeffs 4 5 4 5 4 5 4 5
285
4.50k
  coeffs[2] = _mm_shuffle_epi32(coeff, 0xaa);
286
  // coeffs 6 7 6 7 6 7 6 7
287
4.50k
  coeffs[3] = _mm_shuffle_epi32(coeff, 0xff);
288
4.50k
}
convolve_2d_avx2.c:prepare_coeffs_8tap_sse2
Line
Count
Source
274
4.50k
    __m128i *const coeffs /* [4] */) {
275
4.50k
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
276
4.50k
      filter_params, subpel_q4 & SUBPEL_MASK);
277
278
4.50k
  const __m128i coeff = _mm_loadu_si128((__m128i *)filter);
279
280
  // coeffs 0 1 0 1 0 1 0 1
281
4.50k
  coeffs[0] = _mm_shuffle_epi32(coeff, 0x00);
282
  // coeffs 2 3 2 3 2 3 2 3
283
4.50k
  coeffs[1] = _mm_shuffle_epi32(coeff, 0x55);
284
  // coeffs 4 5 4 5 4 5 4 5
285
4.50k
  coeffs[2] = _mm_shuffle_epi32(coeff, 0xaa);
286
  // coeffs 6 7 6 7 6 7 6 7
287
  coeffs[3] = _mm_shuffle_epi32(coeff, 0xff);
288
4.50k
}
Unexecuted instantiation: convolve_avx2.c:prepare_coeffs_8tap_sse2
289
290
static inline void prepare_coeffs_2tap_avx2(
291
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
292
27.2k
    __m256i *const coeffs /* [1] */) {
293
27.2k
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
294
27.2k
      filter_params, subpel_q4 & SUBPEL_MASK);
295
296
27.2k
  const __m128i coeff_8 = _mm_cvtsi32_si128(loadu_int32(filter + 3));
297
27.2k
  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
298
299
  // coeffs 3 4 3 4 3 4 3 4
300
27.2k
  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
301
27.2k
}
convolve_2d_avx2.c:prepare_coeffs_2tap_avx2
Line
Count
Source
292
27.2k
    __m256i *const coeffs /* [1] */) {
293
27.2k
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
294
27.2k
      filter_params, subpel_q4 & SUBPEL_MASK);
295
296
27.2k
  const __m128i coeff_8 = _mm_cvtsi32_si128(loadu_int32(filter + 3));
297
27.2k
  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
298
299
  // coeffs 3 4 3 4 3 4 3 4
300
  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
301
27.2k
}
Unexecuted instantiation: convolve_avx2.c:prepare_coeffs_2tap_avx2
302
303
static inline void prepare_coeffs_4tap_avx2(
304
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
305
921k
    __m256i *const coeffs /* [2] */) {
306
921k
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
307
921k
      filter_params, subpel_q4 & SUBPEL_MASK);
308
309
921k
  const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter);
310
921k
  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
311
312
  // coeffs 2 3 2 3 2 3 2 3
313
921k
  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x55);
314
  // coeffs 4 5 4 5 4 5 4 5
315
921k
  coeffs[1] = _mm256_shuffle_epi32(coeff, 0xaa);
316
921k
}
convolve_2d_avx2.c:prepare_coeffs_4tap_avx2
Line
Count
Source
305
921k
    __m256i *const coeffs /* [2] */) {
306
921k
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
307
921k
      filter_params, subpel_q4 & SUBPEL_MASK);
308
309
921k
  const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter);
310
921k
  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
311
312
  // coeffs 2 3 2 3 2 3 2 3
313
921k
  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x55);
314
  // coeffs 4 5 4 5 4 5 4 5
315
  coeffs[1] = _mm256_shuffle_epi32(coeff, 0xaa);
316
921k
}
Unexecuted instantiation: convolve_avx2.c:prepare_coeffs_4tap_avx2
317
318
static inline void prepare_coeffs_6tap_avx2(
319
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
320
721k
    __m256i *const coeffs /* [3]*/) {
321
721k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
322
721k
      filter_params, subpel_q4 & SUBPEL_MASK);
323
721k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
324
721k
  const __m256i coeff = _mm256_broadcastsi128_si256(coeffs_8);
325
326
  // coeffs 1 2 1 2 1 2 1 2
327
721k
  coeffs[0] = _mm256_shuffle_epi8(coeff, _mm256_set1_epi32(0x05040302u));
328
  // coeffs 3 4 3 4 3 4 3 4
329
721k
  coeffs[1] = _mm256_shuffle_epi8(coeff, _mm256_set1_epi32(0x09080706u));
330
  // coeffs 5 6 5 6 5 6 5 6
331
721k
  coeffs[2] = _mm256_shuffle_epi8(coeff, _mm256_set1_epi32(0x0D0C0B0Au));
332
721k
}
convolve_2d_avx2.c:prepare_coeffs_6tap_avx2
Line
Count
Source
320
721k
    __m256i *const coeffs /* [3]*/) {
321
721k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
322
721k
      filter_params, subpel_q4 & SUBPEL_MASK);
323
721k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
324
721k
  const __m256i coeff = _mm256_broadcastsi128_si256(coeffs_8);
325
326
  // coeffs 1 2 1 2 1 2 1 2
327
721k
  coeffs[0] = _mm256_shuffle_epi8(coeff, _mm256_set1_epi32(0x05040302u));
328
  // coeffs 3 4 3 4 3 4 3 4
329
721k
  coeffs[1] = _mm256_shuffle_epi8(coeff, _mm256_set1_epi32(0x09080706u));
330
  // coeffs 5 6 5 6 5 6 5 6
331
721k
  coeffs[2] = _mm256_shuffle_epi8(coeff, _mm256_set1_epi32(0x0D0C0B0Au));
332
721k
}
Unexecuted instantiation: convolve_avx2.c:prepare_coeffs_6tap_avx2
333
334
static inline void prepare_coeffs_8tap_avx2(
335
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
336
60.6k
    __m256i *const coeffs /* [4] */) {
337
60.6k
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
338
60.6k
      filter_params, subpel_q4 & SUBPEL_MASK);
339
340
60.6k
  const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter);
341
60.6k
  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
342
343
  // coeffs 0 1 0 1 0 1 0 1
344
60.6k
  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
345
  // coeffs 2 3 2 3 2 3 2 3
346
60.6k
  coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55);
347
  // coeffs 4 5 4 5 4 5 4 5
348
60.6k
  coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa);
349
  // coeffs 6 7 6 7 6 7 6 7
350
60.6k
  coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff);
351
60.6k
}
convolve_2d_avx2.c:prepare_coeffs_8tap_avx2
Line
Count
Source
336
60.6k
    __m256i *const coeffs /* [4] */) {
337
60.6k
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
338
60.6k
      filter_params, subpel_q4 & SUBPEL_MASK);
339
340
60.6k
  const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter);
341
60.6k
  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
342
343
  // coeffs 0 1 0 1 0 1 0 1
344
60.6k
  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
345
  // coeffs 2 3 2 3 2 3 2 3
346
60.6k
  coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55);
347
  // coeffs 4 5 4 5 4 5 4 5
348
60.6k
  coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa);
349
  // coeffs 6 7 6 7 6 7 6 7
350
  coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff);
351
60.6k
}
Unexecuted instantiation: convolve_avx2.c:prepare_coeffs_8tap_avx2
352
353
static inline void load_16bit_5rows_avx2(const int16_t *const src,
354
                                         const ptrdiff_t stride,
355
0
                                         __m256i dst[5]) {
356
0
  dst[0] = _mm256_loadu_si256((__m256i *)(src + 0 * stride));
357
0
  dst[1] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
358
0
  dst[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
359
0
  dst[3] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
360
0
  dst[4] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
361
0
}
Unexecuted instantiation: convolve_2d_avx2.c:load_16bit_5rows_avx2
Unexecuted instantiation: convolve_avx2.c:load_16bit_5rows_avx2
362
363
static inline void load_16bit_7rows_avx2(const int16_t *const src,
364
                                         const ptrdiff_t stride,
365
72.2k
                                         __m256i dst[7]) {
366
72.2k
  dst[0] = _mm256_loadu_si256((__m256i *)(src + 0 * stride));
367
72.2k
  dst[1] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
368
72.2k
  dst[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
369
72.2k
  dst[3] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
370
72.2k
  dst[4] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
371
72.2k
  dst[5] = _mm256_loadu_si256((__m256i *)(src + 5 * stride));
372
72.2k
  dst[6] = _mm256_loadu_si256((__m256i *)(src + 6 * stride));
373
72.2k
}
convolve_2d_avx2.c:load_16bit_7rows_avx2
Line
Count
Source
365
72.2k
                                         __m256i dst[7]) {
366
72.2k
  dst[0] = _mm256_loadu_si256((__m256i *)(src + 0 * stride));
367
72.2k
  dst[1] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
368
72.2k
  dst[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
369
72.2k
  dst[3] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
370
72.2k
  dst[4] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
371
72.2k
  dst[5] = _mm256_loadu_si256((__m256i *)(src + 5 * stride));
372
72.2k
  dst[6] = _mm256_loadu_si256((__m256i *)(src + 6 * stride));
373
72.2k
}
Unexecuted instantiation: convolve_avx2.c:load_16bit_7rows_avx2
374
375
static AOM_FORCE_INLINE void load_16bit_8rows_avx2(const int16_t *const src,
376
                                                   const ptrdiff_t stride,
377
415
                                                   __m256i dst[8]) {
378
415
  dst[0] = _mm256_loadu_si256((__m256i *)(src + 0 * stride));
379
415
  dst[1] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
380
415
  dst[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
381
415
  dst[3] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
382
415
  dst[4] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
383
415
  dst[5] = _mm256_loadu_si256((__m256i *)(src + 5 * stride));
384
415
  dst[6] = _mm256_loadu_si256((__m256i *)(src + 6 * stride));
385
415
  dst[7] = _mm256_loadu_si256((__m256i *)(src + 7 * stride));
386
415
}
convolve_2d_avx2.c:load_16bit_8rows_avx2
Line
Count
Source
377
415
                                                   __m256i dst[8]) {
378
415
  dst[0] = _mm256_loadu_si256((__m256i *)(src + 0 * stride));
379
415
  dst[1] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
380
415
  dst[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
381
415
  dst[3] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
382
415
  dst[4] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
383
415
  dst[5] = _mm256_loadu_si256((__m256i *)(src + 5 * stride));
384
415
  dst[6] = _mm256_loadu_si256((__m256i *)(src + 6 * stride));
385
415
  dst[7] = _mm256_loadu_si256((__m256i *)(src + 7 * stride));
386
415
}
Unexecuted instantiation: convolve_avx2.c:load_16bit_8rows_avx2
387
388
static AOM_FORCE_INLINE void loadu_unpack_16bit_5rows_avx2(
389
    const int16_t *const src, const ptrdiff_t stride, __m256i s_256[5],
390
175k
    __m256i ss_256[5], __m256i tt_256[5]) {
391
175k
  s_256[0] = _mm256_loadu_si256((__m256i *)(src + 0 * stride));
392
175k
  s_256[1] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
393
175k
  s_256[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
394
175k
  s_256[3] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
395
175k
  s_256[4] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
396
397
175k
  ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
398
175k
  ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
399
175k
  ss_256[3] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
400
175k
  ss_256[4] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
401
402
175k
  tt_256[0] = _mm256_unpacklo_epi16(s_256[1], s_256[2]);
403
175k
  tt_256[1] = _mm256_unpacklo_epi16(s_256[3], s_256[4]);
404
175k
  tt_256[3] = _mm256_unpackhi_epi16(s_256[1], s_256[2]);
405
175k
  tt_256[4] = _mm256_unpackhi_epi16(s_256[3], s_256[4]);
406
175k
}
convolve_2d_avx2.c:loadu_unpack_16bit_5rows_avx2
Line
Count
Source
390
175k
    __m256i ss_256[5], __m256i tt_256[5]) {
391
175k
  s_256[0] = _mm256_loadu_si256((__m256i *)(src + 0 * stride));
392
175k
  s_256[1] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
393
175k
  s_256[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
394
175k
  s_256[3] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
395
175k
  s_256[4] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
396
397
175k
  ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
398
175k
  ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
399
175k
  ss_256[3] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
400
175k
  ss_256[4] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
401
402
175k
  tt_256[0] = _mm256_unpacklo_epi16(s_256[1], s_256[2]);
403
175k
  tt_256[1] = _mm256_unpacklo_epi16(s_256[3], s_256[4]);
404
175k
  tt_256[3] = _mm256_unpackhi_epi16(s_256[1], s_256[2]);
405
175k
  tt_256[4] = _mm256_unpackhi_epi16(s_256[3], s_256[4]);
406
175k
}
Unexecuted instantiation: convolve_avx2.c:loadu_unpack_16bit_5rows_avx2
407
408
static AOM_FORCE_INLINE void loadu_unpack_16bit_3rows_avx2(
409
    const int16_t *const src, const ptrdiff_t stride, __m256i s_256[3],
410
21.8k
    __m256i ss_256[3], __m256i tt_256[3]) {
411
21.8k
  s_256[0] = _mm256_loadu_si256((__m256i *)(src + 0 * stride));
412
21.8k
  s_256[1] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
413
21.8k
  s_256[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
414
415
21.8k
  ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
416
21.8k
  ss_256[2] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
417
418
21.8k
  tt_256[0] = _mm256_unpacklo_epi16(s_256[1], s_256[2]);
419
21.8k
  tt_256[2] = _mm256_unpackhi_epi16(s_256[1], s_256[2]);
420
21.8k
}
convolve_2d_avx2.c:loadu_unpack_16bit_3rows_avx2
Line
Count
Source
410
21.8k
    __m256i ss_256[3], __m256i tt_256[3]) {
411
21.8k
  s_256[0] = _mm256_loadu_si256((__m256i *)(src + 0 * stride));
412
21.8k
  s_256[1] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
413
21.8k
  s_256[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
414
415
21.8k
  ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
416
21.8k
  ss_256[2] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
417
418
21.8k
  tt_256[0] = _mm256_unpacklo_epi16(s_256[1], s_256[2]);
419
21.8k
  tt_256[2] = _mm256_unpackhi_epi16(s_256[1], s_256[2]);
420
21.8k
}
Unexecuted instantiation: convolve_avx2.c:loadu_unpack_16bit_3rows_avx2
421
422
static inline void convolve_8tap_unpack_avx2(const __m256i s[6],
423
146k
                                             __m256i ss[7]) {
424
146k
  ss[0] = _mm256_unpacklo_epi16(s[0], s[1]);
425
146k
  ss[1] = _mm256_unpacklo_epi16(s[2], s[3]);
426
146k
  ss[2] = _mm256_unpacklo_epi16(s[4], s[5]);
427
146k
  ss[4] = _mm256_unpackhi_epi16(s[0], s[1]);
428
146k
  ss[5] = _mm256_unpackhi_epi16(s[2], s[3]);
429
146k
  ss[6] = _mm256_unpackhi_epi16(s[4], s[5]);
430
146k
}
convolve_2d_avx2.c:convolve_8tap_unpack_avx2
Line
Count
Source
423
146k
                                             __m256i ss[7]) {
424
146k
  ss[0] = _mm256_unpacklo_epi16(s[0], s[1]);
425
146k
  ss[1] = _mm256_unpacklo_epi16(s[2], s[3]);
426
146k
  ss[2] = _mm256_unpacklo_epi16(s[4], s[5]);
427
146k
  ss[4] = _mm256_unpackhi_epi16(s[0], s[1]);
428
146k
  ss[5] = _mm256_unpackhi_epi16(s[2], s[3]);
429
146k
  ss[6] = _mm256_unpackhi_epi16(s[4], s[5]);
430
146k
}
Unexecuted instantiation: convolve_avx2.c:convolve_8tap_unpack_avx2
431
432
static inline __m128i convolve_2tap_ssse3(const __m128i ss[1],
433
563k
                                          const __m128i coeffs[1]) {
434
563k
  return _mm_maddubs_epi16(ss[0], coeffs[0]);
435
563k
}
convolve_2d_avx2.c:convolve_2tap_ssse3
Line
Count
Source
433
350k
                                          const __m128i coeffs[1]) {
434
350k
  return _mm_maddubs_epi16(ss[0], coeffs[0]);
435
350k
}
convolve_avx2.c:convolve_2tap_ssse3
Line
Count
Source
433
213k
                                          const __m128i coeffs[1]) {
434
213k
  return _mm_maddubs_epi16(ss[0], coeffs[0]);
435
213k
}
436
437
static inline __m128i convolve_4tap_ssse3(const __m128i ss[2],
438
5.56M
                                          const __m128i coeffs[2]) {
439
5.56M
  const __m128i res_23 = _mm_maddubs_epi16(ss[0], coeffs[0]);
440
5.56M
  const __m128i res_45 = _mm_maddubs_epi16(ss[1], coeffs[1]);
441
5.56M
  return _mm_add_epi16(res_23, res_45);
442
5.56M
}
convolve_2d_avx2.c:convolve_4tap_ssse3
Line
Count
Source
438
4.43M
                                          const __m128i coeffs[2]) {
439
4.43M
  const __m128i res_23 = _mm_maddubs_epi16(ss[0], coeffs[0]);
440
4.43M
  const __m128i res_45 = _mm_maddubs_epi16(ss[1], coeffs[1]);
441
4.43M
  return _mm_add_epi16(res_23, res_45);
442
4.43M
}
convolve_avx2.c:convolve_4tap_ssse3
Line
Count
Source
438
1.12M
                                          const __m128i coeffs[2]) {
439
1.12M
  const __m128i res_23 = _mm_maddubs_epi16(ss[0], coeffs[0]);
440
1.12M
  const __m128i res_45 = _mm_maddubs_epi16(ss[1], coeffs[1]);
441
1.12M
  return _mm_add_epi16(res_23, res_45);
442
1.12M
}
443
444
static inline __m128i convolve_6tap_ssse3(const __m128i ss[3],
445
578k
                                          const __m128i coeffs[3]) {
446
578k
  const __m128i res_12 = _mm_maddubs_epi16(ss[0], coeffs[0]);
447
578k
  const __m128i res_34 = _mm_maddubs_epi16(ss[1], coeffs[1]);
448
578k
  const __m128i res_56 = _mm_maddubs_epi16(ss[2], coeffs[2]);
449
578k
  const __m128i res_1256 = _mm_add_epi16(res_12, res_56);
450
578k
  return _mm_add_epi16(res_1256, res_34);
451
578k
}
Unexecuted instantiation: convolve_2d_avx2.c:convolve_6tap_ssse3
convolve_avx2.c:convolve_6tap_ssse3
Line
Count
Source
445
578k
                                          const __m128i coeffs[3]) {
446
578k
  const __m128i res_12 = _mm_maddubs_epi16(ss[0], coeffs[0]);
447
578k
  const __m128i res_34 = _mm_maddubs_epi16(ss[1], coeffs[1]);
448
578k
  const __m128i res_56 = _mm_maddubs_epi16(ss[2], coeffs[2]);
449
578k
  const __m128i res_1256 = _mm_add_epi16(res_12, res_56);
450
578k
  return _mm_add_epi16(res_1256, res_34);
451
578k
}
452
453
static inline __m128i convolve_8tap_ssse3(const __m128i ss[4],
454
35.0k
                                          const __m128i coeffs[4]) {
455
35.0k
  const __m128i res_01 = _mm_maddubs_epi16(ss[0], coeffs[0]);
456
35.0k
  const __m128i res_23 = _mm_maddubs_epi16(ss[1], coeffs[1]);
457
35.0k
  const __m128i res_45 = _mm_maddubs_epi16(ss[2], coeffs[2]);
458
35.0k
  const __m128i res_67 = _mm_maddubs_epi16(ss[3], coeffs[3]);
459
35.0k
  const __m128i res_0145 = _mm_add_epi16(res_01, res_45);
460
35.0k
  const __m128i res_2367 = _mm_add_epi16(res_23, res_67);
461
35.0k
  return _mm_add_epi16(res_0145, res_2367);
462
35.0k
}
Unexecuted instantiation: convolve_2d_avx2.c:convolve_8tap_ssse3
convolve_avx2.c:convolve_8tap_ssse3
Line
Count
Source
454
35.0k
                                          const __m128i coeffs[4]) {
455
35.0k
  const __m128i res_01 = _mm_maddubs_epi16(ss[0], coeffs[0]);
456
35.0k
  const __m128i res_23 = _mm_maddubs_epi16(ss[1], coeffs[1]);
457
35.0k
  const __m128i res_45 = _mm_maddubs_epi16(ss[2], coeffs[2]);
458
35.0k
  const __m128i res_67 = _mm_maddubs_epi16(ss[3], coeffs[3]);
459
35.0k
  const __m128i res_0145 = _mm_add_epi16(res_01, res_45);
460
35.0k
  const __m128i res_2367 = _mm_add_epi16(res_23, res_67);
461
35.0k
  return _mm_add_epi16(res_0145, res_2367);
462
35.0k
}
463
464
static inline __m256i convolve_2tap_avx2(const __m256i ss[1],
465
1.85M
                                         const __m256i coeffs[1]) {
466
1.85M
  return _mm256_maddubs_epi16(ss[0], coeffs[0]);
467
1.85M
}
convolve_2d_avx2.c:convolve_2tap_avx2
Line
Count
Source
465
1.02M
                                         const __m256i coeffs[1]) {
466
1.02M
  return _mm256_maddubs_epi16(ss[0], coeffs[0]);
467
1.02M
}
convolve_avx2.c:convolve_2tap_avx2
Line
Count
Source
465
829k
                                         const __m256i coeffs[1]) {
466
829k
  return _mm256_maddubs_epi16(ss[0], coeffs[0]);
467
829k
}
468
469
static inline __m256i convolve_4tap_avx2(const __m256i ss[2],
470
3.46M
                                         const __m256i coeffs[2]) {
471
3.46M
  const __m256i res_23 = _mm256_maddubs_epi16(ss[0], coeffs[0]);
472
3.46M
  const __m256i res_45 = _mm256_maddubs_epi16(ss[1], coeffs[1]);
473
3.46M
  return _mm256_add_epi16(res_23, res_45);
474
3.46M
}
convolve_2d_avx2.c:convolve_4tap_avx2
Line
Count
Source
470
1.79M
                                         const __m256i coeffs[2]) {
471
1.79M
  const __m256i res_23 = _mm256_maddubs_epi16(ss[0], coeffs[0]);
472
1.79M
  const __m256i res_45 = _mm256_maddubs_epi16(ss[1], coeffs[1]);
473
1.79M
  return _mm256_add_epi16(res_23, res_45);
474
1.79M
}
convolve_avx2.c:convolve_4tap_avx2
Line
Count
Source
470
1.67M
                                         const __m256i coeffs[2]) {
471
1.67M
  const __m256i res_23 = _mm256_maddubs_epi16(ss[0], coeffs[0]);
472
1.67M
  const __m256i res_45 = _mm256_maddubs_epi16(ss[1], coeffs[1]);
473
1.67M
  return _mm256_add_epi16(res_23, res_45);
474
1.67M
}
475
476
static inline __m256i convolve_6tap_avx2(const __m256i ss[3],
477
22.7M
                                         const __m256i coeffs[3]) {
478
22.7M
  const __m256i res_01 = _mm256_maddubs_epi16(ss[0], coeffs[0]);
479
22.7M
  const __m256i res_23 = _mm256_maddubs_epi16(ss[1], coeffs[1]);
480
22.7M
  const __m256i res_45 = _mm256_maddubs_epi16(ss[2], coeffs[2]);
481
22.7M
  const __m256i res_0145 = _mm256_add_epi16(res_01, res_45);
482
22.7M
  return _mm256_add_epi16(res_0145, res_23);
483
22.7M
}
convolve_2d_avx2.c:convolve_6tap_avx2
Line
Count
Source
477
14.3M
                                         const __m256i coeffs[3]) {
478
14.3M
  const __m256i res_01 = _mm256_maddubs_epi16(ss[0], coeffs[0]);
479
14.3M
  const __m256i res_23 = _mm256_maddubs_epi16(ss[1], coeffs[1]);
480
14.3M
  const __m256i res_45 = _mm256_maddubs_epi16(ss[2], coeffs[2]);
481
14.3M
  const __m256i res_0145 = _mm256_add_epi16(res_01, res_45);
482
14.3M
  return _mm256_add_epi16(res_0145, res_23);
483
14.3M
}
convolve_avx2.c:convolve_6tap_avx2
Line
Count
Source
477
8.33M
                                         const __m256i coeffs[3]) {
478
8.33M
  const __m256i res_01 = _mm256_maddubs_epi16(ss[0], coeffs[0]);
479
8.33M
  const __m256i res_23 = _mm256_maddubs_epi16(ss[1], coeffs[1]);
480
8.33M
  const __m256i res_45 = _mm256_maddubs_epi16(ss[2], coeffs[2]);
481
8.33M
  const __m256i res_0145 = _mm256_add_epi16(res_01, res_45);
482
8.33M
  return _mm256_add_epi16(res_0145, res_23);
483
8.33M
}
484
485
static inline __m256i convolve_8tap_avx2(const __m256i ss[4],
486
3.59M
                                         const __m256i coeffs[4]) {
487
3.59M
  const __m256i res_01 = _mm256_maddubs_epi16(ss[0], coeffs[0]);
488
3.59M
  const __m256i res_23 = _mm256_maddubs_epi16(ss[1], coeffs[1]);
489
3.59M
  const __m256i res_45 = _mm256_maddubs_epi16(ss[2], coeffs[2]);
490
3.59M
  const __m256i res_67 = _mm256_maddubs_epi16(ss[3], coeffs[3]);
491
3.59M
  const __m256i res_0145 = _mm256_add_epi16(res_01, res_45);
492
3.59M
  const __m256i res_2367 = _mm256_add_epi16(res_23, res_67);
493
3.59M
  return _mm256_add_epi16(res_0145, res_2367);
494
3.59M
}
convolve_2d_avx2.c:convolve_8tap_avx2
Line
Count
Source
486
2.81M
                                         const __m256i coeffs[4]) {
487
2.81M
  const __m256i res_01 = _mm256_maddubs_epi16(ss[0], coeffs[0]);
488
2.81M
  const __m256i res_23 = _mm256_maddubs_epi16(ss[1], coeffs[1]);
489
2.81M
  const __m256i res_45 = _mm256_maddubs_epi16(ss[2], coeffs[2]);
490
2.81M
  const __m256i res_67 = _mm256_maddubs_epi16(ss[3], coeffs[3]);
491
2.81M
  const __m256i res_0145 = _mm256_add_epi16(res_01, res_45);
492
2.81M
  const __m256i res_2367 = _mm256_add_epi16(res_23, res_67);
493
2.81M
  return _mm256_add_epi16(res_0145, res_2367);
494
2.81M
}
convolve_avx2.c:convolve_8tap_avx2
Line
Count
Source
486
785k
                                         const __m256i coeffs[4]) {
487
785k
  const __m256i res_01 = _mm256_maddubs_epi16(ss[0], coeffs[0]);
488
785k
  const __m256i res_23 = _mm256_maddubs_epi16(ss[1], coeffs[1]);
489
785k
  const __m256i res_45 = _mm256_maddubs_epi16(ss[2], coeffs[2]);
490
785k
  const __m256i res_67 = _mm256_maddubs_epi16(ss[3], coeffs[3]);
491
785k
  const __m256i res_0145 = _mm256_add_epi16(res_01, res_45);
492
785k
  const __m256i res_2367 = _mm256_add_epi16(res_23, res_67);
493
785k
  return _mm256_add_epi16(res_0145, res_2367);
494
785k
}
495
496
static inline __m128i convolve16_2tap_sse2(const __m128i ss[1],
497
148k
                                           const __m128i coeffs[1]) {
498
148k
  return _mm_madd_epi16(ss[0], coeffs[0]);
499
148k
}
convolve_2d_avx2.c:convolve16_2tap_sse2
Line
Count
Source
497
148k
                                           const __m128i coeffs[1]) {
498
148k
  return _mm_madd_epi16(ss[0], coeffs[0]);
499
148k
}
Unexecuted instantiation: convolve_avx2.c:convolve16_2tap_sse2
500
501
static inline __m128i convolve16_4tap_sse2(const __m128i ss[2],
502
161k
                                           const __m128i coeffs[2]) {
503
161k
  const __m128i res_01 = _mm_madd_epi16(ss[0], coeffs[0]);
504
161k
  const __m128i res_23 = _mm_madd_epi16(ss[1], coeffs[1]);
505
161k
  return _mm_add_epi32(res_01, res_23);
506
161k
}
convolve_2d_avx2.c:convolve16_4tap_sse2
Line
Count
Source
502
161k
                                           const __m128i coeffs[2]) {
503
161k
  const __m128i res_01 = _mm_madd_epi16(ss[0], coeffs[0]);
504
161k
  const __m128i res_23 = _mm_madd_epi16(ss[1], coeffs[1]);
505
161k
  return _mm_add_epi32(res_01, res_23);
506
161k
}
Unexecuted instantiation: convolve_avx2.c:convolve16_4tap_sse2
507
508
static inline __m128i convolve16_6tap_sse2(const __m128i ss[3],
509
261k
                                           const __m128i coeffs[3]) {
510
261k
  const __m128i res_01 = _mm_madd_epi16(ss[0], coeffs[0]);
511
261k
  const __m128i res_23 = _mm_madd_epi16(ss[1], coeffs[1]);
512
261k
  const __m128i res_45 = _mm_madd_epi16(ss[2], coeffs[2]);
513
261k
  const __m128i res_0123 = _mm_add_epi32(res_01, res_23);
514
261k
  return _mm_add_epi32(res_0123, res_45);
515
261k
}
convolve_2d_avx2.c:convolve16_6tap_sse2
Line
Count
Source
509
261k
                                           const __m128i coeffs[3]) {
510
261k
  const __m128i res_01 = _mm_madd_epi16(ss[0], coeffs[0]);
511
261k
  const __m128i res_23 = _mm_madd_epi16(ss[1], coeffs[1]);
512
261k
  const __m128i res_45 = _mm_madd_epi16(ss[2], coeffs[2]);
513
261k
  const __m128i res_0123 = _mm_add_epi32(res_01, res_23);
514
261k
  return _mm_add_epi32(res_0123, res_45);
515
261k
}
Unexecuted instantiation: convolve_avx2.c:convolve16_6tap_sse2
516
517
static inline __m128i convolve16_8tap_sse2(const __m128i ss[4],
518
18.0k
                                           const __m128i coeffs[4]) {
519
18.0k
  const __m128i res_01 = _mm_madd_epi16(ss[0], coeffs[0]);
520
18.0k
  const __m128i res_23 = _mm_madd_epi16(ss[1], coeffs[1]);
521
18.0k
  const __m128i res_45 = _mm_madd_epi16(ss[2], coeffs[2]);
522
18.0k
  const __m128i res_67 = _mm_madd_epi16(ss[3], coeffs[3]);
523
18.0k
  const __m128i res_0123 = _mm_add_epi32(res_01, res_23);
524
18.0k
  const __m128i res_4567 = _mm_add_epi32(res_45, res_67);
525
18.0k
  return _mm_add_epi32(res_0123, res_4567);
526
18.0k
}
convolve_2d_avx2.c:convolve16_8tap_sse2
Line
Count
Source
518
18.0k
                                           const __m128i coeffs[4]) {
519
18.0k
  const __m128i res_01 = _mm_madd_epi16(ss[0], coeffs[0]);
520
18.0k
  const __m128i res_23 = _mm_madd_epi16(ss[1], coeffs[1]);
521
18.0k
  const __m128i res_45 = _mm_madd_epi16(ss[2], coeffs[2]);
522
18.0k
  const __m128i res_67 = _mm_madd_epi16(ss[3], coeffs[3]);
523
18.0k
  const __m128i res_0123 = _mm_add_epi32(res_01, res_23);
524
18.0k
  const __m128i res_4567 = _mm_add_epi32(res_45, res_67);
525
18.0k
  return _mm_add_epi32(res_0123, res_4567);
526
18.0k
}
Unexecuted instantiation: convolve_avx2.c:convolve16_8tap_sse2
527
528
static inline __m256i convolve16_2tap_avx2(const __m256i ss[1],
529
1.75M
                                           const __m256i coeffs[1]) {
530
1.75M
  return _mm256_madd_epi16(ss[0], coeffs[0]);
531
1.75M
}
convolve_2d_avx2.c:convolve16_2tap_avx2
Line
Count
Source
529
1.75M
                                           const __m256i coeffs[1]) {
530
1.75M
  return _mm256_madd_epi16(ss[0], coeffs[0]);
531
1.75M
}
Unexecuted instantiation: convolve_avx2.c:convolve16_2tap_avx2
532
533
static inline __m256i convolve16_4tap_avx2(const __m256i ss[2],
534
6.73M
                                           const __m256i coeffs[2]) {
535
6.73M
  const __m256i res_1 = _mm256_madd_epi16(ss[0], coeffs[0]);
536
6.73M
  const __m256i res_2 = _mm256_madd_epi16(ss[1], coeffs[1]);
537
6.73M
  return _mm256_add_epi32(res_1, res_2);
538
6.73M
}
convolve_2d_avx2.c:convolve16_4tap_avx2
Line
Count
Source
534
6.73M
                                           const __m256i coeffs[2]) {
535
6.73M
  const __m256i res_1 = _mm256_madd_epi16(ss[0], coeffs[0]);
536
6.73M
  const __m256i res_2 = _mm256_madd_epi16(ss[1], coeffs[1]);
537
6.73M
  return _mm256_add_epi32(res_1, res_2);
538
6.73M
}
Unexecuted instantiation: convolve_avx2.c:convolve16_4tap_avx2
539
540
static inline __m256i convolve16_6tap_avx2(const __m256i ss[3],
541
19.1M
                                           const __m256i coeffs[3]) {
542
19.1M
  const __m256i res_01 = _mm256_madd_epi16(ss[0], coeffs[0]);
543
19.1M
  const __m256i res_23 = _mm256_madd_epi16(ss[1], coeffs[1]);
544
19.1M
  const __m256i res_45 = _mm256_madd_epi16(ss[2], coeffs[2]);
545
19.1M
  const __m256i res_0123 = _mm256_add_epi32(res_01, res_23);
546
19.1M
  return _mm256_add_epi32(res_0123, res_45);
547
19.1M
}
convolve_2d_avx2.c:convolve16_6tap_avx2
Line
Count
Source
541
19.1M
                                           const __m256i coeffs[3]) {
542
19.1M
  const __m256i res_01 = _mm256_madd_epi16(ss[0], coeffs[0]);
543
19.1M
  const __m256i res_23 = _mm256_madd_epi16(ss[1], coeffs[1]);
544
19.1M
  const __m256i res_45 = _mm256_madd_epi16(ss[2], coeffs[2]);
545
19.1M
  const __m256i res_0123 = _mm256_add_epi32(res_01, res_23);
546
19.1M
  return _mm256_add_epi32(res_0123, res_45);
547
19.1M
}
Unexecuted instantiation: convolve_avx2.c:convolve16_6tap_avx2
548
549
static inline __m256i convolve16_8tap_avx2(const __m256i ss[4],
550
4.02M
                                           const __m256i coeffs[4]) {
551
4.02M
  const __m256i res_01 = _mm256_madd_epi16(ss[0], coeffs[0]);
552
4.02M
  const __m256i res_23 = _mm256_madd_epi16(ss[1], coeffs[1]);
553
4.02M
  const __m256i res_45 = _mm256_madd_epi16(ss[2], coeffs[2]);
554
4.02M
  const __m256i res_67 = _mm256_madd_epi16(ss[3], coeffs[3]);
555
4.02M
  const __m256i res_0123 = _mm256_add_epi32(res_01, res_23);
556
4.02M
  const __m256i res_4567 = _mm256_add_epi32(res_45, res_67);
557
4.02M
  return _mm256_add_epi32(res_0123, res_4567);
558
4.02M
}
convolve_2d_avx2.c:convolve16_8tap_avx2
Line
Count
Source
550
4.02M
                                           const __m256i coeffs[4]) {
551
4.02M
  const __m256i res_01 = _mm256_madd_epi16(ss[0], coeffs[0]);
552
4.02M
  const __m256i res_23 = _mm256_madd_epi16(ss[1], coeffs[1]);
553
4.02M
  const __m256i res_45 = _mm256_madd_epi16(ss[2], coeffs[2]);
554
4.02M
  const __m256i res_67 = _mm256_madd_epi16(ss[3], coeffs[3]);
555
4.02M
  const __m256i res_0123 = _mm256_add_epi32(res_01, res_23);
556
4.02M
  const __m256i res_4567 = _mm256_add_epi32(res_45, res_67);
557
4.02M
  return _mm256_add_epi32(res_0123, res_4567);
558
4.02M
}
Unexecuted instantiation: convolve_avx2.c:convolve16_8tap_avx2
559
560
static inline __m256i x_convolve_4tap_avx2(const __m256i data,
561
                                           const __m256i coeffs[2],
562
1.79M
                                           const __m256i filt[2]) {
563
1.79M
  __m256i ss[2];
564
565
1.79M
  ss[0] = _mm256_shuffle_epi8(data, filt[0]);
566
1.79M
  ss[1] = _mm256_shuffle_epi8(data, filt[1]);
567
568
1.79M
  return convolve_4tap_avx2(ss, coeffs);
569
1.79M
}
convolve_2d_avx2.c:x_convolve_4tap_avx2
Line
Count
Source
562
1.79M
                                           const __m256i filt[2]) {
563
1.79M
  __m256i ss[2];
564
565
1.79M
  ss[0] = _mm256_shuffle_epi8(data, filt[0]);
566
1.79M
  ss[1] = _mm256_shuffle_epi8(data, filt[1]);
567
568
1.79M
  return convolve_4tap_avx2(ss, coeffs);
569
1.79M
}
Unexecuted instantiation: convolve_avx2.c:x_convolve_4tap_avx2
570
571
static inline __m256i x_convolve_6tap_avx2(const __m256i data,
572
                                           const __m256i coeffs[3],
573
18.3M
                                           const __m256i filt[3]) {
574
18.3M
  __m256i ss[3];
575
576
18.3M
  ss[0] = _mm256_shuffle_epi8(data, filt[0]);
577
18.3M
  ss[1] = _mm256_shuffle_epi8(data, filt[1]);
578
18.3M
  ss[2] = _mm256_shuffle_epi8(data, filt[2]);
579
580
18.3M
  return convolve_6tap_avx2(ss, coeffs);
581
18.3M
}
convolve_2d_avx2.c:x_convolve_6tap_avx2
Line
Count
Source
573
14.3M
                                           const __m256i filt[3]) {
574
14.3M
  __m256i ss[3];
575
576
14.3M
  ss[0] = _mm256_shuffle_epi8(data, filt[0]);
577
14.3M
  ss[1] = _mm256_shuffle_epi8(data, filt[1]);
578
14.3M
  ss[2] = _mm256_shuffle_epi8(data, filt[2]);
579
580
14.3M
  return convolve_6tap_avx2(ss, coeffs);
581
14.3M
}
convolve_avx2.c:x_convolve_6tap_avx2
Line
Count
Source
573
3.96M
                                           const __m256i filt[3]) {
574
3.96M
  __m256i ss[3];
575
576
3.96M
  ss[0] = _mm256_shuffle_epi8(data, filt[0]);
577
3.96M
  ss[1] = _mm256_shuffle_epi8(data, filt[1]);
578
3.96M
  ss[2] = _mm256_shuffle_epi8(data, filt[2]);
579
580
3.96M
  return convolve_6tap_avx2(ss, coeffs);
581
3.96M
}
582
583
static inline __m256i x_convolve_8tap_avx2(const __m256i data,
584
                                           const __m256i coeffs[4],
585
3.29M
                                           const __m256i filt[4]) {
586
3.29M
  __m256i ss[4];
587
588
3.29M
  ss[0] = _mm256_shuffle_epi8(data, filt[0]);
589
3.29M
  ss[1] = _mm256_shuffle_epi8(data, filt[1]);
590
3.29M
  ss[2] = _mm256_shuffle_epi8(data, filt[2]);
591
3.29M
  ss[3] = _mm256_shuffle_epi8(data, filt[3]);
592
593
3.29M
  return convolve_8tap_avx2(ss, coeffs);
594
3.29M
}
convolve_2d_avx2.c:x_convolve_8tap_avx2
Line
Count
Source
585
2.81M
                                           const __m256i filt[4]) {
586
2.81M
  __m256i ss[4];
587
588
2.81M
  ss[0] = _mm256_shuffle_epi8(data, filt[0]);
589
2.81M
  ss[1] = _mm256_shuffle_epi8(data, filt[1]);
590
2.81M
  ss[2] = _mm256_shuffle_epi8(data, filt[2]);
591
2.81M
  ss[3] = _mm256_shuffle_epi8(data, filt[3]);
592
593
2.81M
  return convolve_8tap_avx2(ss, coeffs);
594
2.81M
}
convolve_avx2.c:x_convolve_8tap_avx2
Line
Count
Source
585
482k
                                           const __m256i filt[4]) {
586
482k
  __m256i ss[4];
587
588
482k
  ss[0] = _mm256_shuffle_epi8(data, filt[0]);
589
482k
  ss[1] = _mm256_shuffle_epi8(data, filt[1]);
590
482k
  ss[2] = _mm256_shuffle_epi8(data, filt[2]);
591
482k
  ss[3] = _mm256_shuffle_epi8(data, filt[3]);
592
593
482k
  return convolve_8tap_avx2(ss, coeffs);
594
482k
}
595
596
6.78M
static inline __m256i sr_y_round_avx2(const __m256i src) {
597
6.78M
  const __m256i round = _mm256_set1_epi16(32);
598
6.78M
  const __m256i dst = _mm256_add_epi16(src, round);
599
6.78M
  return _mm256_srai_epi16(dst, FILTER_BITS - 1);
600
6.78M
}
Unexecuted instantiation: convolve_2d_avx2.c:sr_y_round_avx2
convolve_avx2.c:sr_y_round_avx2
Line
Count
Source
596
6.78M
static inline __m256i sr_y_round_avx2(const __m256i src) {
597
6.78M
  const __m256i round = _mm256_set1_epi16(32);
598
6.78M
  const __m256i dst = _mm256_add_epi16(src, round);
599
6.78M
  return _mm256_srai_epi16(dst, FILTER_BITS - 1);
600
6.78M
}
601
602
4.78M
static inline __m128i xy_x_round_sse2(const __m128i src) {
603
4.78M
  const __m128i round = _mm_set1_epi16(2);
604
4.78M
  const __m128i dst = _mm_add_epi16(src, round);
605
4.78M
  return _mm_srai_epi16(dst, 2);
606
4.78M
}
convolve_2d_avx2.c:xy_x_round_sse2
Line
Count
Source
602
4.78M
static inline __m128i xy_x_round_sse2(const __m128i src) {
603
4.78M
  const __m128i round = _mm_set1_epi16(2);
604
4.78M
  const __m128i dst = _mm_add_epi16(src, round);
605
4.78M
  return _mm_srai_epi16(dst, 2);
606
4.78M
}
Unexecuted instantiation: convolve_avx2.c:xy_x_round_sse2
607
608
20.0M
static inline __m256i xy_x_round_avx2(const __m256i src) {
609
20.0M
  const __m256i round = _mm256_set1_epi16(2);
610
20.0M
  const __m256i dst = _mm256_add_epi16(src, round);
611
20.0M
  return _mm256_srai_epi16(dst, 2);
612
20.0M
}
convolve_2d_avx2.c:xy_x_round_avx2
Line
Count
Source
608
20.0M
static inline __m256i xy_x_round_avx2(const __m256i src) {
609
20.0M
  const __m256i round = _mm256_set1_epi16(2);
610
20.0M
  const __m256i dst = _mm256_add_epi16(src, round);
611
20.0M
  return _mm256_srai_epi16(dst, 2);
612
20.0M
}
Unexecuted instantiation: convolve_avx2.c:xy_x_round_avx2
613
614
static inline void xy_x_round_store_2x2_sse2(const __m128i res,
615
877k
                                             int16_t *const dst) {
616
877k
  const __m128i d = xy_x_round_sse2(res);
617
877k
  _mm_storel_epi64((__m128i *)dst, d);
618
877k
}
convolve_2d_avx2.c:xy_x_round_store_2x2_sse2
Line
Count
Source
615
877k
                                             int16_t *const dst) {
616
877k
  const __m128i d = xy_x_round_sse2(res);
617
877k
  _mm_storel_epi64((__m128i *)dst, d);
618
877k
}
Unexecuted instantiation: convolve_avx2.c:xy_x_round_store_2x2_sse2
619
620
static inline void xy_x_round_store_4x2_sse2(const __m128i res,
621
3.70M
                                             int16_t *const dst) {
622
3.70M
  const __m128i d = xy_x_round_sse2(res);
623
3.70M
  _mm_storeu_si128((__m128i *)dst, d);
624
3.70M
}
convolve_2d_avx2.c:xy_x_round_store_4x2_sse2
Line
Count
Source
621
3.70M
                                             int16_t *const dst) {
622
3.70M
  const __m128i d = xy_x_round_sse2(res);
623
3.70M
  _mm_storeu_si128((__m128i *)dst, d);
624
3.70M
}
Unexecuted instantiation: convolve_avx2.c:xy_x_round_store_4x2_sse2
625
626
static inline void xy_x_round_store_8x2_sse2(const __m128i res[2],
627
99.5k
                                             int16_t *const dst) {
628
99.5k
  __m128i r[2];
629
630
99.5k
  r[0] = xy_x_round_sse2(res[0]);
631
99.5k
  r[1] = xy_x_round_sse2(res[1]);
632
99.5k
  _mm_storeu_si128((__m128i *)dst, r[0]);
633
99.5k
  _mm_storeu_si128((__m128i *)(dst + 8), r[1]);
634
99.5k
}
convolve_2d_avx2.c:xy_x_round_store_8x2_sse2
Line
Count
Source
627
99.5k
                                             int16_t *const dst) {
628
99.5k
  __m128i r[2];
629
630
99.5k
  r[0] = xy_x_round_sse2(res[0]);
631
99.5k
  r[1] = xy_x_round_sse2(res[1]);
632
99.5k
  _mm_storeu_si128((__m128i *)dst, r[0]);
633
99.5k
  _mm_storeu_si128((__m128i *)(dst + 8), r[1]);
634
99.5k
}
Unexecuted instantiation: convolve_avx2.c:xy_x_round_store_8x2_sse2
635
636
static inline void xy_x_round_store_8x2_avx2(const __m256i res,
637
3.66M
                                             int16_t *const dst) {
638
3.66M
  const __m256i d = xy_x_round_avx2(res);
639
3.66M
  _mm256_storeu_si256((__m256i *)dst, d);
640
3.66M
}
convolve_2d_avx2.c:xy_x_round_store_8x2_avx2
Line
Count
Source
637
3.66M
                                             int16_t *const dst) {
638
3.66M
  const __m256i d = xy_x_round_avx2(res);
639
3.66M
  _mm256_storeu_si256((__m256i *)dst, d);
640
3.66M
}
Unexecuted instantiation: convolve_avx2.c:xy_x_round_store_8x2_avx2
641
642
static inline void xy_x_round_store_32_avx2(const __m256i res[2],
643
2.51M
                                            int16_t *const dst) {
644
2.51M
  __m256i r[2];
645
646
2.51M
  r[0] = xy_x_round_avx2(res[0]);
647
2.51M
  r[1] = xy_x_round_avx2(res[1]);
648
2.51M
  const __m256i d0 =
649
2.51M
      _mm256_inserti128_si256(r[0], _mm256_castsi256_si128(r[1]), 1);
650
2.51M
  const __m256i d1 =
651
2.51M
      _mm256_inserti128_si256(r[1], _mm256_extracti128_si256(r[0], 1), 0);
652
2.51M
  _mm256_storeu_si256((__m256i *)dst, d0);
653
2.51M
  _mm256_storeu_si256((__m256i *)(dst + 16), d1);
654
2.51M
}
convolve_2d_avx2.c:xy_x_round_store_32_avx2
Line
Count
Source
643
2.51M
                                            int16_t *const dst) {
644
2.51M
  __m256i r[2];
645
646
2.51M
  r[0] = xy_x_round_avx2(res[0]);
647
2.51M
  r[1] = xy_x_round_avx2(res[1]);
648
2.51M
  const __m256i d0 =
649
2.51M
      _mm256_inserti128_si256(r[0], _mm256_castsi256_si128(r[1]), 1);
650
2.51M
  const __m256i d1 =
651
      _mm256_inserti128_si256(r[1], _mm256_extracti128_si256(r[0], 1), 0);
652
2.51M
  _mm256_storeu_si256((__m256i *)dst, d0);
653
2.51M
  _mm256_storeu_si256((__m256i *)(dst + 16), d1);
654
2.51M
}
Unexecuted instantiation: convolve_avx2.c:xy_x_round_store_32_avx2
655
656
589k
static inline __m128i xy_y_round_sse2(const __m128i src) {
657
589k
  const __m128i round = _mm_set1_epi32(1024);
658
589k
  const __m128i dst = _mm_add_epi32(src, round);
659
589k
  return _mm_srai_epi32(dst, 11);
660
589k
}
convolve_2d_avx2.c:xy_y_round_sse2
Line
Count
Source
656
589k
static inline __m128i xy_y_round_sse2(const __m128i src) {
657
589k
  const __m128i round = _mm_set1_epi32(1024);
658
589k
  const __m128i dst = _mm_add_epi32(src, round);
659
589k
  return _mm_srai_epi32(dst, 11);
660
589k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_round_sse2
661
662
28.0k
static inline __m128i xy_y_round_half_pel_sse2(const __m128i src) {
663
28.0k
  const __m128i round = _mm_set1_epi16(16);
664
28.0k
  const __m128i dst = _mm_add_epi16(src, round);
665
28.0k
  return _mm_srai_epi16(dst, 5);
666
28.0k
}
convolve_2d_avx2.c:xy_y_round_half_pel_sse2
Line
Count
Source
662
28.0k
static inline __m128i xy_y_round_half_pel_sse2(const __m128i src) {
663
28.0k
  const __m128i round = _mm_set1_epi16(16);
664
28.0k
  const __m128i dst = _mm_add_epi16(src, round);
665
28.0k
  return _mm_srai_epi16(dst, 5);
666
28.0k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_round_half_pel_sse2
667
668
31.6M
static inline __m256i xy_y_round_avx2(const __m256i src) {
669
31.6M
  const __m256i round = _mm256_set1_epi32(1024);
670
31.6M
  const __m256i dst = _mm256_add_epi32(src, round);
671
31.6M
  return _mm256_srai_epi32(dst, 11);
672
31.6M
}
convolve_2d_avx2.c:xy_y_round_avx2
Line
Count
Source
668
31.6M
static inline __m256i xy_y_round_avx2(const __m256i src) {
669
31.6M
  const __m256i round = _mm256_set1_epi32(1024);
670
31.6M
  const __m256i dst = _mm256_add_epi32(src, round);
671
31.6M
  return _mm256_srai_epi32(dst, 11);
672
31.6M
}
Unexecuted instantiation: convolve_avx2.c:xy_y_round_avx2
673
674
14.8M
static inline __m256i xy_y_round_16_avx2(const __m256i r[2]) {
675
14.8M
  const __m256i r0 = xy_y_round_avx2(r[0]);
676
14.8M
  const __m256i r1 = xy_y_round_avx2(r[1]);
677
14.8M
  return _mm256_packs_epi32(r0, r1);
678
14.8M
}
convolve_2d_avx2.c:xy_y_round_16_avx2
Line
Count
Source
674
14.8M
static inline __m256i xy_y_round_16_avx2(const __m256i r[2]) {
675
14.8M
  const __m256i r0 = xy_y_round_avx2(r[0]);
676
14.8M
  const __m256i r1 = xy_y_round_avx2(r[1]);
677
14.8M
  return _mm256_packs_epi32(r0, r1);
678
14.8M
}
Unexecuted instantiation: convolve_avx2.c:xy_y_round_16_avx2
679
680
380k
static inline __m256i xy_y_round_half_pel_avx2(const __m256i src) {
681
380k
  const __m256i round = _mm256_set1_epi16(16);
682
380k
  const __m256i dst = _mm256_add_epi16(src, round);
683
380k
  return _mm256_srai_epi16(dst, 5);
684
380k
}
convolve_2d_avx2.c:xy_y_round_half_pel_avx2
Line
Count
Source
680
380k
static inline __m256i xy_y_round_half_pel_avx2(const __m256i src) {
681
380k
  const __m256i round = _mm256_set1_epi16(16);
682
380k
  const __m256i dst = _mm256_add_epi16(src, round);
683
380k
  return _mm256_srai_epi16(dst, 5);
684
380k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_round_half_pel_avx2
685
686
static inline void pack_store_2x2_sse2(const __m128i res, uint8_t *const dst,
687
752k
                                       const ptrdiff_t stride) {
688
752k
  const __m128i d = _mm_packus_epi16(res, res);
689
752k
  *(int16_t *)dst = (int16_t)_mm_cvtsi128_si32(d);
690
752k
  *(int16_t *)(dst + stride) = (int16_t)_mm_extract_epi16(d, 1);
691
752k
}
convolve_2d_avx2.c:pack_store_2x2_sse2
Line
Count
Source
687
462k
                                       const ptrdiff_t stride) {
688
462k
  const __m128i d = _mm_packus_epi16(res, res);
689
462k
  *(int16_t *)dst = (int16_t)_mm_cvtsi128_si32(d);
690
  *(int16_t *)(dst + stride) = (int16_t)_mm_extract_epi16(d, 1);
691
462k
}
convolve_avx2.c:pack_store_2x2_sse2
Line
Count
Source
687
290k
                                       const ptrdiff_t stride) {
688
290k
  const __m128i d = _mm_packus_epi16(res, res);
689
290k
  *(int16_t *)dst = (int16_t)_mm_cvtsi128_si32(d);
690
  *(int16_t *)(dst + stride) = (int16_t)_mm_extract_epi16(d, 1);
691
290k
}
692
693
static inline void pack_store_4x2_sse2(const __m128i res, uint8_t *const dst,
694
1.61M
                                       const ptrdiff_t stride) {
695
1.61M
  const __m128i d = _mm_packus_epi16(res, res);
696
1.61M
  store_u8_4x2_sse2(d, dst, stride);
697
1.61M
}
convolve_2d_avx2.c:pack_store_4x2_sse2
Line
Count
Source
694
89.0k
                                       const ptrdiff_t stride) {
695
89.0k
  const __m128i d = _mm_packus_epi16(res, res);
696
89.0k
  store_u8_4x2_sse2(d, dst, stride);
697
89.0k
}
convolve_avx2.c:pack_store_4x2_sse2
Line
Count
Source
694
1.52M
                                       const ptrdiff_t stride) {
695
1.52M
  const __m128i d = _mm_packus_epi16(res, res);
696
1.52M
  store_u8_4x2_sse2(d, dst, stride);
697
1.52M
}
698
699
static inline void pack_store_4x2_avx2(const __m256i res, uint8_t *const dst,
700
2.03M
                                       const ptrdiff_t stride) {
701
2.03M
  const __m256i d = _mm256_packus_epi16(res, res);
702
2.03M
  const __m128i d0 = _mm256_castsi256_si128(d);
703
2.03M
  const __m128i d1 = _mm256_extracti128_si256(d, 1);
704
705
2.03M
  xx_storel_32(dst, d0);
706
2.03M
  xx_storel_32(dst + stride, d1);
707
2.03M
}
convolve_2d_avx2.c:pack_store_4x2_avx2
Line
Count
Source
700
2.03M
                                       const ptrdiff_t stride) {
701
2.03M
  const __m256i d = _mm256_packus_epi16(res, res);
702
2.03M
  const __m128i d0 = _mm256_castsi256_si128(d);
703
2.03M
  const __m128i d1 = _mm256_extracti128_si256(d, 1);
704
705
2.03M
  xx_storel_32(dst, d0);
706
2.03M
  xx_storel_32(dst + stride, d1);
707
2.03M
}
Unexecuted instantiation: convolve_avx2.c:pack_store_4x2_avx2
708
709
static inline void pack_store_8x2_avx2(const __m256i res, uint8_t *const dst,
710
3.79M
                                       const ptrdiff_t stride) {
711
3.79M
  const __m256i d = _mm256_packus_epi16(res, res);
712
3.79M
  const __m128i d0 = _mm256_castsi256_si128(d);
713
3.79M
  const __m128i d1 = _mm256_extracti128_si256(d, 1);
714
3.79M
  _mm_storel_epi64((__m128i *)dst, d0);
715
3.79M
  _mm_storel_epi64((__m128i *)(dst + stride), d1);
716
3.79M
}
convolve_2d_avx2.c:pack_store_8x2_avx2
Line
Count
Source
710
2.20M
                                       const ptrdiff_t stride) {
711
2.20M
  const __m256i d = _mm256_packus_epi16(res, res);
712
2.20M
  const __m128i d0 = _mm256_castsi256_si128(d);
713
  const __m128i d1 = _mm256_extracti128_si256(d, 1);
714
2.20M
  _mm_storel_epi64((__m128i *)dst, d0);
715
2.20M
  _mm_storel_epi64((__m128i *)(dst + stride), d1);
716
2.20M
}
convolve_avx2.c:pack_store_8x2_avx2
Line
Count
Source
710
1.58M
                                       const ptrdiff_t stride) {
711
1.58M
  const __m256i d = _mm256_packus_epi16(res, res);
712
1.58M
  const __m128i d0 = _mm256_castsi256_si128(d);
713
  const __m128i d1 = _mm256_extracti128_si256(d, 1);
714
1.58M
  _mm_storel_epi64((__m128i *)dst, d0);
715
1.58M
  _mm_storel_epi64((__m128i *)(dst + stride), d1);
716
1.58M
}
717
718
static inline void pack_store_16x2_avx2(const __m256i res0, const __m256i res1,
719
                                        uint8_t *const dst,
720
1.36M
                                        const ptrdiff_t stride) {
721
1.36M
  const __m256i d = _mm256_packus_epi16(res0, res1);
722
1.36M
  storeu_u8_16x2_avx2(d, dst, stride);
723
1.36M
}
Unexecuted instantiation: convolve_2d_avx2.c:pack_store_16x2_avx2
convolve_avx2.c:pack_store_16x2_avx2
Line
Count
Source
720
1.36M
                                        const ptrdiff_t stride) {
721
1.36M
  const __m256i d = _mm256_packus_epi16(res0, res1);
722
1.36M
  storeu_u8_16x2_avx2(d, dst, stride);
723
1.36M
}
724
725
static inline void xy_y_pack_store_16x2_avx2(const __m256i res0,
726
                                             const __m256i res1,
727
                                             uint8_t *const dst,
728
1.66M
                                             const ptrdiff_t stride) {
729
1.66M
  const __m256i t = _mm256_packus_epi16(res0, res1);
730
1.66M
  const __m256i d = _mm256_permute4x64_epi64(t, 0xD8);
731
1.66M
  storeu_u8_16x2_avx2(d, dst, stride);
732
1.66M
}
convolve_2d_avx2.c:xy_y_pack_store_16x2_avx2
Line
Count
Source
728
1.66M
                                             const ptrdiff_t stride) {
729
1.66M
  const __m256i t = _mm256_packus_epi16(res0, res1);
730
  const __m256i d = _mm256_permute4x64_epi64(t, 0xD8);
731
1.66M
  storeu_u8_16x2_avx2(d, dst, stride);
732
1.66M
}
Unexecuted instantiation: convolve_avx2.c:xy_y_pack_store_16x2_avx2
733
734
static inline void pack_store_32_avx2(const __m256i res0, const __m256i res1,
735
0
                                      uint8_t *const dst) {
736
0
  const __m256i t = _mm256_packus_epi16(res0, res1);
737
0
  const __m256i d = _mm256_permute4x64_epi64(t, 0xD8);
738
0
  _mm256_storeu_si256((__m256i *)dst, d);
739
0
}
Unexecuted instantiation: convolve_2d_avx2.c:pack_store_32_avx2
Unexecuted instantiation: convolve_avx2.c:pack_store_32_avx2
740
741
static inline void xy_y_round_store_2x2_sse2(const __m128i res,
742
                                             uint8_t *const dst,
743
457k
                                             const ptrdiff_t stride) {
744
457k
  const __m128i r = xy_y_round_sse2(res);
745
457k
  const __m128i rr = _mm_packs_epi32(r, r);
746
457k
  pack_store_2x2_sse2(rr, dst, stride);
747
457k
}
convolve_2d_avx2.c:xy_y_round_store_2x2_sse2
Line
Count
Source
743
457k
                                             const ptrdiff_t stride) {
744
457k
  const __m128i r = xy_y_round_sse2(res);
745
457k
  const __m128i rr = _mm_packs_epi32(r, r);
746
457k
  pack_store_2x2_sse2(rr, dst, stride);
747
457k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_round_store_2x2_sse2
748
749
static inline void xy_y_round_store_4x2_avx2(const __m256i res,
750
                                             uint8_t *const dst,
751
2.03M
                                             const ptrdiff_t stride) {
752
2.03M
  const __m256i r = xy_y_round_avx2(res);
753
2.03M
  const __m256i rr = _mm256_packs_epi32(r, r);
754
2.03M
  pack_store_4x2_avx2(rr, dst, stride);
755
2.03M
}
convolve_2d_avx2.c:xy_y_round_store_4x2_avx2
Line
Count
Source
751
2.03M
                                             const ptrdiff_t stride) {
752
2.03M
  const __m256i r = xy_y_round_avx2(res);
753
2.03M
  const __m256i rr = _mm256_packs_epi32(r, r);
754
2.03M
  pack_store_4x2_avx2(rr, dst, stride);
755
2.03M
}
Unexecuted instantiation: convolve_avx2.c:xy_y_round_store_4x2_avx2
756
757
static inline void xy_y_pack_store_32_avx2(const __m256i res0,
758
                                           const __m256i res1,
759
4.82M
                                           uint8_t *const dst) {
760
4.82M
  const __m256i d = _mm256_packus_epi16(res0, res1);
761
  // d = _mm256_permute4x64_epi64(d, 0xD8);
762
4.82M
  _mm256_storeu_si256((__m256i *)dst, d);
763
4.82M
}
convolve_2d_avx2.c:xy_y_pack_store_32_avx2
Line
Count
Source
759
4.82M
                                           uint8_t *const dst) {
760
4.82M
  const __m256i d = _mm256_packus_epi16(res0, res1);
761
  // d = _mm256_permute4x64_epi64(d, 0xD8);
762
4.82M
  _mm256_storeu_si256((__m256i *)dst, d);
763
4.82M
}
Unexecuted instantiation: convolve_avx2.c:xy_y_pack_store_32_avx2
764
765
static inline void xy_y_round_store_32_avx2(const __m256i r0[2],
766
                                            const __m256i r1[2],
767
4.66M
                                            uint8_t *const dst) {
768
4.66M
  const __m256i ra = xy_y_round_16_avx2(r0);
769
4.66M
  const __m256i rb = xy_y_round_16_avx2(r1);
770
4.66M
  xy_y_pack_store_32_avx2(ra, rb, dst);
771
4.66M
}
convolve_2d_avx2.c:xy_y_round_store_32_avx2
Line
Count
Source
767
4.66M
                                            uint8_t *const dst) {
768
4.66M
  const __m256i ra = xy_y_round_16_avx2(r0);
769
4.66M
  const __m256i rb = xy_y_round_16_avx2(r1);
770
4.66M
  xy_y_pack_store_32_avx2(ra, rb, dst);
771
4.66M
}
Unexecuted instantiation: convolve_avx2.c:xy_y_round_store_32_avx2
772
773
static inline void convolve_store_32_avx2(const __m256i res0,
774
                                          const __m256i res1,
775
3.65M
                                          uint8_t *const dst) {
776
3.65M
  const __m256i d = _mm256_packus_epi16(res0, res1);
777
3.65M
  _mm256_storeu_si256((__m256i *)dst, d);
778
3.65M
}
Unexecuted instantiation: convolve_2d_avx2.c:convolve_store_32_avx2
convolve_avx2.c:convolve_store_32_avx2
Line
Count
Source
775
3.65M
                                          uint8_t *const dst) {
776
3.65M
  const __m256i d = _mm256_packus_epi16(res0, res1);
777
3.65M
  _mm256_storeu_si256((__m256i *)dst, d);
778
3.65M
}
779
780
854k
static inline __m128i sr_x_round_sse2(const __m128i src) {
781
854k
  const __m128i round = _mm_set1_epi16(34);
782
854k
  const __m128i dst = _mm_add_epi16(src, round);
783
854k
  return _mm_srai_epi16(dst, 6);
784
854k
}
Unexecuted instantiation: convolve_2d_avx2.c:sr_x_round_sse2
convolve_avx2.c:sr_x_round_sse2
Line
Count
Source
780
854k
static inline __m128i sr_x_round_sse2(const __m128i src) {
781
854k
  const __m128i round = _mm_set1_epi16(34);
782
854k
  const __m128i dst = _mm_add_epi16(src, round);
783
854k
  return _mm_srai_epi16(dst, 6);
784
854k
}
785
786
5.82M
static inline __m256i sr_x_round_avx2(const __m256i src) {
787
5.82M
  const __m256i round = _mm256_set1_epi16(34);
788
5.82M
  const __m256i dst = _mm256_add_epi16(src, round);
789
5.82M
  return _mm256_srai_epi16(dst, 6);
790
5.82M
}
Unexecuted instantiation: convolve_2d_avx2.c:sr_x_round_avx2
convolve_avx2.c:sr_x_round_avx2
Line
Count
Source
786
5.82M
static inline __m256i sr_x_round_avx2(const __m256i src) {
787
5.82M
  const __m256i round = _mm256_set1_epi16(34);
788
5.82M
  const __m256i dst = _mm256_add_epi16(src, round);
789
5.82M
  return _mm256_srai_epi16(dst, 6);
790
5.82M
}
791
792
1.10M
static inline __m128i sr_y_round_sse2(const __m128i src) {
793
1.10M
  const __m128i round = _mm_set1_epi16(32);
794
1.10M
  const __m128i dst = _mm_add_epi16(src, round);
795
1.10M
  return _mm_srai_epi16(dst, FILTER_BITS - 1);
796
1.10M
}
Unexecuted instantiation: convolve_2d_avx2.c:sr_y_round_sse2
convolve_avx2.c:sr_y_round_sse2
Line
Count
Source
792
1.10M
static inline __m128i sr_y_round_sse2(const __m128i src) {
793
1.10M
  const __m128i round = _mm_set1_epi16(32);
794
1.10M
  const __m128i dst = _mm_add_epi16(src, round);
795
1.10M
  return _mm_srai_epi16(dst, FILTER_BITS - 1);
796
1.10M
}
797
798
static inline void sr_x_round_store_8x2_avx2(const __m256i res,
799
                                             uint8_t *const dst,
800
680k
                                             const ptrdiff_t dst_stride) {
801
680k
  const __m256i r = sr_x_round_avx2(res);
802
680k
  pack_store_8x2_avx2(r, dst, dst_stride);
803
680k
}
Unexecuted instantiation: convolve_2d_avx2.c:sr_x_round_store_8x2_avx2
convolve_avx2.c:sr_x_round_store_8x2_avx2
Line
Count
Source
800
680k
                                             const ptrdiff_t dst_stride) {
801
680k
  const __m256i r = sr_x_round_avx2(res);
802
680k
  pack_store_8x2_avx2(r, dst, dst_stride);
803
680k
}
804
805
static inline void sr_x_round_store_16x2_avx2(const __m256i res[2],
806
                                              uint8_t *const dst,
807
577k
                                              const ptrdiff_t dst_stride) {
808
577k
  __m256i r[2];
809
810
577k
  r[0] = sr_x_round_avx2(res[0]);
811
577k
  r[1] = sr_x_round_avx2(res[1]);
812
577k
  pack_store_16x2_avx2(r[0], r[1], dst, dst_stride);
813
577k
}
Unexecuted instantiation: convolve_2d_avx2.c:sr_x_round_store_16x2_avx2
convolve_avx2.c:sr_x_round_store_16x2_avx2
Line
Count
Source
807
577k
                                              const ptrdiff_t dst_stride) {
808
577k
  __m256i r[2];
809
810
577k
  r[0] = sr_x_round_avx2(res[0]);
811
577k
  r[1] = sr_x_round_avx2(res[1]);
812
577k
  pack_store_16x2_avx2(r[0], r[1], dst, dst_stride);
813
577k
}
814
815
static inline void sr_x_round_store_32_avx2(const __m256i res[2],
816
1.50M
                                            uint8_t *const dst) {
817
1.50M
  __m256i r[2];
818
819
1.50M
  r[0] = sr_x_round_avx2(res[0]);
820
1.50M
  r[1] = sr_x_round_avx2(res[1]);
821
1.50M
  convolve_store_32_avx2(r[0], r[1], dst);
822
1.50M
}
Unexecuted instantiation: convolve_2d_avx2.c:sr_x_round_store_32_avx2
convolve_avx2.c:sr_x_round_store_32_avx2
Line
Count
Source
816
1.50M
                                            uint8_t *const dst) {
817
1.50M
  __m256i r[2];
818
819
1.50M
  r[0] = sr_x_round_avx2(res[0]);
820
1.50M
  r[1] = sr_x_round_avx2(res[1]);
821
1.50M
  convolve_store_32_avx2(r[0], r[1], dst);
822
1.50M
}
823
824
static inline void sr_y_round_store_8x2_avx2(const __m256i res,
825
                                             uint8_t *const dst,
826
901k
                                             const ptrdiff_t dst_stride) {
827
901k
  const __m256i r = sr_y_round_avx2(res);
828
901k
  pack_store_8x2_avx2(r, dst, dst_stride);
829
901k
}
Unexecuted instantiation: convolve_2d_avx2.c:sr_y_round_store_8x2_avx2
convolve_avx2.c:sr_y_round_store_8x2_avx2
Line
Count
Source
826
901k
                                             const ptrdiff_t dst_stride) {
827
901k
  const __m256i r = sr_y_round_avx2(res);
828
901k
  pack_store_8x2_avx2(r, dst, dst_stride);
829
901k
}
830
831
static inline void sr_y_round_store_16x2_avx2(const __m256i res[2],
832
                                              uint8_t *const dst,
833
791k
                                              const ptrdiff_t dst_stride) {
834
791k
  __m256i r[2];
835
836
791k
  r[0] = sr_y_round_avx2(res[0]);
837
791k
  r[1] = sr_y_round_avx2(res[1]);
838
791k
  pack_store_16x2_avx2(r[0], r[1], dst, dst_stride);
839
791k
}
Unexecuted instantiation: convolve_2d_avx2.c:sr_y_round_store_16x2_avx2
convolve_avx2.c:sr_y_round_store_16x2_avx2
Line
Count
Source
833
791k
                                              const ptrdiff_t dst_stride) {
834
791k
  __m256i r[2];
835
836
791k
  r[0] = sr_y_round_avx2(res[0]);
837
791k
  r[1] = sr_y_round_avx2(res[1]);
838
791k
  pack_store_16x2_avx2(r[0], r[1], dst, dst_stride);
839
791k
}
840
841
static inline void sr_y_2tap_32_avg_avx2(const uint8_t *const src,
842
                                         const __m256i s0, __m256i *const s1,
843
90.2k
                                         uint8_t *const dst) {
844
90.2k
  *s1 = _mm256_loadu_si256((__m256i *)src);
845
90.2k
  const __m256i d = _mm256_avg_epu8(s0, *s1);
846
90.2k
  _mm256_storeu_si256((__m256i *)dst, d);
847
90.2k
}
Unexecuted instantiation: convolve_2d_avx2.c:sr_y_2tap_32_avg_avx2
convolve_avx2.c:sr_y_2tap_32_avg_avx2
Line
Count
Source
843
90.2k
                                         uint8_t *const dst) {
844
90.2k
  *s1 = _mm256_loadu_si256((__m256i *)src);
845
90.2k
  const __m256i d = _mm256_avg_epu8(s0, *s1);
846
90.2k
  _mm256_storeu_si256((__m256i *)dst, d);
847
90.2k
}
848
849
static inline void sr_x_2tap_32_avg_avx2(const uint8_t *const src,
850
163k
                                         uint8_t *const dst) {
851
163k
  const __m256i s0 = _mm256_loadu_si256((__m256i *)src);
852
163k
  const __m256i s1 = _mm256_loadu_si256((__m256i *)(src + 1));
853
163k
  const __m256i d = _mm256_avg_epu8(s0, s1);
854
163k
  _mm256_storeu_si256((__m256i *)dst, d);
855
163k
}
Unexecuted instantiation: convolve_2d_avx2.c:sr_x_2tap_32_avg_avx2
convolve_avx2.c:sr_x_2tap_32_avg_avx2
Line
Count
Source
850
163k
                                         uint8_t *const dst) {
851
163k
  const __m256i s0 = _mm256_loadu_si256((__m256i *)src);
852
163k
  const __m256i s1 = _mm256_loadu_si256((__m256i *)(src + 1));
853
163k
  const __m256i d = _mm256_avg_epu8(s0, s1);
854
163k
  _mm256_storeu_si256((__m256i *)dst, d);
855
163k
}
856
857
static inline __m128i x_convolve_2tap_2x2_sse4_1(const uint8_t *const src,
858
                                                 const ptrdiff_t stride,
859
36.6k
                                                 const __m128i coeffs[1]) {
860
36.6k
  const __m128i sfl =
861
36.6k
      _mm_setr_epi8(0, 1, 1, 2, 4, 5, 5, 6, 0, 0, 0, 0, 0, 0, 0, 0);
862
36.6k
  const __m128i s_128 = load_u8_4x2_sse4_1(src, stride);
863
36.6k
  const __m128i ss = _mm_shuffle_epi8(s_128, sfl);
864
36.6k
  return convolve_2tap_ssse3(&ss, coeffs);
865
36.6k
}
convolve_2d_avx2.c:x_convolve_2tap_2x2_sse4_1
Line
Count
Source
859
31.9k
                                                 const __m128i coeffs[1]) {
860
31.9k
  const __m128i sfl =
861
31.9k
      _mm_setr_epi8(0, 1, 1, 2, 4, 5, 5, 6, 0, 0, 0, 0, 0, 0, 0, 0);
862
31.9k
  const __m128i s_128 = load_u8_4x2_sse4_1(src, stride);
863
31.9k
  const __m128i ss = _mm_shuffle_epi8(s_128, sfl);
864
31.9k
  return convolve_2tap_ssse3(&ss, coeffs);
865
31.9k
}
convolve_avx2.c:x_convolve_2tap_2x2_sse4_1
Line
Count
Source
859
4.66k
                                                 const __m128i coeffs[1]) {
860
4.66k
  const __m128i sfl =
861
4.66k
      _mm_setr_epi8(0, 1, 1, 2, 4, 5, 5, 6, 0, 0, 0, 0, 0, 0, 0, 0);
862
4.66k
  const __m128i s_128 = load_u8_4x2_sse4_1(src, stride);
863
4.66k
  const __m128i ss = _mm_shuffle_epi8(s_128, sfl);
864
4.66k
  return convolve_2tap_ssse3(&ss, coeffs);
865
4.66k
}
866
867
static inline __m128i x_convolve_2tap_4x2_ssse3(const uint8_t *const src,
868
                                                const ptrdiff_t stride,
869
144k
                                                const __m128i coeffs[1]) {
870
144k
  const __m128i sfl =
871
144k
      _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12);
872
144k
  const __m128i s_128 = load_u8_8x2_sse2(src, stride);
873
144k
  const __m128i ss = _mm_shuffle_epi8(s_128, sfl);
874
144k
  return convolve_2tap_ssse3(&ss, coeffs);
875
144k
}
convolve_2d_avx2.c:x_convolve_2tap_4x2_ssse3
Line
Count
Source
869
119k
                                                const __m128i coeffs[1]) {
870
119k
  const __m128i sfl =
871
119k
      _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12);
872
119k
  const __m128i s_128 = load_u8_8x2_sse2(src, stride);
873
119k
  const __m128i ss = _mm_shuffle_epi8(s_128, sfl);
874
119k
  return convolve_2tap_ssse3(&ss, coeffs);
875
119k
}
convolve_avx2.c:x_convolve_2tap_4x2_ssse3
Line
Count
Source
869
25.4k
                                                const __m128i coeffs[1]) {
870
25.4k
  const __m128i sfl =
871
25.4k
      _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12);
872
25.4k
  const __m128i s_128 = load_u8_8x2_sse2(src, stride);
873
25.4k
  const __m128i ss = _mm_shuffle_epi8(s_128, sfl);
874
25.4k
  return convolve_2tap_ssse3(&ss, coeffs);
875
25.4k
}
876
877
static inline void x_convolve_2tap_8x2_ssse3(const uint8_t *const src,
878
                                             const ptrdiff_t stride,
879
                                             const __m128i coeffs[1],
880
125k
                                             __m128i r[2]) {
881
125k
  __m128i ss[2];
882
125k
  const __m128i s00 = _mm_loadu_si128((__m128i *)src);
883
125k
  const __m128i s10 = _mm_loadu_si128((__m128i *)(src + stride));
884
125k
  const __m128i s01 = _mm_srli_si128(s00, 1);
885
125k
  const __m128i s11 = _mm_srli_si128(s10, 1);
886
125k
  ss[0] = _mm_unpacklo_epi8(s00, s01);
887
125k
  ss[1] = _mm_unpacklo_epi8(s10, s11);
888
889
125k
  r[0] = convolve_2tap_ssse3(&ss[0], coeffs);
890
125k
  r[1] = convolve_2tap_ssse3(&ss[1], coeffs);
891
125k
}
convolve_2d_avx2.c:x_convolve_2tap_8x2_ssse3
Line
Count
Source
880
99.5k
                                             __m128i r[2]) {
881
99.5k
  __m128i ss[2];
882
99.5k
  const __m128i s00 = _mm_loadu_si128((__m128i *)src);
883
99.5k
  const __m128i s10 = _mm_loadu_si128((__m128i *)(src + stride));
884
99.5k
  const __m128i s01 = _mm_srli_si128(s00, 1);
885
99.5k
  const __m128i s11 = _mm_srli_si128(s10, 1);
886
99.5k
  ss[0] = _mm_unpacklo_epi8(s00, s01);
887
99.5k
  ss[1] = _mm_unpacklo_epi8(s10, s11);
888
889
99.5k
  r[0] = convolve_2tap_ssse3(&ss[0], coeffs);
890
99.5k
  r[1] = convolve_2tap_ssse3(&ss[1], coeffs);
891
99.5k
}
convolve_avx2.c:x_convolve_2tap_8x2_ssse3
Line
Count
Source
880
25.7k
                                             __m128i r[2]) {
881
25.7k
  __m128i ss[2];
882
25.7k
  const __m128i s00 = _mm_loadu_si128((__m128i *)src);
883
25.7k
  const __m128i s10 = _mm_loadu_si128((__m128i *)(src + stride));
884
25.7k
  const __m128i s01 = _mm_srli_si128(s00, 1);
885
25.7k
  const __m128i s11 = _mm_srli_si128(s10, 1);
886
25.7k
  ss[0] = _mm_unpacklo_epi8(s00, s01);
887
25.7k
  ss[1] = _mm_unpacklo_epi8(s10, s11);
888
889
25.7k
  r[0] = convolve_2tap_ssse3(&ss[0], coeffs);
890
25.7k
  r[1] = convolve_2tap_ssse3(&ss[1], coeffs);
891
25.7k
}
892
893
static inline __m256i x_convolve_2tap_8x2_avx2(const uint8_t *const src,
894
                                               const ptrdiff_t stride,
895
0
                                               const __m256i coeffs[1]) {
896
0
  __m128i s_128[2][2];
897
0
  __m256i s_256[2];
898
0
899
0
  s_128[0][0] = _mm_loadu_si128((__m128i *)src);
900
0
  s_128[1][0] = _mm_loadu_si128((__m128i *)(src + stride));
901
0
  s_128[0][1] = _mm_srli_si128(s_128[0][0], 1);
902
0
  s_128[1][1] = _mm_srli_si128(s_128[1][0], 1);
903
0
  s_256[0] = _mm256_setr_m128i(s_128[0][0], s_128[1][0]);
904
0
  s_256[1] = _mm256_setr_m128i(s_128[0][1], s_128[1][1]);
905
0
  const __m256i ss = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
906
0
  return convolve_2tap_avx2(&ss, coeffs);
907
0
}
Unexecuted instantiation: convolve_2d_avx2.c:x_convolve_2tap_8x2_avx2
Unexecuted instantiation: convolve_avx2.c:x_convolve_2tap_8x2_avx2
908
909
static inline void x_convolve_2tap_16x2_avx2(const uint8_t *const src,
910
                                             const ptrdiff_t stride,
911
                                             const __m256i coeffs[1],
912
87.9k
                                             __m256i r[2]) {
913
87.9k
  const __m256i s0_256 = loadu_8bit_16x2_avx2(src, stride);
914
87.9k
  const __m256i s1_256 = loadu_8bit_16x2_avx2(src + 1, stride);
915
87.9k
  const __m256i s0 = _mm256_unpacklo_epi8(s0_256, s1_256);
916
87.9k
  const __m256i s1 = _mm256_unpackhi_epi8(s0_256, s1_256);
917
87.9k
  r[0] = convolve_2tap_avx2(&s0, coeffs);
918
87.9k
  r[1] = convolve_2tap_avx2(&s1, coeffs);
919
87.9k
}
convolve_2d_avx2.c:x_convolve_2tap_16x2_avx2
Line
Count
Source
912
69.6k
                                             __m256i r[2]) {
913
69.6k
  const __m256i s0_256 = loadu_8bit_16x2_avx2(src, stride);
914
69.6k
  const __m256i s1_256 = loadu_8bit_16x2_avx2(src + 1, stride);
915
69.6k
  const __m256i s0 = _mm256_unpacklo_epi8(s0_256, s1_256);
916
69.6k
  const __m256i s1 = _mm256_unpackhi_epi8(s0_256, s1_256);
917
69.6k
  r[0] = convolve_2tap_avx2(&s0, coeffs);
918
69.6k
  r[1] = convolve_2tap_avx2(&s1, coeffs);
919
69.6k
}
convolve_avx2.c:x_convolve_2tap_16x2_avx2
Line
Count
Source
912
18.3k
                                             __m256i r[2]) {
913
18.3k
  const __m256i s0_256 = loadu_8bit_16x2_avx2(src, stride);
914
18.3k
  const __m256i s1_256 = loadu_8bit_16x2_avx2(src + 1, stride);
915
18.3k
  const __m256i s0 = _mm256_unpacklo_epi8(s0_256, s1_256);
916
18.3k
  const __m256i s1 = _mm256_unpackhi_epi8(s0_256, s1_256);
917
18.3k
  r[0] = convolve_2tap_avx2(&s0, coeffs);
918
18.3k
  r[1] = convolve_2tap_avx2(&s1, coeffs);
919
18.3k
}
920
921
static inline void x_convolve_2tap_32_avx2(const uint8_t *const src,
922
                                           const __m256i coeffs[1],
923
175k
                                           __m256i r[2]) {
924
175k
  const __m256i s0 = _mm256_loadu_si256((__m256i *)src);
925
175k
  const __m256i s1 = _mm256_loadu_si256((__m256i *)(src + 1));
926
175k
  const __m256i ss0 = _mm256_unpacklo_epi8(s0, s1);
927
175k
  const __m256i ss1 = _mm256_unpackhi_epi8(s0, s1);
928
929
175k
  r[0] = convolve_2tap_avx2(&ss0, coeffs);
930
175k
  r[1] = convolve_2tap_avx2(&ss1, coeffs);
931
175k
}
Unexecuted instantiation: convolve_2d_avx2.c:x_convolve_2tap_32_avx2
convolve_avx2.c:x_convolve_2tap_32_avx2
Line
Count
Source
923
175k
                                           __m256i r[2]) {
924
175k
  const __m256i s0 = _mm256_loadu_si256((__m256i *)src);
925
175k
  const __m256i s1 = _mm256_loadu_si256((__m256i *)(src + 1));
926
175k
  const __m256i ss0 = _mm256_unpacklo_epi8(s0, s1);
927
175k
  const __m256i ss1 = _mm256_unpackhi_epi8(s0, s1);
928
929
175k
  r[0] = convolve_2tap_avx2(&ss0, coeffs);
930
175k
  r[1] = convolve_2tap_avx2(&ss1, coeffs);
931
175k
}
932
933
static inline __m128i x_convolve_4tap_2x2_ssse3(const uint8_t *const src,
934
                                                const ptrdiff_t stride,
935
967k
                                                const __m128i coeffs[2]) {
936
967k
  const __m128i sfl0 =
937
967k
      _mm_setr_epi8(0, 1, 1, 2, 8, 9, 9, 10, 0, 0, 0, 0, 0, 0, 0, 0);
938
967k
  const __m128i sfl1 =
939
967k
      _mm_setr_epi8(2, 3, 3, 4, 10, 11, 11, 12, 0, 0, 0, 0, 0, 0, 0, 0);
940
967k
  const __m128i s = load_u8_8x2_sse2(src, stride);
941
967k
  __m128i ss[2];
942
943
967k
  ss[0] = _mm_shuffle_epi8(s, sfl0);
944
967k
  ss[1] = _mm_shuffle_epi8(s, sfl1);
945
967k
  return convolve_4tap_ssse3(ss, coeffs);
946
967k
}
convolve_2d_avx2.c:x_convolve_4tap_2x2_ssse3
Line
Count
Source
935
845k
                                                const __m128i coeffs[2]) {
936
845k
  const __m128i sfl0 =
937
845k
      _mm_setr_epi8(0, 1, 1, 2, 8, 9, 9, 10, 0, 0, 0, 0, 0, 0, 0, 0);
938
845k
  const __m128i sfl1 =
939
845k
      _mm_setr_epi8(2, 3, 3, 4, 10, 11, 11, 12, 0, 0, 0, 0, 0, 0, 0, 0);
940
845k
  const __m128i s = load_u8_8x2_sse2(src, stride);
941
845k
  __m128i ss[2];
942
943
845k
  ss[0] = _mm_shuffle_epi8(s, sfl0);
944
845k
  ss[1] = _mm_shuffle_epi8(s, sfl1);
945
845k
  return convolve_4tap_ssse3(ss, coeffs);
946
845k
}
convolve_avx2.c:x_convolve_4tap_2x2_ssse3
Line
Count
Source
935
121k
                                                const __m128i coeffs[2]) {
936
121k
  const __m128i sfl0 =
937
121k
      _mm_setr_epi8(0, 1, 1, 2, 8, 9, 9, 10, 0, 0, 0, 0, 0, 0, 0, 0);
938
121k
  const __m128i sfl1 =
939
121k
      _mm_setr_epi8(2, 3, 3, 4, 10, 11, 11, 12, 0, 0, 0, 0, 0, 0, 0, 0);
940
121k
  const __m128i s = load_u8_8x2_sse2(src, stride);
941
121k
  __m128i ss[2];
942
943
121k
  ss[0] = _mm_shuffle_epi8(s, sfl0);
944
121k
  ss[1] = _mm_shuffle_epi8(s, sfl1);
945
121k
  return convolve_4tap_ssse3(ss, coeffs);
946
121k
}
947
948
static inline __m128i x_convolve_4tap_4x2_ssse3(const uint8_t *const src,
949
                                                const ptrdiff_t stride,
950
4.23M
                                                const __m128i coeffs[2]) {
951
4.23M
  const __m128i s = load_u8_8x2_sse2(src, stride);
952
4.23M
  const __m128i sfl0 =
953
4.23M
      _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12);
954
4.23M
  const __m128i sfl1 =
955
4.23M
      _mm_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 10, 11, 11, 12, 12, 13, 13, 14);
956
4.23M
  __m128i ss[2];
957
958
4.23M
  ss[0] = _mm_shuffle_epi8(s, sfl0);
959
4.23M
  ss[1] = _mm_shuffle_epi8(s, sfl1);
960
4.23M
  return convolve_4tap_ssse3(ss, coeffs);
961
4.23M
}
convolve_2d_avx2.c:x_convolve_4tap_4x2_ssse3
Line
Count
Source
950
3.58M
                                                const __m128i coeffs[2]) {
951
3.58M
  const __m128i s = load_u8_8x2_sse2(src, stride);
952
3.58M
  const __m128i sfl0 =
953
3.58M
      _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12);
954
3.58M
  const __m128i sfl1 =
955
3.58M
      _mm_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 10, 11, 11, 12, 12, 13, 13, 14);
956
3.58M
  __m128i ss[2];
957
958
3.58M
  ss[0] = _mm_shuffle_epi8(s, sfl0);
959
3.58M
  ss[1] = _mm_shuffle_epi8(s, sfl1);
960
3.58M
  return convolve_4tap_ssse3(ss, coeffs);
961
3.58M
}
convolve_avx2.c:x_convolve_4tap_4x2_ssse3
Line
Count
Source
950
651k
                                                const __m128i coeffs[2]) {
951
651k
  const __m128i s = load_u8_8x2_sse2(src, stride);
952
651k
  const __m128i sfl0 =
953
651k
      _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12);
954
651k
  const __m128i sfl1 =
955
651k
      _mm_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 10, 11, 11, 12, 12, 13, 13, 14);
956
651k
  __m128i ss[2];
957
958
651k
  ss[0] = _mm_shuffle_epi8(s, sfl0);
959
651k
  ss[1] = _mm_shuffle_epi8(s, sfl1);
960
651k
  return convolve_4tap_ssse3(ss, coeffs);
961
651k
}
962
963
static inline __m256i x_convolve_4tap_8x2_avx2(const uint8_t *const src,
964
                                               const ptrdiff_t stride,
965
                                               const __m256i coeffs[2],
966
407k
                                               const __m256i filt[2]) {
967
407k
  const __m256i s_256 = loadu_8bit_16x2_avx2(src, stride);
968
407k
  return x_convolve_4tap_avx2(s_256, coeffs, filt);
969
407k
}
convolve_2d_avx2.c:x_convolve_4tap_8x2_avx2
Line
Count
Source
966
407k
                                               const __m256i filt[2]) {
967
407k
  const __m256i s_256 = loadu_8bit_16x2_avx2(src, stride);
968
407k
  return x_convolve_4tap_avx2(s_256, coeffs, filt);
969
407k
}
Unexecuted instantiation: convolve_avx2.c:x_convolve_4tap_8x2_avx2
970
971
static inline void x_convolve_4tap_16x2_avx2(const uint8_t *const src,
972
                                             const int32_t src_stride,
973
                                             const __m256i coeffs[2],
974
                                             const __m256i filt[2],
975
117k
                                             __m256i r[2]) {
976
117k
  r[0] = x_convolve_4tap_8x2_avx2(src + 0, src_stride, coeffs, filt);
977
117k
  r[1] = x_convolve_4tap_8x2_avx2(src + 8, src_stride, coeffs, filt);
978
117k
}
convolve_2d_avx2.c:x_convolve_4tap_16x2_avx2
Line
Count
Source
975
117k
                                             __m256i r[2]) {
976
117k
  r[0] = x_convolve_4tap_8x2_avx2(src + 0, src_stride, coeffs, filt);
977
117k
  r[1] = x_convolve_4tap_8x2_avx2(src + 8, src_stride, coeffs, filt);
978
117k
}
Unexecuted instantiation: convolve_avx2.c:x_convolve_4tap_16x2_avx2
979
980
static inline void x_convolve_4tap_32_avx2(const uint8_t *const src,
981
                                           const __m256i coeffs[2],
982
                                           const __m256i filt[2],
983
694k
                                           __m256i r[2]) {
984
694k
  const __m256i s0_256 = _mm256_loadu_si256((__m256i *)src);
985
694k
  const __m256i s1_256 = _mm256_loadu_si256((__m256i *)(src + 8));
986
987
694k
  r[0] = x_convolve_4tap_avx2(s0_256, coeffs, filt);
988
694k
  r[1] = x_convolve_4tap_avx2(s1_256, coeffs, filt);
989
694k
}
convolve_2d_avx2.c:x_convolve_4tap_32_avx2
Line
Count
Source
983
694k
                                           __m256i r[2]) {
984
694k
  const __m256i s0_256 = _mm256_loadu_si256((__m256i *)src);
985
694k
  const __m256i s1_256 = _mm256_loadu_si256((__m256i *)(src + 8));
986
987
694k
  r[0] = x_convolve_4tap_avx2(s0_256, coeffs, filt);
988
694k
  r[1] = x_convolve_4tap_avx2(s1_256, coeffs, filt);
989
694k
}
Unexecuted instantiation: convolve_avx2.c:x_convolve_4tap_32_avx2
990
991
static inline __m128i x_convolve_6tap_2x2_ssse3(const uint8_t *const src,
992
                                                const ptrdiff_t stride,
993
0
                                                const __m128i coeffs[3]) {
994
0
  const __m128i sfl0 =
995
0
      _mm_setr_epi8(0, 1, 1, 2, 8, 9, 9, 10, 0, 0, 0, 0, 0, 0, 0, 0);
996
0
  const __m128i sfl1 =
997
0
      _mm_setr_epi8(2, 3, 3, 4, 10, 11, 11, 12, 0, 0, 0, 0, 0, 0, 0, 0);
998
0
  const __m128i sfl2 =
999
0
      _mm_setr_epi8(4, 5, 5, 6, 12, 13, 13, 14, 0, 0, 0, 0, 0, 0, 0, 0);
1000
1001
0
  const __m128i s = load_u8_8x2_sse2(src, stride);
1002
0
  __m128i ss[3];
1003
1004
0
  ss[0] = _mm_shuffle_epi8(s, sfl0);
1005
0
  ss[1] = _mm_shuffle_epi8(s, sfl1);
1006
0
  ss[2] = _mm_shuffle_epi8(s, sfl2);
1007
0
  return convolve_6tap_ssse3(ss, coeffs);
1008
0
}
Unexecuted instantiation: convolve_2d_avx2.c:x_convolve_6tap_2x2_ssse3
Unexecuted instantiation: convolve_avx2.c:x_convolve_6tap_2x2_ssse3
1009
1010
static inline __m128i x_convolve_6tap_4x2_ssse3(const uint8_t *const src,
1011
                                                const ptrdiff_t stride,
1012
0
                                                const __m128i coeffs[3]) {
1013
0
  const __m128i s = load_u8_8x2_sse2(src, stride);
1014
0
  const __m128i sfl0 =
1015
0
      _mm_setr_epi8(0, 1, 1, 2, 8, 9, 9, 10, 0, 0, 0, 0, 0, 0, 0, 0);
1016
0
  const __m128i sfl1 =
1017
0
      _mm_setr_epi8(2, 3, 3, 4, 10, 11, 11, 12, 0, 0, 0, 0, 0, 0, 0, 0);
1018
0
  const __m128i sfl2 =
1019
0
      _mm_setr_epi8(4, 5, 5, 6, 12, 13, 13, 14, 0, 0, 0, 0, 0, 0, 0, 0);
1020
0
  __m128i ss[3];
1021
1022
0
  ss[0] = _mm_shuffle_epi8(s, sfl0);
1023
0
  ss[1] = _mm_shuffle_epi8(s, sfl1);
1024
0
  ss[2] = _mm_shuffle_epi8(s, sfl2);
1025
0
  return convolve_6tap_ssse3(ss, coeffs);
1026
0
}
Unexecuted instantiation: convolve_2d_avx2.c:x_convolve_6tap_4x2_ssse3
Unexecuted instantiation: convolve_avx2.c:x_convolve_6tap_4x2_ssse3
1027
1028
static inline __m256i x_convolve_6tap_8x2_avx2(const uint8_t *const src,
1029
                                               const ptrdiff_t stride,
1030
                                               const __m256i coeffs[3],
1031
9.36M
                                               const __m256i filt[3]) {
1032
9.36M
  const __m256i s_256 = loadu_8bit_16x2_avx2(src, stride);
1033
9.36M
  return x_convolve_6tap_avx2(s_256, coeffs, filt);
1034
9.36M
}
convolve_2d_avx2.c:x_convolve_6tap_8x2_avx2
Line
Count
Source
1031
7.68M
                                               const __m256i filt[3]) {
1032
7.68M
  const __m256i s_256 = loadu_8bit_16x2_avx2(src, stride);
1033
7.68M
  return x_convolve_6tap_avx2(s_256, coeffs, filt);
1034
7.68M
}
convolve_avx2.c:x_convolve_6tap_8x2_avx2
Line
Count
Source
1031
1.68M
                                               const __m256i filt[3]) {
1032
1.68M
  const __m256i s_256 = loadu_8bit_16x2_avx2(src, stride);
1033
1.68M
  return x_convolve_6tap_avx2(s_256, coeffs, filt);
1034
1.68M
}
1035
1036
static inline void x_convolve_6tap_16x2_avx2(const uint8_t *const src,
1037
                                             const int32_t src_stride,
1038
                                             const __m256i coeffs[3],
1039
                                             const __m256i filt[3],
1040
2.72M
                                             __m256i r[2]) {
1041
2.72M
  r[0] = x_convolve_6tap_8x2_avx2(src + 0, src_stride, coeffs, filt);
1042
2.72M
  r[1] = x_convolve_6tap_8x2_avx2(src + 8, src_stride, coeffs, filt);
1043
2.72M
}
convolve_2d_avx2.c:x_convolve_6tap_16x2_avx2
Line
Count
Source
1040
2.19M
                                             __m256i r[2]) {
1041
2.19M
  r[0] = x_convolve_6tap_8x2_avx2(src + 0, src_stride, coeffs, filt);
1042
2.19M
  r[1] = x_convolve_6tap_8x2_avx2(src + 8, src_stride, coeffs, filt);
1043
2.19M
}
convolve_avx2.c:x_convolve_6tap_16x2_avx2
Line
Count
Source
1040
522k
                                             __m256i r[2]) {
1041
522k
  r[0] = x_convolve_6tap_8x2_avx2(src + 0, src_stride, coeffs, filt);
1042
522k
  r[1] = x_convolve_6tap_8x2_avx2(src + 8, src_stride, coeffs, filt);
1043
522k
}
1044
1045
static inline void x_convolve_6tap_32_avx2(const uint8_t *const src,
1046
                                           const __m256i coeffs[3],
1047
                                           const __m256i filt[3],
1048
4.49M
                                           __m256i r[2]) {
1049
4.49M
  const __m256i s0_256 = _mm256_loadu_si256((__m256i *)src);
1050
4.49M
  const __m256i s1_256 = _mm256_loadu_si256((__m256i *)(src + 8));
1051
1052
4.49M
  r[0] = x_convolve_6tap_avx2(s0_256, coeffs, filt);
1053
4.49M
  r[1] = x_convolve_6tap_avx2(s1_256, coeffs, filt);
1054
4.49M
}
convolve_2d_avx2.c:x_convolve_6tap_32_avx2
Line
Count
Source
1048
3.35M
                                           __m256i r[2]) {
1049
3.35M
  const __m256i s0_256 = _mm256_loadu_si256((__m256i *)src);
1050
3.35M
  const __m256i s1_256 = _mm256_loadu_si256((__m256i *)(src + 8));
1051
1052
3.35M
  r[0] = x_convolve_6tap_avx2(s0_256, coeffs, filt);
1053
3.35M
  r[1] = x_convolve_6tap_avx2(s1_256, coeffs, filt);
1054
3.35M
}
convolve_avx2.c:x_convolve_6tap_32_avx2
Line
Count
Source
1048
1.14M
                                           __m256i r[2]) {
1049
1.14M
  const __m256i s0_256 = _mm256_loadu_si256((__m256i *)src);
1050
1.14M
  const __m256i s1_256 = _mm256_loadu_si256((__m256i *)(src + 8));
1051
1052
1.14M
  r[0] = x_convolve_6tap_avx2(s0_256, coeffs, filt);
1053
1.14M
  r[1] = x_convolve_6tap_avx2(s1_256, coeffs, filt);
1054
1.14M
}
1055
1056
static inline __m256i x_convolve_8tap_8x2_avx2(const uint8_t *const src,
1057
                                               const ptrdiff_t stride,
1058
                                               const __m256i coeffs[4],
1059
579k
                                               const __m256i filt[4]) {
1060
579k
  const __m256i s_256 = loadu_8bit_16x2_avx2(src, stride);
1061
579k
  return x_convolve_8tap_avx2(s_256, coeffs, filt);
1062
579k
}
convolve_2d_avx2.c:x_convolve_8tap_8x2_avx2
Line
Count
Source
1059
461k
                                               const __m256i filt[4]) {
1060
461k
  const __m256i s_256 = loadu_8bit_16x2_avx2(src, stride);
1061
461k
  return x_convolve_8tap_avx2(s_256, coeffs, filt);
1062
461k
}
convolve_avx2.c:x_convolve_8tap_8x2_avx2
Line
Count
Source
1059
117k
                                               const __m256i filt[4]) {
1060
117k
  const __m256i s_256 = loadu_8bit_16x2_avx2(src, stride);
1061
117k
  return x_convolve_8tap_avx2(s_256, coeffs, filt);
1062
117k
}
1063
1064
static AOM_FORCE_INLINE void x_convolve_8tap_16x2_avx2(const uint8_t *const src,
1065
                                                       const int32_t src_stride,
1066
                                                       const __m256i coeffs[4],
1067
                                                       const __m256i filt[4],
1068
166k
                                                       __m256i r[2]) {
1069
166k
  r[0] = x_convolve_8tap_8x2_avx2(src + 0, src_stride, coeffs, filt);
1070
166k
  r[1] = x_convolve_8tap_8x2_avx2(src + 8, src_stride, coeffs, filt);
1071
166k
}
convolve_2d_avx2.c:x_convolve_8tap_16x2_avx2
Line
Count
Source
1068
129k
                                                       __m256i r[2]) {
1069
129k
  r[0] = x_convolve_8tap_8x2_avx2(src + 0, src_stride, coeffs, filt);
1070
129k
  r[1] = x_convolve_8tap_8x2_avx2(src + 8, src_stride, coeffs, filt);
1071
129k
}
convolve_avx2.c:x_convolve_8tap_16x2_avx2
Line
Count
Source
1068
37.1k
                                                       __m256i r[2]) {
1069
37.1k
  r[0] = x_convolve_8tap_8x2_avx2(src + 0, src_stride, coeffs, filt);
1070
37.1k
  r[1] = x_convolve_8tap_8x2_avx2(src + 8, src_stride, coeffs, filt);
1071
37.1k
}
1072
1073
static AOM_FORCE_INLINE void x_convolve_8tap_32_avx2(const uint8_t *const src,
1074
                                                     const __m256i coeffs[4],
1075
                                                     const __m256i filt[4],
1076
1.35M
                                                     __m256i r[2]) {
1077
1.35M
  const __m256i s0_256 = _mm256_loadu_si256((__m256i *)src);
1078
1.35M
  const __m256i s1_256 = _mm256_loadu_si256((__m256i *)(src + 8));
1079
1080
1.35M
  r[0] = x_convolve_8tap_avx2(s0_256, coeffs, filt);
1081
1.35M
  r[1] = x_convolve_8tap_avx2(s1_256, coeffs, filt);
1082
1.35M
}
convolve_2d_avx2.c:x_convolve_8tap_32_avx2
Line
Count
Source
1076
1.17M
                                                     __m256i r[2]) {
1077
1.17M
  const __m256i s0_256 = _mm256_loadu_si256((__m256i *)src);
1078
1.17M
  const __m256i s1_256 = _mm256_loadu_si256((__m256i *)(src + 8));
1079
1080
1.17M
  r[0] = x_convolve_8tap_avx2(s0_256, coeffs, filt);
1081
1.17M
  r[1] = x_convolve_8tap_avx2(s1_256, coeffs, filt);
1082
1.17M
}
convolve_avx2.c:x_convolve_8tap_32_avx2
Line
Count
Source
1076
182k
                                                     __m256i r[2]) {
1077
182k
  const __m256i s0_256 = _mm256_loadu_si256((__m256i *)src);
1078
182k
  const __m256i s1_256 = _mm256_loadu_si256((__m256i *)(src + 8));
1079
1080
182k
  r[0] = x_convolve_8tap_avx2(s0_256, coeffs, filt);
1081
182k
  r[1] = x_convolve_8tap_avx2(s1_256, coeffs, filt);
1082
182k
}
1083
1084
static inline __m128i y_convolve_2tap_2x2_ssse3(const uint8_t *const src,
1085
                                                const ptrdiff_t stride,
1086
                                                const __m128i coeffs[1],
1087
7.14k
                                                __m128i s_16[2]) {
1088
7.14k
  __m128i s_128[2];
1089
1090
7.14k
  s_16[1] = _mm_cvtsi32_si128(*(int16_t *)(src + stride));
1091
7.14k
  s_128[0] = _mm_unpacklo_epi16(s_16[0], s_16[1]);
1092
7.14k
  s_16[0] = _mm_cvtsi32_si128(*(int16_t *)(src + 2 * stride));
1093
7.14k
  s_128[1] = _mm_unpacklo_epi16(s_16[1], s_16[0]);
1094
7.14k
  const __m128i ss = _mm_unpacklo_epi8(s_128[0], s_128[1]);
1095
7.14k
  return convolve_2tap_ssse3(&ss, coeffs);
1096
7.14k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_2tap_2x2_ssse3
convolve_avx2.c:y_convolve_2tap_2x2_ssse3
Line
Count
Source
1087
7.14k
                                                __m128i s_16[2]) {
1088
7.14k
  __m128i s_128[2];
1089
1090
7.14k
  s_16[1] = _mm_cvtsi32_si128(*(int16_t *)(src + stride));
1091
7.14k
  s_128[0] = _mm_unpacklo_epi16(s_16[0], s_16[1]);
1092
7.14k
  s_16[0] = _mm_cvtsi32_si128(*(int16_t *)(src + 2 * stride));
1093
7.14k
  s_128[1] = _mm_unpacklo_epi16(s_16[1], s_16[0]);
1094
7.14k
  const __m128i ss = _mm_unpacklo_epi8(s_128[0], s_128[1]);
1095
7.14k
  return convolve_2tap_ssse3(&ss, coeffs);
1096
7.14k
}
1097
1098
static inline __m128i y_convolve_2tap_4x2_ssse3(const uint8_t *const src,
1099
                                                const ptrdiff_t stride,
1100
                                                const __m128i coeffs[1],
1101
37.8k
                                                __m128i s_32[2]) {
1102
37.8k
  __m128i s_128[2];
1103
1104
37.8k
  s_32[1] = _mm_cvtsi32_si128(loadu_int32(src + stride));
1105
37.8k
  s_128[0] = _mm_unpacklo_epi32(s_32[0], s_32[1]);
1106
37.8k
  s_32[0] = _mm_cvtsi32_si128(loadu_int32(src + 2 * stride));
1107
37.8k
  s_128[1] = _mm_unpacklo_epi32(s_32[1], s_32[0]);
1108
37.8k
  const __m128i ss = _mm_unpacklo_epi8(s_128[0], s_128[1]);
1109
37.8k
  return convolve_2tap_ssse3(&ss, coeffs);
1110
37.8k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_2tap_4x2_ssse3
convolve_avx2.c:y_convolve_2tap_4x2_ssse3
Line
Count
Source
1101
37.8k
                                                __m128i s_32[2]) {
1102
37.8k
  __m128i s_128[2];
1103
1104
37.8k
  s_32[1] = _mm_cvtsi32_si128(loadu_int32(src + stride));
1105
37.8k
  s_128[0] = _mm_unpacklo_epi32(s_32[0], s_32[1]);
1106
37.8k
  s_32[0] = _mm_cvtsi32_si128(loadu_int32(src + 2 * stride));
1107
37.8k
  s_128[1] = _mm_unpacklo_epi32(s_32[1], s_32[0]);
1108
37.8k
  const __m128i ss = _mm_unpacklo_epi8(s_128[0], s_128[1]);
1109
37.8k
  return convolve_2tap_ssse3(&ss, coeffs);
1110
37.8k
}
1111
1112
static inline __m256i y_convolve_2tap_8x2_avx2(const uint8_t *const src,
1113
                                               const ptrdiff_t stride,
1114
                                               const __m256i coeffs[1],
1115
0
                                               __m128i s_64[2]) {
1116
0
  __m256i s_256[2];
1117
0
1118
0
  s_64[1] = _mm_loadl_epi64((__m128i *)(src + stride));
1119
0
  s_256[0] = _mm256_setr_m128i(s_64[0], s_64[1]);
1120
0
  s_64[0] = _mm_loadl_epi64((__m128i *)(src + 2 * stride));
1121
0
  s_256[1] = _mm256_setr_m128i(s_64[1], s_64[0]);
1122
0
  const __m256i ss = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
1123
0
  return convolve_2tap_avx2(&ss, coeffs);
1124
0
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_2tap_8x2_avx2
Unexecuted instantiation: convolve_avx2.c:y_convolve_2tap_8x2_avx2
1125
1126
static inline void y_convolve_2tap_16x2_avx2(const uint8_t *const src,
1127
                                             const ptrdiff_t stride,
1128
                                             const __m256i coeffs[1],
1129
33.2k
                                             __m128i s_128[2], __m256i r[2]) {
1130
33.2k
  __m256i s_256[2];
1131
1132
33.2k
  s_128[1] = _mm_loadu_si128((__m128i *)(src + stride));
1133
33.2k
  s_256[0] = _mm256_setr_m128i(s_128[0], s_128[1]);
1134
33.2k
  s_128[0] = _mm_loadu_si128((__m128i *)(src + 2 * stride));
1135
33.2k
  s_256[1] = _mm256_setr_m128i(s_128[1], s_128[0]);
1136
33.2k
  const __m256i ss0 = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
1137
33.2k
  const __m256i ss1 = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
1138
33.2k
  r[0] = convolve_2tap_avx2(&ss0, coeffs);
1139
33.2k
  r[1] = convolve_2tap_avx2(&ss1, coeffs);
1140
33.2k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_2tap_16x2_avx2
convolve_avx2.c:y_convolve_2tap_16x2_avx2
Line
Count
Source
1129
33.2k
                                             __m128i s_128[2], __m256i r[2]) {
1130
33.2k
  __m256i s_256[2];
1131
1132
33.2k
  s_128[1] = _mm_loadu_si128((__m128i *)(src + stride));
1133
33.2k
  s_256[0] = _mm256_setr_m128i(s_128[0], s_128[1]);
1134
33.2k
  s_128[0] = _mm_loadu_si128((__m128i *)(src + 2 * stride));
1135
  s_256[1] = _mm256_setr_m128i(s_128[1], s_128[0]);
1136
33.2k
  const __m256i ss0 = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
1137
33.2k
  const __m256i ss1 = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
1138
33.2k
  r[0] = convolve_2tap_avx2(&ss0, coeffs);
1139
33.2k
  r[1] = convolve_2tap_avx2(&ss1, coeffs);
1140
33.2k
}
1141
1142
static inline void y_convolve_2tap_32_avx2(const uint8_t *const src,
1143
                                           const __m256i coeffs[1],
1144
                                           const __m256i s0, __m256i *const s1,
1145
187k
                                           __m256i r[2]) {
1146
187k
  *s1 = _mm256_loadu_si256((__m256i *)src);
1147
187k
  const __m256i ss0 = _mm256_unpacklo_epi8(s0, *s1);
1148
187k
  const __m256i ss1 = _mm256_unpackhi_epi8(s0, *s1);
1149
187k
  r[0] = convolve_2tap_avx2(&ss0, coeffs);
1150
187k
  r[1] = convolve_2tap_avx2(&ss1, coeffs);
1151
187k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_2tap_32_avx2
convolve_avx2.c:y_convolve_2tap_32_avx2
Line
Count
Source
1145
187k
                                           __m256i r[2]) {
1146
187k
  *s1 = _mm256_loadu_si256((__m256i *)src);
1147
187k
  const __m256i ss0 = _mm256_unpacklo_epi8(s0, *s1);
1148
187k
  const __m256i ss1 = _mm256_unpackhi_epi8(s0, *s1);
1149
187k
  r[0] = convolve_2tap_avx2(&ss0, coeffs);
1150
187k
  r[1] = convolve_2tap_avx2(&ss1, coeffs);
1151
187k
}
1152
1153
static inline __m128i y_convolve_4tap_2x2_ssse3(const uint8_t *const src,
1154
                                                const ptrdiff_t stride,
1155
                                                const __m128i coeffs[2],
1156
                                                __m128i s_16[4],
1157
56.4k
                                                __m128i ss_128[2]) {
1158
56.4k
  s_16[3] = _mm_cvtsi32_si128(loadu_int16(src + stride));
1159
56.4k
  const __m128i src23 = _mm_unpacklo_epi16(s_16[2], s_16[3]);
1160
56.4k
  s_16[2] = _mm_cvtsi32_si128(loadu_int16(src + 2 * stride));
1161
56.4k
  const __m128i src34 = _mm_unpacklo_epi16(s_16[3], s_16[2]);
1162
56.4k
  ss_128[1] = _mm_unpacklo_epi8(src23, src34);
1163
56.4k
  return convolve_4tap_ssse3(ss_128, coeffs);
1164
56.4k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_4tap_2x2_ssse3
convolve_avx2.c:y_convolve_4tap_2x2_ssse3
Line
Count
Source
1157
56.4k
                                                __m128i ss_128[2]) {
1158
56.4k
  s_16[3] = _mm_cvtsi32_si128(loadu_int16(src + stride));
1159
56.4k
  const __m128i src23 = _mm_unpacklo_epi16(s_16[2], s_16[3]);
1160
56.4k
  s_16[2] = _mm_cvtsi32_si128(loadu_int16(src + 2 * stride));
1161
56.4k
  const __m128i src34 = _mm_unpacklo_epi16(s_16[3], s_16[2]);
1162
56.4k
  ss_128[1] = _mm_unpacklo_epi8(src23, src34);
1163
56.4k
  return convolve_4tap_ssse3(ss_128, coeffs);
1164
56.4k
}
1165
1166
static inline __m128i y_convolve_4tap_4x2_ssse3(const uint8_t *const src,
1167
                                                const ptrdiff_t stride,
1168
                                                const __m128i coeffs[2],
1169
                                                __m128i s_32[4],
1170
300k
                                                __m128i ss_128[2]) {
1171
300k
  s_32[3] = _mm_cvtsi32_si128(loadu_int32(src + stride));
1172
300k
  const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
1173
300k
  s_32[2] = _mm_cvtsi32_si128(loadu_int32(src + 2 * stride));
1174
300k
  const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[2]);
1175
300k
  ss_128[1] = _mm_unpacklo_epi8(src23, src34);
1176
300k
  return convolve_4tap_ssse3(ss_128, coeffs);
1177
300k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_4tap_4x2_ssse3
convolve_avx2.c:y_convolve_4tap_4x2_ssse3
Line
Count
Source
1170
300k
                                                __m128i ss_128[2]) {
1171
300k
  s_32[3] = _mm_cvtsi32_si128(loadu_int32(src + stride));
1172
300k
  const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
1173
300k
  s_32[2] = _mm_cvtsi32_si128(loadu_int32(src + 2 * stride));
1174
300k
  const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[2]);
1175
300k
  ss_128[1] = _mm_unpacklo_epi8(src23, src34);
1176
300k
  return convolve_4tap_ssse3(ss_128, coeffs);
1177
300k
}
1178
1179
static inline __m256i y_convolve_4tap_8x2_avx2(const uint8_t *const src,
1180
                                               const ptrdiff_t stride,
1181
                                               const __m256i coeffs[2],
1182
                                               __m128i s_64[4],
1183
261k
                                               __m256i ss_256[2]) {
1184
261k
  s_64[3] = _mm_loadl_epi64((__m128i *)(src + stride));
1185
261k
  const __m256i src23 = _mm256_setr_m128i(s_64[2], s_64[3]);
1186
261k
  s_64[2] = _mm_loadl_epi64((__m128i *)(src + 2 * stride));
1187
261k
  const __m256i src34 = _mm256_setr_m128i(s_64[3], s_64[2]);
1188
261k
  ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
1189
261k
  return convolve_4tap_avx2(ss_256, coeffs);
1190
261k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_4tap_8x2_avx2
convolve_avx2.c:y_convolve_4tap_8x2_avx2
Line
Count
Source
1183
261k
                                               __m256i ss_256[2]) {
1184
261k
  s_64[3] = _mm_loadl_epi64((__m128i *)(src + stride));
1185
261k
  const __m256i src23 = _mm256_setr_m128i(s_64[2], s_64[3]);
1186
261k
  s_64[2] = _mm_loadl_epi64((__m128i *)(src + 2 * stride));
1187
  const __m256i src34 = _mm256_setr_m128i(s_64[3], s_64[2]);
1188
261k
  ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
1189
261k
  return convolve_4tap_avx2(ss_256, coeffs);
1190
261k
}
1191
1192
static inline void y_convolve_4tap_16x2_avx2(const uint8_t *const src,
1193
                                             const ptrdiff_t stride,
1194
                                             const __m256i coeffs[2],
1195
                                             __m128i s_128[4],
1196
157k
                                             __m256i ss_256[4], __m256i r[2]) {
1197
157k
  s_128[3] = _mm_loadu_si128((__m128i *)(src + stride));
1198
157k
  const __m256i src23 = _mm256_setr_m128i(s_128[2], s_128[3]);
1199
157k
  s_128[2] = _mm_loadu_si128((__m128i *)(src + 2 * stride));
1200
157k
  const __m256i src34 = _mm256_setr_m128i(s_128[3], s_128[2]);
1201
157k
  ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
1202
157k
  ss_256[3] = _mm256_unpackhi_epi8(src23, src34);
1203
157k
  r[0] = convolve_4tap_avx2(ss_256, coeffs);
1204
157k
  r[1] = convolve_4tap_avx2(ss_256 + 2, coeffs);
1205
157k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_4tap_16x2_avx2
convolve_avx2.c:y_convolve_4tap_16x2_avx2
Line
Count
Source
1196
157k
                                             __m256i ss_256[4], __m256i r[2]) {
1197
157k
  s_128[3] = _mm_loadu_si128((__m128i *)(src + stride));
1198
157k
  const __m256i src23 = _mm256_setr_m128i(s_128[2], s_128[3]);
1199
157k
  s_128[2] = _mm_loadu_si128((__m128i *)(src + 2 * stride));
1200
  const __m256i src34 = _mm256_setr_m128i(s_128[3], s_128[2]);
1201
157k
  ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
1202
157k
  ss_256[3] = _mm256_unpackhi_epi8(src23, src34);
1203
157k
  r[0] = convolve_4tap_avx2(ss_256, coeffs);
1204
157k
  r[1] = convolve_4tap_avx2(ss_256 + 2, coeffs);
1205
157k
}
1206
1207
static inline __m128i y_convolve_6tap_2x2_ssse3(const uint8_t *const src,
1208
                                                const ptrdiff_t stride,
1209
                                                const __m128i coeffs[3],
1210
                                                __m128i s_16[6],
1211
93.0k
                                                __m128i ss_128[3]) {
1212
93.0k
  s_16[5] = _mm_cvtsi32_si128(loadu_int16(src + 3 * stride));
1213
93.0k
  const __m128i src45 = _mm_unpacklo_epi16(s_16[4], s_16[5]);
1214
93.0k
  s_16[4] = _mm_cvtsi32_si128(loadu_int16(src + 4 * stride));
1215
93.0k
  const __m128i src56 = _mm_unpacklo_epi16(s_16[5], s_16[4]);
1216
93.0k
  ss_128[2] = _mm_unpacklo_epi8(src45, src56);
1217
93.0k
  return convolve_6tap_ssse3(ss_128, coeffs);
1218
93.0k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_6tap_2x2_ssse3
convolve_avx2.c:y_convolve_6tap_2x2_ssse3
Line
Count
Source
1211
93.0k
                                                __m128i ss_128[3]) {
1212
93.0k
  s_16[5] = _mm_cvtsi32_si128(loadu_int16(src + 3 * stride));
1213
93.0k
  const __m128i src45 = _mm_unpacklo_epi16(s_16[4], s_16[5]);
1214
93.0k
  s_16[4] = _mm_cvtsi32_si128(loadu_int16(src + 4 * stride));
1215
93.0k
  const __m128i src56 = _mm_unpacklo_epi16(s_16[5], s_16[4]);
1216
93.0k
  ss_128[2] = _mm_unpacklo_epi8(src45, src56);
1217
93.0k
  return convolve_6tap_ssse3(ss_128, coeffs);
1218
93.0k
}
1219
1220
static inline void y_convolve_4tap_32x2_avx2(
1221
    const uint8_t *const src, const ptrdiff_t stride, const __m256i coeffs[2],
1222
274k
    __m256i s_256[4], __m256i ss_256[4], __m256i tt_256[4], __m256i r[4]) {
1223
274k
  s_256[3] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
1224
274k
  ss_256[1] = _mm256_unpacklo_epi8(s_256[2], s_256[3]);
1225
274k
  ss_256[3] = _mm256_unpackhi_epi8(s_256[2], s_256[3]);
1226
274k
  s_256[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
1227
274k
  tt_256[1] = _mm256_unpacklo_epi8(s_256[3], s_256[2]);
1228
274k
  tt_256[3] = _mm256_unpackhi_epi8(s_256[3], s_256[2]);
1229
274k
  r[0] = convolve_4tap_avx2(ss_256 + 0, coeffs);
1230
274k
  r[1] = convolve_4tap_avx2(ss_256 + 2, coeffs);
1231
274k
  r[2] = convolve_4tap_avx2(tt_256 + 0, coeffs);
1232
274k
  r[3] = convolve_4tap_avx2(tt_256 + 2, coeffs);
1233
274k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_4tap_32x2_avx2
convolve_avx2.c:y_convolve_4tap_32x2_avx2
Line
Count
Source
1222
274k
    __m256i s_256[4], __m256i ss_256[4], __m256i tt_256[4], __m256i r[4]) {
1223
274k
  s_256[3] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
1224
274k
  ss_256[1] = _mm256_unpacklo_epi8(s_256[2], s_256[3]);
1225
274k
  ss_256[3] = _mm256_unpackhi_epi8(s_256[2], s_256[3]);
1226
274k
  s_256[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
1227
274k
  tt_256[1] = _mm256_unpacklo_epi8(s_256[3], s_256[2]);
1228
274k
  tt_256[3] = _mm256_unpackhi_epi8(s_256[3], s_256[2]);
1229
274k
  r[0] = convolve_4tap_avx2(ss_256 + 0, coeffs);
1230
274k
  r[1] = convolve_4tap_avx2(ss_256 + 2, coeffs);
1231
274k
  r[2] = convolve_4tap_avx2(tt_256 + 0, coeffs);
1232
274k
  r[3] = convolve_4tap_avx2(tt_256 + 2, coeffs);
1233
274k
}
1234
1235
static inline __m128i y_convolve_6tap_4x2_ssse3(const uint8_t *const src,
1236
                                                const ptrdiff_t stride,
1237
                                                const __m128i coeffs[3],
1238
                                                __m128i s_32[6],
1239
485k
                                                __m128i ss_128[3]) {
1240
485k
  s_32[5] = _mm_cvtsi32_si128(loadu_int32(src + 3 * stride));
1241
485k
  const __m128i src45 = _mm_unpacklo_epi32(s_32[4], s_32[5]);
1242
485k
  s_32[4] = _mm_cvtsi32_si128(loadu_int32(src + 4 * stride));
1243
485k
  const __m128i src56 = _mm_unpacklo_epi32(s_32[5], s_32[4]);
1244
485k
  ss_128[2] = _mm_unpacklo_epi8(src45, src56);
1245
485k
  return convolve_6tap_ssse3(ss_128, coeffs);
1246
485k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_6tap_4x2_ssse3
convolve_avx2.c:y_convolve_6tap_4x2_ssse3
Line
Count
Source
1239
485k
                                                __m128i ss_128[3]) {
1240
485k
  s_32[5] = _mm_cvtsi32_si128(loadu_int32(src + 3 * stride));
1241
485k
  const __m128i src45 = _mm_unpacklo_epi32(s_32[4], s_32[5]);
1242
485k
  s_32[4] = _mm_cvtsi32_si128(loadu_int32(src + 4 * stride));
1243
485k
  const __m128i src56 = _mm_unpacklo_epi32(s_32[5], s_32[4]);
1244
485k
  ss_128[2] = _mm_unpacklo_epi8(src45, src56);
1245
485k
  return convolve_6tap_ssse3(ss_128, coeffs);
1246
485k
}
1247
1248
static inline __m256i y_convolve_6tap_8x2_avx2(const uint8_t *const src,
1249
                                               const ptrdiff_t stride,
1250
                                               const __m256i coeffs[3],
1251
                                               __m128i s_64[6],
1252
612k
                                               __m256i ss_256[3]) {
1253
612k
  s_64[5] = _mm_loadl_epi64((__m128i *)(src + 3 * stride));
1254
612k
  const __m256i src45 = _mm256_setr_m128i(s_64[4], s_64[5]);
1255
612k
  s_64[4] = _mm_loadl_epi64((__m128i *)(src + 4 * stride));
1256
612k
  const __m256i src56 = _mm256_setr_m128i(s_64[5], s_64[4]);
1257
612k
  ss_256[2] = _mm256_unpacklo_epi8(src45, src56);
1258
612k
  return convolve_6tap_avx2(ss_256, coeffs);
1259
612k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_6tap_8x2_avx2
convolve_avx2.c:y_convolve_6tap_8x2_avx2
Line
Count
Source
1252
612k
                                               __m256i ss_256[3]) {
1253
612k
  s_64[5] = _mm_loadl_epi64((__m128i *)(src + 3 * stride));
1254
612k
  const __m256i src45 = _mm256_setr_m128i(s_64[4], s_64[5]);
1255
612k
  s_64[4] = _mm_loadl_epi64((__m128i *)(src + 4 * stride));
1256
  const __m256i src56 = _mm256_setr_m128i(s_64[5], s_64[4]);
1257
612k
  ss_256[2] = _mm256_unpacklo_epi8(src45, src56);
1258
612k
  return convolve_6tap_avx2(ss_256, coeffs);
1259
612k
}
1260
1261
static inline void y_convolve_6tap_16x2_avx2(const uint8_t *const src,
1262
                                             const ptrdiff_t stride,
1263
                                             const __m256i coeffs[3],
1264
                                             __m128i s_128[6],
1265
578k
                                             __m256i ss_256[6], __m256i r[2]) {
1266
578k
  s_128[5] = _mm_loadu_si128((__m128i *)(src + 3 * stride));
1267
578k
  const __m256i src45 = _mm256_setr_m128i(s_128[4], s_128[5]);
1268
578k
  s_128[4] = _mm_loadu_si128((__m128i *)(src + 4 * stride));
1269
578k
  const __m256i src56 = _mm256_setr_m128i(s_128[5], s_128[4]);
1270
578k
  ss_256[2] = _mm256_unpacklo_epi8(src45, src56);
1271
578k
  ss_256[5] = _mm256_unpackhi_epi8(src45, src56);
1272
578k
  r[0] = convolve_6tap_avx2(ss_256, coeffs);
1273
578k
  r[1] = convolve_6tap_avx2(ss_256 + 3, coeffs);
1274
578k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_6tap_16x2_avx2
convolve_avx2.c:y_convolve_6tap_16x2_avx2
Line
Count
Source
1265
578k
                                             __m256i ss_256[6], __m256i r[2]) {
1266
578k
  s_128[5] = _mm_loadu_si128((__m128i *)(src + 3 * stride));
1267
578k
  const __m256i src45 = _mm256_setr_m128i(s_128[4], s_128[5]);
1268
578k
  s_128[4] = _mm_loadu_si128((__m128i *)(src + 4 * stride));
1269
  const __m256i src56 = _mm256_setr_m128i(s_128[5], s_128[4]);
1270
578k
  ss_256[2] = _mm256_unpacklo_epi8(src45, src56);
1271
578k
  ss_256[5] = _mm256_unpackhi_epi8(src45, src56);
1272
578k
  r[0] = convolve_6tap_avx2(ss_256, coeffs);
1273
578k
  r[1] = convolve_6tap_avx2(ss_256 + 3, coeffs);
1274
578k
}
1275
1276
static inline void y_convolve_6tap_32x2_avx2(
1277
    const uint8_t *const src, const ptrdiff_t stride, const __m256i coeffs[3],
1278
650k
    __m256i s_256[6], __m256i ss_256[6], __m256i tt_256[6], __m256i r[4]) {
1279
650k
  s_256[5] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
1280
650k
  ss_256[2] = _mm256_unpacklo_epi8(s_256[4], s_256[5]);
1281
650k
  ss_256[5] = _mm256_unpackhi_epi8(s_256[4], s_256[5]);
1282
650k
  s_256[4] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
1283
650k
  tt_256[2] = _mm256_unpacklo_epi8(s_256[5], s_256[4]);
1284
650k
  tt_256[5] = _mm256_unpackhi_epi8(s_256[5], s_256[4]);
1285
650k
  r[0] = convolve_6tap_avx2(ss_256 + 0, coeffs);
1286
650k
  r[1] = convolve_6tap_avx2(ss_256 + 3, coeffs);
1287
650k
  r[2] = convolve_6tap_avx2(tt_256 + 0, coeffs);
1288
650k
  r[3] = convolve_6tap_avx2(tt_256 + 3, coeffs);
1289
650k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_6tap_32x2_avx2
convolve_avx2.c:y_convolve_6tap_32x2_avx2
Line
Count
Source
1278
650k
    __m256i s_256[6], __m256i ss_256[6], __m256i tt_256[6], __m256i r[4]) {
1279
650k
  s_256[5] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
1280
650k
  ss_256[2] = _mm256_unpacklo_epi8(s_256[4], s_256[5]);
1281
650k
  ss_256[5] = _mm256_unpackhi_epi8(s_256[4], s_256[5]);
1282
650k
  s_256[4] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
1283
650k
  tt_256[2] = _mm256_unpacklo_epi8(s_256[5], s_256[4]);
1284
650k
  tt_256[5] = _mm256_unpackhi_epi8(s_256[5], s_256[4]);
1285
650k
  r[0] = convolve_6tap_avx2(ss_256 + 0, coeffs);
1286
650k
  r[1] = convolve_6tap_avx2(ss_256 + 3, coeffs);
1287
650k
  r[2] = convolve_6tap_avx2(tt_256 + 0, coeffs);
1288
650k
  r[3] = convolve_6tap_avx2(tt_256 + 3, coeffs);
1289
650k
}
1290
1291
static inline __m128i y_convolve_8tap_2x2_ssse3(const uint8_t *const src,
1292
                                                const ptrdiff_t stride,
1293
                                                const __m128i coeffs[4],
1294
                                                __m128i s_16[8],
1295
7.22k
                                                __m128i ss_128[4]) {
1296
7.22k
  s_16[7] = _mm_cvtsi32_si128(loadu_int16(src + 7 * stride));
1297
7.22k
  const __m128i src67 = _mm_unpacklo_epi16(s_16[6], s_16[7]);
1298
7.22k
  s_16[6] = _mm_cvtsi32_si128(loadu_int16(src + 8 * stride));
1299
7.22k
  const __m128i src78 = _mm_unpacklo_epi16(s_16[7], s_16[6]);
1300
7.22k
  ss_128[3] = _mm_unpacklo_epi8(src67, src78);
1301
7.22k
  return convolve_8tap_ssse3(ss_128, coeffs);
1302
7.22k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_8tap_2x2_ssse3
convolve_avx2.c:y_convolve_8tap_2x2_ssse3
Line
Count
Source
1295
7.22k
                                                __m128i ss_128[4]) {
1296
7.22k
  s_16[7] = _mm_cvtsi32_si128(loadu_int16(src + 7 * stride));
1297
7.22k
  const __m128i src67 = _mm_unpacklo_epi16(s_16[6], s_16[7]);
1298
7.22k
  s_16[6] = _mm_cvtsi32_si128(loadu_int16(src + 8 * stride));
1299
7.22k
  const __m128i src78 = _mm_unpacklo_epi16(s_16[7], s_16[6]);
1300
7.22k
  ss_128[3] = _mm_unpacklo_epi8(src67, src78);
1301
7.22k
  return convolve_8tap_ssse3(ss_128, coeffs);
1302
7.22k
}
1303
1304
static inline __m128i y_convolve_8tap_4x2_ssse3(const uint8_t *const src,
1305
                                                const ptrdiff_t stride,
1306
                                                const __m128i coeffs[4],
1307
                                                __m128i s_32[8],
1308
27.8k
                                                __m128i ss_128[4]) {
1309
27.8k
  s_32[7] = _mm_cvtsi32_si128(loadu_int32(src + 7 * stride));
1310
27.8k
  const __m128i src67 = _mm_unpacklo_epi32(s_32[6], s_32[7]);
1311
27.8k
  s_32[6] = _mm_cvtsi32_si128(loadu_int32(src + 8 * stride));
1312
27.8k
  const __m128i src78 = _mm_unpacklo_epi32(s_32[7], s_32[6]);
1313
27.8k
  ss_128[3] = _mm_unpacklo_epi8(src67, src78);
1314
27.8k
  return convolve_8tap_ssse3(ss_128, coeffs);
1315
27.8k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_8tap_4x2_ssse3
convolve_avx2.c:y_convolve_8tap_4x2_ssse3
Line
Count
Source
1308
27.8k
                                                __m128i ss_128[4]) {
1309
27.8k
  s_32[7] = _mm_cvtsi32_si128(loadu_int32(src + 7 * stride));
1310
27.8k
  const __m128i src67 = _mm_unpacklo_epi32(s_32[6], s_32[7]);
1311
27.8k
  s_32[6] = _mm_cvtsi32_si128(loadu_int32(src + 8 * stride));
1312
27.8k
  const __m128i src78 = _mm_unpacklo_epi32(s_32[7], s_32[6]);
1313
27.8k
  ss_128[3] = _mm_unpacklo_epi8(src67, src78);
1314
27.8k
  return convolve_8tap_ssse3(ss_128, coeffs);
1315
27.8k
}
1316
1317
static inline __m256i y_convolve_8tap_8x2_avx2(const uint8_t *const src,
1318
                                               const ptrdiff_t stride,
1319
                                               const __m256i coeffs[4],
1320
                                               __m128i s_64[8],
1321
27.4k
                                               __m256i ss_256[4]) {
1322
27.4k
  s_64[7] = _mm_loadl_epi64((__m128i *)(src + 7 * stride));
1323
27.4k
  const __m256i src67 = _mm256_setr_m128i(s_64[6], s_64[7]);
1324
27.4k
  s_64[6] = _mm_loadl_epi64((__m128i *)(src + 8 * stride));
1325
27.4k
  const __m256i src78 = _mm256_setr_m128i(s_64[7], s_64[6]);
1326
27.4k
  ss_256[3] = _mm256_unpacklo_epi8(src67, src78);
1327
27.4k
  return convolve_8tap_avx2(ss_256, coeffs);
1328
27.4k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_8tap_8x2_avx2
convolve_avx2.c:y_convolve_8tap_8x2_avx2
Line
Count
Source
1321
27.4k
                                               __m256i ss_256[4]) {
1322
27.4k
  s_64[7] = _mm_loadl_epi64((__m128i *)(src + 7 * stride));
1323
27.4k
  const __m256i src67 = _mm256_setr_m128i(s_64[6], s_64[7]);
1324
27.4k
  s_64[6] = _mm_loadl_epi64((__m128i *)(src + 8 * stride));
1325
  const __m256i src78 = _mm256_setr_m128i(s_64[7], s_64[6]);
1326
27.4k
  ss_256[3] = _mm256_unpacklo_epi8(src67, src78);
1327
27.4k
  return convolve_8tap_avx2(ss_256, coeffs);
1328
27.4k
}
1329
1330
static inline void y_convolve_8tap_16x2_avx2(const uint8_t *const src,
1331
                                             const ptrdiff_t stride,
1332
                                             const __m256i coeffs[4],
1333
                                             __m128i s_128[8],
1334
23.0k
                                             __m256i ss_256[8], __m256i r[2]) {
1335
23.0k
  s_128[7] = _mm_loadu_si128((__m128i *)(src + 7 * stride));
1336
23.0k
  const __m256i src67 = _mm256_setr_m128i(s_128[6], s_128[7]);
1337
23.0k
  s_128[6] = _mm_loadu_si128((__m128i *)(src + 8 * stride));
1338
23.0k
  const __m256i src78 = _mm256_setr_m128i(s_128[7], s_128[6]);
1339
23.0k
  ss_256[3] = _mm256_unpacklo_epi8(src67, src78);
1340
23.0k
  ss_256[7] = _mm256_unpackhi_epi8(src67, src78);
1341
23.0k
  r[0] = convolve_8tap_avx2(ss_256, coeffs);
1342
23.0k
  r[1] = convolve_8tap_avx2(ss_256 + 4, coeffs);
1343
23.0k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_8tap_16x2_avx2
convolve_avx2.c:y_convolve_8tap_16x2_avx2
Line
Count
Source
1334
23.0k
                                             __m256i ss_256[8], __m256i r[2]) {
1335
23.0k
  s_128[7] = _mm_loadu_si128((__m128i *)(src + 7 * stride));
1336
23.0k
  const __m256i src67 = _mm256_setr_m128i(s_128[6], s_128[7]);
1337
23.0k
  s_128[6] = _mm_loadu_si128((__m128i *)(src + 8 * stride));
1338
  const __m256i src78 = _mm256_setr_m128i(s_128[7], s_128[6]);
1339
23.0k
  ss_256[3] = _mm256_unpacklo_epi8(src67, src78);
1340
23.0k
  ss_256[7] = _mm256_unpackhi_epi8(src67, src78);
1341
23.0k
  r[0] = convolve_8tap_avx2(ss_256, coeffs);
1342
23.0k
  r[1] = convolve_8tap_avx2(ss_256 + 4, coeffs);
1343
23.0k
}
1344
1345
static inline void y_convolve_8tap_32x2_avx2(
1346
    const uint8_t *const src, const ptrdiff_t stride, const __m256i coeffs[4],
1347
57.2k
    __m256i s_256[8], __m256i ss_256[8], __m256i tt_256[8], __m256i r[4]) {
1348
57.2k
  s_256[7] = _mm256_loadu_si256((__m256i *)(src + 7 * stride));
1349
57.2k
  ss_256[3] = _mm256_unpacklo_epi8(s_256[6], s_256[7]);
1350
57.2k
  ss_256[7] = _mm256_unpackhi_epi8(s_256[6], s_256[7]);
1351
57.2k
  s_256[6] = _mm256_loadu_si256((__m256i *)(src + 8 * stride));
1352
57.2k
  tt_256[3] = _mm256_unpacklo_epi8(s_256[7], s_256[6]);
1353
57.2k
  tt_256[7] = _mm256_unpackhi_epi8(s_256[7], s_256[6]);
1354
57.2k
  r[0] = convolve_8tap_avx2(ss_256 + 0, coeffs);
1355
57.2k
  r[1] = convolve_8tap_avx2(ss_256 + 4, coeffs);
1356
57.2k
  r[2] = convolve_8tap_avx2(tt_256 + 0, coeffs);
1357
57.2k
  r[3] = convolve_8tap_avx2(tt_256 + 4, coeffs);
1358
57.2k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_8tap_32x2_avx2
convolve_avx2.c:y_convolve_8tap_32x2_avx2
Line
Count
Source
1347
57.2k
    __m256i s_256[8], __m256i ss_256[8], __m256i tt_256[8], __m256i r[4]) {
1348
57.2k
  s_256[7] = _mm256_loadu_si256((__m256i *)(src + 7 * stride));
1349
57.2k
  ss_256[3] = _mm256_unpacklo_epi8(s_256[6], s_256[7]);
1350
57.2k
  ss_256[7] = _mm256_unpackhi_epi8(s_256[6], s_256[7]);
1351
57.2k
  s_256[6] = _mm256_loadu_si256((__m256i *)(src + 8 * stride));
1352
57.2k
  tt_256[3] = _mm256_unpacklo_epi8(s_256[7], s_256[6]);
1353
57.2k
  tt_256[7] = _mm256_unpackhi_epi8(s_256[7], s_256[6]);
1354
57.2k
  r[0] = convolve_8tap_avx2(ss_256 + 0, coeffs);
1355
57.2k
  r[1] = convolve_8tap_avx2(ss_256 + 4, coeffs);
1356
57.2k
  r[2] = convolve_8tap_avx2(tt_256 + 0, coeffs);
1357
57.2k
  r[3] = convolve_8tap_avx2(tt_256 + 4, coeffs);
1358
57.2k
}
1359
1360
static inline void xy_x_convolve_2tap_32_avx2(const uint8_t *const src,
1361
                                              const __m256i coeffs[1],
1362
442k
                                              __m256i r[2]) {
1363
442k
  const __m256i s0 = _mm256_loadu_si256((__m256i *)src);
1364
442k
  const __m256i s1 = _mm256_loadu_si256((__m256i *)(src + 1));
1365
442k
  const __m256i ss0 = _mm256_unpacklo_epi8(s0, s1);
1366
442k
  const __m256i ss1 = _mm256_unpackhi_epi8(s0, s1);
1367
1368
442k
  r[0] = convolve_2tap_avx2(&ss0, coeffs);
1369
442k
  r[1] = convolve_2tap_avx2(&ss1, coeffs);
1370
442k
}
convolve_2d_avx2.c:xy_x_convolve_2tap_32_avx2
Line
Count
Source
1362
442k
                                              __m256i r[2]) {
1363
442k
  const __m256i s0 = _mm256_loadu_si256((__m256i *)src);
1364
442k
  const __m256i s1 = _mm256_loadu_si256((__m256i *)(src + 1));
1365
442k
  const __m256i ss0 = _mm256_unpacklo_epi8(s0, s1);
1366
442k
  const __m256i ss1 = _mm256_unpackhi_epi8(s0, s1);
1367
1368
442k
  r[0] = convolve_2tap_avx2(&ss0, coeffs);
1369
442k
  r[1] = convolve_2tap_avx2(&ss1, coeffs);
1370
442k
}
Unexecuted instantiation: convolve_avx2.c:xy_x_convolve_2tap_32_avx2
1371
1372
static inline void xy_x_2tap_32_avx2(const uint8_t *const src,
1373
                                     const __m256i coeffs[1],
1374
442k
                                     int16_t *const dst) {
1375
442k
  __m256i r[2];
1376
1377
442k
  xy_x_convolve_2tap_32_avx2(src, coeffs, r);
1378
442k
  const __m256i d0 = xy_x_round_avx2(r[0]);
1379
442k
  const __m256i d1 = xy_x_round_avx2(r[1]);
1380
442k
  _mm256_storeu_si256((__m256i *)dst, d0);
1381
442k
  _mm256_storeu_si256((__m256i *)(dst + 16), d1);
1382
442k
}
convolve_2d_avx2.c:xy_x_2tap_32_avx2
Line
Count
Source
1374
442k
                                     int16_t *const dst) {
1375
442k
  __m256i r[2];
1376
1377
442k
  xy_x_convolve_2tap_32_avx2(src, coeffs, r);
1378
442k
  const __m256i d0 = xy_x_round_avx2(r[0]);
1379
442k
  const __m256i d1 = xy_x_round_avx2(r[1]);
1380
442k
  _mm256_storeu_si256((__m256i *)dst, d0);
1381
442k
  _mm256_storeu_si256((__m256i *)(dst + 16), d1);
1382
442k
}
Unexecuted instantiation: convolve_avx2.c:xy_x_2tap_32_avx2
1383
1384
static inline void xy_x_4tap_32_avx2(const uint8_t *const src,
1385
                                     const __m256i coeffs[2],
1386
                                     const __m256i filt[2],
1387
694k
                                     int16_t *const dst) {
1388
694k
  __m256i r[2];
1389
1390
694k
  x_convolve_4tap_32_avx2(src, coeffs, filt, r);
1391
694k
  const __m256i d0 = xy_x_round_avx2(r[0]);
1392
694k
  const __m256i d1 = xy_x_round_avx2(r[1]);
1393
694k
  _mm256_storeu_si256((__m256i *)dst, d0);
1394
694k
  _mm256_storeu_si256((__m256i *)(dst + 16), d1);
1395
694k
}
convolve_2d_avx2.c:xy_x_4tap_32_avx2
Line
Count
Source
1387
694k
                                     int16_t *const dst) {
1388
694k
  __m256i r[2];
1389
1390
694k
  x_convolve_4tap_32_avx2(src, coeffs, filt, r);
1391
694k
  const __m256i d0 = xy_x_round_avx2(r[0]);
1392
694k
  const __m256i d1 = xy_x_round_avx2(r[1]);
1393
694k
  _mm256_storeu_si256((__m256i *)dst, d0);
1394
694k
  _mm256_storeu_si256((__m256i *)(dst + 16), d1);
1395
694k
}
Unexecuted instantiation: convolve_avx2.c:xy_x_4tap_32_avx2
1396
1397
static inline void xy_x_6tap_32_avx2(const uint8_t *const src,
1398
                                     const __m256i coeffs[3],
1399
                                     const __m256i filt[3],
1400
3.35M
                                     int16_t *const dst) {
1401
3.35M
  __m256i r[2];
1402
1403
3.35M
  x_convolve_6tap_32_avx2(src, coeffs, filt, r);
1404
3.35M
  const __m256i d0 = xy_x_round_avx2(r[0]);
1405
3.35M
  const __m256i d1 = xy_x_round_avx2(r[1]);
1406
3.35M
  _mm256_storeu_si256((__m256i *)dst, d0);
1407
3.35M
  _mm256_storeu_si256((__m256i *)(dst + 16), d1);
1408
3.35M
}
convolve_2d_avx2.c:xy_x_6tap_32_avx2
Line
Count
Source
1400
3.35M
                                     int16_t *const dst) {
1401
3.35M
  __m256i r[2];
1402
1403
3.35M
  x_convolve_6tap_32_avx2(src, coeffs, filt, r);
1404
3.35M
  const __m256i d0 = xy_x_round_avx2(r[0]);
1405
3.35M
  const __m256i d1 = xy_x_round_avx2(r[1]);
1406
3.35M
  _mm256_storeu_si256((__m256i *)dst, d0);
1407
3.35M
  _mm256_storeu_si256((__m256i *)(dst + 16), d1);
1408
3.35M
}
Unexecuted instantiation: convolve_avx2.c:xy_x_6tap_32_avx2
1409
1410
static inline void xy_x_8tap_32_avx2(const uint8_t *const src,
1411
                                     const __m256i coeffs[4],
1412
                                     const __m256i filt[4],
1413
1.17M
                                     int16_t *const dst) {
1414
1.17M
  __m256i r[2];
1415
1416
1.17M
  x_convolve_8tap_32_avx2(src, coeffs, filt, r);
1417
1.17M
  const __m256i d0 = xy_x_round_avx2(r[0]);
1418
1.17M
  const __m256i d1 = xy_x_round_avx2(r[1]);
1419
1.17M
  _mm256_storeu_si256((__m256i *)dst, d0);
1420
1.17M
  _mm256_storeu_si256((__m256i *)(dst + 16), d1);
1421
1.17M
}
convolve_2d_avx2.c:xy_x_8tap_32_avx2
Line
Count
Source
1413
1.17M
                                     int16_t *const dst) {
1414
1.17M
  __m256i r[2];
1415
1416
1.17M
  x_convolve_8tap_32_avx2(src, coeffs, filt, r);
1417
1.17M
  const __m256i d0 = xy_x_round_avx2(r[0]);
1418
1.17M
  const __m256i d1 = xy_x_round_avx2(r[1]);
1419
1.17M
  _mm256_storeu_si256((__m256i *)dst, d0);
1420
1.17M
  _mm256_storeu_si256((__m256i *)(dst + 16), d1);
1421
1.17M
}
Unexecuted instantiation: convolve_avx2.c:xy_x_8tap_32_avx2
1422
1423
static inline __m128i xy_y_convolve_2tap_2x2_sse2(const int16_t *const src,
1424
                                                  __m128i s_32[2],
1425
15.7k
                                                  const __m128i coeffs[1]) {
1426
15.7k
  __m128i s_128[2];
1427
1428
15.7k
  s_32[1] = _mm_cvtsi32_si128(loadu_int32(src + 2));
1429
15.7k
  s_128[0] = _mm_unpacklo_epi32(s_32[0], s_32[1]);
1430
15.7k
  s_32[0] = _mm_cvtsi32_si128(loadu_int32(src + 2 * 2));
1431
15.7k
  s_128[1] = _mm_unpacklo_epi32(s_32[1], s_32[0]);
1432
15.7k
  const __m128i ss = _mm_unpacklo_epi16(s_128[0], s_128[1]);
1433
15.7k
  return convolve16_2tap_sse2(&ss, coeffs);
1434
15.7k
}
convolve_2d_avx2.c:xy_y_convolve_2tap_2x2_sse2
Line
Count
Source
1425
15.7k
                                                  const __m128i coeffs[1]) {
1426
15.7k
  __m128i s_128[2];
1427
1428
15.7k
  s_32[1] = _mm_cvtsi32_si128(loadu_int32(src + 2));
1429
15.7k
  s_128[0] = _mm_unpacklo_epi32(s_32[0], s_32[1]);
1430
15.7k
  s_32[0] = _mm_cvtsi32_si128(loadu_int32(src + 2 * 2));
1431
15.7k
  s_128[1] = _mm_unpacklo_epi32(s_32[1], s_32[0]);
1432
15.7k
  const __m128i ss = _mm_unpacklo_epi16(s_128[0], s_128[1]);
1433
15.7k
  return convolve16_2tap_sse2(&ss, coeffs);
1434
15.7k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_2tap_2x2_sse2
1435
1436
static inline __m128i xy_y_convolve_2tap_2x2_half_pel_sse2(
1437
5.20k
    const int16_t *const src, __m128i s_32[2]) {
1438
5.20k
  __m128i s_128[2];
1439
1440
5.20k
  s_32[1] = _mm_cvtsi32_si128(loadu_int32(src + 2));
1441
5.20k
  s_128[0] = _mm_unpacklo_epi32(s_32[0], s_32[1]);
1442
5.20k
  s_32[0] = _mm_cvtsi32_si128(loadu_int32(src + 2 * 2));
1443
5.20k
  s_128[1] = _mm_unpacklo_epi32(s_32[1], s_32[0]);
1444
5.20k
  return _mm_add_epi16(s_128[0], s_128[1]);
1445
5.20k
}
convolve_2d_avx2.c:xy_y_convolve_2tap_2x2_half_pel_sse2
Line
Count
Source
1437
5.20k
    const int16_t *const src, __m128i s_32[2]) {
1438
5.20k
  __m128i s_128[2];
1439
1440
5.20k
  s_32[1] = _mm_cvtsi32_si128(loadu_int32(src + 2));
1441
5.20k
  s_128[0] = _mm_unpacklo_epi32(s_32[0], s_32[1]);
1442
5.20k
  s_32[0] = _mm_cvtsi32_si128(loadu_int32(src + 2 * 2));
1443
5.20k
  s_128[1] = _mm_unpacklo_epi32(s_32[1], s_32[0]);
1444
5.20k
  return _mm_add_epi16(s_128[0], s_128[1]);
1445
5.20k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_2tap_2x2_half_pel_sse2
1446
1447
static inline void xy_y_convolve_2tap_4x2_sse2(const int16_t *const src,
1448
                                               __m128i s_64[2],
1449
                                               const __m128i coeffs[1],
1450
66.2k
                                               __m128i r[2]) {
1451
66.2k
  __m128i s_128[2];
1452
1453
66.2k
  s_64[1] = _mm_loadl_epi64((__m128i *)(src + 4));
1454
66.2k
  s_128[0] = _mm_unpacklo_epi64(s_64[0], s_64[1]);
1455
66.2k
  s_64[0] = _mm_loadl_epi64((__m128i *)(src + 2 * 4));
1456
66.2k
  s_128[1] = _mm_unpacklo_epi64(s_64[1], s_64[0]);
1457
66.2k
  const __m128i ss0 = _mm_unpacklo_epi16(s_128[0], s_128[1]);
1458
66.2k
  const __m128i ss1 = _mm_unpackhi_epi16(s_128[0], s_128[1]);
1459
66.2k
  r[0] = convolve16_2tap_sse2(&ss0, coeffs);
1460
66.2k
  r[1] = convolve16_2tap_sse2(&ss1, coeffs);
1461
66.2k
}
convolve_2d_avx2.c:xy_y_convolve_2tap_4x2_sse2
Line
Count
Source
1450
66.2k
                                               __m128i r[2]) {
1451
66.2k
  __m128i s_128[2];
1452
1453
66.2k
  s_64[1] = _mm_loadl_epi64((__m128i *)(src + 4));
1454
66.2k
  s_128[0] = _mm_unpacklo_epi64(s_64[0], s_64[1]);
1455
66.2k
  s_64[0] = _mm_loadl_epi64((__m128i *)(src + 2 * 4));
1456
66.2k
  s_128[1] = _mm_unpacklo_epi64(s_64[1], s_64[0]);
1457
66.2k
  const __m128i ss0 = _mm_unpacklo_epi16(s_128[0], s_128[1]);
1458
66.2k
  const __m128i ss1 = _mm_unpackhi_epi16(s_128[0], s_128[1]);
1459
66.2k
  r[0] = convolve16_2tap_sse2(&ss0, coeffs);
1460
66.2k
  r[1] = convolve16_2tap_sse2(&ss1, coeffs);
1461
66.2k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_2tap_4x2_sse2
1462
1463
static inline __m128i xy_y_convolve_2tap_4x2_half_pel_sse2(
1464
22.8k
    const int16_t *const src, __m128i s_64[2]) {
1465
22.8k
  __m128i s_128[2];
1466
1467
22.8k
  s_64[1] = _mm_loadl_epi64((__m128i *)(src + 4));
1468
22.8k
  s_128[0] = _mm_unpacklo_epi64(s_64[0], s_64[1]);
1469
22.8k
  s_64[0] = _mm_loadl_epi64((__m128i *)(src + 2 * 4));
1470
22.8k
  s_128[1] = _mm_unpacklo_epi64(s_64[1], s_64[0]);
1471
22.8k
  return _mm_add_epi16(s_128[0], s_128[1]);
1472
22.8k
}
convolve_2d_avx2.c:xy_y_convolve_2tap_4x2_half_pel_sse2
Line
Count
Source
1464
22.8k
    const int16_t *const src, __m128i s_64[2]) {
1465
22.8k
  __m128i s_128[2];
1466
1467
22.8k
  s_64[1] = _mm_loadl_epi64((__m128i *)(src + 4));
1468
22.8k
  s_128[0] = _mm_unpacklo_epi64(s_64[0], s_64[1]);
1469
22.8k
  s_64[0] = _mm_loadl_epi64((__m128i *)(src + 2 * 4));
1470
22.8k
  s_128[1] = _mm_unpacklo_epi64(s_64[1], s_64[0]);
1471
22.8k
  return _mm_add_epi16(s_128[0], s_128[1]);
1472
22.8k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_2tap_4x2_half_pel_sse2
1473
1474
static inline void xy_y_convolve_2tap_16_avx2(const __m256i s0,
1475
                                              const __m256i s1,
1476
                                              const __m256i coeffs[1],
1477
875k
                                              __m256i r[2]) {
1478
875k
  const __m256i ss0 = _mm256_unpacklo_epi16(s0, s1);
1479
875k
  const __m256i ss1 = _mm256_unpackhi_epi16(s0, s1);
1480
875k
  r[0] = convolve16_2tap_avx2(&ss0, coeffs);
1481
875k
  r[1] = convolve16_2tap_avx2(&ss1, coeffs);
1482
875k
}
convolve_2d_avx2.c:xy_y_convolve_2tap_16_avx2
Line
Count
Source
1477
875k
                                              __m256i r[2]) {
1478
875k
  const __m256i ss0 = _mm256_unpacklo_epi16(s0, s1);
1479
875k
  const __m256i ss1 = _mm256_unpackhi_epi16(s0, s1);
1480
875k
  r[0] = convolve16_2tap_avx2(&ss0, coeffs);
1481
875k
  r[1] = convolve16_2tap_avx2(&ss1, coeffs);
1482
875k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_2tap_16_avx2
1483
1484
static inline void xy_y_convolve_2tap_8x2_avx2(const int16_t *const src,
1485
                                               __m128i s_128[2],
1486
                                               const __m256i coeffs[1],
1487
56.3k
                                               __m256i r[2]) {
1488
56.3k
  __m256i s_256[2];
1489
56.3k
  s_128[1] = _mm_loadu_si128((__m128i *)(src + 8));
1490
56.3k
  s_256[0] = _mm256_setr_m128i(s_128[0], s_128[1]);
1491
56.3k
  s_128[0] = _mm_loadu_si128((__m128i *)(src + 2 * 8));
1492
56.3k
  s_256[1] = _mm256_setr_m128i(s_128[1], s_128[0]);
1493
56.3k
  xy_y_convolve_2tap_16_avx2(s_256[0], s_256[1], coeffs, r);
1494
56.3k
}
convolve_2d_avx2.c:xy_y_convolve_2tap_8x2_avx2
Line
Count
Source
1487
56.3k
                                               __m256i r[2]) {
1488
56.3k
  __m256i s_256[2];
1489
56.3k
  s_128[1] = _mm_loadu_si128((__m128i *)(src + 8));
1490
56.3k
  s_256[0] = _mm256_setr_m128i(s_128[0], s_128[1]);
1491
56.3k
  s_128[0] = _mm_loadu_si128((__m128i *)(src + 2 * 8));
1492
  s_256[1] = _mm256_setr_m128i(s_128[1], s_128[0]);
1493
56.3k
  xy_y_convolve_2tap_16_avx2(s_256[0], s_256[1], coeffs, r);
1494
56.3k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_2tap_8x2_avx2
1495
1496
static inline __m256i xy_y_convolve_2tap_8x2_half_pel_avx2(
1497
21.3k
    const int16_t *const src, __m128i s_128[2]) {
1498
21.3k
  __m256i s_256[2];
1499
21.3k
  s_128[1] = _mm_loadu_si128((__m128i *)(src + 8));
1500
21.3k
  s_256[0] = _mm256_setr_m128i(s_128[0], s_128[1]);
1501
21.3k
  s_128[0] = _mm_loadu_si128((__m128i *)(src + 2 * 8));
1502
21.3k
  s_256[1] = _mm256_setr_m128i(s_128[1], s_128[0]);
1503
21.3k
  return _mm256_add_epi16(s_256[0], s_256[1]);
1504
21.3k
}
convolve_2d_avx2.c:xy_y_convolve_2tap_8x2_half_pel_avx2
Line
Count
Source
1497
21.3k
    const int16_t *const src, __m128i s_128[2]) {
1498
21.3k
  __m256i s_256[2];
1499
21.3k
  s_128[1] = _mm_loadu_si128((__m128i *)(src + 8));
1500
21.3k
  s_256[0] = _mm256_setr_m128i(s_128[0], s_128[1]);
1501
21.3k
  s_128[0] = _mm_loadu_si128((__m128i *)(src + 2 * 8));
1502
  s_256[1] = _mm256_setr_m128i(s_128[1], s_128[0]);
1503
21.3k
  return _mm256_add_epi16(s_256[0], s_256[1]);
1504
21.3k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_2tap_8x2_half_pel_avx2
1505
1506
static inline void xy_y_convolve_2tap_16x2_half_pel_avx2(
1507
18.5k
    const int16_t *const src, __m256i s_256[2], __m256i r[2]) {
1508
18.5k
  s_256[1] = _mm256_loadu_si256((__m256i *)(src + 16));
1509
18.5k
  r[0] = _mm256_add_epi16(s_256[0], s_256[1]);
1510
18.5k
  s_256[0] = _mm256_loadu_si256((__m256i *)(src + 2 * 16));
1511
18.5k
  r[1] = _mm256_add_epi16(s_256[1], s_256[0]);
1512
18.5k
}
convolve_2d_avx2.c:xy_y_convolve_2tap_16x2_half_pel_avx2
Line
Count
Source
1507
18.5k
    const int16_t *const src, __m256i s_256[2], __m256i r[2]) {
1508
18.5k
  s_256[1] = _mm256_loadu_si256((__m256i *)(src + 16));
1509
18.5k
  r[0] = _mm256_add_epi16(s_256[0], s_256[1]);
1510
18.5k
  s_256[0] = _mm256_loadu_si256((__m256i *)(src + 2 * 16));
1511
18.5k
  r[1] = _mm256_add_epi16(s_256[1], s_256[0]);
1512
18.5k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_2tap_16x2_half_pel_avx2
1513
1514
static inline void xy_y_store_16x2_avx2(const __m256i r[2], uint8_t *const dst,
1515
0
                                        const ptrdiff_t stride) {
1516
0
  const __m256i t = _mm256_packus_epi16(r[0], r[1]);
1517
0
  const __m256i d = _mm256_permute4x64_epi64(t, 0xD8);
1518
0
  storeu_u8_16x2_avx2(d, dst, stride);
1519
0
}
Unexecuted instantiation: convolve_2d_avx2.c:xy_y_store_16x2_avx2
Unexecuted instantiation: convolve_avx2.c:xy_y_store_16x2_avx2
1520
1521
static inline void xy_y_convolve_2tap_16x2_avx2(const int16_t *const src,
1522
                                                __m256i s[2],
1523
                                                const __m256i coeffs[1],
1524
40.9k
                                                __m256i r[4]) {
1525
40.9k
  s[1] = _mm256_loadu_si256((__m256i *)(src + 16));
1526
40.9k
  xy_y_convolve_2tap_16_avx2(s[0], s[1], coeffs, r + 0);
1527
40.9k
  s[0] = _mm256_loadu_si256((__m256i *)(src + 2 * 16));
1528
40.9k
  xy_y_convolve_2tap_16_avx2(s[1], s[0], coeffs, r + 2);
1529
40.9k
}
convolve_2d_avx2.c:xy_y_convolve_2tap_16x2_avx2
Line
Count
Source
1524
40.9k
                                                __m256i r[4]) {
1525
40.9k
  s[1] = _mm256_loadu_si256((__m256i *)(src + 16));
1526
40.9k
  xy_y_convolve_2tap_16_avx2(s[0], s[1], coeffs, r + 0);
1527
40.9k
  s[0] = _mm256_loadu_si256((__m256i *)(src + 2 * 16));
1528
40.9k
  xy_y_convolve_2tap_16_avx2(s[1], s[0], coeffs, r + 2);
1529
40.9k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_2tap_16x2_avx2
1530
1531
static inline void xy_y_convolve_2tap_32_avx2(const int16_t *const src,
1532
                                              const __m256i s0[2],
1533
                                              __m256i s1[2],
1534
                                              const __m256i coeffs[1],
1535
263k
                                              __m256i r[4]) {
1536
263k
  s1[0] = _mm256_loadu_si256((__m256i *)src);
1537
263k
  s1[1] = _mm256_loadu_si256((__m256i *)(src + 16));
1538
263k
  xy_y_convolve_2tap_16_avx2(s0[0], s1[0], coeffs, r + 0);
1539
263k
  xy_y_convolve_2tap_16_avx2(s0[1], s1[1], coeffs, r + 2);
1540
263k
}
convolve_2d_avx2.c:xy_y_convolve_2tap_32_avx2
Line
Count
Source
1535
263k
                                              __m256i r[4]) {
1536
263k
  s1[0] = _mm256_loadu_si256((__m256i *)src);
1537
263k
  s1[1] = _mm256_loadu_si256((__m256i *)(src + 16));
1538
263k
  xy_y_convolve_2tap_16_avx2(s0[0], s1[0], coeffs, r + 0);
1539
263k
  xy_y_convolve_2tap_16_avx2(s0[1], s1[1], coeffs, r + 2);
1540
263k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_2tap_32_avx2
1541
1542
static inline void xy_y_convolve_2tap_32_all_avx2(const int16_t *const src,
1543
                                                  const __m256i s0[2],
1544
                                                  __m256i s1[2],
1545
                                                  const __m256i coeffs[1],
1546
263k
                                                  uint8_t *const dst) {
1547
263k
  __m256i r[4];
1548
1549
263k
  xy_y_convolve_2tap_32_avx2(src, s0, s1, coeffs, r);
1550
263k
  xy_y_round_store_32_avx2(r + 0, r + 2, dst);
1551
263k
}
convolve_2d_avx2.c:xy_y_convolve_2tap_32_all_avx2
Line
Count
Source
1546
263k
                                                  uint8_t *const dst) {
1547
263k
  __m256i r[4];
1548
1549
263k
  xy_y_convolve_2tap_32_avx2(src, s0, s1, coeffs, r);
1550
263k
  xy_y_round_store_32_avx2(r + 0, r + 2, dst);
1551
263k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_2tap_32_all_avx2
1552
1553
static inline void xy_y_convolve_2tap_half_pel_32_avx2(const int16_t *const src,
1554
                                                       const __m256i s0[2],
1555
                                                       __m256i s1[2],
1556
161k
                                                       __m256i r[2]) {
1557
161k
  s1[0] = _mm256_loadu_si256((__m256i *)src);
1558
161k
  s1[1] = _mm256_loadu_si256((__m256i *)(src + 16));
1559
161k
  r[0] = _mm256_add_epi16(s0[0], s1[0]);
1560
161k
  r[1] = _mm256_add_epi16(s0[1], s1[1]);
1561
161k
}
convolve_2d_avx2.c:xy_y_convolve_2tap_half_pel_32_avx2
Line
Count
Source
1556
161k
                                                       __m256i r[2]) {
1557
161k
  s1[0] = _mm256_loadu_si256((__m256i *)src);
1558
161k
  s1[1] = _mm256_loadu_si256((__m256i *)(src + 16));
1559
161k
  r[0] = _mm256_add_epi16(s0[0], s1[0]);
1560
161k
  r[1] = _mm256_add_epi16(s0[1], s1[1]);
1561
161k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_2tap_half_pel_32_avx2
1562
1563
static inline void xy_y_convolve_2tap_half_pel_32_all_avx2(
1564
    const int16_t *const src, const __m256i s0[2], __m256i s1[2],
1565
161k
    uint8_t *const dst) {
1566
161k
  __m256i r[2];
1567
1568
161k
  xy_y_convolve_2tap_half_pel_32_avx2(src, s0, s1, r);
1569
161k
  r[0] = xy_y_round_half_pel_avx2(r[0]);
1570
161k
  r[1] = xy_y_round_half_pel_avx2(r[1]);
1571
161k
  xy_y_pack_store_32_avx2(r[0], r[1], dst);
1572
161k
}
convolve_2d_avx2.c:xy_y_convolve_2tap_half_pel_32_all_avx2
Line
Count
Source
1565
161k
    uint8_t *const dst) {
1566
161k
  __m256i r[2];
1567
1568
161k
  xy_y_convolve_2tap_half_pel_32_avx2(src, s0, s1, r);
1569
161k
  r[0] = xy_y_round_half_pel_avx2(r[0]);
1570
161k
  r[1] = xy_y_round_half_pel_avx2(r[1]);
1571
161k
  xy_y_pack_store_32_avx2(r[0], r[1], dst);
1572
161k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_2tap_half_pel_32_all_avx2
1573
1574
static inline __m128i xy_y_convolve_4tap_2x2_sse2(const int16_t *const src,
1575
                                                  __m128i s_32[4],
1576
                                                  __m128i ss_128[2],
1577
161k
                                                  const __m128i coeffs[2]) {
1578
161k
  s_32[3] = _mm_cvtsi32_si128(loadu_int32(src + 3 * 2));
1579
161k
  const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
1580
161k
  s_32[2] = _mm_cvtsi32_si128(loadu_int32(src + 4 * 2));
1581
161k
  const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[2]);
1582
161k
  ss_128[1] = _mm_unpacklo_epi16(src23, src34);
1583
161k
  const __m128i r = convolve16_4tap_sse2(ss_128, coeffs);
1584
161k
  ss_128[0] = ss_128[1];
1585
161k
  return r;
1586
161k
}
convolve_2d_avx2.c:xy_y_convolve_4tap_2x2_sse2
Line
Count
Source
1577
161k
                                                  const __m128i coeffs[2]) {
1578
161k
  s_32[3] = _mm_cvtsi32_si128(loadu_int32(src + 3 * 2));
1579
161k
  const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
1580
161k
  s_32[2] = _mm_cvtsi32_si128(loadu_int32(src + 4 * 2));
1581
161k
  const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[2]);
1582
161k
  ss_128[1] = _mm_unpacklo_epi16(src23, src34);
1583
161k
  const __m128i r = convolve16_4tap_sse2(ss_128, coeffs);
1584
161k
  ss_128[0] = ss_128[1];
1585
161k
  return r;
1586
161k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_4tap_2x2_sse2
1587
1588
static inline __m256i xy_y_convolve_4tap_4x2_avx2(const int16_t *const src,
1589
                                                  __m128i s_64[4],
1590
                                                  __m256i ss_256[2],
1591
825k
                                                  const __m256i coeffs[2]) {
1592
825k
  __m256i s_256[2];
1593
825k
  s_64[3] = _mm_loadl_epi64((__m128i *)(src + 3 * 4));
1594
825k
  s_256[0] = _mm256_setr_m128i(s_64[2], s_64[3]);
1595
825k
  s_64[2] = _mm_loadl_epi64((__m128i *)(src + 4 * 4));
1596
825k
  s_256[1] = _mm256_setr_m128i(s_64[3], s_64[2]);
1597
825k
  ss_256[1] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1598
825k
  const __m256i r = convolve16_4tap_avx2(ss_256, coeffs);
1599
825k
  ss_256[0] = ss_256[1];
1600
825k
  return r;
1601
825k
}
convolve_2d_avx2.c:xy_y_convolve_4tap_4x2_avx2
Line
Count
Source
1591
825k
                                                  const __m256i coeffs[2]) {
1592
825k
  __m256i s_256[2];
1593
825k
  s_64[3] = _mm_loadl_epi64((__m128i *)(src + 3 * 4));
1594
825k
  s_256[0] = _mm256_setr_m128i(s_64[2], s_64[3]);
1595
825k
  s_64[2] = _mm_loadl_epi64((__m128i *)(src + 4 * 4));
1596
  s_256[1] = _mm256_setr_m128i(s_64[3], s_64[2]);
1597
825k
  ss_256[1] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1598
825k
  const __m256i r = convolve16_4tap_avx2(ss_256, coeffs);
1599
825k
  ss_256[0] = ss_256[1];
1600
825k
  return r;
1601
825k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_4tap_4x2_avx2
1602
1603
static inline void xy_y_convolve_4tap_16_avx2(const __m256i *const ss,
1604
                                              const __m256i coeffs[2],
1605
2.95M
                                              __m256i r[2]) {
1606
2.95M
  r[0] = convolve16_4tap_avx2(ss, coeffs);
1607
2.95M
  r[1] = convolve16_4tap_avx2(ss + 2, coeffs);
1608
2.95M
}
convolve_2d_avx2.c:xy_y_convolve_4tap_16_avx2
Line
Count
Source
1605
2.95M
                                              __m256i r[2]) {
1606
2.95M
  r[0] = convolve16_4tap_avx2(ss, coeffs);
1607
2.95M
  r[1] = convolve16_4tap_avx2(ss + 2, coeffs);
1608
2.95M
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_4tap_16_avx2
1609
1610
static inline void xy_y_convolve_4tap_8x2_avx2(const int16_t *const src,
1611
                                               __m256i ss_256[4],
1612
                                               const __m256i coeffs[2],
1613
536k
                                               __m256i r[2]) {
1614
536k
  __m256i s_256[2];
1615
536k
  s_256[0] = _mm256_loadu_si256((__m256i *)(src + 2 * 8));
1616
536k
  s_256[1] = _mm256_loadu_si256((__m256i *)(src + 3 * 8));
1617
536k
  ss_256[1] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1618
536k
  ss_256[3] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
1619
536k
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r);
1620
536k
  ss_256[0] = ss_256[1];
1621
536k
  ss_256[2] = ss_256[3];
1622
536k
}
convolve_2d_avx2.c:xy_y_convolve_4tap_8x2_avx2
Line
Count
Source
1613
536k
                                               __m256i r[2]) {
1614
536k
  __m256i s_256[2];
1615
536k
  s_256[0] = _mm256_loadu_si256((__m256i *)(src + 2 * 8));
1616
536k
  s_256[1] = _mm256_loadu_si256((__m256i *)(src + 3 * 8));
1617
536k
  ss_256[1] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1618
536k
  ss_256[3] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
1619
536k
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r);
1620
536k
  ss_256[0] = ss_256[1];
1621
536k
  ss_256[2] = ss_256[3];
1622
536k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_4tap_8x2_avx2
1623
1624
static inline void xy_y_convolve_4tap_8x2_half_pel_avx2(
1625
    const int16_t *const src, const __m256i coeffs[1], __m256i s_256[4],
1626
98.2k
    __m256i r[2]) {
1627
98.2k
  __m256i a_256[2];
1628
98.2k
  s_256[2] = _mm256_loadu_si256((__m256i *)(src + 2 * 8));
1629
98.2k
  s_256[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 8));
1630
98.2k
  a_256[0] = _mm256_add_epi16(s_256[0], s_256[3]);
1631
98.2k
  a_256[1] = _mm256_add_epi16(s_256[1], s_256[2]);
1632
98.2k
  xy_y_convolve_2tap_16_avx2(a_256[0], a_256[1], coeffs, r);
1633
98.2k
  s_256[0] = s_256[2];
1634
98.2k
  s_256[1] = s_256[3];
1635
98.2k
}
convolve_2d_avx2.c:xy_y_convolve_4tap_8x2_half_pel_avx2
Line
Count
Source
1626
98.2k
    __m256i r[2]) {
1627
98.2k
  __m256i a_256[2];
1628
98.2k
  s_256[2] = _mm256_loadu_si256((__m256i *)(src + 2 * 8));
1629
98.2k
  s_256[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 8));
1630
98.2k
  a_256[0] = _mm256_add_epi16(s_256[0], s_256[3]);
1631
98.2k
  a_256[1] = _mm256_add_epi16(s_256[1], s_256[2]);
1632
98.2k
  xy_y_convolve_2tap_16_avx2(a_256[0], a_256[1], coeffs, r);
1633
98.2k
  s_256[0] = s_256[2];
1634
98.2k
  s_256[1] = s_256[3];
1635
98.2k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_4tap_8x2_half_pel_avx2
1636
1637
static inline void xy_y_convolve_4tap_16x2_avx2(
1638
    const int16_t *const src, __m256i s_256[4], __m256i ss_256[4],
1639
271k
    __m256i tt_256[4], const __m256i coeffs[2], __m256i r[4]) {
1640
271k
  s_256[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 16));
1641
271k
  ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
1642
271k
  ss_256[3] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
1643
271k
  s_256[2] = _mm256_loadu_si256((__m256i *)(src + 4 * 16));
1644
271k
  tt_256[1] = _mm256_unpacklo_epi16(s_256[3], s_256[2]);
1645
271k
  tt_256[3] = _mm256_unpackhi_epi16(s_256[3], s_256[2]);
1646
271k
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 0);
1647
271k
  xy_y_convolve_4tap_16_avx2(tt_256, coeffs, r + 2);
1648
271k
  ss_256[0] = ss_256[1];
1649
271k
  ss_256[2] = ss_256[3];
1650
271k
  tt_256[0] = tt_256[1];
1651
271k
  tt_256[2] = tt_256[3];
1652
271k
}
convolve_2d_avx2.c:xy_y_convolve_4tap_16x2_avx2
Line
Count
Source
1639
271k
    __m256i tt_256[4], const __m256i coeffs[2], __m256i r[4]) {
1640
271k
  s_256[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 16));
1641
271k
  ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
1642
271k
  ss_256[3] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
1643
271k
  s_256[2] = _mm256_loadu_si256((__m256i *)(src + 4 * 16));
1644
271k
  tt_256[1] = _mm256_unpacklo_epi16(s_256[3], s_256[2]);
1645
271k
  tt_256[3] = _mm256_unpackhi_epi16(s_256[3], s_256[2]);
1646
271k
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 0);
1647
271k
  xy_y_convolve_4tap_16_avx2(tt_256, coeffs, r + 2);
1648
271k
  ss_256[0] = ss_256[1];
1649
271k
  ss_256[2] = ss_256[3];
1650
271k
  tt_256[0] = tt_256[1];
1651
271k
  tt_256[2] = tt_256[3];
1652
271k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_4tap_16x2_avx2
1653
1654
static inline void xy_y_convolve_4tap_32x2_avx2(
1655
    const int16_t *const src, const ptrdiff_t stride, __m256i s_256[4],
1656
    __m256i ss_256[4], __m256i tt_256[4], const __m256i coeffs[2],
1657
435k
    __m256i r[4]) {
1658
435k
  s_256[3] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
1659
435k
  ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
1660
435k
  ss_256[3] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
1661
435k
  s_256[2] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
1662
435k
  tt_256[1] = _mm256_unpacklo_epi16(s_256[3], s_256[2]);
1663
435k
  tt_256[3] = _mm256_unpackhi_epi16(s_256[3], s_256[2]);
1664
435k
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 0);
1665
435k
  xy_y_convolve_4tap_16_avx2(tt_256, coeffs, r + 2);
1666
435k
  ss_256[0] = ss_256[1];
1667
435k
  ss_256[2] = ss_256[3];
1668
435k
  tt_256[0] = tt_256[1];
1669
435k
  tt_256[2] = tt_256[3];
1670
435k
}
convolve_2d_avx2.c:xy_y_convolve_4tap_32x2_avx2
Line
Count
Source
1657
435k
    __m256i r[4]) {
1658
435k
  s_256[3] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
1659
435k
  ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
1660
435k
  ss_256[3] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
1661
435k
  s_256[2] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
1662
435k
  tt_256[1] = _mm256_unpacklo_epi16(s_256[3], s_256[2]);
1663
435k
  tt_256[3] = _mm256_unpackhi_epi16(s_256[3], s_256[2]);
1664
435k
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 0);
1665
435k
  xy_y_convolve_4tap_16_avx2(tt_256, coeffs, r + 2);
1666
435k
  ss_256[0] = ss_256[1];
1667
435k
  ss_256[2] = ss_256[3];
1668
435k
  tt_256[0] = tt_256[1];
1669
435k
  tt_256[2] = tt_256[3];
1670
435k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_4tap_32x2_avx2
1671
1672
static inline void xy_y_convolve_4tap_16x2_half_pelavx2(
1673
    const int16_t *const src, __m256i s_256[5], const __m256i coeffs[1],
1674
56.5k
    __m256i r[4]) {
1675
56.5k
  __m256i a_256[2];
1676
1677
56.5k
  s_256[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 16));
1678
56.5k
  s_256[4] = _mm256_loadu_si256((__m256i *)(src + 4 * 16));
1679
1680
56.5k
  a_256[0] = _mm256_add_epi16(s_256[0], s_256[3]);
1681
56.5k
  a_256[1] = _mm256_add_epi16(s_256[1], s_256[2]);
1682
56.5k
  xy_y_convolve_2tap_16_avx2(a_256[0], a_256[1], coeffs, r + 0);
1683
1684
56.5k
  a_256[0] = _mm256_add_epi16(s_256[1], s_256[4]);
1685
56.5k
  a_256[1] = _mm256_add_epi16(s_256[2], s_256[3]);
1686
56.5k
  xy_y_convolve_2tap_16_avx2(a_256[0], a_256[1], coeffs, r + 2);
1687
1688
56.5k
  s_256[0] = s_256[2];
1689
56.5k
  s_256[1] = s_256[3];
1690
56.5k
  s_256[2] = s_256[4];
1691
56.5k
}
convolve_2d_avx2.c:xy_y_convolve_4tap_16x2_half_pelavx2
Line
Count
Source
1674
56.5k
    __m256i r[4]) {
1675
56.5k
  __m256i a_256[2];
1676
1677
56.5k
  s_256[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 16));
1678
56.5k
  s_256[4] = _mm256_loadu_si256((__m256i *)(src + 4 * 16));
1679
1680
56.5k
  a_256[0] = _mm256_add_epi16(s_256[0], s_256[3]);
1681
56.5k
  a_256[1] = _mm256_add_epi16(s_256[1], s_256[2]);
1682
56.5k
  xy_y_convolve_2tap_16_avx2(a_256[0], a_256[1], coeffs, r + 0);
1683
1684
56.5k
  a_256[0] = _mm256_add_epi16(s_256[1], s_256[4]);
1685
56.5k
  a_256[1] = _mm256_add_epi16(s_256[2], s_256[3]);
1686
56.5k
  xy_y_convolve_2tap_16_avx2(a_256[0], a_256[1], coeffs, r + 2);
1687
1688
56.5k
  s_256[0] = s_256[2];
1689
56.5k
  s_256[1] = s_256[3];
1690
56.5k
  s_256[2] = s_256[4];
1691
56.5k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_4tap_16x2_half_pelavx2
1692
1693
static inline __m128i xy_y_convolve_6tap_2x2_sse2(const int16_t *const src,
1694
                                                  __m128i s_32[6],
1695
                                                  __m128i ss_128[3],
1696
261k
                                                  const __m128i coeffs[3]) {
1697
261k
  s_32[5] = _mm_cvtsi32_si128(loadu_int32(src + 5 * 2));
1698
261k
  const __m128i src45 = _mm_unpacklo_epi32(s_32[4], s_32[5]);
1699
261k
  s_32[4] = _mm_cvtsi32_si128(loadu_int32(src + 6 * 2));
1700
261k
  const __m128i src56 = _mm_unpacklo_epi32(s_32[5], s_32[4]);
1701
261k
  ss_128[2] = _mm_unpacklo_epi16(src45, src56);
1702
261k
  const __m128i r = convolve16_6tap_sse2(ss_128, coeffs);
1703
261k
  ss_128[0] = ss_128[1];
1704
261k
  ss_128[1] = ss_128[2];
1705
261k
  return r;
1706
261k
}
convolve_2d_avx2.c:xy_y_convolve_6tap_2x2_sse2
Line
Count
Source
1696
261k
                                                  const __m128i coeffs[3]) {
1697
261k
  s_32[5] = _mm_cvtsi32_si128(loadu_int32(src + 5 * 2));
1698
261k
  const __m128i src45 = _mm_unpacklo_epi32(s_32[4], s_32[5]);
1699
261k
  s_32[4] = _mm_cvtsi32_si128(loadu_int32(src + 6 * 2));
1700
261k
  const __m128i src56 = _mm_unpacklo_epi32(s_32[5], s_32[4]);
1701
261k
  ss_128[2] = _mm_unpacklo_epi16(src45, src56);
1702
261k
  const __m128i r = convolve16_6tap_sse2(ss_128, coeffs);
1703
261k
  ss_128[0] = ss_128[1];
1704
261k
  ss_128[1] = ss_128[2];
1705
261k
  return r;
1706
261k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_6tap_2x2_sse2
1707
1708
static inline __m256i xy_y_convolve_6tap_4x2_avx2(const int16_t *const src,
1709
                                                  __m128i s_64[6],
1710
                                                  __m256i ss_256[3],
1711
1.12M
                                                  const __m256i coeffs[3]) {
1712
1.12M
  __m256i s_256[2];
1713
1.12M
  s_64[5] = _mm_loadl_epi64((__m128i *)(src + 5 * 4));
1714
1.12M
  s_256[0] = _mm256_setr_m128i(s_64[4], s_64[5]);
1715
1.12M
  s_64[4] = _mm_loadl_epi64((__m128i *)(src + 6 * 4));
1716
1.12M
  s_256[1] = _mm256_setr_m128i(s_64[5], s_64[4]);
1717
1.12M
  ss_256[2] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1718
1.12M
  const __m256i r = convolve16_6tap_avx2(ss_256, coeffs);
1719
1.12M
  ss_256[0] = ss_256[1];
1720
1.12M
  ss_256[1] = ss_256[2];
1721
1.12M
  return r;
1722
1.12M
}
convolve_2d_avx2.c:xy_y_convolve_6tap_4x2_avx2
Line
Count
Source
1711
1.12M
                                                  const __m256i coeffs[3]) {
1712
1.12M
  __m256i s_256[2];
1713
1.12M
  s_64[5] = _mm_loadl_epi64((__m128i *)(src + 5 * 4));
1714
1.12M
  s_256[0] = _mm256_setr_m128i(s_64[4], s_64[5]);
1715
1.12M
  s_64[4] = _mm_loadl_epi64((__m128i *)(src + 6 * 4));
1716
  s_256[1] = _mm256_setr_m128i(s_64[5], s_64[4]);
1717
1.12M
  ss_256[2] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1718
1.12M
  const __m256i r = convolve16_6tap_avx2(ss_256, coeffs);
1719
1.12M
  ss_256[0] = ss_256[1];
1720
1.12M
  ss_256[1] = ss_256[2];
1721
1.12M
  return r;
1722
1.12M
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_6tap_4x2_avx2
1723
1724
static inline void xy_y_convolve_6tap_16_avx2(const __m256i ss[6],
1725
                                              const __m256i coeffs[3],
1726
9.00M
                                              __m256i r[2]) {
1727
9.00M
  r[0] = convolve16_6tap_avx2(ss, coeffs);
1728
9.00M
  r[1] = convolve16_6tap_avx2(ss + 3, coeffs);
1729
9.00M
}
convolve_2d_avx2.c:xy_y_convolve_6tap_16_avx2
Line
Count
Source
1726
9.00M
                                              __m256i r[2]) {
1727
9.00M
  r[0] = convolve16_6tap_avx2(ss, coeffs);
1728
9.00M
  r[1] = convolve16_6tap_avx2(ss + 3, coeffs);
1729
9.00M
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_6tap_16_avx2
1730
1731
static inline void xy_y_convolve_6tap_8x2_avx2(const int16_t *const src,
1732
                                               __m256i ss_256[6],
1733
                                               const __m256i coeffs[3],
1734
1.09M
                                               __m256i r[2]) {
1735
1.09M
  __m256i s_256[2];
1736
1.09M
  s_256[0] = _mm256_loadu_si256((__m256i *)(src + 4 * 8));
1737
1.09M
  s_256[1] = _mm256_loadu_si256((__m256i *)(src + 5 * 8));
1738
1.09M
  ss_256[2] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1739
1.09M
  ss_256[5] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
1740
1.09M
  xy_y_convolve_6tap_16_avx2(ss_256, coeffs, r);
1741
1.09M
  ss_256[0] = ss_256[1];
1742
1.09M
  ss_256[1] = ss_256[2];
1743
1.09M
  ss_256[3] = ss_256[4];
1744
1.09M
  ss_256[4] = ss_256[5];
1745
1.09M
}
convolve_2d_avx2.c:xy_y_convolve_6tap_8x2_avx2
Line
Count
Source
1734
1.09M
                                               __m256i r[2]) {
1735
1.09M
  __m256i s_256[2];
1736
1.09M
  s_256[0] = _mm256_loadu_si256((__m256i *)(src + 4 * 8));
1737
1.09M
  s_256[1] = _mm256_loadu_si256((__m256i *)(src + 5 * 8));
1738
1.09M
  ss_256[2] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1739
1.09M
  ss_256[5] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
1740
1.09M
  xy_y_convolve_6tap_16_avx2(ss_256, coeffs, r);
1741
1.09M
  ss_256[0] = ss_256[1];
1742
1.09M
  ss_256[1] = ss_256[2];
1743
1.09M
  ss_256[3] = ss_256[4];
1744
1.09M
  ss_256[4] = ss_256[5];
1745
1.09M
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_6tap_8x2_avx2
1746
1747
static inline void xy_y_convolve_6tap_8x2_half_pel_avx2(
1748
    const int16_t *const src, const __m256i coeffs[2], __m256i s_256[6],
1749
321k
    __m256i r[2]) {
1750
321k
  __m256i a_256[2], ss_256[4];
1751
321k
  s_256[4] = _mm256_loadu_si256((__m256i *)(src + 4 * 8));
1752
321k
  s_256[5] = _mm256_loadu_si256((__m256i *)(src + 5 * 8));
1753
321k
  a_256[0] = _mm256_add_epi16(s_256[0], s_256[5]);
1754
321k
  a_256[1] = _mm256_add_epi16(s_256[1], s_256[4]);
1755
321k
  ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
1756
321k
  ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
1757
321k
  ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
1758
321k
  ss_256[3] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
1759
321k
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r);
1760
321k
  s_256[0] = s_256[2];
1761
321k
  s_256[1] = s_256[3];
1762
321k
  s_256[2] = s_256[4];
1763
321k
  s_256[3] = s_256[5];
1764
321k
}
convolve_2d_avx2.c:xy_y_convolve_6tap_8x2_half_pel_avx2
Line
Count
Source
1749
321k
    __m256i r[2]) {
1750
321k
  __m256i a_256[2], ss_256[4];
1751
321k
  s_256[4] = _mm256_loadu_si256((__m256i *)(src + 4 * 8));
1752
321k
  s_256[5] = _mm256_loadu_si256((__m256i *)(src + 5 * 8));
1753
321k
  a_256[0] = _mm256_add_epi16(s_256[0], s_256[5]);
1754
321k
  a_256[1] = _mm256_add_epi16(s_256[1], s_256[4]);
1755
321k
  ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
1756
321k
  ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
1757
321k
  ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
1758
321k
  ss_256[3] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
1759
321k
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r);
1760
321k
  s_256[0] = s_256[2];
1761
321k
  s_256[1] = s_256[3];
1762
321k
  s_256[2] = s_256[4];
1763
321k
  s_256[3] = s_256[5];
1764
321k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_6tap_8x2_half_pel_avx2
1765
1766
static inline void xy_y_convolve_6tap_16x2_avx2(
1767
    const int16_t *const src, const ptrdiff_t stride, __m256i s_256[6],
1768
    __m256i ss_256[6], __m256i tt_256[6], const __m256i coeffs[3],
1769
3.95M
    __m256i r[4]) {
1770
3.95M
  s_256[5] = _mm256_loadu_si256((__m256i *)(src + 5 * stride));
1771
3.95M
  ss_256[2] = _mm256_unpacklo_epi16(s_256[4], s_256[5]);
1772
3.95M
  ss_256[5] = _mm256_unpackhi_epi16(s_256[4], s_256[5]);
1773
3.95M
  s_256[4] = _mm256_loadu_si256((__m256i *)(src + 6 * stride));
1774
3.95M
  tt_256[2] = _mm256_unpacklo_epi16(s_256[5], s_256[4]);
1775
3.95M
  tt_256[5] = _mm256_unpackhi_epi16(s_256[5], s_256[4]);
1776
1777
3.95M
  xy_y_convolve_6tap_16_avx2(ss_256, coeffs, r + 0);
1778
3.95M
  xy_y_convolve_6tap_16_avx2(tt_256, coeffs, r + 2);
1779
1780
3.95M
  ss_256[0] = ss_256[1];
1781
3.95M
  ss_256[1] = ss_256[2];
1782
3.95M
  ss_256[3] = ss_256[4];
1783
3.95M
  ss_256[4] = ss_256[5];
1784
1785
3.95M
  tt_256[0] = tt_256[1];
1786
3.95M
  tt_256[1] = tt_256[2];
1787
3.95M
  tt_256[3] = tt_256[4];
1788
3.95M
  tt_256[4] = tt_256[5];
1789
3.95M
}
convolve_2d_avx2.c:xy_y_convolve_6tap_16x2_avx2
Line
Count
Source
1769
3.95M
    __m256i r[4]) {
1770
3.95M
  s_256[5] = _mm256_loadu_si256((__m256i *)(src + 5 * stride));
1771
3.95M
  ss_256[2] = _mm256_unpacklo_epi16(s_256[4], s_256[5]);
1772
3.95M
  ss_256[5] = _mm256_unpackhi_epi16(s_256[4], s_256[5]);
1773
3.95M
  s_256[4] = _mm256_loadu_si256((__m256i *)(src + 6 * stride));
1774
3.95M
  tt_256[2] = _mm256_unpacklo_epi16(s_256[5], s_256[4]);
1775
3.95M
  tt_256[5] = _mm256_unpackhi_epi16(s_256[5], s_256[4]);
1776
1777
3.95M
  xy_y_convolve_6tap_16_avx2(ss_256, coeffs, r + 0);
1778
3.95M
  xy_y_convolve_6tap_16_avx2(tt_256, coeffs, r + 2);
1779
1780
3.95M
  ss_256[0] = ss_256[1];
1781
3.95M
  ss_256[1] = ss_256[2];
1782
3.95M
  ss_256[3] = ss_256[4];
1783
3.95M
  ss_256[4] = ss_256[5];
1784
1785
3.95M
  tt_256[0] = tt_256[1];
1786
3.95M
  tt_256[1] = tt_256[2];
1787
3.95M
  tt_256[3] = tt_256[4];
1788
3.95M
  tt_256[4] = tt_256[5];
1789
3.95M
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_6tap_16x2_avx2
1790
1791
static inline void xy_y_convolve_6tap_16x2_half_pel_avx2(
1792
    const int16_t *const src, const ptrdiff_t stride, __m256i s_256[6],
1793
302k
    __m256i ss_256[4], const __m256i coeffs[2], __m256i r[4]) {
1794
302k
  __m256i a_256[2];
1795
1796
302k
  s_256[5] = _mm256_loadu_si256((__m256i *)(src + 5 * stride));
1797
302k
  a_256[0] = _mm256_add_epi16(s_256[0], s_256[5]);
1798
302k
  a_256[1] = _mm256_add_epi16(s_256[1], s_256[4]);
1799
302k
  ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
1800
302k
  ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
1801
302k
  ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
1802
302k
  ss_256[3] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
1803
302k
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 0);
1804
1805
302k
  a_256[1] = _mm256_add_epi16(s_256[2], s_256[5]);
1806
302k
  s_256[0] = s_256[2];
1807
302k
  s_256[2] = s_256[4];
1808
302k
  s_256[4] = _mm256_loadu_si256((__m256i *)(src + 6 * stride));
1809
302k
  a_256[0] = _mm256_add_epi16(s_256[1], s_256[4]);
1810
302k
  s_256[1] = s_256[3];
1811
302k
  s_256[3] = s_256[5];
1812
302k
  ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
1813
302k
  ss_256[1] = _mm256_unpacklo_epi16(s_256[1], s_256[2]);
1814
302k
  ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
1815
302k
  ss_256[3] = _mm256_unpackhi_epi16(s_256[1], s_256[2]);
1816
302k
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 2);
1817
302k
}
convolve_2d_avx2.c:xy_y_convolve_6tap_16x2_half_pel_avx2
Line
Count
Source
1793
302k
    __m256i ss_256[4], const __m256i coeffs[2], __m256i r[4]) {
1794
302k
  __m256i a_256[2];
1795
1796
302k
  s_256[5] = _mm256_loadu_si256((__m256i *)(src + 5 * stride));
1797
302k
  a_256[0] = _mm256_add_epi16(s_256[0], s_256[5]);
1798
302k
  a_256[1] = _mm256_add_epi16(s_256[1], s_256[4]);
1799
302k
  ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
1800
302k
  ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
1801
302k
  ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
1802
302k
  ss_256[3] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
1803
302k
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 0);
1804
1805
302k
  a_256[1] = _mm256_add_epi16(s_256[2], s_256[5]);
1806
302k
  s_256[0] = s_256[2];
1807
302k
  s_256[2] = s_256[4];
1808
302k
  s_256[4] = _mm256_loadu_si256((__m256i *)(src + 6 * stride));
1809
302k
  a_256[0] = _mm256_add_epi16(s_256[1], s_256[4]);
1810
302k
  s_256[1] = s_256[3];
1811
302k
  s_256[3] = s_256[5];
1812
302k
  ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
1813
302k
  ss_256[1] = _mm256_unpacklo_epi16(s_256[1], s_256[2]);
1814
302k
  ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
1815
302k
  ss_256[3] = _mm256_unpackhi_epi16(s_256[1], s_256[2]);
1816
302k
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 2);
1817
302k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_6tap_16x2_half_pel_avx2
1818
1819
static inline __m128i xy_y_convolve_8tap_2x2_sse2(const int16_t *const src,
1820
                                                  __m128i s_32[8],
1821
                                                  __m128i ss_128[4],
1822
18.0k
                                                  const __m128i coeffs[4]) {
1823
18.0k
  s_32[7] = _mm_cvtsi32_si128(loadu_int32(src + 7 * 2));
1824
18.0k
  const __m128i src67 = _mm_unpacklo_epi32(s_32[6], s_32[7]);
1825
18.0k
  s_32[6] = _mm_cvtsi32_si128(loadu_int32(src + 8 * 2));
1826
18.0k
  const __m128i src78 = _mm_unpacklo_epi32(s_32[7], s_32[6]);
1827
18.0k
  ss_128[3] = _mm_unpacklo_epi16(src67, src78);
1828
18.0k
  const __m128i r = convolve16_8tap_sse2(ss_128, coeffs);
1829
18.0k
  ss_128[0] = ss_128[1];
1830
18.0k
  ss_128[1] = ss_128[2];
1831
18.0k
  ss_128[2] = ss_128[3];
1832
18.0k
  return r;
1833
18.0k
}
convolve_2d_avx2.c:xy_y_convolve_8tap_2x2_sse2
Line
Count
Source
1822
18.0k
                                                  const __m128i coeffs[4]) {
1823
18.0k
  s_32[7] = _mm_cvtsi32_si128(loadu_int32(src + 7 * 2));
1824
18.0k
  const __m128i src67 = _mm_unpacklo_epi32(s_32[6], s_32[7]);
1825
18.0k
  s_32[6] = _mm_cvtsi32_si128(loadu_int32(src + 8 * 2));
1826
18.0k
  const __m128i src78 = _mm_unpacklo_epi32(s_32[7], s_32[6]);
1827
18.0k
  ss_128[3] = _mm_unpacklo_epi16(src67, src78);
1828
18.0k
  const __m128i r = convolve16_8tap_sse2(ss_128, coeffs);
1829
18.0k
  ss_128[0] = ss_128[1];
1830
18.0k
  ss_128[1] = ss_128[2];
1831
18.0k
  ss_128[2] = ss_128[3];
1832
18.0k
  return r;
1833
18.0k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_8tap_2x2_sse2
1834
1835
static inline __m256i xy_y_convolve_8tap_4x2_avx2(const int16_t *const src,
1836
                                                  __m128i s_64[8],
1837
                                                  __m256i ss_256[4],
1838
74.7k
                                                  const __m256i coeffs[4]) {
1839
74.7k
  __m256i s_256[2];
1840
74.7k
  s_64[7] = _mm_loadl_epi64((__m128i *)(src + 7 * 4));
1841
74.7k
  s_256[0] = _mm256_setr_m128i(s_64[6], s_64[7]);
1842
74.7k
  s_64[6] = _mm_loadl_epi64((__m128i *)(src + 8 * 4));
1843
74.7k
  s_256[1] = _mm256_setr_m128i(s_64[7], s_64[6]);
1844
74.7k
  ss_256[3] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1845
74.7k
  const __m256i r = convolve16_8tap_avx2(ss_256, coeffs);
1846
74.7k
  ss_256[0] = ss_256[1];
1847
74.7k
  ss_256[1] = ss_256[2];
1848
74.7k
  ss_256[2] = ss_256[3];
1849
74.7k
  return r;
1850
74.7k
}
convolve_2d_avx2.c:xy_y_convolve_8tap_4x2_avx2
Line
Count
Source
1838
74.7k
                                                  const __m256i coeffs[4]) {
1839
74.7k
  __m256i s_256[2];
1840
74.7k
  s_64[7] = _mm_loadl_epi64((__m128i *)(src + 7 * 4));
1841
74.7k
  s_256[0] = _mm256_setr_m128i(s_64[6], s_64[7]);
1842
74.7k
  s_64[6] = _mm_loadl_epi64((__m128i *)(src + 8 * 4));
1843
74.7k
  s_256[1] = _mm256_setr_m128i(s_64[7], s_64[6]);
1844
74.7k
  ss_256[3] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1845
74.7k
  const __m256i r = convolve16_8tap_avx2(ss_256, coeffs);
1846
74.7k
  ss_256[0] = ss_256[1];
1847
74.7k
  ss_256[1] = ss_256[2];
1848
74.7k
  ss_256[2] = ss_256[3];
1849
74.7k
  return r;
1850
74.7k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_8tap_4x2_avx2
1851
1852
static inline void xy_y_convolve_8tap_16_avx2(const __m256i *const ss,
1853
                                              const __m256i coeffs[4],
1854
1.97M
                                              __m256i r[2]) {
1855
1.97M
  r[0] = convolve16_8tap_avx2(ss, coeffs);
1856
1.97M
  r[1] = convolve16_8tap_avx2(ss + 4, coeffs);
1857
1.97M
}
convolve_2d_avx2.c:xy_y_convolve_8tap_16_avx2
Line
Count
Source
1854
1.97M
                                              __m256i r[2]) {
1855
1.97M
  r[0] = convolve16_8tap_avx2(ss, coeffs);
1856
1.97M
  r[1] = convolve16_8tap_avx2(ss + 4, coeffs);
1857
1.97M
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_8tap_16_avx2
1858
1859
static inline void xy_y_convolve_8tap_8x2_avx2(const int16_t *const src,
1860
                                               __m256i ss_256[8],
1861
                                               const __m256i coeffs[4],
1862
50.5k
                                               __m256i r[2]) {
1863
50.5k
  __m256i s_256[2];
1864
50.5k
  s_256[0] = _mm256_loadu_si256((__m256i *)(src + 6 * 8));
1865
50.5k
  s_256[1] = _mm256_loadu_si256((__m256i *)(src + 7 * 8));
1866
50.5k
  ss_256[3] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1867
50.5k
  ss_256[7] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
1868
50.5k
  xy_y_convolve_8tap_16_avx2(ss_256, coeffs, r);
1869
50.5k
  ss_256[0] = ss_256[1];
1870
50.5k
  ss_256[1] = ss_256[2];
1871
50.5k
  ss_256[2] = ss_256[3];
1872
50.5k
  ss_256[4] = ss_256[5];
1873
50.5k
  ss_256[5] = ss_256[6];
1874
50.5k
  ss_256[6] = ss_256[7];
1875
50.5k
}
convolve_2d_avx2.c:xy_y_convolve_8tap_8x2_avx2
Line
Count
Source
1862
50.5k
                                               __m256i r[2]) {
1863
50.5k
  __m256i s_256[2];
1864
50.5k
  s_256[0] = _mm256_loadu_si256((__m256i *)(src + 6 * 8));
1865
50.5k
  s_256[1] = _mm256_loadu_si256((__m256i *)(src + 7 * 8));
1866
50.5k
  ss_256[3] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1867
50.5k
  ss_256[7] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
1868
50.5k
  xy_y_convolve_8tap_16_avx2(ss_256, coeffs, r);
1869
50.5k
  ss_256[0] = ss_256[1];
1870
50.5k
  ss_256[1] = ss_256[2];
1871
50.5k
  ss_256[2] = ss_256[3];
1872
50.5k
  ss_256[4] = ss_256[5];
1873
50.5k
  ss_256[5] = ss_256[6];
1874
50.5k
  ss_256[6] = ss_256[7];
1875
50.5k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_8tap_8x2_avx2
1876
1877
static inline void xy_y_convolve_8tap_8x2_half_pel_avx2(
1878
    const int16_t *const src, const __m256i coeffs[2], __m256i s_256[8],
1879
28.2k
    __m256i r[2]) {
1880
28.2k
  __m256i a_256[4], ss_256[4];
1881
1882
28.2k
  s_256[6] = _mm256_loadu_si256((__m256i *)(src + 6 * 8));
1883
28.2k
  s_256[7] = _mm256_loadu_si256((__m256i *)(src + 7 * 8));
1884
28.2k
  a_256[0] = _mm256_add_epi16(s_256[0], s_256[7]);
1885
28.2k
  a_256[1] = _mm256_add_epi16(s_256[1], s_256[6]);
1886
28.2k
  a_256[2] = _mm256_add_epi16(s_256[2], s_256[5]);
1887
28.2k
  a_256[3] = _mm256_add_epi16(s_256[3], s_256[4]);
1888
28.2k
  ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
1889
28.2k
  ss_256[1] = _mm256_unpacklo_epi16(a_256[2], a_256[3]);
1890
28.2k
  ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
1891
28.2k
  ss_256[3] = _mm256_unpackhi_epi16(a_256[2], a_256[3]);
1892
28.2k
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r);
1893
28.2k
  s_256[0] = s_256[2];
1894
28.2k
  s_256[1] = s_256[3];
1895
28.2k
  s_256[2] = s_256[4];
1896
28.2k
  s_256[3] = s_256[5];
1897
28.2k
  s_256[4] = s_256[6];
1898
28.2k
  s_256[5] = s_256[7];
1899
28.2k
}
convolve_2d_avx2.c:xy_y_convolve_8tap_8x2_half_pel_avx2
Line
Count
Source
1879
28.2k
    __m256i r[2]) {
1880
28.2k
  __m256i a_256[4], ss_256[4];
1881
1882
28.2k
  s_256[6] = _mm256_loadu_si256((__m256i *)(src + 6 * 8));
1883
28.2k
  s_256[7] = _mm256_loadu_si256((__m256i *)(src + 7 * 8));
1884
28.2k
  a_256[0] = _mm256_add_epi16(s_256[0], s_256[7]);
1885
28.2k
  a_256[1] = _mm256_add_epi16(s_256[1], s_256[6]);
1886
28.2k
  a_256[2] = _mm256_add_epi16(s_256[2], s_256[5]);
1887
28.2k
  a_256[3] = _mm256_add_epi16(s_256[3], s_256[4]);
1888
28.2k
  ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
1889
28.2k
  ss_256[1] = _mm256_unpacklo_epi16(a_256[2], a_256[3]);
1890
28.2k
  ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
1891
28.2k
  ss_256[3] = _mm256_unpackhi_epi16(a_256[2], a_256[3]);
1892
28.2k
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r);
1893
28.2k
  s_256[0] = s_256[2];
1894
28.2k
  s_256[1] = s_256[3];
1895
28.2k
  s_256[2] = s_256[4];
1896
28.2k
  s_256[3] = s_256[5];
1897
28.2k
  s_256[4] = s_256[6];
1898
28.2k
  s_256[5] = s_256[7];
1899
28.2k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_8tap_8x2_half_pel_avx2
1900
1901
static AOM_FORCE_INLINE void xy_y_convolve_8tap_16x2_avx2(
1902
    const int16_t *const src, const ptrdiff_t stride, const __m256i coeffs[4],
1903
962k
    __m256i s_256[8], __m256i ss_256[8], __m256i tt_256[8], __m256i r[4]) {
1904
962k
  s_256[7] = _mm256_loadu_si256((__m256i *)(src + 7 * stride));
1905
962k
  ss_256[3] = _mm256_unpacklo_epi16(s_256[6], s_256[7]);
1906
962k
  ss_256[7] = _mm256_unpackhi_epi16(s_256[6], s_256[7]);
1907
962k
  s_256[6] = _mm256_loadu_si256((__m256i *)(src + 8 * stride));
1908
962k
  tt_256[3] = _mm256_unpacklo_epi16(s_256[7], s_256[6]);
1909
962k
  tt_256[7] = _mm256_unpackhi_epi16(s_256[7], s_256[6]);
1910
1911
962k
  xy_y_convolve_8tap_16_avx2(ss_256, coeffs, r + 0);
1912
962k
  xy_y_convolve_8tap_16_avx2(tt_256, coeffs, r + 2);
1913
1914
962k
  ss_256[0] = ss_256[1];
1915
962k
  ss_256[1] = ss_256[2];
1916
962k
  ss_256[2] = ss_256[3];
1917
962k
  ss_256[4] = ss_256[5];
1918
962k
  ss_256[5] = ss_256[6];
1919
962k
  ss_256[6] = ss_256[7];
1920
1921
962k
  tt_256[0] = tt_256[1];
1922
962k
  tt_256[1] = tt_256[2];
1923
962k
  tt_256[2] = tt_256[3];
1924
962k
  tt_256[4] = tt_256[5];
1925
962k
  tt_256[5] = tt_256[6];
1926
962k
  tt_256[6] = tt_256[7];
1927
962k
}
convolve_2d_avx2.c:xy_y_convolve_8tap_16x2_avx2
Line
Count
Source
1903
962k
    __m256i s_256[8], __m256i ss_256[8], __m256i tt_256[8], __m256i r[4]) {
1904
962k
  s_256[7] = _mm256_loadu_si256((__m256i *)(src + 7 * stride));
1905
962k
  ss_256[3] = _mm256_unpacklo_epi16(s_256[6], s_256[7]);
1906
962k
  ss_256[7] = _mm256_unpackhi_epi16(s_256[6], s_256[7]);
1907
962k
  s_256[6] = _mm256_loadu_si256((__m256i *)(src + 8 * stride));
1908
962k
  tt_256[3] = _mm256_unpacklo_epi16(s_256[7], s_256[6]);
1909
962k
  tt_256[7] = _mm256_unpackhi_epi16(s_256[7], s_256[6]);
1910
1911
962k
  xy_y_convolve_8tap_16_avx2(ss_256, coeffs, r + 0);
1912
962k
  xy_y_convolve_8tap_16_avx2(tt_256, coeffs, r + 2);
1913
1914
962k
  ss_256[0] = ss_256[1];
1915
962k
  ss_256[1] = ss_256[2];
1916
962k
  ss_256[2] = ss_256[3];
1917
962k
  ss_256[4] = ss_256[5];
1918
962k
  ss_256[5] = ss_256[6];
1919
962k
  ss_256[6] = ss_256[7];
1920
1921
962k
  tt_256[0] = tt_256[1];
1922
962k
  tt_256[1] = tt_256[2];
1923
962k
  tt_256[2] = tt_256[3];
1924
962k
  tt_256[4] = tt_256[5];
1925
962k
  tt_256[5] = tt_256[6];
1926
962k
  tt_256[6] = tt_256[7];
1927
962k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_8tap_16x2_avx2
1928
1929
static inline void xy_y_convolve_8tap_16x2_half_pel_avx2(
1930
    const int16_t *const src, const ptrdiff_t stride, const __m256i coeffs[4],
1931
24.8k
    __m256i s_256[8], __m256i r[4]) {
1932
24.8k
  __m256i a_256[4], ss_256[4];
1933
24.8k
  s_256[7] = _mm256_loadu_si256((__m256i *)(src + 7 * stride));
1934
1935
24.8k
  a_256[0] = _mm256_add_epi16(s_256[0], s_256[7]);
1936
24.8k
  a_256[1] = _mm256_add_epi16(s_256[1], s_256[6]);
1937
24.8k
  a_256[2] = _mm256_add_epi16(s_256[2], s_256[5]);
1938
24.8k
  a_256[3] = _mm256_add_epi16(s_256[3], s_256[4]);
1939
24.8k
  ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
1940
24.8k
  ss_256[1] = _mm256_unpacklo_epi16(a_256[2], a_256[3]);
1941
24.8k
  ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
1942
24.8k
  ss_256[3] = _mm256_unpackhi_epi16(a_256[2], a_256[3]);
1943
1944
24.8k
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 0);
1945
1946
24.8k
  a_256[1] = _mm256_add_epi16(s_256[2], s_256[7]);
1947
24.8k
  a_256[2] = _mm256_add_epi16(s_256[3], s_256[6]);
1948
24.8k
  a_256[3] = _mm256_add_epi16(s_256[4], s_256[5]);
1949
24.8k
  s_256[0] = s_256[2];
1950
24.8k
  s_256[2] = s_256[4];
1951
24.8k
  s_256[4] = s_256[6];
1952
24.8k
  s_256[6] = _mm256_loadu_si256((__m256i *)(src + 8 * stride));
1953
1954
24.8k
  a_256[0] = _mm256_add_epi16(s_256[1], s_256[6]);
1955
24.8k
  s_256[1] = s_256[3];
1956
24.8k
  s_256[3] = s_256[5];
1957
24.8k
  s_256[5] = s_256[7];
1958
24.8k
  ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
1959
24.8k
  ss_256[1] = _mm256_unpacklo_epi16(a_256[2], a_256[3]);
1960
24.8k
  ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
1961
24.8k
  ss_256[3] = _mm256_unpackhi_epi16(a_256[2], a_256[3]);
1962
1963
24.8k
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 2);
1964
24.8k
}
convolve_2d_avx2.c:xy_y_convolve_8tap_16x2_half_pel_avx2
Line
Count
Source
1931
24.8k
    __m256i s_256[8], __m256i r[4]) {
1932
24.8k
  __m256i a_256[4], ss_256[4];
1933
24.8k
  s_256[7] = _mm256_loadu_si256((__m256i *)(src + 7 * stride));
1934
1935
24.8k
  a_256[0] = _mm256_add_epi16(s_256[0], s_256[7]);
1936
24.8k
  a_256[1] = _mm256_add_epi16(s_256[1], s_256[6]);
1937
24.8k
  a_256[2] = _mm256_add_epi16(s_256[2], s_256[5]);
1938
24.8k
  a_256[3] = _mm256_add_epi16(s_256[3], s_256[4]);
1939
24.8k
  ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
1940
24.8k
  ss_256[1] = _mm256_unpacklo_epi16(a_256[2], a_256[3]);
1941
24.8k
  ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
1942
24.8k
  ss_256[3] = _mm256_unpackhi_epi16(a_256[2], a_256[3]);
1943
1944
24.8k
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 0);
1945
1946
24.8k
  a_256[1] = _mm256_add_epi16(s_256[2], s_256[7]);
1947
24.8k
  a_256[2] = _mm256_add_epi16(s_256[3], s_256[6]);
1948
24.8k
  a_256[3] = _mm256_add_epi16(s_256[4], s_256[5]);
1949
24.8k
  s_256[0] = s_256[2];
1950
24.8k
  s_256[2] = s_256[4];
1951
24.8k
  s_256[4] = s_256[6];
1952
24.8k
  s_256[6] = _mm256_loadu_si256((__m256i *)(src + 8 * stride));
1953
1954
24.8k
  a_256[0] = _mm256_add_epi16(s_256[1], s_256[6]);
1955
24.8k
  s_256[1] = s_256[3];
1956
24.8k
  s_256[3] = s_256[5];
1957
24.8k
  s_256[5] = s_256[7];
1958
24.8k
  ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
1959
24.8k
  ss_256[1] = _mm256_unpacklo_epi16(a_256[2], a_256[3]);
1960
24.8k
  ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
1961
24.8k
  ss_256[3] = _mm256_unpackhi_epi16(a_256[2], a_256[3]);
1962
1963
24.8k
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 2);
1964
24.8k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_8tap_16x2_half_pel_avx2
1965
1966
static inline void xy_y_round_store_8x2_avx2(const __m256i res[2],
1967
                                             uint8_t *const dst,
1968
2.18M
                                             const ptrdiff_t stride) {
1969
2.18M
  const __m256i r = xy_y_round_16_avx2(res);
1970
2.18M
  pack_store_8x2_avx2(r, dst, stride);
1971
2.18M
}
convolve_2d_avx2.c:xy_y_round_store_8x2_avx2
Line
Count
Source
1968
2.18M
                                             const ptrdiff_t stride) {
1969
2.18M
  const __m256i r = xy_y_round_16_avx2(res);
1970
2.18M
  pack_store_8x2_avx2(r, dst, stride);
1971
2.18M
}
Unexecuted instantiation: convolve_avx2.c:xy_y_round_store_8x2_avx2
1972
1973
static inline void xy_y_round_store_16x2_avx2(const __m256i res[4],
1974
                                              uint8_t *const dst,
1975
1.64M
                                              const ptrdiff_t stride) {
1976
1.64M
  const __m256i r0 = xy_y_round_16_avx2(res + 0);
1977
1.64M
  const __m256i r1 = xy_y_round_16_avx2(res + 2);
1978
1.64M
  xy_y_pack_store_16x2_avx2(r0, r1, dst, stride);
1979
1.64M
}
convolve_2d_avx2.c:xy_y_round_store_16x2_avx2
Line
Count
Source
1975
1.64M
                                              const ptrdiff_t stride) {
1976
1.64M
  const __m256i r0 = xy_y_round_16_avx2(res + 0);
1977
1.64M
  const __m256i r1 = xy_y_round_16_avx2(res + 2);
1978
1.64M
  xy_y_pack_store_16x2_avx2(r0, r1, dst, stride);
1979
1.64M
}
Unexecuted instantiation: convolve_avx2.c:xy_y_round_store_16x2_avx2
1980
1981
static inline void sr_y_round_store_32_avx2(const __m256i res[2],
1982
2.15M
                                            uint8_t *const dst) {
1983
2.15M
  __m256i r[2];
1984
1985
2.15M
  r[0] = sr_y_round_avx2(res[0]);
1986
2.15M
  r[1] = sr_y_round_avx2(res[1]);
1987
2.15M
  convolve_store_32_avx2(r[0], r[1], dst);
1988
2.15M
}
Unexecuted instantiation: convolve_2d_avx2.c:sr_y_round_store_32_avx2
convolve_avx2.c:sr_y_round_store_32_avx2
Line
Count
Source
1982
2.15M
                                            uint8_t *const dst) {
1983
2.15M
  __m256i r[2];
1984
1985
2.15M
  r[0] = sr_y_round_avx2(res[0]);
1986
2.15M
  r[1] = sr_y_round_avx2(res[1]);
1987
2.15M
  convolve_store_32_avx2(r[0], r[1], dst);
1988
2.15M
}
1989
1990
static inline void sr_y_round_store_32x2_avx2(const __m256i res[4],
1991
                                              uint8_t *const dst,
1992
982k
                                              const int32_t dst_stride) {
1993
982k
  sr_y_round_store_32_avx2(res, dst);
1994
982k
  sr_y_round_store_32_avx2(res + 2, dst + dst_stride);
1995
982k
}
Unexecuted instantiation: convolve_2d_avx2.c:sr_y_round_store_32x2_avx2
convolve_avx2.c:sr_y_round_store_32x2_avx2
Line
Count
Source
1992
982k
                                              const int32_t dst_stride) {
1993
982k
  sr_y_round_store_32_avx2(res, dst);
1994
982k
  sr_y_round_store_32_avx2(res + 2, dst + dst_stride);
1995
982k
}
1996
1997
static inline void sr_y_2tap_32_avx2(const uint8_t *const src,
1998
                                     const __m256i coeffs[1], const __m256i s0,
1999
187k
                                     __m256i *const s1, uint8_t *const dst) {
2000
187k
  __m256i r[2];
2001
187k
  y_convolve_2tap_32_avx2(src, coeffs, s0, s1, r);
2002
187k
  sr_y_round_store_32_avx2(r, dst);
2003
187k
}
Unexecuted instantiation: convolve_2d_avx2.c:sr_y_2tap_32_avx2
convolve_avx2.c:sr_y_2tap_32_avx2
Line
Count
Source
1999
187k
                                     __m256i *const s1, uint8_t *const dst) {
2000
187k
  __m256i r[2];
2001
187k
  y_convolve_2tap_32_avx2(src, coeffs, s0, s1, r);
2002
187k
  sr_y_round_store_32_avx2(r, dst);
2003
187k
}
2004
2005
static AOM_FORCE_INLINE void av1_convolve_y_sr_specialized_avx2(
2006
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
2007
    int32_t w, int32_t h, const InterpFilterParams *filter_params_y,
2008
795k
    const int32_t subpel_y_q4) {
2009
795k
  int32_t x, y;
2010
795k
  __m128i coeffs_128[4];
2011
795k
  __m256i coeffs_256[4];
2012
2013
795k
  int vert_tap = get_filter_tap(filter_params_y, subpel_y_q4);
2014
2015
795k
  if (vert_tap == 2) {
2016
    // vert_filt as 2 tap
2017
53.7k
    const uint8_t *src_ptr = src;
2018
2019
53.7k
    y = h;
2020
2021
53.7k
    if (subpel_y_q4 != 8) {
2022
31.1k
      if (w <= 8) {
2023
24.5k
        prepare_half_coeffs_2tap_ssse3(filter_params_y, subpel_y_q4,
2024
24.5k
                                       coeffs_128);
2025
2026
24.5k
        if (w == 2) {
2027
4.22k
          __m128i s_16[2];
2028
2029
4.22k
          s_16[0] = _mm_cvtsi32_si128(*(int16_t *)src_ptr);
2030
2031
7.14k
          do {
2032
7.14k
            const __m128i res = y_convolve_2tap_2x2_ssse3(src_ptr, src_stride,
2033
7.14k
                                                          coeffs_128, s_16);
2034
7.14k
            const __m128i r = sr_y_round_sse2(res);
2035
7.14k
            pack_store_2x2_sse2(r, dst, dst_stride);
2036
7.14k
            src_ptr += 2 * src_stride;
2037
7.14k
            dst += 2 * dst_stride;
2038
7.14k
            y -= 2;
2039
7.14k
          } while (y);
2040
20.2k
        } else if (w == 4) {
2041
11.2k
          __m128i s_32[2];
2042
2043
11.2k
          s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr));
2044
2045
37.8k
          do {
2046
37.8k
            const __m128i res = y_convolve_2tap_4x2_ssse3(src_ptr, src_stride,
2047
37.8k
                                                          coeffs_128, s_32);
2048
37.8k
            const __m128i r = sr_y_round_sse2(res);
2049
37.8k
            pack_store_4x2_sse2(r, dst, dst_stride);
2050
37.8k
            src_ptr += 2 * src_stride;
2051
37.8k
            dst += 2 * dst_stride;
2052
37.8k
            y -= 2;
2053
37.8k
          } while (y);
2054
11.2k
        } else {
2055
9.01k
          __m128i s_64[2], s_128[2];
2056
2057
9.01k
          assert(w == 8);
2058
2059
9.01k
          s_64[0] = _mm_loadl_epi64((__m128i *)src_ptr);
2060
2061
43.3k
          do {
2062
            // Note: Faster than binding to AVX2 registers.
2063
43.3k
            s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + src_stride));
2064
43.3k
            s_128[0] = _mm_unpacklo_epi64(s_64[0], s_64[1]);
2065
43.3k
            s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
2066
43.3k
            s_128[1] = _mm_unpacklo_epi64(s_64[1], s_64[0]);
2067
43.3k
            const __m128i ss0 = _mm_unpacklo_epi8(s_128[0], s_128[1]);
2068
43.3k
            const __m128i ss1 = _mm_unpackhi_epi8(s_128[0], s_128[1]);
2069
43.3k
            const __m128i res0 = convolve_2tap_ssse3(&ss0, coeffs_128);
2070
43.3k
            const __m128i res1 = convolve_2tap_ssse3(&ss1, coeffs_128);
2071
43.3k
            const __m128i r0 = sr_y_round_sse2(res0);
2072
43.3k
            const __m128i r1 = sr_y_round_sse2(res1);
2073
43.3k
            const __m128i d = _mm_packus_epi16(r0, r1);
2074
43.3k
            _mm_storel_epi64((__m128i *)dst, d);
2075
43.3k
            _mm_storeh_epi64((__m128i *)(dst + dst_stride), d);
2076
43.3k
            src_ptr += 2 * src_stride;
2077
43.3k
            dst += 2 * dst_stride;
2078
43.3k
            y -= 2;
2079
43.3k
          } while (y);
2080
9.01k
        }
2081
24.5k
      } else {
2082
6.60k
        prepare_half_coeffs_2tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
2083
2084
6.60k
        if (w == 16) {
2085
4.35k
          __m128i s_128[2];
2086
2087
4.35k
          s_128[0] = _mm_loadu_si128((__m128i *)src_ptr);
2088
2089
33.2k
          do {
2090
33.2k
            __m256i r[2];
2091
2092
33.2k
            y_convolve_2tap_16x2_avx2(src_ptr, src_stride, coeffs_256, s_128,
2093
33.2k
                                      r);
2094
33.2k
            sr_y_round_store_16x2_avx2(r, dst, dst_stride);
2095
33.2k
            src_ptr += 2 * src_stride;
2096
33.2k
            dst += 2 * dst_stride;
2097
33.2k
            y -= 2;
2098
33.2k
          } while (y);
2099
4.35k
        } else if (w == 32) {
2100
1.22k
          __m256i s_256[2];
2101
2102
1.22k
          s_256[0] = _mm256_loadu_si256((__m256i *)src_ptr);
2103
2104
15.1k
          do {
2105
15.1k
            sr_y_2tap_32_avx2(src_ptr + src_stride, coeffs_256, s_256[0],
2106
15.1k
                              &s_256[1], dst);
2107
15.1k
            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride, coeffs_256, s_256[1],
2108
15.1k
                              &s_256[0], dst + dst_stride);
2109
15.1k
            src_ptr += 2 * src_stride;
2110
15.1k
            dst += 2 * dst_stride;
2111
15.1k
            y -= 2;
2112
15.1k
          } while (y);
2113
1.22k
        } else if (w == 64) {
2114
828
          __m256i s_256[2][2];
2115
2116
828
          s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
2117
828
          s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));
2118
2119
19.7k
          do {
2120
19.7k
            sr_y_2tap_32_avx2(src_ptr + src_stride, coeffs_256, s_256[0][0],
2121
19.7k
                              &s_256[1][0], dst);
2122
19.7k
            sr_y_2tap_32_avx2(src_ptr + src_stride + 32, coeffs_256,
2123
19.7k
                              s_256[0][1], &s_256[1][1], dst + 32);
2124
19.7k
            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride, coeffs_256, s_256[1][0],
2125
19.7k
                              &s_256[0][0], dst + dst_stride);
2126
19.7k
            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride + 32, coeffs_256,
2127
19.7k
                              s_256[1][1], &s_256[0][1], dst + dst_stride + 32);
2128
2129
19.7k
            src_ptr += 2 * src_stride;
2130
19.7k
            dst += 2 * dst_stride;
2131
19.7k
            y -= 2;
2132
19.7k
          } while (y);
2133
828
        } else {
2134
196
          __m256i s_256[2][4];
2135
2136
196
          assert(w == 128);
2137
2138
196
          s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
2139
196
          s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));
2140
196
          s_256[0][2] = _mm256_loadu_si256((__m256i *)(src_ptr + 2 * 32));
2141
196
          s_256[0][3] = _mm256_loadu_si256((__m256i *)(src_ptr + 3 * 32));
2142
2143
9.82k
          do {
2144
9.82k
            sr_y_2tap_32_avx2(src_ptr + src_stride, coeffs_256, s_256[0][0],
2145
9.82k
                              &s_256[1][0], dst);
2146
9.82k
            sr_y_2tap_32_avx2(src_ptr + src_stride + 1 * 32, coeffs_256,
2147
9.82k
                              s_256[0][1], &s_256[1][1], dst + 1 * 32);
2148
9.82k
            sr_y_2tap_32_avx2(src_ptr + src_stride + 2 * 32, coeffs_256,
2149
9.82k
                              s_256[0][2], &s_256[1][2], dst + 2 * 32);
2150
9.82k
            sr_y_2tap_32_avx2(src_ptr + src_stride + 3 * 32, coeffs_256,
2151
9.82k
                              s_256[0][3], &s_256[1][3], dst + 3 * 32);
2152
2153
9.82k
            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride, coeffs_256, s_256[1][0],
2154
9.82k
                              &s_256[0][0], dst + dst_stride);
2155
9.82k
            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride + 1 * 32, coeffs_256,
2156
9.82k
                              s_256[1][1], &s_256[0][1],
2157
9.82k
                              dst + dst_stride + 1 * 32);
2158
9.82k
            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride + 2 * 32, coeffs_256,
2159
9.82k
                              s_256[1][2], &s_256[0][2],
2160
9.82k
                              dst + dst_stride + 2 * 32);
2161
9.82k
            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride + 3 * 32, coeffs_256,
2162
9.82k
                              s_256[1][3], &s_256[0][3],
2163
9.82k
                              dst + dst_stride + 3 * 32);
2164
2165
9.82k
            src_ptr += 2 * src_stride;
2166
9.82k
            dst += 2 * dst_stride;
2167
9.82k
            y -= 2;
2168
9.82k
          } while (y);
2169
196
        }
2170
6.60k
      }
2171
31.1k
    } else {
2172
      // average to get half pel
2173
22.5k
      if (w <= 8) {
2174
19.4k
        if (w == 2) {
2175
4.06k
          __m128i s_16[2];
2176
2177
4.06k
          s_16[0] = _mm_cvtsi32_si128(*(int16_t *)src_ptr);
2178
2179
7.84k
          do {
2180
7.84k
            s_16[1] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + src_stride));
2181
7.84k
            const __m128i d0 = _mm_avg_epu8(s_16[0], s_16[1]);
2182
7.84k
            *(int16_t *)dst = (int16_t)_mm_cvtsi128_si32(d0);
2183
7.84k
            s_16[0] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 2 * src_stride));
2184
7.84k
            const __m128i d1 = _mm_avg_epu8(s_16[1], s_16[0]);
2185
7.84k
            *(int16_t *)(dst + dst_stride) = (int16_t)_mm_cvtsi128_si32(d1);
2186
7.84k
            src_ptr += 2 * src_stride;
2187
7.84k
            dst += 2 * dst_stride;
2188
7.84k
            y -= 2;
2189
7.84k
          } while (y);
2190
15.3k
        } else if (w == 4) {
2191
9.17k
          __m128i s_32[2];
2192
2193
9.17k
          s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr));
2194
2195
25.7k
          do {
2196
25.7k
            s_32[1] = _mm_cvtsi32_si128(loadu_int32(src_ptr + src_stride));
2197
25.7k
            const __m128i d0 = _mm_avg_epu8(s_32[0], s_32[1]);
2198
25.7k
            xx_storel_32(dst, d0);
2199
25.7k
            s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 2 * src_stride));
2200
25.7k
            const __m128i d1 = _mm_avg_epu8(s_32[1], s_32[0]);
2201
25.7k
            xx_storel_32(dst + dst_stride, d1);
2202
25.7k
            src_ptr += 2 * src_stride;
2203
25.7k
            dst += 2 * dst_stride;
2204
25.7k
            y -= 2;
2205
25.7k
          } while (y);
2206
9.17k
        } else {
2207
6.21k
          __m128i s_64[2];
2208
2209
6.21k
          assert(w == 8);
2210
2211
6.21k
          s_64[0] = _mm_loadl_epi64((__m128i *)src_ptr);
2212
2213
18.9k
          do {
2214
            // Note: Faster than binding to AVX2 registers.
2215
18.9k
            s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + src_stride));
2216
18.9k
            const __m128i d0 = _mm_avg_epu8(s_64[0], s_64[1]);
2217
18.9k
            _mm_storel_epi64((__m128i *)dst, d0);
2218
18.9k
            s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
2219
18.9k
            const __m128i d1 = _mm_avg_epu8(s_64[1], s_64[0]);
2220
18.9k
            _mm_storel_epi64((__m128i *)(dst + dst_stride), d1);
2221
18.9k
            src_ptr += 2 * src_stride;
2222
18.9k
            dst += 2 * dst_stride;
2223
18.9k
            y -= 2;
2224
18.9k
          } while (y);
2225
6.21k
        }
2226
19.4k
      } else if (w == 16) {
2227
2.08k
        __m128i s_128[2];
2228
2229
2.08k
        s_128[0] = _mm_loadu_si128((__m128i *)src_ptr);
2230
2231
11.6k
        do {
2232
11.6k
          s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + src_stride));
2233
11.6k
          const __m128i d0 = _mm_avg_epu8(s_128[0], s_128[1]);
2234
11.6k
          _mm_storeu_si128((__m128i *)dst, d0);
2235
11.6k
          s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
2236
11.6k
          const __m128i d1 = _mm_avg_epu8(s_128[1], s_128[0]);
2237
11.6k
          _mm_storeu_si128((__m128i *)(dst + dst_stride), d1);
2238
11.6k
          src_ptr += 2 * src_stride;
2239
11.6k
          dst += 2 * dst_stride;
2240
11.6k
          y -= 2;
2241
11.6k
        } while (y);
2242
2.08k
      } else if (w == 32) {
2243
717
        __m256i s_256[2];
2244
2245
717
        s_256[0] = _mm256_loadu_si256((__m256i *)src_ptr);
2246
2247
7.99k
        do {
2248
7.99k
          sr_y_2tap_32_avg_avx2(src_ptr + src_stride, s_256[0], &s_256[1], dst);
2249
7.99k
          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride, s_256[1], &s_256[0],
2250
7.99k
                                dst + dst_stride);
2251
7.99k
          src_ptr += 2 * src_stride;
2252
7.99k
          dst += 2 * dst_stride;
2253
7.99k
          y -= 2;
2254
7.99k
        } while (y);
2255
717
      } else if (w == 64) {
2256
156
        __m256i s_256[2][2];
2257
2258
156
        s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
2259
156
        s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));
2260
2261
3.26k
        do {
2262
3.26k
          sr_y_2tap_32_avg_avx2(src_ptr + src_stride, s_256[0][0], &s_256[1][0],
2263
3.26k
                                dst);
2264
3.26k
          sr_y_2tap_32_avg_avx2(src_ptr + src_stride + 32, s_256[0][1],
2265
3.26k
                                &s_256[1][1], dst + 32);
2266
2267
3.26k
          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride, s_256[1][0],
2268
3.26k
                                &s_256[0][0], dst + dst_stride);
2269
3.26k
          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride + 32, s_256[1][1],
2270
3.26k
                                &s_256[0][1], dst + dst_stride + 32);
2271
2272
3.26k
          src_ptr += 2 * src_stride;
2273
3.26k
          dst += 2 * dst_stride;
2274
3.26k
          y -= 2;
2275
3.26k
        } while (y);
2276
178
      } else {
2277
178
        __m256i s_256[2][4];
2278
2279
178
        assert(w == 128);
2280
2281
178
        s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
2282
178
        s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));
2283
178
        s_256[0][2] = _mm256_loadu_si256((__m256i *)(src_ptr + 2 * 32));
2284
178
        s_256[0][3] = _mm256_loadu_si256((__m256i *)(src_ptr + 3 * 32));
2285
2286
7.64k
        do {
2287
7.64k
          sr_y_2tap_32_avg_avx2(src_ptr + src_stride, s_256[0][0], &s_256[1][0],
2288
7.64k
                                dst);
2289
7.64k
          sr_y_2tap_32_avg_avx2(src_ptr + src_stride + 1 * 32, s_256[0][1],
2290
7.64k
                                &s_256[1][1], dst + 1 * 32);
2291
7.64k
          sr_y_2tap_32_avg_avx2(src_ptr + src_stride + 2 * 32, s_256[0][2],
2292
7.64k
                                &s_256[1][2], dst + 2 * 32);
2293
7.64k
          sr_y_2tap_32_avg_avx2(src_ptr + src_stride + 3 * 32, s_256[0][3],
2294
7.64k
                                &s_256[1][3], dst + 3 * 32);
2295
2296
7.64k
          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride, s_256[1][0],
2297
7.64k
                                &s_256[0][0], dst + dst_stride);
2298
7.64k
          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride + 1 * 32, s_256[1][1],
2299
7.64k
                                &s_256[0][1], dst + dst_stride + 1 * 32);
2300
7.64k
          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride + 2 * 32, s_256[1][2],
2301
7.64k
                                &s_256[0][2], dst + dst_stride + 2 * 32);
2302
7.64k
          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride + 3 * 32, s_256[1][3],
2303
7.64k
                                &s_256[0][3], dst + dst_stride + 3 * 32);
2304
2305
7.64k
          src_ptr += 2 * src_stride;
2306
7.64k
          dst += 2 * dst_stride;
2307
7.64k
          y -= 2;
2308
7.64k
        } while (y);
2309
178
      }
2310
22.5k
    }
2311
742k
  } else if (vert_tap == 4) {
2312
    // vert_filt as 4 tap
2313
386k
    const uint8_t *src_ptr = src - src_stride;
2314
2315
386k
    y = h;
2316
2317
386k
    if (w <= 4) {
2318
184k
      prepare_half_coeffs_4tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);
2319
2320
184k
      if (w == 2) {
2321
32.6k
        __m128i s_16[4], ss_128[2];
2322
2323
32.6k
        s_16[0] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 0 * src_stride));
2324
32.6k
        s_16[1] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 1 * src_stride));
2325
32.6k
        s_16[2] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 2 * src_stride));
2326
2327
32.6k
        const __m128i src01 = _mm_unpacklo_epi16(s_16[0], s_16[1]);
2328
32.6k
        const __m128i src12 = _mm_unpacklo_epi16(s_16[1], s_16[2]);
2329
2330
32.6k
        ss_128[0] = _mm_unpacklo_epi8(src01, src12);
2331
2332
56.4k
        do {
2333
56.4k
          src_ptr += 2 * src_stride;
2334
56.4k
          const __m128i res = y_convolve_4tap_2x2_ssse3(
2335
56.4k
              src_ptr, src_stride, coeffs_128, s_16, ss_128);
2336
56.4k
          const __m128i r = sr_y_round_sse2(res);
2337
56.4k
          pack_store_2x2_sse2(r, dst, dst_stride);
2338
2339
56.4k
          ss_128[0] = ss_128[1];
2340
56.4k
          dst += 2 * dst_stride;
2341
56.4k
          y -= 2;
2342
56.4k
        } while (y);
2343
152k
      } else {
2344
152k
        __m128i s_32[4], ss_128[2];
2345
2346
152k
        assert(w == 4);
2347
2348
152k
        s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 0 * src_stride));
2349
152k
        s_32[1] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 1 * src_stride));
2350
152k
        s_32[2] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 2 * src_stride));
2351
2352
152k
        const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
2353
152k
        const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
2354
2355
152k
        ss_128[0] = _mm_unpacklo_epi8(src01, src12);
2356
2357
300k
        do {
2358
300k
          src_ptr += 2 * src_stride;
2359
300k
          const __m128i res = y_convolve_4tap_4x2_ssse3(
2360
300k
              src_ptr, src_stride, coeffs_128, s_32, ss_128);
2361
300k
          const __m128i r = sr_y_round_sse2(res);
2362
300k
          pack_store_4x2_sse2(r, dst, dst_stride);
2363
2364
300k
          ss_128[0] = ss_128[1];
2365
300k
          dst += 2 * dst_stride;
2366
300k
          y -= 2;
2367
300k
        } while (y);
2368
152k
      }
2369
201k
    } else {
2370
201k
      prepare_half_coeffs_4tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
2371
2372
201k
      if (w == 8) {
2373
131k
        __m128i s_64[4];
2374
131k
        __m256i ss_256[2];
2375
2376
131k
        s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 0 * src_stride));
2377
131k
        s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + 1 * src_stride));
2378
131k
        s_64[2] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
2379
2380
        // Load lines a and b. Line a to lower 128, line b to upper 128
2381
131k
        const __m256i src01 = _mm256_setr_m128i(s_64[0], s_64[1]);
2382
131k
        const __m256i src12 = _mm256_setr_m128i(s_64[1], s_64[2]);
2383
2384
131k
        ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
2385
2386
261k
        do {
2387
261k
          src_ptr += 2 * src_stride;
2388
261k
          const __m256i res = y_convolve_4tap_8x2_avx2(
2389
261k
              src_ptr, src_stride, coeffs_256, s_64, ss_256);
2390
261k
          sr_y_round_store_8x2_avx2(res, dst, dst_stride);
2391
2392
261k
          ss_256[0] = ss_256[1];
2393
261k
          dst += 2 * dst_stride;
2394
261k
          y -= 2;
2395
261k
        } while (y);
2396
131k
      } else if (w == 16) {
2397
63.6k
        __m128i s_128[4];
2398
63.6k
        __m256i ss_256[4], r[2];
2399
2400
63.6k
        s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 0 * src_stride));
2401
63.6k
        s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + 1 * src_stride));
2402
63.6k
        s_128[2] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
2403
2404
        // Load lines a and b. Line a to lower 128, line b to upper 128
2405
63.6k
        const __m256i src01 = _mm256_setr_m128i(s_128[0], s_128[1]);
2406
63.6k
        const __m256i src12 = _mm256_setr_m128i(s_128[1], s_128[2]);
2407
2408
63.6k
        ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
2409
63.6k
        ss_256[2] = _mm256_unpackhi_epi8(src01, src12);
2410
2411
157k
        do {
2412
157k
          src_ptr += 2 * src_stride;
2413
157k
          y_convolve_4tap_16x2_avx2(src_ptr, src_stride, coeffs_256, s_128,
2414
157k
                                    ss_256, r);
2415
157k
          sr_y_round_store_16x2_avx2(r, dst, dst_stride);
2416
2417
157k
          ss_256[0] = ss_256[1];
2418
157k
          ss_256[2] = ss_256[3];
2419
157k
          dst += 2 * dst_stride;
2420
157k
          y -= 2;
2421
157k
        } while (y);
2422
63.6k
      } else if (w == 32) {
2423
        // AV1 standard won't have 32x4 case.
2424
        // This only favors some optimization feature which
2425
        // subsamples 32x8 to 32x4 and triggers 4-tap filter.
2426
2427
4.05k
        __m256i s_256[4], ss_256[4], tt_256[4], r[4];
2428
2429
4.05k
        s_256[0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * src_stride));
2430
4.05k
        s_256[1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * src_stride));
2431
4.05k
        s_256[2] = _mm256_loadu_si256((__m256i *)(src_ptr + 2 * src_stride));
2432
2433
4.05k
        ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
2434
4.05k
        ss_256[2] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
2435
2436
4.05k
        tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
2437
4.05k
        tt_256[2] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
2438
2439
31.1k
        do {
2440
31.1k
          src_ptr += 2 * src_stride;
2441
31.1k
          y_convolve_4tap_32x2_avx2(src_ptr, src_stride, coeffs_256, s_256,
2442
31.1k
                                    ss_256, tt_256, r);
2443
31.1k
          sr_y_round_store_32x2_avx2(r, dst, dst_stride);
2444
2445
31.1k
          ss_256[0] = ss_256[1];
2446
31.1k
          ss_256[2] = ss_256[3];
2447
2448
31.1k
          tt_256[0] = tt_256[1];
2449
31.1k
          tt_256[2] = tt_256[3];
2450
31.1k
          dst += 2 * dst_stride;
2451
31.1k
          y -= 2;
2452
31.1k
        } while (y);
2453
4.05k
      } else {
2454
2.18k
        assert(!(w % 32));
2455
2456
2.19k
        __m256i s_256[4], ss_256[4], tt_256[4], r[4];
2457
2.19k
        x = 0;
2458
5.54k
        do {
2459
5.54k
          const uint8_t *s = src_ptr + x;
2460
5.54k
          uint8_t *d = dst + x;
2461
5.54k
          s_256[0] = _mm256_loadu_si256((__m256i *)(s + 0 * src_stride));
2462
5.54k
          s_256[1] = _mm256_loadu_si256((__m256i *)(s + 1 * src_stride));
2463
5.54k
          s_256[2] = _mm256_loadu_si256((__m256i *)(s + 2 * src_stride));
2464
2465
5.54k
          ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
2466
5.54k
          ss_256[2] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
2467
2468
5.54k
          tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
2469
5.54k
          tt_256[2] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
2470
2471
5.54k
          y = h;
2472
243k
          do {
2473
243k
            s += 2 * src_stride;
2474
243k
            y_convolve_4tap_32x2_avx2(s, src_stride, coeffs_256, s_256, ss_256,
2475
243k
                                      tt_256, r);
2476
243k
            sr_y_round_store_32x2_avx2(r, d, dst_stride);
2477
2478
243k
            ss_256[0] = ss_256[1];
2479
243k
            ss_256[2] = ss_256[3];
2480
2481
243k
            tt_256[0] = tt_256[1];
2482
243k
            tt_256[2] = tt_256[3];
2483
243k
            d += 2 * dst_stride;
2484
243k
            y -= 2;
2485
243k
          } while (y);
2486
5.54k
          x += 32;
2487
5.54k
        } while (x < w);
2488
2.19k
      }
2489
201k
    }
2490
386k
  } else if (vert_tap == 6) {
2491
    // vert_filt as 6 tap
2492
338k
    const uint8_t *src_ptr = src - 2 * src_stride;
2493
2494
338k
    if (w <= 4) {
2495
111k
      prepare_half_coeffs_6tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);
2496
2497
111k
      y = h;
2498
2499
111k
      if (w == 2) {
2500
23.2k
        __m128i s_16[6], ss_128[3];
2501
2502
23.2k
        s_16[0] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 0 * src_stride));
2503
23.2k
        s_16[1] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 1 * src_stride));
2504
23.2k
        s_16[2] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 2 * src_stride));
2505
23.2k
        s_16[3] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 3 * src_stride));
2506
23.2k
        s_16[4] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 4 * src_stride));
2507
2508
23.2k
        const __m128i src01 = _mm_unpacklo_epi16(s_16[0], s_16[1]);
2509
23.2k
        const __m128i src12 = _mm_unpacklo_epi16(s_16[1], s_16[2]);
2510
23.2k
        const __m128i src23 = _mm_unpacklo_epi16(s_16[2], s_16[3]);
2511
23.2k
        const __m128i src34 = _mm_unpacklo_epi16(s_16[3], s_16[4]);
2512
2513
23.2k
        ss_128[0] = _mm_unpacklo_epi8(src01, src12);
2514
23.2k
        ss_128[1] = _mm_unpacklo_epi8(src23, src34);
2515
2516
93.0k
        do {
2517
93.0k
          src_ptr += 2 * src_stride;
2518
93.0k
          const __m128i res = y_convolve_6tap_2x2_ssse3(
2519
93.0k
              src_ptr, src_stride, coeffs_128, s_16, ss_128);
2520
93.0k
          const __m128i r = sr_y_round_sse2(res);
2521
93.0k
          pack_store_2x2_sse2(r, dst, dst_stride);
2522
2523
93.0k
          ss_128[0] = ss_128[1];
2524
93.0k
          ss_128[1] = ss_128[2];
2525
93.0k
          dst += 2 * dst_stride;
2526
93.0k
          y -= 2;
2527
93.0k
        } while (y);
2528
88.3k
      } else {
2529
88.3k
        __m128i s_32[6], ss_128[3];
2530
2531
88.3k
        assert(w == 4);
2532
2533
88.3k
        s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 0 * src_stride));
2534
88.3k
        s_32[1] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 1 * src_stride));
2535
88.3k
        s_32[2] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 2 * src_stride));
2536
88.3k
        s_32[3] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 3 * src_stride));
2537
88.3k
        s_32[4] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 4 * src_stride));
2538
2539
88.3k
        const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
2540
88.3k
        const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
2541
88.3k
        const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
2542
88.3k
        const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[4]);
2543
2544
88.3k
        ss_128[0] = _mm_unpacklo_epi8(src01, src12);
2545
88.3k
        ss_128[1] = _mm_unpacklo_epi8(src23, src34);
2546
2547
485k
        do {
2548
485k
          src_ptr += 2 * src_stride;
2549
485k
          const __m128i res = y_convolve_6tap_4x2_ssse3(
2550
485k
              src_ptr, src_stride, coeffs_128, s_32, ss_128);
2551
485k
          const __m128i r = sr_y_round_sse2(res);
2552
485k
          pack_store_4x2_sse2(r, dst, dst_stride);
2553
2554
485k
          ss_128[0] = ss_128[1];
2555
485k
          ss_128[1] = ss_128[2];
2556
485k
          dst += 2 * dst_stride;
2557
485k
          y -= 2;
2558
485k
        } while (y);
2559
88.3k
      }
2560
227k
    } else {
2561
227k
      prepare_half_coeffs_6tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
2562
2563
227k
      if (w == 8) {
2564
110k
        __m128i s_64[6];
2565
110k
        __m256i ss_256[3];
2566
2567
110k
        s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 0 * src_stride));
2568
110k
        s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + 1 * src_stride));
2569
110k
        s_64[2] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
2570
110k
        s_64[3] = _mm_loadl_epi64((__m128i *)(src_ptr + 3 * src_stride));
2571
110k
        s_64[4] = _mm_loadl_epi64((__m128i *)(src_ptr + 4 * src_stride));
2572
2573
        // Load lines a and b. Line a to lower 128, line b to upper 128
2574
110k
        const __m256i src01 = _mm256_setr_m128i(s_64[0], s_64[1]);
2575
110k
        const __m256i src12 = _mm256_setr_m128i(s_64[1], s_64[2]);
2576
110k
        const __m256i src23 = _mm256_setr_m128i(s_64[2], s_64[3]);
2577
110k
        const __m256i src34 = _mm256_setr_m128i(s_64[3], s_64[4]);
2578
2579
110k
        ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
2580
110k
        ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
2581
2582
110k
        y = h;
2583
612k
        do {
2584
612k
          src_ptr += 2 * src_stride;
2585
612k
          const __m256i res = y_convolve_6tap_8x2_avx2(
2586
612k
              src_ptr, src_stride, coeffs_256, s_64, ss_256);
2587
612k
          sr_y_round_store_8x2_avx2(res, dst, dst_stride);
2588
2589
612k
          ss_256[0] = ss_256[1];
2590
612k
          ss_256[1] = ss_256[2];
2591
612k
          dst += 2 * dst_stride;
2592
612k
          y -= 2;
2593
612k
        } while (y);
2594
116k
      } else if (w == 16) {
2595
82.4k
        __m128i s_128[6];
2596
82.4k
        __m256i ss_256[6], r[2];
2597
2598
82.4k
        s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 0 * src_stride));
2599
82.4k
        s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + 1 * src_stride));
2600
82.4k
        s_128[2] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
2601
82.4k
        s_128[3] = _mm_loadu_si128((__m128i *)(src_ptr + 3 * src_stride));
2602
82.4k
        s_128[4] = _mm_loadu_si128((__m128i *)(src_ptr + 4 * src_stride));
2603
2604
        // Load lines a and b. Line a to lower 128, line b to upper 128
2605
82.4k
        const __m256i src01 = _mm256_setr_m128i(s_128[0], s_128[1]);
2606
82.4k
        const __m256i src12 = _mm256_setr_m128i(s_128[1], s_128[2]);
2607
82.4k
        const __m256i src23 = _mm256_setr_m128i(s_128[2], s_128[3]);
2608
82.4k
        const __m256i src34 = _mm256_setr_m128i(s_128[3], s_128[4]);
2609
2610
82.4k
        ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
2611
82.4k
        ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
2612
2613
82.4k
        ss_256[3] = _mm256_unpackhi_epi8(src01, src12);
2614
82.4k
        ss_256[4] = _mm256_unpackhi_epi8(src23, src34);
2615
2616
82.4k
        y = h;
2617
578k
        do {
2618
578k
          src_ptr += 2 * src_stride;
2619
578k
          y_convolve_6tap_16x2_avx2(src_ptr, src_stride, coeffs_256, s_128,
2620
578k
                                    ss_256, r);
2621
578k
          sr_y_round_store_16x2_avx2(r, dst, dst_stride);
2622
2623
578k
          ss_256[0] = ss_256[1];
2624
578k
          ss_256[1] = ss_256[2];
2625
2626
578k
          ss_256[3] = ss_256[4];
2627
578k
          ss_256[4] = ss_256[5];
2628
578k
          dst += 2 * dst_stride;
2629
578k
          y -= 2;
2630
578k
        } while (y);
2631
82.4k
      } else {
2632
34.1k
        __m256i s_256[6], ss_256[6], tt_256[6], r[4];
2633
2634
34.1k
        assert(!(w % 32));
2635
2636
34.2k
        x = 0;
2637
40.4k
        do {
2638
40.4k
          const uint8_t *s = src_ptr + x;
2639
40.4k
          uint8_t *d = dst + x;
2640
2641
40.4k
          s_256[0] = _mm256_loadu_si256((__m256i *)(s + 0 * src_stride));
2642
40.4k
          s_256[1] = _mm256_loadu_si256((__m256i *)(s + 1 * src_stride));
2643
40.4k
          s_256[2] = _mm256_loadu_si256((__m256i *)(s + 2 * src_stride));
2644
40.4k
          s_256[3] = _mm256_loadu_si256((__m256i *)(s + 3 * src_stride));
2645
40.4k
          s_256[4] = _mm256_loadu_si256((__m256i *)(s + 4 * src_stride));
2646
2647
40.4k
          ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
2648
40.4k
          ss_256[1] = _mm256_unpacklo_epi8(s_256[2], s_256[3]);
2649
40.4k
          ss_256[3] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
2650
40.4k
          ss_256[4] = _mm256_unpackhi_epi8(s_256[2], s_256[3]);
2651
2652
40.4k
          tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
2653
40.4k
          tt_256[1] = _mm256_unpacklo_epi8(s_256[3], s_256[4]);
2654
40.4k
          tt_256[3] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
2655
40.4k
          tt_256[4] = _mm256_unpackhi_epi8(s_256[3], s_256[4]);
2656
2657
40.4k
          y = h;
2658
650k
          do {
2659
650k
            s += 2 * src_stride;
2660
650k
            y_convolve_6tap_32x2_avx2(s, src_stride, coeffs_256, s_256, ss_256,
2661
650k
                                      tt_256, r);
2662
650k
            sr_y_round_store_32x2_avx2(r, d, dst_stride);
2663
2664
650k
            ss_256[0] = ss_256[1];
2665
650k
            ss_256[1] = ss_256[2];
2666
650k
            ss_256[3] = ss_256[4];
2667
650k
            ss_256[4] = ss_256[5];
2668
2669
650k
            tt_256[0] = tt_256[1];
2670
650k
            tt_256[1] = tt_256[2];
2671
650k
            tt_256[3] = tt_256[4];
2672
650k
            tt_256[4] = tt_256[5];
2673
650k
            d += 2 * dst_stride;
2674
650k
            y -= 2;
2675
650k
          } while (y);
2676
2677
40.4k
          x += 32;
2678
40.4k
        } while (x < w);
2679
34.2k
      }
2680
227k
    }
2681
338k
  } else if (vert_tap == 8) {
2682
    // vert_filt as 8 tap
2683
16.5k
    const uint8_t *src_ptr = src - 3 * src_stride;
2684
2685
16.5k
    if (w <= 4) {
2686
6.88k
      prepare_half_coeffs_8tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);
2687
2688
6.88k
      y = h;
2689
2690
6.88k
      if (w == 2) {
2691
1.80k
        __m128i s_16[8], ss_128[4];
2692
2693
1.80k
        s_16[0] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 0 * src_stride));
2694
1.80k
        s_16[1] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 1 * src_stride));
2695
1.80k
        s_16[2] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 2 * src_stride));
2696
1.80k
        s_16[3] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 3 * src_stride));
2697
1.80k
        s_16[4] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 4 * src_stride));
2698
1.80k
        s_16[5] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 5 * src_stride));
2699
1.80k
        s_16[6] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 6 * src_stride));
2700
2701
1.80k
        const __m128i src01 = _mm_unpacklo_epi16(s_16[0], s_16[1]);
2702
1.80k
        const __m128i src12 = _mm_unpacklo_epi16(s_16[1], s_16[2]);
2703
1.80k
        const __m128i src23 = _mm_unpacklo_epi16(s_16[2], s_16[3]);
2704
1.80k
        const __m128i src34 = _mm_unpacklo_epi16(s_16[3], s_16[4]);
2705
1.80k
        const __m128i src45 = _mm_unpacklo_epi16(s_16[4], s_16[5]);
2706
1.80k
        const __m128i src56 = _mm_unpacklo_epi16(s_16[5], s_16[6]);
2707
2708
1.80k
        ss_128[0] = _mm_unpacklo_epi8(src01, src12);
2709
1.80k
        ss_128[1] = _mm_unpacklo_epi8(src23, src34);
2710
1.80k
        ss_128[2] = _mm_unpacklo_epi8(src45, src56);
2711
2712
7.22k
        do {
2713
7.22k
          const __m128i res = y_convolve_8tap_2x2_ssse3(
2714
7.22k
              src_ptr, src_stride, coeffs_128, s_16, ss_128);
2715
7.22k
          const __m128i r = sr_y_round_sse2(res);
2716
7.22k
          pack_store_2x2_sse2(r, dst, dst_stride);
2717
7.22k
          ss_128[0] = ss_128[1];
2718
7.22k
          ss_128[1] = ss_128[2];
2719
7.22k
          ss_128[2] = ss_128[3];
2720
7.22k
          src_ptr += 2 * src_stride;
2721
7.22k
          dst += 2 * dst_stride;
2722
7.22k
          y -= 2;
2723
7.22k
        } while (y);
2724
5.08k
      } else {
2725
5.08k
        __m128i s_32[8], ss_128[4];
2726
2727
5.08k
        assert(w == 4);
2728
2729
5.08k
        s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 0 * src_stride));
2730
5.08k
        s_32[1] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 1 * src_stride));
2731
5.08k
        s_32[2] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 2 * src_stride));
2732
5.08k
        s_32[3] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 3 * src_stride));
2733
5.08k
        s_32[4] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 4 * src_stride));
2734
5.08k
        s_32[5] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 5 * src_stride));
2735
5.08k
        s_32[6] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 6 * src_stride));
2736
2737
5.08k
        const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
2738
5.08k
        const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
2739
5.08k
        const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
2740
5.08k
        const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[4]);
2741
5.08k
        const __m128i src45 = _mm_unpacklo_epi32(s_32[4], s_32[5]);
2742
5.08k
        const __m128i src56 = _mm_unpacklo_epi32(s_32[5], s_32[6]);
2743
2744
5.08k
        ss_128[0] = _mm_unpacklo_epi8(src01, src12);
2745
5.08k
        ss_128[1] = _mm_unpacklo_epi8(src23, src34);
2746
5.08k
        ss_128[2] = _mm_unpacklo_epi8(src45, src56);
2747
2748
27.8k
        do {
2749
27.8k
          const __m128i res = y_convolve_8tap_4x2_ssse3(
2750
27.8k
              src_ptr, src_stride, coeffs_128, s_32, ss_128);
2751
27.8k
          const __m128i r = sr_y_round_sse2(res);
2752
27.8k
          pack_store_4x2_sse2(r, dst, dst_stride);
2753
27.8k
          ss_128[0] = ss_128[1];
2754
27.8k
          ss_128[1] = ss_128[2];
2755
27.8k
          ss_128[2] = ss_128[3];
2756
27.8k
          src_ptr += 2 * src_stride;
2757
27.8k
          dst += 2 * dst_stride;
2758
27.8k
          y -= 2;
2759
27.8k
        } while (y);
2760
5.08k
      }
2761
9.67k
    } else {
2762
9.67k
      prepare_half_coeffs_8tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
2763
2764
9.67k
      if (w == 8) {
2765
4.67k
        __m128i s_64[8];
2766
4.67k
        __m256i ss_256[4];
2767
2768
4.67k
        s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 0 * src_stride));
2769
4.67k
        s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + 1 * src_stride));
2770
4.67k
        s_64[2] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
2771
4.67k
        s_64[3] = _mm_loadl_epi64((__m128i *)(src_ptr + 3 * src_stride));
2772
4.67k
        s_64[4] = _mm_loadl_epi64((__m128i *)(src_ptr + 4 * src_stride));
2773
4.67k
        s_64[5] = _mm_loadl_epi64((__m128i *)(src_ptr + 5 * src_stride));
2774
4.67k
        s_64[6] = _mm_loadl_epi64((__m128i *)(src_ptr + 6 * src_stride));
2775
2776
        // Load lines a and b. Line a to lower 128, line b to upper 128
2777
4.67k
        const __m256i src01 = _mm256_setr_m128i(s_64[0], s_64[1]);
2778
4.67k
        const __m256i src12 = _mm256_setr_m128i(s_64[1], s_64[2]);
2779
4.67k
        const __m256i src23 = _mm256_setr_m128i(s_64[2], s_64[3]);
2780
4.67k
        const __m256i src34 = _mm256_setr_m128i(s_64[3], s_64[4]);
2781
4.67k
        const __m256i src45 = _mm256_setr_m128i(s_64[4], s_64[5]);
2782
4.67k
        const __m256i src56 = _mm256_setr_m128i(s_64[5], s_64[6]);
2783
2784
4.67k
        ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
2785
4.67k
        ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
2786
4.67k
        ss_256[2] = _mm256_unpacklo_epi8(src45, src56);
2787
2788
4.67k
        y = h;
2789
27.4k
        do {
2790
27.4k
          const __m256i res = y_convolve_8tap_8x2_avx2(
2791
27.4k
              src_ptr, src_stride, coeffs_256, s_64, ss_256);
2792
27.4k
          sr_y_round_store_8x2_avx2(res, dst, dst_stride);
2793
27.4k
          ss_256[0] = ss_256[1];
2794
27.4k
          ss_256[1] = ss_256[2];
2795
27.4k
          ss_256[2] = ss_256[3];
2796
27.4k
          src_ptr += 2 * src_stride;
2797
27.4k
          dst += 2 * dst_stride;
2798
27.4k
          y -= 2;
2799
27.4k
        } while (y);
2800
4.99k
      } else if (w == 16) {
2801
3.19k
        __m128i s_128[8];
2802
3.19k
        __m256i ss_256[8], r[2];
2803
2804
3.19k
        s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 0 * src_stride));
2805
3.19k
        s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + 1 * src_stride));
2806
3.19k
        s_128[2] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
2807
3.19k
        s_128[3] = _mm_loadu_si128((__m128i *)(src_ptr + 3 * src_stride));
2808
3.19k
        s_128[4] = _mm_loadu_si128((__m128i *)(src_ptr + 4 * src_stride));
2809
3.19k
        s_128[5] = _mm_loadu_si128((__m128i *)(src_ptr + 5 * src_stride));
2810
3.19k
        s_128[6] = _mm_loadu_si128((__m128i *)(src_ptr + 6 * src_stride));
2811
2812
        // Load lines a and b. Line a to lower 128, line b to upper 128
2813
3.19k
        const __m256i src01 = _mm256_setr_m128i(s_128[0], s_128[1]);
2814
3.19k
        const __m256i src12 = _mm256_setr_m128i(s_128[1], s_128[2]);
2815
3.19k
        const __m256i src23 = _mm256_setr_m128i(s_128[2], s_128[3]);
2816
3.19k
        const __m256i src34 = _mm256_setr_m128i(s_128[3], s_128[4]);
2817
3.19k
        const __m256i src45 = _mm256_setr_m128i(s_128[4], s_128[5]);
2818
3.19k
        const __m256i src56 = _mm256_setr_m128i(s_128[5], s_128[6]);
2819
2820
3.19k
        ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
2821
3.19k
        ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
2822
3.19k
        ss_256[2] = _mm256_unpacklo_epi8(src45, src56);
2823
2824
3.19k
        ss_256[4] = _mm256_unpackhi_epi8(src01, src12);
2825
3.19k
        ss_256[5] = _mm256_unpackhi_epi8(src23, src34);
2826
3.19k
        ss_256[6] = _mm256_unpackhi_epi8(src45, src56);
2827
2828
3.19k
        y = h;
2829
23.0k
        do {
2830
23.0k
          y_convolve_8tap_16x2_avx2(src_ptr, src_stride, coeffs_256, s_128,
2831
23.0k
                                    ss_256, r);
2832
23.0k
          sr_y_round_store_16x2_avx2(r, dst, dst_stride);
2833
2834
23.0k
          ss_256[0] = ss_256[1];
2835
23.0k
          ss_256[1] = ss_256[2];
2836
23.0k
          ss_256[2] = ss_256[3];
2837
2838
23.0k
          ss_256[4] = ss_256[5];
2839
23.0k
          ss_256[5] = ss_256[6];
2840
23.0k
          ss_256[6] = ss_256[7];
2841
23.0k
          src_ptr += 2 * src_stride;
2842
23.0k
          dst += 2 * dst_stride;
2843
23.0k
          y -= 2;
2844
23.0k
        } while (y);
2845
3.19k
      } else {
2846
1.79k
        __m256i s_256[8], ss_256[8], tt_256[8], r[4];
2847
2848
1.79k
        assert(!(w % 32));
2849
2850
1.79k
        x = 0;
2851
2.57k
        do {
2852
2.57k
          const uint8_t *s = src_ptr + x;
2853
2.57k
          uint8_t *d = dst + x;
2854
2855
2.57k
          s_256[0] = _mm256_loadu_si256((__m256i *)(s + 0 * src_stride));
2856
2.57k
          s_256[1] = _mm256_loadu_si256((__m256i *)(s + 1 * src_stride));
2857
2.57k
          s_256[2] = _mm256_loadu_si256((__m256i *)(s + 2 * src_stride));
2858
2.57k
          s_256[3] = _mm256_loadu_si256((__m256i *)(s + 3 * src_stride));
2859
2.57k
          s_256[4] = _mm256_loadu_si256((__m256i *)(s + 4 * src_stride));
2860
2.57k
          s_256[5] = _mm256_loadu_si256((__m256i *)(s + 5 * src_stride));
2861
2.57k
          s_256[6] = _mm256_loadu_si256((__m256i *)(s + 6 * src_stride));
2862
2863
2.57k
          ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
2864
2.57k
          ss_256[1] = _mm256_unpacklo_epi8(s_256[2], s_256[3]);
2865
2.57k
          ss_256[2] = _mm256_unpacklo_epi8(s_256[4], s_256[5]);
2866
2.57k
          ss_256[4] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
2867
2.57k
          ss_256[5] = _mm256_unpackhi_epi8(s_256[2], s_256[3]);
2868
2.57k
          ss_256[6] = _mm256_unpackhi_epi8(s_256[4], s_256[5]);
2869
2870
2.57k
          tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
2871
2.57k
          tt_256[1] = _mm256_unpacklo_epi8(s_256[3], s_256[4]);
2872
2.57k
          tt_256[2] = _mm256_unpacklo_epi8(s_256[5], s_256[6]);
2873
2.57k
          tt_256[4] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
2874
2.57k
          tt_256[5] = _mm256_unpackhi_epi8(s_256[3], s_256[4]);
2875
2.57k
          tt_256[6] = _mm256_unpackhi_epi8(s_256[5], s_256[6]);
2876
2877
2.57k
          y = h;
2878
57.2k
          do {
2879
57.2k
            y_convolve_8tap_32x2_avx2(s, src_stride, coeffs_256, s_256, ss_256,
2880
57.2k
                                      tt_256, r);
2881
57.2k
            sr_y_round_store_32x2_avx2(r, d, dst_stride);
2882
2883
57.2k
            ss_256[0] = ss_256[1];
2884
57.2k
            ss_256[1] = ss_256[2];
2885
57.2k
            ss_256[2] = ss_256[3];
2886
57.2k
            ss_256[4] = ss_256[5];
2887
57.2k
            ss_256[5] = ss_256[6];
2888
57.2k
            ss_256[6] = ss_256[7];
2889
2890
57.2k
            tt_256[0] = tt_256[1];
2891
57.2k
            tt_256[1] = tt_256[2];
2892
57.2k
            tt_256[2] = tt_256[3];
2893
57.2k
            tt_256[4] = tt_256[5];
2894
57.2k
            tt_256[5] = tt_256[6];
2895
57.2k
            tt_256[6] = tt_256[7];
2896
57.2k
            s += 2 * src_stride;
2897
57.2k
            d += 2 * dst_stride;
2898
57.2k
            y -= 2;
2899
57.2k
          } while (y);
2900
2901
2.57k
          x += 32;
2902
2.57k
        } while (x < w);
2903
1.79k
      }
2904
9.67k
    }
2905
16.5k
  }
2906
795k
}
Unexecuted instantiation: convolve_2d_avx2.c:av1_convolve_y_sr_specialized_avx2
convolve_avx2.c:av1_convolve_y_sr_specialized_avx2
Line
Count
Source
2008
795k
    const int32_t subpel_y_q4) {
2009
795k
  int32_t x, y;
2010
795k
  __m128i coeffs_128[4];
2011
795k
  __m256i coeffs_256[4];
2012
2013
795k
  int vert_tap = get_filter_tap(filter_params_y, subpel_y_q4);
2014
2015
795k
  if (vert_tap == 2) {
2016
    // vert_filt as 2 tap
2017
53.7k
    const uint8_t *src_ptr = src;
2018
2019
53.7k
    y = h;
2020
2021
53.7k
    if (subpel_y_q4 != 8) {
2022
31.1k
      if (w <= 8) {
2023
24.5k
        prepare_half_coeffs_2tap_ssse3(filter_params_y, subpel_y_q4,
2024
24.5k
                                       coeffs_128);
2025
2026
24.5k
        if (w == 2) {
2027
4.22k
          __m128i s_16[2];
2028
2029
4.22k
          s_16[0] = _mm_cvtsi32_si128(*(int16_t *)src_ptr);
2030
2031
7.14k
          do {
2032
7.14k
            const __m128i res = y_convolve_2tap_2x2_ssse3(src_ptr, src_stride,
2033
7.14k
                                                          coeffs_128, s_16);
2034
7.14k
            const __m128i r = sr_y_round_sse2(res);
2035
7.14k
            pack_store_2x2_sse2(r, dst, dst_stride);
2036
7.14k
            src_ptr += 2 * src_stride;
2037
7.14k
            dst += 2 * dst_stride;
2038
7.14k
            y -= 2;
2039
7.14k
          } while (y);
2040
20.2k
        } else if (w == 4) {
2041
11.2k
          __m128i s_32[2];
2042
2043
11.2k
          s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr));
2044
2045
37.8k
          do {
2046
37.8k
            const __m128i res = y_convolve_2tap_4x2_ssse3(src_ptr, src_stride,
2047
37.8k
                                                          coeffs_128, s_32);
2048
37.8k
            const __m128i r = sr_y_round_sse2(res);
2049
37.8k
            pack_store_4x2_sse2(r, dst, dst_stride);
2050
37.8k
            src_ptr += 2 * src_stride;
2051
37.8k
            dst += 2 * dst_stride;
2052
37.8k
            y -= 2;
2053
37.8k
          } while (y);
2054
11.2k
        } else {
2055
9.01k
          __m128i s_64[2], s_128[2];
2056
2057
9.01k
          assert(w == 8);
2058
2059
9.01k
          s_64[0] = _mm_loadl_epi64((__m128i *)src_ptr);
2060
2061
43.3k
          do {
2062
            // Note: Faster than binding to AVX2 registers.
2063
43.3k
            s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + src_stride));
2064
43.3k
            s_128[0] = _mm_unpacklo_epi64(s_64[0], s_64[1]);
2065
43.3k
            s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
2066
43.3k
            s_128[1] = _mm_unpacklo_epi64(s_64[1], s_64[0]);
2067
43.3k
            const __m128i ss0 = _mm_unpacklo_epi8(s_128[0], s_128[1]);
2068
43.3k
            const __m128i ss1 = _mm_unpackhi_epi8(s_128[0], s_128[1]);
2069
43.3k
            const __m128i res0 = convolve_2tap_ssse3(&ss0, coeffs_128);
2070
43.3k
            const __m128i res1 = convolve_2tap_ssse3(&ss1, coeffs_128);
2071
43.3k
            const __m128i r0 = sr_y_round_sse2(res0);
2072
43.3k
            const __m128i r1 = sr_y_round_sse2(res1);
2073
43.3k
            const __m128i d = _mm_packus_epi16(r0, r1);
2074
43.3k
            _mm_storel_epi64((__m128i *)dst, d);
2075
43.3k
            _mm_storeh_epi64((__m128i *)(dst + dst_stride), d);
2076
43.3k
            src_ptr += 2 * src_stride;
2077
43.3k
            dst += 2 * dst_stride;
2078
43.3k
            y -= 2;
2079
43.3k
          } while (y);
2080
9.01k
        }
2081
24.5k
      } else {
2082
6.60k
        prepare_half_coeffs_2tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
2083
2084
6.60k
        if (w == 16) {
2085
4.35k
          __m128i s_128[2];
2086
2087
4.35k
          s_128[0] = _mm_loadu_si128((__m128i *)src_ptr);
2088
2089
33.2k
          do {
2090
33.2k
            __m256i r[2];
2091
2092
33.2k
            y_convolve_2tap_16x2_avx2(src_ptr, src_stride, coeffs_256, s_128,
2093
33.2k
                                      r);
2094
33.2k
            sr_y_round_store_16x2_avx2(r, dst, dst_stride);
2095
33.2k
            src_ptr += 2 * src_stride;
2096
33.2k
            dst += 2 * dst_stride;
2097
33.2k
            y -= 2;
2098
33.2k
          } while (y);
2099
4.35k
        } else if (w == 32) {
2100
1.22k
          __m256i s_256[2];
2101
2102
1.22k
          s_256[0] = _mm256_loadu_si256((__m256i *)src_ptr);
2103
2104
15.1k
          do {
2105
15.1k
            sr_y_2tap_32_avx2(src_ptr + src_stride, coeffs_256, s_256[0],
2106
15.1k
                              &s_256[1], dst);
2107
15.1k
            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride, coeffs_256, s_256[1],
2108
15.1k
                              &s_256[0], dst + dst_stride);
2109
15.1k
            src_ptr += 2 * src_stride;
2110
15.1k
            dst += 2 * dst_stride;
2111
15.1k
            y -= 2;
2112
15.1k
          } while (y);
2113
1.22k
        } else if (w == 64) {
2114
828
          __m256i s_256[2][2];
2115
2116
828
          s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
2117
828
          s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));
2118
2119
19.7k
          do {
2120
19.7k
            sr_y_2tap_32_avx2(src_ptr + src_stride, coeffs_256, s_256[0][0],
2121
19.7k
                              &s_256[1][0], dst);
2122
19.7k
            sr_y_2tap_32_avx2(src_ptr + src_stride + 32, coeffs_256,
2123
19.7k
                              s_256[0][1], &s_256[1][1], dst + 32);
2124
19.7k
            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride, coeffs_256, s_256[1][0],
2125
19.7k
                              &s_256[0][0], dst + dst_stride);
2126
19.7k
            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride + 32, coeffs_256,
2127
19.7k
                              s_256[1][1], &s_256[0][1], dst + dst_stride + 32);
2128
2129
19.7k
            src_ptr += 2 * src_stride;
2130
19.7k
            dst += 2 * dst_stride;
2131
19.7k
            y -= 2;
2132
19.7k
          } while (y);
2133
828
        } else {
2134
196
          __m256i s_256[2][4];
2135
2136
196
          assert(w == 128);
2137
2138
196
          s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
2139
196
          s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));
2140
196
          s_256[0][2] = _mm256_loadu_si256((__m256i *)(src_ptr + 2 * 32));
2141
196
          s_256[0][3] = _mm256_loadu_si256((__m256i *)(src_ptr + 3 * 32));
2142
2143
9.82k
          do {
2144
9.82k
            sr_y_2tap_32_avx2(src_ptr + src_stride, coeffs_256, s_256[0][0],
2145
9.82k
                              &s_256[1][0], dst);
2146
9.82k
            sr_y_2tap_32_avx2(src_ptr + src_stride + 1 * 32, coeffs_256,
2147
9.82k
                              s_256[0][1], &s_256[1][1], dst + 1 * 32);
2148
9.82k
            sr_y_2tap_32_avx2(src_ptr + src_stride + 2 * 32, coeffs_256,
2149
9.82k
                              s_256[0][2], &s_256[1][2], dst + 2 * 32);
2150
9.82k
            sr_y_2tap_32_avx2(src_ptr + src_stride + 3 * 32, coeffs_256,
2151
9.82k
                              s_256[0][3], &s_256[1][3], dst + 3 * 32);
2152
2153
9.82k
            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride, coeffs_256, s_256[1][0],
2154
9.82k
                              &s_256[0][0], dst + dst_stride);
2155
9.82k
            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride + 1 * 32, coeffs_256,
2156
9.82k
                              s_256[1][1], &s_256[0][1],
2157
9.82k
                              dst + dst_stride + 1 * 32);
2158
9.82k
            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride + 2 * 32, coeffs_256,
2159
9.82k
                              s_256[1][2], &s_256[0][2],
2160
9.82k
                              dst + dst_stride + 2 * 32);
2161
9.82k
            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride + 3 * 32, coeffs_256,
2162
9.82k
                              s_256[1][3], &s_256[0][3],
2163
9.82k
                              dst + dst_stride + 3 * 32);
2164
2165
9.82k
            src_ptr += 2 * src_stride;
2166
9.82k
            dst += 2 * dst_stride;
2167
9.82k
            y -= 2;
2168
9.82k
          } while (y);
2169
196
        }
2170
6.60k
      }
2171
31.1k
    } else {
2172
      // average to get half pel
2173
22.5k
      if (w <= 8) {
2174
19.4k
        if (w == 2) {
2175
4.06k
          __m128i s_16[2];
2176
2177
4.06k
          s_16[0] = _mm_cvtsi32_si128(*(int16_t *)src_ptr);
2178
2179
7.84k
          do {
2180
7.84k
            s_16[1] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + src_stride));
2181
7.84k
            const __m128i d0 = _mm_avg_epu8(s_16[0], s_16[1]);
2182
7.84k
            *(int16_t *)dst = (int16_t)_mm_cvtsi128_si32(d0);
2183
7.84k
            s_16[0] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 2 * src_stride));
2184
7.84k
            const __m128i d1 = _mm_avg_epu8(s_16[1], s_16[0]);
2185
7.84k
            *(int16_t *)(dst + dst_stride) = (int16_t)_mm_cvtsi128_si32(d1);
2186
7.84k
            src_ptr += 2 * src_stride;
2187
7.84k
            dst += 2 * dst_stride;
2188
7.84k
            y -= 2;
2189
7.84k
          } while (y);
2190
15.3k
        } else if (w == 4) {
2191
9.17k
          __m128i s_32[2];
2192
2193
9.17k
          s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr));
2194
2195
25.7k
          do {
2196
25.7k
            s_32[1] = _mm_cvtsi32_si128(loadu_int32(src_ptr + src_stride));
2197
25.7k
            const __m128i d0 = _mm_avg_epu8(s_32[0], s_32[1]);
2198
25.7k
            xx_storel_32(dst, d0);
2199
25.7k
            s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 2 * src_stride));
2200
25.7k
            const __m128i d1 = _mm_avg_epu8(s_32[1], s_32[0]);
2201
25.7k
            xx_storel_32(dst + dst_stride, d1);
2202
25.7k
            src_ptr += 2 * src_stride;
2203
25.7k
            dst += 2 * dst_stride;
2204
25.7k
            y -= 2;
2205
25.7k
          } while (y);
2206
9.17k
        } else {
2207
6.21k
          __m128i s_64[2];
2208
2209
6.21k
          assert(w == 8);
2210
2211
6.21k
          s_64[0] = _mm_loadl_epi64((__m128i *)src_ptr);
2212
2213
18.9k
          do {
2214
            // Note: Faster than binding to AVX2 registers.
2215
18.9k
            s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + src_stride));
2216
18.9k
            const __m128i d0 = _mm_avg_epu8(s_64[0], s_64[1]);
2217
18.9k
            _mm_storel_epi64((__m128i *)dst, d0);
2218
18.9k
            s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
2219
18.9k
            const __m128i d1 = _mm_avg_epu8(s_64[1], s_64[0]);
2220
18.9k
            _mm_storel_epi64((__m128i *)(dst + dst_stride), d1);
2221
18.9k
            src_ptr += 2 * src_stride;
2222
18.9k
            dst += 2 * dst_stride;
2223
18.9k
            y -= 2;
2224
18.9k
          } while (y);
2225
6.21k
        }
2226
19.4k
      } else if (w == 16) {
2227
2.08k
        __m128i s_128[2];
2228
2229
2.08k
        s_128[0] = _mm_loadu_si128((__m128i *)src_ptr);
2230
2231
11.6k
        do {
2232
11.6k
          s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + src_stride));
2233
11.6k
          const __m128i d0 = _mm_avg_epu8(s_128[0], s_128[1]);
2234
11.6k
          _mm_storeu_si128((__m128i *)dst, d0);
2235
11.6k
          s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
2236
11.6k
          const __m128i d1 = _mm_avg_epu8(s_128[1], s_128[0]);
2237
11.6k
          _mm_storeu_si128((__m128i *)(dst + dst_stride), d1);
2238
11.6k
          src_ptr += 2 * src_stride;
2239
11.6k
          dst += 2 * dst_stride;
2240
11.6k
          y -= 2;
2241
11.6k
        } while (y);
2242
2.08k
      } else if (w == 32) {
2243
717
        __m256i s_256[2];
2244
2245
717
        s_256[0] = _mm256_loadu_si256((__m256i *)src_ptr);
2246
2247
7.99k
        do {
2248
7.99k
          sr_y_2tap_32_avg_avx2(src_ptr + src_stride, s_256[0], &s_256[1], dst);
2249
7.99k
          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride, s_256[1], &s_256[0],
2250
7.99k
                                dst + dst_stride);
2251
7.99k
          src_ptr += 2 * src_stride;
2252
7.99k
          dst += 2 * dst_stride;
2253
7.99k
          y -= 2;
2254
7.99k
        } while (y);
2255
717
      } else if (w == 64) {
2256
156
        __m256i s_256[2][2];
2257
2258
156
        s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
2259
156
        s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));
2260
2261
3.26k
        do {
2262
3.26k
          sr_y_2tap_32_avg_avx2(src_ptr + src_stride, s_256[0][0], &s_256[1][0],
2263
3.26k
                                dst);
2264
3.26k
          sr_y_2tap_32_avg_avx2(src_ptr + src_stride + 32, s_256[0][1],
2265
3.26k
                                &s_256[1][1], dst + 32);
2266
2267
3.26k
          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride, s_256[1][0],
2268
3.26k
                                &s_256[0][0], dst + dst_stride);
2269
3.26k
          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride + 32, s_256[1][1],
2270
3.26k
                                &s_256[0][1], dst + dst_stride + 32);
2271
2272
3.26k
          src_ptr += 2 * src_stride;
2273
3.26k
          dst += 2 * dst_stride;
2274
3.26k
          y -= 2;
2275
3.26k
        } while (y);
2276
178
      } else {
2277
178
        __m256i s_256[2][4];
2278
2279
178
        assert(w == 128);
2280
2281
178
        s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
2282
178
        s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));
2283
178
        s_256[0][2] = _mm256_loadu_si256((__m256i *)(src_ptr + 2 * 32));
2284
178
        s_256[0][3] = _mm256_loadu_si256((__m256i *)(src_ptr + 3 * 32));
2285
2286
7.64k
        do {
2287
7.64k
          sr_y_2tap_32_avg_avx2(src_ptr + src_stride, s_256[0][0], &s_256[1][0],
2288
7.64k
                                dst);
2289
7.64k
          sr_y_2tap_32_avg_avx2(src_ptr + src_stride + 1 * 32, s_256[0][1],
2290
7.64k
                                &s_256[1][1], dst + 1 * 32);
2291
7.64k
          sr_y_2tap_32_avg_avx2(src_ptr + src_stride + 2 * 32, s_256[0][2],
2292
7.64k
                                &s_256[1][2], dst + 2 * 32);
2293
7.64k
          sr_y_2tap_32_avg_avx2(src_ptr + src_stride + 3 * 32, s_256[0][3],
2294
7.64k
                                &s_256[1][3], dst + 3 * 32);
2295
2296
7.64k
          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride, s_256[1][0],
2297
7.64k
                                &s_256[0][0], dst + dst_stride);
2298
7.64k
          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride + 1 * 32, s_256[1][1],
2299
7.64k
                                &s_256[0][1], dst + dst_stride + 1 * 32);
2300
7.64k
          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride + 2 * 32, s_256[1][2],
2301
7.64k
                                &s_256[0][2], dst + dst_stride + 2 * 32);
2302
7.64k
          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride + 3 * 32, s_256[1][3],
2303
7.64k
                                &s_256[0][3], dst + dst_stride + 3 * 32);
2304
2305
7.64k
          src_ptr += 2 * src_stride;
2306
7.64k
          dst += 2 * dst_stride;
2307
7.64k
          y -= 2;
2308
7.64k
        } while (y);
2309
178
      }
2310
22.5k
    }
2311
742k
  } else if (vert_tap == 4) {
2312
    // vert_filt as 4 tap
2313
386k
    const uint8_t *src_ptr = src - src_stride;
2314
2315
386k
    y = h;
2316
2317
386k
    if (w <= 4) {
2318
184k
      prepare_half_coeffs_4tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);
2319
2320
184k
      if (w == 2) {
2321
32.6k
        __m128i s_16[4], ss_128[2];
2322
2323
32.6k
        s_16[0] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 0 * src_stride));
2324
32.6k
        s_16[1] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 1 * src_stride));
2325
32.6k
        s_16[2] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 2 * src_stride));
2326
2327
32.6k
        const __m128i src01 = _mm_unpacklo_epi16(s_16[0], s_16[1]);
2328
32.6k
        const __m128i src12 = _mm_unpacklo_epi16(s_16[1], s_16[2]);
2329
2330
32.6k
        ss_128[0] = _mm_unpacklo_epi8(src01, src12);
2331
2332
56.4k
        do {
2333
56.4k
          src_ptr += 2 * src_stride;
2334
56.4k
          const __m128i res = y_convolve_4tap_2x2_ssse3(
2335
56.4k
              src_ptr, src_stride, coeffs_128, s_16, ss_128);
2336
56.4k
          const __m128i r = sr_y_round_sse2(res);
2337
56.4k
          pack_store_2x2_sse2(r, dst, dst_stride);
2338
2339
56.4k
          ss_128[0] = ss_128[1];
2340
56.4k
          dst += 2 * dst_stride;
2341
56.4k
          y -= 2;
2342
56.4k
        } while (y);
2343
152k
      } else {
2344
152k
        __m128i s_32[4], ss_128[2];
2345
2346
152k
        assert(w == 4);
2347
2348
152k
        s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 0 * src_stride));
2349
152k
        s_32[1] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 1 * src_stride));
2350
152k
        s_32[2] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 2 * src_stride));
2351
2352
152k
        const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
2353
152k
        const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
2354
2355
152k
        ss_128[0] = _mm_unpacklo_epi8(src01, src12);
2356
2357
300k
        do {
2358
300k
          src_ptr += 2 * src_stride;
2359
300k
          const __m128i res = y_convolve_4tap_4x2_ssse3(
2360
300k
              src_ptr, src_stride, coeffs_128, s_32, ss_128);
2361
300k
          const __m128i r = sr_y_round_sse2(res);
2362
300k
          pack_store_4x2_sse2(r, dst, dst_stride);
2363
2364
300k
          ss_128[0] = ss_128[1];
2365
300k
          dst += 2 * dst_stride;
2366
300k
          y -= 2;
2367
300k
        } while (y);
2368
152k
      }
2369
201k
    } else {
2370
201k
      prepare_half_coeffs_4tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
2371
2372
201k
      if (w == 8) {
2373
131k
        __m128i s_64[4];
2374
131k
        __m256i ss_256[2];
2375
2376
131k
        s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 0 * src_stride));
2377
131k
        s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + 1 * src_stride));
2378
131k
        s_64[2] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
2379
2380
        // Load lines a and b. Line a to lower 128, line b to upper 128
2381
131k
        const __m256i src01 = _mm256_setr_m128i(s_64[0], s_64[1]);
2382
131k
        const __m256i src12 = _mm256_setr_m128i(s_64[1], s_64[2]);
2383
2384
131k
        ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
2385
2386
261k
        do {
2387
261k
          src_ptr += 2 * src_stride;
2388
261k
          const __m256i res = y_convolve_4tap_8x2_avx2(
2389
261k
              src_ptr, src_stride, coeffs_256, s_64, ss_256);
2390
261k
          sr_y_round_store_8x2_avx2(res, dst, dst_stride);
2391
2392
261k
          ss_256[0] = ss_256[1];
2393
261k
          dst += 2 * dst_stride;
2394
261k
          y -= 2;
2395
261k
        } while (y);
2396
131k
      } else if (w == 16) {
2397
63.6k
        __m128i s_128[4];
2398
63.6k
        __m256i ss_256[4], r[2];
2399
2400
63.6k
        s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 0 * src_stride));
2401
63.6k
        s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + 1 * src_stride));
2402
63.6k
        s_128[2] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
2403
2404
        // Load lines a and b. Line a to lower 128, line b to upper 128
2405
63.6k
        const __m256i src01 = _mm256_setr_m128i(s_128[0], s_128[1]);
2406
63.6k
        const __m256i src12 = _mm256_setr_m128i(s_128[1], s_128[2]);
2407
2408
63.6k
        ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
2409
63.6k
        ss_256[2] = _mm256_unpackhi_epi8(src01, src12);
2410
2411
157k
        do {
2412
157k
          src_ptr += 2 * src_stride;
2413
157k
          y_convolve_4tap_16x2_avx2(src_ptr, src_stride, coeffs_256, s_128,
2414
157k
                                    ss_256, r);
2415
157k
          sr_y_round_store_16x2_avx2(r, dst, dst_stride);
2416
2417
157k
          ss_256[0] = ss_256[1];
2418
157k
          ss_256[2] = ss_256[3];
2419
157k
          dst += 2 * dst_stride;
2420
157k
          y -= 2;
2421
157k
        } while (y);
2422
63.6k
      } else if (w == 32) {
2423
        // AV1 standard won't have 32x4 case.
2424
        // This only favors some optimization feature which
2425
        // subsamples 32x8 to 32x4 and triggers 4-tap filter.
2426
2427
4.05k
        __m256i s_256[4], ss_256[4], tt_256[4], r[4];
2428
2429
4.05k
        s_256[0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * src_stride));
2430
4.05k
        s_256[1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * src_stride));
2431
4.05k
        s_256[2] = _mm256_loadu_si256((__m256i *)(src_ptr + 2 * src_stride));
2432
2433
4.05k
        ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
2434
4.05k
        ss_256[2] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
2435
2436
4.05k
        tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
2437
4.05k
        tt_256[2] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
2438
2439
31.1k
        do {
2440
31.1k
          src_ptr += 2 * src_stride;
2441
31.1k
          y_convolve_4tap_32x2_avx2(src_ptr, src_stride, coeffs_256, s_256,
2442
31.1k
                                    ss_256, tt_256, r);
2443
31.1k
          sr_y_round_store_32x2_avx2(r, dst, dst_stride);
2444
2445
31.1k
          ss_256[0] = ss_256[1];
2446
31.1k
          ss_256[2] = ss_256[3];
2447
2448
31.1k
          tt_256[0] = tt_256[1];
2449
31.1k
          tt_256[2] = tt_256[3];
2450
31.1k
          dst += 2 * dst_stride;
2451
31.1k
          y -= 2;
2452
31.1k
        } while (y);
2453
4.05k
      } else {
2454
2.18k
        assert(!(w % 32));
2455
2456
2.19k
        __m256i s_256[4], ss_256[4], tt_256[4], r[4];
2457
2.19k
        x = 0;
2458
5.54k
        do {
2459
5.54k
          const uint8_t *s = src_ptr + x;
2460
5.54k
          uint8_t *d = dst + x;
2461
5.54k
          s_256[0] = _mm256_loadu_si256((__m256i *)(s + 0 * src_stride));
2462
5.54k
          s_256[1] = _mm256_loadu_si256((__m256i *)(s + 1 * src_stride));
2463
5.54k
          s_256[2] = _mm256_loadu_si256((__m256i *)(s + 2 * src_stride));
2464
2465
5.54k
          ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
2466
5.54k
          ss_256[2] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
2467
2468
5.54k
          tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
2469
5.54k
          tt_256[2] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
2470
2471
5.54k
          y = h;
2472
243k
          do {
2473
243k
            s += 2 * src_stride;
2474
243k
            y_convolve_4tap_32x2_avx2(s, src_stride, coeffs_256, s_256, ss_256,
2475
243k
                                      tt_256, r);
2476
243k
            sr_y_round_store_32x2_avx2(r, d, dst_stride);
2477
2478
243k
            ss_256[0] = ss_256[1];
2479
243k
            ss_256[2] = ss_256[3];
2480
2481
243k
            tt_256[0] = tt_256[1];
2482
243k
            tt_256[2] = tt_256[3];
2483
243k
            d += 2 * dst_stride;
2484
243k
            y -= 2;
2485
243k
          } while (y);
2486
5.54k
          x += 32;
2487
5.54k
        } while (x < w);
2488
2.19k
      }
2489
201k
    }
2490
386k
  } else if (vert_tap == 6) {
2491
    // vert_filt as 6 tap
2492
338k
    const uint8_t *src_ptr = src - 2 * src_stride;
2493
2494
338k
    if (w <= 4) {
2495
111k
      prepare_half_coeffs_6tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);
2496
2497
111k
      y = h;
2498
2499
111k
      if (w == 2) {
2500
23.2k
        __m128i s_16[6], ss_128[3];
2501
2502
23.2k
        s_16[0] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 0 * src_stride));
2503
23.2k
        s_16[1] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 1 * src_stride));
2504
23.2k
        s_16[2] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 2 * src_stride));
2505
23.2k
        s_16[3] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 3 * src_stride));
2506
23.2k
        s_16[4] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 4 * src_stride));
2507
2508
23.2k
        const __m128i src01 = _mm_unpacklo_epi16(s_16[0], s_16[1]);
2509
23.2k
        const __m128i src12 = _mm_unpacklo_epi16(s_16[1], s_16[2]);
2510
23.2k
        const __m128i src23 = _mm_unpacklo_epi16(s_16[2], s_16[3]);
2511
23.2k
        const __m128i src34 = _mm_unpacklo_epi16(s_16[3], s_16[4]);
2512
2513
23.2k
        ss_128[0] = _mm_unpacklo_epi8(src01, src12);
2514
23.2k
        ss_128[1] = _mm_unpacklo_epi8(src23, src34);
2515
2516
93.0k
        do {
2517
93.0k
          src_ptr += 2 * src_stride;
2518
93.0k
          const __m128i res = y_convolve_6tap_2x2_ssse3(
2519
93.0k
              src_ptr, src_stride, coeffs_128, s_16, ss_128);
2520
93.0k
          const __m128i r = sr_y_round_sse2(res);
2521
93.0k
          pack_store_2x2_sse2(r, dst, dst_stride);
2522
2523
93.0k
          ss_128[0] = ss_128[1];
2524
93.0k
          ss_128[1] = ss_128[2];
2525
93.0k
          dst += 2 * dst_stride;
2526
93.0k
          y -= 2;
2527
93.0k
        } while (y);
2528
88.3k
      } else {
2529
88.3k
        __m128i s_32[6], ss_128[3];
2530
2531
88.3k
        assert(w == 4);
2532
2533
88.3k
        s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 0 * src_stride));
2534
88.3k
        s_32[1] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 1 * src_stride));
2535
88.3k
        s_32[2] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 2 * src_stride));
2536
88.3k
        s_32[3] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 3 * src_stride));
2537
88.3k
        s_32[4] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 4 * src_stride));
2538
2539
88.3k
        const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
2540
88.3k
        const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
2541
88.3k
        const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
2542
88.3k
        const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[4]);
2543
2544
88.3k
        ss_128[0] = _mm_unpacklo_epi8(src01, src12);
2545
88.3k
        ss_128[1] = _mm_unpacklo_epi8(src23, src34);
2546
2547
485k
        do {
2548
485k
          src_ptr += 2 * src_stride;
2549
485k
          const __m128i res = y_convolve_6tap_4x2_ssse3(
2550
485k
              src_ptr, src_stride, coeffs_128, s_32, ss_128);
2551
485k
          const __m128i r = sr_y_round_sse2(res);
2552
485k
          pack_store_4x2_sse2(r, dst, dst_stride);
2553
2554
485k
          ss_128[0] = ss_128[1];
2555
485k
          ss_128[1] = ss_128[2];
2556
485k
          dst += 2 * dst_stride;
2557
485k
          y -= 2;
2558
485k
        } while (y);
2559
88.3k
      }
2560
227k
    } else {
2561
227k
      prepare_half_coeffs_6tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
2562
2563
227k
      if (w == 8) {
2564
110k
        __m128i s_64[6];
2565
110k
        __m256i ss_256[3];
2566
2567
110k
        s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 0 * src_stride));
2568
110k
        s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + 1 * src_stride));
2569
110k
        s_64[2] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
2570
110k
        s_64[3] = _mm_loadl_epi64((__m128i *)(src_ptr + 3 * src_stride));
2571
110k
        s_64[4] = _mm_loadl_epi64((__m128i *)(src_ptr + 4 * src_stride));
2572
2573
        // Load lines a and b. Line a to lower 128, line b to upper 128
2574
110k
        const __m256i src01 = _mm256_setr_m128i(s_64[0], s_64[1]);
2575
110k
        const __m256i src12 = _mm256_setr_m128i(s_64[1], s_64[2]);
2576
110k
        const __m256i src23 = _mm256_setr_m128i(s_64[2], s_64[3]);
2577
110k
        const __m256i src34 = _mm256_setr_m128i(s_64[3], s_64[4]);
2578
2579
110k
        ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
2580
110k
        ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
2581
2582
110k
        y = h;
2583
612k
        do {
2584
612k
          src_ptr += 2 * src_stride;
2585
612k
          const __m256i res = y_convolve_6tap_8x2_avx2(
2586
612k
              src_ptr, src_stride, coeffs_256, s_64, ss_256);
2587
612k
          sr_y_round_store_8x2_avx2(res, dst, dst_stride);
2588
2589
612k
          ss_256[0] = ss_256[1];
2590
612k
          ss_256[1] = ss_256[2];
2591
612k
          dst += 2 * dst_stride;
2592
612k
          y -= 2;
2593
612k
        } while (y);
2594
116k
      } else if (w == 16) {
2595
82.4k
        __m128i s_128[6];
2596
82.4k
        __m256i ss_256[6], r[2];
2597
2598
82.4k
        s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 0 * src_stride));
2599
82.4k
        s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + 1 * src_stride));
2600
82.4k
        s_128[2] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
2601
82.4k
        s_128[3] = _mm_loadu_si128((__m128i *)(src_ptr + 3 * src_stride));
2602
82.4k
        s_128[4] = _mm_loadu_si128((__m128i *)(src_ptr + 4 * src_stride));
2603
2604
        // Load lines a and b. Line a to lower 128, line b to upper 128
2605
82.4k
        const __m256i src01 = _mm256_setr_m128i(s_128[0], s_128[1]);
2606
82.4k
        const __m256i src12 = _mm256_setr_m128i(s_128[1], s_128[2]);
2607
82.4k
        const __m256i src23 = _mm256_setr_m128i(s_128[2], s_128[3]);
2608
82.4k
        const __m256i src34 = _mm256_setr_m128i(s_128[3], s_128[4]);
2609
2610
82.4k
        ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
2611
82.4k
        ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
2612
2613
82.4k
        ss_256[3] = _mm256_unpackhi_epi8(src01, src12);
2614
82.4k
        ss_256[4] = _mm256_unpackhi_epi8(src23, src34);
2615
2616
82.4k
        y = h;
2617
578k
        do {
2618
578k
          src_ptr += 2 * src_stride;
2619
578k
          y_convolve_6tap_16x2_avx2(src_ptr, src_stride, coeffs_256, s_128,
2620
578k
                                    ss_256, r);
2621
578k
          sr_y_round_store_16x2_avx2(r, dst, dst_stride);
2622
2623
578k
          ss_256[0] = ss_256[1];
2624
578k
          ss_256[1] = ss_256[2];
2625
2626
578k
          ss_256[3] = ss_256[4];
2627
578k
          ss_256[4] = ss_256[5];
2628
578k
          dst += 2 * dst_stride;
2629
578k
          y -= 2;
2630
578k
        } while (y);
2631
82.4k
      } else {
2632
34.1k
        __m256i s_256[6], ss_256[6], tt_256[6], r[4];
2633
2634
34.1k
        assert(!(w % 32));
2635
2636
34.2k
        x = 0;
2637
40.4k
        do {
2638
40.4k
          const uint8_t *s = src_ptr + x;
2639
40.4k
          uint8_t *d = dst + x;
2640
2641
40.4k
          s_256[0] = _mm256_loadu_si256((__m256i *)(s + 0 * src_stride));
2642
40.4k
          s_256[1] = _mm256_loadu_si256((__m256i *)(s + 1 * src_stride));
2643
40.4k
          s_256[2] = _mm256_loadu_si256((__m256i *)(s + 2 * src_stride));
2644
40.4k
          s_256[3] = _mm256_loadu_si256((__m256i *)(s + 3 * src_stride));
2645
40.4k
          s_256[4] = _mm256_loadu_si256((__m256i *)(s + 4 * src_stride));
2646
2647
40.4k
          ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
2648
40.4k
          ss_256[1] = _mm256_unpacklo_epi8(s_256[2], s_256[3]);
2649
40.4k
          ss_256[3] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
2650
40.4k
          ss_256[4] = _mm256_unpackhi_epi8(s_256[2], s_256[3]);
2651
2652
40.4k
          tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
2653
40.4k
          tt_256[1] = _mm256_unpacklo_epi8(s_256[3], s_256[4]);
2654
40.4k
          tt_256[3] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
2655
40.4k
          tt_256[4] = _mm256_unpackhi_epi8(s_256[3], s_256[4]);
2656
2657
40.4k
          y = h;
2658
650k
          do {
2659
650k
            s += 2 * src_stride;
2660
650k
            y_convolve_6tap_32x2_avx2(s, src_stride, coeffs_256, s_256, ss_256,
2661
650k
                                      tt_256, r);
2662
650k
            sr_y_round_store_32x2_avx2(r, d, dst_stride);
2663
2664
650k
            ss_256[0] = ss_256[1];
2665
650k
            ss_256[1] = ss_256[2];
2666
650k
            ss_256[3] = ss_256[4];
2667
650k
            ss_256[4] = ss_256[5];
2668
2669
650k
            tt_256[0] = tt_256[1];
2670
650k
            tt_256[1] = tt_256[2];
2671
650k
            tt_256[3] = tt_256[4];
2672
650k
            tt_256[4] = tt_256[5];
2673
650k
            d += 2 * dst_stride;
2674
650k
            y -= 2;
2675
650k
          } while (y);
2676
2677
40.4k
          x += 32;
2678
40.4k
        } while (x < w);
2679
34.2k
      }
2680
227k
    }
2681
338k
  } else if (vert_tap == 8) {
2682
    // vert_filt as 8 tap
2683
16.5k
    const uint8_t *src_ptr = src - 3 * src_stride;
2684
2685
16.5k
    if (w <= 4) {
2686
6.88k
      prepare_half_coeffs_8tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);
2687
2688
6.88k
      y = h;
2689
2690
6.88k
      if (w == 2) {
2691
1.80k
        __m128i s_16[8], ss_128[4];
2692
2693
1.80k
        s_16[0] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 0 * src_stride));
2694
1.80k
        s_16[1] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 1 * src_stride));
2695
1.80k
        s_16[2] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 2 * src_stride));
2696
1.80k
        s_16[3] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 3 * src_stride));
2697
1.80k
        s_16[4] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 4 * src_stride));
2698
1.80k
        s_16[5] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 5 * src_stride));
2699
1.80k
        s_16[6] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 6 * src_stride));
2700
2701
1.80k
        const __m128i src01 = _mm_unpacklo_epi16(s_16[0], s_16[1]);
2702
1.80k
        const __m128i src12 = _mm_unpacklo_epi16(s_16[1], s_16[2]);
2703
1.80k
        const __m128i src23 = _mm_unpacklo_epi16(s_16[2], s_16[3]);
2704
1.80k
        const __m128i src34 = _mm_unpacklo_epi16(s_16[3], s_16[4]);
2705
1.80k
        const __m128i src45 = _mm_unpacklo_epi16(s_16[4], s_16[5]);
2706
1.80k
        const __m128i src56 = _mm_unpacklo_epi16(s_16[5], s_16[6]);
2707
2708
1.80k
        ss_128[0] = _mm_unpacklo_epi8(src01, src12);
2709
1.80k
        ss_128[1] = _mm_unpacklo_epi8(src23, src34);
2710
1.80k
        ss_128[2] = _mm_unpacklo_epi8(src45, src56);
2711
2712
7.22k
        do {
2713
7.22k
          const __m128i res = y_convolve_8tap_2x2_ssse3(
2714
7.22k
              src_ptr, src_stride, coeffs_128, s_16, ss_128);
2715
7.22k
          const __m128i r = sr_y_round_sse2(res);
2716
7.22k
          pack_store_2x2_sse2(r, dst, dst_stride);
2717
7.22k
          ss_128[0] = ss_128[1];
2718
7.22k
          ss_128[1] = ss_128[2];
2719
7.22k
          ss_128[2] = ss_128[3];
2720
7.22k
          src_ptr += 2 * src_stride;
2721
7.22k
          dst += 2 * dst_stride;
2722
7.22k
          y -= 2;
2723
7.22k
        } while (y);
2724
5.08k
      } else {
2725
5.08k
        __m128i s_32[8], ss_128[4];
2726
2727
5.08k
        assert(w == 4);
2728
2729
5.08k
        s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 0 * src_stride));
2730
5.08k
        s_32[1] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 1 * src_stride));
2731
5.08k
        s_32[2] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 2 * src_stride));
2732
5.08k
        s_32[3] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 3 * src_stride));
2733
5.08k
        s_32[4] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 4 * src_stride));
2734
5.08k
        s_32[5] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 5 * src_stride));
2735
5.08k
        s_32[6] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 6 * src_stride));
2736
2737
5.08k
        const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
2738
5.08k
        const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
2739
5.08k
        const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
2740
5.08k
        const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[4]);
2741
5.08k
        const __m128i src45 = _mm_unpacklo_epi32(s_32[4], s_32[5]);
2742
5.08k
        const __m128i src56 = _mm_unpacklo_epi32(s_32[5], s_32[6]);
2743
2744
5.08k
        ss_128[0] = _mm_unpacklo_epi8(src01, src12);
2745
5.08k
        ss_128[1] = _mm_unpacklo_epi8(src23, src34);
2746
5.08k
        ss_128[2] = _mm_unpacklo_epi8(src45, src56);
2747
2748
27.8k
        do {
2749
27.8k
          const __m128i res = y_convolve_8tap_4x2_ssse3(
2750
27.8k
              src_ptr, src_stride, coeffs_128, s_32, ss_128);
2751
27.8k
          const __m128i r = sr_y_round_sse2(res);
2752
27.8k
          pack_store_4x2_sse2(r, dst, dst_stride);
2753
27.8k
          ss_128[0] = ss_128[1];
2754
27.8k
          ss_128[1] = ss_128[2];
2755
27.8k
          ss_128[2] = ss_128[3];
2756
27.8k
          src_ptr += 2 * src_stride;
2757
27.8k
          dst += 2 * dst_stride;
2758
27.8k
          y -= 2;
2759
27.8k
        } while (y);
2760
5.08k
      }
2761
9.67k
    } else {
2762
9.67k
      prepare_half_coeffs_8tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
2763
2764
9.67k
      if (w == 8) {
2765
4.67k
        __m128i s_64[8];
2766
4.67k
        __m256i ss_256[4];
2767
2768
4.67k
        s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 0 * src_stride));
2769
4.67k
        s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + 1 * src_stride));
2770
4.67k
        s_64[2] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
2771
4.67k
        s_64[3] = _mm_loadl_epi64((__m128i *)(src_ptr + 3 * src_stride));
2772
4.67k
        s_64[4] = _mm_loadl_epi64((__m128i *)(src_ptr + 4 * src_stride));
2773
4.67k
        s_64[5] = _mm_loadl_epi64((__m128i *)(src_ptr + 5 * src_stride));
2774
4.67k
        s_64[6] = _mm_loadl_epi64((__m128i *)(src_ptr + 6 * src_stride));
2775
2776
        // Load lines a and b. Line a to lower 128, line b to upper 128
2777
4.67k
        const __m256i src01 = _mm256_setr_m128i(s_64[0], s_64[1]);
2778
4.67k
        const __m256i src12 = _mm256_setr_m128i(s_64[1], s_64[2]);
2779
4.67k
        const __m256i src23 = _mm256_setr_m128i(s_64[2], s_64[3]);
2780
4.67k
        const __m256i src34 = _mm256_setr_m128i(s_64[3], s_64[4]);
2781
4.67k
        const __m256i src45 = _mm256_setr_m128i(s_64[4], s_64[5]);
2782
4.67k
        const __m256i src56 = _mm256_setr_m128i(s_64[5], s_64[6]);
2783
2784
4.67k
        ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
2785
4.67k
        ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
2786
4.67k
        ss_256[2] = _mm256_unpacklo_epi8(src45, src56);
2787
2788
4.67k
        y = h;
2789
27.4k
        do {
2790
27.4k
          const __m256i res = y_convolve_8tap_8x2_avx2(
2791
27.4k
              src_ptr, src_stride, coeffs_256, s_64, ss_256);
2792
27.4k
          sr_y_round_store_8x2_avx2(res, dst, dst_stride);
2793
27.4k
          ss_256[0] = ss_256[1];
2794
27.4k
          ss_256[1] = ss_256[2];
2795
27.4k
          ss_256[2] = ss_256[3];
2796
27.4k
          src_ptr += 2 * src_stride;
2797
27.4k
          dst += 2 * dst_stride;
2798
27.4k
          y -= 2;
2799
27.4k
        } while (y);
2800
4.99k
      } else if (w == 16) {
2801
3.19k
        __m128i s_128[8];
2802
3.19k
        __m256i ss_256[8], r[2];
2803
2804
3.19k
        s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 0 * src_stride));
2805
3.19k
        s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + 1 * src_stride));
2806
3.19k
        s_128[2] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
2807
3.19k
        s_128[3] = _mm_loadu_si128((__m128i *)(src_ptr + 3 * src_stride));
2808
3.19k
        s_128[4] = _mm_loadu_si128((__m128i *)(src_ptr + 4 * src_stride));
2809
3.19k
        s_128[5] = _mm_loadu_si128((__m128i *)(src_ptr + 5 * src_stride));
2810
3.19k
        s_128[6] = _mm_loadu_si128((__m128i *)(src_ptr + 6 * src_stride));
2811
2812
        // Load lines a and b. Line a to lower 128, line b to upper 128
2813
3.19k
        const __m256i src01 = _mm256_setr_m128i(s_128[0], s_128[1]);
2814
3.19k
        const __m256i src12 = _mm256_setr_m128i(s_128[1], s_128[2]);
2815
3.19k
        const __m256i src23 = _mm256_setr_m128i(s_128[2], s_128[3]);
2816
3.19k
        const __m256i src34 = _mm256_setr_m128i(s_128[3], s_128[4]);
2817
3.19k
        const __m256i src45 = _mm256_setr_m128i(s_128[4], s_128[5]);
2818
3.19k
        const __m256i src56 = _mm256_setr_m128i(s_128[5], s_128[6]);
2819
2820
3.19k
        ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
2821
3.19k
        ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
2822
3.19k
        ss_256[2] = _mm256_unpacklo_epi8(src45, src56);
2823
2824
3.19k
        ss_256[4] = _mm256_unpackhi_epi8(src01, src12);
2825
3.19k
        ss_256[5] = _mm256_unpackhi_epi8(src23, src34);
2826
3.19k
        ss_256[6] = _mm256_unpackhi_epi8(src45, src56);
2827
2828
3.19k
        y = h;
2829
23.0k
        do {
2830
23.0k
          y_convolve_8tap_16x2_avx2(src_ptr, src_stride, coeffs_256, s_128,
2831
23.0k
                                    ss_256, r);
2832
23.0k
          sr_y_round_store_16x2_avx2(r, dst, dst_stride);
2833
2834
23.0k
          ss_256[0] = ss_256[1];
2835
23.0k
          ss_256[1] = ss_256[2];
2836
23.0k
          ss_256[2] = ss_256[3];
2837
2838
23.0k
          ss_256[4] = ss_256[5];
2839
23.0k
          ss_256[5] = ss_256[6];
2840
23.0k
          ss_256[6] = ss_256[7];
2841
23.0k
          src_ptr += 2 * src_stride;
2842
23.0k
          dst += 2 * dst_stride;
2843
23.0k
          y -= 2;
2844
23.0k
        } while (y);
2845
3.19k
      } else {
2846
1.79k
        __m256i s_256[8], ss_256[8], tt_256[8], r[4];
2847
2848
1.79k
        assert(!(w % 32));
2849
2850
1.79k
        x = 0;
2851
2.57k
        do {
2852
2.57k
          const uint8_t *s = src_ptr + x;
2853
2.57k
          uint8_t *d = dst + x;
2854
2855
2.57k
          s_256[0] = _mm256_loadu_si256((__m256i *)(s + 0 * src_stride));
2856
2.57k
          s_256[1] = _mm256_loadu_si256((__m256i *)(s + 1 * src_stride));
2857
2.57k
          s_256[2] = _mm256_loadu_si256((__m256i *)(s + 2 * src_stride));
2858
2.57k
          s_256[3] = _mm256_loadu_si256((__m256i *)(s + 3 * src_stride));
2859
2.57k
          s_256[4] = _mm256_loadu_si256((__m256i *)(s + 4 * src_stride));
2860
2.57k
          s_256[5] = _mm256_loadu_si256((__m256i *)(s + 5 * src_stride));
2861
2.57k
          s_256[6] = _mm256_loadu_si256((__m256i *)(s + 6 * src_stride));
2862
2863
2.57k
          ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
2864
2.57k
          ss_256[1] = _mm256_unpacklo_epi8(s_256[2], s_256[3]);
2865
2.57k
          ss_256[2] = _mm256_unpacklo_epi8(s_256[4], s_256[5]);
2866
2.57k
          ss_256[4] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
2867
2.57k
          ss_256[5] = _mm256_unpackhi_epi8(s_256[2], s_256[3]);
2868
2.57k
          ss_256[6] = _mm256_unpackhi_epi8(s_256[4], s_256[5]);
2869
2870
2.57k
          tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
2871
2.57k
          tt_256[1] = _mm256_unpacklo_epi8(s_256[3], s_256[4]);
2872
2.57k
          tt_256[2] = _mm256_unpacklo_epi8(s_256[5], s_256[6]);
2873
2.57k
          tt_256[4] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
2874
2.57k
          tt_256[5] = _mm256_unpackhi_epi8(s_256[3], s_256[4]);
2875
2.57k
          tt_256[6] = _mm256_unpackhi_epi8(s_256[5], s_256[6]);
2876
2877
2.57k
          y = h;
2878
57.2k
          do {
2879
57.2k
            y_convolve_8tap_32x2_avx2(s, src_stride, coeffs_256, s_256, ss_256,
2880
57.2k
                                      tt_256, r);
2881
57.2k
            sr_y_round_store_32x2_avx2(r, d, dst_stride);
2882
2883
57.2k
            ss_256[0] = ss_256[1];
2884
57.2k
            ss_256[1] = ss_256[2];
2885
57.2k
            ss_256[2] = ss_256[3];
2886
57.2k
            ss_256[4] = ss_256[5];
2887
57.2k
            ss_256[5] = ss_256[6];
2888
57.2k
            ss_256[6] = ss_256[7];
2889
2890
57.2k
            tt_256[0] = tt_256[1];
2891
57.2k
            tt_256[1] = tt_256[2];
2892
57.2k
            tt_256[2] = tt_256[3];
2893
57.2k
            tt_256[4] = tt_256[5];
2894
57.2k
            tt_256[5] = tt_256[6];
2895
57.2k
            tt_256[6] = tt_256[7];
2896
57.2k
            s += 2 * src_stride;
2897
57.2k
            d += 2 * dst_stride;
2898
57.2k
            y -= 2;
2899
57.2k
          } while (y);
2900
2901
2.57k
          x += 32;
2902
2.57k
        } while (x < w);
2903
1.79k
      }
2904
9.67k
    }
2905
16.5k
  }
2906
795k
}
2907
2908
static inline void sr_x_2tap_32_avx2(const uint8_t *const src,
2909
                                     const __m256i coeffs[1],
2910
175k
                                     uint8_t *const dst) {
2911
175k
  __m256i r[2];
2912
2913
175k
  x_convolve_2tap_32_avx2(src, coeffs, r);
2914
175k
  sr_x_round_store_32_avx2(r, dst);
2915
175k
}
Unexecuted instantiation: convolve_2d_avx2.c:sr_x_2tap_32_avx2
convolve_avx2.c:sr_x_2tap_32_avx2
Line
Count
Source
2910
175k
                                     uint8_t *const dst) {
2911
175k
  __m256i r[2];
2912
2913
175k
  x_convolve_2tap_32_avx2(src, coeffs, r);
2914
175k
  sr_x_round_store_32_avx2(r, dst);
2915
175k
}
2916
2917
static inline void sr_x_6tap_32_avx2(const uint8_t *const src,
2918
                                     const __m256i coeffs[3],
2919
                                     const __m256i filt[3],
2920
1.14M
                                     uint8_t *const dst) {
2921
1.14M
  __m256i r[2];
2922
2923
1.14M
  x_convolve_6tap_32_avx2(src, coeffs, filt, r);
2924
1.14M
  sr_x_round_store_32_avx2(r, dst);
2925
1.14M
}
Unexecuted instantiation: convolve_2d_avx2.c:sr_x_6tap_32_avx2
convolve_avx2.c:sr_x_6tap_32_avx2
Line
Count
Source
2920
1.14M
                                     uint8_t *const dst) {
2921
1.14M
  __m256i r[2];
2922
2923
1.14M
  x_convolve_6tap_32_avx2(src, coeffs, filt, r);
2924
1.14M
  sr_x_round_store_32_avx2(r, dst);
2925
1.14M
}
2926
2927
static AOM_FORCE_INLINE void sr_x_8tap_32_avx2(const uint8_t *const src,
2928
                                               const __m256i coeffs[4],
2929
                                               const __m256i filt[4],
2930
182k
                                               uint8_t *const dst) {
2931
182k
  __m256i r[2];
2932
2933
182k
  x_convolve_8tap_32_avx2(src, coeffs, filt, r);
2934
182k
  sr_x_round_store_32_avx2(r, dst);
2935
182k
}
Unexecuted instantiation: convolve_2d_avx2.c:sr_x_8tap_32_avx2
convolve_avx2.c:sr_x_8tap_32_avx2
Line
Count
Source
2930
182k
                                               uint8_t *const dst) {
2931
182k
  __m256i r[2];
2932
2933
182k
  x_convolve_8tap_32_avx2(src, coeffs, filt, r);
2934
182k
  sr_x_round_store_32_avx2(r, dst);
2935
182k
}
2936
2937
static AOM_FORCE_INLINE void av1_convolve_x_sr_specialized_avx2(
2938
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
2939
    int32_t w, int32_t h, const InterpFilterParams *filter_params_x,
2940
627k
    const int32_t subpel_x_q4, ConvolveParams *conv_params) {
2941
627k
  int32_t y = h;
2942
627k
  __m128i coeffs_128[4];
2943
627k
  __m256i coeffs_256[4];
2944
2945
627k
  assert(conv_params->round_0 == 3);
2946
627k
  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
2947
627k
         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
2948
627k
  (void)conv_params;
2949
2950
627k
  const int horz_tap = get_filter_tap(filter_params_x, subpel_x_q4);
2951
2952
627k
  if (horz_tap == 2) {
2953
    // horz_filt as 2 tap
2954
40.6k
    const uint8_t *src_ptr = src;
2955
2956
40.6k
    if (subpel_x_q4 != 8) {
2957
23.2k
      if (w <= 8) {
2958
17.2k
        prepare_half_coeffs_2tap_ssse3(filter_params_x, subpel_x_q4,
2959
17.2k
                                       coeffs_128);
2960
2961
17.2k
        if (w == 2) {
2962
4.66k
          do {
2963
4.66k
            const __m128i res =
2964
4.66k
                x_convolve_2tap_2x2_sse4_1(src_ptr, src_stride, coeffs_128);
2965
4.66k
            const __m128i r = sr_x_round_sse2(res);
2966
4.66k
            pack_store_2x2_sse2(r, dst, dst_stride);
2967
4.66k
            src_ptr += 2 * src_stride;
2968
4.66k
            dst += 2 * dst_stride;
2969
4.66k
            y -= 2;
2970
4.66k
          } while (y);
2971
14.6k
        } else if (w == 4) {
2972
25.4k
          do {
2973
25.4k
            const __m128i res =
2974
25.4k
                x_convolve_2tap_4x2_ssse3(src_ptr, src_stride, coeffs_128);
2975
25.4k
            const __m128i r = sr_x_round_sse2(res);
2976
25.4k
            pack_store_4x2_sse2(r, dst, dst_stride);
2977
25.4k
            src_ptr += 2 * src_stride;
2978
25.4k
            dst += 2 * dst_stride;
2979
25.4k
            y -= 2;
2980
25.4k
          } while (y);
2981
7.68k
        } else {
2982
6.95k
          assert(w == 8);
2983
2984
25.7k
          do {
2985
25.7k
            __m128i res[2];
2986
2987
25.7k
            x_convolve_2tap_8x2_ssse3(src_ptr, src_stride, coeffs_128, res);
2988
25.7k
            res[0] = sr_x_round_sse2(res[0]);
2989
25.7k
            res[1] = sr_x_round_sse2(res[1]);
2990
25.7k
            const __m128i d = _mm_packus_epi16(res[0], res[1]);
2991
25.7k
            _mm_storel_epi64((__m128i *)dst, d);
2992
25.7k
            _mm_storeh_epi64((__m128i *)(dst + dst_stride), d);
2993
2994
25.7k
            src_ptr += 2 * src_stride;
2995
25.7k
            dst += 2 * dst_stride;
2996
25.7k
            y -= 2;
2997
25.7k
          } while (y);
2998
6.95k
        }
2999
17.2k
      } else {
3000
6.00k
        prepare_half_coeffs_2tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);
3001
3002
6.00k
        if (w == 16) {
3003
18.3k
          do {
3004
18.3k
            __m256i r[2];
3005
3006
18.3k
            x_convolve_2tap_16x2_avx2(src_ptr, src_stride, coeffs_256, r);
3007
18.3k
            sr_x_round_store_16x2_avx2(r, dst, dst_stride);
3008
18.3k
            src_ptr += 2 * src_stride;
3009
18.3k
            dst += 2 * dst_stride;
3010
18.3k
            y -= 2;
3011
18.3k
          } while (y);
3012
3.56k
        } else if (w == 32) {
3013
27.1k
          do {
3014
27.1k
            sr_x_2tap_32_avx2(src_ptr, coeffs_256, dst);
3015
27.1k
            src_ptr += src_stride;
3016
27.1k
            dst += dst_stride;
3017
27.1k
          } while (--y);
3018
1.24k
        } else if (w == 64) {
3019
42.6k
          do {
3020
42.6k
            sr_x_2tap_32_avx2(src_ptr + 0 * 32, coeffs_256, dst + 0 * 32);
3021
42.6k
            sr_x_2tap_32_avx2(src_ptr + 1 * 32, coeffs_256, dst + 1 * 32);
3022
42.6k
            src_ptr += src_stride;
3023
42.6k
            dst += dst_stride;
3024
42.6k
          } while (--y);
3025
1.06k
        } else {
3026
178
          assert(w == 128);
3027
3028
15.7k
          do {
3029
15.7k
            sr_x_2tap_32_avx2(src_ptr + 0 * 32, coeffs_256, dst + 0 * 32);
3030
15.7k
            sr_x_2tap_32_avx2(src_ptr + 1 * 32, coeffs_256, dst + 1 * 32);
3031
15.7k
            sr_x_2tap_32_avx2(src_ptr + 2 * 32, coeffs_256, dst + 2 * 32);
3032
15.7k
            sr_x_2tap_32_avx2(src_ptr + 3 * 32, coeffs_256, dst + 3 * 32);
3033
15.7k
            src_ptr += src_stride;
3034
15.7k
            dst += dst_stride;
3035
15.7k
          } while (--y);
3036
178
        }
3037
6.00k
      }
3038
23.2k
    } else {
3039
      // average to get half pel
3040
17.3k
      if (w == 2) {
3041
5.10k
        do {
3042
5.10k
          __m128i s_128;
3043
3044
5.10k
          s_128 = load_u8_4x2_sse4_1(src_ptr, src_stride);
3045
5.10k
          const __m128i s1 = _mm_srli_si128(s_128, 1);
3046
5.10k
          const __m128i d = _mm_avg_epu8(s_128, s1);
3047
5.10k
          *(uint16_t *)dst = (uint16_t)_mm_cvtsi128_si32(d);
3048
5.10k
          *(uint16_t *)(dst + dst_stride) = _mm_extract_epi16(d, 2);
3049
3050
5.10k
          src_ptr += 2 * src_stride;
3051
5.10k
          dst += 2 * dst_stride;
3052
5.10k
          y -= 2;
3053
5.10k
        } while (y);
3054
14.8k
      } else if (w == 4) {
3055
18.3k
        do {
3056
18.3k
          __m128i s_128;
3057
3058
18.3k
          s_128 = load_u8_8x2_sse2(src_ptr, src_stride);
3059
18.3k
          const __m128i s1 = _mm_srli_si128(s_128, 1);
3060
18.3k
          const __m128i d = _mm_avg_epu8(s_128, s1);
3061
18.3k
          xx_storel_32(dst, d);
3062
18.3k
          *(int32_t *)(dst + dst_stride) = _mm_extract_epi32(d, 2);
3063
3064
18.3k
          src_ptr += 2 * src_stride;
3065
18.3k
          dst += 2 * dst_stride;
3066
18.3k
          y -= 2;
3067
18.3k
        } while (y);
3068
8.14k
      } else if (w == 8) {
3069
15.4k
        do {
3070
15.4k
          const __m128i s00 = _mm_loadu_si128((__m128i *)src_ptr);
3071
15.4k
          const __m128i s10 =
3072
15.4k
              _mm_loadu_si128((__m128i *)(src_ptr + src_stride));
3073
15.4k
          const __m128i s01 = _mm_srli_si128(s00, 1);
3074
15.4k
          const __m128i s11 = _mm_srli_si128(s10, 1);
3075
15.4k
          const __m128i d0 = _mm_avg_epu8(s00, s01);
3076
15.4k
          const __m128i d1 = _mm_avg_epu8(s10, s11);
3077
15.4k
          _mm_storel_epi64((__m128i *)dst, d0);
3078
15.4k
          _mm_storel_epi64((__m128i *)(dst + dst_stride), d1);
3079
3080
15.4k
          src_ptr += 2 * src_stride;
3081
15.4k
          dst += 2 * dst_stride;
3082
15.4k
          y -= 2;
3083
15.4k
        } while (y);
3084
4.52k
      } else if (w == 16) {
3085
11.4k
        do {
3086
11.4k
          const __m128i s00 = _mm_loadu_si128((__m128i *)src_ptr);
3087
11.4k
          const __m128i s01 = _mm_loadu_si128((__m128i *)(src_ptr + 1));
3088
11.4k
          const __m128i s10 =
3089
11.4k
              _mm_loadu_si128((__m128i *)(src_ptr + src_stride));
3090
11.4k
          const __m128i s11 =
3091
11.4k
              _mm_loadu_si128((__m128i *)(src_ptr + src_stride + 1));
3092
11.4k
          const __m128i d0 = _mm_avg_epu8(s00, s01);
3093
11.4k
          const __m128i d1 = _mm_avg_epu8(s10, s11);
3094
11.4k
          _mm_storeu_si128((__m128i *)dst, d0);
3095
11.4k
          _mm_storeu_si128((__m128i *)(dst + dst_stride), d1);
3096
3097
11.4k
          src_ptr += 2 * src_stride;
3098
11.4k
          dst += 2 * dst_stride;
3099
11.4k
          y -= 2;
3100
11.4k
        } while (y);
3101
1.93k
      } else if (w == 32) {
3102
23.7k
        do {
3103
23.7k
          sr_x_2tap_32_avg_avx2(src_ptr, dst);
3104
23.7k
          src_ptr += src_stride;
3105
23.7k
          dst += dst_stride;
3106
23.7k
        } while (--y);
3107
989
      } else if (w == 64) {
3108
21.0k
        do {
3109
21.0k
          sr_x_2tap_32_avg_avx2(src_ptr + 0 * 32, dst + 0 * 32);
3110
21.0k
          sr_x_2tap_32_avg_avx2(src_ptr + 1 * 32, dst + 1 * 32);
3111
21.0k
          src_ptr += src_stride;
3112
21.0k
          dst += dst_stride;
3113
21.0k
        } while (--y);
3114
397
      } else {
3115
301
        assert(w == 128);
3116
3117
24.3k
        do {
3118
24.3k
          sr_x_2tap_32_avg_avx2(src_ptr + 0 * 32, dst + 0 * 32);
3119
24.3k
          sr_x_2tap_32_avg_avx2(src_ptr + 1 * 32, dst + 1 * 32);
3120
24.3k
          sr_x_2tap_32_avg_avx2(src_ptr + 2 * 32, dst + 2 * 32);
3121
24.3k
          sr_x_2tap_32_avg_avx2(src_ptr + 3 * 32, dst + 3 * 32);
3122
24.3k
          src_ptr += src_stride;
3123
24.3k
          dst += dst_stride;
3124
24.3k
        } while (--y);
3125
301
      }
3126
17.3k
    }
3127
586k
  } else if (horz_tap == 4) {
3128
    // horz_filt as 4 tap
3129
259k
    const uint8_t *src_ptr = src - 1;
3130
3131
259k
    prepare_half_coeffs_4tap_ssse3(filter_params_x, subpel_x_q4, coeffs_128);
3132
3133
259k
    if (w == 2) {
3134
121k
      do {
3135
121k
        const __m128i res =
3136
121k
            x_convolve_4tap_2x2_ssse3(src_ptr, src_stride, coeffs_128);
3137
121k
        const __m128i r = sr_x_round_sse2(res);
3138
121k
        pack_store_2x2_sse2(r, dst, dst_stride);
3139
121k
        src_ptr += 2 * src_stride;
3140
121k
        dst += 2 * dst_stride;
3141
121k
        y -= 2;
3142
121k
      } while (y);
3143
215k
    } else if (w == 4) {
3144
651k
      do {
3145
651k
        const __m128i res =
3146
651k
            x_convolve_4tap_4x2_ssse3(src_ptr, src_stride, coeffs_128);
3147
651k
        const __m128i r = sr_x_round_sse2(res);
3148
651k
        pack_store_4x2_sse2(r, dst, dst_stride);
3149
651k
        src_ptr += 2 * src_stride;
3150
651k
        dst += 2 * dst_stride;
3151
651k
        y -= 2;
3152
651k
      } while (y);
3153
196k
    } else if (w == 8) {
3154
      // TODO(chiyotsai@google.com): Reuse the old SIMD code here. Need to
3155
      // rewrite this for better performance later.
3156
9.80k
      __m256i filt_256[2];
3157
9.80k
      prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs_256);
3158
3159
9.80k
      filt_256[0] = _mm256_loadu_si256((__m256i const *)filt1_global_avx2);
3160
9.80k
      filt_256[1] = _mm256_loadu_si256((__m256i const *)filt2_global_avx2);
3161
45.6k
      for (int i = 0; i < h; i += 2) {
3162
35.8k
        const __m256i data = _mm256_permute2x128_si256(
3163
35.8k
            _mm256_castsi128_si256(
3164
35.8k
                _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))),
3165
35.8k
            _mm256_castsi128_si256(_mm_loadu_si128(
3166
35.8k
                (__m128i *)(&src_ptr[i * src_stride + src_stride]))),
3167
35.8k
            0x20);
3168
3169
35.8k
        __m256i res_16b = convolve_lowbd_x_4tap(data, coeffs_256 + 1, filt_256);
3170
35.8k
        res_16b = sr_x_round_avx2(res_16b);
3171
3172
35.8k
        __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
3173
3174
35.8k
        const __m128i res_0 = _mm256_castsi256_si128(res_8b);
3175
35.8k
        const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);
3176
3177
35.8k
        _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0);
3178
35.8k
        _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1);
3179
35.8k
      }
3180
9.80k
    } else {
3181
8.93k
      assert(!(w % 16));
3182
      // TODO(chiyotsai@google.com): Reuse the old SIMD code here. Need to
3183
      // rewrite this for better performance later.
3184
8.93k
      __m256i filt_256[2];
3185
8.93k
      prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs_256);
3186
8.93k
      filt_256[0] = _mm256_loadu_si256((__m256i const *)filt1_global_avx2);
3187
8.93k
      filt_256[1] = _mm256_loadu_si256((__m256i const *)filt2_global_avx2);
3188
3189
245k
      for (int i = 0; i < h; ++i) {
3190
1.18M
        for (int j = 0; j < w; j += 16) {
3191
          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17
3192
          // 18 19 20 21 22 23
3193
952k
          const __m256i data = _mm256_inserti128_si256(
3194
952k
              _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]),
3195
952k
              _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]),
3196
952k
              1);
3197
3198
952k
          __m256i res_16b =
3199
952k
              convolve_lowbd_x_4tap(data, coeffs_256 + 1, filt_256);
3200
952k
          res_16b = sr_x_round_avx2(res_16b);
3201
3202
          /* rounding code */
3203
          // 8 bit conversion and saturation to uint8
3204
952k
          __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
3205
3206
          // Store values into the destination buffer
3207
          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
3208
952k
          res_8b = _mm256_permute4x64_epi64(res_8b, 216);
3209
952k
          __m128i res = _mm256_castsi256_si128(res_8b);
3210
952k
          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
3211
952k
        }
3212
236k
      }
3213
8.93k
    }
3214
326k
  } else {
3215
326k
    __m256i filt_256[4];
3216
3217
326k
    filt_256[0] = _mm256_loadu_si256((__m256i const *)filt1_global_avx2);
3218
326k
    filt_256[1] = _mm256_loadu_si256((__m256i const *)filt2_global_avx2);
3219
326k
    filt_256[2] = _mm256_loadu_si256((__m256i const *)filt3_global_avx2);
3220
3221
326k
    if (horz_tap == 6) {
3222
      // horz_filt as 6 tap
3223
305k
      const uint8_t *src_ptr = src - 2;
3224
3225
305k
      prepare_half_coeffs_6tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);
3226
3227
305k
      if (w == 8) {
3228
636k
        do {
3229
636k
          const __m256i res = x_convolve_6tap_8x2_avx2(src_ptr, src_stride,
3230
636k
                                                       coeffs_256, filt_256);
3231
636k
          sr_x_round_store_8x2_avx2(res, dst, dst_stride);
3232
636k
          src_ptr += 2 * src_stride;
3233
636k
          dst += 2 * dst_stride;
3234
636k
          y -= 2;
3235
636k
        } while (y);
3236
173k
      } else if (w == 16) {
3237
522k
        do {
3238
522k
          __m256i r[2];
3239
3240
522k
          x_convolve_6tap_16x2_avx2(src_ptr, src_stride, coeffs_256, filt_256,
3241
522k
                                    r);
3242
522k
          sr_x_round_store_16x2_avx2(r, dst, dst_stride);
3243
522k
          src_ptr += 2 * src_stride;
3244
522k
          dst += 2 * dst_stride;
3245
522k
          y -= 2;
3246
522k
        } while (y);
3247
103k
      } else if (w == 32) {
3248
493k
        do {
3249
493k
          sr_x_6tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
3250
493k
          src_ptr += src_stride;
3251
493k
          dst += dst_stride;
3252
493k
        } while (--y);
3253
25.2k
      } else if (w == 64) {
3254
189k
        do {
3255
189k
          sr_x_6tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
3256
189k
          sr_x_6tap_32_avx2(src_ptr + 32, coeffs_256, filt_256, dst + 32);
3257
189k
          src_ptr += src_stride;
3258
189k
          dst += dst_stride;
3259
189k
        } while (--y);
3260
3.92k
      } else {
3261
546
        assert(w == 128);
3262
3263
68.5k
        do {
3264
68.5k
          sr_x_6tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
3265
68.5k
          sr_x_6tap_32_avx2(src_ptr + 1 * 32, coeffs_256, filt_256,
3266
68.5k
                            dst + 1 * 32);
3267
68.5k
          sr_x_6tap_32_avx2(src_ptr + 2 * 32, coeffs_256, filt_256,
3268
68.5k
                            dst + 2 * 32);
3269
68.5k
          sr_x_6tap_32_avx2(src_ptr + 3 * 32, coeffs_256, filt_256,
3270
68.5k
                            dst + 3 * 32);
3271
68.5k
          src_ptr += src_stride;
3272
68.5k
          dst += dst_stride;
3273
68.5k
        } while (--y);
3274
610
      }
3275
305k
    } else if (horz_tap == 8) {
3276
      // horz_filt as 8 tap
3277
20.9k
      const uint8_t *src_ptr = src - 3;
3278
3279
20.9k
      filt_256[3] = _mm256_loadu_si256((__m256i const *)filt4_global_avx2);
3280
3281
20.9k
      prepare_half_coeffs_8tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);
3282
3283
20.9k
      if (w == 8) {
3284
43.6k
        do {
3285
43.6k
          const __m256i res = x_convolve_8tap_8x2_avx2(src_ptr, src_stride,
3286
43.6k
                                                       coeffs_256, filt_256);
3287
43.6k
          sr_x_round_store_8x2_avx2(res, dst, dst_stride);
3288
43.6k
          src_ptr += 2 * src_stride;
3289
43.6k
          dst += 2 * dst_stride;
3290
43.6k
          y -= 2;
3291
43.6k
        } while (y);
3292
11.6k
      } else if (w == 16) {
3293
37.1k
        do {
3294
37.1k
          __m256i r[2];
3295
3296
37.1k
          x_convolve_8tap_16x2_avx2(src_ptr, src_stride, coeffs_256, filt_256,
3297
37.1k
                                    r);
3298
37.1k
          sr_x_round_store_16x2_avx2(r, dst, dst_stride);
3299
37.1k
          src_ptr += 2 * src_stride;
3300
37.1k
          dst += 2 * dst_stride;
3301
37.1k
          y -= 2;
3302
37.1k
        } while (y);
3303
6.46k
      } else if (w == 32) {
3304
43.7k
        do {
3305
43.7k
          sr_x_8tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
3306
43.7k
          src_ptr += src_stride;
3307
43.7k
          dst += dst_stride;
3308
43.7k
        } while (--y);
3309
1.88k
      } else if (w == 64) {
3310
33.8k
        do {
3311
33.8k
          sr_x_8tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
3312
33.8k
          sr_x_8tap_32_avx2(src_ptr + 32, coeffs_256, filt_256, dst + 32);
3313
33.8k
          src_ptr += src_stride;
3314
33.8k
          dst += dst_stride;
3315
33.8k
        } while (--y);
3316
743
      } else {
3317
177
        assert(w == 128);
3318
3319
17.7k
        do {
3320
17.7k
          sr_x_8tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
3321
17.7k
          sr_x_8tap_32_avx2(src_ptr + 1 * 32, coeffs_256, filt_256,
3322
17.7k
                            dst + 1 * 32);
3323
17.7k
          sr_x_8tap_32_avx2(src_ptr + 2 * 32, coeffs_256, filt_256,
3324
17.7k
                            dst + 2 * 32);
3325
17.7k
          sr_x_8tap_32_avx2(src_ptr + 3 * 32, coeffs_256, filt_256,
3326
17.7k
                            dst + 3 * 32);
3327
17.7k
          src_ptr += src_stride;
3328
17.7k
          dst += dst_stride;
3329
17.7k
        } while (--y);
3330
177
      }
3331
20.9k
    }
3332
326k
  }
3333
627k
}
Unexecuted instantiation: convolve_2d_avx2.c:av1_convolve_x_sr_specialized_avx2
convolve_avx2.c:av1_convolve_x_sr_specialized_avx2
Line
Count
Source
2940
627k
    const int32_t subpel_x_q4, ConvolveParams *conv_params) {
2941
627k
  int32_t y = h;
2942
627k
  __m128i coeffs_128[4];
2943
627k
  __m256i coeffs_256[4];
2944
2945
627k
  assert(conv_params->round_0 == 3);
2946
627k
  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
2947
627k
         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
2948
627k
  (void)conv_params;
2949
2950
627k
  const int horz_tap = get_filter_tap(filter_params_x, subpel_x_q4);
2951
2952
627k
  if (horz_tap == 2) {
2953
    // horz_filt as 2 tap
2954
40.6k
    const uint8_t *src_ptr = src;
2955
2956
40.6k
    if (subpel_x_q4 != 8) {
2957
23.2k
      if (w <= 8) {
2958
17.2k
        prepare_half_coeffs_2tap_ssse3(filter_params_x, subpel_x_q4,
2959
17.2k
                                       coeffs_128);
2960
2961
17.2k
        if (w == 2) {
2962
4.66k
          do {
2963
4.66k
            const __m128i res =
2964
4.66k
                x_convolve_2tap_2x2_sse4_1(src_ptr, src_stride, coeffs_128);
2965
4.66k
            const __m128i r = sr_x_round_sse2(res);
2966
4.66k
            pack_store_2x2_sse2(r, dst, dst_stride);
2967
4.66k
            src_ptr += 2 * src_stride;
2968
4.66k
            dst += 2 * dst_stride;
2969
4.66k
            y -= 2;
2970
4.66k
          } while (y);
2971
14.6k
        } else if (w == 4) {
2972
25.4k
          do {
2973
25.4k
            const __m128i res =
2974
25.4k
                x_convolve_2tap_4x2_ssse3(src_ptr, src_stride, coeffs_128);
2975
25.4k
            const __m128i r = sr_x_round_sse2(res);
2976
25.4k
            pack_store_4x2_sse2(r, dst, dst_stride);
2977
25.4k
            src_ptr += 2 * src_stride;
2978
25.4k
            dst += 2 * dst_stride;
2979
25.4k
            y -= 2;
2980
25.4k
          } while (y);
2981
7.68k
        } else {
2982
6.95k
          assert(w == 8);
2983
2984
25.7k
          do {
2985
25.7k
            __m128i res[2];
2986
2987
25.7k
            x_convolve_2tap_8x2_ssse3(src_ptr, src_stride, coeffs_128, res);
2988
25.7k
            res[0] = sr_x_round_sse2(res[0]);
2989
25.7k
            res[1] = sr_x_round_sse2(res[1]);
2990
25.7k
            const __m128i d = _mm_packus_epi16(res[0], res[1]);
2991
25.7k
            _mm_storel_epi64((__m128i *)dst, d);
2992
25.7k
            _mm_storeh_epi64((__m128i *)(dst + dst_stride), d);
2993
2994
25.7k
            src_ptr += 2 * src_stride;
2995
25.7k
            dst += 2 * dst_stride;
2996
25.7k
            y -= 2;
2997
25.7k
          } while (y);
2998
6.95k
        }
2999
17.2k
      } else {
3000
6.00k
        prepare_half_coeffs_2tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);
3001
3002
6.00k
        if (w == 16) {
3003
18.3k
          do {
3004
18.3k
            __m256i r[2];
3005
3006
18.3k
            x_convolve_2tap_16x2_avx2(src_ptr, src_stride, coeffs_256, r);
3007
18.3k
            sr_x_round_store_16x2_avx2(r, dst, dst_stride);
3008
18.3k
            src_ptr += 2 * src_stride;
3009
18.3k
            dst += 2 * dst_stride;
3010
18.3k
            y -= 2;
3011
18.3k
          } while (y);
3012
3.56k
        } else if (w == 32) {
3013
27.1k
          do {
3014
27.1k
            sr_x_2tap_32_avx2(src_ptr, coeffs_256, dst);
3015
27.1k
            src_ptr += src_stride;
3016
27.1k
            dst += dst_stride;
3017
27.1k
          } while (--y);
3018
1.24k
        } else if (w == 64) {
3019
42.6k
          do {
3020
42.6k
            sr_x_2tap_32_avx2(src_ptr + 0 * 32, coeffs_256, dst + 0 * 32);
3021
42.6k
            sr_x_2tap_32_avx2(src_ptr + 1 * 32, coeffs_256, dst + 1 * 32);
3022
42.6k
            src_ptr += src_stride;
3023
42.6k
            dst += dst_stride;
3024
42.6k
          } while (--y);
3025
1.06k
        } else {
3026
178
          assert(w == 128);
3027
3028
15.7k
          do {
3029
15.7k
            sr_x_2tap_32_avx2(src_ptr + 0 * 32, coeffs_256, dst + 0 * 32);
3030
15.7k
            sr_x_2tap_32_avx2(src_ptr + 1 * 32, coeffs_256, dst + 1 * 32);
3031
15.7k
            sr_x_2tap_32_avx2(src_ptr + 2 * 32, coeffs_256, dst + 2 * 32);
3032
15.7k
            sr_x_2tap_32_avx2(src_ptr + 3 * 32, coeffs_256, dst + 3 * 32);
3033
15.7k
            src_ptr += src_stride;
3034
15.7k
            dst += dst_stride;
3035
15.7k
          } while (--y);
3036
178
        }
3037
6.00k
      }
3038
23.2k
    } else {
3039
      // average to get half pel
3040
17.3k
      if (w == 2) {
3041
5.10k
        do {
3042
5.10k
          __m128i s_128;
3043
3044
5.10k
          s_128 = load_u8_4x2_sse4_1(src_ptr, src_stride);
3045
5.10k
          const __m128i s1 = _mm_srli_si128(s_128, 1);
3046
5.10k
          const __m128i d = _mm_avg_epu8(s_128, s1);
3047
5.10k
          *(uint16_t *)dst = (uint16_t)_mm_cvtsi128_si32(d);
3048
5.10k
          *(uint16_t *)(dst + dst_stride) = _mm_extract_epi16(d, 2);
3049
3050
5.10k
          src_ptr += 2 * src_stride;
3051
5.10k
          dst += 2 * dst_stride;
3052
5.10k
          y -= 2;
3053
5.10k
        } while (y);
3054
14.8k
      } else if (w == 4) {
3055
18.3k
        do {
3056
18.3k
          __m128i s_128;
3057
3058
18.3k
          s_128 = load_u8_8x2_sse2(src_ptr, src_stride);
3059
18.3k
          const __m128i s1 = _mm_srli_si128(s_128, 1);
3060
18.3k
          const __m128i d = _mm_avg_epu8(s_128, s1);
3061
18.3k
          xx_storel_32(dst, d);
3062
18.3k
          *(int32_t *)(dst + dst_stride) = _mm_extract_epi32(d, 2);
3063
3064
18.3k
          src_ptr += 2 * src_stride;
3065
18.3k
          dst += 2 * dst_stride;
3066
18.3k
          y -= 2;
3067
18.3k
        } while (y);
3068
8.14k
      } else if (w == 8) {
3069
15.4k
        do {
3070
15.4k
          const __m128i s00 = _mm_loadu_si128((__m128i *)src_ptr);
3071
15.4k
          const __m128i s10 =
3072
15.4k
              _mm_loadu_si128((__m128i *)(src_ptr + src_stride));
3073
15.4k
          const __m128i s01 = _mm_srli_si128(s00, 1);
3074
15.4k
          const __m128i s11 = _mm_srli_si128(s10, 1);
3075
15.4k
          const __m128i d0 = _mm_avg_epu8(s00, s01);
3076
15.4k
          const __m128i d1 = _mm_avg_epu8(s10, s11);
3077
15.4k
          _mm_storel_epi64((__m128i *)dst, d0);
3078
15.4k
          _mm_storel_epi64((__m128i *)(dst + dst_stride), d1);
3079
3080
15.4k
          src_ptr += 2 * src_stride;
3081
15.4k
          dst += 2 * dst_stride;
3082
15.4k
          y -= 2;
3083
15.4k
        } while (y);
3084
4.52k
      } else if (w == 16) {
3085
11.4k
        do {
3086
11.4k
          const __m128i s00 = _mm_loadu_si128((__m128i *)src_ptr);
3087
11.4k
          const __m128i s01 = _mm_loadu_si128((__m128i *)(src_ptr + 1));
3088
11.4k
          const __m128i s10 =
3089
11.4k
              _mm_loadu_si128((__m128i *)(src_ptr + src_stride));
3090
11.4k
          const __m128i s11 =
3091
11.4k
              _mm_loadu_si128((__m128i *)(src_ptr + src_stride + 1));
3092
11.4k
          const __m128i d0 = _mm_avg_epu8(s00, s01);
3093
11.4k
          const __m128i d1 = _mm_avg_epu8(s10, s11);
3094
11.4k
          _mm_storeu_si128((__m128i *)dst, d0);
3095
11.4k
          _mm_storeu_si128((__m128i *)(dst + dst_stride), d1);
3096
3097
11.4k
          src_ptr += 2 * src_stride;
3098
11.4k
          dst += 2 * dst_stride;
3099
11.4k
          y -= 2;
3100
11.4k
        } while (y);
3101
1.93k
      } else if (w == 32) {
3102
23.7k
        do {
3103
23.7k
          sr_x_2tap_32_avg_avx2(src_ptr, dst);
3104
23.7k
          src_ptr += src_stride;
3105
23.7k
          dst += dst_stride;
3106
23.7k
        } while (--y);
3107
989
      } else if (w == 64) {
3108
21.0k
        do {
3109
21.0k
          sr_x_2tap_32_avg_avx2(src_ptr + 0 * 32, dst + 0 * 32);
3110
21.0k
          sr_x_2tap_32_avg_avx2(src_ptr + 1 * 32, dst + 1 * 32);
3111
21.0k
          src_ptr += src_stride;
3112
21.0k
          dst += dst_stride;
3113
21.0k
        } while (--y);
3114
397
      } else {
3115
301
        assert(w == 128);
3116
3117
24.3k
        do {
3118
24.3k
          sr_x_2tap_32_avg_avx2(src_ptr + 0 * 32, dst + 0 * 32);
3119
24.3k
          sr_x_2tap_32_avg_avx2(src_ptr + 1 * 32, dst + 1 * 32);
3120
24.3k
          sr_x_2tap_32_avg_avx2(src_ptr + 2 * 32, dst + 2 * 32);
3121
24.3k
          sr_x_2tap_32_avg_avx2(src_ptr + 3 * 32, dst + 3 * 32);
3122
24.3k
          src_ptr += src_stride;
3123
24.3k
          dst += dst_stride;
3124
24.3k
        } while (--y);
3125
301
      }
3126
17.3k
    }
3127
586k
  } else if (horz_tap == 4) {
3128
    // horz_filt as 4 tap
3129
259k
    const uint8_t *src_ptr = src - 1;
3130
3131
259k
    prepare_half_coeffs_4tap_ssse3(filter_params_x, subpel_x_q4, coeffs_128);
3132
3133
259k
    if (w == 2) {
3134
121k
      do {
3135
121k
        const __m128i res =
3136
121k
            x_convolve_4tap_2x2_ssse3(src_ptr, src_stride, coeffs_128);
3137
121k
        const __m128i r = sr_x_round_sse2(res);
3138
121k
        pack_store_2x2_sse2(r, dst, dst_stride);
3139
121k
        src_ptr += 2 * src_stride;
3140
121k
        dst += 2 * dst_stride;
3141
121k
        y -= 2;
3142
121k
      } while (y);
3143
215k
    } else if (w == 4) {
3144
651k
      do {
3145
651k
        const __m128i res =
3146
651k
            x_convolve_4tap_4x2_ssse3(src_ptr, src_stride, coeffs_128);
3147
651k
        const __m128i r = sr_x_round_sse2(res);
3148
651k
        pack_store_4x2_sse2(r, dst, dst_stride);
3149
651k
        src_ptr += 2 * src_stride;
3150
651k
        dst += 2 * dst_stride;
3151
651k
        y -= 2;
3152
651k
      } while (y);
3153
196k
    } else if (w == 8) {
3154
      // TODO(chiyotsai@google.com): Reuse the old SIMD code here. Need to
3155
      // rewrite this for better performance later.
3156
9.80k
      __m256i filt_256[2];
3157
9.80k
      prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs_256);
3158
3159
9.80k
      filt_256[0] = _mm256_loadu_si256((__m256i const *)filt1_global_avx2);
3160
9.80k
      filt_256[1] = _mm256_loadu_si256((__m256i const *)filt2_global_avx2);
3161
45.6k
      for (int i = 0; i < h; i += 2) {
3162
35.8k
        const __m256i data = _mm256_permute2x128_si256(
3163
35.8k
            _mm256_castsi128_si256(
3164
35.8k
                _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))),
3165
35.8k
            _mm256_castsi128_si256(_mm_loadu_si128(
3166
35.8k
                (__m128i *)(&src_ptr[i * src_stride + src_stride]))),
3167
35.8k
            0x20);
3168
3169
35.8k
        __m256i res_16b = convolve_lowbd_x_4tap(data, coeffs_256 + 1, filt_256);
3170
35.8k
        res_16b = sr_x_round_avx2(res_16b);
3171
3172
35.8k
        __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
3173
3174
35.8k
        const __m128i res_0 = _mm256_castsi256_si128(res_8b);
3175
35.8k
        const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);
3176
3177
35.8k
        _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0);
3178
35.8k
        _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1);
3179
35.8k
      }
3180
9.80k
    } else {
3181
8.93k
      assert(!(w % 16));
3182
      // TODO(chiyotsai@google.com): Reuse the old SIMD code here. Need to
3183
      // rewrite this for better performance later.
3184
8.93k
      __m256i filt_256[2];
3185
8.93k
      prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs_256);
3186
8.93k
      filt_256[0] = _mm256_loadu_si256((__m256i const *)filt1_global_avx2);
3187
8.93k
      filt_256[1] = _mm256_loadu_si256((__m256i const *)filt2_global_avx2);
3188
3189
245k
      for (int i = 0; i < h; ++i) {
3190
1.18M
        for (int j = 0; j < w; j += 16) {
3191
          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17
3192
          // 18 19 20 21 22 23
3193
952k
          const __m256i data = _mm256_inserti128_si256(
3194
952k
              _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]),
3195
952k
              _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]),
3196
952k
              1);
3197
3198
952k
          __m256i res_16b =
3199
952k
              convolve_lowbd_x_4tap(data, coeffs_256 + 1, filt_256);
3200
952k
          res_16b = sr_x_round_avx2(res_16b);
3201
3202
          /* rounding code */
3203
          // 8 bit conversion and saturation to uint8
3204
952k
          __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
3205
3206
          // Store values into the destination buffer
3207
          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
3208
952k
          res_8b = _mm256_permute4x64_epi64(res_8b, 216);
3209
952k
          __m128i res = _mm256_castsi256_si128(res_8b);
3210
952k
          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
3211
952k
        }
3212
236k
      }
3213
8.93k
    }
3214
326k
  } else {
3215
326k
    __m256i filt_256[4];
3216
3217
326k
    filt_256[0] = _mm256_loadu_si256((__m256i const *)filt1_global_avx2);
3218
326k
    filt_256[1] = _mm256_loadu_si256((__m256i const *)filt2_global_avx2);
3219
326k
    filt_256[2] = _mm256_loadu_si256((__m256i const *)filt3_global_avx2);
3220
3221
326k
    if (horz_tap == 6) {
3222
      // horz_filt as 6 tap
3223
305k
      const uint8_t *src_ptr = src - 2;
3224
3225
305k
      prepare_half_coeffs_6tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);
3226
3227
305k
      if (w == 8) {
3228
636k
        do {
3229
636k
          const __m256i res = x_convolve_6tap_8x2_avx2(src_ptr, src_stride,
3230
636k
                                                       coeffs_256, filt_256);
3231
636k
          sr_x_round_store_8x2_avx2(res, dst, dst_stride);
3232
636k
          src_ptr += 2 * src_stride;
3233
636k
          dst += 2 * dst_stride;
3234
636k
          y -= 2;
3235
636k
        } while (y);
3236
173k
      } else if (w == 16) {
3237
522k
        do {
3238
522k
          __m256i r[2];
3239
3240
522k
          x_convolve_6tap_16x2_avx2(src_ptr, src_stride, coeffs_256, filt_256,
3241
522k
                                    r);
3242
522k
          sr_x_round_store_16x2_avx2(r, dst, dst_stride);
3243
522k
          src_ptr += 2 * src_stride;
3244
522k
          dst += 2 * dst_stride;
3245
522k
          y -= 2;
3246
522k
        } while (y);
3247
103k
      } else if (w == 32) {
3248
493k
        do {
3249
493k
          sr_x_6tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
3250
493k
          src_ptr += src_stride;
3251
493k
          dst += dst_stride;
3252
493k
        } while (--y);
3253
25.2k
      } else if (w == 64) {
3254
189k
        do {
3255
189k
          sr_x_6tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
3256
189k
          sr_x_6tap_32_avx2(src_ptr + 32, coeffs_256, filt_256, dst + 32);
3257
189k
          src_ptr += src_stride;
3258
189k
          dst += dst_stride;
3259
189k
        } while (--y);
3260
3.92k
      } else {
3261
546
        assert(w == 128);
3262
3263
68.5k
        do {
3264
68.5k
          sr_x_6tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
3265
68.5k
          sr_x_6tap_32_avx2(src_ptr + 1 * 32, coeffs_256, filt_256,
3266
68.5k
                            dst + 1 * 32);
3267
68.5k
          sr_x_6tap_32_avx2(src_ptr + 2 * 32, coeffs_256, filt_256,
3268
68.5k
                            dst + 2 * 32);
3269
68.5k
          sr_x_6tap_32_avx2(src_ptr + 3 * 32, coeffs_256, filt_256,
3270
68.5k
                            dst + 3 * 32);
3271
68.5k
          src_ptr += src_stride;
3272
68.5k
          dst += dst_stride;
3273
68.5k
        } while (--y);
3274
610
      }
3275
305k
    } else if (horz_tap == 8) {
3276
      // horz_filt as 8 tap
3277
20.9k
      const uint8_t *src_ptr = src - 3;
3278
3279
20.9k
      filt_256[3] = _mm256_loadu_si256((__m256i const *)filt4_global_avx2);
3280
3281
20.9k
      prepare_half_coeffs_8tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);
3282
3283
20.9k
      if (w == 8) {
3284
43.6k
        do {
3285
43.6k
          const __m256i res = x_convolve_8tap_8x2_avx2(src_ptr, src_stride,
3286
43.6k
                                                       coeffs_256, filt_256);
3287
43.6k
          sr_x_round_store_8x2_avx2(res, dst, dst_stride);
3288
43.6k
          src_ptr += 2 * src_stride;
3289
43.6k
          dst += 2 * dst_stride;
3290
43.6k
          y -= 2;
3291
43.6k
        } while (y);
3292
11.6k
      } else if (w == 16) {
3293
37.1k
        do {
3294
37.1k
          __m256i r[2];
3295
3296
37.1k
          x_convolve_8tap_16x2_avx2(src_ptr, src_stride, coeffs_256, filt_256,
3297
37.1k
                                    r);
3298
37.1k
          sr_x_round_store_16x2_avx2(r, dst, dst_stride);
3299
37.1k
          src_ptr += 2 * src_stride;
3300
37.1k
          dst += 2 * dst_stride;
3301
37.1k
          y -= 2;
3302
37.1k
        } while (y);
3303
6.46k
      } else if (w == 32) {
3304
43.7k
        do {
3305
43.7k
          sr_x_8tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
3306
43.7k
          src_ptr += src_stride;
3307
43.7k
          dst += dst_stride;
3308
43.7k
        } while (--y);
3309
1.88k
      } else if (w == 64) {
3310
33.8k
        do {
3311
33.8k
          sr_x_8tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
3312
33.8k
          sr_x_8tap_32_avx2(src_ptr + 32, coeffs_256, filt_256, dst + 32);
3313
33.8k
          src_ptr += src_stride;
3314
33.8k
          dst += dst_stride;
3315
33.8k
        } while (--y);
3316
743
      } else {
3317
177
        assert(w == 128);
3318
3319
17.7k
        do {
3320
17.7k
          sr_x_8tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
3321
17.7k
          sr_x_8tap_32_avx2(src_ptr + 1 * 32, coeffs_256, filt_256,
3322
17.7k
                            dst + 1 * 32);
3323
17.7k
          sr_x_8tap_32_avx2(src_ptr + 2 * 32, coeffs_256, filt_256,
3324
17.7k
                            dst + 2 * 32);
3325
17.7k
          sr_x_8tap_32_avx2(src_ptr + 3 * 32, coeffs_256, filt_256,
3326
17.7k
                            dst + 3 * 32);
3327
17.7k
          src_ptr += src_stride;
3328
17.7k
          dst += dst_stride;
3329
17.7k
        } while (--y);
3330
177
      }
3331
20.9k
    }
3332
326k
  }
3333
627k
}
3334
3335
#endif  // THIRD_PARTY_SVT_AV1_CONVOLVE_AVX2_H_