Coverage Report

Created: 2026-02-14 07:00

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/aom/third_party/SVT-AV1/convolve_avx2.h
Line
Count
Source
1
/*
2
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved
3
 *
4
 * This source code is subject to the terms of the BSD 2 Clause License and
5
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6
 * was not distributed with this source code in the LICENSE file, you can
7
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8
 * Media Patent License 1.0 was not distributed with this source code in the
9
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10
 */
11
12
#ifndef THIRD_PARTY_SVT_AV1_CONVOLVE_AVX2_H_
13
#define THIRD_PARTY_SVT_AV1_CONVOLVE_AVX2_H_
14
15
#include "EbMemory_AVX2.h"
16
#include "EbMemory_SSE4_1.h"
17
#include "synonyms.h"
18
19
#include "aom_dsp/aom_filter.h"
20
#include "aom_dsp/x86/convolve_avx2.h"
21
#include "aom_dsp/x86/mem_sse2.h"
22
23
static inline void populate_coeffs_4tap_avx2(const __m128i coeffs_128,
24
229k
                                             __m256i coeffs[2]) {
25
229k
  const __m256i coeffs_256 = _mm256_broadcastsi128_si256(coeffs_128);
26
27
  // coeffs 2 3 2 3 2 3 2 3
28
229k
  coeffs[0] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0604u));
29
  // coeffs 4 5 4 5 4 5 4 5
30
229k
  coeffs[1] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0a08u));
31
229k
}
convolve_2d_avx2.c:populate_coeffs_4tap_avx2
Line
Count
Source
24
64.7k
                                             __m256i coeffs[2]) {
25
64.7k
  const __m256i coeffs_256 = _mm256_broadcastsi128_si256(coeffs_128);
26
27
  // coeffs 2 3 2 3 2 3 2 3
28
64.7k
  coeffs[0] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0604u));
29
  // coeffs 4 5 4 5 4 5 4 5
30
64.7k
  coeffs[1] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0a08u));
31
64.7k
}
convolve_avx2.c:populate_coeffs_4tap_avx2
Line
Count
Source
24
165k
                                             __m256i coeffs[2]) {
25
165k
  const __m256i coeffs_256 = _mm256_broadcastsi128_si256(coeffs_128);
26
27
  // coeffs 2 3 2 3 2 3 2 3
28
165k
  coeffs[0] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0604u));
29
  // coeffs 4 5 4 5 4 5 4 5
30
165k
  coeffs[1] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0a08u));
31
165k
}
32
33
static inline void populate_coeffs_6tap_avx2(const __m128i coeffs_128,
34
1.48M
                                             __m256i coeffs[3]) {
35
1.48M
  const __m256i coeffs_256 = _mm256_broadcastsi128_si256(coeffs_128);
36
37
  // coeffs 1 2 1 2 1 2 1 2
38
1.48M
  coeffs[0] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0402u));
39
  // coeffs 3 4 3 4 3 4 3 4
40
1.48M
  coeffs[1] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0806u));
41
  // coeffs 5 6 5 6 5 6 5 6
42
1.48M
  coeffs[2] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0C0Au));
43
1.48M
}
convolve_2d_avx2.c:populate_coeffs_6tap_avx2
Line
Count
Source
34
896k
                                             __m256i coeffs[3]) {
35
896k
  const __m256i coeffs_256 = _mm256_broadcastsi128_si256(coeffs_128);
36
37
  // coeffs 1 2 1 2 1 2 1 2
38
896k
  coeffs[0] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0402u));
39
  // coeffs 3 4 3 4 3 4 3 4
40
896k
  coeffs[1] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0806u));
41
  // coeffs 5 6 5 6 5 6 5 6
42
896k
  coeffs[2] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0C0Au));
43
896k
}
convolve_avx2.c:populate_coeffs_6tap_avx2
Line
Count
Source
34
584k
                                             __m256i coeffs[3]) {
35
584k
  const __m256i coeffs_256 = _mm256_broadcastsi128_si256(coeffs_128);
36
37
  // coeffs 1 2 1 2 1 2 1 2
38
584k
  coeffs[0] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0402u));
39
  // coeffs 3 4 3 4 3 4 3 4
40
584k
  coeffs[1] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0806u));
41
  // coeffs 5 6 5 6 5 6 5 6
42
584k
  coeffs[2] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0C0Au));
43
584k
}
44
45
static inline void populate_coeffs_8tap_avx2(const __m128i coeffs_128,
46
88.1k
                                             __m256i coeffs[4]) {
47
88.1k
  const __m256i coeffs_256 = _mm256_broadcastsi128_si256(coeffs_128);
48
49
  // coeffs 0 1 0 1 0 1 0 1
50
88.1k
  coeffs[0] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0200u));
51
  // coeffs 2 3 2 3 2 3 2 3
52
88.1k
  coeffs[1] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0604u));
53
  // coeffs 4 5 4 5 4 5 4 5
54
88.1k
  coeffs[2] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0a08u));
55
  // coeffs 6 7 6 7 6 7 6 7
56
88.1k
  coeffs[3] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0e0cu));
57
88.1k
}
convolve_2d_avx2.c:populate_coeffs_8tap_avx2
Line
Count
Source
46
64.3k
                                             __m256i coeffs[4]) {
47
64.3k
  const __m256i coeffs_256 = _mm256_broadcastsi128_si256(coeffs_128);
48
49
  // coeffs 0 1 0 1 0 1 0 1
50
64.3k
  coeffs[0] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0200u));
51
  // coeffs 2 3 2 3 2 3 2 3
52
64.3k
  coeffs[1] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0604u));
53
  // coeffs 4 5 4 5 4 5 4 5
54
64.3k
  coeffs[2] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0a08u));
55
  // coeffs 6 7 6 7 6 7 6 7
56
64.3k
  coeffs[3] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0e0cu));
57
64.3k
}
convolve_avx2.c:populate_coeffs_8tap_avx2
Line
Count
Source
46
23.7k
                                             __m256i coeffs[4]) {
47
23.7k
  const __m256i coeffs_256 = _mm256_broadcastsi128_si256(coeffs_128);
48
49
  // coeffs 0 1 0 1 0 1 0 1
50
23.7k
  coeffs[0] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0200u));
51
  // coeffs 2 3 2 3 2 3 2 3
52
23.7k
  coeffs[1] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0604u));
53
  // coeffs 4 5 4 5 4 5 4 5
54
23.7k
  coeffs[2] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0a08u));
55
  // coeffs 6 7 6 7 6 7 6 7
56
23.7k
  coeffs[3] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0e0cu));
57
23.7k
}
58
59
static inline void prepare_half_coeffs_2tap_ssse3(
60
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
61
56.8k
    __m128i *const coeffs /* [1] */) {
62
56.8k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
63
56.8k
      filter_params, subpel_q4 & SUBPEL_MASK);
64
56.8k
  const __m128i coeffs_8 = _mm_cvtsi32_si128(loadu_int32(filter + 3));
65
66
  // right shift all filter co-efficients by 1 to reduce the bits required.
67
  // This extra right shift will be taken care of at the end while rounding
68
  // the result.
69
  // Since all filter co-efficients are even, this change will not affect the
70
  // end result
71
56.8k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
72
56.8k
                            _mm_set1_epi16((short)0xffff)));
73
74
56.8k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
75
76
  // coeffs 3 4 3 4 3 4 3 4
77
56.8k
  *coeffs = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0200u));
78
56.8k
}
convolve_2d_avx2.c:prepare_half_coeffs_2tap_ssse3
Line
Count
Source
61
37.6k
    __m128i *const coeffs /* [1] */) {
62
37.6k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
63
37.6k
      filter_params, subpel_q4 & SUBPEL_MASK);
64
37.6k
  const __m128i coeffs_8 = _mm_cvtsi32_si128(loadu_int32(filter + 3));
65
66
  // right shift all filter co-efficients by 1 to reduce the bits required.
67
  // This extra right shift will be taken care of at the end while rounding
68
  // the result.
69
  // Since all filter co-efficients are even, this change will not affect the
70
  // end result
71
37.6k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
72
37.6k
                            _mm_set1_epi16((short)0xffff)));
73
74
37.6k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
75
76
  // coeffs 3 4 3 4 3 4 3 4
77
37.6k
  *coeffs = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0200u));
78
37.6k
}
convolve_avx2.c:prepare_half_coeffs_2tap_ssse3
Line
Count
Source
61
19.2k
    __m128i *const coeffs /* [1] */) {
62
19.2k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
63
19.2k
      filter_params, subpel_q4 & SUBPEL_MASK);
64
19.2k
  const __m128i coeffs_8 = _mm_cvtsi32_si128(loadu_int32(filter + 3));
65
66
  // right shift all filter co-efficients by 1 to reduce the bits required.
67
  // This extra right shift will be taken care of at the end while rounding
68
  // the result.
69
  // Since all filter co-efficients are even, this change will not affect the
70
  // end result
71
19.2k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
72
19.2k
                            _mm_set1_epi16((short)0xffff)));
73
74
19.2k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
75
76
  // coeffs 3 4 3 4 3 4 3 4
77
19.2k
  *coeffs = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0200u));
78
19.2k
}
79
80
static inline void prepare_half_coeffs_4tap_ssse3(
81
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
82
1.25M
    __m128i *const coeffs /* [2] */) {
83
1.25M
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
84
1.25M
      filter_params, subpel_q4 & SUBPEL_MASK);
85
1.25M
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
86
87
  // right shift all filter co-efficients by 1 to reduce the bits required.
88
  // This extra right shift will be taken care of at the end while rounding
89
  // the result.
90
  // Since all filter co-efficients are even, this change will not affect the
91
  // end result
92
1.25M
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
93
1.25M
                            _mm_set1_epi16((short)0xffff)));
94
95
1.25M
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
96
97
  // coeffs 2 3 2 3 2 3 2 3
98
1.25M
  coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0604u));
99
  // coeffs 4 5 4 5 4 5 4 5
100
1.25M
  coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0a08u));
101
1.25M
}
convolve_2d_avx2.c:prepare_half_coeffs_4tap_ssse3
Line
Count
Source
82
781k
    __m128i *const coeffs /* [2] */) {
83
781k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
84
781k
      filter_params, subpel_q4 & SUBPEL_MASK);
85
781k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
86
87
  // right shift all filter co-efficients by 1 to reduce the bits required.
88
  // This extra right shift will be taken care of at the end while rounding
89
  // the result.
90
  // Since all filter co-efficients are even, this change will not affect the
91
  // end result
92
781k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
93
781k
                            _mm_set1_epi16((short)0xffff)));
94
95
781k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
96
97
  // coeffs 2 3 2 3 2 3 2 3
98
781k
  coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0604u));
99
  // coeffs 4 5 4 5 4 5 4 5
100
781k
  coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0a08u));
101
781k
}
convolve_avx2.c:prepare_half_coeffs_4tap_ssse3
Line
Count
Source
82
470k
    __m128i *const coeffs /* [2] */) {
83
470k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
84
470k
      filter_params, subpel_q4 & SUBPEL_MASK);
85
470k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
86
87
  // right shift all filter co-efficients by 1 to reduce the bits required.
88
  // This extra right shift will be taken care of at the end while rounding
89
  // the result.
90
  // Since all filter co-efficients are even, this change will not affect the
91
  // end result
92
470k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
93
470k
                            _mm_set1_epi16((short)0xffff)));
94
95
470k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
96
97
  // coeffs 2 3 2 3 2 3 2 3
98
470k
  coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0604u));
99
  // coeffs 4 5 4 5 4 5 4 5
100
470k
  coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0a08u));
101
470k
}
102
103
static inline void prepare_half_coeffs_6tap_ssse3(
104
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
105
86.9k
    __m128i *const coeffs /* [3] */) {
106
86.9k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
107
86.9k
      filter_params, subpel_q4 & SUBPEL_MASK);
108
86.9k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
109
110
  // right shift all filter co-efficients by 1 to reduce the bits required.
111
  // This extra right shift will be taken care of at the end while rounding
112
  // the result.
113
  // Since all filter co-efficients are even, this change will not affect the
114
  // end result
115
86.9k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
116
86.9k
                            _mm_set1_epi16((short)0xffff)));
117
118
86.9k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
119
120
  // coeffs 1 2 1 2 1 2 1 2
121
86.9k
  coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0402u));
122
  // coeffs 3 4 3 4 3 4 3 4
123
86.9k
  coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0806u));
124
  // coeffs 5 6 5 6 5 6 5 6
125
86.9k
  coeffs[2] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0C0Au));
126
86.9k
}
Unexecuted instantiation: convolve_2d_avx2.c:prepare_half_coeffs_6tap_ssse3
convolve_avx2.c:prepare_half_coeffs_6tap_ssse3
Line
Count
Source
105
86.9k
    __m128i *const coeffs /* [3] */) {
106
86.9k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
107
86.9k
      filter_params, subpel_q4 & SUBPEL_MASK);
108
86.9k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
109
110
  // right shift all filter co-efficients by 1 to reduce the bits required.
111
  // This extra right shift will be taken care of at the end while rounding
112
  // the result.
113
  // Since all filter co-efficients are even, this change will not affect the
114
  // end result
115
86.9k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
116
86.9k
                            _mm_set1_epi16((short)0xffff)));
117
118
86.9k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
119
120
  // coeffs 1 2 1 2 1 2 1 2
121
86.9k
  coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0402u));
122
  // coeffs 3 4 3 4 3 4 3 4
123
86.9k
  coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0806u));
124
  // coeffs 5 6 5 6 5 6 5 6
125
86.9k
  coeffs[2] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0C0Au));
126
86.9k
}
127
128
static inline void prepare_half_coeffs_8tap_ssse3(
129
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
130
6.74k
    __m128i *const coeffs /* [4] */) {
131
6.74k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
132
6.74k
      filter_params, subpel_q4 & SUBPEL_MASK);
133
6.74k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
134
135
  // right shift all filter co-efficients by 1 to reduce the bits required.
136
  // This extra right shift will be taken care of at the end while rounding
137
  // the result.
138
  // Since all filter co-efficients are even, this change will not affect the
139
  // end result
140
6.74k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
141
6.74k
                            _mm_set1_epi16((short)0xffff)));
142
143
6.74k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
144
145
  // coeffs 0 1 0 1 0 1 0 1
146
6.74k
  coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0200u));
147
  // coeffs 2 3 2 3 2 3 2 3
148
6.74k
  coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0604u));
149
  // coeffs 4 5 4 5 4 5 4 5
150
6.74k
  coeffs[2] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0a08u));
151
  // coeffs 6 7 6 7 6 7 6 7
152
6.74k
  coeffs[3] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0e0cu));
153
6.74k
}
Unexecuted instantiation: convolve_2d_avx2.c:prepare_half_coeffs_8tap_ssse3
convolve_avx2.c:prepare_half_coeffs_8tap_ssse3
Line
Count
Source
130
6.74k
    __m128i *const coeffs /* [4] */) {
131
6.74k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
132
6.74k
      filter_params, subpel_q4 & SUBPEL_MASK);
133
6.74k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
134
135
  // right shift all filter co-efficients by 1 to reduce the bits required.
136
  // This extra right shift will be taken care of at the end while rounding
137
  // the result.
138
  // Since all filter co-efficients are even, this change will not affect the
139
  // end result
140
6.74k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
141
6.74k
                            _mm_set1_epi16((short)0xffff)));
142
143
6.74k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
144
145
  // coeffs 0 1 0 1 0 1 0 1
146
6.74k
  coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0200u));
147
  // coeffs 2 3 2 3 2 3 2 3
148
6.74k
  coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0604u));
149
  // coeffs 4 5 4 5 4 5 4 5
150
6.74k
  coeffs[2] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0a08u));
151
  // coeffs 6 7 6 7 6 7 6 7
152
6.74k
  coeffs[3] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0e0cu));
153
6.74k
}
154
155
static inline void prepare_half_coeffs_2tap_avx2(
156
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
157
16.5k
    __m256i *const coeffs /* [1] */) {
158
16.5k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
159
16.5k
      filter_params, subpel_q4 & SUBPEL_MASK);
160
16.5k
  const __m128i coeffs_8 = _mm_cvtsi32_si128(loadu_int32(filter + 3));
161
16.5k
  const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);
162
163
  // right shift all filter co-efficients by 1 to reduce the bits required.
164
  // This extra right shift will be taken care of at the end while rounding
165
  // the result.
166
  // Since all filter co-efficients are even, this change will not affect the
167
  // end result
168
16.5k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
169
16.5k
                            _mm_set1_epi16((short)0xffff)));
170
171
16.5k
  const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);
172
173
  // coeffs 3 4 3 4 3 4 3 4
174
16.5k
  *coeffs = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0200u));
175
16.5k
}
convolve_2d_avx2.c:prepare_half_coeffs_2tap_avx2
Line
Count
Source
157
9.90k
    __m256i *const coeffs /* [1] */) {
158
9.90k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
159
9.90k
      filter_params, subpel_q4 & SUBPEL_MASK);
160
9.90k
  const __m128i coeffs_8 = _mm_cvtsi32_si128(loadu_int32(filter + 3));
161
9.90k
  const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);
162
163
  // right shift all filter co-efficients by 1 to reduce the bits required.
164
  // This extra right shift will be taken care of at the end while rounding
165
  // the result.
166
  // Since all filter co-efficients are even, this change will not affect the
167
  // end result
168
9.90k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
169
9.90k
                            _mm_set1_epi16((short)0xffff)));
170
171
9.90k
  const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);
172
173
  // coeffs 3 4 3 4 3 4 3 4
174
9.90k
  *coeffs = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0200u));
175
9.90k
}
convolve_avx2.c:prepare_half_coeffs_2tap_avx2
Line
Count
Source
157
6.68k
    __m256i *const coeffs /* [1] */) {
158
6.68k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
159
6.68k
      filter_params, subpel_q4 & SUBPEL_MASK);
160
6.68k
  const __m128i coeffs_8 = _mm_cvtsi32_si128(loadu_int32(filter + 3));
161
6.68k
  const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);
162
163
  // right shift all filter co-efficients by 1 to reduce the bits required.
164
  // This extra right shift will be taken care of at the end while rounding
165
  // the result.
166
  // Since all filter co-efficients are even, this change will not affect the
167
  // end result
168
6.68k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
169
6.68k
                            _mm_set1_epi16((short)0xffff)));
170
171
6.68k
  const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);
172
173
  // coeffs 3 4 3 4 3 4 3 4
174
6.68k
  *coeffs = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0200u));
175
6.68k
}
176
177
static inline void prepare_half_coeffs_4tap_avx2(
178
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
179
229k
    __m256i *const coeffs /* [2] */) {
180
229k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
181
229k
      filter_params, subpel_q4 & SUBPEL_MASK);
182
229k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
183
184
  // right shift all filter co-efficients by 1 to reduce the bits required.
185
  // This extra right shift will be taken care of at the end while rounding
186
  // the result.
187
  // Since all filter co-efficients are even, this change will not affect the
188
  // end result
189
229k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
190
229k
                            _mm_set1_epi16((short)0xffff)));
191
229k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
192
229k
  populate_coeffs_4tap_avx2(coeffs_1, coeffs);
193
229k
}
convolve_2d_avx2.c:prepare_half_coeffs_4tap_avx2
Line
Count
Source
179
64.7k
    __m256i *const coeffs /* [2] */) {
180
64.7k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
181
64.7k
      filter_params, subpel_q4 & SUBPEL_MASK);
182
64.7k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
183
184
  // right shift all filter co-efficients by 1 to reduce the bits required.
185
  // This extra right shift will be taken care of at the end while rounding
186
  // the result.
187
  // Since all filter co-efficients are even, this change will not affect the
188
  // end result
189
64.7k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
190
64.7k
                            _mm_set1_epi16((short)0xffff)));
191
64.7k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
192
64.7k
  populate_coeffs_4tap_avx2(coeffs_1, coeffs);
193
64.7k
}
convolve_avx2.c:prepare_half_coeffs_4tap_avx2
Line
Count
Source
179
165k
    __m256i *const coeffs /* [2] */) {
180
165k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
181
165k
      filter_params, subpel_q4 & SUBPEL_MASK);
182
165k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
183
184
  // right shift all filter co-efficients by 1 to reduce the bits required.
185
  // This extra right shift will be taken care of at the end while rounding
186
  // the result.
187
  // Since all filter co-efficients are even, this change will not affect the
188
  // end result
189
165k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
190
165k
                            _mm_set1_epi16((short)0xffff)));
191
165k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
192
165k
  populate_coeffs_4tap_avx2(coeffs_1, coeffs);
193
165k
}
194
195
static inline void prepare_half_coeffs_6tap_avx2(
196
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
197
1.48M
    __m256i *const coeffs /* [3] */) {
198
1.48M
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
199
1.48M
      filter_params, subpel_q4 & SUBPEL_MASK);
200
1.48M
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
201
202
  // right shift all filter co-efficients by 1 to reduce the bits required.
203
  // This extra right shift will be taken care of at the end while rounding
204
  // the result.
205
  // Since all filter co-efficients are even, this change will not affect the
206
  // end result
207
1.48M
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
208
1.48M
                            _mm_set1_epi16((short)0xffff)));
209
1.48M
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
210
1.48M
  populate_coeffs_6tap_avx2(coeffs_1, coeffs);
211
1.48M
}
convolve_2d_avx2.c:prepare_half_coeffs_6tap_avx2
Line
Count
Source
197
896k
    __m256i *const coeffs /* [3] */) {
198
896k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
199
896k
      filter_params, subpel_q4 & SUBPEL_MASK);
200
896k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
201
202
  // right shift all filter co-efficients by 1 to reduce the bits required.
203
  // This extra right shift will be taken care of at the end while rounding
204
  // the result.
205
  // Since all filter co-efficients are even, this change will not affect the
206
  // end result
207
896k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
208
896k
                            _mm_set1_epi16((short)0xffff)));
209
896k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
210
896k
  populate_coeffs_6tap_avx2(coeffs_1, coeffs);
211
896k
}
convolve_avx2.c:prepare_half_coeffs_6tap_avx2
Line
Count
Source
197
584k
    __m256i *const coeffs /* [3] */) {
198
584k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
199
584k
      filter_params, subpel_q4 & SUBPEL_MASK);
200
584k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
201
202
  // right shift all filter co-efficients by 1 to reduce the bits required.
203
  // This extra right shift will be taken care of at the end while rounding
204
  // the result.
205
  // Since all filter co-efficients are even, this change will not affect the
206
  // end result
207
584k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
208
584k
                            _mm_set1_epi16((short)0xffff)));
209
584k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
210
584k
  populate_coeffs_6tap_avx2(coeffs_1, coeffs);
211
584k
}
212
213
static inline void prepare_half_coeffs_8tap_avx2(
214
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
215
88.1k
    __m256i *const coeffs /* [4] */) {
216
88.1k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
217
88.1k
      filter_params, subpel_q4 & SUBPEL_MASK);
218
88.1k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
219
220
  // right shift all filter co-efficients by 1 to reduce the bits required.
221
  // This extra right shift will be taken care of at the end while rounding
222
  // the result.
223
  // Since all filter co-efficients are even, this change will not affect the
224
  // end result
225
88.1k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
226
88.1k
                            _mm_set1_epi16((short)0xffff)));
227
88.1k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
228
88.1k
  populate_coeffs_8tap_avx2(coeffs_1, coeffs);
229
88.1k
}
convolve_2d_avx2.c:prepare_half_coeffs_8tap_avx2
Line
Count
Source
215
64.3k
    __m256i *const coeffs /* [4] */) {
216
64.3k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
217
64.3k
      filter_params, subpel_q4 & SUBPEL_MASK);
218
64.3k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
219
220
  // right shift all filter co-efficients by 1 to reduce the bits required.
221
  // This extra right shift will be taken care of at the end while rounding
222
  // the result.
223
  // Since all filter co-efficients are even, this change will not affect the
224
  // end result
225
64.3k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
226
64.3k
                            _mm_set1_epi16((short)0xffff)));
227
64.3k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
228
64.3k
  populate_coeffs_8tap_avx2(coeffs_1, coeffs);
229
64.3k
}
convolve_avx2.c:prepare_half_coeffs_8tap_avx2
Line
Count
Source
215
23.7k
    __m256i *const coeffs /* [4] */) {
216
23.7k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
217
23.7k
      filter_params, subpel_q4 & SUBPEL_MASK);
218
23.7k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
219
220
  // right shift all filter co-efficients by 1 to reduce the bits required.
221
  // This extra right shift will be taken care of at the end while rounding
222
  // the result.
223
  // Since all filter co-efficients are even, this change will not affect the
224
  // end result
225
23.7k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
226
23.7k
                            _mm_set1_epi16((short)0xffff)));
227
23.7k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
228
23.7k
  populate_coeffs_8tap_avx2(coeffs_1, coeffs);
229
23.7k
}
230
231
static inline void prepare_coeffs_2tap_sse2(
232
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
233
14.3k
    __m128i *const coeffs /* [1] */) {
234
14.3k
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
235
14.3k
      filter_params, subpel_q4 & SUBPEL_MASK);
236
237
14.3k
  const __m128i coeff = _mm_cvtsi32_si128(loadu_int32(filter + 3));
238
239
  // coeffs 3 4 3 4 3 4 3 4
240
14.3k
  coeffs[0] = _mm_shuffle_epi32(coeff, 0x00);
241
14.3k
}
convolve_2d_avx2.c:prepare_coeffs_2tap_sse2
Line
Count
Source
233
14.3k
    __m128i *const coeffs /* [1] */) {
234
14.3k
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
235
14.3k
      filter_params, subpel_q4 & SUBPEL_MASK);
236
237
14.3k
  const __m128i coeff = _mm_cvtsi32_si128(loadu_int32(filter + 3));
238
239
  // coeffs 3 4 3 4 3 4 3 4
240
  coeffs[0] = _mm_shuffle_epi32(coeff, 0x00);
241
14.3k
}
Unexecuted instantiation: convolve_avx2.c:prepare_coeffs_2tap_sse2
242
243
static inline void prepare_coeffs_4tap_sse2(
244
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
245
98.3k
    __m128i *const coeffs /* [2] */) {
246
98.3k
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
247
98.3k
      filter_params, subpel_q4 & SUBPEL_MASK);
248
249
98.3k
  const __m128i coeff = _mm_loadu_si128((__m128i *)filter);
250
251
  // coeffs 2 3 2 3 2 3 2 3
252
98.3k
  coeffs[0] = _mm_shuffle_epi32(coeff, 0x55);
253
  // coeffs 4 5 4 5 4 5 4 5
254
98.3k
  coeffs[1] = _mm_shuffle_epi32(coeff, 0xaa);
255
98.3k
}
convolve_2d_avx2.c:prepare_coeffs_4tap_sse2
Line
Count
Source
245
98.3k
    __m128i *const coeffs /* [2] */) {
246
98.3k
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
247
98.3k
      filter_params, subpel_q4 & SUBPEL_MASK);
248
249
98.3k
  const __m128i coeff = _mm_loadu_si128((__m128i *)filter);
250
251
  // coeffs 2 3 2 3 2 3 2 3
252
98.3k
  coeffs[0] = _mm_shuffle_epi32(coeff, 0x55);
253
  // coeffs 4 5 4 5 4 5 4 5
254
  coeffs[1] = _mm_shuffle_epi32(coeff, 0xaa);
255
98.3k
}
Unexecuted instantiation: convolve_avx2.c:prepare_coeffs_4tap_sse2
256
257
static inline void prepare_coeffs_6tap_ssse3(
258
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
259
57.8k
    __m128i *const coeffs /* [3] */) {
260
57.8k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
261
57.8k
      filter_params, subpel_q4 & SUBPEL_MASK);
262
57.8k
  const __m128i coeff = _mm_loadu_si128((__m128i *)filter);
263
264
  // coeffs 1 2 1 2 1 2 1 2
265
57.8k
  coeffs[0] = _mm_shuffle_epi8(coeff, _mm_set1_epi32(0x05040302u));
266
  // coeffs 3 4 3 4 3 4 3 4
267
57.8k
  coeffs[1] = _mm_shuffle_epi8(coeff, _mm_set1_epi32(0x09080706u));
268
  // coeffs 5 6 5 6 5 6 5 6
269
57.8k
  coeffs[2] = _mm_shuffle_epi8(coeff, _mm_set1_epi32(0x0D0C0B0Au));
270
57.8k
}
convolve_2d_avx2.c:prepare_coeffs_6tap_ssse3
Line
Count
Source
259
57.8k
    __m128i *const coeffs /* [3] */) {
260
57.8k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
261
57.8k
      filter_params, subpel_q4 & SUBPEL_MASK);
262
57.8k
  const __m128i coeff = _mm_loadu_si128((__m128i *)filter);
263
264
  // coeffs 1 2 1 2 1 2 1 2
265
57.8k
  coeffs[0] = _mm_shuffle_epi8(coeff, _mm_set1_epi32(0x05040302u));
266
  // coeffs 3 4 3 4 3 4 3 4
267
57.8k
  coeffs[1] = _mm_shuffle_epi8(coeff, _mm_set1_epi32(0x09080706u));
268
  // coeffs 5 6 5 6 5 6 5 6
269
57.8k
  coeffs[2] = _mm_shuffle_epi8(coeff, _mm_set1_epi32(0x0D0C0B0Au));
270
57.8k
}
Unexecuted instantiation: convolve_avx2.c:prepare_coeffs_6tap_ssse3
271
272
static inline void prepare_coeffs_8tap_sse2(
273
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
274
2.77k
    __m128i *const coeffs /* [4] */) {
275
2.77k
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
276
2.77k
      filter_params, subpel_q4 & SUBPEL_MASK);
277
278
2.77k
  const __m128i coeff = _mm_loadu_si128((__m128i *)filter);
279
280
  // coeffs 0 1 0 1 0 1 0 1
281
2.77k
  coeffs[0] = _mm_shuffle_epi32(coeff, 0x00);
282
  // coeffs 2 3 2 3 2 3 2 3
283
2.77k
  coeffs[1] = _mm_shuffle_epi32(coeff, 0x55);
284
  // coeffs 4 5 4 5 4 5 4 5
285
2.77k
  coeffs[2] = _mm_shuffle_epi32(coeff, 0xaa);
286
  // coeffs 6 7 6 7 6 7 6 7
287
2.77k
  coeffs[3] = _mm_shuffle_epi32(coeff, 0xff);
288
2.77k
}
convolve_2d_avx2.c:prepare_coeffs_8tap_sse2
Line
Count
Source
274
2.77k
    __m128i *const coeffs /* [4] */) {
275
2.77k
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
276
2.77k
      filter_params, subpel_q4 & SUBPEL_MASK);
277
278
2.77k
  const __m128i coeff = _mm_loadu_si128((__m128i *)filter);
279
280
  // coeffs 0 1 0 1 0 1 0 1
281
2.77k
  coeffs[0] = _mm_shuffle_epi32(coeff, 0x00);
282
  // coeffs 2 3 2 3 2 3 2 3
283
2.77k
  coeffs[1] = _mm_shuffle_epi32(coeff, 0x55);
284
  // coeffs 4 5 4 5 4 5 4 5
285
2.77k
  coeffs[2] = _mm_shuffle_epi32(coeff, 0xaa);
286
  // coeffs 6 7 6 7 6 7 6 7
287
  coeffs[3] = _mm_shuffle_epi32(coeff, 0xff);
288
2.77k
}
Unexecuted instantiation: convolve_avx2.c:prepare_coeffs_8tap_sse2
289
290
static inline void prepare_coeffs_2tap_avx2(
291
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
292
12.6k
    __m256i *const coeffs /* [1] */) {
293
12.6k
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
294
12.6k
      filter_params, subpel_q4 & SUBPEL_MASK);
295
296
12.6k
  const __m128i coeff_8 = _mm_cvtsi32_si128(loadu_int32(filter + 3));
297
12.6k
  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
298
299
  // coeffs 3 4 3 4 3 4 3 4
300
12.6k
  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
301
12.6k
}
convolve_2d_avx2.c:prepare_coeffs_2tap_avx2
Line
Count
Source
292
12.6k
    __m256i *const coeffs /* [1] */) {
293
12.6k
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
294
12.6k
      filter_params, subpel_q4 & SUBPEL_MASK);
295
296
12.6k
  const __m128i coeff_8 = _mm_cvtsi32_si128(loadu_int32(filter + 3));
297
12.6k
  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
298
299
  // coeffs 3 4 3 4 3 4 3 4
300
  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
301
12.6k
}
Unexecuted instantiation: convolve_avx2.c:prepare_coeffs_2tap_avx2
302
303
static inline void prepare_coeffs_4tap_avx2(
304
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
305
887k
    __m256i *const coeffs /* [2] */) {
306
887k
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
307
887k
      filter_params, subpel_q4 & SUBPEL_MASK);
308
309
887k
  const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter);
310
887k
  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
311
312
  // coeffs 2 3 2 3 2 3 2 3
313
887k
  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x55);
314
  // coeffs 4 5 4 5 4 5 4 5
315
887k
  coeffs[1] = _mm256_shuffle_epi32(coeff, 0xaa);
316
887k
}
convolve_2d_avx2.c:prepare_coeffs_4tap_avx2
Line
Count
Source
305
887k
    __m256i *const coeffs /* [2] */) {
306
887k
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
307
887k
      filter_params, subpel_q4 & SUBPEL_MASK);
308
309
887k
  const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter);
310
887k
  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
311
312
  // coeffs 2 3 2 3 2 3 2 3
313
887k
  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x55);
314
  // coeffs 4 5 4 5 4 5 4 5
315
  coeffs[1] = _mm256_shuffle_epi32(coeff, 0xaa);
316
887k
}
Unexecuted instantiation: convolve_avx2.c:prepare_coeffs_4tap_avx2
317
318
static inline void prepare_coeffs_6tap_avx2(
319
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
320
702k
    __m256i *const coeffs /* [3]*/) {
321
702k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
322
702k
      filter_params, subpel_q4 & SUBPEL_MASK);
323
702k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
324
702k
  const __m256i coeff = _mm256_broadcastsi128_si256(coeffs_8);
325
326
  // coeffs 1 2 1 2 1 2 1 2
327
702k
  coeffs[0] = _mm256_shuffle_epi8(coeff, _mm256_set1_epi32(0x05040302u));
328
  // coeffs 3 4 3 4 3 4 3 4
329
702k
  coeffs[1] = _mm256_shuffle_epi8(coeff, _mm256_set1_epi32(0x09080706u));
330
  // coeffs 5 6 5 6 5 6 5 6
331
702k
  coeffs[2] = _mm256_shuffle_epi8(coeff, _mm256_set1_epi32(0x0D0C0B0Au));
332
702k
}
convolve_2d_avx2.c:prepare_coeffs_6tap_avx2
Line
Count
Source
320
702k
    __m256i *const coeffs /* [3]*/) {
321
702k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
322
702k
      filter_params, subpel_q4 & SUBPEL_MASK);
323
702k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
324
702k
  const __m256i coeff = _mm256_broadcastsi128_si256(coeffs_8);
325
326
  // coeffs 1 2 1 2 1 2 1 2
327
702k
  coeffs[0] = _mm256_shuffle_epi8(coeff, _mm256_set1_epi32(0x05040302u));
328
  // coeffs 3 4 3 4 3 4 3 4
329
702k
  coeffs[1] = _mm256_shuffle_epi8(coeff, _mm256_set1_epi32(0x09080706u));
330
  // coeffs 5 6 5 6 5 6 5 6
331
702k
  coeffs[2] = _mm256_shuffle_epi8(coeff, _mm256_set1_epi32(0x0D0C0B0Au));
332
702k
}
Unexecuted instantiation: convolve_avx2.c:prepare_coeffs_6tap_avx2
333
334
static inline void prepare_coeffs_8tap_avx2(
335
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
336
58.6k
    __m256i *const coeffs /* [4] */) {
337
58.6k
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
338
58.6k
      filter_params, subpel_q4 & SUBPEL_MASK);
339
340
58.6k
  const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter);
341
58.6k
  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
342
343
  // coeffs 0 1 0 1 0 1 0 1
344
58.6k
  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
345
  // coeffs 2 3 2 3 2 3 2 3
346
58.6k
  coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55);
347
  // coeffs 4 5 4 5 4 5 4 5
348
58.6k
  coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa);
349
  // coeffs 6 7 6 7 6 7 6 7
350
58.6k
  coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff);
351
58.6k
}
convolve_2d_avx2.c:prepare_coeffs_8tap_avx2
Line
Count
Source
336
58.6k
    __m256i *const coeffs /* [4] */) {
337
58.6k
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
338
58.6k
      filter_params, subpel_q4 & SUBPEL_MASK);
339
340
58.6k
  const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter);
341
58.6k
  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
342
343
  // coeffs 0 1 0 1 0 1 0 1
344
58.6k
  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
345
  // coeffs 2 3 2 3 2 3 2 3
346
58.6k
  coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55);
347
  // coeffs 4 5 4 5 4 5 4 5
348
58.6k
  coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa);
349
  // coeffs 6 7 6 7 6 7 6 7
350
  coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff);
351
58.6k
}
Unexecuted instantiation: convolve_avx2.c:prepare_coeffs_8tap_avx2
352
353
static inline void load_16bit_5rows_avx2(const int16_t *const src,
354
                                         const ptrdiff_t stride,
355
0
                                         __m256i dst[5]) {
356
0
  dst[0] = _mm256_loadu_si256((__m256i *)(src + 0 * stride));
357
0
  dst[1] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
358
0
  dst[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
359
0
  dst[3] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
360
0
  dst[4] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
361
0
}
Unexecuted instantiation: convolve_2d_avx2.c:load_16bit_5rows_avx2
Unexecuted instantiation: convolve_avx2.c:load_16bit_5rows_avx2
362
363
static inline void load_16bit_7rows_avx2(const int16_t *const src,
364
                                         const ptrdiff_t stride,
365
94.9k
                                         __m256i dst[7]) {
366
94.9k
  dst[0] = _mm256_loadu_si256((__m256i *)(src + 0 * stride));
367
94.9k
  dst[1] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
368
94.9k
  dst[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
369
94.9k
  dst[3] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
370
94.9k
  dst[4] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
371
94.9k
  dst[5] = _mm256_loadu_si256((__m256i *)(src + 5 * stride));
372
94.9k
  dst[6] = _mm256_loadu_si256((__m256i *)(src + 6 * stride));
373
94.9k
}
convolve_2d_avx2.c:load_16bit_7rows_avx2
Line
Count
Source
365
94.9k
                                         __m256i dst[7]) {
366
94.9k
  dst[0] = _mm256_loadu_si256((__m256i *)(src + 0 * stride));
367
94.9k
  dst[1] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
368
94.9k
  dst[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
369
94.9k
  dst[3] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
370
94.9k
  dst[4] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
371
94.9k
  dst[5] = _mm256_loadu_si256((__m256i *)(src + 5 * stride));
372
94.9k
  dst[6] = _mm256_loadu_si256((__m256i *)(src + 6 * stride));
373
94.9k
}
Unexecuted instantiation: convolve_avx2.c:load_16bit_7rows_avx2
374
375
static AOM_FORCE_INLINE void load_16bit_8rows_avx2(const int16_t *const src,
376
                                                   const ptrdiff_t stride,
377
275
                                                   __m256i dst[8]) {
378
275
  dst[0] = _mm256_loadu_si256((__m256i *)(src + 0 * stride));
379
275
  dst[1] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
380
275
  dst[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
381
275
  dst[3] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
382
275
  dst[4] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
383
275
  dst[5] = _mm256_loadu_si256((__m256i *)(src + 5 * stride));
384
275
  dst[6] = _mm256_loadu_si256((__m256i *)(src + 6 * stride));
385
275
  dst[7] = _mm256_loadu_si256((__m256i *)(src + 7 * stride));
386
275
}
convolve_2d_avx2.c:load_16bit_8rows_avx2
Line
Count
Source
377
275
                                                   __m256i dst[8]) {
378
275
  dst[0] = _mm256_loadu_si256((__m256i *)(src + 0 * stride));
379
275
  dst[1] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
380
275
  dst[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
381
275
  dst[3] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
382
275
  dst[4] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
383
275
  dst[5] = _mm256_loadu_si256((__m256i *)(src + 5 * stride));
384
275
  dst[6] = _mm256_loadu_si256((__m256i *)(src + 6 * stride));
385
275
  dst[7] = _mm256_loadu_si256((__m256i *)(src + 7 * stride));
386
275
}
Unexecuted instantiation: convolve_avx2.c:load_16bit_8rows_avx2
387
388
static AOM_FORCE_INLINE void loadu_unpack_16bit_5rows_avx2(
389
    const int16_t *const src, const ptrdiff_t stride, __m256i s_256[5],
390
192k
    __m256i ss_256[5], __m256i tt_256[5]) {
391
192k
  s_256[0] = _mm256_loadu_si256((__m256i *)(src + 0 * stride));
392
192k
  s_256[1] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
393
192k
  s_256[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
394
192k
  s_256[3] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
395
192k
  s_256[4] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
396
397
192k
  ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
398
192k
  ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
399
192k
  ss_256[3] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
400
192k
  ss_256[4] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
401
402
192k
  tt_256[0] = _mm256_unpacklo_epi16(s_256[1], s_256[2]);
403
192k
  tt_256[1] = _mm256_unpacklo_epi16(s_256[3], s_256[4]);
404
192k
  tt_256[3] = _mm256_unpackhi_epi16(s_256[1], s_256[2]);
405
192k
  tt_256[4] = _mm256_unpackhi_epi16(s_256[3], s_256[4]);
406
192k
}
convolve_2d_avx2.c:loadu_unpack_16bit_5rows_avx2
Line
Count
Source
390
192k
    __m256i ss_256[5], __m256i tt_256[5]) {
391
192k
  s_256[0] = _mm256_loadu_si256((__m256i *)(src + 0 * stride));
392
192k
  s_256[1] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
393
192k
  s_256[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
394
192k
  s_256[3] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
395
192k
  s_256[4] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
396
397
192k
  ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
398
192k
  ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
399
192k
  ss_256[3] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
400
192k
  ss_256[4] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
401
402
192k
  tt_256[0] = _mm256_unpacklo_epi16(s_256[1], s_256[2]);
403
192k
  tt_256[1] = _mm256_unpacklo_epi16(s_256[3], s_256[4]);
404
192k
  tt_256[3] = _mm256_unpackhi_epi16(s_256[1], s_256[2]);
405
192k
  tt_256[4] = _mm256_unpackhi_epi16(s_256[3], s_256[4]);
406
192k
}
Unexecuted instantiation: convolve_avx2.c:loadu_unpack_16bit_5rows_avx2
407
408
static AOM_FORCE_INLINE void loadu_unpack_16bit_3rows_avx2(
409
    const int16_t *const src, const ptrdiff_t stride, __m256i s_256[3],
410
25.1k
    __m256i ss_256[3], __m256i tt_256[3]) {
411
25.1k
  s_256[0] = _mm256_loadu_si256((__m256i *)(src + 0 * stride));
412
25.1k
  s_256[1] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
413
25.1k
  s_256[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
414
415
25.1k
  ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
416
25.1k
  ss_256[2] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
417
418
25.1k
  tt_256[0] = _mm256_unpacklo_epi16(s_256[1], s_256[2]);
419
25.1k
  tt_256[2] = _mm256_unpackhi_epi16(s_256[1], s_256[2]);
420
25.1k
}
convolve_2d_avx2.c:loadu_unpack_16bit_3rows_avx2
Line
Count
Source
410
25.1k
    __m256i ss_256[3], __m256i tt_256[3]) {
411
25.1k
  s_256[0] = _mm256_loadu_si256((__m256i *)(src + 0 * stride));
412
25.1k
  s_256[1] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
413
25.1k
  s_256[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
414
415
25.1k
  ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
416
25.1k
  ss_256[2] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
417
418
25.1k
  tt_256[0] = _mm256_unpacklo_epi16(s_256[1], s_256[2]);
419
25.1k
  tt_256[2] = _mm256_unpackhi_epi16(s_256[1], s_256[2]);
420
25.1k
}
Unexecuted instantiation: convolve_avx2.c:loadu_unpack_16bit_3rows_avx2
421
422
static inline void convolve_8tap_unpack_avx2(const __m256i s[6],
423
192k
                                             __m256i ss[7]) {
424
192k
  ss[0] = _mm256_unpacklo_epi16(s[0], s[1]);
425
192k
  ss[1] = _mm256_unpacklo_epi16(s[2], s[3]);
426
192k
  ss[2] = _mm256_unpacklo_epi16(s[4], s[5]);
427
192k
  ss[4] = _mm256_unpackhi_epi16(s[0], s[1]);
428
192k
  ss[5] = _mm256_unpackhi_epi16(s[2], s[3]);
429
192k
  ss[6] = _mm256_unpackhi_epi16(s[4], s[5]);
430
192k
}
convolve_2d_avx2.c:convolve_8tap_unpack_avx2
Line
Count
Source
423
192k
                                             __m256i ss[7]) {
424
192k
  ss[0] = _mm256_unpacklo_epi16(s[0], s[1]);
425
192k
  ss[1] = _mm256_unpacklo_epi16(s[2], s[3]);
426
192k
  ss[2] = _mm256_unpacklo_epi16(s[4], s[5]);
427
192k
  ss[4] = _mm256_unpackhi_epi16(s[0], s[1]);
428
192k
  ss[5] = _mm256_unpackhi_epi16(s[2], s[3]);
429
192k
  ss[6] = _mm256_unpackhi_epi16(s[4], s[5]);
430
192k
}
Unexecuted instantiation: convolve_avx2.c:convolve_8tap_unpack_avx2
431
432
static inline __m128i convolve_2tap_ssse3(const __m128i ss[1],
433
295k
                                          const __m128i coeffs[1]) {
434
295k
  return _mm_maddubs_epi16(ss[0], coeffs[0]);
435
295k
}
convolve_2d_avx2.c:convolve_2tap_ssse3
Line
Count
Source
433
213k
                                          const __m128i coeffs[1]) {
434
213k
  return _mm_maddubs_epi16(ss[0], coeffs[0]);
435
213k
}
convolve_avx2.c:convolve_2tap_ssse3
Line
Count
Source
433
81.9k
                                          const __m128i coeffs[1]) {
434
81.9k
  return _mm_maddubs_epi16(ss[0], coeffs[0]);
435
81.9k
}
436
437
static inline __m128i convolve_4tap_ssse3(const __m128i ss[2],
438
5.44M
                                          const __m128i coeffs[2]) {
439
5.44M
  const __m128i res_23 = _mm_maddubs_epi16(ss[0], coeffs[0]);
440
5.44M
  const __m128i res_45 = _mm_maddubs_epi16(ss[1], coeffs[1]);
441
5.44M
  return _mm_add_epi16(res_23, res_45);
442
5.44M
}
convolve_2d_avx2.c:convolve_4tap_ssse3
Line
Count
Source
438
4.20M
                                          const __m128i coeffs[2]) {
439
4.20M
  const __m128i res_23 = _mm_maddubs_epi16(ss[0], coeffs[0]);
440
4.20M
  const __m128i res_45 = _mm_maddubs_epi16(ss[1], coeffs[1]);
441
4.20M
  return _mm_add_epi16(res_23, res_45);
442
4.20M
}
convolve_avx2.c:convolve_4tap_ssse3
Line
Count
Source
438
1.24M
                                          const __m128i coeffs[2]) {
439
1.24M
  const __m128i res_23 = _mm_maddubs_epi16(ss[0], coeffs[0]);
440
1.24M
  const __m128i res_45 = _mm_maddubs_epi16(ss[1], coeffs[1]);
441
1.24M
  return _mm_add_epi16(res_23, res_45);
442
1.24M
}
443
444
static inline __m128i convolve_6tap_ssse3(const __m128i ss[3],
445
450k
                                          const __m128i coeffs[3]) {
446
450k
  const __m128i res_12 = _mm_maddubs_epi16(ss[0], coeffs[0]);
447
450k
  const __m128i res_34 = _mm_maddubs_epi16(ss[1], coeffs[1]);
448
450k
  const __m128i res_56 = _mm_maddubs_epi16(ss[2], coeffs[2]);
449
450k
  const __m128i res_1256 = _mm_add_epi16(res_12, res_56);
450
450k
  return _mm_add_epi16(res_1256, res_34);
451
450k
}
Unexecuted instantiation: convolve_2d_avx2.c:convolve_6tap_ssse3
convolve_avx2.c:convolve_6tap_ssse3
Line
Count
Source
445
450k
                                          const __m128i coeffs[3]) {
446
450k
  const __m128i res_12 = _mm_maddubs_epi16(ss[0], coeffs[0]);
447
450k
  const __m128i res_34 = _mm_maddubs_epi16(ss[1], coeffs[1]);
448
450k
  const __m128i res_56 = _mm_maddubs_epi16(ss[2], coeffs[2]);
449
450k
  const __m128i res_1256 = _mm_add_epi16(res_12, res_56);
450
450k
  return _mm_add_epi16(res_1256, res_34);
451
450k
}
452
453
static inline __m128i convolve_8tap_ssse3(const __m128i ss[4],
454
32.6k
                                          const __m128i coeffs[4]) {
455
32.6k
  const __m128i res_01 = _mm_maddubs_epi16(ss[0], coeffs[0]);
456
32.6k
  const __m128i res_23 = _mm_maddubs_epi16(ss[1], coeffs[1]);
457
32.6k
  const __m128i res_45 = _mm_maddubs_epi16(ss[2], coeffs[2]);
458
32.6k
  const __m128i res_67 = _mm_maddubs_epi16(ss[3], coeffs[3]);
459
32.6k
  const __m128i res_0145 = _mm_add_epi16(res_01, res_45);
460
32.6k
  const __m128i res_2367 = _mm_add_epi16(res_23, res_67);
461
32.6k
  return _mm_add_epi16(res_0145, res_2367);
462
32.6k
}
Unexecuted instantiation: convolve_2d_avx2.c:convolve_8tap_ssse3
convolve_avx2.c:convolve_8tap_ssse3
Line
Count
Source
454
32.6k
                                          const __m128i coeffs[4]) {
455
32.6k
  const __m128i res_01 = _mm_maddubs_epi16(ss[0], coeffs[0]);
456
32.6k
  const __m128i res_23 = _mm_maddubs_epi16(ss[1], coeffs[1]);
457
32.6k
  const __m128i res_45 = _mm_maddubs_epi16(ss[2], coeffs[2]);
458
32.6k
  const __m128i res_67 = _mm_maddubs_epi16(ss[3], coeffs[3]);
459
32.6k
  const __m128i res_0145 = _mm_add_epi16(res_01, res_45);
460
32.6k
  const __m128i res_2367 = _mm_add_epi16(res_23, res_67);
461
32.6k
  return _mm_add_epi16(res_0145, res_2367);
462
32.6k
}
463
464
static inline __m256i convolve_2tap_avx2(const __m256i ss[1],
465
1.15M
                                         const __m256i coeffs[1]) {
466
1.15M
  return _mm256_maddubs_epi16(ss[0], coeffs[0]);
467
1.15M
}
convolve_2d_avx2.c:convolve_2tap_avx2
Line
Count
Source
465
658k
                                         const __m256i coeffs[1]) {
466
658k
  return _mm256_maddubs_epi16(ss[0], coeffs[0]);
467
658k
}
convolve_avx2.c:convolve_2tap_avx2
Line
Count
Source
465
499k
                                         const __m256i coeffs[1]) {
466
499k
  return _mm256_maddubs_epi16(ss[0], coeffs[0]);
467
499k
}
468
469
static inline __m256i convolve_4tap_avx2(const __m256i ss[2],
470
2.90M
                                         const __m256i coeffs[2]) {
471
2.90M
  const __m256i res_23 = _mm256_maddubs_epi16(ss[0], coeffs[0]);
472
2.90M
  const __m256i res_45 = _mm256_maddubs_epi16(ss[1], coeffs[1]);
473
2.90M
  return _mm256_add_epi16(res_23, res_45);
474
2.90M
}
convolve_2d_avx2.c:convolve_4tap_avx2
Line
Count
Source
470
1.70M
                                         const __m256i coeffs[2]) {
471
1.70M
  const __m256i res_23 = _mm256_maddubs_epi16(ss[0], coeffs[0]);
472
1.70M
  const __m256i res_45 = _mm256_maddubs_epi16(ss[1], coeffs[1]);
473
1.70M
  return _mm256_add_epi16(res_23, res_45);
474
1.70M
}
convolve_avx2.c:convolve_4tap_avx2
Line
Count
Source
470
1.20M
                                         const __m256i coeffs[2]) {
471
1.20M
  const __m256i res_23 = _mm256_maddubs_epi16(ss[0], coeffs[0]);
472
1.20M
  const __m256i res_45 = _mm256_maddubs_epi16(ss[1], coeffs[1]);
473
1.20M
  return _mm256_add_epi16(res_23, res_45);
474
1.20M
}
475
476
static inline __m256i convolve_6tap_avx2(const __m256i ss[3],
477
25.0M
                                         const __m256i coeffs[3]) {
478
25.0M
  const __m256i res_01 = _mm256_maddubs_epi16(ss[0], coeffs[0]);
479
25.0M
  const __m256i res_23 = _mm256_maddubs_epi16(ss[1], coeffs[1]);
480
25.0M
  const __m256i res_45 = _mm256_maddubs_epi16(ss[2], coeffs[2]);
481
25.0M
  const __m256i res_0145 = _mm256_add_epi16(res_01, res_45);
482
25.0M
  return _mm256_add_epi16(res_0145, res_23);
483
25.0M
}
convolve_2d_avx2.c:convolve_6tap_avx2
Line
Count
Source
477
15.5M
                                         const __m256i coeffs[3]) {
478
15.5M
  const __m256i res_01 = _mm256_maddubs_epi16(ss[0], coeffs[0]);
479
15.5M
  const __m256i res_23 = _mm256_maddubs_epi16(ss[1], coeffs[1]);
480
15.5M
  const __m256i res_45 = _mm256_maddubs_epi16(ss[2], coeffs[2]);
481
15.5M
  const __m256i res_0145 = _mm256_add_epi16(res_01, res_45);
482
15.5M
  return _mm256_add_epi16(res_0145, res_23);
483
15.5M
}
convolve_avx2.c:convolve_6tap_avx2
Line
Count
Source
477
9.54M
                                         const __m256i coeffs[3]) {
478
9.54M
  const __m256i res_01 = _mm256_maddubs_epi16(ss[0], coeffs[0]);
479
9.54M
  const __m256i res_23 = _mm256_maddubs_epi16(ss[1], coeffs[1]);
480
9.54M
  const __m256i res_45 = _mm256_maddubs_epi16(ss[2], coeffs[2]);
481
9.54M
  const __m256i res_0145 = _mm256_add_epi16(res_01, res_45);
482
9.54M
  return _mm256_add_epi16(res_0145, res_23);
483
9.54M
}
484
485
static inline __m256i convolve_8tap_avx2(const __m256i ss[4],
486
4.08M
                                         const __m256i coeffs[4]) {
487
4.08M
  const __m256i res_01 = _mm256_maddubs_epi16(ss[0], coeffs[0]);
488
4.08M
  const __m256i res_23 = _mm256_maddubs_epi16(ss[1], coeffs[1]);
489
4.08M
  const __m256i res_45 = _mm256_maddubs_epi16(ss[2], coeffs[2]);
490
4.08M
  const __m256i res_67 = _mm256_maddubs_epi16(ss[3], coeffs[3]);
491
4.08M
  const __m256i res_0145 = _mm256_add_epi16(res_01, res_45);
492
4.08M
  const __m256i res_2367 = _mm256_add_epi16(res_23, res_67);
493
4.08M
  return _mm256_add_epi16(res_0145, res_2367);
494
4.08M
}
convolve_2d_avx2.c:convolve_8tap_avx2
Line
Count
Source
486
3.40M
                                         const __m256i coeffs[4]) {
487
3.40M
  const __m256i res_01 = _mm256_maddubs_epi16(ss[0], coeffs[0]);
488
3.40M
  const __m256i res_23 = _mm256_maddubs_epi16(ss[1], coeffs[1]);
489
3.40M
  const __m256i res_45 = _mm256_maddubs_epi16(ss[2], coeffs[2]);
490
3.40M
  const __m256i res_67 = _mm256_maddubs_epi16(ss[3], coeffs[3]);
491
3.40M
  const __m256i res_0145 = _mm256_add_epi16(res_01, res_45);
492
3.40M
  const __m256i res_2367 = _mm256_add_epi16(res_23, res_67);
493
3.40M
  return _mm256_add_epi16(res_0145, res_2367);
494
3.40M
}
convolve_avx2.c:convolve_8tap_avx2
Line
Count
Source
486
686k
                                         const __m256i coeffs[4]) {
487
686k
  const __m256i res_01 = _mm256_maddubs_epi16(ss[0], coeffs[0]);
488
686k
  const __m256i res_23 = _mm256_maddubs_epi16(ss[1], coeffs[1]);
489
686k
  const __m256i res_45 = _mm256_maddubs_epi16(ss[2], coeffs[2]);
490
686k
  const __m256i res_67 = _mm256_maddubs_epi16(ss[3], coeffs[3]);
491
686k
  const __m256i res_0145 = _mm256_add_epi16(res_01, res_45);
492
686k
  const __m256i res_2367 = _mm256_add_epi16(res_23, res_67);
493
686k
  return _mm256_add_epi16(res_0145, res_2367);
494
686k
}
495
496
static inline __m128i convolve16_2tap_sse2(const __m128i ss[1],
497
63.6k
                                           const __m128i coeffs[1]) {
498
63.6k
  return _mm_madd_epi16(ss[0], coeffs[0]);
499
63.6k
}
convolve_2d_avx2.c:convolve16_2tap_sse2
Line
Count
Source
497
63.6k
                                           const __m128i coeffs[1]) {
498
63.6k
  return _mm_madd_epi16(ss[0], coeffs[0]);
499
63.6k
}
Unexecuted instantiation: convolve_avx2.c:convolve16_2tap_sse2
500
501
static inline __m128i convolve16_4tap_sse2(const __m128i ss[2],
502
169k
                                           const __m128i coeffs[2]) {
503
169k
  const __m128i res_01 = _mm_madd_epi16(ss[0], coeffs[0]);
504
169k
  const __m128i res_23 = _mm_madd_epi16(ss[1], coeffs[1]);
505
169k
  return _mm_add_epi32(res_01, res_23);
506
169k
}
convolve_2d_avx2.c:convolve16_4tap_sse2
Line
Count
Source
502
169k
                                           const __m128i coeffs[2]) {
503
169k
  const __m128i res_01 = _mm_madd_epi16(ss[0], coeffs[0]);
504
169k
  const __m128i res_23 = _mm_madd_epi16(ss[1], coeffs[1]);
505
169k
  return _mm_add_epi32(res_01, res_23);
506
169k
}
Unexecuted instantiation: convolve_avx2.c:convolve16_4tap_sse2
507
508
static inline __m128i convolve16_6tap_sse2(const __m128i ss[3],
509
231k
                                           const __m128i coeffs[3]) {
510
231k
  const __m128i res_01 = _mm_madd_epi16(ss[0], coeffs[0]);
511
231k
  const __m128i res_23 = _mm_madd_epi16(ss[1], coeffs[1]);
512
231k
  const __m128i res_45 = _mm_madd_epi16(ss[2], coeffs[2]);
513
231k
  const __m128i res_0123 = _mm_add_epi32(res_01, res_23);
514
231k
  return _mm_add_epi32(res_0123, res_45);
515
231k
}
convolve_2d_avx2.c:convolve16_6tap_sse2
Line
Count
Source
509
231k
                                           const __m128i coeffs[3]) {
510
231k
  const __m128i res_01 = _mm_madd_epi16(ss[0], coeffs[0]);
511
231k
  const __m128i res_23 = _mm_madd_epi16(ss[1], coeffs[1]);
512
231k
  const __m128i res_45 = _mm_madd_epi16(ss[2], coeffs[2]);
513
231k
  const __m128i res_0123 = _mm_add_epi32(res_01, res_23);
514
231k
  return _mm_add_epi32(res_0123, res_45);
515
231k
}
Unexecuted instantiation: convolve_avx2.c:convolve16_6tap_sse2
516
517
static inline __m128i convolve16_8tap_sse2(const __m128i ss[4],
518
11.1k
                                           const __m128i coeffs[4]) {
519
11.1k
  const __m128i res_01 = _mm_madd_epi16(ss[0], coeffs[0]);
520
11.1k
  const __m128i res_23 = _mm_madd_epi16(ss[1], coeffs[1]);
521
11.1k
  const __m128i res_45 = _mm_madd_epi16(ss[2], coeffs[2]);
522
11.1k
  const __m128i res_67 = _mm_madd_epi16(ss[3], coeffs[3]);
523
11.1k
  const __m128i res_0123 = _mm_add_epi32(res_01, res_23);
524
11.1k
  const __m128i res_4567 = _mm_add_epi32(res_45, res_67);
525
11.1k
  return _mm_add_epi32(res_0123, res_4567);
526
11.1k
}
convolve_2d_avx2.c:convolve16_8tap_sse2
Line
Count
Source
518
11.1k
                                           const __m128i coeffs[4]) {
519
11.1k
  const __m128i res_01 = _mm_madd_epi16(ss[0], coeffs[0]);
520
11.1k
  const __m128i res_23 = _mm_madd_epi16(ss[1], coeffs[1]);
521
11.1k
  const __m128i res_45 = _mm_madd_epi16(ss[2], coeffs[2]);
522
11.1k
  const __m128i res_67 = _mm_madd_epi16(ss[3], coeffs[3]);
523
11.1k
  const __m128i res_0123 = _mm_add_epi32(res_01, res_23);
524
11.1k
  const __m128i res_4567 = _mm_add_epi32(res_45, res_67);
525
11.1k
  return _mm_add_epi32(res_0123, res_4567);
526
11.1k
}
Unexecuted instantiation: convolve_avx2.c:convolve16_8tap_sse2
527
528
static inline __m256i convolve16_2tap_avx2(const __m256i ss[1],
529
1.17M
                                           const __m256i coeffs[1]) {
530
1.17M
  return _mm256_madd_epi16(ss[0], coeffs[0]);
531
1.17M
}
convolve_2d_avx2.c:convolve16_2tap_avx2
Line
Count
Source
529
1.17M
                                           const __m256i coeffs[1]) {
530
1.17M
  return _mm256_madd_epi16(ss[0], coeffs[0]);
531
1.17M
}
Unexecuted instantiation: convolve_avx2.c:convolve16_2tap_avx2
532
533
static inline __m256i convolve16_4tap_avx2(const __m256i ss[2],
534
6.54M
                                           const __m256i coeffs[2]) {
535
6.54M
  const __m256i res_1 = _mm256_madd_epi16(ss[0], coeffs[0]);
536
6.54M
  const __m256i res_2 = _mm256_madd_epi16(ss[1], coeffs[1]);
537
6.54M
  return _mm256_add_epi32(res_1, res_2);
538
6.54M
}
convolve_2d_avx2.c:convolve16_4tap_avx2
Line
Count
Source
534
6.54M
                                           const __m256i coeffs[2]) {
535
6.54M
  const __m256i res_1 = _mm256_madd_epi16(ss[0], coeffs[0]);
536
6.54M
  const __m256i res_2 = _mm256_madd_epi16(ss[1], coeffs[1]);
537
6.54M
  return _mm256_add_epi32(res_1, res_2);
538
6.54M
}
Unexecuted instantiation: convolve_avx2.c:convolve16_4tap_avx2
539
540
static inline __m256i convolve16_6tap_avx2(const __m256i ss[3],
541
21.2M
                                           const __m256i coeffs[3]) {
542
21.2M
  const __m256i res_01 = _mm256_madd_epi16(ss[0], coeffs[0]);
543
21.2M
  const __m256i res_23 = _mm256_madd_epi16(ss[1], coeffs[1]);
544
21.2M
  const __m256i res_45 = _mm256_madd_epi16(ss[2], coeffs[2]);
545
21.2M
  const __m256i res_0123 = _mm256_add_epi32(res_01, res_23);
546
21.2M
  return _mm256_add_epi32(res_0123, res_45);
547
21.2M
}
convolve_2d_avx2.c:convolve16_6tap_avx2
Line
Count
Source
541
21.2M
                                           const __m256i coeffs[3]) {
542
21.2M
  const __m256i res_01 = _mm256_madd_epi16(ss[0], coeffs[0]);
543
21.2M
  const __m256i res_23 = _mm256_madd_epi16(ss[1], coeffs[1]);
544
21.2M
  const __m256i res_45 = _mm256_madd_epi16(ss[2], coeffs[2]);
545
21.2M
  const __m256i res_0123 = _mm256_add_epi32(res_01, res_23);
546
21.2M
  return _mm256_add_epi32(res_0123, res_45);
547
21.2M
}
Unexecuted instantiation: convolve_avx2.c:convolve16_6tap_avx2
548
549
static inline __m256i convolve16_8tap_avx2(const __m256i ss[4],
550
4.99M
                                           const __m256i coeffs[4]) {
551
4.99M
  const __m256i res_01 = _mm256_madd_epi16(ss[0], coeffs[0]);
552
4.99M
  const __m256i res_23 = _mm256_madd_epi16(ss[1], coeffs[1]);
553
4.99M
  const __m256i res_45 = _mm256_madd_epi16(ss[2], coeffs[2]);
554
4.99M
  const __m256i res_67 = _mm256_madd_epi16(ss[3], coeffs[3]);
555
4.99M
  const __m256i res_0123 = _mm256_add_epi32(res_01, res_23);
556
4.99M
  const __m256i res_4567 = _mm256_add_epi32(res_45, res_67);
557
4.99M
  return _mm256_add_epi32(res_0123, res_4567);
558
4.99M
}
convolve_2d_avx2.c:convolve16_8tap_avx2
Line
Count
Source
550
4.99M
                                           const __m256i coeffs[4]) {
551
4.99M
  const __m256i res_01 = _mm256_madd_epi16(ss[0], coeffs[0]);
552
4.99M
  const __m256i res_23 = _mm256_madd_epi16(ss[1], coeffs[1]);
553
4.99M
  const __m256i res_45 = _mm256_madd_epi16(ss[2], coeffs[2]);
554
4.99M
  const __m256i res_67 = _mm256_madd_epi16(ss[3], coeffs[3]);
555
4.99M
  const __m256i res_0123 = _mm256_add_epi32(res_01, res_23);
556
4.99M
  const __m256i res_4567 = _mm256_add_epi32(res_45, res_67);
557
4.99M
  return _mm256_add_epi32(res_0123, res_4567);
558
4.99M
}
Unexecuted instantiation: convolve_avx2.c:convolve16_8tap_avx2
559
560
static inline __m256i x_convolve_4tap_avx2(const __m256i data,
561
                                           const __m256i coeffs[2],
562
1.70M
                                           const __m256i filt[2]) {
563
1.70M
  __m256i ss[2];
564
565
1.70M
  ss[0] = _mm256_shuffle_epi8(data, filt[0]);
566
1.70M
  ss[1] = _mm256_shuffle_epi8(data, filt[1]);
567
568
1.70M
  return convolve_4tap_avx2(ss, coeffs);
569
1.70M
}
convolve_2d_avx2.c:x_convolve_4tap_avx2
Line
Count
Source
562
1.70M
                                           const __m256i filt[2]) {
563
1.70M
  __m256i ss[2];
564
565
1.70M
  ss[0] = _mm256_shuffle_epi8(data, filt[0]);
566
1.70M
  ss[1] = _mm256_shuffle_epi8(data, filt[1]);
567
568
1.70M
  return convolve_4tap_avx2(ss, coeffs);
569
1.70M
}
Unexecuted instantiation: convolve_avx2.c:x_convolve_4tap_avx2
570
571
static inline __m256i x_convolve_6tap_avx2(const __m256i data,
572
                                           const __m256i coeffs[3],
573
20.8M
                                           const __m256i filt[3]) {
574
20.8M
  __m256i ss[3];
575
576
20.8M
  ss[0] = _mm256_shuffle_epi8(data, filt[0]);
577
20.8M
  ss[1] = _mm256_shuffle_epi8(data, filt[1]);
578
20.8M
  ss[2] = _mm256_shuffle_epi8(data, filt[2]);
579
580
20.8M
  return convolve_6tap_avx2(ss, coeffs);
581
20.8M
}
convolve_2d_avx2.c:x_convolve_6tap_avx2
Line
Count
Source
573
15.5M
                                           const __m256i filt[3]) {
574
15.5M
  __m256i ss[3];
575
576
15.5M
  ss[0] = _mm256_shuffle_epi8(data, filt[0]);
577
15.5M
  ss[1] = _mm256_shuffle_epi8(data, filt[1]);
578
15.5M
  ss[2] = _mm256_shuffle_epi8(data, filt[2]);
579
580
15.5M
  return convolve_6tap_avx2(ss, coeffs);
581
15.5M
}
convolve_avx2.c:x_convolve_6tap_avx2
Line
Count
Source
573
5.31M
                                           const __m256i filt[3]) {
574
5.31M
  __m256i ss[3];
575
576
5.31M
  ss[0] = _mm256_shuffle_epi8(data, filt[0]);
577
5.31M
  ss[1] = _mm256_shuffle_epi8(data, filt[1]);
578
5.31M
  ss[2] = _mm256_shuffle_epi8(data, filt[2]);
579
580
5.31M
  return convolve_6tap_avx2(ss, coeffs);
581
5.31M
}
582
583
static inline __m256i x_convolve_8tap_avx2(const __m256i data,
584
                                           const __m256i coeffs[4],
585
3.84M
                                           const __m256i filt[4]) {
586
3.84M
  __m256i ss[4];
587
588
3.84M
  ss[0] = _mm256_shuffle_epi8(data, filt[0]);
589
3.84M
  ss[1] = _mm256_shuffle_epi8(data, filt[1]);
590
3.84M
  ss[2] = _mm256_shuffle_epi8(data, filt[2]);
591
3.84M
  ss[3] = _mm256_shuffle_epi8(data, filt[3]);
592
593
3.84M
  return convolve_8tap_avx2(ss, coeffs);
594
3.84M
}
convolve_2d_avx2.c:x_convolve_8tap_avx2
Line
Count
Source
585
3.40M
                                           const __m256i filt[4]) {
586
3.40M
  __m256i ss[4];
587
588
3.40M
  ss[0] = _mm256_shuffle_epi8(data, filt[0]);
589
3.40M
  ss[1] = _mm256_shuffle_epi8(data, filt[1]);
590
3.40M
  ss[2] = _mm256_shuffle_epi8(data, filt[2]);
591
3.40M
  ss[3] = _mm256_shuffle_epi8(data, filt[3]);
592
593
3.40M
  return convolve_8tap_avx2(ss, coeffs);
594
3.40M
}
convolve_avx2.c:x_convolve_8tap_avx2
Line
Count
Source
585
443k
                                           const __m256i filt[4]) {
586
443k
  __m256i ss[4];
587
588
443k
  ss[0] = _mm256_shuffle_epi8(data, filt[0]);
589
443k
  ss[1] = _mm256_shuffle_epi8(data, filt[1]);
590
443k
  ss[2] = _mm256_shuffle_epi8(data, filt[2]);
591
443k
  ss[3] = _mm256_shuffle_epi8(data, filt[3]);
592
593
443k
  return convolve_8tap_avx2(ss, coeffs);
594
443k
}
595
596
5.94M
static inline __m256i sr_y_round_avx2(const __m256i src) {
597
5.94M
  const __m256i round = _mm256_set1_epi16(32);
598
5.94M
  const __m256i dst = _mm256_add_epi16(src, round);
599
5.94M
  return _mm256_srai_epi16(dst, FILTER_BITS - 1);
600
5.94M
}
Unexecuted instantiation: convolve_2d_avx2.c:sr_y_round_avx2
convolve_avx2.c:sr_y_round_avx2
Line
Count
Source
596
5.94M
static inline __m256i sr_y_round_avx2(const __m256i src) {
597
5.94M
  const __m256i round = _mm256_set1_epi16(32);
598
5.94M
  const __m256i dst = _mm256_add_epi16(src, round);
599
5.94M
  return _mm256_srai_epi16(dst, FILTER_BITS - 1);
600
5.94M
}
601
602
4.41M
static inline __m128i xy_x_round_sse2(const __m128i src) {
603
4.41M
  const __m128i round = _mm_set1_epi16(2);
604
4.41M
  const __m128i dst = _mm_add_epi16(src, round);
605
4.41M
  return _mm_srai_epi16(dst, 2);
606
4.41M
}
convolve_2d_avx2.c:xy_x_round_sse2
Line
Count
Source
602
4.41M
static inline __m128i xy_x_round_sse2(const __m128i src) {
603
4.41M
  const __m128i round = _mm_set1_epi16(2);
604
4.41M
  const __m128i dst = _mm_add_epi16(src, round);
605
4.41M
  return _mm_srai_epi16(dst, 2);
606
4.41M
}
Unexecuted instantiation: convolve_avx2.c:xy_x_round_sse2
607
608
21.3M
static inline __m256i xy_x_round_avx2(const __m256i src) {
609
21.3M
  const __m256i round = _mm256_set1_epi16(2);
610
21.3M
  const __m256i dst = _mm256_add_epi16(src, round);
611
21.3M
  return _mm256_srai_epi16(dst, 2);
612
21.3M
}
convolve_2d_avx2.c:xy_x_round_avx2
Line
Count
Source
608
21.3M
static inline __m256i xy_x_round_avx2(const __m256i src) {
609
21.3M
  const __m256i round = _mm256_set1_epi16(2);
610
21.3M
  const __m256i dst = _mm256_add_epi16(src, round);
611
21.3M
  return _mm256_srai_epi16(dst, 2);
612
21.3M
}
Unexecuted instantiation: convolve_avx2.c:xy_x_round_avx2
613
614
static inline void xy_x_round_store_2x2_sse2(const __m128i res,
615
814k
                                             int16_t *const dst) {
616
814k
  const __m128i d = xy_x_round_sse2(res);
617
814k
  _mm_storel_epi64((__m128i *)dst, d);
618
814k
}
convolve_2d_avx2.c:xy_x_round_store_2x2_sse2
Line
Count
Source
615
814k
                                             int16_t *const dst) {
616
814k
  const __m128i d = xy_x_round_sse2(res);
617
814k
  _mm_storel_epi64((__m128i *)dst, d);
618
814k
}
Unexecuted instantiation: convolve_avx2.c:xy_x_round_store_2x2_sse2
619
620
static inline void xy_x_round_store_4x2_sse2(const __m128i res,
621
3.47M
                                             int16_t *const dst) {
622
3.47M
  const __m128i d = xy_x_round_sse2(res);
623
3.47M
  _mm_storeu_si128((__m128i *)dst, d);
624
3.47M
}
convolve_2d_avx2.c:xy_x_round_store_4x2_sse2
Line
Count
Source
621
3.47M
                                             int16_t *const dst) {
622
3.47M
  const __m128i d = xy_x_round_sse2(res);
623
3.47M
  _mm_storeu_si128((__m128i *)dst, d);
624
3.47M
}
Unexecuted instantiation: convolve_avx2.c:xy_x_round_store_4x2_sse2
625
626
static inline void xy_x_round_store_8x2_sse2(const __m128i res[2],
627
60.7k
                                             int16_t *const dst) {
628
60.7k
  __m128i r[2];
629
630
60.7k
  r[0] = xy_x_round_sse2(res[0]);
631
60.7k
  r[1] = xy_x_round_sse2(res[1]);
632
60.7k
  _mm_storeu_si128((__m128i *)dst, r[0]);
633
60.7k
  _mm_storeu_si128((__m128i *)(dst + 8), r[1]);
634
60.7k
}
convolve_2d_avx2.c:xy_x_round_store_8x2_sse2
Line
Count
Source
627
60.7k
                                             int16_t *const dst) {
628
60.7k
  __m128i r[2];
629
630
60.7k
  r[0] = xy_x_round_sse2(res[0]);
631
60.7k
  r[1] = xy_x_round_sse2(res[1]);
632
60.7k
  _mm_storeu_si128((__m128i *)dst, r[0]);
633
60.7k
  _mm_storeu_si128((__m128i *)(dst + 8), r[1]);
634
60.7k
}
Unexecuted instantiation: convolve_avx2.c:xy_x_round_store_8x2_sse2
635
636
static inline void xy_x_round_store_8x2_avx2(const __m256i res,
637
3.53M
                                             int16_t *const dst) {
638
3.53M
  const __m256i d = xy_x_round_avx2(res);
639
3.53M
  _mm256_storeu_si256((__m256i *)dst, d);
640
3.53M
}
convolve_2d_avx2.c:xy_x_round_store_8x2_avx2
Line
Count
Source
637
3.53M
                                             int16_t *const dst) {
638
3.53M
  const __m256i d = xy_x_round_avx2(res);
639
3.53M
  _mm256_storeu_si256((__m256i *)dst, d);
640
3.53M
}
Unexecuted instantiation: convolve_avx2.c:xy_x_round_store_8x2_avx2
641
642
static inline void xy_x_round_store_32_avx2(const __m256i res[2],
643
2.44M
                                            int16_t *const dst) {
644
2.44M
  __m256i r[2];
645
646
2.44M
  r[0] = xy_x_round_avx2(res[0]);
647
2.44M
  r[1] = xy_x_round_avx2(res[1]);
648
2.44M
  const __m256i d0 =
649
2.44M
      _mm256_inserti128_si256(r[0], _mm256_castsi256_si128(r[1]), 1);
650
2.44M
  const __m256i d1 =
651
2.44M
      _mm256_inserti128_si256(r[1], _mm256_extracti128_si256(r[0], 1), 0);
652
2.44M
  _mm256_storeu_si256((__m256i *)dst, d0);
653
2.44M
  _mm256_storeu_si256((__m256i *)(dst + 16), d1);
654
2.44M
}
convolve_2d_avx2.c:xy_x_round_store_32_avx2
Line
Count
Source
643
2.44M
                                            int16_t *const dst) {
644
2.44M
  __m256i r[2];
645
646
2.44M
  r[0] = xy_x_round_avx2(res[0]);
647
2.44M
  r[1] = xy_x_round_avx2(res[1]);
648
2.44M
  const __m256i d0 =
649
2.44M
      _mm256_inserti128_si256(r[0], _mm256_castsi256_si128(r[1]), 1);
650
2.44M
  const __m256i d1 =
651
      _mm256_inserti128_si256(r[1], _mm256_extracti128_si256(r[0], 1), 0);
652
2.44M
  _mm256_storeu_si256((__m256i *)dst, d0);
653
2.44M
  _mm256_storeu_si256((__m256i *)(dst + 16), d1);
654
2.44M
}
Unexecuted instantiation: convolve_avx2.c:xy_x_round_store_32_avx2
655
656
475k
static inline __m128i xy_y_round_sse2(const __m128i src) {
657
475k
  const __m128i round = _mm_set1_epi32(1024);
658
475k
  const __m128i dst = _mm_add_epi32(src, round);
659
475k
  return _mm_srai_epi32(dst, 11);
660
475k
}
convolve_2d_avx2.c:xy_y_round_sse2
Line
Count
Source
656
475k
static inline __m128i xy_y_round_sse2(const __m128i src) {
657
475k
  const __m128i round = _mm_set1_epi32(1024);
658
475k
  const __m128i dst = _mm_add_epi32(src, round);
659
475k
  return _mm_srai_epi32(dst, 11);
660
475k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_round_sse2
661
662
31.2k
static inline __m128i xy_y_round_half_pel_sse2(const __m128i src) {
663
31.2k
  const __m128i round = _mm_set1_epi16(16);
664
31.2k
  const __m128i dst = _mm_add_epi16(src, round);
665
31.2k
  return _mm_srai_epi16(dst, 5);
666
31.2k
}
convolve_2d_avx2.c:xy_y_round_half_pel_sse2
Line
Count
Source
662
31.2k
static inline __m128i xy_y_round_half_pel_sse2(const __m128i src) {
663
31.2k
  const __m128i round = _mm_set1_epi16(16);
664
31.2k
  const __m128i dst = _mm_add_epi16(src, round);
665
31.2k
  return _mm_srai_epi16(dst, 5);
666
31.2k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_round_half_pel_sse2
667
668
33.9M
static inline __m256i xy_y_round_avx2(const __m256i src) {
669
33.9M
  const __m256i round = _mm256_set1_epi32(1024);
670
33.9M
  const __m256i dst = _mm256_add_epi32(src, round);
671
33.9M
  return _mm256_srai_epi32(dst, 11);
672
33.9M
}
convolve_2d_avx2.c:xy_y_round_avx2
Line
Count
Source
668
33.9M
static inline __m256i xy_y_round_avx2(const __m256i src) {
669
33.9M
  const __m256i round = _mm256_set1_epi32(1024);
670
33.9M
  const __m256i dst = _mm256_add_epi32(src, round);
671
33.9M
  return _mm256_srai_epi32(dst, 11);
672
33.9M
}
Unexecuted instantiation: convolve_avx2.c:xy_y_round_avx2
673
674
15.9M
static inline __m256i xy_y_round_16_avx2(const __m256i r[2]) {
675
15.9M
  const __m256i r0 = xy_y_round_avx2(r[0]);
676
15.9M
  const __m256i r1 = xy_y_round_avx2(r[1]);
677
15.9M
  return _mm256_packs_epi32(r0, r1);
678
15.9M
}
convolve_2d_avx2.c:xy_y_round_16_avx2
Line
Count
Source
674
15.9M
static inline __m256i xy_y_round_16_avx2(const __m256i r[2]) {
675
15.9M
  const __m256i r0 = xy_y_round_avx2(r[0]);
676
15.9M
  const __m256i r1 = xy_y_round_avx2(r[1]);
677
15.9M
  return _mm256_packs_epi32(r0, r1);
678
15.9M
}
Unexecuted instantiation: convolve_avx2.c:xy_y_round_16_avx2
679
680
292k
static inline __m256i xy_y_round_half_pel_avx2(const __m256i src) {
681
292k
  const __m256i round = _mm256_set1_epi16(16);
682
292k
  const __m256i dst = _mm256_add_epi16(src, round);
683
292k
  return _mm256_srai_epi16(dst, 5);
684
292k
}
convolve_2d_avx2.c:xy_y_round_half_pel_avx2
Line
Count
Source
680
292k
static inline __m256i xy_y_round_half_pel_avx2(const __m256i src) {
681
292k
  const __m256i round = _mm256_set1_epi16(16);
682
292k
  const __m256i dst = _mm256_add_epi16(src, round);
683
292k
  return _mm256_srai_epi16(dst, 5);
684
292k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_round_half_pel_avx2
685
686
static inline void pack_store_2x2_sse2(const __m128i res, uint8_t *const dst,
687
698k
                                       const ptrdiff_t stride) {
688
698k
  const __m128i d = _mm_packus_epi16(res, res);
689
698k
  *(int16_t *)dst = (int16_t)_mm_cvtsi128_si32(d);
690
698k
  *(int16_t *)(dst + stride) = (int16_t)_mm_extract_epi16(d, 1);
691
698k
}
convolve_2d_avx2.c:pack_store_2x2_sse2
Line
Count
Source
687
426k
                                       const ptrdiff_t stride) {
688
426k
  const __m128i d = _mm_packus_epi16(res, res);
689
426k
  *(int16_t *)dst = (int16_t)_mm_cvtsi128_si32(d);
690
  *(int16_t *)(dst + stride) = (int16_t)_mm_extract_epi16(d, 1);
691
426k
}
convolve_avx2.c:pack_store_2x2_sse2
Line
Count
Source
687
272k
                                       const ptrdiff_t stride) {
688
272k
  const __m128i d = _mm_packus_epi16(res, res);
689
272k
  *(int16_t *)dst = (int16_t)_mm_cvtsi128_si32(d);
690
  *(int16_t *)(dst + stride) = (int16_t)_mm_extract_epi16(d, 1);
691
272k
}
692
693
static inline void pack_store_4x2_sse2(const __m128i res, uint8_t *const dst,
694
1.54M
                                       const ptrdiff_t stride) {
695
1.54M
  const __m128i d = _mm_packus_epi16(res, res);
696
1.54M
  store_u8_4x2_sse2(d, dst, stride);
697
1.54M
}
convolve_2d_avx2.c:pack_store_4x2_sse2
Line
Count
Source
694
52.8k
                                       const ptrdiff_t stride) {
695
52.8k
  const __m128i d = _mm_packus_epi16(res, res);
696
52.8k
  store_u8_4x2_sse2(d, dst, stride);
697
52.8k
}
convolve_avx2.c:pack_store_4x2_sse2
Line
Count
Source
694
1.49M
                                       const ptrdiff_t stride) {
695
1.49M
  const __m128i d = _mm_packus_epi16(res, res);
696
1.49M
  store_u8_4x2_sse2(d, dst, stride);
697
1.49M
}
698
699
static inline void pack_store_4x2_avx2(const __m256i res, uint8_t *const dst,
700
1.93M
                                       const ptrdiff_t stride) {
701
1.93M
  const __m256i d = _mm256_packus_epi16(res, res);
702
1.93M
  const __m128i d0 = _mm256_castsi256_si128(d);
703
1.93M
  const __m128i d1 = _mm256_extracti128_si256(d, 1);
704
705
1.93M
  xx_storel_32(dst, d0);
706
1.93M
  xx_storel_32(dst + stride, d1);
707
1.93M
}
convolve_2d_avx2.c:pack_store_4x2_avx2
Line
Count
Source
700
1.93M
                                       const ptrdiff_t stride) {
701
1.93M
  const __m256i d = _mm256_packus_epi16(res, res);
702
1.93M
  const __m128i d0 = _mm256_castsi256_si128(d);
703
1.93M
  const __m128i d1 = _mm256_extracti128_si256(d, 1);
704
705
1.93M
  xx_storel_32(dst, d0);
706
1.93M
  xx_storel_32(dst + stride, d1);
707
1.93M
}
Unexecuted instantiation: convolve_avx2.c:pack_store_4x2_avx2
708
709
static inline void pack_store_8x2_avx2(const __m256i res, uint8_t *const dst,
710
3.74M
                                       const ptrdiff_t stride) {
711
3.74M
  const __m256i d = _mm256_packus_epi16(res, res);
712
3.74M
  const __m128i d0 = _mm256_castsi256_si128(d);
713
3.74M
  const __m128i d1 = _mm256_extracti128_si256(d, 1);
714
3.74M
  _mm_storel_epi64((__m128i *)dst, d0);
715
3.74M
  _mm_storel_epi64((__m128i *)(dst + stride), d1);
716
3.74M
}
convolve_2d_avx2.c:pack_store_8x2_avx2
Line
Count
Source
710
2.12M
                                       const ptrdiff_t stride) {
711
2.12M
  const __m256i d = _mm256_packus_epi16(res, res);
712
2.12M
  const __m128i d0 = _mm256_castsi256_si128(d);
713
  const __m128i d1 = _mm256_extracti128_si256(d, 1);
714
2.12M
  _mm_storel_epi64((__m128i *)dst, d0);
715
2.12M
  _mm_storel_epi64((__m128i *)(dst + stride), d1);
716
2.12M
}
convolve_avx2.c:pack_store_8x2_avx2
Line
Count
Source
710
1.62M
                                       const ptrdiff_t stride) {
711
1.62M
  const __m256i d = _mm256_packus_epi16(res, res);
712
1.62M
  const __m128i d0 = _mm256_castsi256_si128(d);
713
  const __m128i d1 = _mm256_extracti128_si256(d, 1);
714
1.62M
  _mm_storel_epi64((__m128i *)dst, d0);
715
1.62M
  _mm_storel_epi64((__m128i *)(dst + stride), d1);
716
1.62M
}
717
718
static inline void pack_store_16x2_avx2(const __m256i res0, const __m256i res1,
719
                                        uint8_t *const dst,
720
1.38M
                                        const ptrdiff_t stride) {
721
1.38M
  const __m256i d = _mm256_packus_epi16(res0, res1);
722
1.38M
  storeu_u8_16x2_avx2(d, dst, stride);
723
1.38M
}
Unexecuted instantiation: convolve_2d_avx2.c:pack_store_16x2_avx2
convolve_avx2.c:pack_store_16x2_avx2
Line
Count
Source
720
1.38M
                                        const ptrdiff_t stride) {
721
1.38M
  const __m256i d = _mm256_packus_epi16(res0, res1);
722
1.38M
  storeu_u8_16x2_avx2(d, dst, stride);
723
1.38M
}
724
725
static inline void xy_y_pack_store_16x2_avx2(const __m256i res0,
726
                                             const __m256i res1,
727
                                             uint8_t *const dst,
728
1.62M
                                             const ptrdiff_t stride) {
729
1.62M
  const __m256i t = _mm256_packus_epi16(res0, res1);
730
1.62M
  const __m256i d = _mm256_permute4x64_epi64(t, 0xD8);
731
1.62M
  storeu_u8_16x2_avx2(d, dst, stride);
732
1.62M
}
convolve_2d_avx2.c:xy_y_pack_store_16x2_avx2
Line
Count
Source
728
1.62M
                                             const ptrdiff_t stride) {
729
1.62M
  const __m256i t = _mm256_packus_epi16(res0, res1);
730
  const __m256i d = _mm256_permute4x64_epi64(t, 0xD8);
731
1.62M
  storeu_u8_16x2_avx2(d, dst, stride);
732
1.62M
}
Unexecuted instantiation: convolve_avx2.c:xy_y_pack_store_16x2_avx2
733
734
static inline void pack_store_32_avx2(const __m256i res0, const __m256i res1,
735
0
                                      uint8_t *const dst) {
736
0
  const __m256i t = _mm256_packus_epi16(res0, res1);
737
0
  const __m256i d = _mm256_permute4x64_epi64(t, 0xD8);
738
0
  _mm256_storeu_si256((__m256i *)dst, d);
739
0
}
Unexecuted instantiation: convolve_2d_avx2.c:pack_store_32_avx2
Unexecuted instantiation: convolve_avx2.c:pack_store_32_avx2
740
741
static inline void xy_y_round_store_2x2_sse2(const __m128i res,
742
                                             uint8_t *const dst,
743
420k
                                             const ptrdiff_t stride) {
744
420k
  const __m128i r = xy_y_round_sse2(res);
745
420k
  const __m128i rr = _mm_packs_epi32(r, r);
746
420k
  pack_store_2x2_sse2(rr, dst, stride);
747
420k
}
convolve_2d_avx2.c:xy_y_round_store_2x2_sse2
Line
Count
Source
743
420k
                                             const ptrdiff_t stride) {
744
420k
  const __m128i r = xy_y_round_sse2(res);
745
420k
  const __m128i rr = _mm_packs_epi32(r, r);
746
420k
  pack_store_2x2_sse2(rr, dst, stride);
747
420k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_round_store_2x2_sse2
748
749
static inline void xy_y_round_store_4x2_avx2(const __m256i res,
750
                                             uint8_t *const dst,
751
1.93M
                                             const ptrdiff_t stride) {
752
1.93M
  const __m256i r = xy_y_round_avx2(res);
753
1.93M
  const __m256i rr = _mm256_packs_epi32(r, r);
754
1.93M
  pack_store_4x2_avx2(rr, dst, stride);
755
1.93M
}
convolve_2d_avx2.c:xy_y_round_store_4x2_avx2
Line
Count
Source
751
1.93M
                                             const ptrdiff_t stride) {
752
1.93M
  const __m256i r = xy_y_round_avx2(res);
753
1.93M
  const __m256i rr = _mm256_packs_epi32(r, r);
754
1.93M
  pack_store_4x2_avx2(rr, dst, stride);
755
1.93M
}
Unexecuted instantiation: convolve_avx2.c:xy_y_round_store_4x2_avx2
756
757
static inline void xy_y_pack_store_32_avx2(const __m256i res0,
758
                                           const __m256i res1,
759
5.45M
                                           uint8_t *const dst) {
760
5.45M
  const __m256i d = _mm256_packus_epi16(res0, res1);
761
  // d = _mm256_permute4x64_epi64(d, 0xD8);
762
5.45M
  _mm256_storeu_si256((__m256i *)dst, d);
763
5.45M
}
convolve_2d_avx2.c:xy_y_pack_store_32_avx2
Line
Count
Source
759
5.45M
                                           uint8_t *const dst) {
760
5.45M
  const __m256i d = _mm256_packus_epi16(res0, res1);
761
  // d = _mm256_permute4x64_epi64(d, 0xD8);
762
5.45M
  _mm256_storeu_si256((__m256i *)dst, d);
763
5.45M
}
Unexecuted instantiation: convolve_avx2.c:xy_y_pack_store_32_avx2
764
765
static inline void xy_y_round_store_32_avx2(const __m256i r0[2],
766
                                            const __m256i r1[2],
767
5.33M
                                            uint8_t *const dst) {
768
5.33M
  const __m256i ra = xy_y_round_16_avx2(r0);
769
5.33M
  const __m256i rb = xy_y_round_16_avx2(r1);
770
5.33M
  xy_y_pack_store_32_avx2(ra, rb, dst);
771
5.33M
}
convolve_2d_avx2.c:xy_y_round_store_32_avx2
Line
Count
Source
767
5.33M
                                            uint8_t *const dst) {
768
5.33M
  const __m256i ra = xy_y_round_16_avx2(r0);
769
5.33M
  const __m256i rb = xy_y_round_16_avx2(r1);
770
5.33M
  xy_y_pack_store_32_avx2(ra, rb, dst);
771
5.33M
}
Unexecuted instantiation: convolve_avx2.c:xy_y_round_store_32_avx2
772
773
static inline void convolve_store_32_avx2(const __m256i res0,
774
                                          const __m256i res1,
775
3.77M
                                          uint8_t *const dst) {
776
3.77M
  const __m256i d = _mm256_packus_epi16(res0, res1);
777
3.77M
  _mm256_storeu_si256((__m256i *)dst, d);
778
3.77M
}
Unexecuted instantiation: convolve_2d_avx2.c:convolve_store_32_avx2
convolve_avx2.c:convolve_store_32_avx2
Line
Count
Source
775
3.77M
                                          uint8_t *const dst) {
776
3.77M
  const __m256i d = _mm256_packus_epi16(res0, res1);
777
3.77M
  _mm256_storeu_si256((__m256i *)dst, d);
778
3.77M
}
779
780
987k
static inline __m128i sr_x_round_sse2(const __m128i src) {
781
987k
  const __m128i round = _mm_set1_epi16(34);
782
987k
  const __m128i dst = _mm_add_epi16(src, round);
783
987k
  return _mm_srai_epi16(dst, 6);
784
987k
}
Unexecuted instantiation: convolve_2d_avx2.c:sr_x_round_sse2
convolve_avx2.c:sr_x_round_sse2
Line
Count
Source
780
987k
static inline __m128i sr_x_round_sse2(const __m128i src) {
781
987k
  const __m128i round = _mm_set1_epi16(34);
782
987k
  const __m128i dst = _mm_add_epi16(src, round);
783
987k
  return _mm_srai_epi16(dst, 6);
784
987k
}
785
786
6.79M
static inline __m256i sr_x_round_avx2(const __m256i src) {
787
6.79M
  const __m256i round = _mm256_set1_epi16(34);
788
6.79M
  const __m256i dst = _mm256_add_epi16(src, round);
789
6.79M
  return _mm256_srai_epi16(dst, 6);
790
6.79M
}
Unexecuted instantiation: convolve_2d_avx2.c:sr_x_round_avx2
convolve_avx2.c:sr_x_round_avx2
Line
Count
Source
786
6.79M
static inline __m256i sr_x_round_avx2(const __m256i src) {
787
6.79M
  const __m256i round = _mm256_set1_epi16(34);
788
6.79M
  const __m256i dst = _mm256_add_epi16(src, round);
789
6.79M
  return _mm256_srai_epi16(dst, 6);
790
6.79M
}
791
792
822k
static inline __m128i sr_y_round_sse2(const __m128i src) {
793
822k
  const __m128i round = _mm_set1_epi16(32);
794
822k
  const __m128i dst = _mm_add_epi16(src, round);
795
822k
  return _mm_srai_epi16(dst, FILTER_BITS - 1);
796
822k
}
Unexecuted instantiation: convolve_2d_avx2.c:sr_y_round_sse2
convolve_avx2.c:sr_y_round_sse2
Line
Count
Source
792
822k
static inline __m128i sr_y_round_sse2(const __m128i src) {
793
822k
  const __m128i round = _mm_set1_epi16(32);
794
822k
  const __m128i dst = _mm_add_epi16(src, round);
795
822k
  return _mm_srai_epi16(dst, FILTER_BITS - 1);
796
822k
}
797
798
static inline void sr_x_round_store_8x2_avx2(const __m256i res,
799
                                             uint8_t *const dst,
800
857k
                                             const ptrdiff_t dst_stride) {
801
857k
  const __m256i r = sr_x_round_avx2(res);
802
857k
  pack_store_8x2_avx2(r, dst, dst_stride);
803
857k
}
Unexecuted instantiation: convolve_2d_avx2.c:sr_x_round_store_8x2_avx2
convolve_avx2.c:sr_x_round_store_8x2_avx2
Line
Count
Source
800
857k
                                             const ptrdiff_t dst_stride) {
801
857k
  const __m256i r = sr_x_round_avx2(res);
802
857k
  pack_store_8x2_avx2(r, dst, dst_stride);
803
857k
}
804
805
static inline void sr_x_round_store_16x2_avx2(const __m256i res[2],
806
                                              uint8_t *const dst,
807
708k
                                              const ptrdiff_t dst_stride) {
808
708k
  __m256i r[2];
809
810
708k
  r[0] = sr_x_round_avx2(res[0]);
811
708k
  r[1] = sr_x_round_avx2(res[1]);
812
708k
  pack_store_16x2_avx2(r[0], r[1], dst, dst_stride);
813
708k
}
Unexecuted instantiation: convolve_2d_avx2.c:sr_x_round_store_16x2_avx2
convolve_avx2.c:sr_x_round_store_16x2_avx2
Line
Count
Source
807
708k
                                              const ptrdiff_t dst_stride) {
808
708k
  __m256i r[2];
809
810
708k
  r[0] = sr_x_round_avx2(res[0]);
811
708k
  r[1] = sr_x_round_avx2(res[1]);
812
708k
  pack_store_16x2_avx2(r[0], r[1], dst, dst_stride);
813
708k
}
814
815
static inline void sr_x_round_store_32_avx2(const __m256i res[2],
816
1.86M
                                            uint8_t *const dst) {
817
1.86M
  __m256i r[2];
818
819
1.86M
  r[0] = sr_x_round_avx2(res[0]);
820
1.86M
  r[1] = sr_x_round_avx2(res[1]);
821
1.86M
  convolve_store_32_avx2(r[0], r[1], dst);
822
1.86M
}
Unexecuted instantiation: convolve_2d_avx2.c:sr_x_round_store_32_avx2
convolve_avx2.c:sr_x_round_store_32_avx2
Line
Count
Source
816
1.86M
                                            uint8_t *const dst) {
817
1.86M
  __m256i r[2];
818
819
1.86M
  r[0] = sr_x_round_avx2(res[0]);
820
1.86M
  r[1] = sr_x_round_avx2(res[1]);
821
1.86M
  convolve_store_32_avx2(r[0], r[1], dst);
822
1.86M
}
823
824
static inline void sr_y_round_store_8x2_avx2(const __m256i res,
825
                                             uint8_t *const dst,
826
767k
                                             const ptrdiff_t dst_stride) {
827
767k
  const __m256i r = sr_y_round_avx2(res);
828
767k
  pack_store_8x2_avx2(r, dst, dst_stride);
829
767k
}
Unexecuted instantiation: convolve_2d_avx2.c:sr_y_round_store_8x2_avx2
convolve_avx2.c:sr_y_round_store_8x2_avx2
Line
Count
Source
826
767k
                                             const ptrdiff_t dst_stride) {
827
767k
  const __m256i r = sr_y_round_avx2(res);
828
767k
  pack_store_8x2_avx2(r, dst, dst_stride);
829
767k
}
830
831
static inline void sr_y_round_store_16x2_avx2(const __m256i res[2],
832
                                              uint8_t *const dst,
833
675k
                                              const ptrdiff_t dst_stride) {
834
675k
  __m256i r[2];
835
836
675k
  r[0] = sr_y_round_avx2(res[0]);
837
675k
  r[1] = sr_y_round_avx2(res[1]);
838
675k
  pack_store_16x2_avx2(r[0], r[1], dst, dst_stride);
839
675k
}
Unexecuted instantiation: convolve_2d_avx2.c:sr_y_round_store_16x2_avx2
convolve_avx2.c:sr_y_round_store_16x2_avx2
Line
Count
Source
833
675k
                                              const ptrdiff_t dst_stride) {
834
675k
  __m256i r[2];
835
836
675k
  r[0] = sr_y_round_avx2(res[0]);
837
675k
  r[1] = sr_y_round_avx2(res[1]);
838
675k
  pack_store_16x2_avx2(r[0], r[1], dst, dst_stride);
839
675k
}
840
841
static inline void sr_y_2tap_32_avg_avx2(const uint8_t *const src,
842
                                         const __m256i s0, __m256i *const s1,
843
83.0k
                                         uint8_t *const dst) {
844
83.0k
  *s1 = _mm256_loadu_si256((__m256i *)src);
845
83.0k
  const __m256i d = _mm256_avg_epu8(s0, *s1);
846
83.0k
  _mm256_storeu_si256((__m256i *)dst, d);
847
83.0k
}
Unexecuted instantiation: convolve_2d_avx2.c:sr_y_2tap_32_avg_avx2
convolve_avx2.c:sr_y_2tap_32_avg_avx2
Line
Count
Source
843
83.0k
                                         uint8_t *const dst) {
844
83.0k
  *s1 = _mm256_loadu_si256((__m256i *)src);
845
83.0k
  const __m256i d = _mm256_avg_epu8(s0, *s1);
846
83.0k
  _mm256_storeu_si256((__m256i *)dst, d);
847
83.0k
}
848
849
static inline void sr_x_2tap_32_avg_avx2(const uint8_t *const src,
850
111k
                                         uint8_t *const dst) {
851
111k
  const __m256i s0 = _mm256_loadu_si256((__m256i *)src);
852
111k
  const __m256i s1 = _mm256_loadu_si256((__m256i *)(src + 1));
853
111k
  const __m256i d = _mm256_avg_epu8(s0, s1);
854
111k
  _mm256_storeu_si256((__m256i *)dst, d);
855
111k
}
Unexecuted instantiation: convolve_2d_avx2.c:sr_x_2tap_32_avg_avx2
convolve_avx2.c:sr_x_2tap_32_avg_avx2
Line
Count
Source
850
111k
                                         uint8_t *const dst) {
851
111k
  const __m256i s0 = _mm256_loadu_si256((__m256i *)src);
852
111k
  const __m256i s1 = _mm256_loadu_si256((__m256i *)(src + 1));
853
111k
  const __m256i d = _mm256_avg_epu8(s0, s1);
854
111k
  _mm256_storeu_si256((__m256i *)dst, d);
855
111k
}
856
857
static inline __m128i x_convolve_2tap_2x2_sse4_1(const uint8_t *const src,
858
                                                 const ptrdiff_t stride,
859
25.3k
                                                 const __m128i coeffs[1]) {
860
25.3k
  const __m128i sfl =
861
25.3k
      _mm_setr_epi8(0, 1, 1, 2, 4, 5, 5, 6, 0, 0, 0, 0, 0, 0, 0, 0);
862
25.3k
  const __m128i s_128 = load_u8_4x2_sse4_1(src, stride);
863
25.3k
  const __m128i ss = _mm_shuffle_epi8(s_128, sfl);
864
25.3k
  return convolve_2tap_ssse3(&ss, coeffs);
865
25.3k
}
convolve_2d_avx2.c:x_convolve_2tap_2x2_sse4_1
Line
Count
Source
859
21.8k
                                                 const __m128i coeffs[1]) {
860
21.8k
  const __m128i sfl =
861
21.8k
      _mm_setr_epi8(0, 1, 1, 2, 4, 5, 5, 6, 0, 0, 0, 0, 0, 0, 0, 0);
862
21.8k
  const __m128i s_128 = load_u8_4x2_sse4_1(src, stride);
863
21.8k
  const __m128i ss = _mm_shuffle_epi8(s_128, sfl);
864
21.8k
  return convolve_2tap_ssse3(&ss, coeffs);
865
21.8k
}
convolve_avx2.c:x_convolve_2tap_2x2_sse4_1
Line
Count
Source
859
3.53k
                                                 const __m128i coeffs[1]) {
860
3.53k
  const __m128i sfl =
861
3.53k
      _mm_setr_epi8(0, 1, 1, 2, 4, 5, 5, 6, 0, 0, 0, 0, 0, 0, 0, 0);
862
3.53k
  const __m128i s_128 = load_u8_4x2_sse4_1(src, stride);
863
3.53k
  const __m128i ss = _mm_shuffle_epi8(s_128, sfl);
864
3.53k
  return convolve_2tap_ssse3(&ss, coeffs);
865
3.53k
}
866
867
static inline __m128i x_convolve_2tap_4x2_ssse3(const uint8_t *const src,
868
                                                const ptrdiff_t stride,
869
82.4k
                                                const __m128i coeffs[1]) {
870
82.4k
  const __m128i sfl =
871
82.4k
      _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12);
872
82.4k
  const __m128i s_128 = load_u8_8x2_sse2(src, stride);
873
82.4k
  const __m128i ss = _mm_shuffle_epi8(s_128, sfl);
874
82.4k
  return convolve_2tap_ssse3(&ss, coeffs);
875
82.4k
}
convolve_2d_avx2.c:x_convolve_2tap_4x2_ssse3
Line
Count
Source
869
70.1k
                                                const __m128i coeffs[1]) {
870
70.1k
  const __m128i sfl =
871
70.1k
      _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12);
872
70.1k
  const __m128i s_128 = load_u8_8x2_sse2(src, stride);
873
70.1k
  const __m128i ss = _mm_shuffle_epi8(s_128, sfl);
874
70.1k
  return convolve_2tap_ssse3(&ss, coeffs);
875
70.1k
}
convolve_avx2.c:x_convolve_2tap_4x2_ssse3
Line
Count
Source
869
12.3k
                                                const __m128i coeffs[1]) {
870
12.3k
  const __m128i sfl =
871
12.3k
      _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12);
872
12.3k
  const __m128i s_128 = load_u8_8x2_sse2(src, stride);
873
12.3k
  const __m128i ss = _mm_shuffle_epi8(s_128, sfl);
874
12.3k
  return convolve_2tap_ssse3(&ss, coeffs);
875
12.3k
}
876
877
static inline void x_convolve_2tap_8x2_ssse3(const uint8_t *const src,
878
                                             const ptrdiff_t stride,
879
                                             const __m128i coeffs[1],
880
72.2k
                                             __m128i r[2]) {
881
72.2k
  __m128i ss[2];
882
72.2k
  const __m128i s00 = _mm_loadu_si128((__m128i *)src);
883
72.2k
  const __m128i s10 = _mm_loadu_si128((__m128i *)(src + stride));
884
72.2k
  const __m128i s01 = _mm_srli_si128(s00, 1);
885
72.2k
  const __m128i s11 = _mm_srli_si128(s10, 1);
886
72.2k
  ss[0] = _mm_unpacklo_epi8(s00, s01);
887
72.2k
  ss[1] = _mm_unpacklo_epi8(s10, s11);
888
889
72.2k
  r[0] = convolve_2tap_ssse3(&ss[0], coeffs);
890
72.2k
  r[1] = convolve_2tap_ssse3(&ss[1], coeffs);
891
72.2k
}
convolve_2d_avx2.c:x_convolve_2tap_8x2_ssse3
Line
Count
Source
880
60.7k
                                             __m128i r[2]) {
881
60.7k
  __m128i ss[2];
882
60.7k
  const __m128i s00 = _mm_loadu_si128((__m128i *)src);
883
60.7k
  const __m128i s10 = _mm_loadu_si128((__m128i *)(src + stride));
884
60.7k
  const __m128i s01 = _mm_srli_si128(s00, 1);
885
60.7k
  const __m128i s11 = _mm_srli_si128(s10, 1);
886
60.7k
  ss[0] = _mm_unpacklo_epi8(s00, s01);
887
60.7k
  ss[1] = _mm_unpacklo_epi8(s10, s11);
888
889
60.7k
  r[0] = convolve_2tap_ssse3(&ss[0], coeffs);
890
60.7k
  r[1] = convolve_2tap_ssse3(&ss[1], coeffs);
891
60.7k
}
convolve_avx2.c:x_convolve_2tap_8x2_ssse3
Line
Count
Source
880
11.4k
                                             __m128i r[2]) {
881
11.4k
  __m128i ss[2];
882
11.4k
  const __m128i s00 = _mm_loadu_si128((__m128i *)src);
883
11.4k
  const __m128i s10 = _mm_loadu_si128((__m128i *)(src + stride));
884
11.4k
  const __m128i s01 = _mm_srli_si128(s00, 1);
885
11.4k
  const __m128i s11 = _mm_srli_si128(s10, 1);
886
11.4k
  ss[0] = _mm_unpacklo_epi8(s00, s01);
887
11.4k
  ss[1] = _mm_unpacklo_epi8(s10, s11);
888
889
11.4k
  r[0] = convolve_2tap_ssse3(&ss[0], coeffs);
890
11.4k
  r[1] = convolve_2tap_ssse3(&ss[1], coeffs);
891
11.4k
}
892
893
static inline __m256i x_convolve_2tap_8x2_avx2(const uint8_t *const src,
894
                                               const ptrdiff_t stride,
895
0
                                               const __m256i coeffs[1]) {
896
0
  __m128i s_128[2][2];
897
0
  __m256i s_256[2];
898
0
899
0
  s_128[0][0] = _mm_loadu_si128((__m128i *)src);
900
0
  s_128[1][0] = _mm_loadu_si128((__m128i *)(src + stride));
901
0
  s_128[0][1] = _mm_srli_si128(s_128[0][0], 1);
902
0
  s_128[1][1] = _mm_srli_si128(s_128[1][0], 1);
903
0
  s_256[0] = _mm256_setr_m128i(s_128[0][0], s_128[1][0]);
904
0
  s_256[1] = _mm256_setr_m128i(s_128[0][1], s_128[1][1]);
905
0
  const __m256i ss = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
906
0
  return convolve_2tap_avx2(&ss, coeffs);
907
0
}
Unexecuted instantiation: convolve_2d_avx2.c:x_convolve_2tap_8x2_avx2
Unexecuted instantiation: convolve_avx2.c:x_convolve_2tap_8x2_avx2
908
909
static inline void x_convolve_2tap_16x2_avx2(const uint8_t *const src,
910
                                             const ptrdiff_t stride,
911
                                             const __m256i coeffs[1],
912
48.9k
                                             __m256i r[2]) {
913
48.9k
  const __m256i s0_256 = loadu_8bit_16x2_avx2(src, stride);
914
48.9k
  const __m256i s1_256 = loadu_8bit_16x2_avx2(src + 1, stride);
915
48.9k
  const __m256i s0 = _mm256_unpacklo_epi8(s0_256, s1_256);
916
48.9k
  const __m256i s1 = _mm256_unpackhi_epi8(s0_256, s1_256);
917
48.9k
  r[0] = convolve_2tap_avx2(&s0, coeffs);
918
48.9k
  r[1] = convolve_2tap_avx2(&s1, coeffs);
919
48.9k
}
convolve_2d_avx2.c:x_convolve_2tap_16x2_avx2
Line
Count
Source
912
39.6k
                                             __m256i r[2]) {
913
39.6k
  const __m256i s0_256 = loadu_8bit_16x2_avx2(src, stride);
914
39.6k
  const __m256i s1_256 = loadu_8bit_16x2_avx2(src + 1, stride);
915
39.6k
  const __m256i s0 = _mm256_unpacklo_epi8(s0_256, s1_256);
916
39.6k
  const __m256i s1 = _mm256_unpackhi_epi8(s0_256, s1_256);
917
39.6k
  r[0] = convolve_2tap_avx2(&s0, coeffs);
918
39.6k
  r[1] = convolve_2tap_avx2(&s1, coeffs);
919
39.6k
}
convolve_avx2.c:x_convolve_2tap_16x2_avx2
Line
Count
Source
912
9.35k
                                             __m256i r[2]) {
913
9.35k
  const __m256i s0_256 = loadu_8bit_16x2_avx2(src, stride);
914
9.35k
  const __m256i s1_256 = loadu_8bit_16x2_avx2(src + 1, stride);
915
9.35k
  const __m256i s0 = _mm256_unpacklo_epi8(s0_256, s1_256);
916
9.35k
  const __m256i s1 = _mm256_unpackhi_epi8(s0_256, s1_256);
917
9.35k
  r[0] = convolve_2tap_avx2(&s0, coeffs);
918
9.35k
  r[1] = convolve_2tap_avx2(&s1, coeffs);
919
9.35k
}
920
921
static inline void x_convolve_2tap_32_avx2(const uint8_t *const src,
922
                                           const __m256i coeffs[1],
923
110k
                                           __m256i r[2]) {
924
110k
  const __m256i s0 = _mm256_loadu_si256((__m256i *)src);
925
110k
  const __m256i s1 = _mm256_loadu_si256((__m256i *)(src + 1));
926
110k
  const __m256i ss0 = _mm256_unpacklo_epi8(s0, s1);
927
110k
  const __m256i ss1 = _mm256_unpackhi_epi8(s0, s1);
928
929
110k
  r[0] = convolve_2tap_avx2(&ss0, coeffs);
930
110k
  r[1] = convolve_2tap_avx2(&ss1, coeffs);
931
110k
}
Unexecuted instantiation: convolve_2d_avx2.c:x_convolve_2tap_32_avx2
convolve_avx2.c:x_convolve_2tap_32_avx2
Line
Count
Source
923
110k
                                           __m256i r[2]) {
924
110k
  const __m256i s0 = _mm256_loadu_si256((__m256i *)src);
925
110k
  const __m256i s1 = _mm256_loadu_si256((__m256i *)(src + 1));
926
110k
  const __m256i ss0 = _mm256_unpacklo_epi8(s0, s1);
927
110k
  const __m256i ss1 = _mm256_unpackhi_epi8(s0, s1);
928
929
110k
  r[0] = convolve_2tap_avx2(&ss0, coeffs);
930
110k
  r[1] = convolve_2tap_avx2(&ss1, coeffs);
931
110k
}
932
933
static inline __m128i x_convolve_4tap_2x2_ssse3(const uint8_t *const src,
934
                                                const ptrdiff_t stride,
935
938k
                                                const __m128i coeffs[2]) {
936
938k
  const __m128i sfl0 =
937
938k
      _mm_setr_epi8(0, 1, 1, 2, 8, 9, 9, 10, 0, 0, 0, 0, 0, 0, 0, 0);
938
938k
  const __m128i sfl1 =
939
938k
      _mm_setr_epi8(2, 3, 3, 4, 10, 11, 11, 12, 0, 0, 0, 0, 0, 0, 0, 0);
940
938k
  const __m128i s = load_u8_8x2_sse2(src, stride);
941
938k
  __m128i ss[2];
942
943
938k
  ss[0] = _mm_shuffle_epi8(s, sfl0);
944
938k
  ss[1] = _mm_shuffle_epi8(s, sfl1);
945
938k
  return convolve_4tap_ssse3(ss, coeffs);
946
938k
}
convolve_2d_avx2.c:x_convolve_4tap_2x2_ssse3
Line
Count
Source
935
792k
                                                const __m128i coeffs[2]) {
936
792k
  const __m128i sfl0 =
937
792k
      _mm_setr_epi8(0, 1, 1, 2, 8, 9, 9, 10, 0, 0, 0, 0, 0, 0, 0, 0);
938
792k
  const __m128i sfl1 =
939
792k
      _mm_setr_epi8(2, 3, 3, 4, 10, 11, 11, 12, 0, 0, 0, 0, 0, 0, 0, 0);
940
792k
  const __m128i s = load_u8_8x2_sse2(src, stride);
941
792k
  __m128i ss[2];
942
943
792k
  ss[0] = _mm_shuffle_epi8(s, sfl0);
944
792k
  ss[1] = _mm_shuffle_epi8(s, sfl1);
945
792k
  return convolve_4tap_ssse3(ss, coeffs);
946
792k
}
convolve_avx2.c:x_convolve_4tap_2x2_ssse3
Line
Count
Source
935
145k
                                                const __m128i coeffs[2]) {
936
145k
  const __m128i sfl0 =
937
145k
      _mm_setr_epi8(0, 1, 1, 2, 8, 9, 9, 10, 0, 0, 0, 0, 0, 0, 0, 0);
938
145k
  const __m128i sfl1 =
939
145k
      _mm_setr_epi8(2, 3, 3, 4, 10, 11, 11, 12, 0, 0, 0, 0, 0, 0, 0, 0);
940
145k
  const __m128i s = load_u8_8x2_sse2(src, stride);
941
145k
  __m128i ss[2];
942
943
145k
  ss[0] = _mm_shuffle_epi8(s, sfl0);
944
145k
  ss[1] = _mm_shuffle_epi8(s, sfl1);
945
145k
  return convolve_4tap_ssse3(ss, coeffs);
946
145k
}
947
948
static inline __m128i x_convolve_4tap_4x2_ssse3(const uint8_t *const src,
949
                                                const ptrdiff_t stride,
950
4.21M
                                                const __m128i coeffs[2]) {
951
4.21M
  const __m128i s = load_u8_8x2_sse2(src, stride);
952
4.21M
  const __m128i sfl0 =
953
4.21M
      _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12);
954
4.21M
  const __m128i sfl1 =
955
4.21M
      _mm_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 10, 11, 11, 12, 12, 13, 13, 14);
956
4.21M
  __m128i ss[2];
957
958
4.21M
  ss[0] = _mm_shuffle_epi8(s, sfl0);
959
4.21M
  ss[1] = _mm_shuffle_epi8(s, sfl1);
960
4.21M
  return convolve_4tap_ssse3(ss, coeffs);
961
4.21M
}
convolve_2d_avx2.c:x_convolve_4tap_4x2_ssse3
Line
Count
Source
950
3.40M
                                                const __m128i coeffs[2]) {
951
3.40M
  const __m128i s = load_u8_8x2_sse2(src, stride);
952
3.40M
  const __m128i sfl0 =
953
3.40M
      _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12);
954
3.40M
  const __m128i sfl1 =
955
3.40M
      _mm_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 10, 11, 11, 12, 12, 13, 13, 14);
956
3.40M
  __m128i ss[2];
957
958
3.40M
  ss[0] = _mm_shuffle_epi8(s, sfl0);
959
3.40M
  ss[1] = _mm_shuffle_epi8(s, sfl1);
960
3.40M
  return convolve_4tap_ssse3(ss, coeffs);
961
3.40M
}
convolve_avx2.c:x_convolve_4tap_4x2_ssse3
Line
Count
Source
950
803k
                                                const __m128i coeffs[2]) {
951
803k
  const __m128i s = load_u8_8x2_sse2(src, stride);
952
803k
  const __m128i sfl0 =
953
803k
      _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12);
954
803k
  const __m128i sfl1 =
955
803k
      _mm_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 10, 11, 11, 12, 12, 13, 13, 14);
956
803k
  __m128i ss[2];
957
958
803k
  ss[0] = _mm_shuffle_epi8(s, sfl0);
959
803k
  ss[1] = _mm_shuffle_epi8(s, sfl1);
960
803k
  return convolve_4tap_ssse3(ss, coeffs);
961
803k
}
962
963
static inline __m256i x_convolve_4tap_8x2_avx2(const uint8_t *const src,
964
                                               const ptrdiff_t stride,
965
                                               const __m256i coeffs[2],
966
513k
                                               const __m256i filt[2]) {
967
513k
  const __m256i s_256 = loadu_8bit_16x2_avx2(src, stride);
968
513k
  return x_convolve_4tap_avx2(s_256, coeffs, filt);
969
513k
}
convolve_2d_avx2.c:x_convolve_4tap_8x2_avx2
Line
Count
Source
966
513k
                                               const __m256i filt[2]) {
967
513k
  const __m256i s_256 = loadu_8bit_16x2_avx2(src, stride);
968
513k
  return x_convolve_4tap_avx2(s_256, coeffs, filt);
969
513k
}
Unexecuted instantiation: convolve_avx2.c:x_convolve_4tap_8x2_avx2
970
971
static inline void x_convolve_4tap_16x2_avx2(const uint8_t *const src,
972
                                             const int32_t src_stride,
973
                                             const __m256i coeffs[2],
974
                                             const __m256i filt[2],
975
149k
                                             __m256i r[2]) {
976
149k
  r[0] = x_convolve_4tap_8x2_avx2(src + 0, src_stride, coeffs, filt);
977
149k
  r[1] = x_convolve_4tap_8x2_avx2(src + 8, src_stride, coeffs, filt);
978
149k
}
convolve_2d_avx2.c:x_convolve_4tap_16x2_avx2
Line
Count
Source
975
149k
                                             __m256i r[2]) {
976
149k
  r[0] = x_convolve_4tap_8x2_avx2(src + 0, src_stride, coeffs, filt);
977
149k
  r[1] = x_convolve_4tap_8x2_avx2(src + 8, src_stride, coeffs, filt);
978
149k
}
Unexecuted instantiation: convolve_avx2.c:x_convolve_4tap_16x2_avx2
979
980
static inline void x_convolve_4tap_32_avx2(const uint8_t *const src,
981
                                           const __m256i coeffs[2],
982
                                           const __m256i filt[2],
983
595k
                                           __m256i r[2]) {
984
595k
  const __m256i s0_256 = _mm256_loadu_si256((__m256i *)src);
985
595k
  const __m256i s1_256 = _mm256_loadu_si256((__m256i *)(src + 8));
986
987
595k
  r[0] = x_convolve_4tap_avx2(s0_256, coeffs, filt);
988
595k
  r[1] = x_convolve_4tap_avx2(s1_256, coeffs, filt);
989
595k
}
convolve_2d_avx2.c:x_convolve_4tap_32_avx2
Line
Count
Source
983
595k
                                           __m256i r[2]) {
984
595k
  const __m256i s0_256 = _mm256_loadu_si256((__m256i *)src);
985
595k
  const __m256i s1_256 = _mm256_loadu_si256((__m256i *)(src + 8));
986
987
595k
  r[0] = x_convolve_4tap_avx2(s0_256, coeffs, filt);
988
595k
  r[1] = x_convolve_4tap_avx2(s1_256, coeffs, filt);
989
595k
}
Unexecuted instantiation: convolve_avx2.c:x_convolve_4tap_32_avx2
990
991
static inline __m128i x_convolve_6tap_2x2_ssse3(const uint8_t *const src,
992
                                                const ptrdiff_t stride,
993
0
                                                const __m128i coeffs[3]) {
994
0
  const __m128i sfl0 =
995
0
      _mm_setr_epi8(0, 1, 1, 2, 8, 9, 9, 10, 0, 0, 0, 0, 0, 0, 0, 0);
996
0
  const __m128i sfl1 =
997
0
      _mm_setr_epi8(2, 3, 3, 4, 10, 11, 11, 12, 0, 0, 0, 0, 0, 0, 0, 0);
998
0
  const __m128i sfl2 =
999
0
      _mm_setr_epi8(4, 5, 5, 6, 12, 13, 13, 14, 0, 0, 0, 0, 0, 0, 0, 0);
1000
1001
0
  const __m128i s = load_u8_8x2_sse2(src, stride);
1002
0
  __m128i ss[3];
1003
1004
0
  ss[0] = _mm_shuffle_epi8(s, sfl0);
1005
0
  ss[1] = _mm_shuffle_epi8(s, sfl1);
1006
0
  ss[2] = _mm_shuffle_epi8(s, sfl2);
1007
0
  return convolve_6tap_ssse3(ss, coeffs);
1008
0
}
Unexecuted instantiation: convolve_2d_avx2.c:x_convolve_6tap_2x2_ssse3
Unexecuted instantiation: convolve_avx2.c:x_convolve_6tap_2x2_ssse3
1009
1010
static inline __m128i x_convolve_6tap_4x2_ssse3(const uint8_t *const src,
1011
                                                const ptrdiff_t stride,
1012
0
                                                const __m128i coeffs[3]) {
1013
0
  const __m128i s = load_u8_8x2_sse2(src, stride);
1014
0
  const __m128i sfl0 =
1015
0
      _mm_setr_epi8(0, 1, 1, 2, 8, 9, 9, 10, 0, 0, 0, 0, 0, 0, 0, 0);
1016
0
  const __m128i sfl1 =
1017
0
      _mm_setr_epi8(2, 3, 3, 4, 10, 11, 11, 12, 0, 0, 0, 0, 0, 0, 0, 0);
1018
0
  const __m128i sfl2 =
1019
0
      _mm_setr_epi8(4, 5, 5, 6, 12, 13, 13, 14, 0, 0, 0, 0, 0, 0, 0, 0);
1020
0
  __m128i ss[3];
1021
1022
0
  ss[0] = _mm_shuffle_epi8(s, sfl0);
1023
0
  ss[1] = _mm_shuffle_epi8(s, sfl1);
1024
0
  ss[2] = _mm_shuffle_epi8(s, sfl2);
1025
0
  return convolve_6tap_ssse3(ss, coeffs);
1026
0
}
Unexecuted instantiation: convolve_2d_avx2.c:x_convolve_6tap_4x2_ssse3
Unexecuted instantiation: convolve_avx2.c:x_convolve_6tap_4x2_ssse3
1027
1028
static inline __m256i x_convolve_6tap_8x2_avx2(const uint8_t *const src,
1029
                                               const ptrdiff_t stride,
1030
                                               const __m256i coeffs[3],
1031
9.66M
                                               const __m256i filt[3]) {
1032
9.66M
  const __m256i s_256 = loadu_8bit_16x2_avx2(src, stride);
1033
9.66M
  return x_convolve_6tap_avx2(s_256, coeffs, filt);
1034
9.66M
}
convolve_2d_avx2.c:x_convolve_6tap_8x2_avx2
Line
Count
Source
1031
7.49M
                                               const __m256i filt[3]) {
1032
7.49M
  const __m256i s_256 = loadu_8bit_16x2_avx2(src, stride);
1033
7.49M
  return x_convolve_6tap_avx2(s_256, coeffs, filt);
1034
7.49M
}
convolve_avx2.c:x_convolve_6tap_8x2_avx2
Line
Count
Source
1031
2.17M
                                               const __m256i filt[3]) {
1032
2.17M
  const __m256i s_256 = loadu_8bit_16x2_avx2(src, stride);
1033
2.17M
  return x_convolve_6tap_avx2(s_256, coeffs, filt);
1034
2.17M
}
1035
1036
static inline void x_convolve_6tap_16x2_avx2(const uint8_t *const src,
1037
                                             const int32_t src_stride,
1038
                                             const __m256i coeffs[3],
1039
                                             const __m256i filt[3],
1040
2.83M
                                             __m256i r[2]) {
1041
2.83M
  r[0] = x_convolve_6tap_8x2_avx2(src + 0, src_stride, coeffs, filt);
1042
2.83M
  r[1] = x_convolve_6tap_8x2_avx2(src + 8, src_stride, coeffs, filt);
1043
2.83M
}
convolve_2d_avx2.c:x_convolve_6tap_16x2_avx2
Line
Count
Source
1040
2.15M
                                             __m256i r[2]) {
1041
2.15M
  r[0] = x_convolve_6tap_8x2_avx2(src + 0, src_stride, coeffs, filt);
1042
2.15M
  r[1] = x_convolve_6tap_8x2_avx2(src + 8, src_stride, coeffs, filt);
1043
2.15M
}
convolve_avx2.c:x_convolve_6tap_16x2_avx2
Line
Count
Source
1040
673k
                                             __m256i r[2]) {
1041
673k
  r[0] = x_convolve_6tap_8x2_avx2(src + 0, src_stride, coeffs, filt);
1042
673k
  r[1] = x_convolve_6tap_8x2_avx2(src + 8, src_stride, coeffs, filt);
1043
673k
}
1044
1045
static inline void x_convolve_6tap_32_avx2(const uint8_t *const src,
1046
                                           const __m256i coeffs[3],
1047
                                           const __m256i filt[3],
1048
5.59M
                                           __m256i r[2]) {
1049
5.59M
  const __m256i s0_256 = _mm256_loadu_si256((__m256i *)src);
1050
5.59M
  const __m256i s1_256 = _mm256_loadu_si256((__m256i *)(src + 8));
1051
1052
5.59M
  r[0] = x_convolve_6tap_avx2(s0_256, coeffs, filt);
1053
5.59M
  r[1] = x_convolve_6tap_avx2(s1_256, coeffs, filt);
1054
5.59M
}
convolve_2d_avx2.c:x_convolve_6tap_32_avx2
Line
Count
Source
1048
4.02M
                                           __m256i r[2]) {
1049
4.02M
  const __m256i s0_256 = _mm256_loadu_si256((__m256i *)src);
1050
4.02M
  const __m256i s1_256 = _mm256_loadu_si256((__m256i *)(src + 8));
1051
1052
4.02M
  r[0] = x_convolve_6tap_avx2(s0_256, coeffs, filt);
1053
4.02M
  r[1] = x_convolve_6tap_avx2(s1_256, coeffs, filt);
1054
4.02M
}
convolve_avx2.c:x_convolve_6tap_32_avx2
Line
Count
Source
1048
1.57M
                                           __m256i r[2]) {
1049
1.57M
  const __m256i s0_256 = _mm256_loadu_si256((__m256i *)src);
1050
1.57M
  const __m256i s1_256 = _mm256_loadu_si256((__m256i *)(src + 8));
1051
1052
1.57M
  r[0] = x_convolve_6tap_avx2(s0_256, coeffs, filt);
1053
1.57M
  r[1] = x_convolve_6tap_avx2(s1_256, coeffs, filt);
1054
1.57M
}
1055
1056
static inline __m256i x_convolve_8tap_8x2_avx2(const uint8_t *const src,
1057
                                               const ptrdiff_t stride,
1058
                                               const __m256i coeffs[4],
1059
407k
                                               const __m256i filt[4]) {
1060
407k
  const __m256i s_256 = loadu_8bit_16x2_avx2(src, stride);
1061
407k
  return x_convolve_8tap_avx2(s_256, coeffs, filt);
1062
407k
}
convolve_2d_avx2.c:x_convolve_8tap_8x2_avx2
Line
Count
Source
1059
323k
                                               const __m256i filt[4]) {
1060
323k
  const __m256i s_256 = loadu_8bit_16x2_avx2(src, stride);
1061
323k
  return x_convolve_8tap_avx2(s_256, coeffs, filt);
1062
323k
}
convolve_avx2.c:x_convolve_8tap_8x2_avx2
Line
Count
Source
1059
84.3k
                                               const __m256i filt[4]) {
1060
84.3k
  const __m256i s_256 = loadu_8bit_16x2_avx2(src, stride);
1061
84.3k
  return x_convolve_8tap_avx2(s_256, coeffs, filt);
1062
84.3k
}
1063
1064
static AOM_FORCE_INLINE void x_convolve_8tap_16x2_avx2(const uint8_t *const src,
1065
                                                       const int32_t src_stride,
1066
                                                       const __m256i coeffs[4],
1067
                                                       const __m256i filt[4],
1068
118k
                                                       __m256i r[2]) {
1069
118k
  r[0] = x_convolve_8tap_8x2_avx2(src + 0, src_stride, coeffs, filt);
1070
118k
  r[1] = x_convolve_8tap_8x2_avx2(src + 8, src_stride, coeffs, filt);
1071
118k
}
convolve_2d_avx2.c:x_convolve_8tap_16x2_avx2
Line
Count
Source
1068
92.6k
                                                       __m256i r[2]) {
1069
92.6k
  r[0] = x_convolve_8tap_8x2_avx2(src + 0, src_stride, coeffs, filt);
1070
92.6k
  r[1] = x_convolve_8tap_8x2_avx2(src + 8, src_stride, coeffs, filt);
1071
92.6k
}
convolve_avx2.c:x_convolve_8tap_16x2_avx2
Line
Count
Source
1068
25.9k
                                                       __m256i r[2]) {
1069
25.9k
  r[0] = x_convolve_8tap_8x2_avx2(src + 0, src_stride, coeffs, filt);
1070
25.9k
  r[1] = x_convolve_8tap_8x2_avx2(src + 8, src_stride, coeffs, filt);
1071
25.9k
}
1072
1073
static AOM_FORCE_INLINE void x_convolve_8tap_32_avx2(const uint8_t *const src,
1074
                                                     const __m256i coeffs[4],
1075
                                                     const __m256i filt[4],
1076
1.71M
                                                     __m256i r[2]) {
1077
1.71M
  const __m256i s0_256 = _mm256_loadu_si256((__m256i *)src);
1078
1.71M
  const __m256i s1_256 = _mm256_loadu_si256((__m256i *)(src + 8));
1079
1080
1.71M
  r[0] = x_convolve_8tap_avx2(s0_256, coeffs, filt);
1081
1.71M
  r[1] = x_convolve_8tap_avx2(s1_256, coeffs, filt);
1082
1.71M
}
convolve_2d_avx2.c:x_convolve_8tap_32_avx2
Line
Count
Source
1076
1.53M
                                                     __m256i r[2]) {
1077
1.53M
  const __m256i s0_256 = _mm256_loadu_si256((__m256i *)src);
1078
1.53M
  const __m256i s1_256 = _mm256_loadu_si256((__m256i *)(src + 8));
1079
1080
1.53M
  r[0] = x_convolve_8tap_avx2(s0_256, coeffs, filt);
1081
1.53M
  r[1] = x_convolve_8tap_avx2(s1_256, coeffs, filt);
1082
1.53M
}
convolve_avx2.c:x_convolve_8tap_32_avx2
Line
Count
Source
1076
179k
                                                     __m256i r[2]) {
1077
179k
  const __m256i s0_256 = _mm256_loadu_si256((__m256i *)src);
1078
179k
  const __m256i s1_256 = _mm256_loadu_si256((__m256i *)(src + 8));
1079
1080
179k
  r[0] = x_convolve_8tap_avx2(s0_256, coeffs, filt);
1081
179k
  r[1] = x_convolve_8tap_avx2(s1_256, coeffs, filt);
1082
179k
}
1083
1084
static inline __m128i y_convolve_2tap_2x2_ssse3(const uint8_t *const src,
1085
                                                const ptrdiff_t stride,
1086
                                                const __m128i coeffs[1],
1087
3.45k
                                                __m128i s_16[2]) {
1088
3.45k
  __m128i s_128[2];
1089
1090
3.45k
  s_16[1] = _mm_cvtsi32_si128(*(int16_t *)(src + stride));
1091
3.45k
  s_128[0] = _mm_unpacklo_epi16(s_16[0], s_16[1]);
1092
3.45k
  s_16[0] = _mm_cvtsi32_si128(*(int16_t *)(src + 2 * stride));
1093
3.45k
  s_128[1] = _mm_unpacklo_epi16(s_16[1], s_16[0]);
1094
3.45k
  const __m128i ss = _mm_unpacklo_epi8(s_128[0], s_128[1]);
1095
3.45k
  return convolve_2tap_ssse3(&ss, coeffs);
1096
3.45k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_2tap_2x2_ssse3
convolve_avx2.c:y_convolve_2tap_2x2_ssse3
Line
Count
Source
1087
3.45k
                                                __m128i s_16[2]) {
1088
3.45k
  __m128i s_128[2];
1089
1090
3.45k
  s_16[1] = _mm_cvtsi32_si128(*(int16_t *)(src + stride));
1091
3.45k
  s_128[0] = _mm_unpacklo_epi16(s_16[0], s_16[1]);
1092
3.45k
  s_16[0] = _mm_cvtsi32_si128(*(int16_t *)(src + 2 * stride));
1093
3.45k
  s_128[1] = _mm_unpacklo_epi16(s_16[1], s_16[0]);
1094
3.45k
  const __m128i ss = _mm_unpacklo_epi8(s_128[0], s_128[1]);
1095
3.45k
  return convolve_2tap_ssse3(&ss, coeffs);
1096
3.45k
}
1097
1098
static inline __m128i y_convolve_2tap_4x2_ssse3(const uint8_t *const src,
1099
                                                const ptrdiff_t stride,
1100
                                                const __m128i coeffs[1],
1101
14.2k
                                                __m128i s_32[2]) {
1102
14.2k
  __m128i s_128[2];
1103
1104
14.2k
  s_32[1] = _mm_cvtsi32_si128(loadu_int32(src + stride));
1105
14.2k
  s_128[0] = _mm_unpacklo_epi32(s_32[0], s_32[1]);
1106
14.2k
  s_32[0] = _mm_cvtsi32_si128(loadu_int32(src + 2 * stride));
1107
14.2k
  s_128[1] = _mm_unpacklo_epi32(s_32[1], s_32[0]);
1108
14.2k
  const __m128i ss = _mm_unpacklo_epi8(s_128[0], s_128[1]);
1109
14.2k
  return convolve_2tap_ssse3(&ss, coeffs);
1110
14.2k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_2tap_4x2_ssse3
convolve_avx2.c:y_convolve_2tap_4x2_ssse3
Line
Count
Source
1101
14.2k
                                                __m128i s_32[2]) {
1102
14.2k
  __m128i s_128[2];
1103
1104
14.2k
  s_32[1] = _mm_cvtsi32_si128(loadu_int32(src + stride));
1105
14.2k
  s_128[0] = _mm_unpacklo_epi32(s_32[0], s_32[1]);
1106
14.2k
  s_32[0] = _mm_cvtsi32_si128(loadu_int32(src + 2 * stride));
1107
14.2k
  s_128[1] = _mm_unpacklo_epi32(s_32[1], s_32[0]);
1108
14.2k
  const __m128i ss = _mm_unpacklo_epi8(s_128[0], s_128[1]);
1109
14.2k
  return convolve_2tap_ssse3(&ss, coeffs);
1110
14.2k
}
1111
1112
static inline __m256i y_convolve_2tap_8x2_avx2(const uint8_t *const src,
1113
                                               const ptrdiff_t stride,
1114
                                               const __m256i coeffs[1],
1115
0
                                               __m128i s_64[2]) {
1116
0
  __m256i s_256[2];
1117
0
1118
0
  s_64[1] = _mm_loadl_epi64((__m128i *)(src + stride));
1119
0
  s_256[0] = _mm256_setr_m128i(s_64[0], s_64[1]);
1120
0
  s_64[0] = _mm_loadl_epi64((__m128i *)(src + 2 * stride));
1121
0
  s_256[1] = _mm256_setr_m128i(s_64[1], s_64[0]);
1122
0
  const __m256i ss = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
1123
0
  return convolve_2tap_avx2(&ss, coeffs);
1124
0
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_2tap_8x2_avx2
Unexecuted instantiation: convolve_avx2.c:y_convolve_2tap_8x2_avx2
1125
1126
static inline void y_convolve_2tap_16x2_avx2(const uint8_t *const src,
1127
                                             const ptrdiff_t stride,
1128
                                             const __m256i coeffs[1],
1129
13.9k
                                             __m128i s_128[2], __m256i r[2]) {
1130
13.9k
  __m256i s_256[2];
1131
1132
13.9k
  s_128[1] = _mm_loadu_si128((__m128i *)(src + stride));
1133
13.9k
  s_256[0] = _mm256_setr_m128i(s_128[0], s_128[1]);
1134
13.9k
  s_128[0] = _mm_loadu_si128((__m128i *)(src + 2 * stride));
1135
13.9k
  s_256[1] = _mm256_setr_m128i(s_128[1], s_128[0]);
1136
13.9k
  const __m256i ss0 = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
1137
13.9k
  const __m256i ss1 = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
1138
13.9k
  r[0] = convolve_2tap_avx2(&ss0, coeffs);
1139
13.9k
  r[1] = convolve_2tap_avx2(&ss1, coeffs);
1140
13.9k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_2tap_16x2_avx2
convolve_avx2.c:y_convolve_2tap_16x2_avx2
Line
Count
Source
1129
13.9k
                                             __m128i s_128[2], __m256i r[2]) {
1130
13.9k
  __m256i s_256[2];
1131
1132
13.9k
  s_128[1] = _mm_loadu_si128((__m128i *)(src + stride));
1133
13.9k
  s_256[0] = _mm256_setr_m128i(s_128[0], s_128[1]);
1134
13.9k
  s_128[0] = _mm_loadu_si128((__m128i *)(src + 2 * stride));
1135
  s_256[1] = _mm256_setr_m128i(s_128[1], s_128[0]);
1136
13.9k
  const __m256i ss0 = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
1137
13.9k
  const __m256i ss1 = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
1138
13.9k
  r[0] = convolve_2tap_avx2(&ss0, coeffs);
1139
13.9k
  r[1] = convolve_2tap_avx2(&ss1, coeffs);
1140
13.9k
}
1141
1142
static inline void y_convolve_2tap_32_avx2(const uint8_t *const src,
1143
                                           const __m256i coeffs[1],
1144
                                           const __m256i s0, __m256i *const s1,
1145
116k
                                           __m256i r[2]) {
1146
116k
  *s1 = _mm256_loadu_si256((__m256i *)src);
1147
116k
  const __m256i ss0 = _mm256_unpacklo_epi8(s0, *s1);
1148
116k
  const __m256i ss1 = _mm256_unpackhi_epi8(s0, *s1);
1149
116k
  r[0] = convolve_2tap_avx2(&ss0, coeffs);
1150
116k
  r[1] = convolve_2tap_avx2(&ss1, coeffs);
1151
116k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_2tap_32_avx2
convolve_avx2.c:y_convolve_2tap_32_avx2
Line
Count
Source
1145
116k
                                           __m256i r[2]) {
1146
116k
  *s1 = _mm256_loadu_si256((__m256i *)src);
1147
116k
  const __m256i ss0 = _mm256_unpacklo_epi8(s0, *s1);
1148
116k
  const __m256i ss1 = _mm256_unpackhi_epi8(s0, *s1);
1149
116k
  r[0] = convolve_2tap_avx2(&ss0, coeffs);
1150
116k
  r[1] = convolve_2tap_avx2(&ss1, coeffs);
1151
116k
}
1152
1153
static inline __m128i y_convolve_4tap_2x2_ssse3(const uint8_t *const src,
1154
                                                const ptrdiff_t stride,
1155
                                                const __m128i coeffs[2],
1156
                                                __m128i s_16[4],
1157
47.3k
                                                __m128i ss_128[2]) {
1158
47.3k
  s_16[3] = _mm_cvtsi32_si128(loadu_int16(src + stride));
1159
47.3k
  const __m128i src23 = _mm_unpacklo_epi16(s_16[2], s_16[3]);
1160
47.3k
  s_16[2] = _mm_cvtsi32_si128(loadu_int16(src + 2 * stride));
1161
47.3k
  const __m128i src34 = _mm_unpacklo_epi16(s_16[3], s_16[2]);
1162
47.3k
  ss_128[1] = _mm_unpacklo_epi8(src23, src34);
1163
47.3k
  return convolve_4tap_ssse3(ss_128, coeffs);
1164
47.3k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_4tap_2x2_ssse3
convolve_avx2.c:y_convolve_4tap_2x2_ssse3
Line
Count
Source
1157
47.3k
                                                __m128i ss_128[2]) {
1158
47.3k
  s_16[3] = _mm_cvtsi32_si128(loadu_int16(src + stride));
1159
47.3k
  const __m128i src23 = _mm_unpacklo_epi16(s_16[2], s_16[3]);
1160
47.3k
  s_16[2] = _mm_cvtsi32_si128(loadu_int16(src + 2 * stride));
1161
47.3k
  const __m128i src34 = _mm_unpacklo_epi16(s_16[3], s_16[2]);
1162
47.3k
  ss_128[1] = _mm_unpacklo_epi8(src23, src34);
1163
47.3k
  return convolve_4tap_ssse3(ss_128, coeffs);
1164
47.3k
}
1165
1166
static inline __m128i y_convolve_4tap_4x2_ssse3(const uint8_t *const src,
1167
                                                const ptrdiff_t stride,
1168
                                                const __m128i coeffs[2],
1169
                                                __m128i s_32[4],
1170
249k
                                                __m128i ss_128[2]) {
1171
249k
  s_32[3] = _mm_cvtsi32_si128(loadu_int32(src + stride));
1172
249k
  const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
1173
249k
  s_32[2] = _mm_cvtsi32_si128(loadu_int32(src + 2 * stride));
1174
249k
  const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[2]);
1175
249k
  ss_128[1] = _mm_unpacklo_epi8(src23, src34);
1176
249k
  return convolve_4tap_ssse3(ss_128, coeffs);
1177
249k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_4tap_4x2_ssse3
convolve_avx2.c:y_convolve_4tap_4x2_ssse3
Line
Count
Source
1170
249k
                                                __m128i ss_128[2]) {
1171
249k
  s_32[3] = _mm_cvtsi32_si128(loadu_int32(src + stride));
1172
249k
  const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
1173
249k
  s_32[2] = _mm_cvtsi32_si128(loadu_int32(src + 2 * stride));
1174
249k
  const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[2]);
1175
249k
  ss_128[1] = _mm_unpacklo_epi8(src23, src34);
1176
249k
  return convolve_4tap_ssse3(ss_128, coeffs);
1177
249k
}
1178
1179
static inline __m256i y_convolve_4tap_8x2_avx2(const uint8_t *const src,
1180
                                               const ptrdiff_t stride,
1181
                                               const __m256i coeffs[2],
1182
                                               __m128i s_64[4],
1183
212k
                                               __m256i ss_256[2]) {
1184
212k
  s_64[3] = _mm_loadl_epi64((__m128i *)(src + stride));
1185
212k
  const __m256i src23 = _mm256_setr_m128i(s_64[2], s_64[3]);
1186
212k
  s_64[2] = _mm_loadl_epi64((__m128i *)(src + 2 * stride));
1187
212k
  const __m256i src34 = _mm256_setr_m128i(s_64[3], s_64[2]);
1188
212k
  ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
1189
212k
  return convolve_4tap_avx2(ss_256, coeffs);
1190
212k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_4tap_8x2_avx2
convolve_avx2.c:y_convolve_4tap_8x2_avx2
Line
Count
Source
1183
212k
                                               __m256i ss_256[2]) {
1184
212k
  s_64[3] = _mm_loadl_epi64((__m128i *)(src + stride));
1185
212k
  const __m256i src23 = _mm256_setr_m128i(s_64[2], s_64[3]);
1186
212k
  s_64[2] = _mm_loadl_epi64((__m128i *)(src + 2 * stride));
1187
  const __m256i src34 = _mm256_setr_m128i(s_64[3], s_64[2]);
1188
212k
  ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
1189
212k
  return convolve_4tap_avx2(ss_256, coeffs);
1190
212k
}
1191
1192
static inline void y_convolve_4tap_16x2_avx2(const uint8_t *const src,
1193
                                             const ptrdiff_t stride,
1194
                                             const __m256i coeffs[2],
1195
                                             __m128i s_128[4],
1196
132k
                                             __m256i ss_256[4], __m256i r[2]) {
1197
132k
  s_128[3] = _mm_loadu_si128((__m128i *)(src + stride));
1198
132k
  const __m256i src23 = _mm256_setr_m128i(s_128[2], s_128[3]);
1199
132k
  s_128[2] = _mm_loadu_si128((__m128i *)(src + 2 * stride));
1200
132k
  const __m256i src34 = _mm256_setr_m128i(s_128[3], s_128[2]);
1201
132k
  ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
1202
132k
  ss_256[3] = _mm256_unpackhi_epi8(src23, src34);
1203
132k
  r[0] = convolve_4tap_avx2(ss_256, coeffs);
1204
132k
  r[1] = convolve_4tap_avx2(ss_256 + 2, coeffs);
1205
132k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_4tap_16x2_avx2
convolve_avx2.c:y_convolve_4tap_16x2_avx2
Line
Count
Source
1196
132k
                                             __m256i ss_256[4], __m256i r[2]) {
1197
132k
  s_128[3] = _mm_loadu_si128((__m128i *)(src + stride));
1198
132k
  const __m256i src23 = _mm256_setr_m128i(s_128[2], s_128[3]);
1199
132k
  s_128[2] = _mm_loadu_si128((__m128i *)(src + 2 * stride));
1200
  const __m256i src34 = _mm256_setr_m128i(s_128[3], s_128[2]);
1201
132k
  ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
1202
132k
  ss_256[3] = _mm256_unpackhi_epi8(src23, src34);
1203
132k
  r[0] = convolve_4tap_avx2(ss_256, coeffs);
1204
132k
  r[1] = convolve_4tap_avx2(ss_256 + 2, coeffs);
1205
132k
}
1206
1207
static inline __m128i y_convolve_6tap_2x2_ssse3(const uint8_t *const src,
1208
                                                const ptrdiff_t stride,
1209
                                                const __m128i coeffs[3],
1210
                                                __m128i s_16[6],
1211
65.9k
                                                __m128i ss_128[3]) {
1212
65.9k
  s_16[5] = _mm_cvtsi32_si128(loadu_int16(src + 3 * stride));
1213
65.9k
  const __m128i src45 = _mm_unpacklo_epi16(s_16[4], s_16[5]);
1214
65.9k
  s_16[4] = _mm_cvtsi32_si128(loadu_int16(src + 4 * stride));
1215
65.9k
  const __m128i src56 = _mm_unpacklo_epi16(s_16[5], s_16[4]);
1216
65.9k
  ss_128[2] = _mm_unpacklo_epi8(src45, src56);
1217
65.9k
  return convolve_6tap_ssse3(ss_128, coeffs);
1218
65.9k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_6tap_2x2_ssse3
convolve_avx2.c:y_convolve_6tap_2x2_ssse3
Line
Count
Source
1211
65.9k
                                                __m128i ss_128[3]) {
1212
65.9k
  s_16[5] = _mm_cvtsi32_si128(loadu_int16(src + 3 * stride));
1213
65.9k
  const __m128i src45 = _mm_unpacklo_epi16(s_16[4], s_16[5]);
1214
65.9k
  s_16[4] = _mm_cvtsi32_si128(loadu_int16(src + 4 * stride));
1215
65.9k
  const __m128i src56 = _mm_unpacklo_epi16(s_16[5], s_16[4]);
1216
65.9k
  ss_128[2] = _mm_unpacklo_epi8(src45, src56);
1217
65.9k
  return convolve_6tap_ssse3(ss_128, coeffs);
1218
65.9k
}
1219
1220
static inline void y_convolve_4tap_32x2_avx2(
1221
    const uint8_t *const src, const ptrdiff_t stride, const __m256i coeffs[2],
1222
180k
    __m256i s_256[4], __m256i ss_256[4], __m256i tt_256[4], __m256i r[4]) {
1223
180k
  s_256[3] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
1224
180k
  ss_256[1] = _mm256_unpacklo_epi8(s_256[2], s_256[3]);
1225
180k
  ss_256[3] = _mm256_unpackhi_epi8(s_256[2], s_256[3]);
1226
180k
  s_256[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
1227
180k
  tt_256[1] = _mm256_unpacklo_epi8(s_256[3], s_256[2]);
1228
180k
  tt_256[3] = _mm256_unpackhi_epi8(s_256[3], s_256[2]);
1229
180k
  r[0] = convolve_4tap_avx2(ss_256 + 0, coeffs);
1230
180k
  r[1] = convolve_4tap_avx2(ss_256 + 2, coeffs);
1231
180k
  r[2] = convolve_4tap_avx2(tt_256 + 0, coeffs);
1232
180k
  r[3] = convolve_4tap_avx2(tt_256 + 2, coeffs);
1233
180k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_4tap_32x2_avx2
convolve_avx2.c:y_convolve_4tap_32x2_avx2
Line
Count
Source
1222
180k
    __m256i s_256[4], __m256i ss_256[4], __m256i tt_256[4], __m256i r[4]) {
1223
180k
  s_256[3] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
1224
180k
  ss_256[1] = _mm256_unpacklo_epi8(s_256[2], s_256[3]);
1225
180k
  ss_256[3] = _mm256_unpackhi_epi8(s_256[2], s_256[3]);
1226
180k
  s_256[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
1227
180k
  tt_256[1] = _mm256_unpacklo_epi8(s_256[3], s_256[2]);
1228
180k
  tt_256[3] = _mm256_unpackhi_epi8(s_256[3], s_256[2]);
1229
180k
  r[0] = convolve_4tap_avx2(ss_256 + 0, coeffs);
1230
180k
  r[1] = convolve_4tap_avx2(ss_256 + 2, coeffs);
1231
180k
  r[2] = convolve_4tap_avx2(tt_256 + 0, coeffs);
1232
180k
  r[3] = convolve_4tap_avx2(tt_256 + 2, coeffs);
1233
180k
}
1234
1235
static inline __m128i y_convolve_6tap_4x2_ssse3(const uint8_t *const src,
1236
                                                const ptrdiff_t stride,
1237
                                                const __m128i coeffs[3],
1238
                                                __m128i s_32[6],
1239
384k
                                                __m128i ss_128[3]) {
1240
384k
  s_32[5] = _mm_cvtsi32_si128(loadu_int32(src + 3 * stride));
1241
384k
  const __m128i src45 = _mm_unpacklo_epi32(s_32[4], s_32[5]);
1242
384k
  s_32[4] = _mm_cvtsi32_si128(loadu_int32(src + 4 * stride));
1243
384k
  const __m128i src56 = _mm_unpacklo_epi32(s_32[5], s_32[4]);
1244
384k
  ss_128[2] = _mm_unpacklo_epi8(src45, src56);
1245
384k
  return convolve_6tap_ssse3(ss_128, coeffs);
1246
384k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_6tap_4x2_ssse3
convolve_avx2.c:y_convolve_6tap_4x2_ssse3
Line
Count
Source
1239
384k
                                                __m128i ss_128[3]) {
1240
384k
  s_32[5] = _mm_cvtsi32_si128(loadu_int32(src + 3 * stride));
1241
384k
  const __m128i src45 = _mm_unpacklo_epi32(s_32[4], s_32[5]);
1242
384k
  s_32[4] = _mm_cvtsi32_si128(loadu_int32(src + 4 * stride));
1243
384k
  const __m128i src56 = _mm_unpacklo_epi32(s_32[5], s_32[4]);
1244
384k
  ss_128[2] = _mm_unpacklo_epi8(src45, src56);
1245
384k
  return convolve_6tap_ssse3(ss_128, coeffs);
1246
384k
}
1247
1248
static inline __m256i y_convolve_6tap_8x2_avx2(const uint8_t *const src,
1249
                                               const ptrdiff_t stride,
1250
                                               const __m256i coeffs[3],
1251
                                               __m128i s_64[6],
1252
530k
                                               __m256i ss_256[3]) {
1253
530k
  s_64[5] = _mm_loadl_epi64((__m128i *)(src + 3 * stride));
1254
530k
  const __m256i src45 = _mm256_setr_m128i(s_64[4], s_64[5]);
1255
530k
  s_64[4] = _mm_loadl_epi64((__m128i *)(src + 4 * stride));
1256
530k
  const __m256i src56 = _mm256_setr_m128i(s_64[5], s_64[4]);
1257
530k
  ss_256[2] = _mm256_unpacklo_epi8(src45, src56);
1258
530k
  return convolve_6tap_avx2(ss_256, coeffs);
1259
530k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_6tap_8x2_avx2
convolve_avx2.c:y_convolve_6tap_8x2_avx2
Line
Count
Source
1252
530k
                                               __m256i ss_256[3]) {
1253
530k
  s_64[5] = _mm_loadl_epi64((__m128i *)(src + 3 * stride));
1254
530k
  const __m256i src45 = _mm256_setr_m128i(s_64[4], s_64[5]);
1255
530k
  s_64[4] = _mm_loadl_epi64((__m128i *)(src + 4 * stride));
1256
  const __m256i src56 = _mm256_setr_m128i(s_64[5], s_64[4]);
1257
530k
  ss_256[2] = _mm256_unpacklo_epi8(src45, src56);
1258
530k
  return convolve_6tap_avx2(ss_256, coeffs);
1259
530k
}
1260
1261
static inline void y_convolve_6tap_16x2_avx2(const uint8_t *const src,
1262
                                             const ptrdiff_t stride,
1263
                                             const __m256i coeffs[3],
1264
                                             __m128i s_128[6],
1265
509k
                                             __m256i ss_256[6], __m256i r[2]) {
1266
509k
  s_128[5] = _mm_loadu_si128((__m128i *)(src + 3 * stride));
1267
509k
  const __m256i src45 = _mm256_setr_m128i(s_128[4], s_128[5]);
1268
509k
  s_128[4] = _mm_loadu_si128((__m128i *)(src + 4 * stride));
1269
509k
  const __m256i src56 = _mm256_setr_m128i(s_128[5], s_128[4]);
1270
509k
  ss_256[2] = _mm256_unpacklo_epi8(src45, src56);
1271
509k
  ss_256[5] = _mm256_unpackhi_epi8(src45, src56);
1272
509k
  r[0] = convolve_6tap_avx2(ss_256, coeffs);
1273
509k
  r[1] = convolve_6tap_avx2(ss_256 + 3, coeffs);
1274
509k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_6tap_16x2_avx2
convolve_avx2.c:y_convolve_6tap_16x2_avx2
Line
Count
Source
1265
509k
                                             __m256i ss_256[6], __m256i r[2]) {
1266
509k
  s_128[5] = _mm_loadu_si128((__m128i *)(src + 3 * stride));
1267
509k
  const __m256i src45 = _mm256_setr_m128i(s_128[4], s_128[5]);
1268
509k
  s_128[4] = _mm_loadu_si128((__m128i *)(src + 4 * stride));
1269
  const __m256i src56 = _mm256_setr_m128i(s_128[5], s_128[4]);
1270
509k
  ss_256[2] = _mm256_unpacklo_epi8(src45, src56);
1271
509k
  ss_256[5] = _mm256_unpackhi_epi8(src45, src56);
1272
509k
  r[0] = convolve_6tap_avx2(ss_256, coeffs);
1273
509k
  r[1] = convolve_6tap_avx2(ss_256 + 3, coeffs);
1274
509k
}
1275
1276
static inline void y_convolve_6tap_32x2_avx2(
1277
    const uint8_t *const src, const ptrdiff_t stride, const __m256i coeffs[3],
1278
673k
    __m256i s_256[6], __m256i ss_256[6], __m256i tt_256[6], __m256i r[4]) {
1279
673k
  s_256[5] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
1280
673k
  ss_256[2] = _mm256_unpacklo_epi8(s_256[4], s_256[5]);
1281
673k
  ss_256[5] = _mm256_unpackhi_epi8(s_256[4], s_256[5]);
1282
673k
  s_256[4] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
1283
673k
  tt_256[2] = _mm256_unpacklo_epi8(s_256[5], s_256[4]);
1284
673k
  tt_256[5] = _mm256_unpackhi_epi8(s_256[5], s_256[4]);
1285
673k
  r[0] = convolve_6tap_avx2(ss_256 + 0, coeffs);
1286
673k
  r[1] = convolve_6tap_avx2(ss_256 + 3, coeffs);
1287
673k
  r[2] = convolve_6tap_avx2(tt_256 + 0, coeffs);
1288
673k
  r[3] = convolve_6tap_avx2(tt_256 + 3, coeffs);
1289
673k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_6tap_32x2_avx2
convolve_avx2.c:y_convolve_6tap_32x2_avx2
Line
Count
Source
1278
673k
    __m256i s_256[6], __m256i ss_256[6], __m256i tt_256[6], __m256i r[4]) {
1279
673k
  s_256[5] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
1280
673k
  ss_256[2] = _mm256_unpacklo_epi8(s_256[4], s_256[5]);
1281
673k
  ss_256[5] = _mm256_unpackhi_epi8(s_256[4], s_256[5]);
1282
673k
  s_256[4] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
1283
673k
  tt_256[2] = _mm256_unpacklo_epi8(s_256[5], s_256[4]);
1284
673k
  tt_256[5] = _mm256_unpackhi_epi8(s_256[5], s_256[4]);
1285
673k
  r[0] = convolve_6tap_avx2(ss_256 + 0, coeffs);
1286
673k
  r[1] = convolve_6tap_avx2(ss_256 + 3, coeffs);
1287
673k
  r[2] = convolve_6tap_avx2(tt_256 + 0, coeffs);
1288
673k
  r[3] = convolve_6tap_avx2(tt_256 + 3, coeffs);
1289
673k
}
1290
1291
static inline __m128i y_convolve_8tap_2x2_ssse3(const uint8_t *const src,
1292
                                                const ptrdiff_t stride,
1293
                                                const __m128i coeffs[4],
1294
                                                __m128i s_16[8],
1295
6.39k
                                                __m128i ss_128[4]) {
1296
6.39k
  s_16[7] = _mm_cvtsi32_si128(loadu_int16(src + 7 * stride));
1297
6.39k
  const __m128i src67 = _mm_unpacklo_epi16(s_16[6], s_16[7]);
1298
6.39k
  s_16[6] = _mm_cvtsi32_si128(loadu_int16(src + 8 * stride));
1299
6.39k
  const __m128i src78 = _mm_unpacklo_epi16(s_16[7], s_16[6]);
1300
6.39k
  ss_128[3] = _mm_unpacklo_epi8(src67, src78);
1301
6.39k
  return convolve_8tap_ssse3(ss_128, coeffs);
1302
6.39k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_8tap_2x2_ssse3
convolve_avx2.c:y_convolve_8tap_2x2_ssse3
Line
Count
Source
1295
6.39k
                                                __m128i ss_128[4]) {
1296
6.39k
  s_16[7] = _mm_cvtsi32_si128(loadu_int16(src + 7 * stride));
1297
6.39k
  const __m128i src67 = _mm_unpacklo_epi16(s_16[6], s_16[7]);
1298
6.39k
  s_16[6] = _mm_cvtsi32_si128(loadu_int16(src + 8 * stride));
1299
6.39k
  const __m128i src78 = _mm_unpacklo_epi16(s_16[7], s_16[6]);
1300
6.39k
  ss_128[3] = _mm_unpacklo_epi8(src67, src78);
1301
6.39k
  return convolve_8tap_ssse3(ss_128, coeffs);
1302
6.39k
}
1303
1304
static inline __m128i y_convolve_8tap_4x2_ssse3(const uint8_t *const src,
1305
                                                const ptrdiff_t stride,
1306
                                                const __m128i coeffs[4],
1307
                                                __m128i s_32[8],
1308
26.2k
                                                __m128i ss_128[4]) {
1309
26.2k
  s_32[7] = _mm_cvtsi32_si128(loadu_int32(src + 7 * stride));
1310
26.2k
  const __m128i src67 = _mm_unpacklo_epi32(s_32[6], s_32[7]);
1311
26.2k
  s_32[6] = _mm_cvtsi32_si128(loadu_int32(src + 8 * stride));
1312
26.2k
  const __m128i src78 = _mm_unpacklo_epi32(s_32[7], s_32[6]);
1313
26.2k
  ss_128[3] = _mm_unpacklo_epi8(src67, src78);
1314
26.2k
  return convolve_8tap_ssse3(ss_128, coeffs);
1315
26.2k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_8tap_4x2_ssse3
convolve_avx2.c:y_convolve_8tap_4x2_ssse3
Line
Count
Source
1308
26.2k
                                                __m128i ss_128[4]) {
1309
26.2k
  s_32[7] = _mm_cvtsi32_si128(loadu_int32(src + 7 * stride));
1310
26.2k
  const __m128i src67 = _mm_unpacklo_epi32(s_32[6], s_32[7]);
1311
26.2k
  s_32[6] = _mm_cvtsi32_si128(loadu_int32(src + 8 * stride));
1312
26.2k
  const __m128i src78 = _mm_unpacklo_epi32(s_32[7], s_32[6]);
1313
26.2k
  ss_128[3] = _mm_unpacklo_epi8(src67, src78);
1314
26.2k
  return convolve_8tap_ssse3(ss_128, coeffs);
1315
26.2k
}
1316
1317
static inline __m256i y_convolve_8tap_8x2_avx2(const uint8_t *const src,
1318
                                               const ptrdiff_t stride,
1319
                                               const __m256i coeffs[4],
1320
                                               __m128i s_64[8],
1321
24.8k
                                               __m256i ss_256[4]) {
1322
24.8k
  s_64[7] = _mm_loadl_epi64((__m128i *)(src + 7 * stride));
1323
24.8k
  const __m256i src67 = _mm256_setr_m128i(s_64[6], s_64[7]);
1324
24.8k
  s_64[6] = _mm_loadl_epi64((__m128i *)(src + 8 * stride));
1325
24.8k
  const __m256i src78 = _mm256_setr_m128i(s_64[7], s_64[6]);
1326
24.8k
  ss_256[3] = _mm256_unpacklo_epi8(src67, src78);
1327
24.8k
  return convolve_8tap_avx2(ss_256, coeffs);
1328
24.8k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_8tap_8x2_avx2
convolve_avx2.c:y_convolve_8tap_8x2_avx2
Line
Count
Source
1321
24.8k
                                               __m256i ss_256[4]) {
1322
24.8k
  s_64[7] = _mm_loadl_epi64((__m128i *)(src + 7 * stride));
1323
24.8k
  const __m256i src67 = _mm256_setr_m128i(s_64[6], s_64[7]);
1324
24.8k
  s_64[6] = _mm_loadl_epi64((__m128i *)(src + 8 * stride));
1325
  const __m256i src78 = _mm256_setr_m128i(s_64[7], s_64[6]);
1326
24.8k
  ss_256[3] = _mm256_unpacklo_epi8(src67, src78);
1327
24.8k
  return convolve_8tap_avx2(ss_256, coeffs);
1328
24.8k
}
1329
1330
static inline void y_convolve_8tap_16x2_avx2(const uint8_t *const src,
1331
                                             const ptrdiff_t stride,
1332
                                             const __m256i coeffs[4],
1333
                                             __m128i s_128[8],
1334
19.8k
                                             __m256i ss_256[8], __m256i r[2]) {
1335
19.8k
  s_128[7] = _mm_loadu_si128((__m128i *)(src + 7 * stride));
1336
19.8k
  const __m256i src67 = _mm256_setr_m128i(s_128[6], s_128[7]);
1337
19.8k
  s_128[6] = _mm_loadu_si128((__m128i *)(src + 8 * stride));
1338
19.8k
  const __m256i src78 = _mm256_setr_m128i(s_128[7], s_128[6]);
1339
19.8k
  ss_256[3] = _mm256_unpacklo_epi8(src67, src78);
1340
19.8k
  ss_256[7] = _mm256_unpackhi_epi8(src67, src78);
1341
19.8k
  r[0] = convolve_8tap_avx2(ss_256, coeffs);
1342
19.8k
  r[1] = convolve_8tap_avx2(ss_256 + 4, coeffs);
1343
19.8k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_8tap_16x2_avx2
convolve_avx2.c:y_convolve_8tap_16x2_avx2
Line
Count
Source
1334
19.8k
                                             __m256i ss_256[8], __m256i r[2]) {
1335
19.8k
  s_128[7] = _mm_loadu_si128((__m128i *)(src + 7 * stride));
1336
19.8k
  const __m256i src67 = _mm256_setr_m128i(s_128[6], s_128[7]);
1337
19.8k
  s_128[6] = _mm_loadu_si128((__m128i *)(src + 8 * stride));
1338
  const __m256i src78 = _mm256_setr_m128i(s_128[7], s_128[6]);
1339
19.8k
  ss_256[3] = _mm256_unpacklo_epi8(src67, src78);
1340
19.8k
  ss_256[7] = _mm256_unpackhi_epi8(src67, src78);
1341
19.8k
  r[0] = convolve_8tap_avx2(ss_256, coeffs);
1342
19.8k
  r[1] = convolve_8tap_avx2(ss_256 + 4, coeffs);
1343
19.8k
}
1344
1345
static inline void y_convolve_8tap_32x2_avx2(
1346
    const uint8_t *const src, const ptrdiff_t stride, const __m256i coeffs[4],
1347
44.5k
    __m256i s_256[8], __m256i ss_256[8], __m256i tt_256[8], __m256i r[4]) {
1348
44.5k
  s_256[7] = _mm256_loadu_si256((__m256i *)(src + 7 * stride));
1349
44.5k
  ss_256[3] = _mm256_unpacklo_epi8(s_256[6], s_256[7]);
1350
44.5k
  ss_256[7] = _mm256_unpackhi_epi8(s_256[6], s_256[7]);
1351
44.5k
  s_256[6] = _mm256_loadu_si256((__m256i *)(src + 8 * stride));
1352
44.5k
  tt_256[3] = _mm256_unpacklo_epi8(s_256[7], s_256[6]);
1353
44.5k
  tt_256[7] = _mm256_unpackhi_epi8(s_256[7], s_256[6]);
1354
44.5k
  r[0] = convolve_8tap_avx2(ss_256 + 0, coeffs);
1355
44.5k
  r[1] = convolve_8tap_avx2(ss_256 + 4, coeffs);
1356
44.5k
  r[2] = convolve_8tap_avx2(tt_256 + 0, coeffs);
1357
44.5k
  r[3] = convolve_8tap_avx2(tt_256 + 4, coeffs);
1358
44.5k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_8tap_32x2_avx2
convolve_avx2.c:y_convolve_8tap_32x2_avx2
Line
Count
Source
1347
44.5k
    __m256i s_256[8], __m256i ss_256[8], __m256i tt_256[8], __m256i r[4]) {
1348
44.5k
  s_256[7] = _mm256_loadu_si256((__m256i *)(src + 7 * stride));
1349
44.5k
  ss_256[3] = _mm256_unpacklo_epi8(s_256[6], s_256[7]);
1350
44.5k
  ss_256[7] = _mm256_unpackhi_epi8(s_256[6], s_256[7]);
1351
44.5k
  s_256[6] = _mm256_loadu_si256((__m256i *)(src + 8 * stride));
1352
44.5k
  tt_256[3] = _mm256_unpacklo_epi8(s_256[7], s_256[6]);
1353
44.5k
  tt_256[7] = _mm256_unpackhi_epi8(s_256[7], s_256[6]);
1354
44.5k
  r[0] = convolve_8tap_avx2(ss_256 + 0, coeffs);
1355
44.5k
  r[1] = convolve_8tap_avx2(ss_256 + 4, coeffs);
1356
44.5k
  r[2] = convolve_8tap_avx2(tt_256 + 0, coeffs);
1357
44.5k
  r[3] = convolve_8tap_avx2(tt_256 + 4, coeffs);
1358
44.5k
}
1359
1360
static inline void xy_x_convolve_2tap_32_avx2(const uint8_t *const src,
1361
                                              const __m256i coeffs[1],
1362
289k
                                              __m256i r[2]) {
1363
289k
  const __m256i s0 = _mm256_loadu_si256((__m256i *)src);
1364
289k
  const __m256i s1 = _mm256_loadu_si256((__m256i *)(src + 1));
1365
289k
  const __m256i ss0 = _mm256_unpacklo_epi8(s0, s1);
1366
289k
  const __m256i ss1 = _mm256_unpackhi_epi8(s0, s1);
1367
1368
289k
  r[0] = convolve_2tap_avx2(&ss0, coeffs);
1369
289k
  r[1] = convolve_2tap_avx2(&ss1, coeffs);
1370
289k
}
convolve_2d_avx2.c:xy_x_convolve_2tap_32_avx2
Line
Count
Source
1362
289k
                                              __m256i r[2]) {
1363
289k
  const __m256i s0 = _mm256_loadu_si256((__m256i *)src);
1364
289k
  const __m256i s1 = _mm256_loadu_si256((__m256i *)(src + 1));
1365
289k
  const __m256i ss0 = _mm256_unpacklo_epi8(s0, s1);
1366
289k
  const __m256i ss1 = _mm256_unpackhi_epi8(s0, s1);
1367
1368
289k
  r[0] = convolve_2tap_avx2(&ss0, coeffs);
1369
289k
  r[1] = convolve_2tap_avx2(&ss1, coeffs);
1370
289k
}
Unexecuted instantiation: convolve_avx2.c:xy_x_convolve_2tap_32_avx2
1371
1372
static inline void xy_x_2tap_32_avx2(const uint8_t *const src,
1373
                                     const __m256i coeffs[1],
1374
289k
                                     int16_t *const dst) {
1375
289k
  __m256i r[2];
1376
1377
289k
  xy_x_convolve_2tap_32_avx2(src, coeffs, r);
1378
289k
  const __m256i d0 = xy_x_round_avx2(r[0]);
1379
289k
  const __m256i d1 = xy_x_round_avx2(r[1]);
1380
289k
  _mm256_storeu_si256((__m256i *)dst, d0);
1381
289k
  _mm256_storeu_si256((__m256i *)(dst + 16), d1);
1382
289k
}
convolve_2d_avx2.c:xy_x_2tap_32_avx2
Line
Count
Source
1374
289k
                                     int16_t *const dst) {
1375
289k
  __m256i r[2];
1376
1377
289k
  xy_x_convolve_2tap_32_avx2(src, coeffs, r);
1378
289k
  const __m256i d0 = xy_x_round_avx2(r[0]);
1379
289k
  const __m256i d1 = xy_x_round_avx2(r[1]);
1380
289k
  _mm256_storeu_si256((__m256i *)dst, d0);
1381
289k
  _mm256_storeu_si256((__m256i *)(dst + 16), d1);
1382
289k
}
Unexecuted instantiation: convolve_avx2.c:xy_x_2tap_32_avx2
1383
1384
static inline void xy_x_4tap_32_avx2(const uint8_t *const src,
1385
                                     const __m256i coeffs[2],
1386
                                     const __m256i filt[2],
1387
595k
                                     int16_t *const dst) {
1388
595k
  __m256i r[2];
1389
1390
595k
  x_convolve_4tap_32_avx2(src, coeffs, filt, r);
1391
595k
  const __m256i d0 = xy_x_round_avx2(r[0]);
1392
595k
  const __m256i d1 = xy_x_round_avx2(r[1]);
1393
595k
  _mm256_storeu_si256((__m256i *)dst, d0);
1394
595k
  _mm256_storeu_si256((__m256i *)(dst + 16), d1);
1395
595k
}
convolve_2d_avx2.c:xy_x_4tap_32_avx2
Line
Count
Source
1387
595k
                                     int16_t *const dst) {
1388
595k
  __m256i r[2];
1389
1390
595k
  x_convolve_4tap_32_avx2(src, coeffs, filt, r);
1391
595k
  const __m256i d0 = xy_x_round_avx2(r[0]);
1392
595k
  const __m256i d1 = xy_x_round_avx2(r[1]);
1393
595k
  _mm256_storeu_si256((__m256i *)dst, d0);
1394
595k
  _mm256_storeu_si256((__m256i *)(dst + 16), d1);
1395
595k
}
Unexecuted instantiation: convolve_avx2.c:xy_x_4tap_32_avx2
1396
1397
static inline void xy_x_6tap_32_avx2(const uint8_t *const src,
1398
                                     const __m256i coeffs[3],
1399
                                     const __m256i filt[3],
1400
4.02M
                                     int16_t *const dst) {
1401
4.02M
  __m256i r[2];
1402
1403
4.02M
  x_convolve_6tap_32_avx2(src, coeffs, filt, r);
1404
4.02M
  const __m256i d0 = xy_x_round_avx2(r[0]);
1405
4.02M
  const __m256i d1 = xy_x_round_avx2(r[1]);
1406
4.02M
  _mm256_storeu_si256((__m256i *)dst, d0);
1407
4.02M
  _mm256_storeu_si256((__m256i *)(dst + 16), d1);
1408
4.02M
}
convolve_2d_avx2.c:xy_x_6tap_32_avx2
Line
Count
Source
1400
4.02M
                                     int16_t *const dst) {
1401
4.02M
  __m256i r[2];
1402
1403
4.02M
  x_convolve_6tap_32_avx2(src, coeffs, filt, r);
1404
4.02M
  const __m256i d0 = xy_x_round_avx2(r[0]);
1405
4.02M
  const __m256i d1 = xy_x_round_avx2(r[1]);
1406
4.02M
  _mm256_storeu_si256((__m256i *)dst, d0);
1407
4.02M
  _mm256_storeu_si256((__m256i *)(dst + 16), d1);
1408
4.02M
}
Unexecuted instantiation: convolve_avx2.c:xy_x_6tap_32_avx2
1409
1410
static inline void xy_x_8tap_32_avx2(const uint8_t *const src,
1411
                                     const __m256i coeffs[4],
1412
                                     const __m256i filt[4],
1413
1.53M
                                     int16_t *const dst) {
1414
1.53M
  __m256i r[2];
1415
1416
1.53M
  x_convolve_8tap_32_avx2(src, coeffs, filt, r);
1417
1.53M
  const __m256i d0 = xy_x_round_avx2(r[0]);
1418
1.53M
  const __m256i d1 = xy_x_round_avx2(r[1]);
1419
1.53M
  _mm256_storeu_si256((__m256i *)dst, d0);
1420
1.53M
  _mm256_storeu_si256((__m256i *)(dst + 16), d1);
1421
1.53M
}
convolve_2d_avx2.c:xy_x_8tap_32_avx2
Line
Count
Source
1413
1.53M
                                     int16_t *const dst) {
1414
1.53M
  __m256i r[2];
1415
1416
1.53M
  x_convolve_8tap_32_avx2(src, coeffs, filt, r);
1417
1.53M
  const __m256i d0 = xy_x_round_avx2(r[0]);
1418
1.53M
  const __m256i d1 = xy_x_round_avx2(r[1]);
1419
1.53M
  _mm256_storeu_si256((__m256i *)dst, d0);
1420
1.53M
  _mm256_storeu_si256((__m256i *)(dst + 16), d1);
1421
1.53M
}
Unexecuted instantiation: convolve_avx2.c:xy_x_8tap_32_avx2
1422
1423
static inline __m128i xy_y_convolve_2tap_2x2_sse2(const int16_t *const src,
1424
                                                  __m128i s_32[2],
1425
8.68k
                                                  const __m128i coeffs[1]) {
1426
8.68k
  __m128i s_128[2];
1427
1428
8.68k
  s_32[1] = _mm_cvtsi32_si128(loadu_int32(src + 2));
1429
8.68k
  s_128[0] = _mm_unpacklo_epi32(s_32[0], s_32[1]);
1430
8.68k
  s_32[0] = _mm_cvtsi32_si128(loadu_int32(src + 2 * 2));
1431
8.68k
  s_128[1] = _mm_unpacklo_epi32(s_32[1], s_32[0]);
1432
8.68k
  const __m128i ss = _mm_unpacklo_epi16(s_128[0], s_128[1]);
1433
8.68k
  return convolve16_2tap_sse2(&ss, coeffs);
1434
8.68k
}
convolve_2d_avx2.c:xy_y_convolve_2tap_2x2_sse2
Line
Count
Source
1425
8.68k
                                                  const __m128i coeffs[1]) {
1426
8.68k
  __m128i s_128[2];
1427
1428
8.68k
  s_32[1] = _mm_cvtsi32_si128(loadu_int32(src + 2));
1429
8.68k
  s_128[0] = _mm_unpacklo_epi32(s_32[0], s_32[1]);
1430
8.68k
  s_32[0] = _mm_cvtsi32_si128(loadu_int32(src + 2 * 2));
1431
8.68k
  s_128[1] = _mm_unpacklo_epi32(s_32[1], s_32[0]);
1432
8.68k
  const __m128i ss = _mm_unpacklo_epi16(s_128[0], s_128[1]);
1433
8.68k
  return convolve16_2tap_sse2(&ss, coeffs);
1434
8.68k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_2tap_2x2_sse2
1435
1436
static inline __m128i xy_y_convolve_2tap_2x2_half_pel_sse2(
1437
5.83k
    const int16_t *const src, __m128i s_32[2]) {
1438
5.83k
  __m128i s_128[2];
1439
1440
5.83k
  s_32[1] = _mm_cvtsi32_si128(loadu_int32(src + 2));
1441
5.83k
  s_128[0] = _mm_unpacklo_epi32(s_32[0], s_32[1]);
1442
5.83k
  s_32[0] = _mm_cvtsi32_si128(loadu_int32(src + 2 * 2));
1443
5.83k
  s_128[1] = _mm_unpacklo_epi32(s_32[1], s_32[0]);
1444
5.83k
  return _mm_add_epi16(s_128[0], s_128[1]);
1445
5.83k
}
convolve_2d_avx2.c:xy_y_convolve_2tap_2x2_half_pel_sse2
Line
Count
Source
1437
5.83k
    const int16_t *const src, __m128i s_32[2]) {
1438
5.83k
  __m128i s_128[2];
1439
1440
5.83k
  s_32[1] = _mm_cvtsi32_si128(loadu_int32(src + 2));
1441
5.83k
  s_128[0] = _mm_unpacklo_epi32(s_32[0], s_32[1]);
1442
5.83k
  s_32[0] = _mm_cvtsi32_si128(loadu_int32(src + 2 * 2));
1443
5.83k
  s_128[1] = _mm_unpacklo_epi32(s_32[1], s_32[0]);
1444
5.83k
  return _mm_add_epi16(s_128[0], s_128[1]);
1445
5.83k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_2tap_2x2_half_pel_sse2
1446
1447
static inline void xy_y_convolve_2tap_4x2_sse2(const int16_t *const src,
1448
                                               __m128i s_64[2],
1449
                                               const __m128i coeffs[1],
1450
27.4k
                                               __m128i r[2]) {
1451
27.4k
  __m128i s_128[2];
1452
1453
27.4k
  s_64[1] = _mm_loadl_epi64((__m128i *)(src + 4));
1454
27.4k
  s_128[0] = _mm_unpacklo_epi64(s_64[0], s_64[1]);
1455
27.4k
  s_64[0] = _mm_loadl_epi64((__m128i *)(src + 2 * 4));
1456
27.4k
  s_128[1] = _mm_unpacklo_epi64(s_64[1], s_64[0]);
1457
27.4k
  const __m128i ss0 = _mm_unpacklo_epi16(s_128[0], s_128[1]);
1458
27.4k
  const __m128i ss1 = _mm_unpackhi_epi16(s_128[0], s_128[1]);
1459
27.4k
  r[0] = convolve16_2tap_sse2(&ss0, coeffs);
1460
27.4k
  r[1] = convolve16_2tap_sse2(&ss1, coeffs);
1461
27.4k
}
convolve_2d_avx2.c:xy_y_convolve_2tap_4x2_sse2
Line
Count
Source
1450
27.4k
                                               __m128i r[2]) {
1451
27.4k
  __m128i s_128[2];
1452
1453
27.4k
  s_64[1] = _mm_loadl_epi64((__m128i *)(src + 4));
1454
27.4k
  s_128[0] = _mm_unpacklo_epi64(s_64[0], s_64[1]);
1455
27.4k
  s_64[0] = _mm_loadl_epi64((__m128i *)(src + 2 * 4));
1456
27.4k
  s_128[1] = _mm_unpacklo_epi64(s_64[1], s_64[0]);
1457
27.4k
  const __m128i ss0 = _mm_unpacklo_epi16(s_128[0], s_128[1]);
1458
27.4k
  const __m128i ss1 = _mm_unpackhi_epi16(s_128[0], s_128[1]);
1459
27.4k
  r[0] = convolve16_2tap_sse2(&ss0, coeffs);
1460
27.4k
  r[1] = convolve16_2tap_sse2(&ss1, coeffs);
1461
27.4k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_2tap_4x2_sse2
1462
1463
static inline __m128i xy_y_convolve_2tap_4x2_half_pel_sse2(
1464
25.3k
    const int16_t *const src, __m128i s_64[2]) {
1465
25.3k
  __m128i s_128[2];
1466
1467
25.3k
  s_64[1] = _mm_loadl_epi64((__m128i *)(src + 4));
1468
25.3k
  s_128[0] = _mm_unpacklo_epi64(s_64[0], s_64[1]);
1469
25.3k
  s_64[0] = _mm_loadl_epi64((__m128i *)(src + 2 * 4));
1470
25.3k
  s_128[1] = _mm_unpacklo_epi64(s_64[1], s_64[0]);
1471
25.3k
  return _mm_add_epi16(s_128[0], s_128[1]);
1472
25.3k
}
convolve_2d_avx2.c:xy_y_convolve_2tap_4x2_half_pel_sse2
Line
Count
Source
1464
25.3k
    const int16_t *const src, __m128i s_64[2]) {
1465
25.3k
  __m128i s_128[2];
1466
1467
25.3k
  s_64[1] = _mm_loadl_epi64((__m128i *)(src + 4));
1468
25.3k
  s_128[0] = _mm_unpacklo_epi64(s_64[0], s_64[1]);
1469
25.3k
  s_64[0] = _mm_loadl_epi64((__m128i *)(src + 2 * 4));
1470
25.3k
  s_128[1] = _mm_unpacklo_epi64(s_64[1], s_64[0]);
1471
25.3k
  return _mm_add_epi16(s_128[0], s_128[1]);
1472
25.3k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_2tap_4x2_half_pel_sse2
1473
1474
// Interleaves the int16 lanes of s0 and s1 and feeds each interleaved vector
// to convolve16_2tap_avx2() together with coeffs.
//   r[0] : result for the unpacklo interleave (low half of each 128-bit lane).
//   r[1] : result for the unpackhi interleave (high half of each 128-bit lane).
static inline void xy_y_convolve_2tap_16_avx2(const __m256i s0,
                                              const __m256i s1,
                                              const __m256i coeffs[1],
                                              __m256i r[2]) {
  __m256i pairs[2];

  pairs[0] = _mm256_unpacklo_epi16(s0, s1);
  pairs[1] = _mm256_unpackhi_epi16(s0, s1);

  r[0] = convolve16_2tap_avx2(&pairs[0], coeffs);
  r[1] = convolve16_2tap_avx2(&pairs[1], coeffs);
}
convolve_2d_avx2.c:xy_y_convolve_2tap_16_avx2
Line
Count
Source
1477
585k
                                              __m256i r[2]) {
1478
585k
  const __m256i ss0 = _mm256_unpacklo_epi16(s0, s1);
1479
585k
  const __m256i ss1 = _mm256_unpackhi_epi16(s0, s1);
1480
585k
  r[0] = convolve16_2tap_avx2(&ss0, coeffs);
1481
585k
  r[1] = convolve16_2tap_avx2(&ss1, coeffs);
1482
585k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_2tap_16_avx2
1483
1484
static inline void xy_y_convolve_2tap_8x2_avx2(const int16_t *const src,
1485
                                               __m128i s_128[2],
1486
                                               const __m256i coeffs[1],
1487
22.7k
                                               __m256i r[2]) {
1488
22.7k
  __m256i s_256[2];
1489
22.7k
  s_128[1] = _mm_loadu_si128((__m128i *)(src + 8));
1490
22.7k
  s_256[0] = _mm256_setr_m128i(s_128[0], s_128[1]);
1491
22.7k
  s_128[0] = _mm_loadu_si128((__m128i *)(src + 2 * 8));
1492
22.7k
  s_256[1] = _mm256_setr_m128i(s_128[1], s_128[0]);
1493
22.7k
  xy_y_convolve_2tap_16_avx2(s_256[0], s_256[1], coeffs, r);
1494
22.7k
}
convolve_2d_avx2.c:xy_y_convolve_2tap_8x2_avx2
Line
Count
Source
1487
22.7k
                                               __m256i r[2]) {
1488
22.7k
  __m256i s_256[2];
1489
22.7k
  s_128[1] = _mm_loadu_si128((__m128i *)(src + 8));
1490
22.7k
  s_256[0] = _mm256_setr_m128i(s_128[0], s_128[1]);
1491
22.7k
  s_128[0] = _mm_loadu_si128((__m128i *)(src + 2 * 8));
1492
  s_256[1] = _mm256_setr_m128i(s_128[1], s_128[0]);
1493
22.7k
  xy_y_convolve_2tap_16_avx2(s_256[0], s_256[1], coeffs, r);
1494
22.7k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_2tap_8x2_avx2
1495
1496
// "half_pel" counterpart of the 8-wide 2-tap step: the two 256-bit vectors
// {first, [src + 8]} and {[src + 8], [src + 16]} are added lane by lane
// instead of being passed through a coefficient multiply.
//   s_128 : on entry s_128[0] supplies the first operand. On exit s_128[1]
//           holds the group read at src + 8, s_128[0] the one at src + 16.
static inline __m256i xy_y_convolve_2tap_8x2_half_pel_avx2(
    const int16_t *const src, __m128i s_128[2]) {
  const __m128i v0 = s_128[0];

  const __m128i v1 = _mm_loadu_si128((const __m128i *)(src + 8));
  s_128[1] = v1;
  const __m256i upper = _mm256_setr_m128i(v0, v1);

  const __m128i v2 = _mm_loadu_si128((const __m128i *)(src + 16));
  s_128[0] = v2;
  const __m256i lower = _mm256_setr_m128i(v1, v2);

  return _mm256_add_epi16(upper, lower);
}
convolve_2d_avx2.c:xy_y_convolve_2tap_8x2_half_pel_avx2
Line
Count
Source
1497
24.9k
    const int16_t *const src, __m128i s_128[2]) {
1498
24.9k
  __m256i s_256[2];
1499
24.9k
  s_128[1] = _mm_loadu_si128((__m128i *)(src + 8));
1500
24.9k
  s_256[0] = _mm256_setr_m128i(s_128[0], s_128[1]);
1501
24.9k
  s_128[0] = _mm_loadu_si128((__m128i *)(src + 2 * 8));
1502
  s_256[1] = _mm256_setr_m128i(s_128[1], s_128[0]);
1503
24.9k
  return _mm256_add_epi16(s_256[0], s_256[1]);
1504
24.9k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_2tap_8x2_half_pel_avx2
1505
1506
// "half_pel" 2-tap step on 16-wide int16 data, two outputs per call.
//   src   : 256-bit vectors are read at src + 16 and src + 32.
//   s_256 : on entry s_256[0] supplies the first operand. On exit s_256[1]
//           holds the vector read at src + 16, s_256[0] the one at src + 32.
//   r[0]  : first operand + [src + 16];  r[1] : [src + 16] + [src + 32].
static inline void xy_y_convolve_2tap_16x2_half_pel_avx2(
    const int16_t *const src, __m256i s_256[2], __m256i r[2]) {
  const __m256i v0 = s_256[0];

  const __m256i v1 = _mm256_loadu_si256((const __m256i *)(src + 16));
  s_256[1] = v1;
  r[0] = _mm256_add_epi16(v0, v1);

  const __m256i v2 = _mm256_loadu_si256((const __m256i *)(src + 32));
  s_256[0] = v2;
  r[1] = _mm256_add_epi16(v1, v2);
}
convolve_2d_avx2.c:xy_y_convolve_2tap_16x2_half_pel_avx2
Line
Count
Source
1507
15.1k
    const int16_t *const src, __m256i s_256[2], __m256i r[2]) {
1508
15.1k
  s_256[1] = _mm256_loadu_si256((__m256i *)(src + 16));
1509
15.1k
  r[0] = _mm256_add_epi16(s_256[0], s_256[1]);
1510
15.1k
  s_256[0] = _mm256_loadu_si256((__m256i *)(src + 2 * 16));
1511
15.1k
  r[1] = _mm256_add_epi16(s_256[1], s_256[0]);
1512
15.1k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_2tap_16x2_half_pel_avx2
1513
1514
// Packs the int16 lanes of r[0] and r[1] to unsigned 8-bit with saturation,
// reorders the four 64-bit quarters of the packed vector, and hands the
// result to storeu_u8_16x2_avx2() with dst and stride.
static inline void xy_y_store_16x2_avx2(const __m256i r[2], uint8_t *const dst,
                                        const ptrdiff_t stride) {
  const __m256i packed = _mm256_packus_epi16(r[0], r[1]);
  // 0xD8 selects quarters 0, 2, 1, 3: packus works per 128-bit lane, so this
  // regroups the bytes coming from r[0] and those coming from r[1].
  const __m256i ordered = _mm256_permute4x64_epi64(packed, 0xD8);
  storeu_u8_16x2_avx2(ordered, dst, stride);
}
Unexecuted instantiation: convolve_2d_avx2.c:xy_y_store_16x2_avx2
Unexecuted instantiation: convolve_avx2.c:xy_y_store_16x2_avx2
1520
1521
static inline void xy_y_convolve_2tap_16x2_avx2(const int16_t *const src,
1522
                                                __m256i s[2],
1523
                                                const __m256i coeffs[1],
1524
18.6k
                                                __m256i r[4]) {
1525
18.6k
  s[1] = _mm256_loadu_si256((__m256i *)(src + 16));
1526
18.6k
  xy_y_convolve_2tap_16_avx2(s[0], s[1], coeffs, r + 0);
1527
18.6k
  s[0] = _mm256_loadu_si256((__m256i *)(src + 2 * 16));
1528
18.6k
  xy_y_convolve_2tap_16_avx2(s[1], s[0], coeffs, r + 2);
1529
18.6k
}
convolve_2d_avx2.c:xy_y_convolve_2tap_16x2_avx2
Line
Count
Source
1524
18.6k
                                                __m256i r[4]) {
1525
18.6k
  s[1] = _mm256_loadu_si256((__m256i *)(src + 16));
1526
18.6k
  xy_y_convolve_2tap_16_avx2(s[0], s[1], coeffs, r + 0);
1527
18.6k
  s[0] = _mm256_loadu_si256((__m256i *)(src + 2 * 16));
1528
18.6k
  xy_y_convolve_2tap_16_avx2(s[1], s[0], coeffs, r + 2);
1529
18.6k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_2tap_16x2_avx2
1530
1531
static inline void xy_y_convolve_2tap_32_avx2(const int16_t *const src,
1532
                                              const __m256i s0[2],
1533
                                              __m256i s1[2],
1534
                                              const __m256i coeffs[1],
1535
158k
                                              __m256i r[4]) {
1536
158k
  s1[0] = _mm256_loadu_si256((__m256i *)src);
1537
158k
  s1[1] = _mm256_loadu_si256((__m256i *)(src + 16));
1538
158k
  xy_y_convolve_2tap_16_avx2(s0[0], s1[0], coeffs, r + 0);
1539
158k
  xy_y_convolve_2tap_16_avx2(s0[1], s1[1], coeffs, r + 2);
1540
158k
}
convolve_2d_avx2.c:xy_y_convolve_2tap_32_avx2
Line
Count
Source
1535
158k
                                              __m256i r[4]) {
1536
158k
  s1[0] = _mm256_loadu_si256((__m256i *)src);
1537
158k
  s1[1] = _mm256_loadu_si256((__m256i *)(src + 16));
1538
158k
  xy_y_convolve_2tap_16_avx2(s0[0], s1[0], coeffs, r + 0);
1539
158k
  xy_y_convolve_2tap_16_avx2(s0[1], s1[1], coeffs, r + 2);
1540
158k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_2tap_32_avx2
1541
1542
static inline void xy_y_convolve_2tap_32_all_avx2(const int16_t *const src,
1543
                                                  const __m256i s0[2],
1544
                                                  __m256i s1[2],
1545
                                                  const __m256i coeffs[1],
1546
158k
                                                  uint8_t *const dst) {
1547
158k
  __m256i r[4];
1548
1549
158k
  xy_y_convolve_2tap_32_avx2(src, s0, s1, coeffs, r);
1550
158k
  xy_y_round_store_32_avx2(r + 0, r + 2, dst);
1551
158k
}
convolve_2d_avx2.c:xy_y_convolve_2tap_32_all_avx2
Line
Count
Source
1546
158k
                                                  uint8_t *const dst) {
1547
158k
  __m256i r[4];
1548
1549
158k
  xy_y_convolve_2tap_32_avx2(src, s0, s1, coeffs, r);
1550
158k
  xy_y_round_store_32_avx2(r + 0, r + 2, dst);
1551
158k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_2tap_32_all_avx2
1552
1553
static inline void xy_y_convolve_2tap_half_pel_32_avx2(const int16_t *const src,
1554
                                                       const __m256i s0[2],
1555
                                                       __m256i s1[2],
1556
118k
                                                       __m256i r[2]) {
1557
118k
  s1[0] = _mm256_loadu_si256((__m256i *)src);
1558
118k
  s1[1] = _mm256_loadu_si256((__m256i *)(src + 16));
1559
118k
  r[0] = _mm256_add_epi16(s0[0], s1[0]);
1560
118k
  r[1] = _mm256_add_epi16(s0[1], s1[1]);
1561
118k
}
convolve_2d_avx2.c:xy_y_convolve_2tap_half_pel_32_avx2
Line
Count
Source
1556
118k
                                                       __m256i r[2]) {
1557
118k
  s1[0] = _mm256_loadu_si256((__m256i *)src);
1558
118k
  s1[1] = _mm256_loadu_si256((__m256i *)(src + 16));
1559
118k
  r[0] = _mm256_add_epi16(s0[0], s1[0]);
1560
118k
  r[1] = _mm256_add_epi16(s0[1], s1[1]);
1561
118k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_2tap_half_pel_32_avx2
1562
1563
// Computes the two half-pel sums with xy_y_convolve_2tap_half_pel_32_avx2(),
// passes each through xy_y_round_half_pel_avx2(), and gives the rounded pair
// to xy_y_pack_store_32_avx2() with dst. s1 is updated by the first callee.
static inline void xy_y_convolve_2tap_half_pel_32_all_avx2(
    const int16_t *const src, const __m256i s0[2], __m256i s1[2],
    uint8_t *const dst) {
  __m256i sums[2];

  xy_y_convolve_2tap_half_pel_32_avx2(src, s0, s1, sums);

  const __m256i rounded0 = xy_y_round_half_pel_avx2(sums[0]);
  const __m256i rounded1 = xy_y_round_half_pel_avx2(sums[1]);
  xy_y_pack_store_32_avx2(rounded0, rounded1, dst);
}
convolve_2d_avx2.c:xy_y_convolve_2tap_half_pel_32_all_avx2
Line
Count
Source
1565
118k
    uint8_t *const dst) {
1566
118k
  __m256i r[2];
1567
1568
118k
  xy_y_convolve_2tap_half_pel_32_avx2(src, s0, s1, r);
1569
118k
  r[0] = xy_y_round_half_pel_avx2(r[0]);
1570
118k
  r[1] = xy_y_round_half_pel_avx2(r[1]);
1571
118k
  xy_y_pack_store_32_avx2(r[0], r[1], dst);
1572
118k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_2tap_half_pel_32_all_avx2
1573
1574
// 4-tap step on 2-wide int16 data, two outputs per call.
//   src    : 32-bit groups (two int16 each) are read at src + 6 and src + 8.
//   s_32   : on entry s_32[2] supplies the operand preceding src + 6. On exit
//            s_32[3] holds the group from src + 6, s_32[2] the one from
//            src + 8.
//   ss_128 : ss_128[0] is consumed as supplied; ss_128[1] is rebuilt here.
//            After the filter call ss_128[1] is copied into ss_128[0].
// Returns the value of convolve16_4tap_sse2(ss_128, coeffs).
static inline __m128i xy_y_convolve_4tap_2x2_sse2(const int16_t *const src,
                                                  __m128i s_32[4],
                                                  __m128i ss_128[2],
                                                  const __m128i coeffs[2]) {
  const __m128i v2 = s_32[2];

  const __m128i v3 = _mm_cvtsi32_si128(loadu_int32(src + 6));
  s_32[3] = v3;
  const __m128i pair23 = _mm_unpacklo_epi32(v2, v3);

  const __m128i v4 = _mm_cvtsi32_si128(loadu_int32(src + 8));
  s_32[2] = v4;
  const __m128i pair34 = _mm_unpacklo_epi32(v3, v4);

  const __m128i newest = _mm_unpacklo_epi16(pair23, pair34);
  ss_128[1] = newest;
  const __m128i result = convolve16_4tap_sse2(ss_128, coeffs);

  // Slide the window: the newest interleave becomes the oldest next time.
  ss_128[0] = newest;
  return result;
}
convolve_2d_avx2.c:xy_y_convolve_4tap_2x2_sse2
Line
Count
Source
1577
169k
                                                  const __m128i coeffs[2]) {
1578
169k
  s_32[3] = _mm_cvtsi32_si128(loadu_int32(src + 3 * 2));
1579
169k
  const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
1580
169k
  s_32[2] = _mm_cvtsi32_si128(loadu_int32(src + 4 * 2));
1581
169k
  const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[2]);
1582
169k
  ss_128[1] = _mm_unpacklo_epi16(src23, src34);
1583
169k
  const __m128i r = convolve16_4tap_sse2(ss_128, coeffs);
1584
169k
  ss_128[0] = ss_128[1];
1585
169k
  return r;
1586
169k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_4tap_2x2_sse2
1587
1588
static inline __m256i xy_y_convolve_4tap_4x2_avx2(const int16_t *const src,
1589
                                                  __m128i s_64[4],
1590
                                                  __m256i ss_256[2],
1591
804k
                                                  const __m256i coeffs[2]) {
1592
804k
  __m256i s_256[2];
1593
804k
  s_64[3] = _mm_loadl_epi64((__m128i *)(src + 3 * 4));
1594
804k
  s_256[0] = _mm256_setr_m128i(s_64[2], s_64[3]);
1595
804k
  s_64[2] = _mm_loadl_epi64((__m128i *)(src + 4 * 4));
1596
804k
  s_256[1] = _mm256_setr_m128i(s_64[3], s_64[2]);
1597
804k
  ss_256[1] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1598
804k
  const __m256i r = convolve16_4tap_avx2(ss_256, coeffs);
1599
804k
  ss_256[0] = ss_256[1];
1600
804k
  return r;
1601
804k
}
convolve_2d_avx2.c:xy_y_convolve_4tap_4x2_avx2
Line
Count
Source
1591
804k
                                                  const __m256i coeffs[2]) {
1592
804k
  __m256i s_256[2];
1593
804k
  s_64[3] = _mm_loadl_epi64((__m128i *)(src + 3 * 4));
1594
804k
  s_256[0] = _mm256_setr_m128i(s_64[2], s_64[3]);
1595
804k
  s_64[2] = _mm_loadl_epi64((__m128i *)(src + 4 * 4));
1596
  s_256[1] = _mm256_setr_m128i(s_64[3], s_64[2]);
1597
804k
  ss_256[1] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1598
804k
  const __m256i r = convolve16_4tap_avx2(ss_256, coeffs);
1599
804k
  ss_256[0] = ss_256[1];
1600
804k
  return r;
1601
804k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_4tap_4x2_avx2
1602
1603
// Applies convolve16_4tap_avx2() twice with the same coeffs: once starting at
// ss[0] (result in r[0]) and once starting at ss[2] (result in r[1]).
static inline void xy_y_convolve_4tap_16_avx2(const __m256i *const ss,
                                              const __m256i coeffs[2],
                                              __m256i r[2]) {
  for (int half = 0; half < 2; ++half) {
    r[half] = convolve16_4tap_avx2(&ss[2 * half], coeffs);
  }
}
convolve_2d_avx2.c:xy_y_convolve_4tap_16_avx2
Line
Count
Source
1605
2.87M
                                              __m256i r[2]) {
1606
2.87M
  r[0] = convolve16_4tap_avx2(ss, coeffs);
1607
2.87M
  r[1] = convolve16_4tap_avx2(ss + 2, coeffs);
1608
2.87M
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_4tap_16_avx2
1609
1610
static inline void xy_y_convolve_4tap_8x2_avx2(const int16_t *const src,
1611
                                               __m256i ss_256[4],
1612
                                               const __m256i coeffs[2],
1613
533k
                                               __m256i r[2]) {
1614
533k
  __m256i s_256[2];
1615
533k
  s_256[0] = _mm256_loadu_si256((__m256i *)(src + 2 * 8));
1616
533k
  s_256[1] = _mm256_loadu_si256((__m256i *)(src + 3 * 8));
1617
533k
  ss_256[1] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1618
533k
  ss_256[3] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
1619
533k
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r);
1620
533k
  ss_256[0] = ss_256[1];
1621
533k
  ss_256[2] = ss_256[3];
1622
533k
}
convolve_2d_avx2.c:xy_y_convolve_4tap_8x2_avx2
Line
Count
Source
1613
533k
                                               __m256i r[2]) {
1614
533k
  __m256i s_256[2];
1615
533k
  s_256[0] = _mm256_loadu_si256((__m256i *)(src + 2 * 8));
1616
533k
  s_256[1] = _mm256_loadu_si256((__m256i *)(src + 3 * 8));
1617
533k
  ss_256[1] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1618
533k
  ss_256[3] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
1619
533k
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r);
1620
533k
  ss_256[0] = ss_256[1];
1621
533k
  ss_256[2] = ss_256[3];
1622
533k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_4tap_8x2_avx2
1623
1624
// "half_pel" 4-tap step on 8-wide int16 data: the outer pair (s_256[0],
// s_256[3]) and the inner pair (s_256[1], s_256[2]) are each summed, and the
// two sums are filtered with the 2-tap kernel via
// xy_y_convolve_2tap_16_avx2().
//   src   : 256-bit vectors are read at src + 16 and src + 24 into s_256[2]
//           and s_256[3].
//   s_256 : s_256[0..1] are consumed as supplied; on exit they hold the two
//           vectors just loaded.
static inline void xy_y_convolve_4tap_8x2_half_pel_avx2(
    const int16_t *const src, const __m256i coeffs[1], __m256i s_256[4],
    __m256i r[2]) {
  const __m256i v0 = s_256[0];
  const __m256i v1 = s_256[1];

  const __m256i v2 = _mm256_loadu_si256((const __m256i *)(src + 16));
  s_256[2] = v2;
  const __m256i v3 = _mm256_loadu_si256((const __m256i *)(src + 24));
  s_256[3] = v3;

  const __m256i outer = _mm256_add_epi16(v0, v3);
  const __m256i inner = _mm256_add_epi16(v1, v2);
  xy_y_convolve_2tap_16_avx2(outer, inner, coeffs, r);

  // Slide the window by two slots for the next call.
  s_256[0] = v2;
  s_256[1] = v3;
}
convolve_2d_avx2.c:xy_y_convolve_4tap_8x2_half_pel_avx2
Line
Count
Source
1626
97.6k
    __m256i r[2]) {
1627
97.6k
  __m256i a_256[2];
1628
97.6k
  s_256[2] = _mm256_loadu_si256((__m256i *)(src + 2 * 8));
1629
97.6k
  s_256[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 8));
1630
97.6k
  a_256[0] = _mm256_add_epi16(s_256[0], s_256[3]);
1631
97.6k
  a_256[1] = _mm256_add_epi16(s_256[1], s_256[2]);
1632
97.6k
  xy_y_convolve_2tap_16_avx2(a_256[0], a_256[1], coeffs, r);
1633
97.6k
  s_256[0] = s_256[2];
1634
97.6k
  s_256[1] = s_256[3];
1635
97.6k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_4tap_8x2_half_pel_avx2
1636
1637
// 4-tap step on 16-wide int16 data, two outputs per call.
//   src    : 256-bit vectors are read at src + 48 and src + 64.
//   s_256  : on entry s_256[2] supplies the operand preceding src + 48. On
//            exit s_256[3] holds the vector from src + 48, s_256[2] the one
//            from src + 64.
//   ss_256 : interleave window for the first output (slots 1 and 3 rebuilt).
//   tt_256 : interleave window for the second output (slots 1 and 3 rebuilt).
//   r      : r[0..1] from ss_256, r[2..3] from tt_256, both via
//            xy_y_convolve_4tap_16_avx2().
// After filtering, slots 1/3 of each window are copied into slots 0/2.
static inline void xy_y_convolve_4tap_16x2_avx2(
    const int16_t *const src, __m256i s_256[4], __m256i ss_256[4],
    __m256i tt_256[4], const __m256i coeffs[2], __m256i r[4]) {
  const __m256i v2 = s_256[2];

  const __m256i v3 = _mm256_loadu_si256((const __m256i *)(src + 48));
  s_256[3] = v3;
  ss_256[1] = _mm256_unpacklo_epi16(v2, v3);
  ss_256[3] = _mm256_unpackhi_epi16(v2, v3);

  const __m256i v4 = _mm256_loadu_si256((const __m256i *)(src + 64));
  s_256[2] = v4;
  tt_256[1] = _mm256_unpacklo_epi16(v3, v4);
  tt_256[3] = _mm256_unpackhi_epi16(v3, v4);

  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, &r[0]);
  xy_y_convolve_4tap_16_avx2(tt_256, coeffs, &r[2]);

  // Slide both windows by one slot for the next call.
  ss_256[0] = ss_256[1];
  ss_256[2] = ss_256[3];
  tt_256[0] = tt_256[1];
  tt_256[2] = tt_256[3];
}
convolve_2d_avx2.c:xy_y_convolve_4tap_16x2_avx2
Line
Count
Source
1639
277k
    __m256i tt_256[4], const __m256i coeffs[2], __m256i r[4]) {
1640
277k
  s_256[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 16));
1641
277k
  ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
1642
277k
  ss_256[3] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
1643
277k
  s_256[2] = _mm256_loadu_si256((__m256i *)(src + 4 * 16));
1644
277k
  tt_256[1] = _mm256_unpacklo_epi16(s_256[3], s_256[2]);
1645
277k
  tt_256[3] = _mm256_unpackhi_epi16(s_256[3], s_256[2]);
1646
277k
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 0);
1647
277k
  xy_y_convolve_4tap_16_avx2(tt_256, coeffs, r + 2);
1648
277k
  ss_256[0] = ss_256[1];
1649
277k
  ss_256[2] = ss_256[3];
1650
277k
  tt_256[0] = tt_256[1];
1651
277k
  tt_256[2] = tt_256[3];
1652
277k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_4tap_16x2_avx2
1653
1654
// Same data flow as the 16-wide 4-tap two-output step, but the two new
// 256-bit vectors are read at src + 3 * stride and src + 4 * stride.
//   s_256  : on entry s_256[2] supplies the operand preceding the first load.
//            On exit s_256[3] holds the first loaded vector, s_256[2] the
//            second.
//   ss_256 : interleave window for the first output (slots 1 and 3 rebuilt).
//   tt_256 : interleave window for the second output (slots 1 and 3 rebuilt).
//   r      : r[0..1] from ss_256, r[2..3] from tt_256, both via
//            xy_y_convolve_4tap_16_avx2().
// After filtering, slots 1/3 of each window are copied into slots 0/2.
static inline void xy_y_convolve_4tap_32x2_avx2(
    const int16_t *const src, const ptrdiff_t stride, __m256i s_256[4],
    __m256i ss_256[4], __m256i tt_256[4], const __m256i coeffs[2],
    __m256i r[4]) {
  const __m256i v2 = s_256[2];

  const __m256i v3 = _mm256_loadu_si256((const __m256i *)(src + 3 * stride));
  s_256[3] = v3;
  ss_256[1] = _mm256_unpacklo_epi16(v2, v3);
  ss_256[3] = _mm256_unpackhi_epi16(v2, v3);

  const __m256i v4 = _mm256_loadu_si256((const __m256i *)(src + 4 * stride));
  s_256[2] = v4;
  tt_256[1] = _mm256_unpacklo_epi16(v3, v4);
  tt_256[3] = _mm256_unpackhi_epi16(v3, v4);

  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, &r[0]);
  xy_y_convolve_4tap_16_avx2(tt_256, coeffs, &r[2]);

  // Slide both windows by one slot for the next call.
  ss_256[0] = ss_256[1];
  ss_256[2] = ss_256[3];
  tt_256[0] = tt_256[1];
  tt_256[2] = tt_256[3];
}
convolve_2d_avx2.c:xy_y_convolve_4tap_32x2_avx2
Line
Count
Source
1657
390k
    __m256i r[4]) {
1658
390k
  s_256[3] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
1659
390k
  ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
1660
390k
  ss_256[3] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
1661
390k
  s_256[2] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
1662
390k
  tt_256[1] = _mm256_unpacklo_epi16(s_256[3], s_256[2]);
1663
390k
  tt_256[3] = _mm256_unpackhi_epi16(s_256[3], s_256[2]);
1664
390k
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 0);
1665
390k
  xy_y_convolve_4tap_16_avx2(tt_256, coeffs, r + 2);
1666
390k
  ss_256[0] = ss_256[1];
1667
390k
  ss_256[2] = ss_256[3];
1668
390k
  tt_256[0] = tt_256[1];
1669
390k
  tt_256[2] = tt_256[3];
1670
390k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_4tap_32x2_avx2
1671
1672
// "half_pel" 4-tap step on 16-wide int16 data, two outputs per call. For each
// output the outer and inner pairs of a four-vector window are summed and the
// two sums are filtered with the 2-tap kernel.
//   src   : 256-bit vectors are read at src + 48 and src + 64 into s_256[3]
//           and s_256[4].
//   s_256 : s_256[0..2] are consumed as supplied; on exit they hold the
//           former s_256[2] and the two vectors just loaded.
//   r     : r[0..1] from window s_256[0..3], r[2..3] from window s_256[1..4].
static inline void xy_y_convolve_4tap_16x2_half_pelavx2(
    const int16_t *const src, __m256i s_256[5], const __m256i coeffs[1],
    __m256i r[4]) {
  const __m256i v0 = s_256[0];
  const __m256i v1 = s_256[1];
  const __m256i v2 = s_256[2];

  const __m256i v3 = _mm256_loadu_si256((const __m256i *)(src + 48));
  s_256[3] = v3;
  const __m256i v4 = _mm256_loadu_si256((const __m256i *)(src + 64));
  s_256[4] = v4;

  // First output: window v0..v3.
  xy_y_convolve_2tap_16_avx2(_mm256_add_epi16(v0, v3),
                             _mm256_add_epi16(v1, v2), coeffs, &r[0]);

  // Second output: window v1..v4.
  xy_y_convolve_2tap_16_avx2(_mm256_add_epi16(v1, v4),
                             _mm256_add_epi16(v2, v3), coeffs, &r[2]);

  // Slide the window by two slots for the next call.
  s_256[0] = v2;
  s_256[1] = v3;
  s_256[2] = v4;
}
convolve_2d_avx2.c:xy_y_convolve_4tap_16x2_half_pelavx2
Line
Count
Source
1674
55.3k
    __m256i r[4]) {
1675
55.3k
  __m256i a_256[2];
1676
1677
55.3k
  s_256[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 16));
1678
55.3k
  s_256[4] = _mm256_loadu_si256((__m256i *)(src + 4 * 16));
1679
1680
55.3k
  a_256[0] = _mm256_add_epi16(s_256[0], s_256[3]);
1681
55.3k
  a_256[1] = _mm256_add_epi16(s_256[1], s_256[2]);
1682
55.3k
  xy_y_convolve_2tap_16_avx2(a_256[0], a_256[1], coeffs, r + 0);
1683
1684
55.3k
  a_256[0] = _mm256_add_epi16(s_256[1], s_256[4]);
1685
55.3k
  a_256[1] = _mm256_add_epi16(s_256[2], s_256[3]);
1686
55.3k
  xy_y_convolve_2tap_16_avx2(a_256[0], a_256[1], coeffs, r + 2);
1687
1688
55.3k
  s_256[0] = s_256[2];
1689
55.3k
  s_256[1] = s_256[3];
1690
55.3k
  s_256[2] = s_256[4];
1691
55.3k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_4tap_16x2_half_pelavx2
1692
1693
// 6-tap step on 2-wide int16 data, two outputs per call.
//   src    : 32-bit groups (two int16 each) are read at src + 10 and
//            src + 12.
//   s_32   : on entry s_32[4] supplies the operand preceding src + 10. On
//            exit s_32[5] holds the group from src + 10, s_32[4] the one from
//            src + 12.
//   ss_128 : ss_128[0..1] are consumed as supplied; ss_128[2] is rebuilt
//            here. After the filter call the window slides down by one slot.
// Returns the value of convolve16_6tap_sse2(ss_128, coeffs).
static inline __m128i xy_y_convolve_6tap_2x2_sse2(const int16_t *const src,
                                                  __m128i s_32[6],
                                                  __m128i ss_128[3],
                                                  const __m128i coeffs[3]) {
  const __m128i v4 = s_32[4];

  const __m128i v5 = _mm_cvtsi32_si128(loadu_int32(src + 10));
  s_32[5] = v5;
  const __m128i pair45 = _mm_unpacklo_epi32(v4, v5);

  const __m128i v6 = _mm_cvtsi32_si128(loadu_int32(src + 12));
  s_32[4] = v6;
  const __m128i pair56 = _mm_unpacklo_epi32(v5, v6);

  ss_128[2] = _mm_unpacklo_epi16(pair45, pair56);
  const __m128i result = convolve16_6tap_sse2(ss_128, coeffs);

  // Slide the window by one slot for the next call.
  ss_128[0] = ss_128[1];
  ss_128[1] = ss_128[2];
  return result;
}
convolve_2d_avx2.c:xy_y_convolve_6tap_2x2_sse2
Line
Count
Source
1696
231k
                                                  const __m128i coeffs[3]) {
1697
231k
  s_32[5] = _mm_cvtsi32_si128(loadu_int32(src + 5 * 2));
1698
231k
  const __m128i src45 = _mm_unpacklo_epi32(s_32[4], s_32[5]);
1699
231k
  s_32[4] = _mm_cvtsi32_si128(loadu_int32(src + 6 * 2));
1700
231k
  const __m128i src56 = _mm_unpacklo_epi32(s_32[5], s_32[4]);
1701
231k
  ss_128[2] = _mm_unpacklo_epi16(src45, src56);
1702
231k
  const __m128i r = convolve16_6tap_sse2(ss_128, coeffs);
1703
231k
  ss_128[0] = ss_128[1];
1704
231k
  ss_128[1] = ss_128[2];
1705
231k
  return r;
1706
231k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_6tap_2x2_sse2
1707
1708
static inline __m256i xy_y_convolve_6tap_4x2_avx2(const int16_t *const src,
1709
                                                  __m128i s_64[6],
1710
                                                  __m256i ss_256[3],
1711
1.08M
                                                  const __m256i coeffs[3]) {
1712
1.08M
  __m256i s_256[2];
1713
1.08M
  s_64[5] = _mm_loadl_epi64((__m128i *)(src + 5 * 4));
1714
1.08M
  s_256[0] = _mm256_setr_m128i(s_64[4], s_64[5]);
1715
1.08M
  s_64[4] = _mm_loadl_epi64((__m128i *)(src + 6 * 4));
1716
1.08M
  s_256[1] = _mm256_setr_m128i(s_64[5], s_64[4]);
1717
1.08M
  ss_256[2] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1718
1.08M
  const __m256i r = convolve16_6tap_avx2(ss_256, coeffs);
1719
1.08M
  ss_256[0] = ss_256[1];
1720
1.08M
  ss_256[1] = ss_256[2];
1721
1.08M
  return r;
1722
1.08M
}
convolve_2d_avx2.c:xy_y_convolve_6tap_4x2_avx2
Line
Count
Source
1711
1.08M
                                                  const __m256i coeffs[3]) {
1712
1.08M
  __m256i s_256[2];
1713
1.08M
  s_64[5] = _mm_loadl_epi64((__m128i *)(src + 5 * 4));
1714
1.08M
  s_256[0] = _mm256_setr_m128i(s_64[4], s_64[5]);
1715
1.08M
  s_64[4] = _mm_loadl_epi64((__m128i *)(src + 6 * 4));
1716
  s_256[1] = _mm256_setr_m128i(s_64[5], s_64[4]);
1717
1.08M
  ss_256[2] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1718
1.08M
  const __m256i r = convolve16_6tap_avx2(ss_256, coeffs);
1719
1.08M
  ss_256[0] = ss_256[1];
1720
1.08M
  ss_256[1] = ss_256[2];
1721
1.08M
  return r;
1722
1.08M
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_6tap_4x2_avx2
1723
1724
// Applies convolve16_6tap_avx2() twice with the same coeffs: once on
// ss[0..2] (result in r[0]) and once on ss[3..5] (result in r[1]).
static inline void xy_y_convolve_6tap_16_avx2(const __m256i ss[6],
                                              const __m256i coeffs[3],
                                              __m256i r[2]) {
  for (int half = 0; half < 2; ++half) {
    r[half] = convolve16_6tap_avx2(&ss[3 * half], coeffs);
  }
}
convolve_2d_avx2.c:xy_y_convolve_6tap_16_avx2
Line
Count
Source
1726
10.0M
                                              __m256i r[2]) {
1727
10.0M
  r[0] = convolve16_6tap_avx2(ss, coeffs);
1728
10.0M
  r[1] = convolve16_6tap_avx2(ss + 3, coeffs);
1729
10.0M
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_6tap_16_avx2
1730
1731
static inline void xy_y_convolve_6tap_8x2_avx2(const int16_t *const src,
1732
                                               __m256i ss_256[6],
1733
                                               const __m256i coeffs[3],
1734
1.05M
                                               __m256i r[2]) {
1735
1.05M
  __m256i s_256[2];
1736
1.05M
  s_256[0] = _mm256_loadu_si256((__m256i *)(src + 4 * 8));
1737
1.05M
  s_256[1] = _mm256_loadu_si256((__m256i *)(src + 5 * 8));
1738
1.05M
  ss_256[2] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1739
1.05M
  ss_256[5] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
1740
1.05M
  xy_y_convolve_6tap_16_avx2(ss_256, coeffs, r);
1741
1.05M
  ss_256[0] = ss_256[1];
1742
1.05M
  ss_256[1] = ss_256[2];
1743
1.05M
  ss_256[3] = ss_256[4];
1744
1.05M
  ss_256[4] = ss_256[5];
1745
1.05M
}
convolve_2d_avx2.c:xy_y_convolve_6tap_8x2_avx2
Line
Count
Source
1734
1.05M
                                               __m256i r[2]) {
1735
1.05M
  __m256i s_256[2];
1736
1.05M
  s_256[0] = _mm256_loadu_si256((__m256i *)(src + 4 * 8));
1737
1.05M
  s_256[1] = _mm256_loadu_si256((__m256i *)(src + 5 * 8));
1738
1.05M
  ss_256[2] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1739
1.05M
  ss_256[5] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
1740
1.05M
  xy_y_convolve_6tap_16_avx2(ss_256, coeffs, r);
1741
1.05M
  ss_256[0] = ss_256[1];
1742
1.05M
  ss_256[1] = ss_256[2];
1743
1.05M
  ss_256[3] = ss_256[4];
1744
1.05M
  ss_256[4] = ss_256[5];
1745
1.05M
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_6tap_8x2_avx2
1746
1747
// "half_pel" 6-tap step on 8-wide int16 data. The outermost pair
// (s_256[0], s_256[5]) and the next pair (s_256[1], s_256[4]) are summed; the
// sums and the middle pair (s_256[2], s_256[3]) are interleaved and filtered
// with the 4-tap kernel via xy_y_convolve_4tap_16_avx2().
//   src   : 256-bit vectors are read at src + 32 and src + 40 into s_256[4]
//           and s_256[5].
//   s_256 : s_256[0..3] are consumed as supplied; on exit they hold the
//           former s_256[2..3] followed by the two vectors just loaded.
static inline void xy_y_convolve_6tap_8x2_half_pel_avx2(
    const int16_t *const src, const __m256i coeffs[2], __m256i s_256[6],
    __m256i r[2]) {
  const __m256i v0 = s_256[0];
  const __m256i v1 = s_256[1];
  const __m256i v2 = s_256[2];
  const __m256i v3 = s_256[3];

  const __m256i v4 = _mm256_loadu_si256((const __m256i *)(src + 32));
  s_256[4] = v4;
  const __m256i v5 = _mm256_loadu_si256((const __m256i *)(src + 40));
  s_256[5] = v5;

  const __m256i outer = _mm256_add_epi16(v0, v5);
  const __m256i inner = _mm256_add_epi16(v1, v4);

  // Slots 0/1 feed the low-half filter call, slots 2/3 the high-half one.
  __m256i window[4];
  window[0] = _mm256_unpacklo_epi16(outer, inner);
  window[1] = _mm256_unpacklo_epi16(v2, v3);
  window[2] = _mm256_unpackhi_epi16(outer, inner);
  window[3] = _mm256_unpackhi_epi16(v2, v3);
  xy_y_convolve_4tap_16_avx2(window, coeffs, r);

  // Slide the source window by two slots for the next call.
  s_256[0] = v2;
  s_256[1] = v3;
  s_256[2] = v4;
  s_256[3] = v5;
}
convolve_2d_avx2.c:xy_y_convolve_6tap_8x2_half_pel_avx2
Line
Count
Source
1749
335k
    __m256i r[2]) {
1750
335k
  __m256i a_256[2], ss_256[4];
1751
335k
  s_256[4] = _mm256_loadu_si256((__m256i *)(src + 4 * 8));
1752
335k
  s_256[5] = _mm256_loadu_si256((__m256i *)(src + 5 * 8));
1753
335k
  a_256[0] = _mm256_add_epi16(s_256[0], s_256[5]);
1754
335k
  a_256[1] = _mm256_add_epi16(s_256[1], s_256[4]);
1755
335k
  ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
1756
335k
  ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
1757
335k
  ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
1758
335k
  ss_256[3] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
1759
335k
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r);
1760
335k
  s_256[0] = s_256[2];
1761
335k
  s_256[1] = s_256[3];
1762
335k
  s_256[2] = s_256[4];
1763
335k
  s_256[3] = s_256[5];
1764
335k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_6tap_8x2_half_pel_avx2
1765
1766
// 6-tap step on 16-wide int16 data, two outputs per call.
//   src    : 256-bit vectors are read at src + 5 * stride and
//            src + 6 * stride.
//   s_256  : on entry s_256[4] supplies the operand preceding the first load.
//            On exit s_256[5] holds the first loaded vector, s_256[4] the
//            second.
//   ss_256 : interleave windows for the first output (slots 2 and 5 rebuilt).
//   tt_256 : interleave windows for the second output (slots 2 and 5 rebuilt).
//   r      : r[0..1] from ss_256, r[2..3] from tt_256, both via
//            xy_y_convolve_6tap_16_avx2().
// After filtering, each three-slot window in ss_256 and tt_256 slides down by
// one slot.
static inline void xy_y_convolve_6tap_16x2_avx2(
    const int16_t *const src, const ptrdiff_t stride, __m256i s_256[6],
    __m256i ss_256[6], __m256i tt_256[6], const __m256i coeffs[3],
    __m256i r[4]) {
  const __m256i v4 = s_256[4];

  const __m256i v5 = _mm256_loadu_si256((const __m256i *)(src + 5 * stride));
  s_256[5] = v5;
  ss_256[2] = _mm256_unpacklo_epi16(v4, v5);
  ss_256[5] = _mm256_unpackhi_epi16(v4, v5);

  const __m256i v6 = _mm256_loadu_si256((const __m256i *)(src + 6 * stride));
  s_256[4] = v6;
  tt_256[2] = _mm256_unpacklo_epi16(v5, v6);
  tt_256[5] = _mm256_unpackhi_epi16(v5, v6);

  xy_y_convolve_6tap_16_avx2(ss_256, coeffs, &r[0]);
  xy_y_convolve_6tap_16_avx2(tt_256, coeffs, &r[2]);

  // Slide the windows used for the first output.
  ss_256[0] = ss_256[1];
  ss_256[1] = ss_256[2];
  ss_256[3] = ss_256[4];
  ss_256[4] = ss_256[5];

  // Slide the windows used for the second output.
  tt_256[0] = tt_256[1];
  tt_256[1] = tt_256[2];
  tt_256[3] = tt_256[4];
  tt_256[4] = tt_256[5];
}
convolve_2d_avx2.c:xy_y_convolve_6tap_16x2_avx2
Line
Count
Source
1769
4.50M
    __m256i r[4]) {
1770
4.50M
  s_256[5] = _mm256_loadu_si256((__m256i *)(src + 5 * stride));
1771
4.50M
  ss_256[2] = _mm256_unpacklo_epi16(s_256[4], s_256[5]);
1772
4.50M
  ss_256[5] = _mm256_unpackhi_epi16(s_256[4], s_256[5]);
1773
4.50M
  s_256[4] = _mm256_loadu_si256((__m256i *)(src + 6 * stride));
1774
4.50M
  tt_256[2] = _mm256_unpacklo_epi16(s_256[5], s_256[4]);
1775
4.50M
  tt_256[5] = _mm256_unpackhi_epi16(s_256[5], s_256[4]);
1776
1777
4.50M
  xy_y_convolve_6tap_16_avx2(ss_256, coeffs, r + 0);
1778
4.50M
  xy_y_convolve_6tap_16_avx2(tt_256, coeffs, r + 2);
1779
1780
4.50M
  ss_256[0] = ss_256[1];
1781
4.50M
  ss_256[1] = ss_256[2];
1782
4.50M
  ss_256[3] = ss_256[4];
1783
4.50M
  ss_256[4] = ss_256[5];
1784
1785
4.50M
  tt_256[0] = tt_256[1];
1786
4.50M
  tt_256[1] = tt_256[2];
1787
4.50M
  tt_256[3] = tt_256[4];
1788
4.50M
  tt_256[4] = tt_256[5];
1789
4.50M
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_6tap_16x2_avx2
1790
1791
static inline void xy_y_convolve_6tap_16x2_half_pel_avx2(
1792
    const int16_t *const src, const ptrdiff_t stride, __m256i s_256[6],
1793
307k
    __m256i ss_256[4], const __m256i coeffs[2], __m256i r[4]) {
1794
307k
  __m256i a_256[2];
1795
1796
307k
  s_256[5] = _mm256_loadu_si256((__m256i *)(src + 5 * stride));
1797
307k
  a_256[0] = _mm256_add_epi16(s_256[0], s_256[5]);
1798
307k
  a_256[1] = _mm256_add_epi16(s_256[1], s_256[4]);
1799
307k
  ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
1800
307k
  ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
1801
307k
  ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
1802
307k
  ss_256[3] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
1803
307k
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 0);
1804
1805
307k
  a_256[1] = _mm256_add_epi16(s_256[2], s_256[5]);
1806
307k
  s_256[0] = s_256[2];
1807
307k
  s_256[2] = s_256[4];
1808
307k
  s_256[4] = _mm256_loadu_si256((__m256i *)(src + 6 * stride));
1809
307k
  a_256[0] = _mm256_add_epi16(s_256[1], s_256[4]);
1810
307k
  s_256[1] = s_256[3];
1811
307k
  s_256[3] = s_256[5];
1812
307k
  ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
1813
307k
  ss_256[1] = _mm256_unpacklo_epi16(s_256[1], s_256[2]);
1814
307k
  ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
1815
307k
  ss_256[3] = _mm256_unpackhi_epi16(s_256[1], s_256[2]);
1816
307k
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 2);
1817
307k
}
convolve_2d_avx2.c:xy_y_convolve_6tap_16x2_half_pel_avx2
Line
Count
Source
1793
307k
    __m256i ss_256[4], const __m256i coeffs[2], __m256i r[4]) {
1794
307k
  __m256i a_256[2];
1795
1796
307k
  s_256[5] = _mm256_loadu_si256((__m256i *)(src + 5 * stride));
1797
307k
  a_256[0] = _mm256_add_epi16(s_256[0], s_256[5]);
1798
307k
  a_256[1] = _mm256_add_epi16(s_256[1], s_256[4]);
1799
307k
  ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
1800
307k
  ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
1801
307k
  ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
1802
307k
  ss_256[3] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
1803
307k
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 0);
1804
1805
307k
  a_256[1] = _mm256_add_epi16(s_256[2], s_256[5]);
1806
307k
  s_256[0] = s_256[2];
1807
307k
  s_256[2] = s_256[4];
1808
307k
  s_256[4] = _mm256_loadu_si256((__m256i *)(src + 6 * stride));
1809
307k
  a_256[0] = _mm256_add_epi16(s_256[1], s_256[4]);
1810
307k
  s_256[1] = s_256[3];
1811
307k
  s_256[3] = s_256[5];
1812
307k
  ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
1813
307k
  ss_256[1] = _mm256_unpacklo_epi16(s_256[1], s_256[2]);
1814
307k
  ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
1815
307k
  ss_256[3] = _mm256_unpackhi_epi16(s_256[1], s_256[2]);
1816
307k
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 2);
1817
307k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_6tap_16x2_half_pel_avx2
1818
1819
static inline __m128i xy_y_convolve_8tap_2x2_sse2(const int16_t *const src,
1820
                                                  __m128i s_32[8],
1821
                                                  __m128i ss_128[4],
1822
11.1k
                                                  const __m128i coeffs[4]) {
1823
11.1k
  s_32[7] = _mm_cvtsi32_si128(loadu_int32(src + 7 * 2));
1824
11.1k
  const __m128i src67 = _mm_unpacklo_epi32(s_32[6], s_32[7]);
1825
11.1k
  s_32[6] = _mm_cvtsi32_si128(loadu_int32(src + 8 * 2));
1826
11.1k
  const __m128i src78 = _mm_unpacklo_epi32(s_32[7], s_32[6]);
1827
11.1k
  ss_128[3] = _mm_unpacklo_epi16(src67, src78);
1828
11.1k
  const __m128i r = convolve16_8tap_sse2(ss_128, coeffs);
1829
11.1k
  ss_128[0] = ss_128[1];
1830
11.1k
  ss_128[1] = ss_128[2];
1831
11.1k
  ss_128[2] = ss_128[3];
1832
11.1k
  return r;
1833
11.1k
}
convolve_2d_avx2.c:xy_y_convolve_8tap_2x2_sse2
Line
Count
Source
1822
11.1k
                                                  const __m128i coeffs[4]) {
1823
11.1k
  s_32[7] = _mm_cvtsi32_si128(loadu_int32(src + 7 * 2));
1824
11.1k
  const __m128i src67 = _mm_unpacklo_epi32(s_32[6], s_32[7]);
1825
11.1k
  s_32[6] = _mm_cvtsi32_si128(loadu_int32(src + 8 * 2));
1826
11.1k
  const __m128i src78 = _mm_unpacklo_epi32(s_32[7], s_32[6]);
1827
11.1k
  ss_128[3] = _mm_unpacklo_epi16(src67, src78);
1828
11.1k
  const __m128i r = convolve16_8tap_sse2(ss_128, coeffs);
1829
11.1k
  ss_128[0] = ss_128[1];
1830
11.1k
  ss_128[1] = ss_128[2];
1831
11.1k
  ss_128[2] = ss_128[3];
1832
11.1k
  return r;
1833
11.1k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_8tap_2x2_sse2
1834
1835
static inline __m256i xy_y_convolve_8tap_4x2_avx2(const int16_t *const src,
1836
                                                  __m128i s_64[8],
1837
                                                  __m256i ss_256[4],
1838
49.0k
                                                  const __m256i coeffs[4]) {
1839
49.0k
  __m256i s_256[2];
1840
49.0k
  s_64[7] = _mm_loadl_epi64((__m128i *)(src + 7 * 4));
1841
49.0k
  s_256[0] = _mm256_setr_m128i(s_64[6], s_64[7]);
1842
49.0k
  s_64[6] = _mm_loadl_epi64((__m128i *)(src + 8 * 4));
1843
49.0k
  s_256[1] = _mm256_setr_m128i(s_64[7], s_64[6]);
1844
49.0k
  ss_256[3] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1845
49.0k
  const __m256i r = convolve16_8tap_avx2(ss_256, coeffs);
1846
49.0k
  ss_256[0] = ss_256[1];
1847
49.0k
  ss_256[1] = ss_256[2];
1848
49.0k
  ss_256[2] = ss_256[3];
1849
49.0k
  return r;
1850
49.0k
}
convolve_2d_avx2.c:xy_y_convolve_8tap_4x2_avx2
Line
Count
Source
1838
49.0k
                                                  const __m256i coeffs[4]) {
1839
49.0k
  __m256i s_256[2];
1840
49.0k
  s_64[7] = _mm_loadl_epi64((__m128i *)(src + 7 * 4));
1841
49.0k
  s_256[0] = _mm256_setr_m128i(s_64[6], s_64[7]);
1842
49.0k
  s_64[6] = _mm_loadl_epi64((__m128i *)(src + 8 * 4));
1843
49.0k
  s_256[1] = _mm256_setr_m128i(s_64[7], s_64[6]);
1844
49.0k
  ss_256[3] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1845
49.0k
  const __m256i r = convolve16_8tap_avx2(ss_256, coeffs);
1846
49.0k
  ss_256[0] = ss_256[1];
1847
49.0k
  ss_256[1] = ss_256[2];
1848
49.0k
  ss_256[2] = ss_256[3];
1849
49.0k
  return r;
1850
49.0k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_8tap_4x2_avx2
1851
1852
static inline void xy_y_convolve_8tap_16_avx2(const __m256i *const ss,
1853
                                              const __m256i coeffs[4],
1854
2.47M
                                              __m256i r[2]) {
1855
2.47M
  r[0] = convolve16_8tap_avx2(ss, coeffs);
1856
2.47M
  r[1] = convolve16_8tap_avx2(ss + 4, coeffs);
1857
2.47M
}
convolve_2d_avx2.c:xy_y_convolve_8tap_16_avx2
Line
Count
Source
1854
2.47M
                                              __m256i r[2]) {
1855
2.47M
  r[0] = convolve16_8tap_avx2(ss, coeffs);
1856
2.47M
  r[1] = convolve16_8tap_avx2(ss + 4, coeffs);
1857
2.47M
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_8tap_16_avx2
1858
1859
static inline void xy_y_convolve_8tap_8x2_avx2(const int16_t *const src,
1860
                                               __m256i ss_256[8],
1861
                                               const __m256i coeffs[4],
1862
38.8k
                                               __m256i r[2]) {
1863
38.8k
  __m256i s_256[2];
1864
38.8k
  s_256[0] = _mm256_loadu_si256((__m256i *)(src + 6 * 8));
1865
38.8k
  s_256[1] = _mm256_loadu_si256((__m256i *)(src + 7 * 8));
1866
38.8k
  ss_256[3] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1867
38.8k
  ss_256[7] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
1868
38.8k
  xy_y_convolve_8tap_16_avx2(ss_256, coeffs, r);
1869
38.8k
  ss_256[0] = ss_256[1];
1870
38.8k
  ss_256[1] = ss_256[2];
1871
38.8k
  ss_256[2] = ss_256[3];
1872
38.8k
  ss_256[4] = ss_256[5];
1873
38.8k
  ss_256[5] = ss_256[6];
1874
38.8k
  ss_256[6] = ss_256[7];
1875
38.8k
}
convolve_2d_avx2.c:xy_y_convolve_8tap_8x2_avx2
Line
Count
Source
1862
38.8k
                                               __m256i r[2]) {
1863
38.8k
  __m256i s_256[2];
1864
38.8k
  s_256[0] = _mm256_loadu_si256((__m256i *)(src + 6 * 8));
1865
38.8k
  s_256[1] = _mm256_loadu_si256((__m256i *)(src + 7 * 8));
1866
38.8k
  ss_256[3] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1867
38.8k
  ss_256[7] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
1868
38.8k
  xy_y_convolve_8tap_16_avx2(ss_256, coeffs, r);
1869
38.8k
  ss_256[0] = ss_256[1];
1870
38.8k
  ss_256[1] = ss_256[2];
1871
38.8k
  ss_256[2] = ss_256[3];
1872
38.8k
  ss_256[4] = ss_256[5];
1873
38.8k
  ss_256[5] = ss_256[6];
1874
38.8k
  ss_256[6] = ss_256[7];
1875
38.8k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_8tap_8x2_avx2
1876
1877
static inline void xy_y_convolve_8tap_8x2_half_pel_avx2(
1878
    const int16_t *const src, const __m256i coeffs[2], __m256i s_256[8],
1879
17.9k
    __m256i r[2]) {
1880
17.9k
  __m256i a_256[4], ss_256[4];
1881
1882
17.9k
  s_256[6] = _mm256_loadu_si256((__m256i *)(src + 6 * 8));
1883
17.9k
  s_256[7] = _mm256_loadu_si256((__m256i *)(src + 7 * 8));
1884
17.9k
  a_256[0] = _mm256_add_epi16(s_256[0], s_256[7]);
1885
17.9k
  a_256[1] = _mm256_add_epi16(s_256[1], s_256[6]);
1886
17.9k
  a_256[2] = _mm256_add_epi16(s_256[2], s_256[5]);
1887
17.9k
  a_256[3] = _mm256_add_epi16(s_256[3], s_256[4]);
1888
17.9k
  ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
1889
17.9k
  ss_256[1] = _mm256_unpacklo_epi16(a_256[2], a_256[3]);
1890
17.9k
  ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
1891
17.9k
  ss_256[3] = _mm256_unpackhi_epi16(a_256[2], a_256[3]);
1892
17.9k
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r);
1893
17.9k
  s_256[0] = s_256[2];
1894
17.9k
  s_256[1] = s_256[3];
1895
17.9k
  s_256[2] = s_256[4];
1896
17.9k
  s_256[3] = s_256[5];
1897
17.9k
  s_256[4] = s_256[6];
1898
17.9k
  s_256[5] = s_256[7];
1899
17.9k
}
convolve_2d_avx2.c:xy_y_convolve_8tap_8x2_half_pel_avx2
Line
Count
Source
1879
17.9k
    __m256i r[2]) {
1880
17.9k
  __m256i a_256[4], ss_256[4];
1881
1882
17.9k
  s_256[6] = _mm256_loadu_si256((__m256i *)(src + 6 * 8));
1883
17.9k
  s_256[7] = _mm256_loadu_si256((__m256i *)(src + 7 * 8));
1884
17.9k
  a_256[0] = _mm256_add_epi16(s_256[0], s_256[7]);
1885
17.9k
  a_256[1] = _mm256_add_epi16(s_256[1], s_256[6]);
1886
17.9k
  a_256[2] = _mm256_add_epi16(s_256[2], s_256[5]);
1887
17.9k
  a_256[3] = _mm256_add_epi16(s_256[3], s_256[4]);
1888
17.9k
  ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
1889
17.9k
  ss_256[1] = _mm256_unpacklo_epi16(a_256[2], a_256[3]);
1890
17.9k
  ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
1891
17.9k
  ss_256[3] = _mm256_unpackhi_epi16(a_256[2], a_256[3]);
1892
17.9k
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r);
1893
17.9k
  s_256[0] = s_256[2];
1894
17.9k
  s_256[1] = s_256[3];
1895
17.9k
  s_256[2] = s_256[4];
1896
17.9k
  s_256[3] = s_256[5];
1897
17.9k
  s_256[4] = s_256[6];
1898
17.9k
  s_256[5] = s_256[7];
1899
17.9k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_8tap_8x2_half_pel_avx2
1900
1901
static AOM_FORCE_INLINE void xy_y_convolve_8tap_16x2_avx2(
1902
    const int16_t *const src, const ptrdiff_t stride, const __m256i coeffs[4],
1903
1.21M
    __m256i s_256[8], __m256i ss_256[8], __m256i tt_256[8], __m256i r[4]) {
1904
1.21M
  s_256[7] = _mm256_loadu_si256((__m256i *)(src + 7 * stride));
1905
1.21M
  ss_256[3] = _mm256_unpacklo_epi16(s_256[6], s_256[7]);
1906
1.21M
  ss_256[7] = _mm256_unpackhi_epi16(s_256[6], s_256[7]);
1907
1.21M
  s_256[6] = _mm256_loadu_si256((__m256i *)(src + 8 * stride));
1908
1.21M
  tt_256[3] = _mm256_unpacklo_epi16(s_256[7], s_256[6]);
1909
1.21M
  tt_256[7] = _mm256_unpackhi_epi16(s_256[7], s_256[6]);
1910
1911
1.21M
  xy_y_convolve_8tap_16_avx2(ss_256, coeffs, r + 0);
1912
1.21M
  xy_y_convolve_8tap_16_avx2(tt_256, coeffs, r + 2);
1913
1914
1.21M
  ss_256[0] = ss_256[1];
1915
1.21M
  ss_256[1] = ss_256[2];
1916
1.21M
  ss_256[2] = ss_256[3];
1917
1.21M
  ss_256[4] = ss_256[5];
1918
1.21M
  ss_256[5] = ss_256[6];
1919
1.21M
  ss_256[6] = ss_256[7];
1920
1921
1.21M
  tt_256[0] = tt_256[1];
1922
1.21M
  tt_256[1] = tt_256[2];
1923
1.21M
  tt_256[2] = tt_256[3];
1924
1.21M
  tt_256[4] = tt_256[5];
1925
1.21M
  tt_256[5] = tt_256[6];
1926
1.21M
  tt_256[6] = tt_256[7];
1927
1.21M
}
convolve_2d_avx2.c:xy_y_convolve_8tap_16x2_avx2
Line
Count
Source
1903
1.21M
    __m256i s_256[8], __m256i ss_256[8], __m256i tt_256[8], __m256i r[4]) {
1904
1.21M
  s_256[7] = _mm256_loadu_si256((__m256i *)(src + 7 * stride));
1905
1.21M
  ss_256[3] = _mm256_unpacklo_epi16(s_256[6], s_256[7]);
1906
1.21M
  ss_256[7] = _mm256_unpackhi_epi16(s_256[6], s_256[7]);
1907
1.21M
  s_256[6] = _mm256_loadu_si256((__m256i *)(src + 8 * stride));
1908
1.21M
  tt_256[3] = _mm256_unpacklo_epi16(s_256[7], s_256[6]);
1909
1.21M
  tt_256[7] = _mm256_unpackhi_epi16(s_256[7], s_256[6]);
1910
1911
1.21M
  xy_y_convolve_8tap_16_avx2(ss_256, coeffs, r + 0);
1912
1.21M
  xy_y_convolve_8tap_16_avx2(tt_256, coeffs, r + 2);
1913
1914
1.21M
  ss_256[0] = ss_256[1];
1915
1.21M
  ss_256[1] = ss_256[2];
1916
1.21M
  ss_256[2] = ss_256[3];
1917
1.21M
  ss_256[4] = ss_256[5];
1918
1.21M
  ss_256[5] = ss_256[6];
1919
1.21M
  ss_256[6] = ss_256[7];
1920
1921
1.21M
  tt_256[0] = tt_256[1];
1922
1.21M
  tt_256[1] = tt_256[2];
1923
1.21M
  tt_256[2] = tt_256[3];
1924
1.21M
  tt_256[4] = tt_256[5];
1925
1.21M
  tt_256[5] = tt_256[6];
1926
1.21M
  tt_256[6] = tt_256[7];
1927
1.21M
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_8tap_16x2_avx2
1928
1929
static inline void xy_y_convolve_8tap_16x2_half_pel_avx2(
1930
    const int16_t *const src, const ptrdiff_t stride, const __m256i coeffs[4],
1931
17.3k
    __m256i s_256[8], __m256i r[4]) {
1932
17.3k
  __m256i a_256[4], ss_256[4];
1933
17.3k
  s_256[7] = _mm256_loadu_si256((__m256i *)(src + 7 * stride));
1934
1935
17.3k
  a_256[0] = _mm256_add_epi16(s_256[0], s_256[7]);
1936
17.3k
  a_256[1] = _mm256_add_epi16(s_256[1], s_256[6]);
1937
17.3k
  a_256[2] = _mm256_add_epi16(s_256[2], s_256[5]);
1938
17.3k
  a_256[3] = _mm256_add_epi16(s_256[3], s_256[4]);
1939
17.3k
  ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
1940
17.3k
  ss_256[1] = _mm256_unpacklo_epi16(a_256[2], a_256[3]);
1941
17.3k
  ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
1942
17.3k
  ss_256[3] = _mm256_unpackhi_epi16(a_256[2], a_256[3]);
1943
1944
17.3k
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 0);
1945
1946
17.3k
  a_256[1] = _mm256_add_epi16(s_256[2], s_256[7]);
1947
17.3k
  a_256[2] = _mm256_add_epi16(s_256[3], s_256[6]);
1948
17.3k
  a_256[3] = _mm256_add_epi16(s_256[4], s_256[5]);
1949
17.3k
  s_256[0] = s_256[2];
1950
17.3k
  s_256[2] = s_256[4];
1951
17.3k
  s_256[4] = s_256[6];
1952
17.3k
  s_256[6] = _mm256_loadu_si256((__m256i *)(src + 8 * stride));
1953
1954
17.3k
  a_256[0] = _mm256_add_epi16(s_256[1], s_256[6]);
1955
17.3k
  s_256[1] = s_256[3];
1956
17.3k
  s_256[3] = s_256[5];
1957
17.3k
  s_256[5] = s_256[7];
1958
17.3k
  ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
1959
17.3k
  ss_256[1] = _mm256_unpacklo_epi16(a_256[2], a_256[3]);
1960
17.3k
  ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
1961
17.3k
  ss_256[3] = _mm256_unpackhi_epi16(a_256[2], a_256[3]);
1962
1963
17.3k
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 2);
1964
17.3k
}
convolve_2d_avx2.c:xy_y_convolve_8tap_16x2_half_pel_avx2
Line
Count
Source
1931
17.3k
    __m256i s_256[8], __m256i r[4]) {
1932
17.3k
  __m256i a_256[4], ss_256[4];
1933
17.3k
  s_256[7] = _mm256_loadu_si256((__m256i *)(src + 7 * stride));
1934
1935
17.3k
  a_256[0] = _mm256_add_epi16(s_256[0], s_256[7]);
1936
17.3k
  a_256[1] = _mm256_add_epi16(s_256[1], s_256[6]);
1937
17.3k
  a_256[2] = _mm256_add_epi16(s_256[2], s_256[5]);
1938
17.3k
  a_256[3] = _mm256_add_epi16(s_256[3], s_256[4]);
1939
17.3k
  ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
1940
17.3k
  ss_256[1] = _mm256_unpacklo_epi16(a_256[2], a_256[3]);
1941
17.3k
  ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
1942
17.3k
  ss_256[3] = _mm256_unpackhi_epi16(a_256[2], a_256[3]);
1943
1944
17.3k
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 0);
1945
1946
17.3k
  a_256[1] = _mm256_add_epi16(s_256[2], s_256[7]);
1947
17.3k
  a_256[2] = _mm256_add_epi16(s_256[3], s_256[6]);
1948
17.3k
  a_256[3] = _mm256_add_epi16(s_256[4], s_256[5]);
1949
17.3k
  s_256[0] = s_256[2];
1950
17.3k
  s_256[2] = s_256[4];
1951
17.3k
  s_256[4] = s_256[6];
1952
17.3k
  s_256[6] = _mm256_loadu_si256((__m256i *)(src + 8 * stride));
1953
1954
17.3k
  a_256[0] = _mm256_add_epi16(s_256[1], s_256[6]);
1955
17.3k
  s_256[1] = s_256[3];
1956
17.3k
  s_256[3] = s_256[5];
1957
17.3k
  s_256[5] = s_256[7];
1958
17.3k
  ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
1959
17.3k
  ss_256[1] = _mm256_unpacklo_epi16(a_256[2], a_256[3]);
1960
17.3k
  ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
1961
17.3k
  ss_256[3] = _mm256_unpackhi_epi16(a_256[2], a_256[3]);
1962
1963
17.3k
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 2);
1964
17.3k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_8tap_16x2_half_pel_avx2
1965
1966
static inline void xy_y_round_store_8x2_avx2(const __m256i res[2],
1967
                                             uint8_t *const dst,
1968
2.09M
                                             const ptrdiff_t stride) {
1969
2.09M
  const __m256i r = xy_y_round_16_avx2(res);
1970
2.09M
  pack_store_8x2_avx2(r, dst, stride);
1971
2.09M
}
convolve_2d_avx2.c:xy_y_round_store_8x2_avx2
Line
Count
Source
1968
2.09M
                                             const ptrdiff_t stride) {
1969
2.09M
  const __m256i r = xy_y_round_16_avx2(res);
1970
2.09M
  pack_store_8x2_avx2(r, dst, stride);
1971
2.09M
}
Unexecuted instantiation: convolve_avx2.c:xy_y_round_store_8x2_avx2
1972
1973
static inline void xy_y_round_store_16x2_avx2(const __m256i res[4],
1974
                                              uint8_t *const dst,
1975
1.61M
                                              const ptrdiff_t stride) {
1976
1.61M
  const __m256i r0 = xy_y_round_16_avx2(res + 0);
1977
1.61M
  const __m256i r1 = xy_y_round_16_avx2(res + 2);
1978
1.61M
  xy_y_pack_store_16x2_avx2(r0, r1, dst, stride);
1979
1.61M
}
convolve_2d_avx2.c:xy_y_round_store_16x2_avx2
Line
Count
Source
1975
1.61M
                                              const ptrdiff_t stride) {
1976
1.61M
  const __m256i r0 = xy_y_round_16_avx2(res + 0);
1977
1.61M
  const __m256i r1 = xy_y_round_16_avx2(res + 2);
1978
1.61M
  xy_y_pack_store_16x2_avx2(r0, r1, dst, stride);
1979
1.61M
}
Unexecuted instantiation: convolve_avx2.c:xy_y_round_store_16x2_avx2
1980
1981
static inline void sr_y_round_store_32_avx2(const __m256i res[2],
1982
1.91M
                                            uint8_t *const dst) {
1983
1.91M
  __m256i r[2];
1984
1985
1.91M
  r[0] = sr_y_round_avx2(res[0]);
1986
1.91M
  r[1] = sr_y_round_avx2(res[1]);
1987
1.91M
  convolve_store_32_avx2(r[0], r[1], dst);
1988
1.91M
}
Unexecuted instantiation: convolve_2d_avx2.c:sr_y_round_store_32_avx2
convolve_avx2.c:sr_y_round_store_32_avx2
Line
Count
Source
1982
1.91M
                                            uint8_t *const dst) {
1983
1.91M
  __m256i r[2];
1984
1985
1.91M
  r[0] = sr_y_round_avx2(res[0]);
1986
1.91M
  r[1] = sr_y_round_avx2(res[1]);
1987
1.91M
  convolve_store_32_avx2(r[0], r[1], dst);
1988
1.91M
}
1989
1990
static inline void sr_y_round_store_32x2_avx2(const __m256i res[4],
1991
                                              uint8_t *const dst,
1992
898k
                                              const int32_t dst_stride) {
1993
898k
  sr_y_round_store_32_avx2(res, dst);
1994
898k
  sr_y_round_store_32_avx2(res + 2, dst + dst_stride);
1995
898k
}
Unexecuted instantiation: convolve_2d_avx2.c:sr_y_round_store_32x2_avx2
convolve_avx2.c:sr_y_round_store_32x2_avx2
Line
Count
Source
1992
898k
                                              const int32_t dst_stride) {
1993
898k
  sr_y_round_store_32_avx2(res, dst);
1994
898k
  sr_y_round_store_32_avx2(res + 2, dst + dst_stride);
1995
898k
}
1996
1997
static inline void sr_y_2tap_32_avx2(const uint8_t *const src,
1998
                                     const __m256i coeffs[1], const __m256i s0,
1999
116k
                                     __m256i *const s1, uint8_t *const dst) {
2000
116k
  __m256i r[2];
2001
116k
  y_convolve_2tap_32_avx2(src, coeffs, s0, s1, r);
2002
116k
  sr_y_round_store_32_avx2(r, dst);
2003
116k
}
Unexecuted instantiation: convolve_2d_avx2.c:sr_y_2tap_32_avx2
convolve_avx2.c:sr_y_2tap_32_avx2
Line
Count
Source
1999
116k
                                     __m256i *const s1, uint8_t *const dst) {
2000
116k
  __m256i r[2];
2001
116k
  y_convolve_2tap_32_avx2(src, coeffs, s0, s1, r);
2002
116k
  sr_y_round_store_32_avx2(r, dst);
2003
116k
}
2004
2005
static AOM_FORCE_INLINE void av1_convolve_y_sr_specialized_avx2(
2006
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
2007
    int32_t w, int32_t h, const InterpFilterParams *filter_params_y,
2008
655k
    const int32_t subpel_y_q4) {
2009
655k
  int32_t x, y;
2010
655k
  __m128i coeffs_128[4];
2011
655k
  __m256i coeffs_256[4];
2012
2013
655k
  int vert_tap = get_filter_tap(filter_params_y, subpel_y_q4);
2014
2015
655k
  if (vert_tap == 2) {
2016
    // vert_filt as 2 tap
2017
40.9k
    const uint8_t *src_ptr = src;
2018
2019
40.9k
    y = h;
2020
2021
40.9k
    if (subpel_y_q4 != 8) {
2022
13.9k
      if (w <= 8) {
2023
10.5k
        prepare_half_coeffs_2tap_ssse3(filter_params_y, subpel_y_q4,
2024
10.5k
                                       coeffs_128);
2025
2026
10.5k
        if (w == 2) {
2027
1.87k
          __m128i s_16[2];
2028
2029
1.87k
          s_16[0] = _mm_cvtsi32_si128(*(int16_t *)src_ptr);
2030
2031
3.45k
          do {
2032
3.45k
            const __m128i res = y_convolve_2tap_2x2_ssse3(src_ptr, src_stride,
2033
3.45k
                                                          coeffs_128, s_16);
2034
3.45k
            const __m128i r = sr_y_round_sse2(res);
2035
3.45k
            pack_store_2x2_sse2(r, dst, dst_stride);
2036
3.45k
            src_ptr += 2 * src_stride;
2037
3.45k
            dst += 2 * dst_stride;
2038
3.45k
            y -= 2;
2039
3.45k
          } while (y);
2040
8.64k
        } else if (w == 4) {
2041
4.81k
          __m128i s_32[2];
2042
2043
4.81k
          s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr));
2044
2045
14.2k
          do {
2046
14.2k
            const __m128i res = y_convolve_2tap_4x2_ssse3(src_ptr, src_stride,
2047
14.2k
                                                          coeffs_128, s_32);
2048
14.2k
            const __m128i r = sr_y_round_sse2(res);
2049
14.2k
            pack_store_4x2_sse2(r, dst, dst_stride);
2050
14.2k
            src_ptr += 2 * src_stride;
2051
14.2k
            dst += 2 * dst_stride;
2052
14.2k
            y -= 2;
2053
14.2k
          } while (y);
2054
4.81k
        } else {
2055
3.83k
          __m128i s_64[2], s_128[2];
2056
2057
3.83k
          assert(w == 8);
2058
2059
3.83k
          s_64[0] = _mm_loadl_epi64((__m128i *)src_ptr);
2060
2061
12.7k
          do {
2062
            // Note: Faster than binding to AVX2 registers.
2063
12.7k
            s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + src_stride));
2064
12.7k
            s_128[0] = _mm_unpacklo_epi64(s_64[0], s_64[1]);
2065
12.7k
            s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
2066
12.7k
            s_128[1] = _mm_unpacklo_epi64(s_64[1], s_64[0]);
2067
12.7k
            const __m128i ss0 = _mm_unpacklo_epi8(s_128[0], s_128[1]);
2068
12.7k
            const __m128i ss1 = _mm_unpackhi_epi8(s_128[0], s_128[1]);
2069
12.7k
            const __m128i res0 = convolve_2tap_ssse3(&ss0, coeffs_128);
2070
12.7k
            const __m128i res1 = convolve_2tap_ssse3(&ss1, coeffs_128);
2071
12.7k
            const __m128i r0 = sr_y_round_sse2(res0);
2072
12.7k
            const __m128i r1 = sr_y_round_sse2(res1);
2073
12.7k
            const __m128i d = _mm_packus_epi16(r0, r1);
2074
12.7k
            _mm_storel_epi64((__m128i *)dst, d);
2075
12.7k
            _mm_storeh_epi64((__m128i *)(dst + dst_stride), d);
2076
12.7k
            src_ptr += 2 * src_stride;
2077
12.7k
            dst += 2 * dst_stride;
2078
12.7k
            y -= 2;
2079
12.7k
          } while (y);
2080
3.83k
        }
2081
10.5k
      } else {
2082
3.44k
        prepare_half_coeffs_2tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
2083
2084
3.44k
        if (w == 16) {
2085
1.98k
          __m128i s_128[2];
2086
2087
1.98k
          s_128[0] = _mm_loadu_si128((__m128i *)src_ptr);
2088
2089
13.9k
          do {
2090
13.9k
            __m256i r[2];
2091
2092
13.9k
            y_convolve_2tap_16x2_avx2(src_ptr, src_stride, coeffs_256, s_128,
2093
13.9k
                                      r);
2094
13.9k
            sr_y_round_store_16x2_avx2(r, dst, dst_stride);
2095
13.9k
            src_ptr += 2 * src_stride;
2096
13.9k
            dst += 2 * dst_stride;
2097
13.9k
            y -= 2;
2098
13.9k
          } while (y);
2099
1.98k
        } else if (w == 32) {
2100
907
          __m256i s_256[2];
2101
2102
907
          s_256[0] = _mm256_loadu_si256((__m256i *)src_ptr);
2103
2104
12.8k
          do {
2105
12.8k
            sr_y_2tap_32_avx2(src_ptr + src_stride, coeffs_256, s_256[0],
2106
12.8k
                              &s_256[1], dst);
2107
12.8k
            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride, coeffs_256, s_256[1],
2108
12.8k
                              &s_256[0], dst + dst_stride);
2109
12.8k
            src_ptr += 2 * src_stride;
2110
12.8k
            dst += 2 * dst_stride;
2111
12.8k
            y -= 2;
2112
12.8k
          } while (y);
2113
907
        } else if (w == 64) {
2114
433
          __m256i s_256[2][2];
2115
2116
433
          s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
2117
433
          s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));
2118
2119
11.1k
          do {
2120
11.1k
            sr_y_2tap_32_avx2(src_ptr + src_stride, coeffs_256, s_256[0][0],
2121
11.1k
                              &s_256[1][0], dst);
2122
11.1k
            sr_y_2tap_32_avx2(src_ptr + src_stride + 32, coeffs_256,
2123
11.1k
                              s_256[0][1], &s_256[1][1], dst + 32);
2124
11.1k
            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride, coeffs_256, s_256[1][0],
2125
11.1k
                              &s_256[0][0], dst + dst_stride);
2126
11.1k
            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride + 32, coeffs_256,
2127
11.1k
                              s_256[1][1], &s_256[0][1], dst + dst_stride + 32);
2128
2129
11.1k
            src_ptr += 2 * src_stride;
2130
11.1k
            dst += 2 * dst_stride;
2131
11.1k
            y -= 2;
2132
11.1k
          } while (y);
2133
433
        } else {
2134
119
          __m256i s_256[2][4];
2135
2136
119
          assert(w == 128);
2137
2138
119
          s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
2139
119
          s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));
2140
119
          s_256[0][2] = _mm256_loadu_si256((__m256i *)(src_ptr + 2 * 32));
2141
119
          s_256[0][3] = _mm256_loadu_si256((__m256i *)(src_ptr + 3 * 32));
2142
2143
5.69k
          do {
2144
5.69k
            sr_y_2tap_32_avx2(src_ptr + src_stride, coeffs_256, s_256[0][0],
2145
5.69k
                              &s_256[1][0], dst);
2146
5.69k
            sr_y_2tap_32_avx2(src_ptr + src_stride + 1 * 32, coeffs_256,
2147
5.69k
                              s_256[0][1], &s_256[1][1], dst + 1 * 32);
2148
5.69k
            sr_y_2tap_32_avx2(src_ptr + src_stride + 2 * 32, coeffs_256,
2149
5.69k
                              s_256[0][2], &s_256[1][2], dst + 2 * 32);
2150
5.69k
            sr_y_2tap_32_avx2(src_ptr + src_stride + 3 * 32, coeffs_256,
2151
5.69k
                              s_256[0][3], &s_256[1][3], dst + 3 * 32);
2152
2153
5.69k
            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride, coeffs_256, s_256[1][0],
2154
5.69k
                              &s_256[0][0], dst + dst_stride);
2155
5.69k
            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride + 1 * 32, coeffs_256,
2156
5.69k
                              s_256[1][1], &s_256[0][1],
2157
5.69k
                              dst + dst_stride + 1 * 32);
2158
5.69k
            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride + 2 * 32, coeffs_256,
2159
5.69k
                              s_256[1][2], &s_256[0][2],
2160
5.69k
                              dst + dst_stride + 2 * 32);
2161
5.69k
            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride + 3 * 32, coeffs_256,
2162
5.69k
                              s_256[1][3], &s_256[0][3],
2163
5.69k
                              dst + dst_stride + 3 * 32);
2164
2165
5.69k
            src_ptr += 2 * src_stride;
2166
5.69k
            dst += 2 * dst_stride;
2167
5.69k
            y -= 2;
2168
5.69k
          } while (y);
2169
119
        }
2170
3.44k
      }
2171
26.9k
    } else {
2172
      // average to get half pel
2173
26.9k
      if (w <= 8) {
2174
23.5k
        if (w == 2) {
2175
5.16k
          __m128i s_16[2];
2176
2177
5.16k
          s_16[0] = _mm_cvtsi32_si128(*(int16_t *)src_ptr);
2178
2179
10.5k
          do {
2180
10.5k
            s_16[1] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + src_stride));
2181
10.5k
            const __m128i d0 = _mm_avg_epu8(s_16[0], s_16[1]);
2182
10.5k
            *(int16_t *)dst = (int16_t)_mm_cvtsi128_si32(d0);
2183
10.5k
            s_16[0] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 2 * src_stride));
2184
10.5k
            const __m128i d1 = _mm_avg_epu8(s_16[1], s_16[0]);
2185
10.5k
            *(int16_t *)(dst + dst_stride) = (int16_t)_mm_cvtsi128_si32(d1);
2186
10.5k
            src_ptr += 2 * src_stride;
2187
10.5k
            dst += 2 * dst_stride;
2188
10.5k
            y -= 2;
2189
10.5k
          } while (y);
2190
18.3k
        } else if (w == 4) {
2191
11.3k
          __m128i s_32[2];
2192
2193
11.3k
          s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr));
2194
2195
31.4k
          do {
2196
31.4k
            s_32[1] = _mm_cvtsi32_si128(loadu_int32(src_ptr + src_stride));
2197
31.4k
            const __m128i d0 = _mm_avg_epu8(s_32[0], s_32[1]);
2198
31.4k
            xx_storel_32(dst, d0);
2199
31.4k
            s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 2 * src_stride));
2200
31.4k
            const __m128i d1 = _mm_avg_epu8(s_32[1], s_32[0]);
2201
31.4k
            xx_storel_32(dst + dst_stride, d1);
2202
31.4k
            src_ptr += 2 * src_stride;
2203
31.4k
            dst += 2 * dst_stride;
2204
31.4k
            y -= 2;
2205
31.4k
          } while (y);
2206
11.3k
        } else {
2207
7.00k
          __m128i s_64[2];
2208
2209
7.00k
          assert(w == 8);
2210
2211
7.00k
          s_64[0] = _mm_loadl_epi64((__m128i *)src_ptr);
2212
2213
22.4k
          do {
2214
            // Note: Faster than binding to AVX2 registers.
2215
22.4k
            s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + src_stride));
2216
22.4k
            const __m128i d0 = _mm_avg_epu8(s_64[0], s_64[1]);
2217
22.4k
            _mm_storel_epi64((__m128i *)dst, d0);
2218
22.4k
            s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
2219
22.4k
            const __m128i d1 = _mm_avg_epu8(s_64[1], s_64[0]);
2220
22.4k
            _mm_storel_epi64((__m128i *)(dst + dst_stride), d1);
2221
22.4k
            src_ptr += 2 * src_stride;
2222
22.4k
            dst += 2 * dst_stride;
2223
22.4k
            y -= 2;
2224
22.4k
          } while (y);
2225
7.00k
        }
2226
23.5k
      } else if (w == 16) {
2227
2.36k
        __m128i s_128[2];
2228
2229
2.36k
        s_128[0] = _mm_loadu_si128((__m128i *)src_ptr);
2230
2231
12.9k
        do {
2232
12.9k
          s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + src_stride));
2233
12.9k
          const __m128i d0 = _mm_avg_epu8(s_128[0], s_128[1]);
2234
12.9k
          _mm_storeu_si128((__m128i *)dst, d0);
2235
12.9k
          s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
2236
12.9k
          const __m128i d1 = _mm_avg_epu8(s_128[1], s_128[0]);
2237
12.9k
          _mm_storeu_si128((__m128i *)(dst + dst_stride), d1);
2238
12.9k
          src_ptr += 2 * src_stride;
2239
12.9k
          dst += 2 * dst_stride;
2240
12.9k
          y -= 2;
2241
12.9k
        } while (y);
2242
2.36k
      } else if (w == 32) {
2243
691
        __m256i s_256[2];
2244
2245
691
        s_256[0] = _mm256_loadu_si256((__m256i *)src_ptr);
2246
2247
9.43k
        do {
2248
9.43k
          sr_y_2tap_32_avg_avx2(src_ptr + src_stride, s_256[0], &s_256[1], dst);
2249
9.43k
          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride, s_256[1], &s_256[0],
2250
9.43k
                                dst + dst_stride);
2251
9.43k
          src_ptr += 2 * src_stride;
2252
9.43k
          dst += 2 * dst_stride;
2253
9.43k
          y -= 2;
2254
9.43k
        } while (y);
2255
691
      } else if (w == 64) {
2256
324
        __m256i s_256[2][2];
2257
2258
324
        s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
2259
324
        s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));
2260
2261
8.94k
        do {
2262
8.94k
          sr_y_2tap_32_avg_avx2(src_ptr + src_stride, s_256[0][0], &s_256[1][0],
2263
8.94k
                                dst);
2264
8.94k
          sr_y_2tap_32_avg_avx2(src_ptr + src_stride + 32, s_256[0][1],
2265
8.94k
                                &s_256[1][1], dst + 32);
2266
2267
8.94k
          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride, s_256[1][0],
2268
8.94k
                                &s_256[0][0], dst + dst_stride);
2269
8.94k
          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride + 32, s_256[1][1],
2270
8.94k
                                &s_256[0][1], dst + dst_stride + 32);
2271
2272
8.94k
          src_ptr += 2 * src_stride;
2273
8.94k
          dst += 2 * dst_stride;
2274
8.94k
          y -= 2;
2275
8.94k
        } while (y);
2276
324
      } else {
2277
73
        __m256i s_256[2][4];
2278
2279
73
        assert(w == 128);
2280
2281
73
        s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
2282
73
        s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));
2283
73
        s_256[0][2] = _mm256_loadu_si256((__m256i *)(src_ptr + 2 * 32));
2284
73
        s_256[0][3] = _mm256_loadu_si256((__m256i *)(src_ptr + 3 * 32));
2285
2286
3.55k
        do {
2287
3.55k
          sr_y_2tap_32_avg_avx2(src_ptr + src_stride, s_256[0][0], &s_256[1][0],
2288
3.55k
                                dst);
2289
3.55k
          sr_y_2tap_32_avg_avx2(src_ptr + src_stride + 1 * 32, s_256[0][1],
2290
3.55k
                                &s_256[1][1], dst + 1 * 32);
2291
3.55k
          sr_y_2tap_32_avg_avx2(src_ptr + src_stride + 2 * 32, s_256[0][2],
2292
3.55k
                                &s_256[1][2], dst + 2 * 32);
2293
3.55k
          sr_y_2tap_32_avg_avx2(src_ptr + src_stride + 3 * 32, s_256[0][3],
2294
3.55k
                                &s_256[1][3], dst + 3 * 32);
2295
2296
3.55k
          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride, s_256[1][0],
2297
3.55k
                                &s_256[0][0], dst + dst_stride);
2298
3.55k
          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride + 1 * 32, s_256[1][1],
2299
3.55k
                                &s_256[0][1], dst + dst_stride + 1 * 32);
2300
3.55k
          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride + 2 * 32, s_256[1][2],
2301
3.55k
                                &s_256[0][2], dst + dst_stride + 2 * 32);
2302
3.55k
          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride + 3 * 32, s_256[1][3],
2303
3.55k
                                &s_256[0][3], dst + dst_stride + 3 * 32);
2304
2305
3.55k
          src_ptr += 2 * src_stride;
2306
3.55k
          dst += 2 * dst_stride;
2307
3.55k
          y -= 2;
2308
3.55k
        } while (y);
2309
73
      }
2310
26.9k
    }
2311
614k
  } else if (vert_tap == 4) {
2312
    // vert_filt as 4 tap
2313
316k
    const uint8_t *src_ptr = src - src_stride;
2314
2315
316k
    y = h;
2316
2317
316k
    if (w <= 4) {
2318
151k
      prepare_half_coeffs_4tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);
2319
2320
151k
      if (w == 2) {
2321
27.4k
        __m128i s_16[4], ss_128[2];
2322
2323
27.4k
        s_16[0] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 0 * src_stride));
2324
27.4k
        s_16[1] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 1 * src_stride));
2325
27.4k
        s_16[2] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 2 * src_stride));
2326
2327
27.4k
        const __m128i src01 = _mm_unpacklo_epi16(s_16[0], s_16[1]);
2328
27.4k
        const __m128i src12 = _mm_unpacklo_epi16(s_16[1], s_16[2]);
2329
2330
27.4k
        ss_128[0] = _mm_unpacklo_epi8(src01, src12);
2331
2332
47.3k
        do {
2333
47.3k
          src_ptr += 2 * src_stride;
2334
47.3k
          const __m128i res = y_convolve_4tap_2x2_ssse3(
2335
47.3k
              src_ptr, src_stride, coeffs_128, s_16, ss_128);
2336
47.3k
          const __m128i r = sr_y_round_sse2(res);
2337
47.3k
          pack_store_2x2_sse2(r, dst, dst_stride);
2338
2339
47.3k
          ss_128[0] = ss_128[1];
2340
47.3k
          dst += 2 * dst_stride;
2341
47.3k
          y -= 2;
2342
47.3k
        } while (y);
2343
124k
      } else {
2344
124k
        __m128i s_32[4], ss_128[2];
2345
2346
124k
        assert(w == 4);
2347
2348
124k
        s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 0 * src_stride));
2349
124k
        s_32[1] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 1 * src_stride));
2350
124k
        s_32[2] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 2 * src_stride));
2351
2352
124k
        const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
2353
124k
        const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
2354
2355
124k
        ss_128[0] = _mm_unpacklo_epi8(src01, src12);
2356
2357
249k
        do {
2358
249k
          src_ptr += 2 * src_stride;
2359
249k
          const __m128i res = y_convolve_4tap_4x2_ssse3(
2360
249k
              src_ptr, src_stride, coeffs_128, s_32, ss_128);
2361
249k
          const __m128i r = sr_y_round_sse2(res);
2362
249k
          pack_store_4x2_sse2(r, dst, dst_stride);
2363
2364
249k
          ss_128[0] = ss_128[1];
2365
249k
          dst += 2 * dst_stride;
2366
249k
          y -= 2;
2367
249k
        } while (y);
2368
124k
      }
2369
165k
    } else {
2370
165k
      prepare_half_coeffs_4tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
2371
2372
165k
      if (w == 8) {
2373
107k
        __m128i s_64[4];
2374
107k
        __m256i ss_256[2];
2375
2376
107k
        s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 0 * src_stride));
2377
107k
        s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + 1 * src_stride));
2378
107k
        s_64[2] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
2379
2380
        // Load lines a and b. Line a to lower 128, line b to upper 128
2381
107k
        const __m256i src01 = _mm256_setr_m128i(s_64[0], s_64[1]);
2382
107k
        const __m256i src12 = _mm256_setr_m128i(s_64[1], s_64[2]);
2383
2384
107k
        ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
2385
2386
212k
        do {
2387
212k
          src_ptr += 2 * src_stride;
2388
212k
          const __m256i res = y_convolve_4tap_8x2_avx2(
2389
212k
              src_ptr, src_stride, coeffs_256, s_64, ss_256);
2390
212k
          sr_y_round_store_8x2_avx2(res, dst, dst_stride);
2391
2392
212k
          ss_256[0] = ss_256[1];
2393
212k
          dst += 2 * dst_stride;
2394
212k
          y -= 2;
2395
212k
        } while (y);
2396
107k
      } else if (w == 16) {
2397
52.4k
        __m128i s_128[4];
2398
52.4k
        __m256i ss_256[4], r[2];
2399
2400
52.4k
        s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 0 * src_stride));
2401
52.4k
        s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + 1 * src_stride));
2402
52.4k
        s_128[2] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
2403
2404
        // Load lines a and b. Line a to lower 128, line b to upper 128
2405
52.4k
        const __m256i src01 = _mm256_setr_m128i(s_128[0], s_128[1]);
2406
52.4k
        const __m256i src12 = _mm256_setr_m128i(s_128[1], s_128[2]);
2407
2408
52.4k
        ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
2409
52.4k
        ss_256[2] = _mm256_unpackhi_epi8(src01, src12);
2410
2411
132k
        do {
2412
132k
          src_ptr += 2 * src_stride;
2413
132k
          y_convolve_4tap_16x2_avx2(src_ptr, src_stride, coeffs_256, s_128,
2414
132k
                                    ss_256, r);
2415
132k
          sr_y_round_store_16x2_avx2(r, dst, dst_stride);
2416
2417
132k
          ss_256[0] = ss_256[1];
2418
132k
          ss_256[2] = ss_256[3];
2419
132k
          dst += 2 * dst_stride;
2420
132k
          y -= 2;
2421
132k
        } while (y);
2422
52.4k
      } else if (w == 32) {
2423
        // AV1 standard won't have 32x4 case.
2424
        // This only favors some optimization feature which
2425
        // subsamples 32x8 to 32x4 and triggers 4-tap filter.
2426
2427
4.14k
        __m256i s_256[4], ss_256[4], tt_256[4], r[4];
2428
2429
4.14k
        s_256[0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * src_stride));
2430
4.14k
        s_256[1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * src_stride));
2431
4.14k
        s_256[2] = _mm256_loadu_si256((__m256i *)(src_ptr + 2 * src_stride));
2432
2433
4.14k
        ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
2434
4.14k
        ss_256[2] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
2435
2436
4.14k
        tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
2437
4.14k
        tt_256[2] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
2438
2439
30.9k
        do {
2440
30.9k
          src_ptr += 2 * src_stride;
2441
30.9k
          y_convolve_4tap_32x2_avx2(src_ptr, src_stride, coeffs_256, s_256,
2442
30.9k
                                    ss_256, tt_256, r);
2443
30.9k
          sr_y_round_store_32x2_avx2(r, dst, dst_stride);
2444
2445
30.9k
          ss_256[0] = ss_256[1];
2446
30.9k
          ss_256[2] = ss_256[3];
2447
2448
30.9k
          tt_256[0] = tt_256[1];
2449
30.9k
          tt_256[2] = tt_256[3];
2450
30.9k
          dst += 2 * dst_stride;
2451
30.9k
          y -= 2;
2452
30.9k
        } while (y);
2453
4.14k
      } else {
2454
1.40k
        assert(!(w % 32));
2455
2456
1.40k
        __m256i s_256[4], ss_256[4], tt_256[4], r[4];
2457
1.40k
        x = 0;
2458
3.49k
        do {
2459
3.49k
          const uint8_t *s = src_ptr + x;
2460
3.49k
          uint8_t *d = dst + x;
2461
3.49k
          s_256[0] = _mm256_loadu_si256((__m256i *)(s + 0 * src_stride));
2462
3.49k
          s_256[1] = _mm256_loadu_si256((__m256i *)(s + 1 * src_stride));
2463
3.49k
          s_256[2] = _mm256_loadu_si256((__m256i *)(s + 2 * src_stride));
2464
2465
3.49k
          ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
2466
3.49k
          ss_256[2] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
2467
2468
3.49k
          tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
2469
3.49k
          tt_256[2] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
2470
2471
3.49k
          y = h;
2472
149k
          do {
2473
149k
            s += 2 * src_stride;
2474
149k
            y_convolve_4tap_32x2_avx2(s, src_stride, coeffs_256, s_256, ss_256,
2475
149k
                                      tt_256, r);
2476
149k
            sr_y_round_store_32x2_avx2(r, d, dst_stride);
2477
2478
149k
            ss_256[0] = ss_256[1];
2479
149k
            ss_256[2] = ss_256[3];
2480
2481
149k
            tt_256[0] = tt_256[1];
2482
149k
            tt_256[2] = tt_256[3];
2483
149k
            d += 2 * dst_stride;
2484
149k
            y -= 2;
2485
149k
          } while (y);
2486
3.49k
          x += 32;
2487
3.49k
        } while (x < w);
2488
1.40k
      }
2489
165k
    }
2490
316k
  } else if (vert_tap == 6) {
2491
    // vert_filt as 6 tap
2492
281k
    const uint8_t *src_ptr = src - 2 * src_stride;
2493
2494
281k
    if (w <= 4) {
2495
86.9k
      prepare_half_coeffs_6tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);
2496
2497
86.9k
      y = h;
2498
2499
86.9k
      if (w == 2) {
2500
16.4k
        __m128i s_16[6], ss_128[3];
2501
2502
16.4k
        s_16[0] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 0 * src_stride));
2503
16.4k
        s_16[1] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 1 * src_stride));
2504
16.4k
        s_16[2] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 2 * src_stride));
2505
16.4k
        s_16[3] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 3 * src_stride));
2506
16.4k
        s_16[4] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 4 * src_stride));
2507
2508
16.4k
        const __m128i src01 = _mm_unpacklo_epi16(s_16[0], s_16[1]);
2509
16.4k
        const __m128i src12 = _mm_unpacklo_epi16(s_16[1], s_16[2]);
2510
16.4k
        const __m128i src23 = _mm_unpacklo_epi16(s_16[2], s_16[3]);
2511
16.4k
        const __m128i src34 = _mm_unpacklo_epi16(s_16[3], s_16[4]);
2512
2513
16.4k
        ss_128[0] = _mm_unpacklo_epi8(src01, src12);
2514
16.4k
        ss_128[1] = _mm_unpacklo_epi8(src23, src34);
2515
2516
65.9k
        do {
2517
65.9k
          src_ptr += 2 * src_stride;
2518
65.9k
          const __m128i res = y_convolve_6tap_2x2_ssse3(
2519
65.9k
              src_ptr, src_stride, coeffs_128, s_16, ss_128);
2520
65.9k
          const __m128i r = sr_y_round_sse2(res);
2521
65.9k
          pack_store_2x2_sse2(r, dst, dst_stride);
2522
2523
65.9k
          ss_128[0] = ss_128[1];
2524
65.9k
          ss_128[1] = ss_128[2];
2525
65.9k
          dst += 2 * dst_stride;
2526
65.9k
          y -= 2;
2527
65.9k
        } while (y);
2528
70.4k
      } else {
2529
70.4k
        __m128i s_32[6], ss_128[3];
2530
2531
70.4k
        assert(w == 4);
2532
2533
70.4k
        s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 0 * src_stride));
2534
70.4k
        s_32[1] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 1 * src_stride));
2535
70.4k
        s_32[2] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 2 * src_stride));
2536
70.4k
        s_32[3] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 3 * src_stride));
2537
70.4k
        s_32[4] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 4 * src_stride));
2538
2539
70.4k
        const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
2540
70.4k
        const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
2541
70.4k
        const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
2542
70.4k
        const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[4]);
2543
2544
70.4k
        ss_128[0] = _mm_unpacklo_epi8(src01, src12);
2545
70.4k
        ss_128[1] = _mm_unpacklo_epi8(src23, src34);
2546
2547
384k
        do {
2548
384k
          src_ptr += 2 * src_stride;
2549
384k
          const __m128i res = y_convolve_6tap_4x2_ssse3(
2550
384k
              src_ptr, src_stride, coeffs_128, s_32, ss_128);
2551
384k
          const __m128i r = sr_y_round_sse2(res);
2552
384k
          pack_store_4x2_sse2(r, dst, dst_stride);
2553
2554
384k
          ss_128[0] = ss_128[1];
2555
384k
          ss_128[1] = ss_128[2];
2556
384k
          dst += 2 * dst_stride;
2557
384k
          y -= 2;
2558
384k
        } while (y);
2559
70.4k
      }
2560
194k
    } else {
2561
194k
      prepare_half_coeffs_6tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
2562
2563
194k
      if (w == 8) {
2564
92.5k
        __m128i s_64[6];
2565
92.5k
        __m256i ss_256[3];
2566
2567
92.5k
        s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 0 * src_stride));
2568
92.5k
        s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + 1 * src_stride));
2569
92.5k
        s_64[2] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
2570
92.5k
        s_64[3] = _mm_loadl_epi64((__m128i *)(src_ptr + 3 * src_stride));
2571
92.5k
        s_64[4] = _mm_loadl_epi64((__m128i *)(src_ptr + 4 * src_stride));
2572
2573
        // Load lines a and b. Line a to lower 128, line b to upper 128
2574
92.5k
        const __m256i src01 = _mm256_setr_m128i(s_64[0], s_64[1]);
2575
92.5k
        const __m256i src12 = _mm256_setr_m128i(s_64[1], s_64[2]);
2576
92.5k
        const __m256i src23 = _mm256_setr_m128i(s_64[2], s_64[3]);
2577
92.5k
        const __m256i src34 = _mm256_setr_m128i(s_64[3], s_64[4]);
2578
2579
92.5k
        ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
2580
92.5k
        ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
2581
2582
92.5k
        y = h;
2583
530k
        do {
2584
530k
          src_ptr += 2 * src_stride;
2585
530k
          const __m256i res = y_convolve_6tap_8x2_avx2(
2586
530k
              src_ptr, src_stride, coeffs_256, s_64, ss_256);
2587
530k
          sr_y_round_store_8x2_avx2(res, dst, dst_stride);
2588
2589
530k
          ss_256[0] = ss_256[1];
2590
530k
          ss_256[1] = ss_256[2];
2591
530k
          dst += 2 * dst_stride;
2592
530k
          y -= 2;
2593
530k
        } while (y);
2594
102k
      } else if (w == 16) {
2595
70.0k
        __m128i s_128[6];
2596
70.0k
        __m256i ss_256[6], r[2];
2597
2598
70.0k
        s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 0 * src_stride));
2599
70.0k
        s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + 1 * src_stride));
2600
70.0k
        s_128[2] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
2601
70.0k
        s_128[3] = _mm_loadu_si128((__m128i *)(src_ptr + 3 * src_stride));
2602
70.0k
        s_128[4] = _mm_loadu_si128((__m128i *)(src_ptr + 4 * src_stride));
2603
2604
        // Load lines a and b. Line a to lower 128, line b to upper 128
2605
70.0k
        const __m256i src01 = _mm256_setr_m128i(s_128[0], s_128[1]);
2606
70.0k
        const __m256i src12 = _mm256_setr_m128i(s_128[1], s_128[2]);
2607
70.0k
        const __m256i src23 = _mm256_setr_m128i(s_128[2], s_128[3]);
2608
70.0k
        const __m256i src34 = _mm256_setr_m128i(s_128[3], s_128[4]);
2609
2610
70.0k
        ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
2611
70.0k
        ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
2612
2613
70.0k
        ss_256[3] = _mm256_unpackhi_epi8(src01, src12);
2614
70.0k
        ss_256[4] = _mm256_unpackhi_epi8(src23, src34);
2615
2616
70.0k
        y = h;
2617
509k
        do {
2618
509k
          src_ptr += 2 * src_stride;
2619
509k
          y_convolve_6tap_16x2_avx2(src_ptr, src_stride, coeffs_256, s_128,
2620
509k
                                    ss_256, r);
2621
509k
          sr_y_round_store_16x2_avx2(r, dst, dst_stride);
2622
2623
509k
          ss_256[0] = ss_256[1];
2624
509k
          ss_256[1] = ss_256[2];
2625
2626
509k
          ss_256[3] = ss_256[4];
2627
509k
          ss_256[4] = ss_256[5];
2628
509k
          dst += 2 * dst_stride;
2629
509k
          y -= 2;
2630
509k
        } while (y);
2631
70.0k
      } else {
2632
32.2k
        __m256i s_256[6], ss_256[6], tt_256[6], r[4];
2633
2634
32.2k
        assert(!(w % 32));
2635
2636
32.2k
        x = 0;
2637
38.8k
        do {
2638
38.8k
          const uint8_t *s = src_ptr + x;
2639
38.8k
          uint8_t *d = dst + x;
2640
2641
38.8k
          s_256[0] = _mm256_loadu_si256((__m256i *)(s + 0 * src_stride));
2642
38.8k
          s_256[1] = _mm256_loadu_si256((__m256i *)(s + 1 * src_stride));
2643
38.8k
          s_256[2] = _mm256_loadu_si256((__m256i *)(s + 2 * src_stride));
2644
38.8k
          s_256[3] = _mm256_loadu_si256((__m256i *)(s + 3 * src_stride));
2645
38.8k
          s_256[4] = _mm256_loadu_si256((__m256i *)(s + 4 * src_stride));
2646
2647
38.8k
          ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
2648
38.8k
          ss_256[1] = _mm256_unpacklo_epi8(s_256[2], s_256[3]);
2649
38.8k
          ss_256[3] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
2650
38.8k
          ss_256[4] = _mm256_unpackhi_epi8(s_256[2], s_256[3]);
2651
2652
38.8k
          tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
2653
38.8k
          tt_256[1] = _mm256_unpacklo_epi8(s_256[3], s_256[4]);
2654
38.8k
          tt_256[3] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
2655
38.8k
          tt_256[4] = _mm256_unpackhi_epi8(s_256[3], s_256[4]);
2656
2657
38.8k
          y = h;
2658
673k
          do {
2659
673k
            s += 2 * src_stride;
2660
673k
            y_convolve_6tap_32x2_avx2(s, src_stride, coeffs_256, s_256, ss_256,
2661
673k
                                      tt_256, r);
2662
673k
            sr_y_round_store_32x2_avx2(r, d, dst_stride);
2663
2664
673k
            ss_256[0] = ss_256[1];
2665
673k
            ss_256[1] = ss_256[2];
2666
673k
            ss_256[3] = ss_256[4];
2667
673k
            ss_256[4] = ss_256[5];
2668
2669
673k
            tt_256[0] = tt_256[1];
2670
673k
            tt_256[1] = tt_256[2];
2671
673k
            tt_256[3] = tt_256[4];
2672
673k
            tt_256[4] = tt_256[5];
2673
673k
            d += 2 * dst_stride;
2674
673k
            y -= 2;
2675
673k
          } while (y);
2676
2677
38.8k
          x += 32;
2678
38.8k
        } while (x < w);
2679
32.2k
      }
2680
194k
    }
2681
281k
  } else if (vert_tap == 8) {
2682
    // vert_filt as 8 tap
2683
15.4k
    const uint8_t *src_ptr = src - 3 * src_stride;
2684
2685
15.4k
    if (w <= 4) {
2686
6.74k
      prepare_half_coeffs_8tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);
2687
2688
6.74k
      y = h;
2689
2690
6.74k
      if (w == 2) {
2691
1.59k
        __m128i s_16[8], ss_128[4];
2692
2693
1.59k
        s_16[0] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 0 * src_stride));
2694
1.59k
        s_16[1] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 1 * src_stride));
2695
1.59k
        s_16[2] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 2 * src_stride));
2696
1.59k
        s_16[3] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 3 * src_stride));
2697
1.59k
        s_16[4] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 4 * src_stride));
2698
1.59k
        s_16[5] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 5 * src_stride));
2699
1.59k
        s_16[6] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 6 * src_stride));
2700
2701
1.59k
        const __m128i src01 = _mm_unpacklo_epi16(s_16[0], s_16[1]);
2702
1.59k
        const __m128i src12 = _mm_unpacklo_epi16(s_16[1], s_16[2]);
2703
1.59k
        const __m128i src23 = _mm_unpacklo_epi16(s_16[2], s_16[3]);
2704
1.59k
        const __m128i src34 = _mm_unpacklo_epi16(s_16[3], s_16[4]);
2705
1.59k
        const __m128i src45 = _mm_unpacklo_epi16(s_16[4], s_16[5]);
2706
1.59k
        const __m128i src56 = _mm_unpacklo_epi16(s_16[5], s_16[6]);
2707
2708
1.59k
        ss_128[0] = _mm_unpacklo_epi8(src01, src12);
2709
1.59k
        ss_128[1] = _mm_unpacklo_epi8(src23, src34);
2710
1.59k
        ss_128[2] = _mm_unpacklo_epi8(src45, src56);
2711
2712
6.39k
        do {
2713
6.39k
          const __m128i res = y_convolve_8tap_2x2_ssse3(
2714
6.39k
              src_ptr, src_stride, coeffs_128, s_16, ss_128);
2715
6.39k
          const __m128i r = sr_y_round_sse2(res);
2716
6.39k
          pack_store_2x2_sse2(r, dst, dst_stride);
2717
6.39k
          ss_128[0] = ss_128[1];
2718
6.39k
          ss_128[1] = ss_128[2];
2719
6.39k
          ss_128[2] = ss_128[3];
2720
6.39k
          src_ptr += 2 * src_stride;
2721
6.39k
          dst += 2 * dst_stride;
2722
6.39k
          y -= 2;
2723
6.39k
        } while (y);
2724
5.14k
      } else {
2725
5.14k
        __m128i s_32[8], ss_128[4];
2726
2727
5.14k
        assert(w == 4);
2728
2729
5.14k
        s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 0 * src_stride));
2730
5.14k
        s_32[1] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 1 * src_stride));
2731
5.14k
        s_32[2] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 2 * src_stride));
2732
5.14k
        s_32[3] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 3 * src_stride));
2733
5.14k
        s_32[4] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 4 * src_stride));
2734
5.14k
        s_32[5] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 5 * src_stride));
2735
5.14k
        s_32[6] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 6 * src_stride));
2736
2737
5.14k
        const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
2738
5.14k
        const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
2739
5.14k
        const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
2740
5.14k
        const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[4]);
2741
5.14k
        const __m128i src45 = _mm_unpacklo_epi32(s_32[4], s_32[5]);
2742
5.14k
        const __m128i src56 = _mm_unpacklo_epi32(s_32[5], s_32[6]);
2743
2744
5.14k
        ss_128[0] = _mm_unpacklo_epi8(src01, src12);
2745
5.14k
        ss_128[1] = _mm_unpacklo_epi8(src23, src34);
2746
5.14k
        ss_128[2] = _mm_unpacklo_epi8(src45, src56);
2747
2748
26.2k
        do {
2749
26.2k
          const __m128i res = y_convolve_8tap_4x2_ssse3(
2750
26.2k
              src_ptr, src_stride, coeffs_128, s_32, ss_128);
2751
26.2k
          const __m128i r = sr_y_round_sse2(res);
2752
26.2k
          pack_store_4x2_sse2(r, dst, dst_stride);
2753
26.2k
          ss_128[0] = ss_128[1];
2754
26.2k
          ss_128[1] = ss_128[2];
2755
26.2k
          ss_128[2] = ss_128[3];
2756
26.2k
          src_ptr += 2 * src_stride;
2757
26.2k
          dst += 2 * dst_stride;
2758
26.2k
          y -= 2;
2759
26.2k
        } while (y);
2760
5.14k
      }
2761
8.69k
    } else {
2762
8.69k
      prepare_half_coeffs_8tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
2763
2764
8.69k
      if (w == 8) {
2765
4.39k
        __m128i s_64[8];
2766
4.39k
        __m256i ss_256[4];
2767
2768
4.39k
        s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 0 * src_stride));
2769
4.39k
        s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + 1 * src_stride));
2770
4.39k
        s_64[2] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
2771
4.39k
        s_64[3] = _mm_loadl_epi64((__m128i *)(src_ptr + 3 * src_stride));
2772
4.39k
        s_64[4] = _mm_loadl_epi64((__m128i *)(src_ptr + 4 * src_stride));
2773
4.39k
        s_64[5] = _mm_loadl_epi64((__m128i *)(src_ptr + 5 * src_stride));
2774
4.39k
        s_64[6] = _mm_loadl_epi64((__m128i *)(src_ptr + 6 * src_stride));
2775
2776
        // Load lines a and b. Line a to lower 128, line b to upper 128
2777
4.39k
        const __m256i src01 = _mm256_setr_m128i(s_64[0], s_64[1]);
2778
4.39k
        const __m256i src12 = _mm256_setr_m128i(s_64[1], s_64[2]);
2779
4.39k
        const __m256i src23 = _mm256_setr_m128i(s_64[2], s_64[3]);
2780
4.39k
        const __m256i src34 = _mm256_setr_m128i(s_64[3], s_64[4]);
2781
4.39k
        const __m256i src45 = _mm256_setr_m128i(s_64[4], s_64[5]);
2782
4.39k
        const __m256i src56 = _mm256_setr_m128i(s_64[5], s_64[6]);
2783
2784
4.39k
        ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
2785
4.39k
        ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
2786
4.39k
        ss_256[2] = _mm256_unpacklo_epi8(src45, src56);
2787
2788
4.39k
        y = h;
2789
24.8k
        do {
2790
24.8k
          const __m256i res = y_convolve_8tap_8x2_avx2(
2791
24.8k
              src_ptr, src_stride, coeffs_256, s_64, ss_256);
2792
24.8k
          sr_y_round_store_8x2_avx2(res, dst, dst_stride);
2793
24.8k
          ss_256[0] = ss_256[1];
2794
24.8k
          ss_256[1] = ss_256[2];
2795
24.8k
          ss_256[2] = ss_256[3];
2796
24.8k
          src_ptr += 2 * src_stride;
2797
24.8k
          dst += 2 * dst_stride;
2798
24.8k
          y -= 2;
2799
24.8k
        } while (y);
2800
4.39k
      } else if (w == 16) {
2801
2.77k
        __m128i s_128[8];
2802
2.77k
        __m256i ss_256[8], r[2];
2803
2804
2.77k
        s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 0 * src_stride));
2805
2.77k
        s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + 1 * src_stride));
2806
2.77k
        s_128[2] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
2807
2.77k
        s_128[3] = _mm_loadu_si128((__m128i *)(src_ptr + 3 * src_stride));
2808
2.77k
        s_128[4] = _mm_loadu_si128((__m128i *)(src_ptr + 4 * src_stride));
2809
2.77k
        s_128[5] = _mm_loadu_si128((__m128i *)(src_ptr + 5 * src_stride));
2810
2.77k
        s_128[6] = _mm_loadu_si128((__m128i *)(src_ptr + 6 * src_stride));
2811
2812
        // Load lines a and b. Line a to lower 128, line b to upper 128
2813
2.77k
        const __m256i src01 = _mm256_setr_m128i(s_128[0], s_128[1]);
2814
2.77k
        const __m256i src12 = _mm256_setr_m128i(s_128[1], s_128[2]);
2815
2.77k
        const __m256i src23 = _mm256_setr_m128i(s_128[2], s_128[3]);
2816
2.77k
        const __m256i src34 = _mm256_setr_m128i(s_128[3], s_128[4]);
2817
2.77k
        const __m256i src45 = _mm256_setr_m128i(s_128[4], s_128[5]);
2818
2.77k
        const __m256i src56 = _mm256_setr_m128i(s_128[5], s_128[6]);
2819
2820
2.77k
        ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
2821
2.77k
        ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
2822
2.77k
        ss_256[2] = _mm256_unpacklo_epi8(src45, src56);
2823
2824
2.77k
        ss_256[4] = _mm256_unpackhi_epi8(src01, src12);
2825
2.77k
        ss_256[5] = _mm256_unpackhi_epi8(src23, src34);
2826
2.77k
        ss_256[6] = _mm256_unpackhi_epi8(src45, src56);
2827
2828
2.77k
        y = h;
2829
19.8k
        do {
2830
19.8k
          y_convolve_8tap_16x2_avx2(src_ptr, src_stride, coeffs_256, s_128,
2831
19.8k
                                    ss_256, r);
2832
19.8k
          sr_y_round_store_16x2_avx2(r, dst, dst_stride);
2833
2834
19.8k
          ss_256[0] = ss_256[1];
2835
19.8k
          ss_256[1] = ss_256[2];
2836
19.8k
          ss_256[2] = ss_256[3];
2837
2838
19.8k
          ss_256[4] = ss_256[5];
2839
19.8k
          ss_256[5] = ss_256[6];
2840
19.8k
          ss_256[6] = ss_256[7];
2841
19.8k
          src_ptr += 2 * src_stride;
2842
19.8k
          dst += 2 * dst_stride;
2843
19.8k
          y -= 2;
2844
19.8k
        } while (y);
2845
2.77k
      } else {
2846
1.53k
        __m256i s_256[8], ss_256[8], tt_256[8], r[4];
2847
2848
1.53k
        assert(!(w % 32));
2849
2850
1.53k
        x = 0;
2851
2.16k
        do {
2852
2.16k
          const uint8_t *s = src_ptr + x;
2853
2.16k
          uint8_t *d = dst + x;
2854
2855
2.16k
          s_256[0] = _mm256_loadu_si256((__m256i *)(s + 0 * src_stride));
2856
2.16k
          s_256[1] = _mm256_loadu_si256((__m256i *)(s + 1 * src_stride));
2857
2.16k
          s_256[2] = _mm256_loadu_si256((__m256i *)(s + 2 * src_stride));
2858
2.16k
          s_256[3] = _mm256_loadu_si256((__m256i *)(s + 3 * src_stride));
2859
2.16k
          s_256[4] = _mm256_loadu_si256((__m256i *)(s + 4 * src_stride));
2860
2.16k
          s_256[5] = _mm256_loadu_si256((__m256i *)(s + 5 * src_stride));
2861
2.16k
          s_256[6] = _mm256_loadu_si256((__m256i *)(s + 6 * src_stride));
2862
2863
2.16k
          ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
2864
2.16k
          ss_256[1] = _mm256_unpacklo_epi8(s_256[2], s_256[3]);
2865
2.16k
          ss_256[2] = _mm256_unpacklo_epi8(s_256[4], s_256[5]);
2866
2.16k
          ss_256[4] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
2867
2.16k
          ss_256[5] = _mm256_unpackhi_epi8(s_256[2], s_256[3]);
2868
2.16k
          ss_256[6] = _mm256_unpackhi_epi8(s_256[4], s_256[5]);
2869
2870
2.16k
          tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
2871
2.16k
          tt_256[1] = _mm256_unpacklo_epi8(s_256[3], s_256[4]);
2872
2.16k
          tt_256[2] = _mm256_unpacklo_epi8(s_256[5], s_256[6]);
2873
2.16k
          tt_256[4] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
2874
2.16k
          tt_256[5] = _mm256_unpackhi_epi8(s_256[3], s_256[4]);
2875
2.16k
          tt_256[6] = _mm256_unpackhi_epi8(s_256[5], s_256[6]);
2876
2877
2.16k
          y = h;
2878
44.5k
          do {
2879
44.5k
            y_convolve_8tap_32x2_avx2(s, src_stride, coeffs_256, s_256, ss_256,
2880
44.5k
                                      tt_256, r);
2881
44.5k
            sr_y_round_store_32x2_avx2(r, d, dst_stride);
2882
2883
44.5k
            ss_256[0] = ss_256[1];
2884
44.5k
            ss_256[1] = ss_256[2];
2885
44.5k
            ss_256[2] = ss_256[3];
2886
44.5k
            ss_256[4] = ss_256[5];
2887
44.5k
            ss_256[5] = ss_256[6];
2888
44.5k
            ss_256[6] = ss_256[7];
2889
2890
44.5k
            tt_256[0] = tt_256[1];
2891
44.5k
            tt_256[1] = tt_256[2];
2892
44.5k
            tt_256[2] = tt_256[3];
2893
44.5k
            tt_256[4] = tt_256[5];
2894
44.5k
            tt_256[5] = tt_256[6];
2895
44.5k
            tt_256[6] = tt_256[7];
2896
44.5k
            s += 2 * src_stride;
2897
44.5k
            d += 2 * dst_stride;
2898
44.5k
            y -= 2;
2899
44.5k
          } while (y);
2900
2901
2.16k
          x += 32;
2902
2.16k
        } while (x < w);
2903
1.53k
      }
2904
8.69k
    }
2905
15.4k
  }
2906
655k
}
Unexecuted instantiation: convolve_2d_avx2.c:av1_convolve_y_sr_specialized_avx2
convolve_avx2.c:av1_convolve_y_sr_specialized_avx2
Line
Count
Source
2008
655k
    const int32_t subpel_y_q4) {
2009
655k
  int32_t x, y;
2010
655k
  __m128i coeffs_128[4];
2011
655k
  __m256i coeffs_256[4];
2012
2013
655k
  int vert_tap = get_filter_tap(filter_params_y, subpel_y_q4);
2014
2015
655k
  if (vert_tap == 2) {
2016
    // vert_filt as 2 tap
2017
40.9k
    const uint8_t *src_ptr = src;
2018
2019
40.9k
    y = h;
2020
2021
40.9k
    if (subpel_y_q4 != 8) {
2022
13.9k
      if (w <= 8) {
2023
10.5k
        prepare_half_coeffs_2tap_ssse3(filter_params_y, subpel_y_q4,
2024
10.5k
                                       coeffs_128);
2025
2026
10.5k
        if (w == 2) {
2027
1.87k
          __m128i s_16[2];
2028
2029
1.87k
          s_16[0] = _mm_cvtsi32_si128(*(int16_t *)src_ptr);
2030
2031
3.45k
          do {
2032
3.45k
            const __m128i res = y_convolve_2tap_2x2_ssse3(src_ptr, src_stride,
2033
3.45k
                                                          coeffs_128, s_16);
2034
3.45k
            const __m128i r = sr_y_round_sse2(res);
2035
3.45k
            pack_store_2x2_sse2(r, dst, dst_stride);
2036
3.45k
            src_ptr += 2 * src_stride;
2037
3.45k
            dst += 2 * dst_stride;
2038
3.45k
            y -= 2;
2039
3.45k
          } while (y);
2040
8.64k
        } else if (w == 4) {
2041
4.81k
          __m128i s_32[2];
2042
2043
4.81k
          s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr));
2044
2045
14.2k
          do {
2046
14.2k
            const __m128i res = y_convolve_2tap_4x2_ssse3(src_ptr, src_stride,
2047
14.2k
                                                          coeffs_128, s_32);
2048
14.2k
            const __m128i r = sr_y_round_sse2(res);
2049
14.2k
            pack_store_4x2_sse2(r, dst, dst_stride);
2050
14.2k
            src_ptr += 2 * src_stride;
2051
14.2k
            dst += 2 * dst_stride;
2052
14.2k
            y -= 2;
2053
14.2k
          } while (y);
2054
4.81k
        } else {
2055
3.83k
          __m128i s_64[2], s_128[2];
2056
2057
3.83k
          assert(w == 8);
2058
2059
3.83k
          s_64[0] = _mm_loadl_epi64((__m128i *)src_ptr);
2060
2061
12.7k
          do {
2062
            // Note: Faster than binding to AVX2 registers.
2063
12.7k
            s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + src_stride));
2064
12.7k
            s_128[0] = _mm_unpacklo_epi64(s_64[0], s_64[1]);
2065
12.7k
            s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
2066
12.7k
            s_128[1] = _mm_unpacklo_epi64(s_64[1], s_64[0]);
2067
12.7k
            const __m128i ss0 = _mm_unpacklo_epi8(s_128[0], s_128[1]);
2068
12.7k
            const __m128i ss1 = _mm_unpackhi_epi8(s_128[0], s_128[1]);
2069
12.7k
            const __m128i res0 = convolve_2tap_ssse3(&ss0, coeffs_128);
2070
12.7k
            const __m128i res1 = convolve_2tap_ssse3(&ss1, coeffs_128);
2071
12.7k
            const __m128i r0 = sr_y_round_sse2(res0);
2072
12.7k
            const __m128i r1 = sr_y_round_sse2(res1);
2073
12.7k
            const __m128i d = _mm_packus_epi16(r0, r1);
2074
12.7k
            _mm_storel_epi64((__m128i *)dst, d);
2075
12.7k
            _mm_storeh_epi64((__m128i *)(dst + dst_stride), d);
2076
12.7k
            src_ptr += 2 * src_stride;
2077
12.7k
            dst += 2 * dst_stride;
2078
12.7k
            y -= 2;
2079
12.7k
          } while (y);
2080
3.83k
        }
2081
10.5k
      } else {
2082
3.44k
        prepare_half_coeffs_2tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
2083
2084
3.44k
        if (w == 16) {
2085
1.98k
          __m128i s_128[2];
2086
2087
1.98k
          s_128[0] = _mm_loadu_si128((__m128i *)src_ptr);
2088
2089
13.9k
          do {
2090
13.9k
            __m256i r[2];
2091
2092
13.9k
            y_convolve_2tap_16x2_avx2(src_ptr, src_stride, coeffs_256, s_128,
2093
13.9k
                                      r);
2094
13.9k
            sr_y_round_store_16x2_avx2(r, dst, dst_stride);
2095
13.9k
            src_ptr += 2 * src_stride;
2096
13.9k
            dst += 2 * dst_stride;
2097
13.9k
            y -= 2;
2098
13.9k
          } while (y);
2099
1.98k
        } else if (w == 32) {
2100
907
          __m256i s_256[2];
2101
2102
907
          s_256[0] = _mm256_loadu_si256((__m256i *)src_ptr);
2103
2104
12.8k
          do {
2105
12.8k
            sr_y_2tap_32_avx2(src_ptr + src_stride, coeffs_256, s_256[0],
2106
12.8k
                              &s_256[1], dst);
2107
12.8k
            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride, coeffs_256, s_256[1],
2108
12.8k
                              &s_256[0], dst + dst_stride);
2109
12.8k
            src_ptr += 2 * src_stride;
2110
12.8k
            dst += 2 * dst_stride;
2111
12.8k
            y -= 2;
2112
12.8k
          } while (y);
2113
907
        } else if (w == 64) {
2114
433
          __m256i s_256[2][2];
2115
2116
433
          s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
2117
433
          s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));
2118
2119
11.1k
          do {
2120
11.1k
            sr_y_2tap_32_avx2(src_ptr + src_stride, coeffs_256, s_256[0][0],
2121
11.1k
                              &s_256[1][0], dst);
2122
11.1k
            sr_y_2tap_32_avx2(src_ptr + src_stride + 32, coeffs_256,
2123
11.1k
                              s_256[0][1], &s_256[1][1], dst + 32);
2124
11.1k
            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride, coeffs_256, s_256[1][0],
2125
11.1k
                              &s_256[0][0], dst + dst_stride);
2126
11.1k
            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride + 32, coeffs_256,
2127
11.1k
                              s_256[1][1], &s_256[0][1], dst + dst_stride + 32);
2128
2129
11.1k
            src_ptr += 2 * src_stride;
2130
11.1k
            dst += 2 * dst_stride;
2131
11.1k
            y -= 2;
2132
11.1k
          } while (y);
2133
433
        } else {
2134
119
          __m256i s_256[2][4];
2135
2136
119
          assert(w == 128);
2137
2138
119
          s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
2139
119
          s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));
2140
119
          s_256[0][2] = _mm256_loadu_si256((__m256i *)(src_ptr + 2 * 32));
2141
119
          s_256[0][3] = _mm256_loadu_si256((__m256i *)(src_ptr + 3 * 32));
2142
2143
5.69k
          do {
2144
5.69k
            sr_y_2tap_32_avx2(src_ptr + src_stride, coeffs_256, s_256[0][0],
2145
5.69k
                              &s_256[1][0], dst);
2146
5.69k
            sr_y_2tap_32_avx2(src_ptr + src_stride + 1 * 32, coeffs_256,
2147
5.69k
                              s_256[0][1], &s_256[1][1], dst + 1 * 32);
2148
5.69k
            sr_y_2tap_32_avx2(src_ptr + src_stride + 2 * 32, coeffs_256,
2149
5.69k
                              s_256[0][2], &s_256[1][2], dst + 2 * 32);
2150
5.69k
            sr_y_2tap_32_avx2(src_ptr + src_stride + 3 * 32, coeffs_256,
2151
5.69k
                              s_256[0][3], &s_256[1][3], dst + 3 * 32);
2152
2153
5.69k
            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride, coeffs_256, s_256[1][0],
2154
5.69k
                              &s_256[0][0], dst + dst_stride);
2155
5.69k
            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride + 1 * 32, coeffs_256,
2156
5.69k
                              s_256[1][1], &s_256[0][1],
2157
5.69k
                              dst + dst_stride + 1 * 32);
2158
5.69k
            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride + 2 * 32, coeffs_256,
2159
5.69k
                              s_256[1][2], &s_256[0][2],
2160
5.69k
                              dst + dst_stride + 2 * 32);
2161
5.69k
            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride + 3 * 32, coeffs_256,
2162
5.69k
                              s_256[1][3], &s_256[0][3],
2163
5.69k
                              dst + dst_stride + 3 * 32);
2164
2165
5.69k
            src_ptr += 2 * src_stride;
2166
5.69k
            dst += 2 * dst_stride;
2167
5.69k
            y -= 2;
2168
5.69k
          } while (y);
2169
119
        }
2170
3.44k
      }
2171
26.9k
    } else {
2172
      // average to get half pel
2173
26.9k
      if (w <= 8) {
2174
23.5k
        if (w == 2) {
2175
5.16k
          __m128i s_16[2];
2176
2177
5.16k
          s_16[0] = _mm_cvtsi32_si128(*(int16_t *)src_ptr);
2178
2179
10.5k
          do {
2180
10.5k
            s_16[1] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + src_stride));
2181
10.5k
            const __m128i d0 = _mm_avg_epu8(s_16[0], s_16[1]);
2182
10.5k
            *(int16_t *)dst = (int16_t)_mm_cvtsi128_si32(d0);
2183
10.5k
            s_16[0] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 2 * src_stride));
2184
10.5k
            const __m128i d1 = _mm_avg_epu8(s_16[1], s_16[0]);
2185
10.5k
            *(int16_t *)(dst + dst_stride) = (int16_t)_mm_cvtsi128_si32(d1);
2186
10.5k
            src_ptr += 2 * src_stride;
2187
10.5k
            dst += 2 * dst_stride;
2188
10.5k
            y -= 2;
2189
10.5k
          } while (y);
2190
18.3k
        } else if (w == 4) {
2191
11.3k
          __m128i s_32[2];
2192
2193
11.3k
          s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr));
2194
2195
31.4k
          do {
2196
31.4k
            s_32[1] = _mm_cvtsi32_si128(loadu_int32(src_ptr + src_stride));
2197
31.4k
            const __m128i d0 = _mm_avg_epu8(s_32[0], s_32[1]);
2198
31.4k
            xx_storel_32(dst, d0);
2199
31.4k
            s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 2 * src_stride));
2200
31.4k
            const __m128i d1 = _mm_avg_epu8(s_32[1], s_32[0]);
2201
31.4k
            xx_storel_32(dst + dst_stride, d1);
2202
31.4k
            src_ptr += 2 * src_stride;
2203
31.4k
            dst += 2 * dst_stride;
2204
31.4k
            y -= 2;
2205
31.4k
          } while (y);
2206
11.3k
        } else {
2207
7.00k
          __m128i s_64[2];
2208
2209
7.00k
          assert(w == 8);
2210
2211
7.00k
          s_64[0] = _mm_loadl_epi64((__m128i *)src_ptr);
2212
2213
22.4k
          do {
2214
            // Note: Faster than binding to AVX2 registers.
2215
22.4k
            s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + src_stride));
2216
22.4k
            const __m128i d0 = _mm_avg_epu8(s_64[0], s_64[1]);
2217
22.4k
            _mm_storel_epi64((__m128i *)dst, d0);
2218
22.4k
            s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
2219
22.4k
            const __m128i d1 = _mm_avg_epu8(s_64[1], s_64[0]);
2220
22.4k
            _mm_storel_epi64((__m128i *)(dst + dst_stride), d1);
2221
22.4k
            src_ptr += 2 * src_stride;
2222
22.4k
            dst += 2 * dst_stride;
2223
22.4k
            y -= 2;
2224
22.4k
          } while (y);
2225
7.00k
        }
2226
23.5k
      } else if (w == 16) {
2227
2.36k
        __m128i s_128[2];
2228
2229
2.36k
        s_128[0] = _mm_loadu_si128((__m128i *)src_ptr);
2230
2231
12.9k
        do {
2232
12.9k
          s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + src_stride));
2233
12.9k
          const __m128i d0 = _mm_avg_epu8(s_128[0], s_128[1]);
2234
12.9k
          _mm_storeu_si128((__m128i *)dst, d0);
2235
12.9k
          s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
2236
12.9k
          const __m128i d1 = _mm_avg_epu8(s_128[1], s_128[0]);
2237
12.9k
          _mm_storeu_si128((__m128i *)(dst + dst_stride), d1);
2238
12.9k
          src_ptr += 2 * src_stride;
2239
12.9k
          dst += 2 * dst_stride;
2240
12.9k
          y -= 2;
2241
12.9k
        } while (y);
2242
2.36k
      } else if (w == 32) {
2243
691
        __m256i s_256[2];
2244
2245
691
        s_256[0] = _mm256_loadu_si256((__m256i *)src_ptr);
2246
2247
9.43k
        do {
2248
9.43k
          sr_y_2tap_32_avg_avx2(src_ptr + src_stride, s_256[0], &s_256[1], dst);
2249
9.43k
          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride, s_256[1], &s_256[0],
2250
9.43k
                                dst + dst_stride);
2251
9.43k
          src_ptr += 2 * src_stride;
2252
9.43k
          dst += 2 * dst_stride;
2253
9.43k
          y -= 2;
2254
9.43k
        } while (y);
2255
691
      } else if (w == 64) {
2256
324
        __m256i s_256[2][2];
2257
2258
324
        s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
2259
324
        s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));
2260
2261
8.94k
        do {
2262
8.94k
          sr_y_2tap_32_avg_avx2(src_ptr + src_stride, s_256[0][0], &s_256[1][0],
2263
8.94k
                                dst);
2264
8.94k
          sr_y_2tap_32_avg_avx2(src_ptr + src_stride + 32, s_256[0][1],
2265
8.94k
                                &s_256[1][1], dst + 32);
2266
2267
8.94k
          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride, s_256[1][0],
2268
8.94k
                                &s_256[0][0], dst + dst_stride);
2269
8.94k
          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride + 32, s_256[1][1],
2270
8.94k
                                &s_256[0][1], dst + dst_stride + 32);
2271
2272
8.94k
          src_ptr += 2 * src_stride;
2273
8.94k
          dst += 2 * dst_stride;
2274
8.94k
          y -= 2;
2275
8.94k
        } while (y);
2276
324
      } else {
2277
73
        __m256i s_256[2][4];
2278
2279
73
        assert(w == 128);
2280
2281
73
        s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
2282
73
        s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));
2283
73
        s_256[0][2] = _mm256_loadu_si256((__m256i *)(src_ptr + 2 * 32));
2284
73
        s_256[0][3] = _mm256_loadu_si256((__m256i *)(src_ptr + 3 * 32));
2285
2286
3.55k
        do {
2287
3.55k
          sr_y_2tap_32_avg_avx2(src_ptr + src_stride, s_256[0][0], &s_256[1][0],
2288
3.55k
                                dst);
2289
3.55k
          sr_y_2tap_32_avg_avx2(src_ptr + src_stride + 1 * 32, s_256[0][1],
2290
3.55k
                                &s_256[1][1], dst + 1 * 32);
2291
3.55k
          sr_y_2tap_32_avg_avx2(src_ptr + src_stride + 2 * 32, s_256[0][2],
2292
3.55k
                                &s_256[1][2], dst + 2 * 32);
2293
3.55k
          sr_y_2tap_32_avg_avx2(src_ptr + src_stride + 3 * 32, s_256[0][3],
2294
3.55k
                                &s_256[1][3], dst + 3 * 32);
2295
2296
3.55k
          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride, s_256[1][0],
2297
3.55k
                                &s_256[0][0], dst + dst_stride);
2298
3.55k
          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride + 1 * 32, s_256[1][1],
2299
3.55k
                                &s_256[0][1], dst + dst_stride + 1 * 32);
2300
3.55k
          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride + 2 * 32, s_256[1][2],
2301
3.55k
                                &s_256[0][2], dst + dst_stride + 2 * 32);
2302
3.55k
          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride + 3 * 32, s_256[1][3],
2303
3.55k
                                &s_256[0][3], dst + dst_stride + 3 * 32);
2304
2305
3.55k
          src_ptr += 2 * src_stride;
2306
3.55k
          dst += 2 * dst_stride;
2307
3.55k
          y -= 2;
2308
3.55k
        } while (y);
2309
73
      }
2310
26.9k
    }
2311
614k
  } else if (vert_tap == 4) {
2312
    // vert_filt as 4 tap
2313
316k
    const uint8_t *src_ptr = src - src_stride;
2314
2315
316k
    y = h;
2316
2317
316k
    if (w <= 4) {
2318
151k
      prepare_half_coeffs_4tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);
2319
2320
151k
      if (w == 2) {
2321
27.4k
        __m128i s_16[4], ss_128[2];
2322
2323
27.4k
        s_16[0] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 0 * src_stride));
2324
27.4k
        s_16[1] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 1 * src_stride));
2325
27.4k
        s_16[2] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 2 * src_stride));
2326
2327
27.4k
        const __m128i src01 = _mm_unpacklo_epi16(s_16[0], s_16[1]);
2328
27.4k
        const __m128i src12 = _mm_unpacklo_epi16(s_16[1], s_16[2]);
2329
2330
27.4k
        ss_128[0] = _mm_unpacklo_epi8(src01, src12);
2331
2332
47.3k
        do {
2333
47.3k
          src_ptr += 2 * src_stride;
2334
47.3k
          const __m128i res = y_convolve_4tap_2x2_ssse3(
2335
47.3k
              src_ptr, src_stride, coeffs_128, s_16, ss_128);
2336
47.3k
          const __m128i r = sr_y_round_sse2(res);
2337
47.3k
          pack_store_2x2_sse2(r, dst, dst_stride);
2338
2339
47.3k
          ss_128[0] = ss_128[1];
2340
47.3k
          dst += 2 * dst_stride;
2341
47.3k
          y -= 2;
2342
47.3k
        } while (y);
2343
124k
      } else {
2344
124k
        __m128i s_32[4], ss_128[2];
2345
2346
124k
        assert(w == 4);
2347
2348
124k
        s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 0 * src_stride));
2349
124k
        s_32[1] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 1 * src_stride));
2350
124k
        s_32[2] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 2 * src_stride));
2351
2352
124k
        const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
2353
124k
        const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
2354
2355
124k
        ss_128[0] = _mm_unpacklo_epi8(src01, src12);
2356
2357
249k
        do {
2358
249k
          src_ptr += 2 * src_stride;
2359
249k
          const __m128i res = y_convolve_4tap_4x2_ssse3(
2360
249k
              src_ptr, src_stride, coeffs_128, s_32, ss_128);
2361
249k
          const __m128i r = sr_y_round_sse2(res);
2362
249k
          pack_store_4x2_sse2(r, dst, dst_stride);
2363
2364
249k
          ss_128[0] = ss_128[1];
2365
249k
          dst += 2 * dst_stride;
2366
249k
          y -= 2;
2367
249k
        } while (y);
2368
124k
      }
2369
165k
    } else {
2370
165k
      prepare_half_coeffs_4tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
2371
2372
165k
      if (w == 8) {
2373
107k
        __m128i s_64[4];
2374
107k
        __m256i ss_256[2];
2375
2376
107k
        s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 0 * src_stride));
2377
107k
        s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + 1 * src_stride));
2378
107k
        s_64[2] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
2379
2380
        // Load lines a and b. Line a to lower 128, line b to upper 128
2381
107k
        const __m256i src01 = _mm256_setr_m128i(s_64[0], s_64[1]);
2382
107k
        const __m256i src12 = _mm256_setr_m128i(s_64[1], s_64[2]);
2383
2384
107k
        ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
2385
2386
212k
        do {
2387
212k
          src_ptr += 2 * src_stride;
2388
212k
          const __m256i res = y_convolve_4tap_8x2_avx2(
2389
212k
              src_ptr, src_stride, coeffs_256, s_64, ss_256);
2390
212k
          sr_y_round_store_8x2_avx2(res, dst, dst_stride);
2391
2392
212k
          ss_256[0] = ss_256[1];
2393
212k
          dst += 2 * dst_stride;
2394
212k
          y -= 2;
2395
212k
        } while (y);
2396
107k
      } else if (w == 16) {
2397
52.4k
        __m128i s_128[4];
2398
52.4k
        __m256i ss_256[4], r[2];
2399
2400
52.4k
        s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 0 * src_stride));
2401
52.4k
        s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + 1 * src_stride));
2402
52.4k
        s_128[2] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
2403
2404
        // Load lines a and b. Line a to lower 128, line b to upper 128
2405
52.4k
        const __m256i src01 = _mm256_setr_m128i(s_128[0], s_128[1]);
2406
52.4k
        const __m256i src12 = _mm256_setr_m128i(s_128[1], s_128[2]);
2407
2408
52.4k
        ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
2409
52.4k
        ss_256[2] = _mm256_unpackhi_epi8(src01, src12);
2410
2411
132k
        do {
2412
132k
          src_ptr += 2 * src_stride;
2413
132k
          y_convolve_4tap_16x2_avx2(src_ptr, src_stride, coeffs_256, s_128,
2414
132k
                                    ss_256, r);
2415
132k
          sr_y_round_store_16x2_avx2(r, dst, dst_stride);
2416
2417
132k
          ss_256[0] = ss_256[1];
2418
132k
          ss_256[2] = ss_256[3];
2419
132k
          dst += 2 * dst_stride;
2420
132k
          y -= 2;
2421
132k
        } while (y);
2422
52.4k
      } else if (w == 32) {
2423
        // AV1 standard won't have 32x4 case.
2424
        // This only favors some optimization feature which
2425
        // subsamples 32x8 to 32x4 and triggers 4-tap filter.
2426
2427
4.14k
        __m256i s_256[4], ss_256[4], tt_256[4], r[4];
2428
2429
4.14k
        s_256[0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * src_stride));
2430
4.14k
        s_256[1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * src_stride));
2431
4.14k
        s_256[2] = _mm256_loadu_si256((__m256i *)(src_ptr + 2 * src_stride));
2432
2433
4.14k
        ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
2434
4.14k
        ss_256[2] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
2435
2436
4.14k
        tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
2437
4.14k
        tt_256[2] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
2438
2439
30.9k
        do {
2440
30.9k
          src_ptr += 2 * src_stride;
2441
30.9k
          y_convolve_4tap_32x2_avx2(src_ptr, src_stride, coeffs_256, s_256,
2442
30.9k
                                    ss_256, tt_256, r);
2443
30.9k
          sr_y_round_store_32x2_avx2(r, dst, dst_stride);
2444
2445
30.9k
          ss_256[0] = ss_256[1];
2446
30.9k
          ss_256[2] = ss_256[3];
2447
2448
30.9k
          tt_256[0] = tt_256[1];
2449
30.9k
          tt_256[2] = tt_256[3];
2450
30.9k
          dst += 2 * dst_stride;
2451
30.9k
          y -= 2;
2452
30.9k
        } while (y);
2453
4.14k
      } else {
2454
1.40k
        assert(!(w % 32));
2455
2456
1.40k
        __m256i s_256[4], ss_256[4], tt_256[4], r[4];
2457
1.40k
        x = 0;
2458
3.49k
        do {
2459
3.49k
          const uint8_t *s = src_ptr + x;
2460
3.49k
          uint8_t *d = dst + x;
2461
3.49k
          s_256[0] = _mm256_loadu_si256((__m256i *)(s + 0 * src_stride));
2462
3.49k
          s_256[1] = _mm256_loadu_si256((__m256i *)(s + 1 * src_stride));
2463
3.49k
          s_256[2] = _mm256_loadu_si256((__m256i *)(s + 2 * src_stride));
2464
2465
3.49k
          ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
2466
3.49k
          ss_256[2] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
2467
2468
3.49k
          tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
2469
3.49k
          tt_256[2] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
2470
2471
3.49k
          y = h;
2472
149k
          do {
2473
149k
            s += 2 * src_stride;
2474
149k
            y_convolve_4tap_32x2_avx2(s, src_stride, coeffs_256, s_256, ss_256,
2475
149k
                                      tt_256, r);
2476
149k
            sr_y_round_store_32x2_avx2(r, d, dst_stride);
2477
2478
149k
            ss_256[0] = ss_256[1];
2479
149k
            ss_256[2] = ss_256[3];
2480
2481
149k
            tt_256[0] = tt_256[1];
2482
149k
            tt_256[2] = tt_256[3];
2483
149k
            d += 2 * dst_stride;
2484
149k
            y -= 2;
2485
149k
          } while (y);
2486
3.49k
          x += 32;
2487
3.49k
        } while (x < w);
2488
1.40k
      }
2489
165k
    }
2490
316k
  } else if (vert_tap == 6) {
2491
    // vert_filt as 6 tap
2492
281k
    const uint8_t *src_ptr = src - 2 * src_stride;
2493
2494
281k
    if (w <= 4) {
2495
86.9k
      prepare_half_coeffs_6tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);
2496
2497
86.9k
      y = h;
2498
2499
86.9k
      if (w == 2) {
2500
16.4k
        __m128i s_16[6], ss_128[3];
2501
2502
16.4k
        s_16[0] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 0 * src_stride));
2503
16.4k
        s_16[1] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 1 * src_stride));
2504
16.4k
        s_16[2] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 2 * src_stride));
2505
16.4k
        s_16[3] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 3 * src_stride));
2506
16.4k
        s_16[4] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 4 * src_stride));
2507
2508
16.4k
        const __m128i src01 = _mm_unpacklo_epi16(s_16[0], s_16[1]);
2509
16.4k
        const __m128i src12 = _mm_unpacklo_epi16(s_16[1], s_16[2]);
2510
16.4k
        const __m128i src23 = _mm_unpacklo_epi16(s_16[2], s_16[3]);
2511
16.4k
        const __m128i src34 = _mm_unpacklo_epi16(s_16[3], s_16[4]);
2512
2513
16.4k
        ss_128[0] = _mm_unpacklo_epi8(src01, src12);
2514
16.4k
        ss_128[1] = _mm_unpacklo_epi8(src23, src34);
2515
2516
65.9k
        do {
2517
65.9k
          src_ptr += 2 * src_stride;
2518
65.9k
          const __m128i res = y_convolve_6tap_2x2_ssse3(
2519
65.9k
              src_ptr, src_stride, coeffs_128, s_16, ss_128);
2520
65.9k
          const __m128i r = sr_y_round_sse2(res);
2521
65.9k
          pack_store_2x2_sse2(r, dst, dst_stride);
2522
2523
65.9k
          ss_128[0] = ss_128[1];
2524
65.9k
          ss_128[1] = ss_128[2];
2525
65.9k
          dst += 2 * dst_stride;
2526
65.9k
          y -= 2;
2527
65.9k
        } while (y);
2528
70.4k
      } else {
2529
70.4k
        __m128i s_32[6], ss_128[3];
2530
2531
70.4k
        assert(w == 4);
2532
2533
70.4k
        s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 0 * src_stride));
2534
70.4k
        s_32[1] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 1 * src_stride));
2535
70.4k
        s_32[2] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 2 * src_stride));
2536
70.4k
        s_32[3] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 3 * src_stride));
2537
70.4k
        s_32[4] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 4 * src_stride));
2538
2539
70.4k
        const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
2540
70.4k
        const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
2541
70.4k
        const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
2542
70.4k
        const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[4]);
2543
2544
70.4k
        ss_128[0] = _mm_unpacklo_epi8(src01, src12);
2545
70.4k
        ss_128[1] = _mm_unpacklo_epi8(src23, src34);
2546
2547
384k
        do {
2548
384k
          src_ptr += 2 * src_stride;
2549
384k
          const __m128i res = y_convolve_6tap_4x2_ssse3(
2550
384k
              src_ptr, src_stride, coeffs_128, s_32, ss_128);
2551
384k
          const __m128i r = sr_y_round_sse2(res);
2552
384k
          pack_store_4x2_sse2(r, dst, dst_stride);
2553
2554
384k
          ss_128[0] = ss_128[1];
2555
384k
          ss_128[1] = ss_128[2];
2556
384k
          dst += 2 * dst_stride;
2557
384k
          y -= 2;
2558
384k
        } while (y);
2559
70.4k
      }
2560
194k
    } else {
2561
194k
      prepare_half_coeffs_6tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
2562
2563
194k
      if (w == 8) {
2564
92.5k
        __m128i s_64[6];
2565
92.5k
        __m256i ss_256[3];
2566
2567
92.5k
        s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 0 * src_stride));
2568
92.5k
        s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + 1 * src_stride));
2569
92.5k
        s_64[2] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
2570
92.5k
        s_64[3] = _mm_loadl_epi64((__m128i *)(src_ptr + 3 * src_stride));
2571
92.5k
        s_64[4] = _mm_loadl_epi64((__m128i *)(src_ptr + 4 * src_stride));
2572
2573
        // Load lines a and b. Line a to lower 128, line b to upper 128
2574
92.5k
        const __m256i src01 = _mm256_setr_m128i(s_64[0], s_64[1]);
2575
92.5k
        const __m256i src12 = _mm256_setr_m128i(s_64[1], s_64[2]);
2576
92.5k
        const __m256i src23 = _mm256_setr_m128i(s_64[2], s_64[3]);
2577
92.5k
        const __m256i src34 = _mm256_setr_m128i(s_64[3], s_64[4]);
2578
2579
92.5k
        ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
2580
92.5k
        ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
2581
2582
92.5k
        y = h;
2583
530k
        do {
2584
530k
          src_ptr += 2 * src_stride;
2585
530k
          const __m256i res = y_convolve_6tap_8x2_avx2(
2586
530k
              src_ptr, src_stride, coeffs_256, s_64, ss_256);
2587
530k
          sr_y_round_store_8x2_avx2(res, dst, dst_stride);
2588
2589
530k
          ss_256[0] = ss_256[1];
2590
530k
          ss_256[1] = ss_256[2];
2591
530k
          dst += 2 * dst_stride;
2592
530k
          y -= 2;
2593
530k
        } while (y);
2594
102k
      } else if (w == 16) {
2595
70.0k
        __m128i s_128[6];
2596
70.0k
        __m256i ss_256[6], r[2];
2597
2598
70.0k
        s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 0 * src_stride));
2599
70.0k
        s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + 1 * src_stride));
2600
70.0k
        s_128[2] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
2601
70.0k
        s_128[3] = _mm_loadu_si128((__m128i *)(src_ptr + 3 * src_stride));
2602
70.0k
        s_128[4] = _mm_loadu_si128((__m128i *)(src_ptr + 4 * src_stride));
2603
2604
        // Load lines a and b. Line a to lower 128, line b to upper 128
2605
70.0k
        const __m256i src01 = _mm256_setr_m128i(s_128[0], s_128[1]);
2606
70.0k
        const __m256i src12 = _mm256_setr_m128i(s_128[1], s_128[2]);
2607
70.0k
        const __m256i src23 = _mm256_setr_m128i(s_128[2], s_128[3]);
2608
70.0k
        const __m256i src34 = _mm256_setr_m128i(s_128[3], s_128[4]);
2609
2610
70.0k
        ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
2611
70.0k
        ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
2612
2613
70.0k
        ss_256[3] = _mm256_unpackhi_epi8(src01, src12);
2614
70.0k
        ss_256[4] = _mm256_unpackhi_epi8(src23, src34);
2615
2616
70.0k
        y = h;
2617
509k
        do {
2618
509k
          src_ptr += 2 * src_stride;
2619
509k
          y_convolve_6tap_16x2_avx2(src_ptr, src_stride, coeffs_256, s_128,
2620
509k
                                    ss_256, r);
2621
509k
          sr_y_round_store_16x2_avx2(r, dst, dst_stride);
2622
2623
509k
          ss_256[0] = ss_256[1];
2624
509k
          ss_256[1] = ss_256[2];
2625
2626
509k
          ss_256[3] = ss_256[4];
2627
509k
          ss_256[4] = ss_256[5];
2628
509k
          dst += 2 * dst_stride;
2629
509k
          y -= 2;
2630
509k
        } while (y);
2631
70.0k
      } else {
2632
32.2k
        __m256i s_256[6], ss_256[6], tt_256[6], r[4];
2633
2634
32.2k
        assert(!(w % 32));
2635
2636
32.2k
        x = 0;
2637
38.8k
        do {
2638
38.8k
          const uint8_t *s = src_ptr + x;
2639
38.8k
          uint8_t *d = dst + x;
2640
2641
38.8k
          s_256[0] = _mm256_loadu_si256((__m256i *)(s + 0 * src_stride));
2642
38.8k
          s_256[1] = _mm256_loadu_si256((__m256i *)(s + 1 * src_stride));
2643
38.8k
          s_256[2] = _mm256_loadu_si256((__m256i *)(s + 2 * src_stride));
2644
38.8k
          s_256[3] = _mm256_loadu_si256((__m256i *)(s + 3 * src_stride));
2645
38.8k
          s_256[4] = _mm256_loadu_si256((__m256i *)(s + 4 * src_stride));
2646
2647
38.8k
          ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
2648
38.8k
          ss_256[1] = _mm256_unpacklo_epi8(s_256[2], s_256[3]);
2649
38.8k
          ss_256[3] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
2650
38.8k
          ss_256[4] = _mm256_unpackhi_epi8(s_256[2], s_256[3]);
2651
2652
38.8k
          tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
2653
38.8k
          tt_256[1] = _mm256_unpacklo_epi8(s_256[3], s_256[4]);
2654
38.8k
          tt_256[3] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
2655
38.8k
          tt_256[4] = _mm256_unpackhi_epi8(s_256[3], s_256[4]);
2656
2657
38.8k
          y = h;
2658
673k
          do {
2659
673k
            s += 2 * src_stride;
2660
673k
            y_convolve_6tap_32x2_avx2(s, src_stride, coeffs_256, s_256, ss_256,
2661
673k
                                      tt_256, r);
2662
673k
            sr_y_round_store_32x2_avx2(r, d, dst_stride);
2663
2664
673k
            ss_256[0] = ss_256[1];
2665
673k
            ss_256[1] = ss_256[2];
2666
673k
            ss_256[3] = ss_256[4];
2667
673k
            ss_256[4] = ss_256[5];
2668
2669
673k
            tt_256[0] = tt_256[1];
2670
673k
            tt_256[1] = tt_256[2];
2671
673k
            tt_256[3] = tt_256[4];
2672
673k
            tt_256[4] = tt_256[5];
2673
673k
            d += 2 * dst_stride;
2674
673k
            y -= 2;
2675
673k
          } while (y);
2676
2677
38.8k
          x += 32;
2678
38.8k
        } while (x < w);
2679
32.2k
      }
2680
194k
    }
2681
281k
  } else if (vert_tap == 8) {
2682
    // vert_filt as 8 tap
2683
15.4k
    const uint8_t *src_ptr = src - 3 * src_stride;
2684
2685
15.4k
    if (w <= 4) {
2686
6.74k
      prepare_half_coeffs_8tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);
2687
2688
6.74k
      y = h;
2689
2690
6.74k
      if (w == 2) {
2691
1.59k
        __m128i s_16[8], ss_128[4];
2692
2693
1.59k
        s_16[0] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 0 * src_stride));
2694
1.59k
        s_16[1] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 1 * src_stride));
2695
1.59k
        s_16[2] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 2 * src_stride));
2696
1.59k
        s_16[3] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 3 * src_stride));
2697
1.59k
        s_16[4] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 4 * src_stride));
2698
1.59k
        s_16[5] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 5 * src_stride));
2699
1.59k
        s_16[6] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 6 * src_stride));
2700
2701
1.59k
        const __m128i src01 = _mm_unpacklo_epi16(s_16[0], s_16[1]);
2702
1.59k
        const __m128i src12 = _mm_unpacklo_epi16(s_16[1], s_16[2]);
2703
1.59k
        const __m128i src23 = _mm_unpacklo_epi16(s_16[2], s_16[3]);
2704
1.59k
        const __m128i src34 = _mm_unpacklo_epi16(s_16[3], s_16[4]);
2705
1.59k
        const __m128i src45 = _mm_unpacklo_epi16(s_16[4], s_16[5]);
2706
1.59k
        const __m128i src56 = _mm_unpacklo_epi16(s_16[5], s_16[6]);
2707
2708
1.59k
        ss_128[0] = _mm_unpacklo_epi8(src01, src12);
2709
1.59k
        ss_128[1] = _mm_unpacklo_epi8(src23, src34);
2710
1.59k
        ss_128[2] = _mm_unpacklo_epi8(src45, src56);
2711
2712
6.39k
        do {
2713
6.39k
          const __m128i res = y_convolve_8tap_2x2_ssse3(
2714
6.39k
              src_ptr, src_stride, coeffs_128, s_16, ss_128);
2715
6.39k
          const __m128i r = sr_y_round_sse2(res);
2716
6.39k
          pack_store_2x2_sse2(r, dst, dst_stride);
2717
6.39k
          ss_128[0] = ss_128[1];
2718
6.39k
          ss_128[1] = ss_128[2];
2719
6.39k
          ss_128[2] = ss_128[3];
2720
6.39k
          src_ptr += 2 * src_stride;
2721
6.39k
          dst += 2 * dst_stride;
2722
6.39k
          y -= 2;
2723
6.39k
        } while (y);
2724
5.14k
      } else {
2725
5.14k
        __m128i s_32[8], ss_128[4];
2726
2727
5.14k
        assert(w == 4);
2728
2729
5.14k
        s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 0 * src_stride));
2730
5.14k
        s_32[1] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 1 * src_stride));
2731
5.14k
        s_32[2] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 2 * src_stride));
2732
5.14k
        s_32[3] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 3 * src_stride));
2733
5.14k
        s_32[4] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 4 * src_stride));
2734
5.14k
        s_32[5] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 5 * src_stride));
2735
5.14k
        s_32[6] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 6 * src_stride));
2736
2737
5.14k
        const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
2738
5.14k
        const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
2739
5.14k
        const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
2740
5.14k
        const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[4]);
2741
5.14k
        const __m128i src45 = _mm_unpacklo_epi32(s_32[4], s_32[5]);
2742
5.14k
        const __m128i src56 = _mm_unpacklo_epi32(s_32[5], s_32[6]);
2743
2744
5.14k
        ss_128[0] = _mm_unpacklo_epi8(src01, src12);
2745
5.14k
        ss_128[1] = _mm_unpacklo_epi8(src23, src34);
2746
5.14k
        ss_128[2] = _mm_unpacklo_epi8(src45, src56);
2747
2748
26.2k
        do {
2749
26.2k
          const __m128i res = y_convolve_8tap_4x2_ssse3(
2750
26.2k
              src_ptr, src_stride, coeffs_128, s_32, ss_128);
2751
26.2k
          const __m128i r = sr_y_round_sse2(res);
2752
26.2k
          pack_store_4x2_sse2(r, dst, dst_stride);
2753
26.2k
          ss_128[0] = ss_128[1];
2754
26.2k
          ss_128[1] = ss_128[2];
2755
26.2k
          ss_128[2] = ss_128[3];
2756
26.2k
          src_ptr += 2 * src_stride;
2757
26.2k
          dst += 2 * dst_stride;
2758
26.2k
          y -= 2;
2759
26.2k
        } while (y);
2760
5.14k
      }
2761
8.69k
    } else {
2762
8.69k
      prepare_half_coeffs_8tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
2763
2764
8.69k
      if (w == 8) {
2765
4.39k
        __m128i s_64[8];
2766
4.39k
        __m256i ss_256[4];
2767
2768
4.39k
        s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 0 * src_stride));
2769
4.39k
        s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + 1 * src_stride));
2770
4.39k
        s_64[2] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
2771
4.39k
        s_64[3] = _mm_loadl_epi64((__m128i *)(src_ptr + 3 * src_stride));
2772
4.39k
        s_64[4] = _mm_loadl_epi64((__m128i *)(src_ptr + 4 * src_stride));
2773
4.39k
        s_64[5] = _mm_loadl_epi64((__m128i *)(src_ptr + 5 * src_stride));
2774
4.39k
        s_64[6] = _mm_loadl_epi64((__m128i *)(src_ptr + 6 * src_stride));
2775
2776
        // Load lines a and b. Line a to lower 128, line b to upper 128
2777
4.39k
        const __m256i src01 = _mm256_setr_m128i(s_64[0], s_64[1]);
2778
4.39k
        const __m256i src12 = _mm256_setr_m128i(s_64[1], s_64[2]);
2779
4.39k
        const __m256i src23 = _mm256_setr_m128i(s_64[2], s_64[3]);
2780
4.39k
        const __m256i src34 = _mm256_setr_m128i(s_64[3], s_64[4]);
2781
4.39k
        const __m256i src45 = _mm256_setr_m128i(s_64[4], s_64[5]);
2782
4.39k
        const __m256i src56 = _mm256_setr_m128i(s_64[5], s_64[6]);
2783
2784
4.39k
        ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
2785
4.39k
        ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
2786
4.39k
        ss_256[2] = _mm256_unpacklo_epi8(src45, src56);
2787
2788
4.39k
        y = h;
2789
24.8k
        do {
2790
24.8k
          const __m256i res = y_convolve_8tap_8x2_avx2(
2791
24.8k
              src_ptr, src_stride, coeffs_256, s_64, ss_256);
2792
24.8k
          sr_y_round_store_8x2_avx2(res, dst, dst_stride);
2793
24.8k
          ss_256[0] = ss_256[1];
2794
24.8k
          ss_256[1] = ss_256[2];
2795
24.8k
          ss_256[2] = ss_256[3];
2796
24.8k
          src_ptr += 2 * src_stride;
2797
24.8k
          dst += 2 * dst_stride;
2798
24.8k
          y -= 2;
2799
24.8k
        } while (y);
2800
4.39k
      } else if (w == 16) {
2801
2.77k
        __m128i s_128[8];
2802
2.77k
        __m256i ss_256[8], r[2];
2803
2804
2.77k
        s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 0 * src_stride));
2805
2.77k
        s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + 1 * src_stride));
2806
2.77k
        s_128[2] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
2807
2.77k
        s_128[3] = _mm_loadu_si128((__m128i *)(src_ptr + 3 * src_stride));
2808
2.77k
        s_128[4] = _mm_loadu_si128((__m128i *)(src_ptr + 4 * src_stride));
2809
2.77k
        s_128[5] = _mm_loadu_si128((__m128i *)(src_ptr + 5 * src_stride));
2810
2.77k
        s_128[6] = _mm_loadu_si128((__m128i *)(src_ptr + 6 * src_stride));
2811
2812
        // Load lines a and b. Line a to lower 128, line b to upper 128
2813
2.77k
        const __m256i src01 = _mm256_setr_m128i(s_128[0], s_128[1]);
2814
2.77k
        const __m256i src12 = _mm256_setr_m128i(s_128[1], s_128[2]);
2815
2.77k
        const __m256i src23 = _mm256_setr_m128i(s_128[2], s_128[3]);
2816
2.77k
        const __m256i src34 = _mm256_setr_m128i(s_128[3], s_128[4]);
2817
2.77k
        const __m256i src45 = _mm256_setr_m128i(s_128[4], s_128[5]);
2818
2.77k
        const __m256i src56 = _mm256_setr_m128i(s_128[5], s_128[6]);
2819
2820
2.77k
        ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
2821
2.77k
        ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
2822
2.77k
        ss_256[2] = _mm256_unpacklo_epi8(src45, src56);
2823
2824
2.77k
        ss_256[4] = _mm256_unpackhi_epi8(src01, src12);
2825
2.77k
        ss_256[5] = _mm256_unpackhi_epi8(src23, src34);
2826
2.77k
        ss_256[6] = _mm256_unpackhi_epi8(src45, src56);
2827
2828
2.77k
        y = h;
2829
19.8k
        do {
2830
19.8k
          y_convolve_8tap_16x2_avx2(src_ptr, src_stride, coeffs_256, s_128,
2831
19.8k
                                    ss_256, r);
2832
19.8k
          sr_y_round_store_16x2_avx2(r, dst, dst_stride);
2833
2834
19.8k
          ss_256[0] = ss_256[1];
2835
19.8k
          ss_256[1] = ss_256[2];
2836
19.8k
          ss_256[2] = ss_256[3];
2837
2838
19.8k
          ss_256[4] = ss_256[5];
2839
19.8k
          ss_256[5] = ss_256[6];
2840
19.8k
          ss_256[6] = ss_256[7];
2841
19.8k
          src_ptr += 2 * src_stride;
2842
19.8k
          dst += 2 * dst_stride;
2843
19.8k
          y -= 2;
2844
19.8k
        } while (y);
2845
2.77k
      } else {
2846
1.53k
        __m256i s_256[8], ss_256[8], tt_256[8], r[4];
2847
2848
1.53k
        assert(!(w % 32));
2849
2850
1.53k
        x = 0;
2851
2.16k
        do {
2852
2.16k
          const uint8_t *s = src_ptr + x;
2853
2.16k
          uint8_t *d = dst + x;
2854
2855
2.16k
          s_256[0] = _mm256_loadu_si256((__m256i *)(s + 0 * src_stride));
2856
2.16k
          s_256[1] = _mm256_loadu_si256((__m256i *)(s + 1 * src_stride));
2857
2.16k
          s_256[2] = _mm256_loadu_si256((__m256i *)(s + 2 * src_stride));
2858
2.16k
          s_256[3] = _mm256_loadu_si256((__m256i *)(s + 3 * src_stride));
2859
2.16k
          s_256[4] = _mm256_loadu_si256((__m256i *)(s + 4 * src_stride));
2860
2.16k
          s_256[5] = _mm256_loadu_si256((__m256i *)(s + 5 * src_stride));
2861
2.16k
          s_256[6] = _mm256_loadu_si256((__m256i *)(s + 6 * src_stride));
2862
2863
2.16k
          ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
2864
2.16k
          ss_256[1] = _mm256_unpacklo_epi8(s_256[2], s_256[3]);
2865
2.16k
          ss_256[2] = _mm256_unpacklo_epi8(s_256[4], s_256[5]);
2866
2.16k
          ss_256[4] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
2867
2.16k
          ss_256[5] = _mm256_unpackhi_epi8(s_256[2], s_256[3]);
2868
2.16k
          ss_256[6] = _mm256_unpackhi_epi8(s_256[4], s_256[5]);
2869
2870
2.16k
          tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
2871
2.16k
          tt_256[1] = _mm256_unpacklo_epi8(s_256[3], s_256[4]);
2872
2.16k
          tt_256[2] = _mm256_unpacklo_epi8(s_256[5], s_256[6]);
2873
2.16k
          tt_256[4] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
2874
2.16k
          tt_256[5] = _mm256_unpackhi_epi8(s_256[3], s_256[4]);
2875
2.16k
          tt_256[6] = _mm256_unpackhi_epi8(s_256[5], s_256[6]);
2876
2877
2.16k
          y = h;
2878
44.5k
          do {
2879
44.5k
            y_convolve_8tap_32x2_avx2(s, src_stride, coeffs_256, s_256, ss_256,
2880
44.5k
                                      tt_256, r);
2881
44.5k
            sr_y_round_store_32x2_avx2(r, d, dst_stride);
2882
2883
44.5k
            ss_256[0] = ss_256[1];
2884
44.5k
            ss_256[1] = ss_256[2];
2885
44.5k
            ss_256[2] = ss_256[3];
2886
44.5k
            ss_256[4] = ss_256[5];
2887
44.5k
            ss_256[5] = ss_256[6];
2888
44.5k
            ss_256[6] = ss_256[7];
2889
2890
44.5k
            tt_256[0] = tt_256[1];
2891
44.5k
            tt_256[1] = tt_256[2];
2892
44.5k
            tt_256[2] = tt_256[3];
2893
44.5k
            tt_256[4] = tt_256[5];
2894
44.5k
            tt_256[5] = tt_256[6];
2895
44.5k
            tt_256[6] = tt_256[7];
2896
44.5k
            s += 2 * src_stride;
2897
44.5k
            d += 2 * dst_stride;
2898
44.5k
            y -= 2;
2899
44.5k
          } while (y);
2900
2901
2.16k
          x += 32;
2902
2.16k
        } while (x < w);
2903
1.53k
      }
2904
8.69k
    }
2905
15.4k
  }
2906
655k
}
2907
2908
static inline void sr_x_2tap_32_avx2(const uint8_t *const src,
2909
                                     const __m256i coeffs[1],
2910
110k
                                     uint8_t *const dst) {
2911
110k
  __m256i r[2];
2912
2913
110k
  x_convolve_2tap_32_avx2(src, coeffs, r);
2914
110k
  sr_x_round_store_32_avx2(r, dst);
2915
110k
}
Unexecuted instantiation: convolve_2d_avx2.c:sr_x_2tap_32_avx2
convolve_avx2.c:sr_x_2tap_32_avx2
Line
Count
Source
2910
110k
                                     uint8_t *const dst) {
2911
110k
  __m256i r[2];
2912
2913
110k
  x_convolve_2tap_32_avx2(src, coeffs, r);
2914
110k
  sr_x_round_store_32_avx2(r, dst);
2915
110k
}
2916
2917
static inline void sr_x_6tap_32_avx2(const uint8_t *const src,
2918
                                     const __m256i coeffs[3],
2919
                                     const __m256i filt[3],
2920
1.57M
                                     uint8_t *const dst) {
2921
1.57M
  __m256i r[2];
2922
2923
1.57M
  x_convolve_6tap_32_avx2(src, coeffs, filt, r);
2924
1.57M
  sr_x_round_store_32_avx2(r, dst);
2925
1.57M
}
Unexecuted instantiation: convolve_2d_avx2.c:sr_x_6tap_32_avx2
convolve_avx2.c:sr_x_6tap_32_avx2
Line
Count
Source
2920
1.57M
                                     uint8_t *const dst) {
2921
1.57M
  __m256i r[2];
2922
2923
1.57M
  x_convolve_6tap_32_avx2(src, coeffs, filt, r);
2924
1.57M
  sr_x_round_store_32_avx2(r, dst);
2925
1.57M
}
2926
2927
static AOM_FORCE_INLINE void sr_x_8tap_32_avx2(const uint8_t *const src,
2928
                                               const __m256i coeffs[4],
2929
                                               const __m256i filt[4],
2930
179k
                                               uint8_t *const dst) {
2931
179k
  __m256i r[2];
2932
2933
179k
  x_convolve_8tap_32_avx2(src, coeffs, filt, r);
2934
179k
  sr_x_round_store_32_avx2(r, dst);
2935
179k
}
Unexecuted instantiation: convolve_2d_avx2.c:sr_x_8tap_32_avx2
convolve_avx2.c:sr_x_8tap_32_avx2
Line
Count
Source
2930
179k
                                               uint8_t *const dst) {
2931
179k
  __m256i r[2];
2932
2933
179k
  x_convolve_8tap_32_avx2(src, coeffs, filt, r);
2934
179k
  sr_x_round_store_32_avx2(r, dst);
2935
179k
}
2936
2937
// Horizontal convolution of a w x h block, dispatched first on the tap count
// returned by get_filter_tap() and then on the block width.
//
//   src, src_stride  input pixels and the distance between input rows
//   dst, dst_stride  output pixels and the distance between output rows
//   w, h             block width and height
//   filter_params_x, subpel_x_q4
//                    forwarded to get_filter_tap() and to the
//                    prepare_*coeffs* helpers to pick the kernel
//   conv_params      only examined by the asserts below
//
// The input pointer is moved back by 1, 2 or 3 pixels for the 4-, 6- and
// 8-tap paths respectively; the 2-tap path reads from `src` directly.
// A 2-tap filter with subpel_x_q4 == 8 is computed as a plain byte average of
// each pixel with its right-hand neighbour instead of a multiply/round.
//
// NOTE(review): paths for w <= 16 consume two rows per iteration and every
// do/while body runs at least once, so h is presumably positive and even for
// those widths -- nothing in this function checks that.
static AOM_FORCE_INLINE void av1_convolve_x_sr_specialized_avx2(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int32_t w, int32_t h, const InterpFilterParams *filter_params_x,
    const int32_t subpel_x_q4, ConvolveParams *conv_params) {
  __m128i taps_128[4];
  __m256i taps_256[4];
  int32_t rows_left = h;

  assert(conv_params->round_0 == 3);
  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
  (void)conv_params;

  const int num_taps = get_filter_tap(filter_params_x, subpel_x_q4);

  if (num_taps == 2) {
    // ---- 2-tap kernel ----
    const uint8_t *in = src;

    if (subpel_x_q4 == 8) {
      // Half-pel position: average each byte with the byte to its right.
      switch (w) {
        case 2:
          do {
            const __m128i px = load_u8_4x2_sse4_1(in, src_stride);
            const __m128i avg = _mm_avg_epu8(px, _mm_srli_si128(px, 1));
            // 16-bit element 0 goes to the first row, element 2 to the second.
            *(uint16_t *)dst = (uint16_t)_mm_cvtsi128_si32(avg);
            *(uint16_t *)(dst + dst_stride) = _mm_extract_epi16(avg, 2);

            in += 2 * src_stride;
            dst += 2 * dst_stride;
            rows_left -= 2;
          } while (rows_left != 0);
          break;

        case 4:
          do {
            const __m128i px = load_u8_8x2_sse2(in, src_stride);
            const __m128i avg = _mm_avg_epu8(px, _mm_srli_si128(px, 1));
            // 32-bit element 0 goes to the first row, element 2 to the second.
            xx_storel_32(dst, avg);
            *(int32_t *)(dst + dst_stride) = _mm_extract_epi32(avg, 2);

            in += 2 * src_stride;
            dst += 2 * dst_stride;
            rows_left -= 2;
          } while (rows_left != 0);
          break;

        case 8:
          do {
            const __m128i row0 = _mm_loadu_si128((__m128i *)in);
            const __m128i row1 = _mm_loadu_si128((__m128i *)(in + src_stride));
            const __m128i avg0 = _mm_avg_epu8(row0, _mm_srli_si128(row0, 1));
            const __m128i avg1 = _mm_avg_epu8(row1, _mm_srli_si128(row1, 1));
            // Only the low 8 bytes of each average are written out.
            _mm_storel_epi64((__m128i *)dst, avg0);
            _mm_storel_epi64((__m128i *)(dst + dst_stride), avg1);

            in += 2 * src_stride;
            dst += 2 * dst_stride;
            rows_left -= 2;
          } while (rows_left != 0);
          break;

        case 16:
          do {
            // Two overlapping 16-byte loads per row, one byte apart.
            const __m128i row0 = _mm_loadu_si128((__m128i *)in);
            const __m128i row0_next = _mm_loadu_si128((__m128i *)(in + 1));
            const __m128i row1 = _mm_loadu_si128((__m128i *)(in + src_stride));
            const __m128i row1_next =
                _mm_loadu_si128((__m128i *)(in + src_stride + 1));
            const __m128i avg0 = _mm_avg_epu8(row0, row0_next);
            const __m128i avg1 = _mm_avg_epu8(row1, row1_next);
            _mm_storeu_si128((__m128i *)dst, avg0);
            _mm_storeu_si128((__m128i *)(dst + dst_stride), avg1);

            in += 2 * src_stride;
            dst += 2 * dst_stride;
            rows_left -= 2;
          } while (rows_left != 0);
          break;

        case 32:
          do {
            sr_x_2tap_32_avg_avx2(in, dst);
            in += src_stride;
            dst += dst_stride;
          } while (--rows_left != 0);
          break;

        case 64:
          do {
            for (int32_t col = 0; col < 64; col += 32) {
              sr_x_2tap_32_avg_avx2(in + col, dst + col);
            }
            in += src_stride;
            dst += dst_stride;
          } while (--rows_left != 0);
          break;

        default:
          assert(w == 128);

          do {
            for (int32_t col = 0; col < 128; col += 32) {
              sr_x_2tap_32_avg_avx2(in + col, dst + col);
            }
            in += src_stride;
            dst += dst_stride;
          } while (--rows_left != 0);
          break;
      }
    } else if (w <= 8) {
      // Narrow blocks use the 128-bit coefficient set.
      prepare_half_coeffs_2tap_ssse3(filter_params_x, subpel_x_q4, taps_128);

      switch (w) {
        case 2:
          do {
            const __m128i sum =
                x_convolve_2tap_2x2_sse4_1(in, src_stride, taps_128);
            pack_store_2x2_sse2(sr_x_round_sse2(sum), dst, dst_stride);

            in += 2 * src_stride;
            dst += 2 * dst_stride;
            rows_left -= 2;
          } while (rows_left != 0);
          break;

        case 4:
          do {
            const __m128i sum =
                x_convolve_2tap_4x2_ssse3(in, src_stride, taps_128);
            pack_store_4x2_sse2(sr_x_round_sse2(sum), dst, dst_stride);

            in += 2 * src_stride;
            dst += 2 * dst_stride;
            rows_left -= 2;
          } while (rows_left != 0);
          break;

        default:
          assert(w == 8);

          do {
            __m128i sums[2];

            x_convolve_2tap_8x2_ssse3(in, src_stride, taps_128, sums);
            const __m128i rounded0 = sr_x_round_sse2(sums[0]);
            const __m128i rounded1 = sr_x_round_sse2(sums[1]);
            // Low 8 bytes hold the first row, high 8 bytes the second.
            const __m128i packed = _mm_packus_epi16(rounded0, rounded1);
            _mm_storel_epi64((__m128i *)dst, packed);
            _mm_storeh_epi64((__m128i *)(dst + dst_stride), packed);

            in += 2 * src_stride;
            dst += 2 * dst_stride;
            rows_left -= 2;
          } while (rows_left != 0);
          break;
      }
    } else {
      // Wide blocks use the 256-bit coefficient set.
      prepare_half_coeffs_2tap_avx2(filter_params_x, subpel_x_q4, taps_256);

      switch (w) {
        case 16:
          do {
            __m256i sums[2];

            x_convolve_2tap_16x2_avx2(in, src_stride, taps_256, sums);
            sr_x_round_store_16x2_avx2(sums, dst, dst_stride);

            in += 2 * src_stride;
            dst += 2 * dst_stride;
            rows_left -= 2;
          } while (rows_left != 0);
          break;

        case 32:
          do {
            sr_x_2tap_32_avx2(in, taps_256, dst);
            in += src_stride;
            dst += dst_stride;
          } while (--rows_left != 0);
          break;

        case 64:
          do {
            for (int32_t col = 0; col < 64; col += 32) {
              sr_x_2tap_32_avx2(in + col, taps_256, dst + col);
            }
            in += src_stride;
            dst += dst_stride;
          } while (--rows_left != 0);
          break;

        default:
          assert(w == 128);

          do {
            for (int32_t col = 0; col < 128; col += 32) {
              sr_x_2tap_32_avx2(in + col, taps_256, dst + col);
            }
            in += src_stride;
            dst += dst_stride;
          } while (--rows_left != 0);
          break;
      }
    }
  } else if (num_taps == 4) {
    // ---- 4-tap kernel: window starts one pixel to the left ----
    const uint8_t *in = src - 1;

    // Prepared unconditionally, although only the w == 2 / w == 4 paths
    // below read taps_128.
    prepare_half_coeffs_4tap_ssse3(filter_params_x, subpel_x_q4, taps_128);

    if (w == 2) {
      do {
        const __m128i sum = x_convolve_4tap_2x2_ssse3(in, src_stride, taps_128);
        pack_store_2x2_sse2(sr_x_round_sse2(sum), dst, dst_stride);

        in += 2 * src_stride;
        dst += 2 * dst_stride;
        rows_left -= 2;
      } while (rows_left != 0);
    } else if (w == 4) {
      do {
        const __m128i sum = x_convolve_4tap_4x2_ssse3(in, src_stride, taps_128);
        pack_store_4x2_sse2(sr_x_round_sse2(sum), dst, dst_stride);

        in += 2 * src_stride;
        dst += 2 * dst_stride;
        rows_left -= 2;
      } while (rows_left != 0);
    } else if (w == 8) {
      // TODO(chiyotsai@google.com): Reuse the old SIMD code here. Need to
      // rewrite this for better performance later.
      __m256i filt_tbl[2];

      prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, taps_256);
      filt_tbl[0] = _mm256_loadu_si256((__m256i const *)filt1_global_avx2);
      filt_tbl[1] = _mm256_loadu_si256((__m256i const *)filt2_global_avx2);

      for (int i = 0; i < h; i += 2) {
        const __m128i upper_row =
            _mm_loadu_si128((__m128i *)(&in[i * src_stride]));
        const __m128i lower_row =
            _mm_loadu_si128((__m128i *)(&in[i * src_stride + src_stride]));
        // Selector 0x20: row i in the low lane, row i + 1 in the high lane.
        const __m256i two_rows = _mm256_permute2x128_si256(
            _mm256_castsi128_si256(upper_row),
            _mm256_castsi128_si256(lower_row), 0x20);

        // The 4-tap helper is handed the coefficient array from index 1 on.
        const __m256i sum_16b =
            convolve_lowbd_x_4tap(two_rows, taps_256 + 1, filt_tbl);
        const __m256i rounded_16b = sr_x_round_avx2(sum_16b);
        const __m256i out_8b = _mm256_packus_epi16(rounded_16b, rounded_16b);

        _mm_storel_epi64((__m128i *)&dst[i * dst_stride],
                         _mm256_castsi256_si128(out_8b));
        _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride],
                         _mm256_extracti128_si256(out_8b, 1));
      }
    } else {
      assert(!(w % 16));
      // TODO(chiyotsai@google.com): Reuse the old SIMD code here. Need to
      // rewrite this for better performance later.
      __m256i filt_tbl[2];

      prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, taps_256);
      filt_tbl[0] = _mm256_loadu_si256((__m256i const *)filt1_global_avx2);
      filt_tbl[1] = _mm256_loadu_si256((__m256i const *)filt2_global_avx2);

      for (int i = 0; i < h; ++i) {
        for (int j = 0; j < w; j += 16) {
          // Low lane: bytes j .. j + 15.  High lane: bytes j + 8 .. j + 23.
          const __m256i wide =
              _mm256_loadu_si256((__m256i *)&in[(i * src_stride) + j]);
          const __m128i shifted =
              _mm_loadu_si128((__m128i *)&in[(i * src_stride) + (j + 8)]);
          const __m256i window = _mm256_inserti128_si256(wide, shifted, 1);

          const __m256i sum_16b =
              convolve_lowbd_x_4tap(window, taps_256 + 1, filt_tbl);
          const __m256i rounded_16b = sr_x_round_avx2(sum_16b);

          // Saturating pack to bytes, then gather 64-bit elements 0 and 2
          // into the low 128 bits (permute control 216 == 0xD8).
          const __m256i packed_8b =
              _mm256_packus_epi16(rounded_16b, rounded_16b);
          const __m256i ordered_8b = _mm256_permute4x64_epi64(packed_8b, 216);

          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j],
                           _mm256_castsi256_si128(ordered_8b));
        }
      }
    }
  } else {
    // ---- 6-tap and 8-tap kernels share the first three filt tables ----
    __m256i filt_tbl[4];

    filt_tbl[0] = _mm256_loadu_si256((__m256i const *)filt1_global_avx2);
    filt_tbl[1] = _mm256_loadu_si256((__m256i const *)filt2_global_avx2);
    filt_tbl[2] = _mm256_loadu_si256((__m256i const *)filt3_global_avx2);

    if (num_taps == 6) {
      // Window starts two pixels to the left.
      const uint8_t *in = src - 2;

      prepare_half_coeffs_6tap_avx2(filter_params_x, subpel_x_q4, taps_256);

      switch (w) {
        case 8:
          do {
            const __m256i sum =
                x_convolve_6tap_8x2_avx2(in, src_stride, taps_256, filt_tbl);
            sr_x_round_store_8x2_avx2(sum, dst, dst_stride);

            in += 2 * src_stride;
            dst += 2 * dst_stride;
            rows_left -= 2;
          } while (rows_left != 0);
          break;

        case 16:
          do {
            __m256i sums[2];

            x_convolve_6tap_16x2_avx2(in, src_stride, taps_256, filt_tbl, sums);
            sr_x_round_store_16x2_avx2(sums, dst, dst_stride);

            in += 2 * src_stride;
            dst += 2 * dst_stride;
            rows_left -= 2;
          } while (rows_left != 0);
          break;

        case 32:
          do {
            sr_x_6tap_32_avx2(in, taps_256, filt_tbl, dst);
            in += src_stride;
            dst += dst_stride;
          } while (--rows_left != 0);
          break;

        case 64:
          do {
            for (int32_t col = 0; col < 64; col += 32) {
              sr_x_6tap_32_avx2(in + col, taps_256, filt_tbl, dst + col);
            }
            in += src_stride;
            dst += dst_stride;
          } while (--rows_left != 0);
          break;

        default:
          assert(w == 128);

          do {
            for (int32_t col = 0; col < 128; col += 32) {
              sr_x_6tap_32_avx2(in + col, taps_256, filt_tbl, dst + col);
            }
            in += src_stride;
            dst += dst_stride;
          } while (--rows_left != 0);
          break;
      }
    } else if (num_taps == 8) {
      // Window starts three pixels to the left; a fourth filt table is needed.
      const uint8_t *in = src - 3;

      filt_tbl[3] = _mm256_loadu_si256((__m256i const *)filt4_global_avx2);

      prepare_half_coeffs_8tap_avx2(filter_params_x, subpel_x_q4, taps_256);

      switch (w) {
        case 8:
          do {
            const __m256i sum =
                x_convolve_8tap_8x2_avx2(in, src_stride, taps_256, filt_tbl);
            sr_x_round_store_8x2_avx2(sum, dst, dst_stride);

            in += 2 * src_stride;
            dst += 2 * dst_stride;
            rows_left -= 2;
          } while (rows_left != 0);
          break;

        case 16:
          do {
            __m256i sums[2];

            x_convolve_8tap_16x2_avx2(in, src_stride, taps_256, filt_tbl, sums);
            sr_x_round_store_16x2_avx2(sums, dst, dst_stride);

            in += 2 * src_stride;
            dst += 2 * dst_stride;
            rows_left -= 2;
          } while (rows_left != 0);
          break;

        case 32:
          do {
            sr_x_8tap_32_avx2(in, taps_256, filt_tbl, dst);
            in += src_stride;
            dst += dst_stride;
          } while (--rows_left != 0);
          break;

        case 64:
          do {
            for (int32_t col = 0; col < 64; col += 32) {
              sr_x_8tap_32_avx2(in + col, taps_256, filt_tbl, dst + col);
            }
            in += src_stride;
            dst += dst_stride;
          } while (--rows_left != 0);
          break;

        default:
          assert(w == 128);

          do {
            for (int32_t col = 0; col < 128; col += 32) {
              sr_x_8tap_32_avx2(in + col, taps_256, filt_tbl, dst + col);
            }
            in += src_stride;
            dst += dst_stride;
          } while (--rows_left != 0);
          break;
      }
    }
  }
}
Unexecuted instantiation: convolve_2d_avx2.c:av1_convolve_x_sr_specialized_avx2
convolve_avx2.c:av1_convolve_x_sr_specialized_avx2
Line
Count
Source
2940
755k
    const int32_t subpel_x_q4, ConvolveParams *conv_params) {
2941
755k
  int32_t y = h;
2942
755k
  __m128i coeffs_128[4];
2943
755k
  __m256i coeffs_256[4];
2944
2945
755k
  assert(conv_params->round_0 == 3);
2946
755k
  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
2947
755k
         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
2948
755k
  (void)conv_params;
2949
2950
755k
  const int horz_tap = get_filter_tap(filter_params_x, subpel_x_q4);
2951
2952
755k
  if (horz_tap == 2) {
2953
    // horz_filt as 2 tap
2954
32.6k
    const uint8_t *src_ptr = src;
2955
2956
32.6k
    if (subpel_x_q4 != 8) {
2957
11.9k
      if (w <= 8) {
2958
8.72k
        prepare_half_coeffs_2tap_ssse3(filter_params_x, subpel_x_q4,
2959
8.72k
                                       coeffs_128);
2960
2961
8.72k
        if (w == 2) {
2962
3.53k
          do {
2963
3.53k
            const __m128i res =
2964
3.53k
                x_convolve_2tap_2x2_sse4_1(src_ptr, src_stride, coeffs_128);
2965
3.53k
            const __m128i r = sr_x_round_sse2(res);
2966
3.53k
            pack_store_2x2_sse2(r, dst, dst_stride);
2967
3.53k
            src_ptr += 2 * src_stride;
2968
3.53k
            dst += 2 * dst_stride;
2969
3.53k
            y -= 2;
2970
3.53k
          } while (y);
2971
7.23k
        } else if (w == 4) {
2972
12.3k
          do {
2973
12.3k
            const __m128i res =
2974
12.3k
                x_convolve_2tap_4x2_ssse3(src_ptr, src_stride, coeffs_128);
2975
12.3k
            const __m128i r = sr_x_round_sse2(res);
2976
12.3k
            pack_store_4x2_sse2(r, dst, dst_stride);
2977
12.3k
            src_ptr += 2 * src_stride;
2978
12.3k
            dst += 2 * dst_stride;
2979
12.3k
            y -= 2;
2980
12.3k
          } while (y);
2981
3.88k
        } else {
2982
3.35k
          assert(w == 8);
2983
2984
11.4k
          do {
2985
11.4k
            __m128i res[2];
2986
2987
11.4k
            x_convolve_2tap_8x2_ssse3(src_ptr, src_stride, coeffs_128, res);
2988
11.4k
            res[0] = sr_x_round_sse2(res[0]);
2989
11.4k
            res[1] = sr_x_round_sse2(res[1]);
2990
11.4k
            const __m128i d = _mm_packus_epi16(res[0], res[1]);
2991
11.4k
            _mm_storel_epi64((__m128i *)dst, d);
2992
11.4k
            _mm_storeh_epi64((__m128i *)(dst + dst_stride), d);
2993
2994
11.4k
            src_ptr += 2 * src_stride;
2995
11.4k
            dst += 2 * dst_stride;
2996
11.4k
            y -= 2;
2997
11.4k
          } while (y);
2998
3.35k
        }
2999
8.72k
      } else {
3000
3.24k
        prepare_half_coeffs_2tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);
3001
3002
3.24k
        if (w == 16) {
3003
9.35k
          do {
3004
9.35k
            __m256i r[2];
3005
3006
9.35k
            x_convolve_2tap_16x2_avx2(src_ptr, src_stride, coeffs_256, r);
3007
9.35k
            sr_x_round_store_16x2_avx2(r, dst, dst_stride);
3008
9.35k
            src_ptr += 2 * src_stride;
3009
9.35k
            dst += 2 * dst_stride;
3010
9.35k
            y -= 2;
3011
9.35k
          } while (y);
3012
1.84k
        } else if (w == 32) {
3013
18.6k
          do {
3014
18.6k
            sr_x_2tap_32_avx2(src_ptr, coeffs_256, dst);
3015
18.6k
            src_ptr += src_stride;
3016
18.6k
            dst += dst_stride;
3017
18.6k
          } while (--y);
3018
750
        } else if (w == 64) {
3019
23.8k
          do {
3020
23.8k
            sr_x_2tap_32_avx2(src_ptr + 0 * 32, coeffs_256, dst + 0 * 32);
3021
23.8k
            sr_x_2tap_32_avx2(src_ptr + 1 * 32, coeffs_256, dst + 1 * 32);
3022
23.8k
            src_ptr += src_stride;
3023
23.8k
            dst += dst_stride;
3024
23.8k
          } while (--y);
3025
513
        } else {
3026
136
          assert(w == 128);
3027
3028
11.0k
          do {
3029
11.0k
            sr_x_2tap_32_avx2(src_ptr + 0 * 32, coeffs_256, dst + 0 * 32);
3030
11.0k
            sr_x_2tap_32_avx2(src_ptr + 1 * 32, coeffs_256, dst + 1 * 32);
3031
11.0k
            sr_x_2tap_32_avx2(src_ptr + 2 * 32, coeffs_256, dst + 2 * 32);
3032
11.0k
            sr_x_2tap_32_avx2(src_ptr + 3 * 32, coeffs_256, dst + 3 * 32);
3033
11.0k
            src_ptr += src_stride;
3034
11.0k
            dst += dst_stride;
3035
11.0k
          } while (--y);
3036
136
        }
3037
3.24k
      }
3038
20.6k
    } else {
3039
      // average to get half pel
3040
20.6k
      if (w == 2) {
3041
7.38k
        do {
3042
7.38k
          __m128i s_128;
3043
3044
7.38k
          s_128 = load_u8_4x2_sse4_1(src_ptr, src_stride);
3045
7.38k
          const __m128i s1 = _mm_srli_si128(s_128, 1);
3046
7.38k
          const __m128i d = _mm_avg_epu8(s_128, s1);
3047
7.38k
          *(uint16_t *)dst = (uint16_t)_mm_cvtsi128_si32(d);
3048
7.38k
          *(uint16_t *)(dst + dst_stride) = _mm_extract_epi16(d, 2);
3049
3050
7.38k
          src_ptr += 2 * src_stride;
3051
7.38k
          dst += 2 * dst_stride;
3052
7.38k
          y -= 2;
3053
7.38k
        } while (y);
3054
17.0k
      } else if (w == 4) {
3055
21.4k
        do {
3056
21.4k
          __m128i s_128;
3057
3058
21.4k
          s_128 = load_u8_8x2_sse2(src_ptr, src_stride);
3059
21.4k
          const __m128i s1 = _mm_srli_si128(s_128, 1);
3060
21.4k
          const __m128i d = _mm_avg_epu8(s_128, s1);
3061
21.4k
          xx_storel_32(dst, d);
3062
21.4k
          *(int32_t *)(dst + dst_stride) = _mm_extract_epi32(d, 2);
3063
3064
21.4k
          src_ptr += 2 * src_stride;
3065
21.4k
          dst += 2 * dst_stride;
3066
21.4k
          y -= 2;
3067
21.4k
        } while (y);
3068
9.14k
      } else if (w == 8) {
3069
17.6k
        do {
3070
17.6k
          const __m128i s00 = _mm_loadu_si128((__m128i *)src_ptr);
3071
17.6k
          const __m128i s10 =
3072
17.6k
              _mm_loadu_si128((__m128i *)(src_ptr + src_stride));
3073
17.6k
          const __m128i s01 = _mm_srli_si128(s00, 1);
3074
17.6k
          const __m128i s11 = _mm_srli_si128(s10, 1);
3075
17.6k
          const __m128i d0 = _mm_avg_epu8(s00, s01);
3076
17.6k
          const __m128i d1 = _mm_avg_epu8(s10, s11);
3077
17.6k
          _mm_storel_epi64((__m128i *)dst, d0);
3078
17.6k
          _mm_storel_epi64((__m128i *)(dst + dst_stride), d1);
3079
3080
17.6k
          src_ptr += 2 * src_stride;
3081
17.6k
          dst += 2 * dst_stride;
3082
17.6k
          y -= 2;
3083
17.6k
        } while (y);
3084
5.59k
      } else if (w == 16) {
3085
12.2k
        do {
3086
12.2k
          const __m128i s00 = _mm_loadu_si128((__m128i *)src_ptr);
3087
12.2k
          const __m128i s01 = _mm_loadu_si128((__m128i *)(src_ptr + 1));
3088
12.2k
          const __m128i s10 =
3089
12.2k
              _mm_loadu_si128((__m128i *)(src_ptr + src_stride));
3090
12.2k
          const __m128i s11 =
3091
12.2k
              _mm_loadu_si128((__m128i *)(src_ptr + src_stride + 1));
3092
12.2k
          const __m128i d0 = _mm_avg_epu8(s00, s01);
3093
12.2k
          const __m128i d1 = _mm_avg_epu8(s10, s11);
3094
12.2k
          _mm_storeu_si128((__m128i *)dst, d0);
3095
12.2k
          _mm_storeu_si128((__m128i *)(dst + dst_stride), d1);
3096
3097
12.2k
          src_ptr += 2 * src_stride;
3098
12.2k
          dst += 2 * dst_stride;
3099
12.2k
          y -= 2;
3100
12.2k
        } while (y);
3101
2.19k
      } else if (w == 32) {
3102
21.6k
        do {
3103
21.6k
          sr_x_2tap_32_avg_avx2(src_ptr, dst);
3104
21.6k
          src_ptr += src_stride;
3105
21.6k
          dst += dst_stride;
3106
21.6k
        } while (--y);
3107
905
      } else if (w == 64) {
3108
14.2k
        do {
3109
14.2k
          sr_x_2tap_32_avg_avx2(src_ptr + 0 * 32, dst + 0 * 32);
3110
14.2k
          sr_x_2tap_32_avg_avx2(src_ptr + 1 * 32, dst + 1 * 32);
3111
14.2k
          src_ptr += src_stride;
3112
14.2k
          dst += dst_stride;
3113
14.2k
        } while (--y);
3114
298
      } else {
3115
151
        assert(w == 128);
3116
3117
15.4k
        do {
3118
15.4k
          sr_x_2tap_32_avg_avx2(src_ptr + 0 * 32, dst + 0 * 32);
3119
15.4k
          sr_x_2tap_32_avg_avx2(src_ptr + 1 * 32, dst + 1 * 32);
3120
15.4k
          sr_x_2tap_32_avg_avx2(src_ptr + 2 * 32, dst + 2 * 32);
3121
15.4k
          sr_x_2tap_32_avg_avx2(src_ptr + 3 * 32, dst + 3 * 32);
3122
15.4k
          src_ptr += src_stride;
3123
15.4k
          dst += dst_stride;
3124
15.4k
        } while (--y);
3125
151
      }
3126
20.6k
    }
3127
723k
  } else if (horz_tap == 4) {
3128
    // horz_filt as 4 tap
3129
318k
    const uint8_t *src_ptr = src - 1;
3130
3131
318k
    prepare_half_coeffs_4tap_ssse3(filter_params_x, subpel_x_q4, coeffs_128);
3132
3133
318k
    if (w == 2) {
3134
145k
      do {
3135
145k
        const __m128i res =
3136
145k
            x_convolve_4tap_2x2_ssse3(src_ptr, src_stride, coeffs_128);
3137
145k
        const __m128i r = sr_x_round_sse2(res);
3138
145k
        pack_store_2x2_sse2(r, dst, dst_stride);
3139
145k
        src_ptr += 2 * src_stride;
3140
145k
        dst += 2 * dst_stride;
3141
145k
        y -= 2;
3142
145k
      } while (y);
3143
264k
    } else if (w == 4) {
3144
803k
      do {
3145
803k
        const __m128i res =
3146
803k
            x_convolve_4tap_4x2_ssse3(src_ptr, src_stride, coeffs_128);
3147
803k
        const __m128i r = sr_x_round_sse2(res);
3148
803k
        pack_store_4x2_sse2(r, dst, dst_stride);
3149
803k
        src_ptr += 2 * src_stride;
3150
803k
        dst += 2 * dst_stride;
3151
803k
        y -= 2;
3152
803k
      } while (y);
3153
238k
    } else if (w == 8) {
3154
      // TODO(chiyotsai@google.com): Reuse the old SIMD code here. Need to
3155
      // rewrite this for better performance later.
3156
14.1k
      __m256i filt_256[2];
3157
14.1k
      prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs_256);
3158
3159
14.1k
      filt_256[0] = _mm256_loadu_si256((__m256i const *)filt1_global_avx2);
3160
14.1k
      filt_256[1] = _mm256_loadu_si256((__m256i const *)filt2_global_avx2);
3161
69.7k
      for (int i = 0; i < h; i += 2) {
3162
55.5k
        const __m256i data = _mm256_permute2x128_si256(
3163
55.5k
            _mm256_castsi128_si256(
3164
55.5k
                _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))),
3165
55.5k
            _mm256_castsi128_si256(_mm_loadu_si128(
3166
55.5k
                (__m128i *)(&src_ptr[i * src_stride + src_stride]))),
3167
55.5k
            0x20);
3168
3169
55.5k
        __m256i res_16b = convolve_lowbd_x_4tap(data, coeffs_256 + 1, filt_256);
3170
55.5k
        res_16b = sr_x_round_avx2(res_16b);
3171
3172
55.5k
        __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
3173
3174
55.5k
        const __m128i res_0 = _mm256_castsi256_si128(res_8b);
3175
55.5k
        const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);
3176
3177
55.5k
        _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0);
3178
55.5k
        _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1);
3179
55.5k
      }
3180
14.1k
    } else {
3181
11.6k
      assert(!(w % 16));
3182
      // TODO(chiyotsai@google.com): Reuse the old SIMD code here. Need to
3183
      // rewrite this for better performance later.
3184
11.6k
      __m256i filt_256[2];
3185
11.6k
      prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs_256);
3186
11.6k
      filt_256[0] = _mm256_loadu_si256((__m256i const *)filt1_global_avx2);
3187
11.6k
      filt_256[1] = _mm256_loadu_si256((__m256i const *)filt2_global_avx2);
3188
3189
252k
      for (int i = 0; i < h; ++i) {
3190
984k
        for (int j = 0; j < w; j += 16) {
3191
          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17
3192
          // 18 19 20 21 22 23
3193
744k
          const __m256i data = _mm256_inserti128_si256(
3194
744k
              _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]),
3195
744k
              _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]),
3196
744k
              1);
3197
3198
744k
          __m256i res_16b =
3199
744k
              convolve_lowbd_x_4tap(data, coeffs_256 + 1, filt_256);
3200
744k
          res_16b = sr_x_round_avx2(res_16b);
3201
3202
          /* rounding code */
3203
          // 8 bit conversion and saturation to uint8
3204
744k
          __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
3205
3206
          // Store values into the destination buffer
3207
          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
3208
744k
          res_8b = _mm256_permute4x64_epi64(res_8b, 216);
3209
744k
          __m128i res = _mm256_castsi256_si128(res_8b);
3210
744k
          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
3211
744k
        }
3212
240k
      }
3213
11.6k
    }
3214
404k
  } else {
3215
404k
    __m256i filt_256[4];
3216
3217
404k
    filt_256[0] = _mm256_loadu_si256((__m256i const *)filt1_global_avx2);
3218
404k
    filt_256[1] = _mm256_loadu_si256((__m256i const *)filt2_global_avx2);
3219
404k
    filt_256[2] = _mm256_loadu_si256((__m256i const *)filt3_global_avx2);
3220
3221
404k
    if (horz_tap == 6) {
3222
      // horz_filt as 6 tap
3223
389k
      const uint8_t *src_ptr = src - 2;
3224
3225
389k
      prepare_half_coeffs_6tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);
3226
3227
389k
      if (w == 8) {
3228
824k
        do {
3229
824k
          const __m256i res = x_convolve_6tap_8x2_avx2(src_ptr, src_stride,
3230
824k
                                                       coeffs_256, filt_256);
3231
824k
          sr_x_round_store_8x2_avx2(res, dst, dst_stride);
3232
824k
          src_ptr += 2 * src_stride;
3233
824k
          dst += 2 * dst_stride;
3234
824k
          y -= 2;
3235
824k
        } while (y);
3236
219k
      } else if (w == 16) {
3237
673k
        do {
3238
673k
          __m256i r[2];
3239
3240
673k
          x_convolve_6tap_16x2_avx2(src_ptr, src_stride, coeffs_256, filt_256,
3241
673k
                                    r);
3242
673k
          sr_x_round_store_16x2_avx2(r, dst, dst_stride);
3243
673k
          src_ptr += 2 * src_stride;
3244
673k
          dst += 2 * dst_stride;
3245
673k
          y -= 2;
3246
673k
        } while (y);
3247
131k
      } else if (w == 32) {
3248
636k
        do {
3249
636k
          sr_x_6tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
3250
636k
          src_ptr += src_stride;
3251
636k
          dst += dst_stride;
3252
636k
        } while (--y);
3253
32.9k
      } else if (w == 64) {
3254
255k
        do {
3255
255k
          sr_x_6tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
3256
255k
          sr_x_6tap_32_avx2(src_ptr + 32, coeffs_256, filt_256, dst + 32);
3257
255k
          src_ptr += src_stride;
3258
255k
          dst += dst_stride;
3259
255k
        } while (--y);
3260
5.13k
      } else {
3261
874
        assert(w == 128);
3262
3263
106k
        do {
3264
106k
          sr_x_6tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
3265
106k
          sr_x_6tap_32_avx2(src_ptr + 1 * 32, coeffs_256, filt_256,
3266
106k
                            dst + 1 * 32);
3267
106k
          sr_x_6tap_32_avx2(src_ptr + 2 * 32, coeffs_256, filt_256,
3268
106k
                            dst + 2 * 32);
3269
106k
          sr_x_6tap_32_avx2(src_ptr + 3 * 32, coeffs_256, filt_256,
3270
106k
                            dst + 3 * 32);
3271
106k
          src_ptr += src_stride;
3272
106k
          dst += dst_stride;
3273
106k
        } while (--y);
3274
891
      }
3275
389k
    } else if (horz_tap == 8) {
3276
      // horz_filt as 8 tap
3277
15.0k
      const uint8_t *src_ptr = src - 3;
3278
3279
15.0k
      filt_256[3] = _mm256_loadu_si256((__m256i const *)filt4_global_avx2);
3280
3281
15.0k
      prepare_half_coeffs_8tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);
3282
3283
15.0k
      if (w == 8) {
3284
32.5k
        do {
3285
32.5k
          const __m256i res = x_convolve_8tap_8x2_avx2(src_ptr, src_stride,
3286
32.5k
                                                       coeffs_256, filt_256);
3287
32.5k
          sr_x_round_store_8x2_avx2(res, dst, dst_stride);
3288
32.5k
          src_ptr += 2 * src_stride;
3289
32.5k
          dst += 2 * dst_stride;
3290
32.5k
          y -= 2;
3291
32.5k
        } while (y);
3292
7.99k
      } else if (w == 16) {
3293
25.9k
        do {
3294
25.9k
          __m256i r[2];
3295
3296
25.9k
          x_convolve_8tap_16x2_avx2(src_ptr, src_stride, coeffs_256, filt_256,
3297
25.9k
                                    r);
3298
25.9k
          sr_x_round_store_16x2_avx2(r, dst, dst_stride);
3299
25.9k
          src_ptr += 2 * src_stride;
3300
25.9k
          dst += 2 * dst_stride;
3301
25.9k
          y -= 2;
3302
25.9k
        } while (y);
3303
4.69k
      } else if (w == 32) {
3304
35.4k
        do {
3305
35.4k
          sr_x_8tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
3306
35.4k
          src_ptr += src_stride;
3307
35.4k
          dst += dst_stride;
3308
35.4k
        } while (--y);
3309
1.55k
      } else if (w == 64) {
3310
28.9k
        do {
3311
28.9k
          sr_x_8tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
3312
28.9k
          sr_x_8tap_32_avx2(src_ptr + 32, coeffs_256, filt_256, dst + 32);
3313
28.9k
          src_ptr += src_stride;
3314
28.9k
          dst += dst_stride;
3315
28.9k
        } while (--y);
3316
594
      } else {
3317
211
        assert(w == 128);
3318
3319
21.8k
        do {
3320
21.8k
          sr_x_8tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
3321
21.8k
          sr_x_8tap_32_avx2(src_ptr + 1 * 32, coeffs_256, filt_256,
3322
21.8k
                            dst + 1 * 32);
3323
21.8k
          sr_x_8tap_32_avx2(src_ptr + 2 * 32, coeffs_256, filt_256,
3324
21.8k
                            dst + 2 * 32);
3325
21.8k
          sr_x_8tap_32_avx2(src_ptr + 3 * 32, coeffs_256, filt_256,
3326
21.8k
                            dst + 3 * 32);
3327
21.8k
          src_ptr += src_stride;
3328
21.8k
          dst += dst_stride;
3329
21.8k
        } while (--y);
3330
210
      }
3331
15.0k
    }
3332
404k
  }
3333
755k
}
3334
3335
#endif  // THIRD_PARTY_SVT_AV1_CONVOLVE_AVX2_H_