Coverage Report

Created: 2025-06-13 07:07

/src/aom/third_party/SVT-AV1/convolve_avx2.h
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved
3
 *
4
 * This source code is subject to the terms of the BSD 2 Clause License and
5
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6
 * was not distributed with this source code in the LICENSE file, you can
7
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8
 * Media Patent License 1.0 was not distributed with this source code in the
9
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10
 */
11
12
#ifndef THIRD_PARTY_SVT_AV1_CONVOLVE_AVX2_H_
13
#define THIRD_PARTY_SVT_AV1_CONVOLVE_AVX2_H_
14
15
#include "EbMemory_AVX2.h"
16
#include "EbMemory_SSE4_1.h"
17
#include "synonyms.h"
18
19
#include "aom_dsp/aom_filter.h"
20
#include "aom_dsp/x86/convolve_avx2.h"
21
#include "aom_dsp/x86/mem_sse2.h"
22
23
static inline void populate_coeffs_4tap_avx2(const __m128i coeffs_128,
24
227k
                                             __m256i coeffs[2]) {
25
227k
  const __m256i coeffs_256 = _mm256_broadcastsi128_si256(coeffs_128);
26
27
  // coeffs 2 3 2 3 2 3 2 3
28
227k
  coeffs[0] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0604u));
29
  // coeffs 4 5 4 5 4 5 4 5
30
227k
  coeffs[1] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0a08u));
31
227k
}
convolve_2d_avx2.c:populate_coeffs_4tap_avx2
Line
Count
Source
24
58.6k
                                             __m256i coeffs[2]) {
25
58.6k
  const __m256i coeffs_256 = _mm256_broadcastsi128_si256(coeffs_128);
26
27
  // coeffs 2 3 2 3 2 3 2 3
28
58.6k
  coeffs[0] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0604u));
29
  // coeffs 4 5 4 5 4 5 4 5
30
58.6k
  coeffs[1] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0a08u));
31
58.6k
}
convolve_avx2.c:populate_coeffs_4tap_avx2
Line
Count
Source
24
169k
                                             __m256i coeffs[2]) {
25
169k
  const __m256i coeffs_256 = _mm256_broadcastsi128_si256(coeffs_128);
26
27
  // coeffs 2 3 2 3 2 3 2 3
28
169k
  coeffs[0] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0604u));
29
  // coeffs 4 5 4 5 4 5 4 5
30
169k
  coeffs[1] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0a08u));
31
169k
}
32
33
static inline void populate_coeffs_6tap_avx2(const __m128i coeffs_128,
34
1.55M
                                             __m256i coeffs[3]) {
35
1.55M
  const __m256i coeffs_256 = _mm256_broadcastsi128_si256(coeffs_128);
36
37
  // coeffs 1 2 1 2 1 2 1 2
38
1.55M
  coeffs[0] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0402u));
39
  // coeffs 3 4 3 4 3 4 3 4
40
1.55M
  coeffs[1] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0806u));
41
  // coeffs 5 6 5 6 5 6 5 6
42
1.55M
  coeffs[2] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0C0Au));
43
1.55M
}
convolve_2d_avx2.c:populate_coeffs_6tap_avx2
Line
Count
Source
34
890k
                                             __m256i coeffs[3]) {
35
890k
  const __m256i coeffs_256 = _mm256_broadcastsi128_si256(coeffs_128);
36
37
  // coeffs 1 2 1 2 1 2 1 2
38
890k
  coeffs[0] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0402u));
39
  // coeffs 3 4 3 4 3 4 3 4
40
890k
  coeffs[1] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0806u));
41
  // coeffs 5 6 5 6 5 6 5 6
42
890k
  coeffs[2] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0C0Au));
43
890k
}
convolve_avx2.c:populate_coeffs_6tap_avx2
Line
Count
Source
34
664k
                                             __m256i coeffs[3]) {
35
664k
  const __m256i coeffs_256 = _mm256_broadcastsi128_si256(coeffs_128);
36
37
  // coeffs 1 2 1 2 1 2 1 2
38
664k
  coeffs[0] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0402u));
39
  // coeffs 3 4 3 4 3 4 3 4
40
664k
  coeffs[1] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0806u));
41
  // coeffs 5 6 5 6 5 6 5 6
42
664k
  coeffs[2] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0C0Au));
43
664k
}
44
45
static inline void populate_coeffs_8tap_avx2(const __m128i coeffs_128,
46
85.8k
                                             __m256i coeffs[4]) {
47
85.8k
  const __m256i coeffs_256 = _mm256_broadcastsi128_si256(coeffs_128);
48
49
  // coeffs 0 1 0 1 0 1 0 1
50
85.8k
  coeffs[0] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0200u));
51
  // coeffs 2 3 2 3 2 3 2 3
52
85.8k
  coeffs[1] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0604u));
53
  // coeffs 4 5 4 5 4 5 4 5
54
85.8k
  coeffs[2] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0a08u));
55
  // coeffs 6 7 6 7 6 7 6 7
56
85.8k
  coeffs[3] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0e0cu));
57
85.8k
}
convolve_2d_avx2.c:populate_coeffs_8tap_avx2
Line
Count
Source
46
53.9k
                                             __m256i coeffs[4]) {
47
53.9k
  const __m256i coeffs_256 = _mm256_broadcastsi128_si256(coeffs_128);
48
49
  // coeffs 0 1 0 1 0 1 0 1
50
53.9k
  coeffs[0] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0200u));
51
  // coeffs 2 3 2 3 2 3 2 3
52
53.9k
  coeffs[1] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0604u));
53
  // coeffs 4 5 4 5 4 5 4 5
54
53.9k
  coeffs[2] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0a08u));
55
  // coeffs 6 7 6 7 6 7 6 7
56
53.9k
  coeffs[3] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0e0cu));
57
53.9k
}
convolve_avx2.c:populate_coeffs_8tap_avx2
Line
Count
Source
46
31.8k
                                             __m256i coeffs[4]) {
47
31.8k
  const __m256i coeffs_256 = _mm256_broadcastsi128_si256(coeffs_128);
48
49
  // coeffs 0 1 0 1 0 1 0 1
50
31.8k
  coeffs[0] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0200u));
51
  // coeffs 2 3 2 3 2 3 2 3
52
31.8k
  coeffs[1] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0604u));
53
  // coeffs 4 5 4 5 4 5 4 5
54
31.8k
  coeffs[2] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0a08u));
55
  // coeffs 6 7 6 7 6 7 6 7
56
31.8k
  coeffs[3] = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x0e0cu));
57
31.8k
}
58
59
static inline void prepare_half_coeffs_2tap_ssse3(
60
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
61
94.4k
    __m128i *const coeffs /* [1] */) {
62
94.4k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
63
94.4k
      filter_params, subpel_q4 & SUBPEL_MASK);
64
94.4k
  const __m128i coeffs_8 = _mm_cvtsi32_si128(loadu_int32(filter + 3));
65
66
  // right shift all filter co-efficients by 1 to reduce the bits required.
67
  // This extra right shift will be taken care of at the end while rounding
68
  // the result.
69
  // Since all filter co-efficients are even, this change will not affect the
70
  // end result
71
94.4k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
72
94.4k
                            _mm_set1_epi16((short)0xffff)));
73
74
94.4k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
75
76
  // coeffs 3 4 3 4 3 4 3 4
77
94.4k
  *coeffs = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0200u));
78
94.4k
}
convolve_2d_avx2.c:prepare_half_coeffs_2tap_ssse3
Line
Count
Source
61
55.7k
    __m128i *const coeffs /* [1] */) {
62
55.7k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
63
55.7k
      filter_params, subpel_q4 & SUBPEL_MASK);
64
55.7k
  const __m128i coeffs_8 = _mm_cvtsi32_si128(loadu_int32(filter + 3));
65
66
  // right shift all filter co-efficients by 1 to reduce the bits required.
67
  // This extra right shift will be taken care of at the end while rounding
68
  // the result.
69
  // Since all filter co-efficients are even, this change will not affect the
70
  // end result
71
55.7k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
72
55.7k
                            _mm_set1_epi16((short)0xffff)));
73
74
55.7k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
75
76
  // coeffs 3 4 3 4 3 4 3 4
77
55.7k
  *coeffs = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0200u));
78
55.7k
}
convolve_avx2.c:prepare_half_coeffs_2tap_ssse3
Line
Count
Source
61
38.6k
    __m128i *const coeffs /* [1] */) {
62
38.6k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
63
38.6k
      filter_params, subpel_q4 & SUBPEL_MASK);
64
38.6k
  const __m128i coeffs_8 = _mm_cvtsi32_si128(loadu_int32(filter + 3));
65
66
  // right shift all filter co-efficients by 1 to reduce the bits required.
67
  // This extra right shift will be taken care of at the end while rounding
68
  // the result.
69
  // Since all filter co-efficients are even, this change will not affect the
70
  // end result
71
38.6k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
72
38.6k
                            _mm_set1_epi16((short)0xffff)));
73
74
38.6k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
75
76
  // coeffs 3 4 3 4 3 4 3 4
77
38.6k
  *coeffs = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0200u));
78
38.6k
}
79
80
static inline void prepare_half_coeffs_4tap_ssse3(
81
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
82
1.25M
    __m128i *const coeffs /* [2] */) {
83
1.25M
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
84
1.25M
      filter_params, subpel_q4 & SUBPEL_MASK);
85
1.25M
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
86
87
  // right shift all filter co-efficients by 1 to reduce the bits required.
88
  // This extra right shift will be taken care of at the end while rounding
89
  // the result.
90
  // Since all filter co-efficients are even, this change will not affect the
91
  // end result
92
1.25M
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
93
1.25M
                            _mm_set1_epi16((short)0xffff)));
94
95
1.25M
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
96
97
  // coeffs 2 3 2 3 2 3 2 3
98
1.25M
  coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0604u));
99
  // coeffs 4 5 4 5 4 5 4 5
100
1.25M
  coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0a08u));
101
1.25M
}
convolve_2d_avx2.c:prepare_half_coeffs_4tap_ssse3
Line
Count
Source
82
747k
    __m128i *const coeffs /* [2] */) {
83
747k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
84
747k
      filter_params, subpel_q4 & SUBPEL_MASK);
85
747k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
86
87
  // right shift all filter co-efficients by 1 to reduce the bits required.
88
  // This extra right shift will be taken care of at the end while rounding
89
  // the result.
90
  // Since all filter co-efficients are even, this change will not affect the
91
  // end result
92
747k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
93
747k
                            _mm_set1_epi16((short)0xffff)));
94
95
747k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
96
97
  // coeffs 2 3 2 3 2 3 2 3
98
747k
  coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0604u));
99
  // coeffs 4 5 4 5 4 5 4 5
100
747k
  coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0a08u));
101
747k
}
convolve_avx2.c:prepare_half_coeffs_4tap_ssse3
Line
Count
Source
82
510k
    __m128i *const coeffs /* [2] */) {
83
510k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
84
510k
      filter_params, subpel_q4 & SUBPEL_MASK);
85
510k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
86
87
  // right shift all filter co-efficients by 1 to reduce the bits required.
88
  // This extra right shift will be taken care of at the end while rounding
89
  // the result.
90
  // Since all filter co-efficients are even, this change will not affect the
91
  // end result
92
510k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
93
510k
                            _mm_set1_epi16((short)0xffff)));
94
95
510k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
96
97
  // coeffs 2 3 2 3 2 3 2 3
98
510k
  coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0604u));
99
  // coeffs 4 5 4 5 4 5 4 5
100
510k
  coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0a08u));
101
510k
}
102
103
static inline void prepare_half_coeffs_6tap_ssse3(
104
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
105
109k
    __m128i *const coeffs /* [3] */) {
106
109k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
107
109k
      filter_params, subpel_q4 & SUBPEL_MASK);
108
109k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
109
110
  // right shift all filter co-efficients by 1 to reduce the bits required.
111
  // This extra right shift will be taken care of at the end while rounding
112
  // the result.
113
  // Since all filter co-efficients are even, this change will not affect the
114
  // end result
115
109k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
116
109k
                            _mm_set1_epi16((short)0xffff)));
117
118
109k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
119
120
  // coeffs 1 2 1 2 1 2 1 2
121
109k
  coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0402u));
122
  // coeffs 3 4 3 4 3 4 3 4
123
109k
  coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0806u));
124
  // coeffs 5 6 5 6 5 6 5 6
125
109k
  coeffs[2] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0C0Au));
126
109k
}
Unexecuted instantiation: convolve_2d_avx2.c:prepare_half_coeffs_6tap_ssse3
convolve_avx2.c:prepare_half_coeffs_6tap_ssse3
Line
Count
Source
105
109k
    __m128i *const coeffs /* [3] */) {
106
109k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
107
109k
      filter_params, subpel_q4 & SUBPEL_MASK);
108
109k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
109
110
  // right shift all filter co-efficients by 1 to reduce the bits required.
111
  // This extra right shift will be taken care of at the end while rounding
112
  // the result.
113
  // Since all filter co-efficients are even, this change will not affect the
114
  // end result
115
109k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
116
109k
                            _mm_set1_epi16((short)0xffff)));
117
118
109k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
119
120
  // coeffs 1 2 1 2 1 2 1 2
121
109k
  coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0402u));
122
  // coeffs 3 4 3 4 3 4 3 4
123
109k
  coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0806u));
124
  // coeffs 5 6 5 6 5 6 5 6
125
109k
  coeffs[2] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0C0Au));
126
109k
}
127
128
static inline void prepare_half_coeffs_8tap_ssse3(
129
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
130
7.32k
    __m128i *const coeffs /* [4] */) {
131
7.32k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
132
7.32k
      filter_params, subpel_q4 & SUBPEL_MASK);
133
7.32k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
134
135
  // right shift all filter co-efficients by 1 to reduce the bits required.
136
  // This extra right shift will be taken care of at the end while rounding
137
  // the result.
138
  // Since all filter co-efficients are even, this change will not affect the
139
  // end result
140
7.32k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
141
7.32k
                            _mm_set1_epi16((short)0xffff)));
142
143
7.32k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
144
145
  // coeffs 0 1 0 1 0 1 0 1
146
7.32k
  coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0200u));
147
  // coeffs 2 3 2 3 2 3 2 3
148
7.32k
  coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0604u));
149
  // coeffs 4 5 4 5 4 5 4 5
150
7.32k
  coeffs[2] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0a08u));
151
  // coeffs 6 7 6 7 6 7 6 7
152
7.32k
  coeffs[3] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0e0cu));
153
7.32k
}
Unexecuted instantiation: convolve_2d_avx2.c:prepare_half_coeffs_8tap_ssse3
convolve_avx2.c:prepare_half_coeffs_8tap_ssse3
Line
Count
Source
130
7.32k
    __m128i *const coeffs /* [4] */) {
131
7.32k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
132
7.32k
      filter_params, subpel_q4 & SUBPEL_MASK);
133
7.32k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
134
135
  // right shift all filter co-efficients by 1 to reduce the bits required.
136
  // This extra right shift will be taken care of at the end while rounding
137
  // the result.
138
  // Since all filter co-efficients are even, this change will not affect the
139
  // end result
140
7.32k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
141
7.32k
                            _mm_set1_epi16((short)0xffff)));
142
143
7.32k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
144
145
  // coeffs 0 1 0 1 0 1 0 1
146
7.32k
  coeffs[0] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0200u));
147
  // coeffs 2 3 2 3 2 3 2 3
148
7.32k
  coeffs[1] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0604u));
149
  // coeffs 4 5 4 5 4 5 4 5
150
7.32k
  coeffs[2] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0a08u));
151
  // coeffs 6 7 6 7 6 7 6 7
152
7.32k
  coeffs[3] = _mm_shuffle_epi8(coeffs_1, _mm_set1_epi16(0x0e0cu));
153
7.32k
}
154
155
static inline void prepare_half_coeffs_2tap_avx2(
156
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
157
26.9k
    __m256i *const coeffs /* [1] */) {
158
26.9k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
159
26.9k
      filter_params, subpel_q4 & SUBPEL_MASK);
160
26.9k
  const __m128i coeffs_8 = _mm_cvtsi32_si128(loadu_int32(filter + 3));
161
26.9k
  const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);
162
163
  // right shift all filter co-efficients by 1 to reduce the bits required.
164
  // This extra right shift will be taken care of at the end while rounding
165
  // the result.
166
  // Since all filter co-efficients are even, this change will not affect the
167
  // end result
168
26.9k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
169
26.9k
                            _mm_set1_epi16((short)0xffff)));
170
171
26.9k
  const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);
172
173
  // coeffs 3 4 3 4 3 4 3 4
174
26.9k
  *coeffs = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0200u));
175
26.9k
}
convolve_2d_avx2.c:prepare_half_coeffs_2tap_avx2
Line
Count
Source
157
14.4k
    __m256i *const coeffs /* [1] */) {
158
14.4k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
159
14.4k
      filter_params, subpel_q4 & SUBPEL_MASK);
160
14.4k
  const __m128i coeffs_8 = _mm_cvtsi32_si128(loadu_int32(filter + 3));
161
14.4k
  const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);
162
163
  // right shift all filter co-efficients by 1 to reduce the bits required.
164
  // This extra right shift will be taken care of at the end while rounding
165
  // the result.
166
  // Since all filter co-efficients are even, this change will not affect the
167
  // end result
168
14.4k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
169
14.4k
                            _mm_set1_epi16((short)0xffff)));
170
171
14.4k
  const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);
172
173
  // coeffs 3 4 3 4 3 4 3 4
174
14.4k
  *coeffs = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0200u));
175
14.4k
}
convolve_avx2.c:prepare_half_coeffs_2tap_avx2
Line
Count
Source
157
12.4k
    __m256i *const coeffs /* [1] */) {
158
12.4k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
159
12.4k
      filter_params, subpel_q4 & SUBPEL_MASK);
160
12.4k
  const __m128i coeffs_8 = _mm_cvtsi32_si128(loadu_int32(filter + 3));
161
12.4k
  const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);
162
163
  // right shift all filter co-efficients by 1 to reduce the bits required.
164
  // This extra right shift will be taken care of at the end while rounding
165
  // the result.
166
  // Since all filter co-efficients are even, this change will not affect the
167
  // end result
168
12.4k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
169
12.4k
                            _mm_set1_epi16((short)0xffff)));
170
171
12.4k
  const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);
172
173
  // coeffs 3 4 3 4 3 4 3 4
174
12.4k
  *coeffs = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0200u));
175
12.4k
}
176
177
static inline void prepare_half_coeffs_4tap_avx2(
178
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
179
227k
    __m256i *const coeffs /* [2] */) {
180
227k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
181
227k
      filter_params, subpel_q4 & SUBPEL_MASK);
182
227k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
183
184
  // right shift all filter co-efficients by 1 to reduce the bits required.
185
  // This extra right shift will be taken care of at the end while rounding
186
  // the result.
187
  // Since all filter co-efficients are even, this change will not affect the
188
  // end result
189
227k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
190
227k
                            _mm_set1_epi16((short)0xffff)));
191
227k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
192
227k
  populate_coeffs_4tap_avx2(coeffs_1, coeffs);
193
227k
}
convolve_2d_avx2.c:prepare_half_coeffs_4tap_avx2
Line
Count
Source
179
58.6k
    __m256i *const coeffs /* [2] */) {
180
58.6k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
181
58.6k
      filter_params, subpel_q4 & SUBPEL_MASK);
182
58.6k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
183
184
  // right shift all filter co-efficients by 1 to reduce the bits required.
185
  // This extra right shift will be taken care of at the end while rounding
186
  // the result.
187
  // Since all filter co-efficients are even, this change will not affect the
188
  // end result
189
58.6k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
190
58.6k
                            _mm_set1_epi16((short)0xffff)));
191
58.6k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
192
58.6k
  populate_coeffs_4tap_avx2(coeffs_1, coeffs);
193
58.6k
}
convolve_avx2.c:prepare_half_coeffs_4tap_avx2
Line
Count
Source
179
169k
    __m256i *const coeffs /* [2] */) {
180
169k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
181
169k
      filter_params, subpel_q4 & SUBPEL_MASK);
182
169k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
183
184
  // right shift all filter co-efficients by 1 to reduce the bits required.
185
  // This extra right shift will be taken care of at the end while rounding
186
  // the result.
187
  // Since all filter co-efficients are even, this change will not affect the
188
  // end result
189
169k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
190
169k
                            _mm_set1_epi16((short)0xffff)));
191
169k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
192
169k
  populate_coeffs_4tap_avx2(coeffs_1, coeffs);
193
169k
}
194
195
static inline void prepare_half_coeffs_6tap_avx2(
196
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
197
1.55M
    __m256i *const coeffs /* [3] */) {
198
1.55M
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
199
1.55M
      filter_params, subpel_q4 & SUBPEL_MASK);
200
1.55M
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
201
202
  // right shift all filter co-efficients by 1 to reduce the bits required.
203
  // This extra right shift will be taken care of at the end while rounding
204
  // the result.
205
  // Since all filter co-efficients are even, this change will not affect the
206
  // end result
207
1.55M
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
208
1.55M
                            _mm_set1_epi16((short)0xffff)));
209
1.55M
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
210
1.55M
  populate_coeffs_6tap_avx2(coeffs_1, coeffs);
211
1.55M
}
convolve_2d_avx2.c:prepare_half_coeffs_6tap_avx2
Line
Count
Source
197
890k
    __m256i *const coeffs /* [3] */) {
198
890k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
199
890k
      filter_params, subpel_q4 & SUBPEL_MASK);
200
890k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
201
202
  // right shift all filter co-efficients by 1 to reduce the bits required.
203
  // This extra right shift will be taken care of at the end while rounding
204
  // the result.
205
  // Since all filter co-efficients are even, this change will not affect the
206
  // end result
207
890k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
208
890k
                            _mm_set1_epi16((short)0xffff)));
209
890k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
210
890k
  populate_coeffs_6tap_avx2(coeffs_1, coeffs);
211
890k
}
convolve_avx2.c:prepare_half_coeffs_6tap_avx2
Line
Count
Source
197
664k
    __m256i *const coeffs /* [3] */) {
198
664k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
199
664k
      filter_params, subpel_q4 & SUBPEL_MASK);
200
664k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
201
202
  // right shift all filter co-efficients by 1 to reduce the bits required.
203
  // This extra right shift will be taken care of at the end while rounding
204
  // the result.
205
  // Since all filter co-efficients are even, this change will not affect the
206
  // end result
207
664k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
208
664k
                            _mm_set1_epi16((short)0xffff)));
209
664k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
210
664k
  populate_coeffs_6tap_avx2(coeffs_1, coeffs);
211
664k
}
212
213
static inline void prepare_half_coeffs_8tap_avx2(
214
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
215
85.8k
    __m256i *const coeffs /* [4] */) {
216
85.8k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
217
85.8k
      filter_params, subpel_q4 & SUBPEL_MASK);
218
85.8k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
219
220
  // right shift all filter co-efficients by 1 to reduce the bits required.
221
  // This extra right shift will be taken care of at the end while rounding
222
  // the result.
223
  // Since all filter co-efficients are even, this change will not affect the
224
  // end result
225
85.8k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
226
85.8k
                            _mm_set1_epi16((short)0xffff)));
227
85.8k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
228
85.8k
  populate_coeffs_8tap_avx2(coeffs_1, coeffs);
229
85.8k
}
convolve_2d_avx2.c:prepare_half_coeffs_8tap_avx2
Line
Count
Source
215
53.9k
    __m256i *const coeffs /* [4] */) {
216
53.9k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
217
53.9k
      filter_params, subpel_q4 & SUBPEL_MASK);
218
53.9k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
219
220
  // right shift all filter co-efficients by 1 to reduce the bits required.
221
  // This extra right shift will be taken care of at the end while rounding
222
  // the result.
223
  // Since all filter co-efficients are even, this change will not affect the
224
  // end result
225
53.9k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
226
53.9k
                            _mm_set1_epi16((short)0xffff)));
227
53.9k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
228
53.9k
  populate_coeffs_8tap_avx2(coeffs_1, coeffs);
229
53.9k
}
convolve_avx2.c:prepare_half_coeffs_8tap_avx2
Line
Count
Source
215
31.8k
    __m256i *const coeffs /* [4] */) {
216
31.8k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
217
31.8k
      filter_params, subpel_q4 & SUBPEL_MASK);
218
31.8k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
219
220
  // right shift all filter co-efficients by 1 to reduce the bits required.
221
  // This extra right shift will be taken care of at the end while rounding
222
  // the result.
223
  // Since all filter co-efficients are even, this change will not affect the
224
  // end result
225
31.8k
  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
226
31.8k
                            _mm_set1_epi16((short)0xffff)));
227
31.8k
  const __m128i coeffs_1 = _mm_srai_epi16(coeffs_8, 1);
228
31.8k
  populate_coeffs_8tap_avx2(coeffs_1, coeffs);
229
31.8k
}
230
231
static inline void prepare_coeffs_2tap_sse2(
232
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
233
28.4k
    __m128i *const coeffs /* [1] */) {
234
28.4k
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
235
28.4k
      filter_params, subpel_q4 & SUBPEL_MASK);
236
237
28.4k
  const __m128i coeff = _mm_cvtsi32_si128(loadu_int32(filter + 3));
238
239
  // coeffs 3 4 3 4 3 4 3 4
240
28.4k
  coeffs[0] = _mm_shuffle_epi32(coeff, 0x00);
241
28.4k
}
convolve_2d_avx2.c:prepare_coeffs_2tap_sse2
Line
Count
Source
233
28.4k
    __m128i *const coeffs /* [1] */) {
234
28.4k
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
235
28.4k
      filter_params, subpel_q4 & SUBPEL_MASK);
236
237
28.4k
  const __m128i coeff = _mm_cvtsi32_si128(loadu_int32(filter + 3));
238
239
  // coeffs 3 4 3 4 3 4 3 4
240
28.4k
  coeffs[0] = _mm_shuffle_epi32(coeff, 0x00);
241
28.4k
}
Unexecuted instantiation: convolve_avx2.c:prepare_coeffs_2tap_sse2
242
243
static inline void prepare_coeffs_4tap_sse2(
244
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
245
86.6k
    __m128i *const coeffs /* [2] */) {
246
86.6k
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
247
86.6k
      filter_params, subpel_q4 & SUBPEL_MASK);
248
249
86.6k
  const __m128i coeff = _mm_loadu_si128((__m128i *)filter);
250
251
  // coeffs 2 3 2 3 2 3 2 3
252
86.6k
  coeffs[0] = _mm_shuffle_epi32(coeff, 0x55);
253
  // coeffs 4 5 4 5 4 5 4 5
254
86.6k
  coeffs[1] = _mm_shuffle_epi32(coeff, 0xaa);
255
86.6k
}
convolve_2d_avx2.c:prepare_coeffs_4tap_sse2
Line
Count
Source
245
86.6k
    __m128i *const coeffs /* [2] */) {
246
86.6k
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
247
86.6k
      filter_params, subpel_q4 & SUBPEL_MASK);
248
249
86.6k
  const __m128i coeff = _mm_loadu_si128((__m128i *)filter);
250
251
  // coeffs 2 3 2 3 2 3 2 3
252
86.6k
  coeffs[0] = _mm_shuffle_epi32(coeff, 0x55);
253
  // coeffs 4 5 4 5 4 5 4 5
254
86.6k
  coeffs[1] = _mm_shuffle_epi32(coeff, 0xaa);
255
86.6k
}
Unexecuted instantiation: convolve_avx2.c:prepare_coeffs_4tap_sse2
256
257
static inline void prepare_coeffs_6tap_ssse3(
258
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
259
59.7k
    __m128i *const coeffs /* [3] */) {
260
59.7k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
261
59.7k
      filter_params, subpel_q4 & SUBPEL_MASK);
262
59.7k
  const __m128i coeff = _mm_loadu_si128((__m128i *)filter);
263
264
  // coeffs 1 2 1 2 1 2 1 2
265
59.7k
  coeffs[0] = _mm_shuffle_epi8(coeff, _mm_set1_epi32(0x05040302u));
266
  // coeffs 3 4 3 4 3 4 3 4
267
59.7k
  coeffs[1] = _mm_shuffle_epi8(coeff, _mm_set1_epi32(0x09080706u));
268
  // coeffs 5 6 5 6 5 6 5 6
269
59.7k
  coeffs[2] = _mm_shuffle_epi8(coeff, _mm_set1_epi32(0x0D0C0B0Au));
270
59.7k
}
convolve_2d_avx2.c:prepare_coeffs_6tap_ssse3
Line
Count
Source
259
59.7k
    __m128i *const coeffs /* [3] */) {
260
59.7k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
261
59.7k
      filter_params, subpel_q4 & SUBPEL_MASK);
262
59.7k
  const __m128i coeff = _mm_loadu_si128((__m128i *)filter);
263
264
  // coeffs 1 2 1 2 1 2 1 2
265
59.7k
  coeffs[0] = _mm_shuffle_epi8(coeff, _mm_set1_epi32(0x05040302u));
266
  // coeffs 3 4 3 4 3 4 3 4
267
59.7k
  coeffs[1] = _mm_shuffle_epi8(coeff, _mm_set1_epi32(0x09080706u));
268
  // coeffs 5 6 5 6 5 6 5 6
269
59.7k
  coeffs[2] = _mm_shuffle_epi8(coeff, _mm_set1_epi32(0x0D0C0B0Au));
270
59.7k
}
Unexecuted instantiation: convolve_avx2.c:prepare_coeffs_6tap_ssse3
271
272
static inline void prepare_coeffs_8tap_sse2(
273
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
274
2.74k
    __m128i *const coeffs /* [4] */) {
275
2.74k
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
276
2.74k
      filter_params, subpel_q4 & SUBPEL_MASK);
277
278
2.74k
  const __m128i coeff = _mm_loadu_si128((__m128i *)filter);
279
280
  // coeffs 0 1 0 1 0 1 0 1
281
2.74k
  coeffs[0] = _mm_shuffle_epi32(coeff, 0x00);
282
  // coeffs 2 3 2 3 2 3 2 3
283
2.74k
  coeffs[1] = _mm_shuffle_epi32(coeff, 0x55);
284
  // coeffs 4 5 4 5 4 5 4 5
285
2.74k
  coeffs[2] = _mm_shuffle_epi32(coeff, 0xaa);
286
  // coeffs 6 7 6 7 6 7 6 7
287
2.74k
  coeffs[3] = _mm_shuffle_epi32(coeff, 0xff);
288
2.74k
}
convolve_2d_avx2.c:prepare_coeffs_8tap_sse2
Line
Count
Source
274
2.74k
    __m128i *const coeffs /* [4] */) {
275
2.74k
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
276
2.74k
      filter_params, subpel_q4 & SUBPEL_MASK);
277
278
2.74k
  const __m128i coeff = _mm_loadu_si128((__m128i *)filter);
279
280
  // coeffs 0 1 0 1 0 1 0 1
281
2.74k
  coeffs[0] = _mm_shuffle_epi32(coeff, 0x00);
282
  // coeffs 2 3 2 3 2 3 2 3
283
2.74k
  coeffs[1] = _mm_shuffle_epi32(coeff, 0x55);
284
  // coeffs 4 5 4 5 4 5 4 5
285
2.74k
  coeffs[2] = _mm_shuffle_epi32(coeff, 0xaa);
286
  // coeffs 6 7 6 7 6 7 6 7
287
2.74k
  coeffs[3] = _mm_shuffle_epi32(coeff, 0xff);
288
2.74k
}
Unexecuted instantiation: convolve_avx2.c:prepare_coeffs_8tap_sse2
289
290
static inline void prepare_coeffs_2tap_avx2(
291
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
292
26.4k
    __m256i *const coeffs /* [1] */) {
293
26.4k
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
294
26.4k
      filter_params, subpel_q4 & SUBPEL_MASK);
295
296
26.4k
  const __m128i coeff_8 = _mm_cvtsi32_si128(loadu_int32(filter + 3));
297
26.4k
  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
298
299
  // coeffs 3 4 3 4 3 4 3 4
300
26.4k
  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
301
26.4k
}
convolve_2d_avx2.c:prepare_coeffs_2tap_avx2
Line
Count
Source
292
26.4k
    __m256i *const coeffs /* [1] */) {
293
26.4k
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
294
26.4k
      filter_params, subpel_q4 & SUBPEL_MASK);
295
296
26.4k
  const __m128i coeff_8 = _mm_cvtsi32_si128(loadu_int32(filter + 3));
297
26.4k
  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
298
299
  // coeffs 3 4 3 4 3 4 3 4
300
26.4k
  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
301
26.4k
}
Unexecuted instantiation: convolve_avx2.c:prepare_coeffs_2tap_avx2
302
303
static inline void prepare_coeffs_4tap_avx2(
304
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
305
792k
    __m256i *const coeffs /* [2] */) {
306
792k
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
307
792k
      filter_params, subpel_q4 & SUBPEL_MASK);
308
309
792k
  const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter);
310
792k
  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
311
312
  // coeffs 2 3 2 3 2 3 2 3
313
792k
  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x55);
314
  // coeffs 4 5 4 5 4 5 4 5
315
792k
  coeffs[1] = _mm256_shuffle_epi32(coeff, 0xaa);
316
792k
}
convolve_2d_avx2.c:prepare_coeffs_4tap_avx2
Line
Count
Source
305
792k
    __m256i *const coeffs /* [2] */) {
306
792k
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
307
792k
      filter_params, subpel_q4 & SUBPEL_MASK);
308
309
792k
  const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter);
310
792k
  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
311
312
  // coeffs 2 3 2 3 2 3 2 3
313
792k
  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x55);
314
  // coeffs 4 5 4 5 4 5 4 5
315
792k
  coeffs[1] = _mm256_shuffle_epi32(coeff, 0xaa);
316
792k
}
Unexecuted instantiation: convolve_avx2.c:prepare_coeffs_4tap_avx2
317
318
static inline void prepare_coeffs_6tap_avx2(
319
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
320
758k
    __m256i *const coeffs /* [3]*/) {
321
758k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
322
758k
      filter_params, subpel_q4 & SUBPEL_MASK);
323
758k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
324
758k
  const __m256i coeff = _mm256_broadcastsi128_si256(coeffs_8);
325
326
  // coeffs 1 2 1 2 1 2 1 2
327
758k
  coeffs[0] = _mm256_shuffle_epi8(coeff, _mm256_set1_epi32(0x05040302u));
328
  // coeffs 3 4 3 4 3 4 3 4
329
758k
  coeffs[1] = _mm256_shuffle_epi8(coeff, _mm256_set1_epi32(0x09080706u));
330
  // coeffs 5 6 5 6 5 6 5 6
331
758k
  coeffs[2] = _mm256_shuffle_epi8(coeff, _mm256_set1_epi32(0x0D0C0B0Au));
332
758k
}
convolve_2d_avx2.c:prepare_coeffs_6tap_avx2
Line
Count
Source
320
758k
    __m256i *const coeffs /* [3]*/) {
321
758k
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
322
758k
      filter_params, subpel_q4 & SUBPEL_MASK);
323
758k
  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
324
758k
  const __m256i coeff = _mm256_broadcastsi128_si256(coeffs_8);
325
326
  // coeffs 1 2 1 2 1 2 1 2
327
758k
  coeffs[0] = _mm256_shuffle_epi8(coeff, _mm256_set1_epi32(0x05040302u));
328
  // coeffs 3 4 3 4 3 4 3 4
329
758k
  coeffs[1] = _mm256_shuffle_epi8(coeff, _mm256_set1_epi32(0x09080706u));
330
  // coeffs 5 6 5 6 5 6 5 6
331
758k
  coeffs[2] = _mm256_shuffle_epi8(coeff, _mm256_set1_epi32(0x0D0C0B0Au));
332
758k
}
Unexecuted instantiation: convolve_avx2.c:prepare_coeffs_6tap_avx2
333
334
static inline void prepare_coeffs_8tap_avx2(
335
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
336
49.4k
    __m256i *const coeffs /* [4] */) {
337
49.4k
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
338
49.4k
      filter_params, subpel_q4 & SUBPEL_MASK);
339
340
49.4k
  const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter);
341
49.4k
  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
342
343
  // coeffs 0 1 0 1 0 1 0 1
344
49.4k
  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
345
  // coeffs 2 3 2 3 2 3 2 3
346
49.4k
  coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55);
347
  // coeffs 4 5 4 5 4 5 4 5
348
49.4k
  coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa);
349
  // coeffs 6 7 6 7 6 7 6 7
350
49.4k
  coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff);
351
49.4k
}
convolve_2d_avx2.c:prepare_coeffs_8tap_avx2
Line
Count
Source
336
49.4k
    __m256i *const coeffs /* [4] */) {
337
49.4k
  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
338
49.4k
      filter_params, subpel_q4 & SUBPEL_MASK);
339
340
49.4k
  const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter);
341
49.4k
  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
342
343
  // coeffs 0 1 0 1 0 1 0 1
344
49.4k
  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
345
  // coeffs 2 3 2 3 2 3 2 3
346
49.4k
  coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55);
347
  // coeffs 4 5 4 5 4 5 4 5
348
49.4k
  coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa);
349
  // coeffs 6 7 6 7 6 7 6 7
350
49.4k
  coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff);
351
49.4k
}
Unexecuted instantiation: convolve_avx2.c:prepare_coeffs_8tap_avx2
352
353
static inline void load_16bit_5rows_avx2(const int16_t *const src,
354
                                         const ptrdiff_t stride,
355
0
                                         __m256i dst[5]) {
356
0
  dst[0] = _mm256_loadu_si256((__m256i *)(src + 0 * stride));
357
0
  dst[1] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
358
0
  dst[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
359
0
  dst[3] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
360
0
  dst[4] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
361
0
}
Unexecuted instantiation: convolve_2d_avx2.c:load_16bit_5rows_avx2
Unexecuted instantiation: convolve_avx2.c:load_16bit_5rows_avx2
362
363
static inline void load_16bit_7rows_avx2(const int16_t *const src,
364
                                         const ptrdiff_t stride,
365
65.4k
                                         __m256i dst[7]) {
366
65.4k
  dst[0] = _mm256_loadu_si256((__m256i *)(src + 0 * stride));
367
65.4k
  dst[1] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
368
65.4k
  dst[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
369
65.4k
  dst[3] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
370
65.4k
  dst[4] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
371
65.4k
  dst[5] = _mm256_loadu_si256((__m256i *)(src + 5 * stride));
372
65.4k
  dst[6] = _mm256_loadu_si256((__m256i *)(src + 6 * stride));
373
65.4k
}
convolve_2d_avx2.c:load_16bit_7rows_avx2
Line
Count
Source
365
65.4k
                                         __m256i dst[7]) {
366
65.4k
  dst[0] = _mm256_loadu_si256((__m256i *)(src + 0 * stride));
367
65.4k
  dst[1] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
368
65.4k
  dst[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
369
65.4k
  dst[3] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
370
65.4k
  dst[4] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
371
65.4k
  dst[5] = _mm256_loadu_si256((__m256i *)(src + 5 * stride));
372
65.4k
  dst[6] = _mm256_loadu_si256((__m256i *)(src + 6 * stride));
373
65.4k
}
Unexecuted instantiation: convolve_avx2.c:load_16bit_7rows_avx2
374
375
static AOM_FORCE_INLINE void load_16bit_8rows_avx2(const int16_t *const src,
376
                                                   const ptrdiff_t stride,
377
494
                                                   __m256i dst[8]) {
378
494
  dst[0] = _mm256_loadu_si256((__m256i *)(src + 0 * stride));
379
494
  dst[1] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
380
494
  dst[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
381
494
  dst[3] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
382
494
  dst[4] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
383
494
  dst[5] = _mm256_loadu_si256((__m256i *)(src + 5 * stride));
384
494
  dst[6] = _mm256_loadu_si256((__m256i *)(src + 6 * stride));
385
494
  dst[7] = _mm256_loadu_si256((__m256i *)(src + 7 * stride));
386
494
}
convolve_2d_avx2.c:load_16bit_8rows_avx2
Line
Count
Source
377
494
                                                   __m256i dst[8]) {
378
494
  dst[0] = _mm256_loadu_si256((__m256i *)(src + 0 * stride));
379
494
  dst[1] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
380
494
  dst[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
381
494
  dst[3] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
382
494
  dst[4] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
383
494
  dst[5] = _mm256_loadu_si256((__m256i *)(src + 5 * stride));
384
494
  dst[6] = _mm256_loadu_si256((__m256i *)(src + 6 * stride));
385
494
  dst[7] = _mm256_loadu_si256((__m256i *)(src + 7 * stride));
386
494
}
Unexecuted instantiation: convolve_avx2.c:load_16bit_8rows_avx2
387
388
static AOM_FORCE_INLINE void loadu_unpack_16bit_5rows_avx2(
389
    const int16_t *const src, const ptrdiff_t stride, __m256i s_256[5],
390
186k
    __m256i ss_256[5], __m256i tt_256[5]) {
391
186k
  s_256[0] = _mm256_loadu_si256((__m256i *)(src + 0 * stride));
392
186k
  s_256[1] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
393
186k
  s_256[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
394
186k
  s_256[3] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
395
186k
  s_256[4] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
396
397
186k
  ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
398
186k
  ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
399
186k
  ss_256[3] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
400
186k
  ss_256[4] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
401
402
186k
  tt_256[0] = _mm256_unpacklo_epi16(s_256[1], s_256[2]);
403
186k
  tt_256[1] = _mm256_unpacklo_epi16(s_256[3], s_256[4]);
404
186k
  tt_256[3] = _mm256_unpackhi_epi16(s_256[1], s_256[2]);
405
186k
  tt_256[4] = _mm256_unpackhi_epi16(s_256[3], s_256[4]);
406
186k
}
convolve_2d_avx2.c:loadu_unpack_16bit_5rows_avx2
Line
Count
Source
390
186k
    __m256i ss_256[5], __m256i tt_256[5]) {
391
186k
  s_256[0] = _mm256_loadu_si256((__m256i *)(src + 0 * stride));
392
186k
  s_256[1] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
393
186k
  s_256[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
394
186k
  s_256[3] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
395
186k
  s_256[4] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
396
397
186k
  ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
398
186k
  ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
399
186k
  ss_256[3] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
400
186k
  ss_256[4] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
401
402
186k
  tt_256[0] = _mm256_unpacklo_epi16(s_256[1], s_256[2]);
403
186k
  tt_256[1] = _mm256_unpacklo_epi16(s_256[3], s_256[4]);
404
186k
  tt_256[3] = _mm256_unpackhi_epi16(s_256[1], s_256[2]);
405
186k
  tt_256[4] = _mm256_unpackhi_epi16(s_256[3], s_256[4]);
406
186k
}
Unexecuted instantiation: convolve_avx2.c:loadu_unpack_16bit_5rows_avx2
407
408
static AOM_FORCE_INLINE void loadu_unpack_16bit_3rows_avx2(
409
    const int16_t *const src, const ptrdiff_t stride, __m256i s_256[3],
410
27.1k
    __m256i ss_256[3], __m256i tt_256[3]) {
411
27.1k
  s_256[0] = _mm256_loadu_si256((__m256i *)(src + 0 * stride));
412
27.1k
  s_256[1] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
413
27.1k
  s_256[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
414
415
27.1k
  ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
416
27.1k
  ss_256[2] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
417
418
27.1k
  tt_256[0] = _mm256_unpacklo_epi16(s_256[1], s_256[2]);
419
27.1k
  tt_256[2] = _mm256_unpackhi_epi16(s_256[1], s_256[2]);
420
27.1k
}
convolve_2d_avx2.c:loadu_unpack_16bit_3rows_avx2
Line
Count
Source
410
27.1k
    __m256i ss_256[3], __m256i tt_256[3]) {
411
27.1k
  s_256[0] = _mm256_loadu_si256((__m256i *)(src + 0 * stride));
412
27.1k
  s_256[1] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
413
27.1k
  s_256[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
414
415
27.1k
  ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
416
27.1k
  ss_256[2] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
417
418
27.1k
  tt_256[0] = _mm256_unpacklo_epi16(s_256[1], s_256[2]);
419
27.1k
  tt_256[2] = _mm256_unpackhi_epi16(s_256[1], s_256[2]);
420
27.1k
}
Unexecuted instantiation: convolve_avx2.c:loadu_unpack_16bit_3rows_avx2
421
422
static inline void convolve_8tap_unpack_avx2(const __m256i s[6],
423
133k
                                             __m256i ss[7]) {
424
133k
  ss[0] = _mm256_unpacklo_epi16(s[0], s[1]);
425
133k
  ss[1] = _mm256_unpacklo_epi16(s[2], s[3]);
426
133k
  ss[2] = _mm256_unpacklo_epi16(s[4], s[5]);
427
133k
  ss[4] = _mm256_unpackhi_epi16(s[0], s[1]);
428
133k
  ss[5] = _mm256_unpackhi_epi16(s[2], s[3]);
429
133k
  ss[6] = _mm256_unpackhi_epi16(s[4], s[5]);
430
133k
}
convolve_2d_avx2.c:convolve_8tap_unpack_avx2
Line
Count
Source
423
133k
                                             __m256i ss[7]) {
424
133k
  ss[0] = _mm256_unpacklo_epi16(s[0], s[1]);
425
133k
  ss[1] = _mm256_unpacklo_epi16(s[2], s[3]);
426
133k
  ss[2] = _mm256_unpacklo_epi16(s[4], s[5]);
427
133k
  ss[4] = _mm256_unpackhi_epi16(s[0], s[1]);
428
133k
  ss[5] = _mm256_unpackhi_epi16(s[2], s[3]);
429
133k
  ss[6] = _mm256_unpackhi_epi16(s[4], s[5]);
430
133k
}
Unexecuted instantiation: convolve_avx2.c:convolve_8tap_unpack_avx2
431
432
static inline __m128i convolve_2tap_ssse3(const __m128i ss[1],
433
483k
                                          const __m128i coeffs[1]) {
434
483k
  return _mm_maddubs_epi16(ss[0], coeffs[0]);
435
483k
}
convolve_2d_avx2.c:convolve_2tap_ssse3
Line
Count
Source
433
312k
                                          const __m128i coeffs[1]) {
434
312k
  return _mm_maddubs_epi16(ss[0], coeffs[0]);
435
312k
}
convolve_avx2.c:convolve_2tap_ssse3
Line
Count
Source
433
171k
                                          const __m128i coeffs[1]) {
434
171k
  return _mm_maddubs_epi16(ss[0], coeffs[0]);
435
171k
}
436
437
static inline __m128i convolve_4tap_ssse3(const __m128i ss[2],
438
5.58M
                                          const __m128i coeffs[2]) {
439
5.58M
  const __m128i res_23 = _mm_maddubs_epi16(ss[0], coeffs[0]);
440
5.58M
  const __m128i res_45 = _mm_maddubs_epi16(ss[1], coeffs[1]);
441
5.58M
  return _mm_add_epi16(res_23, res_45);
442
5.58M
}
convolve_2d_avx2.c:convolve_4tap_ssse3
Line
Count
Source
438
4.18M
                                          const __m128i coeffs[2]) {
439
4.18M
  const __m128i res_23 = _mm_maddubs_epi16(ss[0], coeffs[0]);
440
4.18M
  const __m128i res_45 = _mm_maddubs_epi16(ss[1], coeffs[1]);
441
4.18M
  return _mm_add_epi16(res_23, res_45);
442
4.18M
}
convolve_avx2.c:convolve_4tap_ssse3
Line
Count
Source
438
1.39M
                                          const __m128i coeffs[2]) {
439
1.39M
  const __m128i res_23 = _mm_maddubs_epi16(ss[0], coeffs[0]);
440
1.39M
  const __m128i res_45 = _mm_maddubs_epi16(ss[1], coeffs[1]);
441
1.39M
  return _mm_add_epi16(res_23, res_45);
442
1.39M
}
443
444
static inline __m128i convolve_6tap_ssse3(const __m128i ss[3],
445
567k
                                          const __m128i coeffs[3]) {
446
567k
  const __m128i res_12 = _mm_maddubs_epi16(ss[0], coeffs[0]);
447
567k
  const __m128i res_34 = _mm_maddubs_epi16(ss[1], coeffs[1]);
448
567k
  const __m128i res_56 = _mm_maddubs_epi16(ss[2], coeffs[2]);
449
567k
  const __m128i res_1256 = _mm_add_epi16(res_12, res_56);
450
567k
  return _mm_add_epi16(res_1256, res_34);
451
567k
}
Unexecuted instantiation: convolve_2d_avx2.c:convolve_6tap_ssse3
convolve_avx2.c:convolve_6tap_ssse3
Line
Count
Source
445
567k
                                          const __m128i coeffs[3]) {
446
567k
  const __m128i res_12 = _mm_maddubs_epi16(ss[0], coeffs[0]);
447
567k
  const __m128i res_34 = _mm_maddubs_epi16(ss[1], coeffs[1]);
448
567k
  const __m128i res_56 = _mm_maddubs_epi16(ss[2], coeffs[2]);
449
567k
  const __m128i res_1256 = _mm_add_epi16(res_12, res_56);
450
567k
  return _mm_add_epi16(res_1256, res_34);
451
567k
}
452
453
static inline __m128i convolve_8tap_ssse3(const __m128i ss[4],
454
38.2k
                                          const __m128i coeffs[4]) {
455
38.2k
  const __m128i res_01 = _mm_maddubs_epi16(ss[0], coeffs[0]);
456
38.2k
  const __m128i res_23 = _mm_maddubs_epi16(ss[1], coeffs[1]);
457
38.2k
  const __m128i res_45 = _mm_maddubs_epi16(ss[2], coeffs[2]);
458
38.2k
  const __m128i res_67 = _mm_maddubs_epi16(ss[3], coeffs[3]);
459
38.2k
  const __m128i res_0145 = _mm_add_epi16(res_01, res_45);
460
38.2k
  const __m128i res_2367 = _mm_add_epi16(res_23, res_67);
461
38.2k
  return _mm_add_epi16(res_0145, res_2367);
462
38.2k
}
Unexecuted instantiation: convolve_2d_avx2.c:convolve_8tap_ssse3
convolve_avx2.c:convolve_8tap_ssse3
Line
Count
Source
454
38.2k
                                          const __m128i coeffs[4]) {
455
38.2k
  const __m128i res_01 = _mm_maddubs_epi16(ss[0], coeffs[0]);
456
38.2k
  const __m128i res_23 = _mm_maddubs_epi16(ss[1], coeffs[1]);
457
38.2k
  const __m128i res_45 = _mm_maddubs_epi16(ss[2], coeffs[2]);
458
38.2k
  const __m128i res_67 = _mm_maddubs_epi16(ss[3], coeffs[3]);
459
38.2k
  const __m128i res_0145 = _mm_add_epi16(res_01, res_45);
460
38.2k
  const __m128i res_2367 = _mm_add_epi16(res_23, res_67);
461
38.2k
  return _mm_add_epi16(res_0145, res_2367);
462
38.2k
}
463
464
static inline __m256i convolve_2tap_avx2(const __m256i ss[1],
465
1.86M
                                         const __m256i coeffs[1]) {
466
1.86M
  return _mm256_maddubs_epi16(ss[0], coeffs[0]);
467
1.86M
}
convolve_2d_avx2.c:convolve_2tap_avx2
Line
Count
Source
465
973k
                                         const __m256i coeffs[1]) {
466
973k
  return _mm256_maddubs_epi16(ss[0], coeffs[0]);
467
973k
}
convolve_avx2.c:convolve_2tap_avx2
Line
Count
Source
465
890k
                                         const __m256i coeffs[1]) {
466
890k
  return _mm256_maddubs_epi16(ss[0], coeffs[0]);
467
890k
}
468
469
static inline __m256i convolve_4tap_avx2(const __m256i ss[2],
470
2.67M
                                         const __m256i coeffs[2]) {
471
2.67M
  const __m256i res_23 = _mm256_maddubs_epi16(ss[0], coeffs[0]);
472
2.67M
  const __m256i res_45 = _mm256_maddubs_epi16(ss[1], coeffs[1]);
473
2.67M
  return _mm256_add_epi16(res_23, res_45);
474
2.67M
}
convolve_2d_avx2.c:convolve_4tap_avx2
Line
Count
Source
470
1.47M
                                         const __m256i coeffs[2]) {
471
1.47M
  const __m256i res_23 = _mm256_maddubs_epi16(ss[0], coeffs[0]);
472
1.47M
  const __m256i res_45 = _mm256_maddubs_epi16(ss[1], coeffs[1]);
473
1.47M
  return _mm256_add_epi16(res_23, res_45);
474
1.47M
}
convolve_avx2.c:convolve_4tap_avx2
Line
Count
Source
470
1.20M
                                         const __m256i coeffs[2]) {
471
1.20M
  const __m256i res_23 = _mm256_maddubs_epi16(ss[0], coeffs[0]);
472
1.20M
  const __m256i res_45 = _mm256_maddubs_epi16(ss[1], coeffs[1]);
473
1.20M
  return _mm256_add_epi16(res_23, res_45);
474
1.20M
}
475
476
static inline __m256i convolve_6tap_avx2(const __m256i ss[3],
477
26.6M
                                         const __m256i coeffs[3]) {
478
26.6M
  const __m256i res_01 = _mm256_maddubs_epi16(ss[0], coeffs[0]);
479
26.6M
  const __m256i res_23 = _mm256_maddubs_epi16(ss[1], coeffs[1]);
480
26.6M
  const __m256i res_45 = _mm256_maddubs_epi16(ss[2], coeffs[2]);
481
26.6M
  const __m256i res_0145 = _mm256_add_epi16(res_01, res_45);
482
26.6M
  return _mm256_add_epi16(res_0145, res_23);
483
26.6M
}
convolve_2d_avx2.c:convolve_6tap_avx2
Line
Count
Source
477
15.7M
                                         const __m256i coeffs[3]) {
478
15.7M
  const __m256i res_01 = _mm256_maddubs_epi16(ss[0], coeffs[0]);
479
15.7M
  const __m256i res_23 = _mm256_maddubs_epi16(ss[1], coeffs[1]);
480
15.7M
  const __m256i res_45 = _mm256_maddubs_epi16(ss[2], coeffs[2]);
481
15.7M
  const __m256i res_0145 = _mm256_add_epi16(res_01, res_45);
482
15.7M
  return _mm256_add_epi16(res_0145, res_23);
483
15.7M
}
convolve_avx2.c:convolve_6tap_avx2
Line
Count
Source
477
10.8M
                                         const __m256i coeffs[3]) {
478
10.8M
  const __m256i res_01 = _mm256_maddubs_epi16(ss[0], coeffs[0]);
479
10.8M
  const __m256i res_23 = _mm256_maddubs_epi16(ss[1], coeffs[1]);
480
10.8M
  const __m256i res_45 = _mm256_maddubs_epi16(ss[2], coeffs[2]);
481
10.8M
  const __m256i res_0145 = _mm256_add_epi16(res_01, res_45);
482
10.8M
  return _mm256_add_epi16(res_0145, res_23);
483
10.8M
}
484
485
static inline __m256i convolve_8tap_avx2(const __m256i ss[4],
486
3.58M
                                         const __m256i coeffs[4]) {
487
3.58M
  const __m256i res_01 = _mm256_maddubs_epi16(ss[0], coeffs[0]);
488
3.58M
  const __m256i res_23 = _mm256_maddubs_epi16(ss[1], coeffs[1]);
489
3.58M
  const __m256i res_45 = _mm256_maddubs_epi16(ss[2], coeffs[2]);
490
3.58M
  const __m256i res_67 = _mm256_maddubs_epi16(ss[3], coeffs[3]);
491
3.58M
  const __m256i res_0145 = _mm256_add_epi16(res_01, res_45);
492
3.58M
  const __m256i res_2367 = _mm256_add_epi16(res_23, res_67);
493
3.58M
  return _mm256_add_epi16(res_0145, res_2367);
494
3.58M
}
convolve_2d_avx2.c:convolve_8tap_avx2
Line
Count
Source
486
2.56M
                                         const __m256i coeffs[4]) {
487
2.56M
  const __m256i res_01 = _mm256_maddubs_epi16(ss[0], coeffs[0]);
488
2.56M
  const __m256i res_23 = _mm256_maddubs_epi16(ss[1], coeffs[1]);
489
2.56M
  const __m256i res_45 = _mm256_maddubs_epi16(ss[2], coeffs[2]);
490
2.56M
  const __m256i res_67 = _mm256_maddubs_epi16(ss[3], coeffs[3]);
491
2.56M
  const __m256i res_0145 = _mm256_add_epi16(res_01, res_45);
492
2.56M
  const __m256i res_2367 = _mm256_add_epi16(res_23, res_67);
493
2.56M
  return _mm256_add_epi16(res_0145, res_2367);
494
2.56M
}
convolve_avx2.c:convolve_8tap_avx2
Line
Count
Source
486
1.01M
                                         const __m256i coeffs[4]) {
487
1.01M
  const __m256i res_01 = _mm256_maddubs_epi16(ss[0], coeffs[0]);
488
1.01M
  const __m256i res_23 = _mm256_maddubs_epi16(ss[1], coeffs[1]);
489
1.01M
  const __m256i res_45 = _mm256_maddubs_epi16(ss[2], coeffs[2]);
490
1.01M
  const __m256i res_67 = _mm256_maddubs_epi16(ss[3], coeffs[3]);
491
1.01M
  const __m256i res_0145 = _mm256_add_epi16(res_01, res_45);
492
1.01M
  const __m256i res_2367 = _mm256_add_epi16(res_23, res_67);
493
1.01M
  return _mm256_add_epi16(res_0145, res_2367);
494
1.01M
}
495
496
static inline __m128i convolve16_2tap_sse2(const __m128i ss[1],
497
142k
                                           const __m128i coeffs[1]) {
498
142k
  return _mm_madd_epi16(ss[0], coeffs[0]);
499
142k
}
convolve_2d_avx2.c:convolve16_2tap_sse2
Line
Count
Source
497
142k
                                           const __m128i coeffs[1]) {
498
142k
  return _mm_madd_epi16(ss[0], coeffs[0]);
499
142k
}
Unexecuted instantiation: convolve_avx2.c:convolve16_2tap_sse2
500
501
static inline __m128i convolve16_4tap_sse2(const __m128i ss[2],
502
146k
                                           const __m128i coeffs[2]) {
503
146k
  const __m128i res_01 = _mm_madd_epi16(ss[0], coeffs[0]);
504
146k
  const __m128i res_23 = _mm_madd_epi16(ss[1], coeffs[1]);
505
146k
  return _mm_add_epi32(res_01, res_23);
506
146k
}
convolve_2d_avx2.c:convolve16_4tap_sse2
Line
Count
Source
502
146k
                                           const __m128i coeffs[2]) {
503
146k
  const __m128i res_01 = _mm_madd_epi16(ss[0], coeffs[0]);
504
146k
  const __m128i res_23 = _mm_madd_epi16(ss[1], coeffs[1]);
505
146k
  return _mm_add_epi32(res_01, res_23);
506
146k
}
Unexecuted instantiation: convolve_avx2.c:convolve16_4tap_sse2
507
508
static inline __m128i convolve16_6tap_sse2(const __m128i ss[3],
509
239k
                                           const __m128i coeffs[3]) {
510
239k
  const __m128i res_01 = _mm_madd_epi16(ss[0], coeffs[0]);
511
239k
  const __m128i res_23 = _mm_madd_epi16(ss[1], coeffs[1]);
512
239k
  const __m128i res_45 = _mm_madd_epi16(ss[2], coeffs[2]);
513
239k
  const __m128i res_0123 = _mm_add_epi32(res_01, res_23);
514
239k
  return _mm_add_epi32(res_0123, res_45);
515
239k
}
convolve_2d_avx2.c:convolve16_6tap_sse2
Line
Count
Source
509
239k
                                           const __m128i coeffs[3]) {
510
239k
  const __m128i res_01 = _mm_madd_epi16(ss[0], coeffs[0]);
511
239k
  const __m128i res_23 = _mm_madd_epi16(ss[1], coeffs[1]);
512
239k
  const __m128i res_45 = _mm_madd_epi16(ss[2], coeffs[2]);
513
239k
  const __m128i res_0123 = _mm_add_epi32(res_01, res_23);
514
239k
  return _mm_add_epi32(res_0123, res_45);
515
239k
}
Unexecuted instantiation: convolve_avx2.c:convolve16_6tap_sse2
516
517
static inline __m128i convolve16_8tap_sse2(const __m128i ss[4],
518
10.9k
                                           const __m128i coeffs[4]) {
519
10.9k
  const __m128i res_01 = _mm_madd_epi16(ss[0], coeffs[0]);
520
10.9k
  const __m128i res_23 = _mm_madd_epi16(ss[1], coeffs[1]);
521
10.9k
  const __m128i res_45 = _mm_madd_epi16(ss[2], coeffs[2]);
522
10.9k
  const __m128i res_67 = _mm_madd_epi16(ss[3], coeffs[3]);
523
10.9k
  const __m128i res_0123 = _mm_add_epi32(res_01, res_23);
524
10.9k
  const __m128i res_4567 = _mm_add_epi32(res_45, res_67);
525
10.9k
  return _mm_add_epi32(res_0123, res_4567);
526
10.9k
}
convolve_2d_avx2.c:convolve16_8tap_sse2
Line
Count
Source
518
10.9k
                                           const __m128i coeffs[4]) {
519
10.9k
  const __m128i res_01 = _mm_madd_epi16(ss[0], coeffs[0]);
520
10.9k
  const __m128i res_23 = _mm_madd_epi16(ss[1], coeffs[1]);
521
10.9k
  const __m128i res_45 = _mm_madd_epi16(ss[2], coeffs[2]);
522
10.9k
  const __m128i res_67 = _mm_madd_epi16(ss[3], coeffs[3]);
523
10.9k
  const __m128i res_0123 = _mm_add_epi32(res_01, res_23);
524
10.9k
  const __m128i res_4567 = _mm_add_epi32(res_45, res_67);
525
10.9k
  return _mm_add_epi32(res_0123, res_4567);
526
10.9k
}
Unexecuted instantiation: convolve_avx2.c:convolve16_8tap_sse2
527
528
static inline __m256i convolve16_2tap_avx2(const __m256i ss[1],
529
1.72M
                                           const __m256i coeffs[1]) {
530
1.72M
  return _mm256_madd_epi16(ss[0], coeffs[0]);
531
1.72M
}
convolve_2d_avx2.c:convolve16_2tap_avx2
Line
Count
Source
529
1.72M
                                           const __m256i coeffs[1]) {
530
1.72M
  return _mm256_madd_epi16(ss[0], coeffs[0]);
531
1.72M
}
Unexecuted instantiation: convolve_avx2.c:convolve16_2tap_avx2
532
533
static inline __m256i convolve16_4tap_avx2(const __m256i ss[2],
534
6.65M
                                           const __m256i coeffs[2]) {
535
6.65M
  const __m256i res_1 = _mm256_madd_epi16(ss[0], coeffs[0]);
536
6.65M
  const __m256i res_2 = _mm256_madd_epi16(ss[1], coeffs[1]);
537
6.65M
  return _mm256_add_epi32(res_1, res_2);
538
6.65M
}
convolve_2d_avx2.c:convolve16_4tap_avx2
Line
Count
Source
534
6.65M
                                           const __m256i coeffs[2]) {
535
6.65M
  const __m256i res_1 = _mm256_madd_epi16(ss[0], coeffs[0]);
536
6.65M
  const __m256i res_2 = _mm256_madd_epi16(ss[1], coeffs[1]);
537
6.65M
  return _mm256_add_epi32(res_1, res_2);
538
6.65M
}
Unexecuted instantiation: convolve_avx2.c:convolve16_4tap_avx2
539
540
static inline __m256i convolve16_6tap_avx2(const __m256i ss[3],
541
21.2M
                                           const __m256i coeffs[3]) {
542
21.2M
  const __m256i res_01 = _mm256_madd_epi16(ss[0], coeffs[0]);
543
21.2M
  const __m256i res_23 = _mm256_madd_epi16(ss[1], coeffs[1]);
544
21.2M
  const __m256i res_45 = _mm256_madd_epi16(ss[2], coeffs[2]);
545
21.2M
  const __m256i res_0123 = _mm256_add_epi32(res_01, res_23);
546
21.2M
  return _mm256_add_epi32(res_0123, res_45);
547
21.2M
}
convolve_2d_avx2.c:convolve16_6tap_avx2
Line
Count
Source
541
21.2M
                                           const __m256i coeffs[3]) {
542
21.2M
  const __m256i res_01 = _mm256_madd_epi16(ss[0], coeffs[0]);
543
21.2M
  const __m256i res_23 = _mm256_madd_epi16(ss[1], coeffs[1]);
544
21.2M
  const __m256i res_45 = _mm256_madd_epi16(ss[2], coeffs[2]);
545
21.2M
  const __m256i res_0123 = _mm256_add_epi32(res_01, res_23);
546
21.2M
  return _mm256_add_epi32(res_0123, res_45);
547
21.2M
}
Unexecuted instantiation: convolve_avx2.c:convolve16_6tap_avx2
548
549
static inline __m256i convolve16_8tap_avx2(const __m256i ss[4],
550
3.78M
                                           const __m256i coeffs[4]) {
551
3.78M
  const __m256i res_01 = _mm256_madd_epi16(ss[0], coeffs[0]);
552
3.78M
  const __m256i res_23 = _mm256_madd_epi16(ss[1], coeffs[1]);
553
3.78M
  const __m256i res_45 = _mm256_madd_epi16(ss[2], coeffs[2]);
554
3.78M
  const __m256i res_67 = _mm256_madd_epi16(ss[3], coeffs[3]);
555
3.78M
  const __m256i res_0123 = _mm256_add_epi32(res_01, res_23);
556
3.78M
  const __m256i res_4567 = _mm256_add_epi32(res_45, res_67);
557
3.78M
  return _mm256_add_epi32(res_0123, res_4567);
558
3.78M
}
convolve_2d_avx2.c:convolve16_8tap_avx2
Line
Count
Source
550
3.78M
                                           const __m256i coeffs[4]) {
551
3.78M
  const __m256i res_01 = _mm256_madd_epi16(ss[0], coeffs[0]);
552
3.78M
  const __m256i res_23 = _mm256_madd_epi16(ss[1], coeffs[1]);
553
3.78M
  const __m256i res_45 = _mm256_madd_epi16(ss[2], coeffs[2]);
554
3.78M
  const __m256i res_67 = _mm256_madd_epi16(ss[3], coeffs[3]);
555
3.78M
  const __m256i res_0123 = _mm256_add_epi32(res_01, res_23);
556
3.78M
  const __m256i res_4567 = _mm256_add_epi32(res_45, res_67);
557
3.78M
  return _mm256_add_epi32(res_0123, res_4567);
558
3.78M
}
Unexecuted instantiation: convolve_avx2.c:convolve16_8tap_avx2
559
560
static inline __m256i x_convolve_4tap_avx2(const __m256i data,
561
                                           const __m256i coeffs[2],
562
1.47M
                                           const __m256i filt[2]) {
563
1.47M
  __m256i ss[2];
564
565
1.47M
  ss[0] = _mm256_shuffle_epi8(data, filt[0]);
566
1.47M
  ss[1] = _mm256_shuffle_epi8(data, filt[1]);
567
568
1.47M
  return convolve_4tap_avx2(ss, coeffs);
569
1.47M
}
convolve_2d_avx2.c:x_convolve_4tap_avx2
Line
Count
Source
562
1.47M
                                           const __m256i filt[2]) {
563
1.47M
  __m256i ss[2];
564
565
1.47M
  ss[0] = _mm256_shuffle_epi8(data, filt[0]);
566
1.47M
  ss[1] = _mm256_shuffle_epi8(data, filt[1]);
567
568
1.47M
  return convolve_4tap_avx2(ss, coeffs);
569
1.47M
}
Unexecuted instantiation: convolve_avx2.c:x_convolve_4tap_avx2
570
571
static inline __m256i x_convolve_6tap_avx2(const __m256i data,
572
                                           const __m256i coeffs[3],
573
21.8M
                                           const __m256i filt[3]) {
574
21.8M
  __m256i ss[3];
575
576
21.8M
  ss[0] = _mm256_shuffle_epi8(data, filt[0]);
577
21.8M
  ss[1] = _mm256_shuffle_epi8(data, filt[1]);
578
21.8M
  ss[2] = _mm256_shuffle_epi8(data, filt[2]);
579
580
21.8M
  return convolve_6tap_avx2(ss, coeffs);
581
21.8M
}
convolve_2d_avx2.c:x_convolve_6tap_avx2
Line
Count
Source
573
15.7M
                                           const __m256i filt[3]) {
574
15.7M
  __m256i ss[3];
575
576
15.7M
  ss[0] = _mm256_shuffle_epi8(data, filt[0]);
577
15.7M
  ss[1] = _mm256_shuffle_epi8(data, filt[1]);
578
15.7M
  ss[2] = _mm256_shuffle_epi8(data, filt[2]);
579
580
15.7M
  return convolve_6tap_avx2(ss, coeffs);
581
15.7M
}
convolve_avx2.c:x_convolve_6tap_avx2
Line
Count
Source
573
6.08M
                                           const __m256i filt[3]) {
574
6.08M
  __m256i ss[3];
575
576
6.08M
  ss[0] = _mm256_shuffle_epi8(data, filt[0]);
577
6.08M
  ss[1] = _mm256_shuffle_epi8(data, filt[1]);
578
6.08M
  ss[2] = _mm256_shuffle_epi8(data, filt[2]);
579
580
6.08M
  return convolve_6tap_avx2(ss, coeffs);
581
6.08M
}
582
583
static inline __m256i x_convolve_8tap_avx2(const __m256i data,
584
                                           const __m256i coeffs[4],
585
3.02M
                                           const __m256i filt[4]) {
586
3.02M
  __m256i ss[4];
587
588
3.02M
  ss[0] = _mm256_shuffle_epi8(data, filt[0]);
589
3.02M
  ss[1] = _mm256_shuffle_epi8(data, filt[1]);
590
3.02M
  ss[2] = _mm256_shuffle_epi8(data, filt[2]);
591
3.02M
  ss[3] = _mm256_shuffle_epi8(data, filt[3]);
592
593
3.02M
  return convolve_8tap_avx2(ss, coeffs);
594
3.02M
}
convolve_2d_avx2.c:x_convolve_8tap_avx2
Line
Count
Source
585
2.56M
                                           const __m256i filt[4]) {
586
2.56M
  __m256i ss[4];
587
588
2.56M
  ss[0] = _mm256_shuffle_epi8(data, filt[0]);
589
2.56M
  ss[1] = _mm256_shuffle_epi8(data, filt[1]);
590
2.56M
  ss[2] = _mm256_shuffle_epi8(data, filt[2]);
591
2.56M
  ss[3] = _mm256_shuffle_epi8(data, filt[3]);
592
593
2.56M
  return convolve_8tap_avx2(ss, coeffs);
594
2.56M
}
convolve_avx2.c:x_convolve_8tap_avx2
Line
Count
Source
585
454k
                                           const __m256i filt[4]) {
586
454k
  __m256i ss[4];
587
588
454k
  ss[0] = _mm256_shuffle_epi8(data, filt[0]);
589
454k
  ss[1] = _mm256_shuffle_epi8(data, filt[1]);
590
454k
  ss[2] = _mm256_shuffle_epi8(data, filt[2]);
591
454k
  ss[3] = _mm256_shuffle_epi8(data, filt[3]);
592
593
454k
  return convolve_8tap_avx2(ss, coeffs);
594
454k
}
595
596
6.89M
static inline __m256i sr_y_round_avx2(const __m256i src) {
597
6.89M
  const __m256i round = _mm256_set1_epi16(32);
598
6.89M
  const __m256i dst = _mm256_add_epi16(src, round);
599
6.89M
  return _mm256_srai_epi16(dst, FILTER_BITS - 1);
600
6.89M
}
Unexecuted instantiation: convolve_2d_avx2.c:sr_y_round_avx2
convolve_avx2.c:sr_y_round_avx2
Line
Count
Source
596
6.89M
static inline __m256i sr_y_round_avx2(const __m256i src) {
597
6.89M
  const __m256i round = _mm256_set1_epi16(32);
598
6.89M
  const __m256i dst = _mm256_add_epi16(src, round);
599
6.89M
  return _mm256_srai_epi16(dst, FILTER_BITS - 1);
600
6.89M
}
601
602
4.49M
static inline __m128i xy_x_round_sse2(const __m128i src) {
603
4.49M
  const __m128i round = _mm_set1_epi16(2);
604
4.49M
  const __m128i dst = _mm_add_epi16(src, round);
605
4.49M
  return _mm_srai_epi16(dst, 2);
606
4.49M
}
convolve_2d_avx2.c:xy_x_round_sse2
Line
Count
Source
602
4.49M
static inline __m128i xy_x_round_sse2(const __m128i src) {
603
4.49M
  const __m128i round = _mm_set1_epi16(2);
604
4.49M
  const __m128i dst = _mm_add_epi16(src, round);
605
4.49M
  return _mm_srai_epi16(dst, 2);
606
4.49M
}
Unexecuted instantiation: convolve_avx2.c:xy_x_round_sse2
607
608
20.7M
static inline __m256i xy_x_round_avx2(const __m256i src) {
609
20.7M
  const __m256i round = _mm256_set1_epi16(2);
610
20.7M
  const __m256i dst = _mm256_add_epi16(src, round);
611
20.7M
  return _mm256_srai_epi16(dst, 2);
612
20.7M
}
convolve_2d_avx2.c:xy_x_round_avx2
Line
Count
Source
608
20.7M
static inline __m256i xy_x_round_avx2(const __m256i src) {
609
20.7M
  const __m256i round = _mm256_set1_epi16(2);
610
20.7M
  const __m256i dst = _mm256_add_epi16(src, round);
611
20.7M
  return _mm256_srai_epi16(dst, 2);
612
20.7M
}
Unexecuted instantiation: convolve_avx2.c:xy_x_round_avx2
613
614
static inline void xy_x_round_store_2x2_sse2(const __m128i res,
615
784k
                                             int16_t *const dst) {
616
784k
  const __m128i d = xy_x_round_sse2(res);
617
784k
  _mm_storel_epi64((__m128i *)dst, d);
618
784k
}
convolve_2d_avx2.c:xy_x_round_store_2x2_sse2
Line
Count
Source
615
784k
                                             int16_t *const dst) {
616
784k
  const __m128i d = xy_x_round_sse2(res);
617
784k
  _mm_storel_epi64((__m128i *)dst, d);
618
784k
}
Unexecuted instantiation: convolve_avx2.c:xy_x_round_store_2x2_sse2
619
620
static inline void xy_x_round_store_4x2_sse2(const __m128i res,
621
3.52M
                                             int16_t *const dst) {
622
3.52M
  const __m128i d = xy_x_round_sse2(res);
623
3.52M
  _mm_storeu_si128((__m128i *)dst, d);
624
3.52M
}
convolve_2d_avx2.c:xy_x_round_store_4x2_sse2
Line
Count
Source
621
3.52M
                                             int16_t *const dst) {
622
3.52M
  const __m128i d = xy_x_round_sse2(res);
623
3.52M
  _mm_storeu_si128((__m128i *)dst, d);
624
3.52M
}
Unexecuted instantiation: convolve_avx2.c:xy_x_round_store_4x2_sse2
625
626
static inline void xy_x_round_store_8x2_sse2(const __m128i res[2],
627
90.6k
                                             int16_t *const dst) {
628
90.6k
  __m128i r[2];
629
630
90.6k
  r[0] = xy_x_round_sse2(res[0]);
631
90.6k
  r[1] = xy_x_round_sse2(res[1]);
632
90.6k
  _mm_storeu_si128((__m128i *)dst, r[0]);
633
90.6k
  _mm_storeu_si128((__m128i *)(dst + 8), r[1]);
634
90.6k
}
convolve_2d_avx2.c:xy_x_round_store_8x2_sse2
Line
Count
Source
627
90.6k
                                             int16_t *const dst) {
628
90.6k
  __m128i r[2];
629
630
90.6k
  r[0] = xy_x_round_sse2(res[0]);
631
90.6k
  r[1] = xy_x_round_sse2(res[1]);
632
90.6k
  _mm_storeu_si128((__m128i *)dst, r[0]);
633
90.6k
  _mm_storeu_si128((__m128i *)(dst + 8), r[1]);
634
90.6k
}
Unexecuted instantiation: convolve_avx2.c:xy_x_round_store_8x2_sse2
635
636
static inline void xy_x_round_store_8x2_avx2(const __m256i res,
637
3.70M
                                             int16_t *const dst) {
638
3.70M
  const __m256i d = xy_x_round_avx2(res);
639
3.70M
  _mm256_storeu_si256((__m256i *)dst, d);
640
3.70M
}
convolve_2d_avx2.c:xy_x_round_store_8x2_avx2
Line
Count
Source
637
3.70M
                                             int16_t *const dst) {
638
3.70M
  const __m256i d = xy_x_round_avx2(res);
639
3.70M
  _mm256_storeu_si256((__m256i *)dst, d);
640
3.70M
}
Unexecuted instantiation: convolve_avx2.c:xy_x_round_store_8x2_avx2
641
642
static inline void xy_x_round_store_32_avx2(const __m256i res[2],
643
2.53M
                                            int16_t *const dst) {
644
2.53M
  __m256i r[2];
645
646
2.53M
  r[0] = xy_x_round_avx2(res[0]);
647
2.53M
  r[1] = xy_x_round_avx2(res[1]);
648
2.53M
  const __m256i d0 =
649
2.53M
      _mm256_inserti128_si256(r[0], _mm256_castsi256_si128(r[1]), 1);
650
2.53M
  const __m256i d1 =
651
2.53M
      _mm256_inserti128_si256(r[1], _mm256_extracti128_si256(r[0], 1), 0);
652
2.53M
  _mm256_storeu_si256((__m256i *)dst, d0);
653
2.53M
  _mm256_storeu_si256((__m256i *)(dst + 16), d1);
654
2.53M
}
convolve_2d_avx2.c:xy_x_round_store_32_avx2
Line
Count
Source
643
2.53M
                                            int16_t *const dst) {
644
2.53M
  __m256i r[2];
645
646
2.53M
  r[0] = xy_x_round_avx2(res[0]);
647
2.53M
  r[1] = xy_x_round_avx2(res[1]);
648
2.53M
  const __m256i d0 =
649
2.53M
      _mm256_inserti128_si256(r[0], _mm256_castsi256_si128(r[1]), 1);
650
2.53M
  const __m256i d1 =
651
2.53M
      _mm256_inserti128_si256(r[1], _mm256_extracti128_si256(r[0], 1), 0);
652
2.53M
  _mm256_storeu_si256((__m256i *)dst, d0);
653
2.53M
  _mm256_storeu_si256((__m256i *)(dst + 16), d1);
654
2.53M
}
Unexecuted instantiation: convolve_avx2.c:xy_x_round_store_32_avx2
655
656
539k
static inline __m128i xy_y_round_sse2(const __m128i src) {
657
539k
  const __m128i round = _mm_set1_epi32(1024);
658
539k
  const __m128i dst = _mm_add_epi32(src, round);
659
539k
  return _mm_srai_epi32(dst, 11);
660
539k
}
convolve_2d_avx2.c:xy_y_round_sse2
Line
Count
Source
656
539k
static inline __m128i xy_y_round_sse2(const __m128i src) {
657
539k
  const __m128i round = _mm_set1_epi32(1024);
658
539k
  const __m128i dst = _mm_add_epi32(src, round);
659
539k
  return _mm_srai_epi32(dst, 11);
660
539k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_round_sse2
661
662
18.2k
static inline __m128i xy_y_round_half_pel_sse2(const __m128i src) {
663
18.2k
  const __m128i round = _mm_set1_epi16(16);
664
18.2k
  const __m128i dst = _mm_add_epi16(src, round);
665
18.2k
  return _mm_srai_epi16(dst, 5);
666
18.2k
}
convolve_2d_avx2.c:xy_y_round_half_pel_sse2
Line
Count
Source
662
18.2k
static inline __m128i xy_y_round_half_pel_sse2(const __m128i src) {
663
18.2k
  const __m128i round = _mm_set1_epi16(16);
664
18.2k
  const __m128i dst = _mm_add_epi16(src, round);
665
18.2k
  return _mm_srai_epi16(dst, 5);
666
18.2k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_round_half_pel_sse2
667
668
33.3M
static inline __m256i xy_y_round_avx2(const __m256i src) {
669
33.3M
  const __m256i round = _mm256_set1_epi32(1024);
670
33.3M
  const __m256i dst = _mm256_add_epi32(src, round);
671
33.3M
  return _mm256_srai_epi32(dst, 11);
672
33.3M
}
convolve_2d_avx2.c:xy_y_round_avx2
Line
Count
Source
668
33.3M
static inline __m256i xy_y_round_avx2(const __m256i src) {
669
33.3M
  const __m256i round = _mm256_set1_epi32(1024);
670
33.3M
  const __m256i dst = _mm256_add_epi32(src, round);
671
33.3M
  return _mm256_srai_epi32(dst, 11);
672
33.3M
}
Unexecuted instantiation: convolve_avx2.c:xy_y_round_avx2
673
674
15.6M
static inline __m256i xy_y_round_16_avx2(const __m256i r[2]) {
675
15.6M
  const __m256i r0 = xy_y_round_avx2(r[0]);
676
15.6M
  const __m256i r1 = xy_y_round_avx2(r[1]);
677
15.6M
  return _mm256_packs_epi32(r0, r1);
678
15.6M
}
convolve_2d_avx2.c:xy_y_round_16_avx2
Line
Count
Source
674
15.6M
static inline __m256i xy_y_round_16_avx2(const __m256i r[2]) {
675
15.6M
  const __m256i r0 = xy_y_round_avx2(r[0]);
676
15.6M
  const __m256i r1 = xy_y_round_avx2(r[1]);
677
15.6M
  return _mm256_packs_epi32(r0, r1);
678
15.6M
}
Unexecuted instantiation: convolve_avx2.c:xy_y_round_16_avx2
679
680
315k
static inline __m256i xy_y_round_half_pel_avx2(const __m256i src) {
681
315k
  const __m256i round = _mm256_set1_epi16(16);
682
315k
  const __m256i dst = _mm256_add_epi16(src, round);
683
315k
  return _mm256_srai_epi16(dst, 5);
684
315k
}
convolve_2d_avx2.c:xy_y_round_half_pel_avx2
Line
Count
Source
680
315k
static inline __m256i xy_y_round_half_pel_avx2(const __m256i src) {
681
315k
  const __m256i round = _mm256_set1_epi16(16);
682
315k
  const __m256i dst = _mm256_add_epi16(src, round);
683
315k
  return _mm256_srai_epi16(dst, 5);
684
315k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_round_half_pel_avx2
685
686
static inline void pack_store_2x2_sse2(const __m128i res, uint8_t *const dst,
687
730k
                                       const ptrdiff_t stride) {
688
730k
  const __m128i d = _mm_packus_epi16(res, res);
689
730k
  *(int16_t *)dst = (int16_t)_mm_cvtsi128_si32(d);
690
730k
  *(int16_t *)(dst + stride) = (int16_t)_mm_extract_epi16(d, 1);
691
730k
}
convolve_2d_avx2.c:pack_store_2x2_sse2
Line
Count
Source
687
413k
                                       const ptrdiff_t stride) {
688
413k
  const __m128i d = _mm_packus_epi16(res, res);
689
413k
  *(int16_t *)dst = (int16_t)_mm_cvtsi128_si32(d);
690
413k
  *(int16_t *)(dst + stride) = (int16_t)_mm_extract_epi16(d, 1);
691
413k
}
convolve_avx2.c:pack_store_2x2_sse2
Line
Count
Source
687
316k
                                       const ptrdiff_t stride) {
688
316k
  const __m128i d = _mm_packus_epi16(res, res);
689
316k
  *(int16_t *)dst = (int16_t)_mm_cvtsi128_si32(d);
690
316k
  *(int16_t *)(dst + stride) = (int16_t)_mm_extract_epi16(d, 1);
691
316k
}
692
693
static inline void pack_store_4x2_sse2(const __m128i res, uint8_t *const dst,
694
1.83M
                                       const ptrdiff_t stride) {
695
1.83M
  const __m128i d = _mm_packus_epi16(res, res);
696
1.83M
  store_u8_4x2_sse2(d, dst, stride);
697
1.83M
}
convolve_2d_avx2.c:pack_store_4x2_sse2
Line
Count
Source
694
79.8k
                                       const ptrdiff_t stride) {
695
79.8k
  const __m128i d = _mm_packus_epi16(res, res);
696
79.8k
  store_u8_4x2_sse2(d, dst, stride);
697
79.8k
}
convolve_avx2.c:pack_store_4x2_sse2
Line
Count
Source
694
1.75M
                                       const ptrdiff_t stride) {
695
1.75M
  const __m128i d = _mm_packus_epi16(res, res);
696
1.75M
  store_u8_4x2_sse2(d, dst, stride);
697
1.75M
}
698
699
static inline void pack_store_4x2_avx2(const __m256i res, uint8_t *const dst,
700
1.98M
                                       const ptrdiff_t stride) {
701
1.98M
  const __m256i d = _mm256_packus_epi16(res, res);
702
1.98M
  const __m128i d0 = _mm256_castsi256_si128(d);
703
1.98M
  const __m128i d1 = _mm256_extracti128_si256(d, 1);
704
705
1.98M
  xx_storel_32(dst, d0);
706
1.98M
  xx_storel_32(dst + stride, d1);
707
1.98M
}
convolve_2d_avx2.c:pack_store_4x2_avx2
Line
Count
Source
700
1.98M
                                       const ptrdiff_t stride) {
701
1.98M
  const __m256i d = _mm256_packus_epi16(res, res);
702
1.98M
  const __m128i d0 = _mm256_castsi256_si128(d);
703
1.98M
  const __m128i d1 = _mm256_extracti128_si256(d, 1);
704
705
1.98M
  xx_storel_32(dst, d0);
706
1.98M
  xx_storel_32(dst + stride, d1);
707
1.98M
}
Unexecuted instantiation: convolve_avx2.c:pack_store_4x2_avx2
708
709
static inline void pack_store_8x2_avx2(const __m256i res, uint8_t *const dst,
710
4.26M
                                       const ptrdiff_t stride) {
711
4.26M
  const __m256i d = _mm256_packus_epi16(res, res);
712
4.26M
  const __m128i d0 = _mm256_castsi256_si128(d);
713
4.26M
  const __m128i d1 = _mm256_extracti128_si256(d, 1);
714
4.26M
  _mm_storel_epi64((__m128i *)dst, d0);
715
4.26M
  _mm_storel_epi64((__m128i *)(dst + stride), d1);
716
4.26M
}
convolve_2d_avx2.c:pack_store_8x2_avx2
Line
Count
Source
710
2.30M
                                       const ptrdiff_t stride) {
711
2.30M
  const __m256i d = _mm256_packus_epi16(res, res);
712
2.30M
  const __m128i d0 = _mm256_castsi256_si128(d);
713
2.30M
  const __m128i d1 = _mm256_extracti128_si256(d, 1);
714
2.30M
  _mm_storel_epi64((__m128i *)dst, d0);
715
2.30M
  _mm_storel_epi64((__m128i *)(dst + stride), d1);
716
2.30M
}
convolve_avx2.c:pack_store_8x2_avx2
Line
Count
Source
710
1.96M
                                       const ptrdiff_t stride) {
711
1.96M
  const __m256i d = _mm256_packus_epi16(res, res);
712
1.96M
  const __m128i d0 = _mm256_castsi256_si128(d);
713
1.96M
  const __m128i d1 = _mm256_extracti128_si256(d, 1);
714
1.96M
  _mm_storel_epi64((__m128i *)dst, d0);
715
1.96M
  _mm_storel_epi64((__m128i *)(dst + stride), d1);
716
1.96M
}
717
718
static inline void pack_store_16x2_avx2(const __m256i res0, const __m256i res1,
719
                                        uint8_t *const dst,
720
1.64M
                                        const ptrdiff_t stride) {
721
1.64M
  const __m256i d = _mm256_packus_epi16(res0, res1);
722
1.64M
  storeu_u8_16x2_avx2(d, dst, stride);
723
1.64M
}
Unexecuted instantiation: convolve_2d_avx2.c:pack_store_16x2_avx2
convolve_avx2.c:pack_store_16x2_avx2
Line
Count
Source
720
1.64M
                                        const ptrdiff_t stride) {
721
1.64M
  const __m256i d = _mm256_packus_epi16(res0, res1);
722
1.64M
  storeu_u8_16x2_avx2(d, dst, stride);
723
1.64M
}
724
725
static inline void xy_y_pack_store_16x2_avx2(const __m256i res0,
726
                                             const __m256i res1,
727
                                             uint8_t *const dst,
728
1.71M
                                             const ptrdiff_t stride) {
729
1.71M
  const __m256i t = _mm256_packus_epi16(res0, res1);
730
1.71M
  const __m256i d = _mm256_permute4x64_epi64(t, 0xD8);
731
1.71M
  storeu_u8_16x2_avx2(d, dst, stride);
732
1.71M
}
convolve_2d_avx2.c:xy_y_pack_store_16x2_avx2
Line
Count
Source
728
1.71M
                                             const ptrdiff_t stride) {
729
1.71M
  const __m256i t = _mm256_packus_epi16(res0, res1);
730
1.71M
  const __m256i d = _mm256_permute4x64_epi64(t, 0xD8);
731
1.71M
  storeu_u8_16x2_avx2(d, dst, stride);
732
1.71M
}
Unexecuted instantiation: convolve_avx2.c:xy_y_pack_store_16x2_avx2
733
734
static inline void pack_store_32_avx2(const __m256i res0, const __m256i res1,
735
0
                                      uint8_t *const dst) {
736
0
  const __m256i t = _mm256_packus_epi16(res0, res1);
737
0
  const __m256i d = _mm256_permute4x64_epi64(t, 0xD8);
738
0
  _mm256_storeu_si256((__m256i *)dst, d);
739
0
}
Unexecuted instantiation: convolve_2d_avx2.c:pack_store_32_avx2
Unexecuted instantiation: convolve_avx2.c:pack_store_32_avx2
740
741
static inline void xy_y_round_store_2x2_sse2(const __m128i res,
742
                                             uint8_t *const dst,
743
410k
                                             const ptrdiff_t stride) {
744
410k
  const __m128i r = xy_y_round_sse2(res);
745
410k
  const __m128i rr = _mm_packs_epi32(r, r);
746
410k
  pack_store_2x2_sse2(rr, dst, stride);
747
410k
}
convolve_2d_avx2.c:xy_y_round_store_2x2_sse2
Line
Count
Source
743
410k
                                             const ptrdiff_t stride) {
744
410k
  const __m128i r = xy_y_round_sse2(res);
745
410k
  const __m128i rr = _mm_packs_epi32(r, r);
746
410k
  pack_store_2x2_sse2(rr, dst, stride);
747
410k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_round_store_2x2_sse2
748
749
static inline void xy_y_round_store_4x2_avx2(const __m256i res,
750
                                             uint8_t *const dst,
751
1.98M
                                             const ptrdiff_t stride) {
752
1.98M
  const __m256i r = xy_y_round_avx2(res);
753
1.98M
  const __m256i rr = _mm256_packs_epi32(r, r);
754
1.98M
  pack_store_4x2_avx2(rr, dst, stride);
755
1.98M
}
convolve_2d_avx2.c:xy_y_round_store_4x2_avx2
Line
Count
Source
751
1.98M
                                             const ptrdiff_t stride) {
752
1.98M
  const __m256i r = xy_y_round_avx2(res);
753
1.98M
  const __m256i rr = _mm256_packs_epi32(r, r);
754
1.98M
  pack_store_4x2_avx2(rr, dst, stride);
755
1.98M
}
Unexecuted instantiation: convolve_avx2.c:xy_y_round_store_4x2_avx2
756
757
static inline void xy_y_pack_store_32_avx2(const __m256i res0,
758
                                           const __m256i res1,
759
5.14M
                                           uint8_t *const dst) {
760
5.14M
  const __m256i d = _mm256_packus_epi16(res0, res1);
761
  // d = _mm256_permute4x64_epi64(d, 0xD8);
762
5.14M
  _mm256_storeu_si256((__m256i *)dst, d);
763
5.14M
}
convolve_2d_avx2.c:xy_y_pack_store_32_avx2
Line
Count
Source
759
5.14M
                                           uint8_t *const dst) {
760
5.14M
  const __m256i d = _mm256_packus_epi16(res0, res1);
761
  // d = _mm256_permute4x64_epi64(d, 0xD8);
762
5.14M
  _mm256_storeu_si256((__m256i *)dst, d);
763
5.14M
}
Unexecuted instantiation: convolve_avx2.c:xy_y_pack_store_32_avx2
764
765
static inline void xy_y_round_store_32_avx2(const __m256i r0[2],
766
                                            const __m256i r1[2],
767
5.01M
                                            uint8_t *const dst) {
768
5.01M
  const __m256i ra = xy_y_round_16_avx2(r0);
769
5.01M
  const __m256i rb = xy_y_round_16_avx2(r1);
770
5.01M
  xy_y_pack_store_32_avx2(ra, rb, dst);
771
5.01M
}
convolve_2d_avx2.c:xy_y_round_store_32_avx2
Line
Count
Source
767
5.01M
                                            uint8_t *const dst) {
768
5.01M
  const __m256i ra = xy_y_round_16_avx2(r0);
769
5.01M
  const __m256i rb = xy_y_round_16_avx2(r1);
770
5.01M
  xy_y_pack_store_32_avx2(ra, rb, dst);
771
5.01M
}
Unexecuted instantiation: convolve_avx2.c:xy_y_round_store_32_avx2
772
773
static inline void convolve_store_32_avx2(const __m256i res0,
774
                                          const __m256i res1,
775
4.34M
                                          uint8_t *const dst) {
776
4.34M
  const __m256i d = _mm256_packus_epi16(res0, res1);
777
4.34M
  _mm256_storeu_si256((__m256i *)dst, d);
778
4.34M
}
Unexecuted instantiation: convolve_2d_avx2.c:convolve_store_32_avx2
convolve_avx2.c:convolve_store_32_avx2
Line
Count
Source
775
4.34M
                                          uint8_t *const dst) {
776
4.34M
  const __m256i d = _mm256_packus_epi16(res0, res1);
777
4.34M
  _mm256_storeu_si256((__m256i *)dst, d);
778
4.34M
}
779
780
1.16M
static inline __m128i sr_x_round_sse2(const __m128i src) {
781
1.16M
  const __m128i round = _mm_set1_epi16(34);
782
1.16M
  const __m128i dst = _mm_add_epi16(src, round);
783
1.16M
  return _mm_srai_epi16(dst, 6);
784
1.16M
}
Unexecuted instantiation: convolve_2d_avx2.c:sr_x_round_sse2
convolve_avx2.c:sr_x_round_sse2
Line
Count
Source
780
1.16M
static inline __m128i sr_x_round_sse2(const __m128i src) {
781
1.16M
  const __m128i round = _mm_set1_epi16(34);
782
1.16M
  const __m128i dst = _mm_add_epi16(src, round);
783
1.16M
  return _mm_srai_epi16(dst, 6);
784
1.16M
}
785
786
7.66M
static inline __m256i sr_x_round_avx2(const __m256i src) {
787
7.66M
  const __m256i round = _mm256_set1_epi16(34);
788
7.66M
  const __m256i dst = _mm256_add_epi16(src, round);
789
7.66M
  return _mm256_srai_epi16(dst, 6);
790
7.66M
}
Unexecuted instantiation: convolve_2d_avx2.c:sr_x_round_avx2
convolve_avx2.c:sr_x_round_avx2
Line
Count
Source
786
7.66M
static inline __m256i sr_x_round_avx2(const __m256i src) {
787
7.66M
  const __m256i round = _mm256_set1_epi16(34);
788
7.66M
  const __m256i dst = _mm256_add_epi16(src, round);
789
7.66M
  return _mm256_srai_epi16(dst, 6);
790
7.66M
}
791
792
1.01M
static inline __m128i sr_y_round_sse2(const __m128i src) {
793
1.01M
  const __m128i round = _mm_set1_epi16(32);
794
1.01M
  const __m128i dst = _mm_add_epi16(src, round);
795
1.01M
  return _mm_srai_epi16(dst, FILTER_BITS - 1);
796
1.01M
}
Unexecuted instantiation: convolve_2d_avx2.c:sr_y_round_sse2
convolve_avx2.c:sr_y_round_sse2
Line
Count
Source
792
1.01M
static inline __m128i sr_y_round_sse2(const __m128i src) {
793
1.01M
  const __m128i round = _mm_set1_epi16(32);
794
1.01M
  const __m128i dst = _mm_add_epi16(src, round);
795
1.01M
  return _mm_srai_epi16(dst, FILTER_BITS - 1);
796
1.01M
}
797
798
static inline void sr_x_round_store_8x2_avx2(const __m256i res,
799
                                             uint8_t *const dst,
800
997k
                                             const ptrdiff_t dst_stride) {
801
997k
  const __m256i r = sr_x_round_avx2(res);
802
997k
  pack_store_8x2_avx2(r, dst, dst_stride);
803
997k
}
Unexecuted instantiation: convolve_2d_avx2.c:sr_x_round_store_8x2_avx2
convolve_avx2.c:sr_x_round_store_8x2_avx2
Line
Count
Source
800
997k
                                             const ptrdiff_t dst_stride) {
801
997k
  const __m256i r = sr_x_round_avx2(res);
802
997k
  pack_store_8x2_avx2(r, dst, dst_stride);
803
997k
}
804
805
static inline void sr_x_round_store_16x2_avx2(const __m256i res[2],
806
                                              uint8_t *const dst,
807
842k
                                              const ptrdiff_t dst_stride) {
808
842k
  __m256i r[2];
809
810
842k
  r[0] = sr_x_round_avx2(res[0]);
811
842k
  r[1] = sr_x_round_avx2(res[1]);
812
842k
  pack_store_16x2_avx2(r[0], r[1], dst, dst_stride);
813
842k
}
Unexecuted instantiation: convolve_2d_avx2.c:sr_x_round_store_16x2_avx2
convolve_avx2.c:sr_x_round_store_16x2_avx2
Line
Count
Source
807
842k
                                              const ptrdiff_t dst_stride) {
808
842k
  __m256i r[2];
809
810
842k
  r[0] = sr_x_round_avx2(res[0]);
811
842k
  r[1] = sr_x_round_avx2(res[1]);
812
842k
  pack_store_16x2_avx2(r[0], r[1], dst, dst_stride);
813
842k
}
814
815
static inline void sr_x_round_store_32_avx2(const __m256i res[2],
816
2.18M
                                            uint8_t *const dst) {
817
2.18M
  __m256i r[2];
818
819
2.18M
  r[0] = sr_x_round_avx2(res[0]);
820
2.18M
  r[1] = sr_x_round_avx2(res[1]);
821
2.18M
  convolve_store_32_avx2(r[0], r[1], dst);
822
2.18M
}
Unexecuted instantiation: convolve_2d_avx2.c:sr_x_round_store_32_avx2
convolve_avx2.c:sr_x_round_store_32_avx2
Line
Count
Source
816
2.18M
                                            uint8_t *const dst) {
817
2.18M
  __m256i r[2];
818
819
2.18M
  r[0] = sr_x_round_avx2(res[0]);
820
2.18M
  r[1] = sr_x_round_avx2(res[1]);
821
2.18M
  convolve_store_32_avx2(r[0], r[1], dst);
822
2.18M
}
823
824
static inline void sr_y_round_store_8x2_avx2(const __m256i res,
825
                                             uint8_t *const dst,
826
965k
                                             const ptrdiff_t dst_stride) {
827
965k
  const __m256i r = sr_y_round_avx2(res);
828
965k
  pack_store_8x2_avx2(r, dst, dst_stride);
829
965k
}
Unexecuted instantiation: convolve_2d_avx2.c:sr_y_round_store_8x2_avx2
convolve_avx2.c:sr_y_round_store_8x2_avx2
Line
Count
Source
826
965k
                                             const ptrdiff_t dst_stride) {
827
965k
  const __m256i r = sr_y_round_avx2(res);
828
965k
  pack_store_8x2_avx2(r, dst, dst_stride);
829
965k
}
830
831
static inline void sr_y_round_store_16x2_avx2(const __m256i res[2],
832
                                              uint8_t *const dst,
833
803k
                                              const ptrdiff_t dst_stride) {
834
803k
  __m256i r[2];
835
836
803k
  r[0] = sr_y_round_avx2(res[0]);
837
803k
  r[1] = sr_y_round_avx2(res[1]);
838
803k
  pack_store_16x2_avx2(r[0], r[1], dst, dst_stride);
839
803k
}
Unexecuted instantiation: convolve_2d_avx2.c:sr_y_round_store_16x2_avx2
convolve_avx2.c:sr_y_round_store_16x2_avx2
Line
Count
Source
833
803k
                                              const ptrdiff_t dst_stride) {
834
803k
  __m256i r[2];
835
836
803k
  r[0] = sr_y_round_avx2(res[0]);
837
803k
  r[1] = sr_y_round_avx2(res[1]);
838
803k
  pack_store_16x2_avx2(r[0], r[1], dst, dst_stride);
839
803k
}
840
841
static inline void sr_y_2tap_32_avg_avx2(const uint8_t *const src,
842
                                         const __m256i s0, __m256i *const s1,
843
125k
                                         uint8_t *const dst) {
844
125k
  *s1 = _mm256_loadu_si256((__m256i *)src);
845
125k
  const __m256i d = _mm256_avg_epu8(s0, *s1);
846
125k
  _mm256_storeu_si256((__m256i *)dst, d);
847
125k
}
Unexecuted instantiation: convolve_2d_avx2.c:sr_y_2tap_32_avg_avx2
convolve_avx2.c:sr_y_2tap_32_avg_avx2
Line
Count
Source
843
125k
                                         uint8_t *const dst) {
844
125k
  *s1 = _mm256_loadu_si256((__m256i *)src);
845
125k
  const __m256i d = _mm256_avg_epu8(s0, *s1);
846
125k
  _mm256_storeu_si256((__m256i *)dst, d);
847
125k
}
848
849
static inline void sr_x_2tap_32_avg_avx2(const uint8_t *const src,
850
135k
                                         uint8_t *const dst) {
851
135k
  const __m256i s0 = _mm256_loadu_si256((__m256i *)src);
852
135k
  const __m256i s1 = _mm256_loadu_si256((__m256i *)(src + 1));
853
135k
  const __m256i d = _mm256_avg_epu8(s0, s1);
854
135k
  _mm256_storeu_si256((__m256i *)dst, d);
855
135k
}
Unexecuted instantiation: convolve_2d_avx2.c:sr_x_2tap_32_avg_avx2
convolve_avx2.c:sr_x_2tap_32_avg_avx2
Line
Count
Source
850
135k
                                         uint8_t *const dst) {
851
135k
  const __m256i s0 = _mm256_loadu_si256((__m256i *)src);
852
135k
  const __m256i s1 = _mm256_loadu_si256((__m256i *)(src + 1));
853
135k
  const __m256i d = _mm256_avg_epu8(s0, s1);
854
135k
  _mm256_storeu_si256((__m256i *)dst, d);
855
135k
}
856
857
static inline __m128i x_convolve_2tap_2x2_sse4_1(const uint8_t *const src,
858
                                                 const ptrdiff_t stride,
859
30.5k
                                                 const __m128i coeffs[1]) {
860
30.5k
  const __m128i sfl =
861
30.5k
      _mm_setr_epi8(0, 1, 1, 2, 4, 5, 5, 6, 0, 0, 0, 0, 0, 0, 0, 0);
862
30.5k
  const __m128i s_128 = load_u8_4x2_sse4_1(src, stride);
863
30.5k
  const __m128i ss = _mm_shuffle_epi8(s_128, sfl);
864
30.5k
  return convolve_2tap_ssse3(&ss, coeffs);
865
30.5k
}
convolve_2d_avx2.c:x_convolve_2tap_2x2_sse4_1
Line
Count
Source
859
24.6k
                                                 const __m128i coeffs[1]) {
860
24.6k
  const __m128i sfl =
861
24.6k
      _mm_setr_epi8(0, 1, 1, 2, 4, 5, 5, 6, 0, 0, 0, 0, 0, 0, 0, 0);
862
24.6k
  const __m128i s_128 = load_u8_4x2_sse4_1(src, stride);
863
24.6k
  const __m128i ss = _mm_shuffle_epi8(s_128, sfl);
864
24.6k
  return convolve_2tap_ssse3(&ss, coeffs);
865
24.6k
}
convolve_avx2.c:x_convolve_2tap_2x2_sse4_1
Line
Count
Source
859
5.93k
                                                 const __m128i coeffs[1]) {
860
5.93k
  const __m128i sfl =
861
5.93k
      _mm_setr_epi8(0, 1, 1, 2, 4, 5, 5, 6, 0, 0, 0, 0, 0, 0, 0, 0);
862
5.93k
  const __m128i s_128 = load_u8_4x2_sse4_1(src, stride);
863
5.93k
  const __m128i ss = _mm_shuffle_epi8(s_128, sfl);
864
5.93k
  return convolve_2tap_ssse3(&ss, coeffs);
865
5.93k
}
866
867
static inline __m128i x_convolve_2tap_4x2_ssse3(const uint8_t *const src,
868
                                                const ptrdiff_t stride,
869
137k
                                                const __m128i coeffs[1]) {
870
137k
  const __m128i sfl =
871
137k
      _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12);
872
137k
  const __m128i s_128 = load_u8_8x2_sse2(src, stride);
873
137k
  const __m128i ss = _mm_shuffle_epi8(s_128, sfl);
874
137k
  return convolve_2tap_ssse3(&ss, coeffs);
875
137k
}
convolve_2d_avx2.c:x_convolve_2tap_4x2_ssse3
Line
Count
Source
869
106k
                                                const __m128i coeffs[1]) {
870
106k
  const __m128i sfl =
871
106k
      _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12);
872
106k
  const __m128i s_128 = load_u8_8x2_sse2(src, stride);
873
106k
  const __m128i ss = _mm_shuffle_epi8(s_128, sfl);
874
106k
  return convolve_2tap_ssse3(&ss, coeffs);
875
106k
}
convolve_avx2.c:x_convolve_2tap_4x2_ssse3
Line
Count
Source
869
30.9k
                                                const __m128i coeffs[1]) {
870
30.9k
  const __m128i sfl =
871
30.9k
      _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12);
872
30.9k
  const __m128i s_128 = load_u8_8x2_sse2(src, stride);
873
30.9k
  const __m128i ss = _mm_shuffle_epi8(s_128, sfl);
874
30.9k
  return convolve_2tap_ssse3(&ss, coeffs);
875
30.9k
}
876
877
static inline void x_convolve_2tap_8x2_ssse3(const uint8_t *const src,
878
                                             const ptrdiff_t stride,
879
                                             const __m128i coeffs[1],
880
121k
                                             __m128i r[2]) {
881
121k
  __m128i ss[2];
882
121k
  const __m128i s00 = _mm_loadu_si128((__m128i *)src);
883
121k
  const __m128i s10 = _mm_loadu_si128((__m128i *)(src + stride));
884
121k
  const __m128i s01 = _mm_srli_si128(s00, 1);
885
121k
  const __m128i s11 = _mm_srli_si128(s10, 1);
886
121k
  ss[0] = _mm_unpacklo_epi8(s00, s01);
887
121k
  ss[1] = _mm_unpacklo_epi8(s10, s11);
888
889
121k
  r[0] = convolve_2tap_ssse3(&ss[0], coeffs);
890
121k
  r[1] = convolve_2tap_ssse3(&ss[1], coeffs);
891
121k
}
convolve_2d_avx2.c:x_convolve_2tap_8x2_ssse3
Line
Count
Source
880
90.6k
                                             __m128i r[2]) {
881
90.6k
  __m128i ss[2];
882
90.6k
  const __m128i s00 = _mm_loadu_si128((__m128i *)src);
883
90.6k
  const __m128i s10 = _mm_loadu_si128((__m128i *)(src + stride));
884
90.6k
  const __m128i s01 = _mm_srli_si128(s00, 1);
885
90.6k
  const __m128i s11 = _mm_srli_si128(s10, 1);
886
90.6k
  ss[0] = _mm_unpacklo_epi8(s00, s01);
887
90.6k
  ss[1] = _mm_unpacklo_epi8(s10, s11);
888
889
90.6k
  r[0] = convolve_2tap_ssse3(&ss[0], coeffs);
890
90.6k
  r[1] = convolve_2tap_ssse3(&ss[1], coeffs);
891
90.6k
}
convolve_avx2.c:x_convolve_2tap_8x2_ssse3
Line
Count
Source
880
31.1k
                                             __m128i r[2]) {
881
31.1k
  __m128i ss[2];
882
31.1k
  const __m128i s00 = _mm_loadu_si128((__m128i *)src);
883
31.1k
  const __m128i s10 = _mm_loadu_si128((__m128i *)(src + stride));
884
31.1k
  const __m128i s01 = _mm_srli_si128(s00, 1);
885
31.1k
  const __m128i s11 = _mm_srli_si128(s10, 1);
886
31.1k
  ss[0] = _mm_unpacklo_epi8(s00, s01);
887
31.1k
  ss[1] = _mm_unpacklo_epi8(s10, s11);
888
889
31.1k
  r[0] = convolve_2tap_ssse3(&ss[0], coeffs);
890
31.1k
  r[1] = convolve_2tap_ssse3(&ss[1], coeffs);
891
31.1k
}
892
893
static inline __m256i x_convolve_2tap_8x2_avx2(const uint8_t *const src,
894
                                               const ptrdiff_t stride,
895
0
                                               const __m256i coeffs[1]) {
896
0
  __m128i s_128[2][2];
897
0
  __m256i s_256[2];
898
0
899
0
  s_128[0][0] = _mm_loadu_si128((__m128i *)src);
900
0
  s_128[1][0] = _mm_loadu_si128((__m128i *)(src + stride));
901
0
  s_128[0][1] = _mm_srli_si128(s_128[0][0], 1);
902
0
  s_128[1][1] = _mm_srli_si128(s_128[1][0], 1);
903
0
  s_256[0] = _mm256_setr_m128i(s_128[0][0], s_128[1][0]);
904
0
  s_256[1] = _mm256_setr_m128i(s_128[0][1], s_128[1][1]);
905
0
  const __m256i ss = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
906
0
  return convolve_2tap_avx2(&ss, coeffs);
907
0
}
Unexecuted instantiation: convolve_2d_avx2.c:x_convolve_2tap_8x2_avx2
Unexecuted instantiation: convolve_avx2.c:x_convolve_2tap_8x2_avx2
908
909
static inline void x_convolve_2tap_16x2_avx2(const uint8_t *const src,
910
                                             const ptrdiff_t stride,
911
                                             const __m256i coeffs[1],
912
81.9k
                                             __m256i r[2]) {
913
81.9k
  const __m256i s0_256 = loadu_8bit_16x2_avx2(src, stride);
914
81.9k
  const __m256i s1_256 = loadu_8bit_16x2_avx2(src + 1, stride);
915
81.9k
  const __m256i s0 = _mm256_unpacklo_epi8(s0_256, s1_256);
916
81.9k
  const __m256i s1 = _mm256_unpackhi_epi8(s0_256, s1_256);
917
81.9k
  r[0] = convolve_2tap_avx2(&s0, coeffs);
918
81.9k
  r[1] = convolve_2tap_avx2(&s1, coeffs);
919
81.9k
}
convolve_2d_avx2.c:x_convolve_2tap_16x2_avx2
Line
Count
Source
912
56.7k
                                             __m256i r[2]) {
913
56.7k
  const __m256i s0_256 = loadu_8bit_16x2_avx2(src, stride);
914
56.7k
  const __m256i s1_256 = loadu_8bit_16x2_avx2(src + 1, stride);
915
56.7k
  const __m256i s0 = _mm256_unpacklo_epi8(s0_256, s1_256);
916
56.7k
  const __m256i s1 = _mm256_unpackhi_epi8(s0_256, s1_256);
917
56.7k
  r[0] = convolve_2tap_avx2(&s0, coeffs);
918
56.7k
  r[1] = convolve_2tap_avx2(&s1, coeffs);
919
56.7k
}
convolve_avx2.c:x_convolve_2tap_16x2_avx2
Line
Count
Source
912
25.2k
                                             __m256i r[2]) {
913
25.2k
  const __m256i s0_256 = loadu_8bit_16x2_avx2(src, stride);
914
25.2k
  const __m256i s1_256 = loadu_8bit_16x2_avx2(src + 1, stride);
915
25.2k
  const __m256i s0 = _mm256_unpacklo_epi8(s0_256, s1_256);
916
25.2k
  const __m256i s1 = _mm256_unpackhi_epi8(s0_256, s1_256);
917
25.2k
  r[0] = convolve_2tap_avx2(&s0, coeffs);
918
25.2k
  r[1] = convolve_2tap_avx2(&s1, coeffs);
919
25.2k
}
920
921
static inline void x_convolve_2tap_32_avx2(const uint8_t *const src,
922
                                           const __m256i coeffs[1],
923
232k
                                           __m256i r[2]) {
924
232k
  const __m256i s0 = _mm256_loadu_si256((__m256i *)src);
925
232k
  const __m256i s1 = _mm256_loadu_si256((__m256i *)(src + 1));
926
232k
  const __m256i ss0 = _mm256_unpacklo_epi8(s0, s1);
927
232k
  const __m256i ss1 = _mm256_unpackhi_epi8(s0, s1);
928
929
232k
  r[0] = convolve_2tap_avx2(&ss0, coeffs);
930
232k
  r[1] = convolve_2tap_avx2(&ss1, coeffs);
931
232k
}
Unexecuted instantiation: convolve_2d_avx2.c:x_convolve_2tap_32_avx2
convolve_avx2.c:x_convolve_2tap_32_avx2
Line
Count
Source
923
232k
                                           __m256i r[2]) {
924
232k
  const __m256i s0 = _mm256_loadu_si256((__m256i *)src);
925
232k
  const __m256i s1 = _mm256_loadu_si256((__m256i *)(src + 1));
926
232k
  const __m256i ss0 = _mm256_unpacklo_epi8(s0, s1);
927
232k
  const __m256i ss1 = _mm256_unpackhi_epi8(s0, s1);
928
929
232k
  r[0] = convolve_2tap_avx2(&ss0, coeffs);
930
232k
  r[1] = convolve_2tap_avx2(&ss1, coeffs);
931
232k
}
932
933
static inline __m128i x_convolve_4tap_2x2_ssse3(const uint8_t *const src,
934
                                                const ptrdiff_t stride,
935
934k
                                                const __m128i coeffs[2]) {
936
934k
  const __m128i sfl0 =
937
934k
      _mm_setr_epi8(0, 1, 1, 2, 8, 9, 9, 10, 0, 0, 0, 0, 0, 0, 0, 0);
938
934k
  const __m128i sfl1 =
939
934k
      _mm_setr_epi8(2, 3, 3, 4, 10, 11, 11, 12, 0, 0, 0, 0, 0, 0, 0, 0);
940
934k
  const __m128i s = load_u8_8x2_sse2(src, stride);
941
934k
  __m128i ss[2];
942
943
934k
  ss[0] = _mm_shuffle_epi8(s, sfl0);
944
934k
  ss[1] = _mm_shuffle_epi8(s, sfl1);
945
934k
  return convolve_4tap_ssse3(ss, coeffs);
946
934k
}
convolve_2d_avx2.c:x_convolve_4tap_2x2_ssse3
Line
Count
Source
935
760k
                                                const __m128i coeffs[2]) {
936
760k
  const __m128i sfl0 =
937
760k
      _mm_setr_epi8(0, 1, 1, 2, 8, 9, 9, 10, 0, 0, 0, 0, 0, 0, 0, 0);
938
760k
  const __m128i sfl1 =
939
760k
      _mm_setr_epi8(2, 3, 3, 4, 10, 11, 11, 12, 0, 0, 0, 0, 0, 0, 0, 0);
940
760k
  const __m128i s = load_u8_8x2_sse2(src, stride);
941
760k
  __m128i ss[2];
942
943
760k
  ss[0] = _mm_shuffle_epi8(s, sfl0);
944
760k
  ss[1] = _mm_shuffle_epi8(s, sfl1);
945
760k
  return convolve_4tap_ssse3(ss, coeffs);
946
760k
}
convolve_avx2.c:x_convolve_4tap_2x2_ssse3
Line
Count
Source
935
173k
                                                const __m128i coeffs[2]) {
936
173k
  const __m128i sfl0 =
937
173k
      _mm_setr_epi8(0, 1, 1, 2, 8, 9, 9, 10, 0, 0, 0, 0, 0, 0, 0, 0);
938
173k
  const __m128i sfl1 =
939
173k
      _mm_setr_epi8(2, 3, 3, 4, 10, 11, 11, 12, 0, 0, 0, 0, 0, 0, 0, 0);
940
173k
  const __m128i s = load_u8_8x2_sse2(src, stride);
941
173k
  __m128i ss[2];
942
943
173k
  ss[0] = _mm_shuffle_epi8(s, sfl0);
944
173k
  ss[1] = _mm_shuffle_epi8(s, sfl1);
945
173k
  return convolve_4tap_ssse3(ss, coeffs);
946
173k
}
947
948
static inline __m128i x_convolve_4tap_4x2_ssse3(const uint8_t *const src,
949
                                                const ptrdiff_t stride,
950
4.31M
                                                const __m128i coeffs[2]) {
951
4.31M
  const __m128i s = load_u8_8x2_sse2(src, stride);
952
4.31M
  const __m128i sfl0 =
953
4.31M
      _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12);
954
4.31M
  const __m128i sfl1 =
955
4.31M
      _mm_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 10, 11, 11, 12, 12, 13, 13, 14);
956
4.31M
  __m128i ss[2];
957
958
4.31M
  ss[0] = _mm_shuffle_epi8(s, sfl0);
959
4.31M
  ss[1] = _mm_shuffle_epi8(s, sfl1);
960
4.31M
  return convolve_4tap_ssse3(ss, coeffs);
961
4.31M
}
convolve_2d_avx2.c:x_convolve_4tap_4x2_ssse3
Line
Count
Source
950
3.42M
                                                const __m128i coeffs[2]) {
951
3.42M
  const __m128i s = load_u8_8x2_sse2(src, stride);
952
3.42M
  const __m128i sfl0 =
953
3.42M
      _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12);
954
3.42M
  const __m128i sfl1 =
955
3.42M
      _mm_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 10, 11, 11, 12, 12, 13, 13, 14);
956
3.42M
  __m128i ss[2];
957
958
3.42M
  ss[0] = _mm_shuffle_epi8(s, sfl0);
959
3.42M
  ss[1] = _mm_shuffle_epi8(s, sfl1);
960
3.42M
  return convolve_4tap_ssse3(ss, coeffs);
961
3.42M
}
convolve_avx2.c:x_convolve_4tap_4x2_ssse3
Line
Count
Source
950
891k
                                                const __m128i coeffs[2]) {
951
891k
  const __m128i s = load_u8_8x2_sse2(src, stride);
952
891k
  const __m128i sfl0 =
953
891k
      _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12);
954
891k
  const __m128i sfl1 =
955
891k
      _mm_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 10, 11, 11, 12, 12, 13, 13, 14);
956
891k
  __m128i ss[2];
957
958
891k
  ss[0] = _mm_shuffle_epi8(s, sfl0);
959
891k
  ss[1] = _mm_shuffle_epi8(s, sfl1);
960
891k
  return convolve_4tap_ssse3(ss, coeffs);
961
891k
}
962
963
static inline __m256i x_convolve_4tap_8x2_avx2(const uint8_t *const src,
964
                                               const ptrdiff_t stride,
965
                                               const __m256i coeffs[2],
966
483k
                                               const __m256i filt[2]) {
967
483k
  const __m256i s_256 = loadu_8bit_16x2_avx2(src, stride);
968
483k
  return x_convolve_4tap_avx2(s_256, coeffs, filt);
969
483k
}
convolve_2d_avx2.c:x_convolve_4tap_8x2_avx2
Line
Count
Source
966
483k
                                               const __m256i filt[2]) {
967
483k
  const __m256i s_256 = loadu_8bit_16x2_avx2(src, stride);
968
483k
  return x_convolve_4tap_avx2(s_256, coeffs, filt);
969
483k
}
Unexecuted instantiation: convolve_avx2.c:x_convolve_4tap_8x2_avx2
970
971
static inline void x_convolve_4tap_16x2_avx2(const uint8_t *const src,
972
                                             const int32_t src_stride,
973
                                             const __m256i coeffs[2],
974
                                             const __m256i filt[2],
975
141k
                                             __m256i r[2]) {
976
141k
  r[0] = x_convolve_4tap_8x2_avx2(src + 0, src_stride, coeffs, filt);
977
141k
  r[1] = x_convolve_4tap_8x2_avx2(src + 8, src_stride, coeffs, filt);
978
141k
}
convolve_2d_avx2.c:x_convolve_4tap_16x2_avx2
Line
Count
Source
975
141k
                                             __m256i r[2]) {
976
141k
  r[0] = x_convolve_4tap_8x2_avx2(src + 0, src_stride, coeffs, filt);
977
141k
  r[1] = x_convolve_4tap_8x2_avx2(src + 8, src_stride, coeffs, filt);
978
141k
}
Unexecuted instantiation: convolve_avx2.c:x_convolve_4tap_16x2_avx2
979
980
static inline void x_convolve_4tap_32_avx2(const uint8_t *const src,
981
                                           const __m256i coeffs[2],
982
                                           const __m256i filt[2],
983
494k
                                           __m256i r[2]) {
984
494k
  const __m256i s0_256 = _mm256_loadu_si256((__m256i *)src);
985
494k
  const __m256i s1_256 = _mm256_loadu_si256((__m256i *)(src + 8));
986
987
494k
  r[0] = x_convolve_4tap_avx2(s0_256, coeffs, filt);
988
494k
  r[1] = x_convolve_4tap_avx2(s1_256, coeffs, filt);
989
494k
}
convolve_2d_avx2.c:x_convolve_4tap_32_avx2
Line
Count
Source
983
494k
                                           __m256i r[2]) {
984
494k
  const __m256i s0_256 = _mm256_loadu_si256((__m256i *)src);
985
494k
  const __m256i s1_256 = _mm256_loadu_si256((__m256i *)(src + 8));
986
987
494k
  r[0] = x_convolve_4tap_avx2(s0_256, coeffs, filt);
988
494k
  r[1] = x_convolve_4tap_avx2(s1_256, coeffs, filt);
989
494k
}
Unexecuted instantiation: convolve_avx2.c:x_convolve_4tap_32_avx2
990
991
static inline __m128i x_convolve_6tap_2x2_ssse3(const uint8_t *const src,
992
                                                const ptrdiff_t stride,
993
0
                                                const __m128i coeffs[3]) {
994
0
  const __m128i sfl0 =
995
0
      _mm_setr_epi8(0, 1, 1, 2, 8, 9, 9, 10, 0, 0, 0, 0, 0, 0, 0, 0);
996
0
  const __m128i sfl1 =
997
0
      _mm_setr_epi8(2, 3, 3, 4, 10, 11, 11, 12, 0, 0, 0, 0, 0, 0, 0, 0);
998
0
  const __m128i sfl2 =
999
0
      _mm_setr_epi8(4, 5, 5, 6, 12, 13, 13, 14, 0, 0, 0, 0, 0, 0, 0, 0);
1000
1001
0
  const __m128i s = load_u8_8x2_sse2(src, stride);
1002
0
  __m128i ss[3];
1003
1004
0
  ss[0] = _mm_shuffle_epi8(s, sfl0);
1005
0
  ss[1] = _mm_shuffle_epi8(s, sfl1);
1006
0
  ss[2] = _mm_shuffle_epi8(s, sfl2);
1007
0
  return convolve_6tap_ssse3(ss, coeffs);
1008
0
}
Unexecuted instantiation: convolve_2d_avx2.c:x_convolve_6tap_2x2_ssse3
Unexecuted instantiation: convolve_avx2.c:x_convolve_6tap_2x2_ssse3
1009
1010
static inline __m128i x_convolve_6tap_4x2_ssse3(const uint8_t *const src,
1011
                                                const ptrdiff_t stride,
1012
0
                                                const __m128i coeffs[3]) {
1013
0
  const __m128i s = load_u8_8x2_sse2(src, stride);
1014
0
  const __m128i sfl0 =
1015
0
      _mm_setr_epi8(0, 1, 1, 2, 8, 9, 9, 10, 0, 0, 0, 0, 0, 0, 0, 0);
1016
0
  const __m128i sfl1 =
1017
0
      _mm_setr_epi8(2, 3, 3, 4, 10, 11, 11, 12, 0, 0, 0, 0, 0, 0, 0, 0);
1018
0
  const __m128i sfl2 =
1019
0
      _mm_setr_epi8(4, 5, 5, 6, 12, 13, 13, 14, 0, 0, 0, 0, 0, 0, 0, 0);
1020
0
  __m128i ss[3];
1021
1022
0
  ss[0] = _mm_shuffle_epi8(s, sfl0);
1023
0
  ss[1] = _mm_shuffle_epi8(s, sfl1);
1024
0
  ss[2] = _mm_shuffle_epi8(s, sfl2);
1025
0
  return convolve_6tap_ssse3(ss, coeffs);
1026
0
}
Unexecuted instantiation: convolve_2d_avx2.c:x_convolve_6tap_4x2_ssse3
Unexecuted instantiation: convolve_avx2.c:x_convolve_6tap_4x2_ssse3
1027
1028
static inline __m256i x_convolve_6tap_8x2_avx2(const uint8_t *const src,
1029
                                               const ptrdiff_t stride,
1030
                                               const __m256i coeffs[3],
1031
10.3M
                                               const __m256i filt[3]) {
1032
10.3M
  const __m256i s_256 = loadu_8bit_16x2_avx2(src, stride);
1033
10.3M
  return x_convolve_6tap_avx2(s_256, coeffs, filt);
1034
10.3M
}
convolve_2d_avx2.c:x_convolve_6tap_8x2_avx2
Line
Count
Source
1031
7.82M
                                               const __m256i filt[3]) {
1032
7.82M
  const __m256i s_256 = loadu_8bit_16x2_avx2(src, stride);
1033
7.82M
  return x_convolve_6tap_avx2(s_256, coeffs, filt);
1034
7.82M
}
convolve_avx2.c:x_convolve_6tap_8x2_avx2
Line
Count
Source
1031
2.53M
                                               const __m256i filt[3]) {
1032
2.53M
  const __m256i s_256 = loadu_8bit_16x2_avx2(src, stride);
1033
2.53M
  return x_convolve_6tap_avx2(s_256, coeffs, filt);
1034
2.53M
}
1035
1036
static inline void x_convolve_6tap_16x2_avx2(const uint8_t *const src,
1037
                                             const int32_t src_stride,
1038
                                             const __m256i coeffs[3],
1039
                                             const __m256i filt[3],
1040
3.02M
                                             __m256i r[2]) {
1041
3.02M
  r[0] = x_convolve_6tap_8x2_avx2(src + 0, src_stride, coeffs, filt);
1042
3.02M
  r[1] = x_convolve_6tap_8x2_avx2(src + 8, src_stride, coeffs, filt);
1043
3.02M
}
convolve_2d_avx2.c:x_convolve_6tap_16x2_avx2
Line
Count
Source
1040
2.23M
                                             __m256i r[2]) {
1041
2.23M
  r[0] = x_convolve_6tap_8x2_avx2(src + 0, src_stride, coeffs, filt);
1042
2.23M
  r[1] = x_convolve_6tap_8x2_avx2(src + 8, src_stride, coeffs, filt);
1043
2.23M
}
convolve_avx2.c:x_convolve_6tap_16x2_avx2
Line
Count
Source
1040
788k
                                             __m256i r[2]) {
1041
788k
  r[0] = x_convolve_6tap_8x2_avx2(src + 0, src_stride, coeffs, filt);
1042
788k
  r[1] = x_convolve_6tap_8x2_avx2(src + 8, src_stride, coeffs, filt);
1043
788k
}
1044
1045
static inline void x_convolve_6tap_32_avx2(const uint8_t *const src,
1046
                                           const __m256i coeffs[3],
1047
                                           const __m256i filt[3],
1048
5.75M
                                           __m256i r[2]) {
1049
5.75M
  const __m256i s0_256 = _mm256_loadu_si256((__m256i *)src);
1050
5.75M
  const __m256i s1_256 = _mm256_loadu_si256((__m256i *)(src + 8));
1051
1052
5.75M
  r[0] = x_convolve_6tap_avx2(s0_256, coeffs, filt);
1053
5.75M
  r[1] = x_convolve_6tap_avx2(s1_256, coeffs, filt);
1054
5.75M
}
convolve_2d_avx2.c:x_convolve_6tap_32_avx2
Line
Count
Source
1048
3.97M
                                           __m256i r[2]) {
1049
3.97M
  const __m256i s0_256 = _mm256_loadu_si256((__m256i *)src);
1050
3.97M
  const __m256i s1_256 = _mm256_loadu_si256((__m256i *)(src + 8));
1051
1052
3.97M
  r[0] = x_convolve_6tap_avx2(s0_256, coeffs, filt);
1053
3.97M
  r[1] = x_convolve_6tap_avx2(s1_256, coeffs, filt);
1054
3.97M
}
convolve_avx2.c:x_convolve_6tap_32_avx2
Line
Count
Source
1048
1.77M
                                           __m256i r[2]) {
1049
1.77M
  const __m256i s0_256 = _mm256_loadu_si256((__m256i *)src);
1050
1.77M
  const __m256i s1_256 = _mm256_loadu_si256((__m256i *)(src + 8));
1051
1052
1.77M
  r[0] = x_convolve_6tap_avx2(s0_256, coeffs, filt);
1053
1.77M
  r[1] = x_convolve_6tap_avx2(s1_256, coeffs, filt);
1054
1.77M
}
1055
1056
static inline __m256i x_convolve_8tap_8x2_avx2(const uint8_t *const src,
1057
                                               const ptrdiff_t stride,
1058
                                               const __m256i coeffs[4],
1059
442k
                                               const __m256i filt[4]) {
1060
442k
  const __m256i s_256 = loadu_8bit_16x2_avx2(src, stride);
1061
442k
  return x_convolve_8tap_avx2(s_256, coeffs, filt);
1062
442k
}
convolve_2d_avx2.c:x_convolve_8tap_8x2_avx2
Line
Count
Source
1059
348k
                                               const __m256i filt[4]) {
1060
348k
  const __m256i s_256 = loadu_8bit_16x2_avx2(src, stride);
1061
348k
  return x_convolve_8tap_avx2(s_256, coeffs, filt);
1062
348k
}
convolve_avx2.c:x_convolve_8tap_8x2_avx2
Line
Count
Source
1059
94.8k
                                               const __m256i filt[4]) {
1060
94.8k
  const __m256i s_256 = loadu_8bit_16x2_avx2(src, stride);
1061
94.8k
  return x_convolve_8tap_avx2(s_256, coeffs, filt);
1062
94.8k
}
1063
1064
static AOM_FORCE_INLINE void x_convolve_8tap_16x2_avx2(const uint8_t *const src,
1065
                                                       const int32_t src_stride,
1066
                                                       const __m256i coeffs[4],
1067
                                                       const __m256i filt[4],
1068
129k
                                                       __m256i r[2]) {
1069
129k
  r[0] = x_convolve_8tap_8x2_avx2(src + 0, src_stride, coeffs, filt);
1070
129k
  r[1] = x_convolve_8tap_8x2_avx2(src + 8, src_stride, coeffs, filt);
1071
129k
}
convolve_2d_avx2.c:x_convolve_8tap_16x2_avx2
Line
Count
Source
1068
100k
                                                       __m256i r[2]) {
1069
100k
  r[0] = x_convolve_8tap_8x2_avx2(src + 0, src_stride, coeffs, filt);
1070
100k
  r[1] = x_convolve_8tap_8x2_avx2(src + 8, src_stride, coeffs, filt);
1071
100k
}
convolve_avx2.c:x_convolve_8tap_16x2_avx2
Line
Count
Source
1068
29.3k
                                                       __m256i r[2]) {
1069
29.3k
  r[0] = x_convolve_8tap_8x2_avx2(src + 0, src_stride, coeffs, filt);
1070
29.3k
  r[1] = x_convolve_8tap_8x2_avx2(src + 8, src_stride, coeffs, filt);
1071
29.3k
}
1072
1073
static AOM_FORCE_INLINE void x_convolve_8tap_32_avx2(const uint8_t *const src,
1074
                                                     const __m256i coeffs[4],
1075
                                                     const __m256i filt[4],
1076
1.28M
                                                     __m256i r[2]) {
1077
1.28M
  const __m256i s0_256 = _mm256_loadu_si256((__m256i *)src);
1078
1.28M
  const __m256i s1_256 = _mm256_loadu_si256((__m256i *)(src + 8));
1079
1080
1.28M
  r[0] = x_convolve_8tap_avx2(s0_256, coeffs, filt);
1081
1.28M
  r[1] = x_convolve_8tap_avx2(s1_256, coeffs, filt);
1082
1.28M
}
convolve_2d_avx2.c:x_convolve_8tap_32_avx2
Line
Count
Source
1076
1.10M
                                                     __m256i r[2]) {
1077
1.10M
  const __m256i s0_256 = _mm256_loadu_si256((__m256i *)src);
1078
1.10M
  const __m256i s1_256 = _mm256_loadu_si256((__m256i *)(src + 8));
1079
1080
1.10M
  r[0] = x_convolve_8tap_avx2(s0_256, coeffs, filt);
1081
1.10M
  r[1] = x_convolve_8tap_avx2(s1_256, coeffs, filt);
1082
1.10M
}
convolve_avx2.c:x_convolve_8tap_32_avx2
Line
Count
Source
1076
179k
                                                     __m256i r[2]) {
1077
179k
  const __m256i s0_256 = _mm256_loadu_si256((__m256i *)src);
1078
179k
  const __m256i s1_256 = _mm256_loadu_si256((__m256i *)(src + 8));
1079
1080
179k
  r[0] = x_convolve_8tap_avx2(s0_256, coeffs, filt);
1081
179k
  r[1] = x_convolve_8tap_avx2(s1_256, coeffs, filt);
1082
179k
}
1083
1084
static inline __m128i y_convolve_2tap_2x2_ssse3(const uint8_t *const src,
1085
                                                const ptrdiff_t stride,
1086
                                                const __m128i coeffs[1],
1087
5.59k
                                                __m128i s_16[2]) {
1088
5.59k
  __m128i s_128[2];
1089
1090
5.59k
  s_16[1] = _mm_cvtsi32_si128(*(int16_t *)(src + stride));
1091
5.59k
  s_128[0] = _mm_unpacklo_epi16(s_16[0], s_16[1]);
1092
5.59k
  s_16[0] = _mm_cvtsi32_si128(*(int16_t *)(src + 2 * stride));
1093
5.59k
  s_128[1] = _mm_unpacklo_epi16(s_16[1], s_16[0]);
1094
5.59k
  const __m128i ss = _mm_unpacklo_epi8(s_128[0], s_128[1]);
1095
5.59k
  return convolve_2tap_ssse3(&ss, coeffs);
1096
5.59k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_2tap_2x2_ssse3
convolve_avx2.c:y_convolve_2tap_2x2_ssse3
Line
Count
Source
1087
5.59k
                                                __m128i s_16[2]) {
1088
5.59k
  __m128i s_128[2];
1089
1090
5.59k
  s_16[1] = _mm_cvtsi32_si128(*(int16_t *)(src + stride));
1091
5.59k
  s_128[0] = _mm_unpacklo_epi16(s_16[0], s_16[1]);
1092
5.59k
  s_16[0] = _mm_cvtsi32_si128(*(int16_t *)(src + 2 * stride));
1093
5.59k
  s_128[1] = _mm_unpacklo_epi16(s_16[1], s_16[0]);
1094
5.59k
  const __m128i ss = _mm_unpacklo_epi8(s_128[0], s_128[1]);
1095
5.59k
  return convolve_2tap_ssse3(&ss, coeffs);
1096
5.59k
}
1097
1098
static inline __m128i y_convolve_2tap_4x2_ssse3(const uint8_t *const src,
1099
                                                const ptrdiff_t stride,
1100
                                                const __m128i coeffs[1],
1101
26.0k
                                                __m128i s_32[2]) {
1102
26.0k
  __m128i s_128[2];
1103
1104
26.0k
  s_32[1] = _mm_cvtsi32_si128(loadu_int32(src + stride));
1105
26.0k
  s_128[0] = _mm_unpacklo_epi32(s_32[0], s_32[1]);
1106
26.0k
  s_32[0] = _mm_cvtsi32_si128(loadu_int32(src + 2 * stride));
1107
26.0k
  s_128[1] = _mm_unpacklo_epi32(s_32[1], s_32[0]);
1108
26.0k
  const __m128i ss = _mm_unpacklo_epi8(s_128[0], s_128[1]);
1109
26.0k
  return convolve_2tap_ssse3(&ss, coeffs);
1110
26.0k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_2tap_4x2_ssse3
convolve_avx2.c:y_convolve_2tap_4x2_ssse3
Line
Count
Source
1101
26.0k
                                                __m128i s_32[2]) {
1102
26.0k
  __m128i s_128[2];
1103
1104
26.0k
  s_32[1] = _mm_cvtsi32_si128(loadu_int32(src + stride));
1105
26.0k
  s_128[0] = _mm_unpacklo_epi32(s_32[0], s_32[1]);
1106
26.0k
  s_32[0] = _mm_cvtsi32_si128(loadu_int32(src + 2 * stride));
1107
26.0k
  s_128[1] = _mm_unpacklo_epi32(s_32[1], s_32[0]);
1108
26.0k
  const __m128i ss = _mm_unpacklo_epi8(s_128[0], s_128[1]);
1109
26.0k
  return convolve_2tap_ssse3(&ss, coeffs);
1110
26.0k
}
1111
1112
static inline __m256i y_convolve_2tap_8x2_avx2(const uint8_t *const src,
1113
                                               const ptrdiff_t stride,
1114
                                               const __m256i coeffs[1],
1115
0
                                               __m128i s_64[2]) {
1116
0
  __m256i s_256[2];
1117
0
1118
0
  s_64[1] = _mm_loadl_epi64((__m128i *)(src + stride));
1119
0
  s_256[0] = _mm256_setr_m128i(s_64[0], s_64[1]);
1120
0
  s_64[0] = _mm_loadl_epi64((__m128i *)(src + 2 * stride));
1121
0
  s_256[1] = _mm256_setr_m128i(s_64[1], s_64[0]);
1122
0
  const __m256i ss = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
1123
0
  return convolve_2tap_avx2(&ss, coeffs);
1124
0
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_2tap_8x2_avx2
Unexecuted instantiation: convolve_avx2.c:y_convolve_2tap_8x2_avx2
1125
1126
static inline void y_convolve_2tap_16x2_avx2(const uint8_t *const src,
1127
                                             const ptrdiff_t stride,
1128
                                             const __m256i coeffs[1],
1129
18.8k
                                             __m128i s_128[2], __m256i r[2]) {
1130
18.8k
  __m256i s_256[2];
1131
1132
18.8k
  s_128[1] = _mm_loadu_si128((__m128i *)(src + stride));
1133
18.8k
  s_256[0] = _mm256_setr_m128i(s_128[0], s_128[1]);
1134
18.8k
  s_128[0] = _mm_loadu_si128((__m128i *)(src + 2 * stride));
1135
18.8k
  s_256[1] = _mm256_setr_m128i(s_128[1], s_128[0]);
1136
18.8k
  const __m256i ss0 = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
1137
18.8k
  const __m256i ss1 = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
1138
18.8k
  r[0] = convolve_2tap_avx2(&ss0, coeffs);
1139
18.8k
  r[1] = convolve_2tap_avx2(&ss1, coeffs);
1140
18.8k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_2tap_16x2_avx2
convolve_avx2.c:y_convolve_2tap_16x2_avx2
Line
Count
Source
1129
18.8k
                                             __m128i s_128[2], __m256i r[2]) {
1130
18.8k
  __m256i s_256[2];
1131
1132
18.8k
  s_128[1] = _mm_loadu_si128((__m128i *)(src + stride));
1133
18.8k
  s_256[0] = _mm256_setr_m128i(s_128[0], s_128[1]);
1134
18.8k
  s_128[0] = _mm_loadu_si128((__m128i *)(src + 2 * stride));
1135
18.8k
  s_256[1] = _mm256_setr_m128i(s_128[1], s_128[0]);
1136
18.8k
  const __m256i ss0 = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
1137
18.8k
  const __m256i ss1 = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
1138
18.8k
  r[0] = convolve_2tap_avx2(&ss0, coeffs);
1139
18.8k
  r[1] = convolve_2tap_avx2(&ss1, coeffs);
1140
18.8k
}
1141
1142
static inline void y_convolve_2tap_32_avx2(const uint8_t *const src,
1143
                                           const __m256i coeffs[1],
1144
                                           const __m256i s0, __m256i *const s1,
1145
169k
                                           __m256i r[2]) {
1146
169k
  *s1 = _mm256_loadu_si256((__m256i *)src);
1147
169k
  const __m256i ss0 = _mm256_unpacklo_epi8(s0, *s1);
1148
169k
  const __m256i ss1 = _mm256_unpackhi_epi8(s0, *s1);
1149
169k
  r[0] = convolve_2tap_avx2(&ss0, coeffs);
1150
169k
  r[1] = convolve_2tap_avx2(&ss1, coeffs);
1151
169k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_2tap_32_avx2
convolve_avx2.c:y_convolve_2tap_32_avx2
Line
Count
Source
1145
169k
                                           __m256i r[2]) {
1146
169k
  *s1 = _mm256_loadu_si256((__m256i *)src);
1147
169k
  const __m256i ss0 = _mm256_unpacklo_epi8(s0, *s1);
1148
169k
  const __m256i ss1 = _mm256_unpackhi_epi8(s0, *s1);
1149
169k
  r[0] = convolve_2tap_avx2(&ss0, coeffs);
1150
169k
  r[1] = convolve_2tap_avx2(&ss1, coeffs);
1151
169k
}
1152
1153
static inline __m128i y_convolve_4tap_2x2_ssse3(const uint8_t *const src,
1154
                                                const ptrdiff_t stride,
1155
                                                const __m128i coeffs[2],
1156
                                                __m128i s_16[4],
1157
51.2k
                                                __m128i ss_128[2]) {
1158
51.2k
  s_16[3] = _mm_cvtsi32_si128(loadu_int16(src + stride));
1159
51.2k
  const __m128i src23 = _mm_unpacklo_epi16(s_16[2], s_16[3]);
1160
51.2k
  s_16[2] = _mm_cvtsi32_si128(loadu_int16(src + 2 * stride));
1161
51.2k
  const __m128i src34 = _mm_unpacklo_epi16(s_16[3], s_16[2]);
1162
51.2k
  ss_128[1] = _mm_unpacklo_epi8(src23, src34);
1163
51.2k
  return convolve_4tap_ssse3(ss_128, coeffs);
1164
51.2k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_4tap_2x2_ssse3
convolve_avx2.c:y_convolve_4tap_2x2_ssse3
Line
Count
Source
1157
51.2k
                                                __m128i ss_128[2]) {
1158
51.2k
  s_16[3] = _mm_cvtsi32_si128(loadu_int16(src + stride));
1159
51.2k
  const __m128i src23 = _mm_unpacklo_epi16(s_16[2], s_16[3]);
1160
51.2k
  s_16[2] = _mm_cvtsi32_si128(loadu_int16(src + 2 * stride));
1161
51.2k
  const __m128i src34 = _mm_unpacklo_epi16(s_16[3], s_16[2]);
1162
51.2k
  ss_128[1] = _mm_unpacklo_epi8(src23, src34);
1163
51.2k
  return convolve_4tap_ssse3(ss_128, coeffs);
1164
51.2k
}
1165
1166
static inline __m128i y_convolve_4tap_4x2_ssse3(const uint8_t *const src,
1167
                                                const ptrdiff_t stride,
1168
                                                const __m128i coeffs[2],
1169
                                                __m128i s_32[4],
1170
281k
                                                __m128i ss_128[2]) {
1171
281k
  s_32[3] = _mm_cvtsi32_si128(loadu_int32(src + stride));
1172
281k
  const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
1173
281k
  s_32[2] = _mm_cvtsi32_si128(loadu_int32(src + 2 * stride));
1174
281k
  const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[2]);
1175
281k
  ss_128[1] = _mm_unpacklo_epi8(src23, src34);
1176
281k
  return convolve_4tap_ssse3(ss_128, coeffs);
1177
281k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_4tap_4x2_ssse3
convolve_avx2.c:y_convolve_4tap_4x2_ssse3
Line
Count
Source
1170
281k
                                                __m128i ss_128[2]) {
1171
281k
  s_32[3] = _mm_cvtsi32_si128(loadu_int32(src + stride));
1172
281k
  const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
1173
281k
  s_32[2] = _mm_cvtsi32_si128(loadu_int32(src + 2 * stride));
1174
281k
  const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[2]);
1175
281k
  ss_128[1] = _mm_unpacklo_epi8(src23, src34);
1176
281k
  return convolve_4tap_ssse3(ss_128, coeffs);
1177
281k
}
1178
1179
static inline __m256i y_convolve_4tap_8x2_avx2(const uint8_t *const src,
1180
                                               const ptrdiff_t stride,
1181
                                               const __m256i coeffs[2],
1182
                                               __m128i s_64[4],
1183
228k
                                               __m256i ss_256[2]) {
1184
228k
  s_64[3] = _mm_loadl_epi64((__m128i *)(src + stride));
1185
228k
  const __m256i src23 = _mm256_setr_m128i(s_64[2], s_64[3]);
1186
228k
  s_64[2] = _mm_loadl_epi64((__m128i *)(src + 2 * stride));
1187
228k
  const __m256i src34 = _mm256_setr_m128i(s_64[3], s_64[2]);
1188
228k
  ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
1189
228k
  return convolve_4tap_avx2(ss_256, coeffs);
1190
228k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_4tap_8x2_avx2
convolve_avx2.c:y_convolve_4tap_8x2_avx2
Line
Count
Source
1183
228k
                                               __m256i ss_256[2]) {
1184
228k
  s_64[3] = _mm_loadl_epi64((__m128i *)(src + stride));
1185
228k
  const __m256i src23 = _mm256_setr_m128i(s_64[2], s_64[3]);
1186
228k
  s_64[2] = _mm_loadl_epi64((__m128i *)(src + 2 * stride));
1187
228k
  const __m256i src34 = _mm256_setr_m128i(s_64[3], s_64[2]);
1188
228k
  ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
1189
228k
  return convolve_4tap_avx2(ss_256, coeffs);
1190
228k
}
1191
1192
static inline void y_convolve_4tap_16x2_avx2(const uint8_t *const src,
1193
                                             const ptrdiff_t stride,
1194
                                             const __m256i coeffs[2],
1195
                                             __m128i s_128[4],
1196
140k
                                             __m256i ss_256[4], __m256i r[2]) {
1197
140k
  s_128[3] = _mm_loadu_si128((__m128i *)(src + stride));
1198
140k
  const __m256i src23 = _mm256_setr_m128i(s_128[2], s_128[3]);
1199
140k
  s_128[2] = _mm_loadu_si128((__m128i *)(src + 2 * stride));
1200
140k
  const __m256i src34 = _mm256_setr_m128i(s_128[3], s_128[2]);
1201
140k
  ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
1202
140k
  ss_256[3] = _mm256_unpackhi_epi8(src23, src34);
1203
140k
  r[0] = convolve_4tap_avx2(ss_256, coeffs);
1204
140k
  r[1] = convolve_4tap_avx2(ss_256 + 2, coeffs);
1205
140k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_4tap_16x2_avx2
convolve_avx2.c:y_convolve_4tap_16x2_avx2
Line
Count
Source
1196
140k
                                             __m256i ss_256[4], __m256i r[2]) {
1197
140k
  s_128[3] = _mm_loadu_si128((__m128i *)(src + stride));
1198
140k
  const __m256i src23 = _mm256_setr_m128i(s_128[2], s_128[3]);
1199
140k
  s_128[2] = _mm_loadu_si128((__m128i *)(src + 2 * stride));
1200
140k
  const __m256i src34 = _mm256_setr_m128i(s_128[3], s_128[2]);
1201
140k
  ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
1202
140k
  ss_256[3] = _mm256_unpackhi_epi8(src23, src34);
1203
140k
  r[0] = convolve_4tap_avx2(ss_256, coeffs);
1204
140k
  r[1] = convolve_4tap_avx2(ss_256 + 2, coeffs);
1205
140k
}
1206
1207
static inline __m128i y_convolve_6tap_2x2_ssse3(const uint8_t *const src,
1208
                                                const ptrdiff_t stride,
1209
                                                const __m128i coeffs[3],
1210
                                                __m128i s_16[6],
1211
74.4k
                                                __m128i ss_128[3]) {
1212
74.4k
  s_16[5] = _mm_cvtsi32_si128(loadu_int16(src + 3 * stride));
1213
74.4k
  const __m128i src45 = _mm_unpacklo_epi16(s_16[4], s_16[5]);
1214
74.4k
  s_16[4] = _mm_cvtsi32_si128(loadu_int16(src + 4 * stride));
1215
74.4k
  const __m128i src56 = _mm_unpacklo_epi16(s_16[5], s_16[4]);
1216
74.4k
  ss_128[2] = _mm_unpacklo_epi8(src45, src56);
1217
74.4k
  return convolve_6tap_ssse3(ss_128, coeffs);
1218
74.4k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_6tap_2x2_ssse3
convolve_avx2.c:y_convolve_6tap_2x2_ssse3
Line
Count
Source
1211
74.4k
                                                __m128i ss_128[3]) {
1212
74.4k
  s_16[5] = _mm_cvtsi32_si128(loadu_int16(src + 3 * stride));
1213
74.4k
  const __m128i src45 = _mm_unpacklo_epi16(s_16[4], s_16[5]);
1214
74.4k
  s_16[4] = _mm_cvtsi32_si128(loadu_int16(src + 4 * stride));
1215
74.4k
  const __m128i src56 = _mm_unpacklo_epi16(s_16[5], s_16[4]);
1216
74.4k
  ss_128[2] = _mm_unpacklo_epi8(src45, src56);
1217
74.4k
  return convolve_6tap_ssse3(ss_128, coeffs);
1218
74.4k
}
1219
1220
static inline void y_convolve_4tap_32x2_avx2(
1221
    const uint8_t *const src, const ptrdiff_t stride, const __m256i coeffs[2],
1222
173k
    __m256i s_256[4], __m256i ss_256[4], __m256i tt_256[4], __m256i r[4]) {
1223
173k
  s_256[3] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
1224
173k
  ss_256[1] = _mm256_unpacklo_epi8(s_256[2], s_256[3]);
1225
173k
  ss_256[3] = _mm256_unpackhi_epi8(s_256[2], s_256[3]);
1226
173k
  s_256[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
1227
173k
  tt_256[1] = _mm256_unpacklo_epi8(s_256[3], s_256[2]);
1228
173k
  tt_256[3] = _mm256_unpackhi_epi8(s_256[3], s_256[2]);
1229
173k
  r[0] = convolve_4tap_avx2(ss_256 + 0, coeffs);
1230
173k
  r[1] = convolve_4tap_avx2(ss_256 + 2, coeffs);
1231
173k
  r[2] = convolve_4tap_avx2(tt_256 + 0, coeffs);
1232
173k
  r[3] = convolve_4tap_avx2(tt_256 + 2, coeffs);
1233
173k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_4tap_32x2_avx2
convolve_avx2.c:y_convolve_4tap_32x2_avx2
Line
Count
Source
1222
173k
    __m256i s_256[4], __m256i ss_256[4], __m256i tt_256[4], __m256i r[4]) {
1223
173k
  s_256[3] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
1224
173k
  ss_256[1] = _mm256_unpacklo_epi8(s_256[2], s_256[3]);
1225
173k
  ss_256[3] = _mm256_unpackhi_epi8(s_256[2], s_256[3]);
1226
173k
  s_256[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
1227
173k
  tt_256[1] = _mm256_unpacklo_epi8(s_256[3], s_256[2]);
1228
173k
  tt_256[3] = _mm256_unpackhi_epi8(s_256[3], s_256[2]);
1229
173k
  r[0] = convolve_4tap_avx2(ss_256 + 0, coeffs);
1230
173k
  r[1] = convolve_4tap_avx2(ss_256 + 2, coeffs);
1231
173k
  r[2] = convolve_4tap_avx2(tt_256 + 0, coeffs);
1232
173k
  r[3] = convolve_4tap_avx2(tt_256 + 2, coeffs);
1233
173k
}
1234
1235
static inline __m128i y_convolve_6tap_4x2_ssse3(const uint8_t *const src,
1236
                                                const ptrdiff_t stride,
1237
                                                const __m128i coeffs[3],
1238
                                                __m128i s_32[6],
1239
493k
                                                __m128i ss_128[3]) {
1240
493k
  s_32[5] = _mm_cvtsi32_si128(loadu_int32(src + 3 * stride));
1241
493k
  const __m128i src45 = _mm_unpacklo_epi32(s_32[4], s_32[5]);
1242
493k
  s_32[4] = _mm_cvtsi32_si128(loadu_int32(src + 4 * stride));
1243
493k
  const __m128i src56 = _mm_unpacklo_epi32(s_32[5], s_32[4]);
1244
493k
  ss_128[2] = _mm_unpacklo_epi8(src45, src56);
1245
493k
  return convolve_6tap_ssse3(ss_128, coeffs);
1246
493k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_6tap_4x2_ssse3
convolve_avx2.c:y_convolve_6tap_4x2_ssse3
Line
Count
Source
1239
493k
                                                __m128i ss_128[3]) {
1240
493k
  s_32[5] = _mm_cvtsi32_si128(loadu_int32(src + 3 * stride));
1241
493k
  const __m128i src45 = _mm_unpacklo_epi32(s_32[4], s_32[5]);
1242
493k
  s_32[4] = _mm_cvtsi32_si128(loadu_int32(src + 4 * stride));
1243
493k
  const __m128i src56 = _mm_unpacklo_epi32(s_32[5], s_32[4]);
1244
493k
  ss_128[2] = _mm_unpacklo_epi8(src45, src56);
1245
493k
  return convolve_6tap_ssse3(ss_128, coeffs);
1246
493k
}
1247
1248
static inline __m256i y_convolve_6tap_8x2_avx2(const uint8_t *const src,
1249
                                               const ptrdiff_t stride,
1250
                                               const __m256i coeffs[3],
1251
                                               __m128i s_64[6],
1252
690k
                                               __m256i ss_256[3]) {
1253
690k
  s_64[5] = _mm_loadl_epi64((__m128i *)(src + 3 * stride));
1254
690k
  const __m256i src45 = _mm256_setr_m128i(s_64[4], s_64[5]);
1255
690k
  s_64[4] = _mm_loadl_epi64((__m128i *)(src + 4 * stride));
1256
690k
  const __m256i src56 = _mm256_setr_m128i(s_64[5], s_64[4]);
1257
690k
  ss_256[2] = _mm256_unpacklo_epi8(src45, src56);
1258
690k
  return convolve_6tap_avx2(ss_256, coeffs);
1259
690k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_6tap_8x2_avx2
convolve_avx2.c:y_convolve_6tap_8x2_avx2
Line
Count
Source
1252
690k
                                               __m256i ss_256[3]) {
1253
690k
  s_64[5] = _mm_loadl_epi64((__m128i *)(src + 3 * stride));
1254
690k
  const __m256i src45 = _mm256_setr_m128i(s_64[4], s_64[5]);
1255
690k
  s_64[4] = _mm_loadl_epi64((__m128i *)(src + 4 * stride));
1256
690k
  const __m256i src56 = _mm256_setr_m128i(s_64[5], s_64[4]);
1257
690k
  ss_256[2] = _mm256_unpacklo_epi8(src45, src56);
1258
690k
  return convolve_6tap_avx2(ss_256, coeffs);
1259
690k
}
1260
1261
static inline void y_convolve_6tap_16x2_avx2(const uint8_t *const src,
1262
                                             const ptrdiff_t stride,
1263
                                             const __m256i coeffs[3],
1264
                                             __m128i s_128[6],
1265
602k
                                             __m256i ss_256[6], __m256i r[2]) {
1266
602k
  s_128[5] = _mm_loadu_si128((__m128i *)(src + 3 * stride));
1267
602k
  const __m256i src45 = _mm256_setr_m128i(s_128[4], s_128[5]);
1268
602k
  s_128[4] = _mm_loadu_si128((__m128i *)(src + 4 * stride));
1269
602k
  const __m256i src56 = _mm256_setr_m128i(s_128[5], s_128[4]);
1270
602k
  ss_256[2] = _mm256_unpacklo_epi8(src45, src56);
1271
602k
  ss_256[5] = _mm256_unpackhi_epi8(src45, src56);
1272
602k
  r[0] = convolve_6tap_avx2(ss_256, coeffs);
1273
602k
  r[1] = convolve_6tap_avx2(ss_256 + 3, coeffs);
1274
602k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_6tap_16x2_avx2
convolve_avx2.c:y_convolve_6tap_16x2_avx2
Line
Count
Source
1265
602k
                                             __m256i ss_256[6], __m256i r[2]) {
1266
602k
  s_128[5] = _mm_loadu_si128((__m128i *)(src + 3 * stride));
1267
602k
  const __m256i src45 = _mm256_setr_m128i(s_128[4], s_128[5]);
1268
602k
  s_128[4] = _mm_loadu_si128((__m128i *)(src + 4 * stride));
1269
602k
  const __m256i src56 = _mm256_setr_m128i(s_128[5], s_128[4]);
1270
602k
  ss_256[2] = _mm256_unpacklo_epi8(src45, src56);
1271
602k
  ss_256[5] = _mm256_unpackhi_epi8(src45, src56);
1272
602k
  r[0] = convolve_6tap_avx2(ss_256, coeffs);
1273
602k
  r[1] = convolve_6tap_avx2(ss_256 + 3, coeffs);
1274
602k
}
1275
1276
static inline void y_convolve_6tap_32x2_avx2(
1277
    const uint8_t *const src, const ptrdiff_t stride, const __m256i coeffs[3],
1278
713k
    __m256i s_256[6], __m256i ss_256[6], __m256i tt_256[6], __m256i r[4]) {
1279
713k
  s_256[5] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
1280
713k
  ss_256[2] = _mm256_unpacklo_epi8(s_256[4], s_256[5]);
1281
713k
  ss_256[5] = _mm256_unpackhi_epi8(s_256[4], s_256[5]);
1282
713k
  s_256[4] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
1283
713k
  tt_256[2] = _mm256_unpacklo_epi8(s_256[5], s_256[4]);
1284
713k
  tt_256[5] = _mm256_unpackhi_epi8(s_256[5], s_256[4]);
1285
713k
  r[0] = convolve_6tap_avx2(ss_256 + 0, coeffs);
1286
713k
  r[1] = convolve_6tap_avx2(ss_256 + 3, coeffs);
1287
713k
  r[2] = convolve_6tap_avx2(tt_256 + 0, coeffs);
1288
713k
  r[3] = convolve_6tap_avx2(tt_256 + 3, coeffs);
1289
713k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_6tap_32x2_avx2
convolve_avx2.c:y_convolve_6tap_32x2_avx2
Line
Count
Source
1278
713k
    __m256i s_256[6], __m256i ss_256[6], __m256i tt_256[6], __m256i r[4]) {
1279
713k
  s_256[5] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
1280
713k
  ss_256[2] = _mm256_unpacklo_epi8(s_256[4], s_256[5]);
1281
713k
  ss_256[5] = _mm256_unpackhi_epi8(s_256[4], s_256[5]);
1282
713k
  s_256[4] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
1283
713k
  tt_256[2] = _mm256_unpacklo_epi8(s_256[5], s_256[4]);
1284
713k
  tt_256[5] = _mm256_unpackhi_epi8(s_256[5], s_256[4]);
1285
713k
  r[0] = convolve_6tap_avx2(ss_256 + 0, coeffs);
1286
713k
  r[1] = convolve_6tap_avx2(ss_256 + 3, coeffs);
1287
713k
  r[2] = convolve_6tap_avx2(tt_256 + 0, coeffs);
1288
713k
  r[3] = convolve_6tap_avx2(tt_256 + 3, coeffs);
1289
713k
}
1290
1291
static inline __m128i y_convolve_8tap_2x2_ssse3(const uint8_t *const src,
1292
                                                const ptrdiff_t stride,
1293
                                                const __m128i coeffs[4],
1294
                                                __m128i s_16[8],
1295
5.50k
                                                __m128i ss_128[4]) {
1296
5.50k
  s_16[7] = _mm_cvtsi32_si128(loadu_int16(src + 7 * stride));
1297
5.50k
  const __m128i src67 = _mm_unpacklo_epi16(s_16[6], s_16[7]);
1298
5.50k
  s_16[6] = _mm_cvtsi32_si128(loadu_int16(src + 8 * stride));
1299
5.50k
  const __m128i src78 = _mm_unpacklo_epi16(s_16[7], s_16[6]);
1300
5.50k
  ss_128[3] = _mm_unpacklo_epi8(src67, src78);
1301
5.50k
  return convolve_8tap_ssse3(ss_128, coeffs);
1302
5.50k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_8tap_2x2_ssse3
convolve_avx2.c:y_convolve_8tap_2x2_ssse3
Line
Count
Source
1295
5.50k
                                                __m128i ss_128[4]) {
1296
5.50k
  s_16[7] = _mm_cvtsi32_si128(loadu_int16(src + 7 * stride));
1297
5.50k
  const __m128i src67 = _mm_unpacklo_epi16(s_16[6], s_16[7]);
1298
5.50k
  s_16[6] = _mm_cvtsi32_si128(loadu_int16(src + 8 * stride));
1299
5.50k
  const __m128i src78 = _mm_unpacklo_epi16(s_16[7], s_16[6]);
1300
5.50k
  ss_128[3] = _mm_unpacklo_epi8(src67, src78);
1301
5.50k
  return convolve_8tap_ssse3(ss_128, coeffs);
1302
5.50k
}
1303
1304
static inline __m128i y_convolve_8tap_4x2_ssse3(const uint8_t *const src,
1305
                                                const ptrdiff_t stride,
1306
                                                const __m128i coeffs[4],
1307
                                                __m128i s_32[8],
1308
32.7k
                                                __m128i ss_128[4]) {
1309
32.7k
  s_32[7] = _mm_cvtsi32_si128(loadu_int32(src + 7 * stride));
1310
32.7k
  const __m128i src67 = _mm_unpacklo_epi32(s_32[6], s_32[7]);
1311
32.7k
  s_32[6] = _mm_cvtsi32_si128(loadu_int32(src + 8 * stride));
1312
32.7k
  const __m128i src78 = _mm_unpacklo_epi32(s_32[7], s_32[6]);
1313
32.7k
  ss_128[3] = _mm_unpacklo_epi8(src67, src78);
1314
32.7k
  return convolve_8tap_ssse3(ss_128, coeffs);
1315
32.7k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_8tap_4x2_ssse3
convolve_avx2.c:y_convolve_8tap_4x2_ssse3
Line
Count
Source
1308
32.7k
                                                __m128i ss_128[4]) {
1309
32.7k
  s_32[7] = _mm_cvtsi32_si128(loadu_int32(src + 7 * stride));
1310
32.7k
  const __m128i src67 = _mm_unpacklo_epi32(s_32[6], s_32[7]);
1311
32.7k
  s_32[6] = _mm_cvtsi32_si128(loadu_int32(src + 8 * stride));
1312
32.7k
  const __m128i src78 = _mm_unpacklo_epi32(s_32[7], s_32[6]);
1313
32.7k
  ss_128[3] = _mm_unpacklo_epi8(src67, src78);
1314
32.7k
  return convolve_8tap_ssse3(ss_128, coeffs);
1315
32.7k
}
1316
1317
static inline __m256i y_convolve_8tap_8x2_avx2(const uint8_t *const src,
1318
                                               const ptrdiff_t stride,
1319
                                               const __m256i coeffs[4],
1320
                                               __m128i s_64[8],
1321
47.0k
                                               __m256i ss_256[4]) {
1322
47.0k
  s_64[7] = _mm_loadl_epi64((__m128i *)(src + 7 * stride));
1323
47.0k
  const __m256i src67 = _mm256_setr_m128i(s_64[6], s_64[7]);
1324
47.0k
  s_64[6] = _mm_loadl_epi64((__m128i *)(src + 8 * stride));
1325
47.0k
  const __m256i src78 = _mm256_setr_m128i(s_64[7], s_64[6]);
1326
47.0k
  ss_256[3] = _mm256_unpacklo_epi8(src67, src78);
1327
47.0k
  return convolve_8tap_avx2(ss_256, coeffs);
1328
47.0k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_8tap_8x2_avx2
convolve_avx2.c:y_convolve_8tap_8x2_avx2
Line
Count
Source
1321
47.0k
                                               __m256i ss_256[4]) {
1322
47.0k
  s_64[7] = _mm_loadl_epi64((__m128i *)(src + 7 * stride));
1323
47.0k
  const __m256i src67 = _mm256_setr_m128i(s_64[6], s_64[7]);
1324
47.0k
  s_64[6] = _mm_loadl_epi64((__m128i *)(src + 8 * stride));
1325
47.0k
  const __m256i src78 = _mm256_setr_m128i(s_64[7], s_64[6]);
1326
47.0k
  ss_256[3] = _mm256_unpacklo_epi8(src67, src78);
1327
47.0k
  return convolve_8tap_avx2(ss_256, coeffs);
1328
47.0k
}
1329
1330
static inline void y_convolve_8tap_16x2_avx2(const uint8_t *const src,
1331
                                             const ptrdiff_t stride,
1332
                                             const __m256i coeffs[4],
1333
                                             __m128i s_128[8],
1334
41.2k
                                             __m256i ss_256[8], __m256i r[2]) {
1335
41.2k
  s_128[7] = _mm_loadu_si128((__m128i *)(src + 7 * stride));
1336
41.2k
  const __m256i src67 = _mm256_setr_m128i(s_128[6], s_128[7]);
1337
41.2k
  s_128[6] = _mm_loadu_si128((__m128i *)(src + 8 * stride));
1338
41.2k
  const __m256i src78 = _mm256_setr_m128i(s_128[7], s_128[6]);
1339
41.2k
  ss_256[3] = _mm256_unpacklo_epi8(src67, src78);
1340
41.2k
  ss_256[7] = _mm256_unpackhi_epi8(src67, src78);
1341
41.2k
  r[0] = convolve_8tap_avx2(ss_256, coeffs);
1342
41.2k
  r[1] = convolve_8tap_avx2(ss_256 + 4, coeffs);
1343
41.2k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_8tap_16x2_avx2
convolve_avx2.c:y_convolve_8tap_16x2_avx2
Line
Count
Source
1334
41.2k
                                             __m256i ss_256[8], __m256i r[2]) {
1335
41.2k
  s_128[7] = _mm_loadu_si128((__m128i *)(src + 7 * stride));
1336
41.2k
  const __m256i src67 = _mm256_setr_m128i(s_128[6], s_128[7]);
1337
41.2k
  s_128[6] = _mm_loadu_si128((__m128i *)(src + 8 * stride));
1338
41.2k
  const __m256i src78 = _mm256_setr_m128i(s_128[7], s_128[6]);
1339
41.2k
  ss_256[3] = _mm256_unpacklo_epi8(src67, src78);
1340
41.2k
  ss_256[7] = _mm256_unpackhi_epi8(src67, src78);
1341
41.2k
  r[0] = convolve_8tap_avx2(ss_256, coeffs);
1342
41.2k
  r[1] = convolve_8tap_avx2(ss_256 + 4, coeffs);
1343
41.2k
}
1344
1345
static inline void y_convolve_8tap_32x2_avx2(
1346
    const uint8_t *const src, const ptrdiff_t stride, const __m256i coeffs[4],
1347
107k
    __m256i s_256[8], __m256i ss_256[8], __m256i tt_256[8], __m256i r[4]) {
1348
107k
  s_256[7] = _mm256_loadu_si256((__m256i *)(src + 7 * stride));
1349
107k
  ss_256[3] = _mm256_unpacklo_epi8(s_256[6], s_256[7]);
1350
107k
  ss_256[7] = _mm256_unpackhi_epi8(s_256[6], s_256[7]);
1351
107k
  s_256[6] = _mm256_loadu_si256((__m256i *)(src + 8 * stride));
1352
107k
  tt_256[3] = _mm256_unpacklo_epi8(s_256[7], s_256[6]);
1353
107k
  tt_256[7] = _mm256_unpackhi_epi8(s_256[7], s_256[6]);
1354
107k
  r[0] = convolve_8tap_avx2(ss_256 + 0, coeffs);
1355
107k
  r[1] = convolve_8tap_avx2(ss_256 + 4, coeffs);
1356
107k
  r[2] = convolve_8tap_avx2(tt_256 + 0, coeffs);
1357
107k
  r[3] = convolve_8tap_avx2(tt_256 + 4, coeffs);
1358
107k
}
Unexecuted instantiation: convolve_2d_avx2.c:y_convolve_8tap_32x2_avx2
convolve_avx2.c:y_convolve_8tap_32x2_avx2
Line
Count
Source
1347
107k
    __m256i s_256[8], __m256i ss_256[8], __m256i tt_256[8], __m256i r[4]) {
1348
107k
  s_256[7] = _mm256_loadu_si256((__m256i *)(src + 7 * stride));
1349
107k
  ss_256[3] = _mm256_unpacklo_epi8(s_256[6], s_256[7]);
1350
107k
  ss_256[7] = _mm256_unpackhi_epi8(s_256[6], s_256[7]);
1351
107k
  s_256[6] = _mm256_loadu_si256((__m256i *)(src + 8 * stride));
1352
107k
  tt_256[3] = _mm256_unpacklo_epi8(s_256[7], s_256[6]);
1353
107k
  tt_256[7] = _mm256_unpackhi_epi8(s_256[7], s_256[6]);
1354
107k
  r[0] = convolve_8tap_avx2(ss_256 + 0, coeffs);
1355
107k
  r[1] = convolve_8tap_avx2(ss_256 + 4, coeffs);
1356
107k
  r[2] = convolve_8tap_avx2(tt_256 + 0, coeffs);
1357
107k
  r[3] = convolve_8tap_avx2(tt_256 + 4, coeffs);
1358
107k
}
1359
1360
static inline void xy_x_convolve_2tap_32_avx2(const uint8_t *const src,
1361
                                              const __m256i coeffs[1],
1362
430k
                                              __m256i r[2]) {
1363
430k
  const __m256i s0 = _mm256_loadu_si256((__m256i *)src);
1364
430k
  const __m256i s1 = _mm256_loadu_si256((__m256i *)(src + 1));
1365
430k
  const __m256i ss0 = _mm256_unpacklo_epi8(s0, s1);
1366
430k
  const __m256i ss1 = _mm256_unpackhi_epi8(s0, s1);
1367
1368
430k
  r[0] = convolve_2tap_avx2(&ss0, coeffs);
1369
430k
  r[1] = convolve_2tap_avx2(&ss1, coeffs);
1370
430k
}
convolve_2d_avx2.c:xy_x_convolve_2tap_32_avx2
Line
Count
Source
1362
430k
                                              __m256i r[2]) {
1363
430k
  const __m256i s0 = _mm256_loadu_si256((__m256i *)src);
1364
430k
  const __m256i s1 = _mm256_loadu_si256((__m256i *)(src + 1));
1365
430k
  const __m256i ss0 = _mm256_unpacklo_epi8(s0, s1);
1366
430k
  const __m256i ss1 = _mm256_unpackhi_epi8(s0, s1);
1367
1368
430k
  r[0] = convolve_2tap_avx2(&ss0, coeffs);
1369
430k
  r[1] = convolve_2tap_avx2(&ss1, coeffs);
1370
430k
}
Unexecuted instantiation: convolve_avx2.c:xy_x_convolve_2tap_32_avx2
1371
1372
static inline void xy_x_2tap_32_avx2(const uint8_t *const src,
1373
                                     const __m256i coeffs[1],
1374
430k
                                     int16_t *const dst) {
1375
430k
  __m256i r[2];
1376
1377
430k
  xy_x_convolve_2tap_32_avx2(src, coeffs, r);
1378
430k
  const __m256i d0 = xy_x_round_avx2(r[0]);
1379
430k
  const __m256i d1 = xy_x_round_avx2(r[1]);
1380
430k
  _mm256_storeu_si256((__m256i *)dst, d0);
1381
430k
  _mm256_storeu_si256((__m256i *)(dst + 16), d1);
1382
430k
}
convolve_2d_avx2.c:xy_x_2tap_32_avx2
Line
Count
Source
1374
430k
                                     int16_t *const dst) {
1375
430k
  __m256i r[2];
1376
1377
430k
  xy_x_convolve_2tap_32_avx2(src, coeffs, r);
1378
430k
  const __m256i d0 = xy_x_round_avx2(r[0]);
1379
430k
  const __m256i d1 = xy_x_round_avx2(r[1]);
1380
430k
  _mm256_storeu_si256((__m256i *)dst, d0);
1381
430k
  _mm256_storeu_si256((__m256i *)(dst + 16), d1);
1382
430k
}
Unexecuted instantiation: convolve_avx2.c:xy_x_2tap_32_avx2
1383
1384
static inline void xy_x_4tap_32_avx2(const uint8_t *const src,
1385
                                     const __m256i coeffs[2],
1386
                                     const __m256i filt[2],
1387
494k
                                     int16_t *const dst) {
1388
494k
  __m256i r[2];
1389
1390
494k
  x_convolve_4tap_32_avx2(src, coeffs, filt, r);
1391
494k
  const __m256i d0 = xy_x_round_avx2(r[0]);
1392
494k
  const __m256i d1 = xy_x_round_avx2(r[1]);
1393
494k
  _mm256_storeu_si256((__m256i *)dst, d0);
1394
494k
  _mm256_storeu_si256((__m256i *)(dst + 16), d1);
1395
494k
}
convolve_2d_avx2.c:xy_x_4tap_32_avx2
Line
Count
Source
1387
494k
                                     int16_t *const dst) {
1388
494k
  __m256i r[2];
1389
1390
494k
  x_convolve_4tap_32_avx2(src, coeffs, filt, r);
1391
494k
  const __m256i d0 = xy_x_round_avx2(r[0]);
1392
494k
  const __m256i d1 = xy_x_round_avx2(r[1]);
1393
494k
  _mm256_storeu_si256((__m256i *)dst, d0);
1394
494k
  _mm256_storeu_si256((__m256i *)(dst + 16), d1);
1395
494k
}
Unexecuted instantiation: convolve_avx2.c:xy_x_4tap_32_avx2
1396
1397
static inline void xy_x_6tap_32_avx2(const uint8_t *const src,
1398
                                     const __m256i coeffs[3],
1399
                                     const __m256i filt[3],
1400
3.97M
                                     int16_t *const dst) {
1401
3.97M
  __m256i r[2];
1402
1403
3.97M
  x_convolve_6tap_32_avx2(src, coeffs, filt, r);
1404
3.97M
  const __m256i d0 = xy_x_round_avx2(r[0]);
1405
3.97M
  const __m256i d1 = xy_x_round_avx2(r[1]);
1406
3.97M
  _mm256_storeu_si256((__m256i *)dst, d0);
1407
3.97M
  _mm256_storeu_si256((__m256i *)(dst + 16), d1);
1408
3.97M
}
convolve_2d_avx2.c:xy_x_6tap_32_avx2
Line
Count
Source
1400
3.97M
                                     int16_t *const dst) {
1401
3.97M
  __m256i r[2];
1402
1403
3.97M
  x_convolve_6tap_32_avx2(src, coeffs, filt, r);
1404
3.97M
  const __m256i d0 = xy_x_round_avx2(r[0]);
1405
3.97M
  const __m256i d1 = xy_x_round_avx2(r[1]);
1406
3.97M
  _mm256_storeu_si256((__m256i *)dst, d0);
1407
3.97M
  _mm256_storeu_si256((__m256i *)(dst + 16), d1);
1408
3.97M
}
Unexecuted instantiation: convolve_avx2.c:xy_x_6tap_32_avx2
1409
1410
static inline void xy_x_8tap_32_avx2(const uint8_t *const src,
1411
                                     const __m256i coeffs[4],
1412
                                     const __m256i filt[4],
1413
1.10M
                                     int16_t *const dst) {
1414
1.10M
  __m256i r[2];
1415
1416
1.10M
  x_convolve_8tap_32_avx2(src, coeffs, filt, r);
1417
1.10M
  const __m256i d0 = xy_x_round_avx2(r[0]);
1418
1.10M
  const __m256i d1 = xy_x_round_avx2(r[1]);
1419
1.10M
  _mm256_storeu_si256((__m256i *)dst, d0);
1420
1.10M
  _mm256_storeu_si256((__m256i *)(dst + 16), d1);
1421
1.10M
}
convolve_2d_avx2.c:xy_x_8tap_32_avx2
Line
Count
Source
1413
1.10M
                                     int16_t *const dst) {
1414
1.10M
  __m256i r[2];
1415
1416
1.10M
  x_convolve_8tap_32_avx2(src, coeffs, filt, r);
1417
1.10M
  const __m256i d0 = xy_x_round_avx2(r[0]);
1418
1.10M
  const __m256i d1 = xy_x_round_avx2(r[1]);
1419
1.10M
  _mm256_storeu_si256((__m256i *)dst, d0);
1420
1.10M
  _mm256_storeu_si256((__m256i *)(dst + 16), d1);
1421
1.10M
}
Unexecuted instantiation: convolve_avx2.c:xy_x_8tap_32_avx2
1422
1423
static inline __m128i xy_y_convolve_2tap_2x2_sse2(const int16_t *const src,
1424
                                                  __m128i s_32[2],
1425
13.4k
                                                  const __m128i coeffs[1]) {
1426
13.4k
  __m128i s_128[2];
1427
1428
13.4k
  s_32[1] = _mm_cvtsi32_si128(loadu_int32(src + 2));
1429
13.4k
  s_128[0] = _mm_unpacklo_epi32(s_32[0], s_32[1]);
1430
13.4k
  s_32[0] = _mm_cvtsi32_si128(loadu_int32(src + 2 * 2));
1431
13.4k
  s_128[1] = _mm_unpacklo_epi32(s_32[1], s_32[0]);
1432
13.4k
  const __m128i ss = _mm_unpacklo_epi16(s_128[0], s_128[1]);
1433
13.4k
  return convolve16_2tap_sse2(&ss, coeffs);
1434
13.4k
}
convolve_2d_avx2.c:xy_y_convolve_2tap_2x2_sse2
Line
Count
Source
1425
13.4k
                                                  const __m128i coeffs[1]) {
1426
13.4k
  __m128i s_128[2];
1427
1428
13.4k
  s_32[1] = _mm_cvtsi32_si128(loadu_int32(src + 2));
1429
13.4k
  s_128[0] = _mm_unpacklo_epi32(s_32[0], s_32[1]);
1430
13.4k
  s_32[0] = _mm_cvtsi32_si128(loadu_int32(src + 2 * 2));
1431
13.4k
  s_128[1] = _mm_unpacklo_epi32(s_32[1], s_32[0]);
1432
13.4k
  const __m128i ss = _mm_unpacklo_epi16(s_128[0], s_128[1]);
1433
13.4k
  return convolve16_2tap_sse2(&ss, coeffs);
1434
13.4k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_2tap_2x2_sse2
1435
1436
static inline __m128i xy_y_convolve_2tap_2x2_half_pel_sse2(
1437
3.03k
    const int16_t *const src, __m128i s_32[2]) {
1438
3.03k
  __m128i s_128[2];
1439
1440
3.03k
  s_32[1] = _mm_cvtsi32_si128(loadu_int32(src + 2));
1441
3.03k
  s_128[0] = _mm_unpacklo_epi32(s_32[0], s_32[1]);
1442
3.03k
  s_32[0] = _mm_cvtsi32_si128(loadu_int32(src + 2 * 2));
1443
3.03k
  s_128[1] = _mm_unpacklo_epi32(s_32[1], s_32[0]);
1444
3.03k
  return _mm_add_epi16(s_128[0], s_128[1]);
1445
3.03k
}
convolve_2d_avx2.c:xy_y_convolve_2tap_2x2_half_pel_sse2
Line
Count
Source
1437
3.03k
    const int16_t *const src, __m128i s_32[2]) {
1438
3.03k
  __m128i s_128[2];
1439
1440
3.03k
  s_32[1] = _mm_cvtsi32_si128(loadu_int32(src + 2));
1441
3.03k
  s_128[0] = _mm_unpacklo_epi32(s_32[0], s_32[1]);
1442
3.03k
  s_32[0] = _mm_cvtsi32_si128(loadu_int32(src + 2 * 2));
1443
3.03k
  s_128[1] = _mm_unpacklo_epi32(s_32[1], s_32[0]);
1444
3.03k
  return _mm_add_epi16(s_128[0], s_128[1]);
1445
3.03k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_2tap_2x2_half_pel_sse2
1446
1447
static inline void xy_y_convolve_2tap_4x2_sse2(const int16_t *const src,
1448
                                               __m128i s_64[2],
1449
                                               const __m128i coeffs[1],
1450
64.6k
                                               __m128i r[2]) {
1451
64.6k
  __m128i s_128[2];
1452
1453
64.6k
  s_64[1] = _mm_loadl_epi64((__m128i *)(src + 4));
1454
64.6k
  s_128[0] = _mm_unpacklo_epi64(s_64[0], s_64[1]);
1455
64.6k
  s_64[0] = _mm_loadl_epi64((__m128i *)(src + 2 * 4));
1456
64.6k
  s_128[1] = _mm_unpacklo_epi64(s_64[1], s_64[0]);
1457
64.6k
  const __m128i ss0 = _mm_unpacklo_epi16(s_128[0], s_128[1]);
1458
64.6k
  const __m128i ss1 = _mm_unpackhi_epi16(s_128[0], s_128[1]);
1459
64.6k
  r[0] = convolve16_2tap_sse2(&ss0, coeffs);
1460
64.6k
  r[1] = convolve16_2tap_sse2(&ss1, coeffs);
1461
64.6k
}
convolve_2d_avx2.c:xy_y_convolve_2tap_4x2_sse2
Line
Count
Source
1450
64.6k
                                               __m128i r[2]) {
1451
64.6k
  __m128i s_128[2];
1452
1453
64.6k
  s_64[1] = _mm_loadl_epi64((__m128i *)(src + 4));
1454
64.6k
  s_128[0] = _mm_unpacklo_epi64(s_64[0], s_64[1]);
1455
64.6k
  s_64[0] = _mm_loadl_epi64((__m128i *)(src + 2 * 4));
1456
64.6k
  s_128[1] = _mm_unpacklo_epi64(s_64[1], s_64[0]);
1457
64.6k
  const __m128i ss0 = _mm_unpacklo_epi16(s_128[0], s_128[1]);
1458
64.6k
  const __m128i ss1 = _mm_unpackhi_epi16(s_128[0], s_128[1]);
1459
64.6k
  r[0] = convolve16_2tap_sse2(&ss0, coeffs);
1460
64.6k
  r[1] = convolve16_2tap_sse2(&ss1, coeffs);
1461
64.6k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_2tap_4x2_sse2
1462
1463
static inline __m128i xy_y_convolve_2tap_4x2_half_pel_sse2(
1464
15.1k
    const int16_t *const src, __m128i s_64[2]) {
1465
15.1k
  __m128i s_128[2];
1466
1467
15.1k
  s_64[1] = _mm_loadl_epi64((__m128i *)(src + 4));
1468
15.1k
  s_128[0] = _mm_unpacklo_epi64(s_64[0], s_64[1]);
1469
15.1k
  s_64[0] = _mm_loadl_epi64((__m128i *)(src + 2 * 4));
1470
15.1k
  s_128[1] = _mm_unpacklo_epi64(s_64[1], s_64[0]);
1471
15.1k
  return _mm_add_epi16(s_128[0], s_128[1]);
1472
15.1k
}
convolve_2d_avx2.c:xy_y_convolve_2tap_4x2_half_pel_sse2
Line
Count
Source
1464
15.1k
    const int16_t *const src, __m128i s_64[2]) {
1465
15.1k
  __m128i s_128[2];
1466
1467
15.1k
  s_64[1] = _mm_loadl_epi64((__m128i *)(src + 4));
1468
15.1k
  s_128[0] = _mm_unpacklo_epi64(s_64[0], s_64[1]);
1469
15.1k
  s_64[0] = _mm_loadl_epi64((__m128i *)(src + 2 * 4));
1470
15.1k
  s_128[1] = _mm_unpacklo_epi64(s_64[1], s_64[0]);
1471
15.1k
  return _mm_add_epi16(s_128[0], s_128[1]);
1472
15.1k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_2tap_4x2_half_pel_sse2
1473
1474
static inline void xy_y_convolve_2tap_16_avx2(const __m256i s0,
1475
                                              const __m256i s1,
1476
                                              const __m256i coeffs[1],
1477
862k
                                              __m256i r[2]) {
1478
862k
  const __m256i ss0 = _mm256_unpacklo_epi16(s0, s1);
1479
862k
  const __m256i ss1 = _mm256_unpackhi_epi16(s0, s1);
1480
862k
  r[0] = convolve16_2tap_avx2(&ss0, coeffs);
1481
862k
  r[1] = convolve16_2tap_avx2(&ss1, coeffs);
1482
862k
}
convolve_2d_avx2.c:xy_y_convolve_2tap_16_avx2
Line
Count
Source
1477
862k
                                              __m256i r[2]) {
1478
862k
  const __m256i ss0 = _mm256_unpacklo_epi16(s0, s1);
1479
862k
  const __m256i ss1 = _mm256_unpackhi_epi16(s0, s1);
1480
862k
  r[0] = convolve16_2tap_avx2(&ss0, coeffs);
1481
862k
  r[1] = convolve16_2tap_avx2(&ss1, coeffs);
1482
862k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_2tap_16_avx2
1483
1484
static inline void xy_y_convolve_2tap_8x2_avx2(const int16_t *const src,
1485
                                               __m128i s_128[2],
1486
                                               const __m256i coeffs[1],
1487
52.7k
                                               __m256i r[2]) {
1488
52.7k
  __m256i s_256[2];
1489
52.7k
  s_128[1] = _mm_loadu_si128((__m128i *)(src + 8));
1490
52.7k
  s_256[0] = _mm256_setr_m128i(s_128[0], s_128[1]);
1491
52.7k
  s_128[0] = _mm_loadu_si128((__m128i *)(src + 2 * 8));
1492
52.7k
  s_256[1] = _mm256_setr_m128i(s_128[1], s_128[0]);
1493
52.7k
  xy_y_convolve_2tap_16_avx2(s_256[0], s_256[1], coeffs, r);
1494
52.7k
}
convolve_2d_avx2.c:xy_y_convolve_2tap_8x2_avx2
Line
Count
Source
1487
52.7k
                                               __m256i r[2]) {
1488
52.7k
  __m256i s_256[2];
1489
52.7k
  s_128[1] = _mm_loadu_si128((__m128i *)(src + 8));
1490
52.7k
  s_256[0] = _mm256_setr_m128i(s_128[0], s_128[1]);
1491
52.7k
  s_128[0] = _mm_loadu_si128((__m128i *)(src + 2 * 8));
1492
52.7k
  s_256[1] = _mm256_setr_m128i(s_128[1], s_128[0]);
1493
52.7k
  xy_y_convolve_2tap_16_avx2(s_256[0], s_256[1], coeffs, r);
1494
52.7k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_2tap_8x2_avx2
1495
1496
static inline __m256i xy_y_convolve_2tap_8x2_half_pel_avx2(
1497
17.0k
    const int16_t *const src, __m128i s_128[2]) {
1498
17.0k
  __m256i s_256[2];
1499
17.0k
  s_128[1] = _mm_loadu_si128((__m128i *)(src + 8));
1500
17.0k
  s_256[0] = _mm256_setr_m128i(s_128[0], s_128[1]);
1501
17.0k
  s_128[0] = _mm_loadu_si128((__m128i *)(src + 2 * 8));
1502
17.0k
  s_256[1] = _mm256_setr_m128i(s_128[1], s_128[0]);
1503
17.0k
  return _mm256_add_epi16(s_256[0], s_256[1]);
1504
17.0k
}
convolve_2d_avx2.c:xy_y_convolve_2tap_8x2_half_pel_avx2
Line
Count
Source
1497
17.0k
    const int16_t *const src, __m128i s_128[2]) {
1498
17.0k
  __m256i s_256[2];
1499
17.0k
  s_128[1] = _mm_loadu_si128((__m128i *)(src + 8));
1500
17.0k
  s_256[0] = _mm256_setr_m128i(s_128[0], s_128[1]);
1501
17.0k
  s_128[0] = _mm_loadu_si128((__m128i *)(src + 2 * 8));
1502
17.0k
  s_256[1] = _mm256_setr_m128i(s_128[1], s_128[0]);
1503
17.0k
  return _mm256_add_epi16(s_256[0], s_256[1]);
1504
17.0k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_2tap_8x2_half_pel_avx2
1505
1506
static inline void xy_y_convolve_2tap_16x2_half_pel_avx2(
1507
16.4k
    const int16_t *const src, __m256i s_256[2], __m256i r[2]) {
1508
16.4k
  s_256[1] = _mm256_loadu_si256((__m256i *)(src + 16));
1509
16.4k
  r[0] = _mm256_add_epi16(s_256[0], s_256[1]);
1510
16.4k
  s_256[0] = _mm256_loadu_si256((__m256i *)(src + 2 * 16));
1511
16.4k
  r[1] = _mm256_add_epi16(s_256[1], s_256[0]);
1512
16.4k
}
convolve_2d_avx2.c:xy_y_convolve_2tap_16x2_half_pel_avx2
Line
Count
Source
1507
16.4k
    const int16_t *const src, __m256i s_256[2], __m256i r[2]) {
1508
16.4k
  s_256[1] = _mm256_loadu_si256((__m256i *)(src + 16));
1509
16.4k
  r[0] = _mm256_add_epi16(s_256[0], s_256[1]);
1510
16.4k
  s_256[0] = _mm256_loadu_si256((__m256i *)(src + 2 * 16));
1511
16.4k
  r[1] = _mm256_add_epi16(s_256[1], s_256[0]);
1512
16.4k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_2tap_16x2_half_pel_avx2
1513
1514
static inline void xy_y_store_16x2_avx2(const __m256i r[2], uint8_t *const dst,
1515
0
                                        const ptrdiff_t stride) {
1516
0
  const __m256i t = _mm256_packus_epi16(r[0], r[1]);
1517
0
  const __m256i d = _mm256_permute4x64_epi64(t, 0xD8);
1518
0
  storeu_u8_16x2_avx2(d, dst, stride);
1519
0
}
Unexecuted instantiation: convolve_2d_avx2.c:xy_y_store_16x2_avx2
Unexecuted instantiation: convolve_avx2.c:xy_y_store_16x2_avx2
1520
1521
static inline void xy_y_convolve_2tap_16x2_avx2(const int16_t *const src,
1522
                                                __m256i s[2],
1523
                                                const __m256i coeffs[1],
1524
31.2k
                                                __m256i r[4]) {
1525
31.2k
  s[1] = _mm256_loadu_si256((__m256i *)(src + 16));
1526
31.2k
  xy_y_convolve_2tap_16_avx2(s[0], s[1], coeffs, r + 0);
1527
31.2k
  s[0] = _mm256_loadu_si256((__m256i *)(src + 2 * 16));
1528
31.2k
  xy_y_convolve_2tap_16_avx2(s[1], s[0], coeffs, r + 2);
1529
31.2k
}
convolve_2d_avx2.c:xy_y_convolve_2tap_16x2_avx2
Line
Count
Source
1524
31.2k
                                                __m256i r[4]) {
1525
31.2k
  s[1] = _mm256_loadu_si256((__m256i *)(src + 16));
1526
31.2k
  xy_y_convolve_2tap_16_avx2(s[0], s[1], coeffs, r + 0);
1527
31.2k
  s[0] = _mm256_loadu_si256((__m256i *)(src + 2 * 16));
1528
31.2k
  xy_y_convolve_2tap_16_avx2(s[1], s[0], coeffs, r + 2);
1529
31.2k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_2tap_16x2_avx2
1530
1531
static inline void xy_y_convolve_2tap_32_avx2(const int16_t *const src,
1532
                                              const __m256i s0[2],
1533
                                              __m256i s1[2],
1534
                                              const __m256i coeffs[1],
1535
280k
                                              __m256i r[4]) {
1536
280k
  s1[0] = _mm256_loadu_si256((__m256i *)src);
1537
280k
  s1[1] = _mm256_loadu_si256((__m256i *)(src + 16));
1538
280k
  xy_y_convolve_2tap_16_avx2(s0[0], s1[0], coeffs, r + 0);
1539
280k
  xy_y_convolve_2tap_16_avx2(s0[1], s1[1], coeffs, r + 2);
1540
280k
}
convolve_2d_avx2.c:xy_y_convolve_2tap_32_avx2
Line
Count
Source
1535
280k
                                              __m256i r[4]) {
1536
280k
  s1[0] = _mm256_loadu_si256((__m256i *)src);
1537
280k
  s1[1] = _mm256_loadu_si256((__m256i *)(src + 16));
1538
280k
  xy_y_convolve_2tap_16_avx2(s0[0], s1[0], coeffs, r + 0);
1539
280k
  xy_y_convolve_2tap_16_avx2(s0[1], s1[1], coeffs, r + 2);
1540
280k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_2tap_32_avx2
1541
1542
static inline void xy_y_convolve_2tap_32_all_avx2(const int16_t *const src,
1543
                                                  const __m256i s0[2],
1544
                                                  __m256i s1[2],
1545
                                                  const __m256i coeffs[1],
1546
280k
                                                  uint8_t *const dst) {
1547
280k
  __m256i r[4];
1548
1549
280k
  xy_y_convolve_2tap_32_avx2(src, s0, s1, coeffs, r);
1550
280k
  xy_y_round_store_32_avx2(r + 0, r + 2, dst);
1551
280k
}
convolve_2d_avx2.c:xy_y_convolve_2tap_32_all_avx2
Line
Count
Source
1546
280k
                                                  uint8_t *const dst) {
1547
280k
  __m256i r[4];
1548
1549
280k
  xy_y_convolve_2tap_32_avx2(src, s0, s1, coeffs, r);
1550
280k
  xy_y_round_store_32_avx2(r + 0, r + 2, dst);
1551
280k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_2tap_32_all_avx2
1552
1553
static inline void xy_y_convolve_2tap_half_pel_32_avx2(const int16_t *const src,
1554
                                                       const __m256i s0[2],
1555
                                                       __m256i s1[2],
1556
132k
                                                       __m256i r[2]) {
1557
132k
  s1[0] = _mm256_loadu_si256((__m256i *)src);
1558
132k
  s1[1] = _mm256_loadu_si256((__m256i *)(src + 16));
1559
132k
  r[0] = _mm256_add_epi16(s0[0], s1[0]);
1560
132k
  r[1] = _mm256_add_epi16(s0[1], s1[1]);
1561
132k
}
convolve_2d_avx2.c:xy_y_convolve_2tap_half_pel_32_avx2
Line
Count
Source
1556
132k
                                                       __m256i r[2]) {
1557
132k
  s1[0] = _mm256_loadu_si256((__m256i *)src);
1558
132k
  s1[1] = _mm256_loadu_si256((__m256i *)(src + 16));
1559
132k
  r[0] = _mm256_add_epi16(s0[0], s1[0]);
1560
132k
  r[1] = _mm256_add_epi16(s0[1], s1[1]);
1561
132k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_2tap_half_pel_32_avx2
1562
1563
static inline void xy_y_convolve_2tap_half_pel_32_all_avx2(
1564
    const int16_t *const src, const __m256i s0[2], __m256i s1[2],
1565
132k
    uint8_t *const dst) {
1566
132k
  __m256i r[2];
1567
1568
132k
  xy_y_convolve_2tap_half_pel_32_avx2(src, s0, s1, r);
1569
132k
  r[0] = xy_y_round_half_pel_avx2(r[0]);
1570
132k
  r[1] = xy_y_round_half_pel_avx2(r[1]);
1571
132k
  xy_y_pack_store_32_avx2(r[0], r[1], dst);
1572
132k
}
convolve_2d_avx2.c:xy_y_convolve_2tap_half_pel_32_all_avx2
Line
Count
Source
1565
132k
    uint8_t *const dst) {
1566
132k
  __m256i r[2];
1567
1568
132k
  xy_y_convolve_2tap_half_pel_32_avx2(src, s0, s1, r);
1569
132k
  r[0] = xy_y_round_half_pel_avx2(r[0]);
1570
132k
  r[1] = xy_y_round_half_pel_avx2(r[1]);
1571
132k
  xy_y_pack_store_32_avx2(r[0], r[1], dst);
1572
132k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_2tap_half_pel_32_all_avx2
1573
1574
static inline __m128i xy_y_convolve_4tap_2x2_sse2(const int16_t *const src,
1575
                                                  __m128i s_32[4],
1576
                                                  __m128i ss_128[2],
1577
146k
                                                  const __m128i coeffs[2]) {
1578
146k
  s_32[3] = _mm_cvtsi32_si128(loadu_int32(src + 3 * 2));
1579
146k
  const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
1580
146k
  s_32[2] = _mm_cvtsi32_si128(loadu_int32(src + 4 * 2));
1581
146k
  const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[2]);
1582
146k
  ss_128[1] = _mm_unpacklo_epi16(src23, src34);
1583
146k
  const __m128i r = convolve16_4tap_sse2(ss_128, coeffs);
1584
146k
  ss_128[0] = ss_128[1];
1585
146k
  return r;
1586
146k
}
convolve_2d_avx2.c:xy_y_convolve_4tap_2x2_sse2
Line
Count
Source
1577
146k
                                                  const __m128i coeffs[2]) {
1578
146k
  s_32[3] = _mm_cvtsi32_si128(loadu_int32(src + 3 * 2));
1579
146k
  const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
1580
146k
  s_32[2] = _mm_cvtsi32_si128(loadu_int32(src + 4 * 2));
1581
146k
  const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[2]);
1582
146k
  ss_128[1] = _mm_unpacklo_epi16(src23, src34);
1583
146k
  const __m128i r = convolve16_4tap_sse2(ss_128, coeffs);
1584
146k
  ss_128[0] = ss_128[1];
1585
146k
  return r;
1586
146k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_4tap_2x2_sse2
1587
1588
static inline __m256i xy_y_convolve_4tap_4x2_avx2(const int16_t *const src,
1589
                                                  __m128i s_64[4],
1590
                                                  __m256i ss_256[2],
1591
726k
                                                  const __m256i coeffs[2]) {
1592
726k
  __m256i s_256[2];
1593
726k
  s_64[3] = _mm_loadl_epi64((__m128i *)(src + 3 * 4));
1594
726k
  s_256[0] = _mm256_setr_m128i(s_64[2], s_64[3]);
1595
726k
  s_64[2] = _mm_loadl_epi64((__m128i *)(src + 4 * 4));
1596
726k
  s_256[1] = _mm256_setr_m128i(s_64[3], s_64[2]);
1597
726k
  ss_256[1] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1598
726k
  const __m256i r = convolve16_4tap_avx2(ss_256, coeffs);
1599
726k
  ss_256[0] = ss_256[1];
1600
726k
  return r;
1601
726k
}
convolve_2d_avx2.c:xy_y_convolve_4tap_4x2_avx2
Line
Count
Source
1591
726k
                                                  const __m256i coeffs[2]) {
1592
726k
  __m256i s_256[2];
1593
726k
  s_64[3] = _mm_loadl_epi64((__m128i *)(src + 3 * 4));
1594
726k
  s_256[0] = _mm256_setr_m128i(s_64[2], s_64[3]);
1595
726k
  s_64[2] = _mm_loadl_epi64((__m128i *)(src + 4 * 4));
1596
726k
  s_256[1] = _mm256_setr_m128i(s_64[3], s_64[2]);
1597
726k
  ss_256[1] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1598
726k
  const __m256i r = convolve16_4tap_avx2(ss_256, coeffs);
1599
726k
  ss_256[0] = ss_256[1];
1600
726k
  return r;
1601
726k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_4tap_4x2_avx2
1602
1603
static inline void xy_y_convolve_4tap_16_avx2(const __m256i *const ss,
1604
                                              const __m256i coeffs[2],
1605
2.96M
                                              __m256i r[2]) {
1606
2.96M
  r[0] = convolve16_4tap_avx2(ss, coeffs);
1607
2.96M
  r[1] = convolve16_4tap_avx2(ss + 2, coeffs);
1608
2.96M
}
convolve_2d_avx2.c:xy_y_convolve_4tap_16_avx2
Line
Count
Source
1605
2.96M
                                              __m256i r[2]) {
1606
2.96M
  r[0] = convolve16_4tap_avx2(ss, coeffs);
1607
2.96M
  r[1] = convolve16_4tap_avx2(ss + 2, coeffs);
1608
2.96M
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_4tap_16_avx2
1609
1610
static inline void xy_y_convolve_4tap_8x2_avx2(const int16_t *const src,
1611
                                               __m256i ss_256[4],
1612
                                               const __m256i coeffs[2],
1613
482k
                                               __m256i r[2]) {
1614
482k
  __m256i s_256[2];
1615
482k
  s_256[0] = _mm256_loadu_si256((__m256i *)(src + 2 * 8));
1616
482k
  s_256[1] = _mm256_loadu_si256((__m256i *)(src + 3 * 8));
1617
482k
  ss_256[1] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1618
482k
  ss_256[3] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
1619
482k
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r);
1620
482k
  ss_256[0] = ss_256[1];
1621
482k
  ss_256[2] = ss_256[3];
1622
482k
}
convolve_2d_avx2.c:xy_y_convolve_4tap_8x2_avx2
Line
Count
Source
1613
482k
                                               __m256i r[2]) {
1614
482k
  __m256i s_256[2];
1615
482k
  s_256[0] = _mm256_loadu_si256((__m256i *)(src + 2 * 8));
1616
482k
  s_256[1] = _mm256_loadu_si256((__m256i *)(src + 3 * 8));
1617
482k
  ss_256[1] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1618
482k
  ss_256[3] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
1619
482k
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r);
1620
482k
  ss_256[0] = ss_256[1];
1621
482k
  ss_256[2] = ss_256[3];
1622
482k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_4tap_8x2_avx2
1623
1624
static inline void xy_y_convolve_4tap_8x2_half_pel_avx2(
1625
    const int16_t *const src, const __m256i coeffs[1], __m256i s_256[4],
1626
88.0k
    __m256i r[2]) {
1627
88.0k
  __m256i a_256[2];
1628
88.0k
  s_256[2] = _mm256_loadu_si256((__m256i *)(src + 2 * 8));
1629
88.0k
  s_256[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 8));
1630
88.0k
  a_256[0] = _mm256_add_epi16(s_256[0], s_256[3]);
1631
88.0k
  a_256[1] = _mm256_add_epi16(s_256[1], s_256[2]);
1632
88.0k
  xy_y_convolve_2tap_16_avx2(a_256[0], a_256[1], coeffs, r);
1633
88.0k
  s_256[0] = s_256[2];
1634
88.0k
  s_256[1] = s_256[3];
1635
88.0k
}
convolve_2d_avx2.c:xy_y_convolve_4tap_8x2_half_pel_avx2
Line
Count
Source
1626
88.0k
    __m256i r[2]) {
1627
88.0k
  __m256i a_256[2];
1628
88.0k
  s_256[2] = _mm256_loadu_si256((__m256i *)(src + 2 * 8));
1629
88.0k
  s_256[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 8));
1630
88.0k
  a_256[0] = _mm256_add_epi16(s_256[0], s_256[3]);
1631
88.0k
  a_256[1] = _mm256_add_epi16(s_256[1], s_256[2]);
1632
88.0k
  xy_y_convolve_2tap_16_avx2(a_256[0], a_256[1], coeffs, r);
1633
88.0k
  s_256[0] = s_256[2];
1634
88.0k
  s_256[1] = s_256[3];
1635
88.0k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_4tap_8x2_half_pel_avx2
1636
1637
static inline void xy_y_convolve_4tap_16x2_avx2(
1638
    const int16_t *const src, __m256i s_256[4], __m256i ss_256[4],
1639
266k
    __m256i tt_256[4], const __m256i coeffs[2], __m256i r[4]) {
1640
266k
  s_256[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 16));
1641
266k
  ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
1642
266k
  ss_256[3] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
1643
266k
  s_256[2] = _mm256_loadu_si256((__m256i *)(src + 4 * 16));
1644
266k
  tt_256[1] = _mm256_unpacklo_epi16(s_256[3], s_256[2]);
1645
266k
  tt_256[3] = _mm256_unpackhi_epi16(s_256[3], s_256[2]);
1646
266k
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 0);
1647
266k
  xy_y_convolve_4tap_16_avx2(tt_256, coeffs, r + 2);
1648
266k
  ss_256[0] = ss_256[1];
1649
266k
  ss_256[2] = ss_256[3];
1650
266k
  tt_256[0] = tt_256[1];
1651
266k
  tt_256[2] = tt_256[3];
1652
266k
}
convolve_2d_avx2.c:xy_y_convolve_4tap_16x2_avx2
Line
Count
Source
1639
266k
    __m256i tt_256[4], const __m256i coeffs[2], __m256i r[4]) {
1640
266k
  s_256[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 16));
1641
266k
  ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
1642
266k
  ss_256[3] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
1643
266k
  s_256[2] = _mm256_loadu_si256((__m256i *)(src + 4 * 16));
1644
266k
  tt_256[1] = _mm256_unpacklo_epi16(s_256[3], s_256[2]);
1645
266k
  tt_256[3] = _mm256_unpackhi_epi16(s_256[3], s_256[2]);
1646
266k
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 0);
1647
266k
  xy_y_convolve_4tap_16_avx2(tt_256, coeffs, r + 2);
1648
266k
  ss_256[0] = ss_256[1];
1649
266k
  ss_256[2] = ss_256[3];
1650
266k
  tt_256[0] = tt_256[1];
1651
266k
  tt_256[2] = tt_256[3];
1652
266k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_4tap_16x2_avx2
1653
1654
static inline void xy_y_convolve_4tap_32x2_avx2(
1655
    const int16_t *const src, const ptrdiff_t stride, __m256i s_256[4],
1656
    __m256i ss_256[4], __m256i tt_256[4], const __m256i coeffs[2],
1657
444k
    __m256i r[4]) {
1658
444k
  s_256[3] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
1659
444k
  ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
1660
444k
  ss_256[3] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
1661
444k
  s_256[2] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
1662
444k
  tt_256[1] = _mm256_unpacklo_epi16(s_256[3], s_256[2]);
1663
444k
  tt_256[3] = _mm256_unpackhi_epi16(s_256[3], s_256[2]);
1664
444k
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 0);
1665
444k
  xy_y_convolve_4tap_16_avx2(tt_256, coeffs, r + 2);
1666
444k
  ss_256[0] = ss_256[1];
1667
444k
  ss_256[2] = ss_256[3];
1668
444k
  tt_256[0] = tt_256[1];
1669
444k
  tt_256[2] = tt_256[3];
1670
444k
}
convolve_2d_avx2.c:xy_y_convolve_4tap_32x2_avx2
Line
Count
Source
1657
444k
    __m256i r[4]) {
1658
444k
  s_256[3] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
1659
444k
  ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
1660
444k
  ss_256[3] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
1661
444k
  s_256[2] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
1662
444k
  tt_256[1] = _mm256_unpacklo_epi16(s_256[3], s_256[2]);
1663
444k
  tt_256[3] = _mm256_unpackhi_epi16(s_256[3], s_256[2]);
1664
444k
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 0);
1665
444k
  xy_y_convolve_4tap_16_avx2(tt_256, coeffs, r + 2);
1666
444k
  ss_256[0] = ss_256[1];
1667
444k
  ss_256[2] = ss_256[3];
1668
444k
  tt_256[0] = tt_256[1];
1669
444k
  tt_256[2] = tt_256[3];
1670
444k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_4tap_32x2_avx2
1671
1672
static inline void xy_y_convolve_4tap_16x2_half_pelavx2(
1673
    const int16_t *const src, __m256i s_256[5], const __m256i coeffs[1],
1674
49.0k
    __m256i r[4]) {
1675
49.0k
  __m256i a_256[2];
1676
1677
49.0k
  s_256[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 16));
1678
49.0k
  s_256[4] = _mm256_loadu_si256((__m256i *)(src + 4 * 16));
1679
1680
49.0k
  a_256[0] = _mm256_add_epi16(s_256[0], s_256[3]);
1681
49.0k
  a_256[1] = _mm256_add_epi16(s_256[1], s_256[2]);
1682
49.0k
  xy_y_convolve_2tap_16_avx2(a_256[0], a_256[1], coeffs, r + 0);
1683
1684
49.0k
  a_256[0] = _mm256_add_epi16(s_256[1], s_256[4]);
1685
49.0k
  a_256[1] = _mm256_add_epi16(s_256[2], s_256[3]);
1686
49.0k
  xy_y_convolve_2tap_16_avx2(a_256[0], a_256[1], coeffs, r + 2);
1687
1688
49.0k
  s_256[0] = s_256[2];
1689
49.0k
  s_256[1] = s_256[3];
1690
49.0k
  s_256[2] = s_256[4];
1691
49.0k
}
convolve_2d_avx2.c:xy_y_convolve_4tap_16x2_half_pelavx2
Line
Count
Source
1674
49.0k
    __m256i r[4]) {
1675
49.0k
  __m256i a_256[2];
1676
1677
49.0k
  s_256[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 16));
1678
49.0k
  s_256[4] = _mm256_loadu_si256((__m256i *)(src + 4 * 16));
1679
1680
49.0k
  a_256[0] = _mm256_add_epi16(s_256[0], s_256[3]);
1681
49.0k
  a_256[1] = _mm256_add_epi16(s_256[1], s_256[2]);
1682
49.0k
  xy_y_convolve_2tap_16_avx2(a_256[0], a_256[1], coeffs, r + 0);
1683
1684
49.0k
  a_256[0] = _mm256_add_epi16(s_256[1], s_256[4]);
1685
49.0k
  a_256[1] = _mm256_add_epi16(s_256[2], s_256[3]);
1686
49.0k
  xy_y_convolve_2tap_16_avx2(a_256[0], a_256[1], coeffs, r + 2);
1687
1688
49.0k
  s_256[0] = s_256[2];
1689
49.0k
  s_256[1] = s_256[3];
1690
49.0k
  s_256[2] = s_256[4];
1691
49.0k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_4tap_16x2_half_pelavx2
1692
1693
static inline __m128i xy_y_convolve_6tap_2x2_sse2(const int16_t *const src,
1694
                                                  __m128i s_32[6],
1695
                                                  __m128i ss_128[3],
1696
239k
                                                  const __m128i coeffs[3]) {
1697
239k
  s_32[5] = _mm_cvtsi32_si128(loadu_int32(src + 5 * 2));
1698
239k
  const __m128i src45 = _mm_unpacklo_epi32(s_32[4], s_32[5]);
1699
239k
  s_32[4] = _mm_cvtsi32_si128(loadu_int32(src + 6 * 2));
1700
239k
  const __m128i src56 = _mm_unpacklo_epi32(s_32[5], s_32[4]);
1701
239k
  ss_128[2] = _mm_unpacklo_epi16(src45, src56);
1702
239k
  const __m128i r = convolve16_6tap_sse2(ss_128, coeffs);
1703
239k
  ss_128[0] = ss_128[1];
1704
239k
  ss_128[1] = ss_128[2];
1705
239k
  return r;
1706
239k
}
convolve_2d_avx2.c:xy_y_convolve_6tap_2x2_sse2
Line
Count
Source
1696
239k
                                                  const __m128i coeffs[3]) {
1697
239k
  s_32[5] = _mm_cvtsi32_si128(loadu_int32(src + 5 * 2));
1698
239k
  const __m128i src45 = _mm_unpacklo_epi32(s_32[4], s_32[5]);
1699
239k
  s_32[4] = _mm_cvtsi32_si128(loadu_int32(src + 6 * 2));
1700
239k
  const __m128i src56 = _mm_unpacklo_epi32(s_32[5], s_32[4]);
1701
239k
  ss_128[2] = _mm_unpacklo_epi16(src45, src56);
1702
239k
  const __m128i r = convolve16_6tap_sse2(ss_128, coeffs);
1703
239k
  ss_128[0] = ss_128[1];
1704
239k
  ss_128[1] = ss_128[2];
1705
239k
  return r;
1706
239k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_6tap_2x2_sse2
1707
1708
static inline __m256i xy_y_convolve_6tap_4x2_avx2(const int16_t *const src,
1709
                                                  __m128i s_64[6],
1710
                                                  __m256i ss_256[3],
1711
1.20M
                                                  const __m256i coeffs[3]) {
1712
1.20M
  __m256i s_256[2];
1713
1.20M
  s_64[5] = _mm_loadl_epi64((__m128i *)(src + 5 * 4));
1714
1.20M
  s_256[0] = _mm256_setr_m128i(s_64[4], s_64[5]);
1715
1.20M
  s_64[4] = _mm_loadl_epi64((__m128i *)(src + 6 * 4));
1716
1.20M
  s_256[1] = _mm256_setr_m128i(s_64[5], s_64[4]);
1717
1.20M
  ss_256[2] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1718
1.20M
  const __m256i r = convolve16_6tap_avx2(ss_256, coeffs);
1719
1.20M
  ss_256[0] = ss_256[1];
1720
1.20M
  ss_256[1] = ss_256[2];
1721
1.20M
  return r;
1722
1.20M
}
convolve_2d_avx2.c:xy_y_convolve_6tap_4x2_avx2
Line
Count
Source
1711
1.20M
                                                  const __m256i coeffs[3]) {
1712
1.20M
  __m256i s_256[2];
1713
1.20M
  s_64[5] = _mm_loadl_epi64((__m128i *)(src + 5 * 4));
1714
1.20M
  s_256[0] = _mm256_setr_m128i(s_64[4], s_64[5]);
1715
1.20M
  s_64[4] = _mm_loadl_epi64((__m128i *)(src + 6 * 4));
1716
1.20M
  s_256[1] = _mm256_setr_m128i(s_64[5], s_64[4]);
1717
1.20M
  ss_256[2] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1718
1.20M
  const __m256i r = convolve16_6tap_avx2(ss_256, coeffs);
1719
1.20M
  ss_256[0] = ss_256[1];
1720
1.20M
  ss_256[1] = ss_256[2];
1721
1.20M
  return r;
1722
1.20M
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_6tap_4x2_avx2
1723
1724
static inline void xy_y_convolve_6tap_16_avx2(const __m256i ss[6],
1725
                                              const __m256i coeffs[3],
1726
10.0M
                                              __m256i r[2]) {
1727
10.0M
  r[0] = convolve16_6tap_avx2(ss, coeffs);
1728
10.0M
  r[1] = convolve16_6tap_avx2(ss + 3, coeffs);
1729
10.0M
}
convolve_2d_avx2.c:xy_y_convolve_6tap_16_avx2
Line
Count
Source
1726
10.0M
                                              __m256i r[2]) {
1727
10.0M
  r[0] = convolve16_6tap_avx2(ss, coeffs);
1728
10.0M
  r[1] = convolve16_6tap_avx2(ss + 3, coeffs);
1729
10.0M
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_6tap_16_avx2
1730
1731
static inline void xy_y_convolve_6tap_8x2_avx2(const int16_t *const src,
1732
                                               __m256i ss_256[6],
1733
                                               const __m256i coeffs[3],
1734
1.23M
                                               __m256i r[2]) {
1735
1.23M
  __m256i s_256[2];
1736
1.23M
  s_256[0] = _mm256_loadu_si256((__m256i *)(src + 4 * 8));
1737
1.23M
  s_256[1] = _mm256_loadu_si256((__m256i *)(src + 5 * 8));
1738
1.23M
  ss_256[2] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1739
1.23M
  ss_256[5] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
1740
1.23M
  xy_y_convolve_6tap_16_avx2(ss_256, coeffs, r);
1741
1.23M
  ss_256[0] = ss_256[1];
1742
1.23M
  ss_256[1] = ss_256[2];
1743
1.23M
  ss_256[3] = ss_256[4];
1744
1.23M
  ss_256[4] = ss_256[5];
1745
1.23M
}
convolve_2d_avx2.c:xy_y_convolve_6tap_8x2_avx2
Line
Count
Source
1734
1.23M
                                               __m256i r[2]) {
1735
1.23M
  __m256i s_256[2];
1736
1.23M
  s_256[0] = _mm256_loadu_si256((__m256i *)(src + 4 * 8));
1737
1.23M
  s_256[1] = _mm256_loadu_si256((__m256i *)(src + 5 * 8));
1738
1.23M
  ss_256[2] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1739
1.23M
  ss_256[5] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
1740
1.23M
  xy_y_convolve_6tap_16_avx2(ss_256, coeffs, r);
1741
1.23M
  ss_256[0] = ss_256[1];
1742
1.23M
  ss_256[1] = ss_256[2];
1743
1.23M
  ss_256[3] = ss_256[4];
1744
1.23M
  ss_256[4] = ss_256[5];
1745
1.23M
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_6tap_8x2_avx2
1746
1747
static inline void xy_y_convolve_6tap_8x2_half_pel_avx2(
1748
    const int16_t *const src, const __m256i coeffs[2], __m256i s_256[6],
1749
358k
    __m256i r[2]) {
1750
358k
  __m256i a_256[2], ss_256[4];
1751
358k
  s_256[4] = _mm256_loadu_si256((__m256i *)(src + 4 * 8));
1752
358k
  s_256[5] = _mm256_loadu_si256((__m256i *)(src + 5 * 8));
1753
358k
  a_256[0] = _mm256_add_epi16(s_256[0], s_256[5]);
1754
358k
  a_256[1] = _mm256_add_epi16(s_256[1], s_256[4]);
1755
358k
  ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
1756
358k
  ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
1757
358k
  ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
1758
358k
  ss_256[3] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
1759
358k
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r);
1760
358k
  s_256[0] = s_256[2];
1761
358k
  s_256[1] = s_256[3];
1762
358k
  s_256[2] = s_256[4];
1763
358k
  s_256[3] = s_256[5];
1764
358k
}
convolve_2d_avx2.c:xy_y_convolve_6tap_8x2_half_pel_avx2
Line
Count
Source
1749
358k
    __m256i r[2]) {
1750
358k
  __m256i a_256[2], ss_256[4];
1751
358k
  s_256[4] = _mm256_loadu_si256((__m256i *)(src + 4 * 8));
1752
358k
  s_256[5] = _mm256_loadu_si256((__m256i *)(src + 5 * 8));
1753
358k
  a_256[0] = _mm256_add_epi16(s_256[0], s_256[5]);
1754
358k
  a_256[1] = _mm256_add_epi16(s_256[1], s_256[4]);
1755
358k
  ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
1756
358k
  ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
1757
358k
  ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
1758
358k
  ss_256[3] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
1759
358k
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r);
1760
358k
  s_256[0] = s_256[2];
1761
358k
  s_256[1] = s_256[3];
1762
358k
  s_256[2] = s_256[4];
1763
358k
  s_256[3] = s_256[5];
1764
358k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_6tap_8x2_half_pel_avx2
1765
1766
static inline void xy_y_convolve_6tap_16x2_avx2(
1767
    const int16_t *const src, const ptrdiff_t stride, __m256i s_256[6],
1768
    __m256i ss_256[6], __m256i tt_256[6], const __m256i coeffs[3],
1769
4.38M
    __m256i r[4]) {
1770
4.38M
  s_256[5] = _mm256_loadu_si256((__m256i *)(src + 5 * stride));
1771
4.38M
  ss_256[2] = _mm256_unpacklo_epi16(s_256[4], s_256[5]);
1772
4.38M
  ss_256[5] = _mm256_unpackhi_epi16(s_256[4], s_256[5]);
1773
4.38M
  s_256[4] = _mm256_loadu_si256((__m256i *)(src + 6 * stride));
1774
4.38M
  tt_256[2] = _mm256_unpacklo_epi16(s_256[5], s_256[4]);
1775
4.38M
  tt_256[5] = _mm256_unpackhi_epi16(s_256[5], s_256[4]);
1776
1777
4.38M
  xy_y_convolve_6tap_16_avx2(ss_256, coeffs, r + 0);
1778
4.38M
  xy_y_convolve_6tap_16_avx2(tt_256, coeffs, r + 2);
1779
1780
4.38M
  ss_256[0] = ss_256[1];
1781
4.38M
  ss_256[1] = ss_256[2];
1782
4.38M
  ss_256[3] = ss_256[4];
1783
4.38M
  ss_256[4] = ss_256[5];
1784
1785
4.38M
  tt_256[0] = tt_256[1];
1786
4.38M
  tt_256[1] = tt_256[2];
1787
4.38M
  tt_256[3] = tt_256[4];
1788
4.38M
  tt_256[4] = tt_256[5];
1789
4.38M
}
convolve_2d_avx2.c:xy_y_convolve_6tap_16x2_avx2
Line
Count
Source
1769
4.38M
    __m256i r[4]) {
1770
4.38M
  s_256[5] = _mm256_loadu_si256((__m256i *)(src + 5 * stride));
1771
4.38M
  ss_256[2] = _mm256_unpacklo_epi16(s_256[4], s_256[5]);
1772
4.38M
  ss_256[5] = _mm256_unpackhi_epi16(s_256[4], s_256[5]);
1773
4.38M
  s_256[4] = _mm256_loadu_si256((__m256i *)(src + 6 * stride));
1774
4.38M
  tt_256[2] = _mm256_unpacklo_epi16(s_256[5], s_256[4]);
1775
4.38M
  tt_256[5] = _mm256_unpackhi_epi16(s_256[5], s_256[4]);
1776
1777
4.38M
  xy_y_convolve_6tap_16_avx2(ss_256, coeffs, r + 0);
1778
4.38M
  xy_y_convolve_6tap_16_avx2(tt_256, coeffs, r + 2);
1779
1780
4.38M
  ss_256[0] = ss_256[1];
1781
4.38M
  ss_256[1] = ss_256[2];
1782
4.38M
  ss_256[3] = ss_256[4];
1783
4.38M
  ss_256[4] = ss_256[5];
1784
1785
4.38M
  tt_256[0] = tt_256[1];
1786
4.38M
  tt_256[1] = tt_256[2];
1787
4.38M
  tt_256[3] = tt_256[4];
1788
4.38M
  tt_256[4] = tt_256[5];
1789
4.38M
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_6tap_16x2_avx2
1790
1791
static inline void xy_y_convolve_6tap_16x2_half_pel_avx2(
1792
    const int16_t *const src, const ptrdiff_t stride, __m256i s_256[6],
1793
321k
    __m256i ss_256[4], const __m256i coeffs[2], __m256i r[4]) {
1794
321k
  __m256i a_256[2];
1795
1796
321k
  s_256[5] = _mm256_loadu_si256((__m256i *)(src + 5 * stride));
1797
321k
  a_256[0] = _mm256_add_epi16(s_256[0], s_256[5]);
1798
321k
  a_256[1] = _mm256_add_epi16(s_256[1], s_256[4]);
1799
321k
  ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
1800
321k
  ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
1801
321k
  ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
1802
321k
  ss_256[3] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
1803
321k
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 0);
1804
1805
321k
  a_256[1] = _mm256_add_epi16(s_256[2], s_256[5]);
1806
321k
  s_256[0] = s_256[2];
1807
321k
  s_256[2] = s_256[4];
1808
321k
  s_256[4] = _mm256_loadu_si256((__m256i *)(src + 6 * stride));
1809
321k
  a_256[0] = _mm256_add_epi16(s_256[1], s_256[4]);
1810
321k
  s_256[1] = s_256[3];
1811
321k
  s_256[3] = s_256[5];
1812
321k
  ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
1813
321k
  ss_256[1] = _mm256_unpacklo_epi16(s_256[1], s_256[2]);
1814
321k
  ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
1815
321k
  ss_256[3] = _mm256_unpackhi_epi16(s_256[1], s_256[2]);
1816
321k
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 2);
1817
321k
}
convolve_2d_avx2.c:xy_y_convolve_6tap_16x2_half_pel_avx2
Line
Count
Source
1793
321k
    __m256i ss_256[4], const __m256i coeffs[2], __m256i r[4]) {
1794
321k
  __m256i a_256[2];
1795
1796
321k
  s_256[5] = _mm256_loadu_si256((__m256i *)(src + 5 * stride));
1797
321k
  a_256[0] = _mm256_add_epi16(s_256[0], s_256[5]);
1798
321k
  a_256[1] = _mm256_add_epi16(s_256[1], s_256[4]);
1799
321k
  ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
1800
321k
  ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
1801
321k
  ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
1802
321k
  ss_256[3] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
1803
321k
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 0);
1804
1805
321k
  a_256[1] = _mm256_add_epi16(s_256[2], s_256[5]);
1806
321k
  s_256[0] = s_256[2];
1807
321k
  s_256[2] = s_256[4];
1808
321k
  s_256[4] = _mm256_loadu_si256((__m256i *)(src + 6 * stride));
1809
321k
  a_256[0] = _mm256_add_epi16(s_256[1], s_256[4]);
1810
321k
  s_256[1] = s_256[3];
1811
321k
  s_256[3] = s_256[5];
1812
321k
  ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
1813
321k
  ss_256[1] = _mm256_unpacklo_epi16(s_256[1], s_256[2]);
1814
321k
  ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
1815
321k
  ss_256[3] = _mm256_unpackhi_epi16(s_256[1], s_256[2]);
1816
321k
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 2);
1817
321k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_6tap_16x2_half_pel_avx2
1818
1819
static inline __m128i xy_y_convolve_8tap_2x2_sse2(const int16_t *const src,
1820
                                                  __m128i s_32[8],
1821
                                                  __m128i ss_128[4],
1822
10.9k
                                                  const __m128i coeffs[4]) {
1823
10.9k
  s_32[7] = _mm_cvtsi32_si128(loadu_int32(src + 7 * 2));
1824
10.9k
  const __m128i src67 = _mm_unpacklo_epi32(s_32[6], s_32[7]);
1825
10.9k
  s_32[6] = _mm_cvtsi32_si128(loadu_int32(src + 8 * 2));
1826
10.9k
  const __m128i src78 = _mm_unpacklo_epi32(s_32[7], s_32[6]);
1827
10.9k
  ss_128[3] = _mm_unpacklo_epi16(src67, src78);
1828
10.9k
  const __m128i r = convolve16_8tap_sse2(ss_128, coeffs);
1829
10.9k
  ss_128[0] = ss_128[1];
1830
10.9k
  ss_128[1] = ss_128[2];
1831
10.9k
  ss_128[2] = ss_128[3];
1832
10.9k
  return r;
1833
10.9k
}
convolve_2d_avx2.c:xy_y_convolve_8tap_2x2_sse2
Line
Count
Source
1822
10.9k
                                                  const __m128i coeffs[4]) {
1823
10.9k
  s_32[7] = _mm_cvtsi32_si128(loadu_int32(src + 7 * 2));
1824
10.9k
  const __m128i src67 = _mm_unpacklo_epi32(s_32[6], s_32[7]);
1825
10.9k
  s_32[6] = _mm_cvtsi32_si128(loadu_int32(src + 8 * 2));
1826
10.9k
  const __m128i src78 = _mm_unpacklo_epi32(s_32[7], s_32[6]);
1827
10.9k
  ss_128[3] = _mm_unpacklo_epi16(src67, src78);
1828
10.9k
  const __m128i r = convolve16_8tap_sse2(ss_128, coeffs);
1829
10.9k
  ss_128[0] = ss_128[1];
1830
10.9k
  ss_128[1] = ss_128[2];
1831
10.9k
  ss_128[2] = ss_128[3];
1832
10.9k
  return r;
1833
10.9k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_8tap_2x2_sse2
1834
1835
static inline __m256i xy_y_convolve_8tap_4x2_avx2(const int16_t *const src,
1836
                                                  __m128i s_64[8],
1837
                                                  __m256i ss_256[4],
1838
52.6k
                                                  const __m256i coeffs[4]) {
1839
52.6k
  __m256i s_256[2];
1840
52.6k
  s_64[7] = _mm_loadl_epi64((__m128i *)(src + 7 * 4));
1841
52.6k
  s_256[0] = _mm256_setr_m128i(s_64[6], s_64[7]);
1842
52.6k
  s_64[6] = _mm_loadl_epi64((__m128i *)(src + 8 * 4));
1843
52.6k
  s_256[1] = _mm256_setr_m128i(s_64[7], s_64[6]);
1844
52.6k
  ss_256[3] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1845
52.6k
  const __m256i r = convolve16_8tap_avx2(ss_256, coeffs);
1846
52.6k
  ss_256[0] = ss_256[1];
1847
52.6k
  ss_256[1] = ss_256[2];
1848
52.6k
  ss_256[2] = ss_256[3];
1849
52.6k
  return r;
1850
52.6k
}
convolve_2d_avx2.c:xy_y_convolve_8tap_4x2_avx2
Line
Count
Source
1838
52.6k
                                                  const __m256i coeffs[4]) {
1839
52.6k
  __m256i s_256[2];
1840
52.6k
  s_64[7] = _mm_loadl_epi64((__m128i *)(src + 7 * 4));
1841
52.6k
  s_256[0] = _mm256_setr_m128i(s_64[6], s_64[7]);
1842
52.6k
  s_64[6] = _mm_loadl_epi64((__m128i *)(src + 8 * 4));
1843
52.6k
  s_256[1] = _mm256_setr_m128i(s_64[7], s_64[6]);
1844
52.6k
  ss_256[3] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1845
52.6k
  const __m256i r = convolve16_8tap_avx2(ss_256, coeffs);
1846
52.6k
  ss_256[0] = ss_256[1];
1847
52.6k
  ss_256[1] = ss_256[2];
1848
52.6k
  ss_256[2] = ss_256[3];
1849
52.6k
  return r;
1850
52.6k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_8tap_4x2_avx2
1851
1852
static inline void xy_y_convolve_8tap_16_avx2(const __m256i *const ss,
1853
                                              const __m256i coeffs[4],
1854
1.86M
                                              __m256i r[2]) {
1855
1.86M
  r[0] = convolve16_8tap_avx2(ss, coeffs);
1856
1.86M
  r[1] = convolve16_8tap_avx2(ss + 4, coeffs);
1857
1.86M
}
convolve_2d_avx2.c:xy_y_convolve_8tap_16_avx2
Line
Count
Source
1854
1.86M
                                              __m256i r[2]) {
1855
1.86M
  r[0] = convolve16_8tap_avx2(ss, coeffs);
1856
1.86M
  r[1] = convolve16_8tap_avx2(ss + 4, coeffs);
1857
1.86M
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_8tap_16_avx2
1858
1859
static inline void xy_y_convolve_8tap_8x2_avx2(const int16_t *const src,
1860
                                               __m256i ss_256[8],
1861
                                               const __m256i coeffs[4],
1862
43.1k
                                               __m256i r[2]) {
1863
43.1k
  __m256i s_256[2];
1864
43.1k
  s_256[0] = _mm256_loadu_si256((__m256i *)(src + 6 * 8));
1865
43.1k
  s_256[1] = _mm256_loadu_si256((__m256i *)(src + 7 * 8));
1866
43.1k
  ss_256[3] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1867
43.1k
  ss_256[7] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
1868
43.1k
  xy_y_convolve_8tap_16_avx2(ss_256, coeffs, r);
1869
43.1k
  ss_256[0] = ss_256[1];
1870
43.1k
  ss_256[1] = ss_256[2];
1871
43.1k
  ss_256[2] = ss_256[3];
1872
43.1k
  ss_256[4] = ss_256[5];
1873
43.1k
  ss_256[5] = ss_256[6];
1874
43.1k
  ss_256[6] = ss_256[7];
1875
43.1k
}
convolve_2d_avx2.c:xy_y_convolve_8tap_8x2_avx2
Line
Count
Source
1862
43.1k
                                               __m256i r[2]) {
1863
43.1k
  __m256i s_256[2];
1864
43.1k
  s_256[0] = _mm256_loadu_si256((__m256i *)(src + 6 * 8));
1865
43.1k
  s_256[1] = _mm256_loadu_si256((__m256i *)(src + 7 * 8));
1866
43.1k
  ss_256[3] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1867
43.1k
  ss_256[7] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
1868
43.1k
  xy_y_convolve_8tap_16_avx2(ss_256, coeffs, r);
1869
43.1k
  ss_256[0] = ss_256[1];
1870
43.1k
  ss_256[1] = ss_256[2];
1871
43.1k
  ss_256[2] = ss_256[3];
1872
43.1k
  ss_256[4] = ss_256[5];
1873
43.1k
  ss_256[5] = ss_256[6];
1874
43.1k
  ss_256[6] = ss_256[7];
1875
43.1k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_8tap_8x2_avx2
1876
1877
static inline void xy_y_convolve_8tap_8x2_half_pel_avx2(
1878
    const int16_t *const src, const __m256i coeffs[2], __m256i s_256[8],
1879
22.0k
    __m256i r[2]) {
1880
22.0k
  __m256i a_256[4], ss_256[4];
1881
1882
22.0k
  s_256[6] = _mm256_loadu_si256((__m256i *)(src + 6 * 8));
1883
22.0k
  s_256[7] = _mm256_loadu_si256((__m256i *)(src + 7 * 8));
1884
22.0k
  a_256[0] = _mm256_add_epi16(s_256[0], s_256[7]);
1885
22.0k
  a_256[1] = _mm256_add_epi16(s_256[1], s_256[6]);
1886
22.0k
  a_256[2] = _mm256_add_epi16(s_256[2], s_256[5]);
1887
22.0k
  a_256[3] = _mm256_add_epi16(s_256[3], s_256[4]);
1888
22.0k
  ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
1889
22.0k
  ss_256[1] = _mm256_unpacklo_epi16(a_256[2], a_256[3]);
1890
22.0k
  ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
1891
22.0k
  ss_256[3] = _mm256_unpackhi_epi16(a_256[2], a_256[3]);
1892
22.0k
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r);
1893
22.0k
  s_256[0] = s_256[2];
1894
22.0k
  s_256[1] = s_256[3];
1895
22.0k
  s_256[2] = s_256[4];
1896
22.0k
  s_256[3] = s_256[5];
1897
22.0k
  s_256[4] = s_256[6];
1898
22.0k
  s_256[5] = s_256[7];
1899
22.0k
}
convolve_2d_avx2.c:xy_y_convolve_8tap_8x2_half_pel_avx2
Line
Count
Source
1879
22.0k
    __m256i r[2]) {
1880
22.0k
  __m256i a_256[4], ss_256[4];
1881
1882
22.0k
  s_256[6] = _mm256_loadu_si256((__m256i *)(src + 6 * 8));
1883
22.0k
  s_256[7] = _mm256_loadu_si256((__m256i *)(src + 7 * 8));
1884
22.0k
  a_256[0] = _mm256_add_epi16(s_256[0], s_256[7]);
1885
22.0k
  a_256[1] = _mm256_add_epi16(s_256[1], s_256[6]);
1886
22.0k
  a_256[2] = _mm256_add_epi16(s_256[2], s_256[5]);
1887
22.0k
  a_256[3] = _mm256_add_epi16(s_256[3], s_256[4]);
1888
22.0k
  ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
1889
22.0k
  ss_256[1] = _mm256_unpacklo_epi16(a_256[2], a_256[3]);
1890
22.0k
  ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
1891
22.0k
  ss_256[3] = _mm256_unpackhi_epi16(a_256[2], a_256[3]);
1892
22.0k
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r);
1893
22.0k
  s_256[0] = s_256[2];
1894
22.0k
  s_256[1] = s_256[3];
1895
22.0k
  s_256[2] = s_256[4];
1896
22.0k
  s_256[3] = s_256[5];
1897
22.0k
  s_256[4] = s_256[6];
1898
22.0k
  s_256[5] = s_256[7];
1899
22.0k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_8tap_8x2_half_pel_avx2
1900
1901
static AOM_FORCE_INLINE void xy_y_convolve_8tap_16x2_avx2(
1902
    const int16_t *const src, const ptrdiff_t stride, const __m256i coeffs[4],
1903
911k
    __m256i s_256[8], __m256i ss_256[8], __m256i tt_256[8], __m256i r[4]) {
1904
911k
  s_256[7] = _mm256_loadu_si256((__m256i *)(src + 7 * stride));
1905
911k
  ss_256[3] = _mm256_unpacklo_epi16(s_256[6], s_256[7]);
1906
911k
  ss_256[7] = _mm256_unpackhi_epi16(s_256[6], s_256[7]);
1907
911k
  s_256[6] = _mm256_loadu_si256((__m256i *)(src + 8 * stride));
1908
911k
  tt_256[3] = _mm256_unpacklo_epi16(s_256[7], s_256[6]);
1909
911k
  tt_256[7] = _mm256_unpackhi_epi16(s_256[7], s_256[6]);
1910
1911
911k
  xy_y_convolve_8tap_16_avx2(ss_256, coeffs, r + 0);
1912
911k
  xy_y_convolve_8tap_16_avx2(tt_256, coeffs, r + 2);
1913
1914
911k
  ss_256[0] = ss_256[1];
1915
911k
  ss_256[1] = ss_256[2];
1916
911k
  ss_256[2] = ss_256[3];
1917
911k
  ss_256[4] = ss_256[5];
1918
911k
  ss_256[5] = ss_256[6];
1919
911k
  ss_256[6] = ss_256[7];
1920
1921
911k
  tt_256[0] = tt_256[1];
1922
911k
  tt_256[1] = tt_256[2];
1923
911k
  tt_256[2] = tt_256[3];
1924
911k
  tt_256[4] = tt_256[5];
1925
911k
  tt_256[5] = tt_256[6];
1926
911k
  tt_256[6] = tt_256[7];
1927
911k
}
convolve_2d_avx2.c:xy_y_convolve_8tap_16x2_avx2
Line
Count
Source
1903
911k
    __m256i s_256[8], __m256i ss_256[8], __m256i tt_256[8], __m256i r[4]) {
1904
911k
  s_256[7] = _mm256_loadu_si256((__m256i *)(src + 7 * stride));
1905
911k
  ss_256[3] = _mm256_unpacklo_epi16(s_256[6], s_256[7]);
1906
911k
  ss_256[7] = _mm256_unpackhi_epi16(s_256[6], s_256[7]);
1907
911k
  s_256[6] = _mm256_loadu_si256((__m256i *)(src + 8 * stride));
1908
911k
  tt_256[3] = _mm256_unpacklo_epi16(s_256[7], s_256[6]);
1909
911k
  tt_256[7] = _mm256_unpackhi_epi16(s_256[7], s_256[6]);
1910
1911
911k
  xy_y_convolve_8tap_16_avx2(ss_256, coeffs, r + 0);
1912
911k
  xy_y_convolve_8tap_16_avx2(tt_256, coeffs, r + 2);
1913
1914
911k
  ss_256[0] = ss_256[1];
1915
911k
  ss_256[1] = ss_256[2];
1916
911k
  ss_256[2] = ss_256[3];
1917
911k
  ss_256[4] = ss_256[5];
1918
911k
  ss_256[5] = ss_256[6];
1919
911k
  ss_256[6] = ss_256[7];
1920
1921
911k
  tt_256[0] = tt_256[1];
1922
911k
  tt_256[1] = tt_256[2];
1923
911k
  tt_256[2] = tt_256[3];
1924
911k
  tt_256[4] = tt_256[5];
1925
911k
  tt_256[5] = tt_256[6];
1926
911k
  tt_256[6] = tt_256[7];
1927
911k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_8tap_16x2_avx2
1928
1929
static inline void xy_y_convolve_8tap_16x2_half_pel_avx2(
1930
    const int16_t *const src, const ptrdiff_t stride, const __m256i coeffs[4],
1931
17.4k
    __m256i s_256[8], __m256i r[4]) {
1932
17.4k
  __m256i a_256[4], ss_256[4];
1933
17.4k
  s_256[7] = _mm256_loadu_si256((__m256i *)(src + 7 * stride));
1934
1935
17.4k
  a_256[0] = _mm256_add_epi16(s_256[0], s_256[7]);
1936
17.4k
  a_256[1] = _mm256_add_epi16(s_256[1], s_256[6]);
1937
17.4k
  a_256[2] = _mm256_add_epi16(s_256[2], s_256[5]);
1938
17.4k
  a_256[3] = _mm256_add_epi16(s_256[3], s_256[4]);
1939
17.4k
  ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
1940
17.4k
  ss_256[1] = _mm256_unpacklo_epi16(a_256[2], a_256[3]);
1941
17.4k
  ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
1942
17.4k
  ss_256[3] = _mm256_unpackhi_epi16(a_256[2], a_256[3]);
1943
1944
17.4k
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 0);
1945
1946
17.4k
  a_256[1] = _mm256_add_epi16(s_256[2], s_256[7]);
1947
17.4k
  a_256[2] = _mm256_add_epi16(s_256[3], s_256[6]);
1948
17.4k
  a_256[3] = _mm256_add_epi16(s_256[4], s_256[5]);
1949
17.4k
  s_256[0] = s_256[2];
1950
17.4k
  s_256[2] = s_256[4];
1951
17.4k
  s_256[4] = s_256[6];
1952
17.4k
  s_256[6] = _mm256_loadu_si256((__m256i *)(src + 8 * stride));
1953
1954
17.4k
  a_256[0] = _mm256_add_epi16(s_256[1], s_256[6]);
1955
17.4k
  s_256[1] = s_256[3];
1956
17.4k
  s_256[3] = s_256[5];
1957
17.4k
  s_256[5] = s_256[7];
1958
17.4k
  ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
1959
17.4k
  ss_256[1] = _mm256_unpacklo_epi16(a_256[2], a_256[3]);
1960
17.4k
  ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
1961
17.4k
  ss_256[3] = _mm256_unpackhi_epi16(a_256[2], a_256[3]);
1962
1963
17.4k
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 2);
1964
17.4k
}
convolve_2d_avx2.c:xy_y_convolve_8tap_16x2_half_pel_avx2
Line
Count
Source
1931
17.4k
    __m256i s_256[8], __m256i r[4]) {
1932
17.4k
  __m256i a_256[4], ss_256[4];
1933
17.4k
  s_256[7] = _mm256_loadu_si256((__m256i *)(src + 7 * stride));
1934
1935
17.4k
  a_256[0] = _mm256_add_epi16(s_256[0], s_256[7]);
1936
17.4k
  a_256[1] = _mm256_add_epi16(s_256[1], s_256[6]);
1937
17.4k
  a_256[2] = _mm256_add_epi16(s_256[2], s_256[5]);
1938
17.4k
  a_256[3] = _mm256_add_epi16(s_256[3], s_256[4]);
1939
17.4k
  ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
1940
17.4k
  ss_256[1] = _mm256_unpacklo_epi16(a_256[2], a_256[3]);
1941
17.4k
  ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
1942
17.4k
  ss_256[3] = _mm256_unpackhi_epi16(a_256[2], a_256[3]);
1943
1944
17.4k
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 0);
1945
1946
17.4k
  a_256[1] = _mm256_add_epi16(s_256[2], s_256[7]);
1947
17.4k
  a_256[2] = _mm256_add_epi16(s_256[3], s_256[6]);
1948
17.4k
  a_256[3] = _mm256_add_epi16(s_256[4], s_256[5]);
1949
17.4k
  s_256[0] = s_256[2];
1950
17.4k
  s_256[2] = s_256[4];
1951
17.4k
  s_256[4] = s_256[6];
1952
17.4k
  s_256[6] = _mm256_loadu_si256((__m256i *)(src + 8 * stride));
1953
1954
17.4k
  a_256[0] = _mm256_add_epi16(s_256[1], s_256[6]);
1955
17.4k
  s_256[1] = s_256[3];
1956
17.4k
  s_256[3] = s_256[5];
1957
17.4k
  s_256[5] = s_256[7];
1958
17.4k
  ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
1959
17.4k
  ss_256[1] = _mm256_unpacklo_epi16(a_256[2], a_256[3]);
1960
17.4k
  ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
1961
17.4k
  ss_256[3] = _mm256_unpackhi_epi16(a_256[2], a_256[3]);
1962
1963
17.4k
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 2);
1964
17.4k
}
Unexecuted instantiation: convolve_avx2.c:xy_y_convolve_8tap_16x2_half_pel_avx2
1965
1966
static inline void xy_y_round_store_8x2_avx2(const __m256i res[2],
1967
                                             uint8_t *const dst,
1968
2.28M
                                             const ptrdiff_t stride) {
1969
2.28M
  const __m256i r = xy_y_round_16_avx2(res);
1970
2.28M
  pack_store_8x2_avx2(r, dst, stride);
1971
2.28M
}
convolve_2d_avx2.c:xy_y_round_store_8x2_avx2
Line
Count
Source
1968
2.28M
                                             const ptrdiff_t stride) {
1969
2.28M
  const __m256i r = xy_y_round_16_avx2(res);
1970
2.28M
  pack_store_8x2_avx2(r, dst, stride);
1971
2.28M
}
Unexecuted instantiation: convolve_avx2.c:xy_y_round_store_8x2_avx2
1972
1973
static inline void xy_y_round_store_16x2_avx2(const __m256i res[4],
1974
                                              uint8_t *const dst,
1975
1.69M
                                              const ptrdiff_t stride) {
1976
1.69M
  const __m256i r0 = xy_y_round_16_avx2(res + 0);
1977
1.69M
  const __m256i r1 = xy_y_round_16_avx2(res + 2);
1978
1.69M
  xy_y_pack_store_16x2_avx2(r0, r1, dst, stride);
1979
1.69M
}
convolve_2d_avx2.c:xy_y_round_store_16x2_avx2
Line
Count
Source
1975
1.69M
                                              const ptrdiff_t stride) {
1976
1.69M
  const __m256i r0 = xy_y_round_16_avx2(res + 0);
1977
1.69M
  const __m256i r1 = xy_y_round_16_avx2(res + 2);
1978
1.69M
  xy_y_pack_store_16x2_avx2(r0, r1, dst, stride);
1979
1.69M
}
Unexecuted instantiation: convolve_avx2.c:xy_y_round_store_16x2_avx2
1980
1981
static inline void sr_y_round_store_32_avx2(const __m256i res[2],
1982
2.15M
                                            uint8_t *const dst) {
1983
2.15M
  __m256i r[2];
1984
1985
2.15M
  r[0] = sr_y_round_avx2(res[0]);
1986
2.15M
  r[1] = sr_y_round_avx2(res[1]);
1987
2.15M
  convolve_store_32_avx2(r[0], r[1], dst);
1988
2.15M
}
Unexecuted instantiation: convolve_2d_avx2.c:sr_y_round_store_32_avx2
convolve_avx2.c:sr_y_round_store_32_avx2
Line
Count
Source
1982
2.15M
                                            uint8_t *const dst) {
1983
2.15M
  __m256i r[2];
1984
1985
2.15M
  r[0] = sr_y_round_avx2(res[0]);
1986
2.15M
  r[1] = sr_y_round_avx2(res[1]);
1987
2.15M
  convolve_store_32_avx2(r[0], r[1], dst);
1988
2.15M
}
1989
1990
static inline void sr_y_round_store_32x2_avx2(const __m256i res[4],
1991
                                              uint8_t *const dst,
1992
995k
                                              const int32_t dst_stride) {
1993
995k
  sr_y_round_store_32_avx2(res, dst);
1994
995k
  sr_y_round_store_32_avx2(res + 2, dst + dst_stride);
1995
995k
}
Unexecuted instantiation: convolve_2d_avx2.c:sr_y_round_store_32x2_avx2
convolve_avx2.c:sr_y_round_store_32x2_avx2
Line
Count
Source
1992
995k
                                              const int32_t dst_stride) {
1993
995k
  sr_y_round_store_32_avx2(res, dst);
1994
995k
  sr_y_round_store_32_avx2(res + 2, dst + dst_stride);
1995
995k
}
1996
1997
static inline void sr_y_2tap_32_avx2(const uint8_t *const src,
1998
                                     const __m256i coeffs[1], const __m256i s0,
1999
169k
                                     __m256i *const s1, uint8_t *const dst) {
2000
169k
  __m256i r[2];
2001
169k
  y_convolve_2tap_32_avx2(src, coeffs, s0, s1, r);
2002
169k
  sr_y_round_store_32_avx2(r, dst);
2003
169k
}
Unexecuted instantiation: convolve_2d_avx2.c:sr_y_2tap_32_avx2
convolve_avx2.c:sr_y_2tap_32_avx2
Line
Count
Source
1999
169k
                                     __m256i *const s1, uint8_t *const dst) {
2000
169k
  __m256i r[2];
2001
169k
  y_convolve_2tap_32_avx2(src, coeffs, s0, s1, r);
2002
169k
  sr_y_round_store_32_avx2(r, dst);
2003
169k
}
2004
2005
static AOM_FORCE_INLINE void av1_convolve_y_sr_specialized_avx2(
2006
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
2007
    int32_t w, int32_t h, const InterpFilterParams *filter_params_y,
2008
747k
    const int32_t subpel_y_q4) {
2009
747k
  int32_t x, y;
2010
747k
  __m128i coeffs_128[4];
2011
747k
  __m256i coeffs_256[4];
2012
2013
747k
  int vert_tap = get_filter_tap(filter_params_y, subpel_y_q4);
2014
2015
747k
  if (vert_tap == 2) {
2016
    // vert_filt as 2 tap
2017
37.2k
    const uint8_t *src_ptr = src;
2018
2019
37.2k
    y = h;
2020
2021
37.2k
    if (subpel_y_q4 != 8) {
2022
22.8k
      if (w <= 8) {
2023
17.5k
        prepare_half_coeffs_2tap_ssse3(filter_params_y, subpel_y_q4,
2024
17.5k
                                       coeffs_128);
2025
2026
17.5k
        if (w == 2) {
2027
2.84k
          __m128i s_16[2];
2028
2029
2.84k
          s_16[0] = _mm_cvtsi32_si128(*(int16_t *)src_ptr);
2030
2031
5.59k
          do {
2032
5.59k
            const __m128i res = y_convolve_2tap_2x2_ssse3(src_ptr, src_stride,
2033
5.59k
                                                          coeffs_128, s_16);
2034
5.59k
            const __m128i r = sr_y_round_sse2(res);
2035
5.59k
            pack_store_2x2_sse2(r, dst, dst_stride);
2036
5.59k
            src_ptr += 2 * src_stride;
2037
5.59k
            dst += 2 * dst_stride;
2038
5.59k
            y -= 2;
2039
5.59k
          } while (y);
2040
14.6k
        } else if (w == 4) {
2041
8.80k
          __m128i s_32[2];
2042
2043
8.80k
          s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr));
2044
2045
26.0k
          do {
2046
26.0k
            const __m128i res = y_convolve_2tap_4x2_ssse3(src_ptr, src_stride,
2047
26.0k
                                                          coeffs_128, s_32);
2048
26.0k
            const __m128i r = sr_y_round_sse2(res);
2049
26.0k
            pack_store_4x2_sse2(r, dst, dst_stride);
2050
26.0k
            src_ptr += 2 * src_stride;
2051
26.0k
            dst += 2 * dst_stride;
2052
26.0k
            y -= 2;
2053
26.0k
          } while (y);
2054
8.80k
        } else {
2055
5.86k
          __m128i s_64[2], s_128[2];
2056
2057
5.86k
          assert(w == 8);
2058
2059
5.86k
          s_64[0] = _mm_loadl_epi64((__m128i *)src_ptr);
2060
2061
20.2k
          do {
2062
            // Note: Faster than binding to AVX2 registers.
2063
20.2k
            s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + src_stride));
2064
20.2k
            s_128[0] = _mm_unpacklo_epi64(s_64[0], s_64[1]);
2065
20.2k
            s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
2066
20.2k
            s_128[1] = _mm_unpacklo_epi64(s_64[1], s_64[0]);
2067
20.2k
            const __m128i ss0 = _mm_unpacklo_epi8(s_128[0], s_128[1]);
2068
20.2k
            const __m128i ss1 = _mm_unpackhi_epi8(s_128[0], s_128[1]);
2069
20.2k
            const __m128i res0 = convolve_2tap_ssse3(&ss0, coeffs_128);
2070
20.2k
            const __m128i res1 = convolve_2tap_ssse3(&ss1, coeffs_128);
2071
20.2k
            const __m128i r0 = sr_y_round_sse2(res0);
2072
20.2k
            const __m128i r1 = sr_y_round_sse2(res1);
2073
20.2k
            const __m128i d = _mm_packus_epi16(r0, r1);
2074
20.2k
            _mm_storel_epi64((__m128i *)dst, d);
2075
20.2k
            _mm_storeh_epi64((__m128i *)(dst + dst_stride), d);
2076
20.2k
            src_ptr += 2 * src_stride;
2077
20.2k
            dst += 2 * dst_stride;
2078
20.2k
            y -= 2;
2079
20.2k
          } while (y);
2080
5.86k
        }
2081
17.5k
      } else {
2082
5.29k
        prepare_half_coeffs_2tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
2083
2084
5.29k
        if (w == 16) {
2085
2.99k
          __m128i s_128[2];
2086
2087
2.99k
          s_128[0] = _mm_loadu_si128((__m128i *)src_ptr);
2088
2089
18.8k
          do {
2090
18.8k
            __m256i r[2];
2091
2092
18.8k
            y_convolve_2tap_16x2_avx2(src_ptr, src_stride, coeffs_256, s_128,
2093
18.8k
                                      r);
2094
18.8k
            sr_y_round_store_16x2_avx2(r, dst, dst_stride);
2095
18.8k
            src_ptr += 2 * src_stride;
2096
18.8k
            dst += 2 * dst_stride;
2097
18.8k
            y -= 2;
2098
18.8k
          } while (y);
2099
2.99k
        } else if (w == 32) {
2100
1.30k
          __m256i s_256[2];
2101
2102
1.30k
          s_256[0] = _mm256_loadu_si256((__m256i *)src_ptr);
2103
2104
15.4k
          do {
2105
15.4k
            sr_y_2tap_32_avx2(src_ptr + src_stride, coeffs_256, s_256[0],
2106
15.4k
                              &s_256[1], dst);
2107
15.4k
            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride, coeffs_256, s_256[1],
2108
15.4k
                              &s_256[0], dst + dst_stride);
2109
15.4k
            src_ptr += 2 * src_stride;
2110
15.4k
            dst += 2 * dst_stride;
2111
15.4k
            y -= 2;
2112
15.4k
          } while (y);
2113
1.30k
        } else if (w == 64) {
2114
828
          __m256i s_256[2][2];
2115
2116
828
          s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
2117
828
          s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));
2118
2119
18.9k
          do {
2120
18.9k
            sr_y_2tap_32_avx2(src_ptr + src_stride, coeffs_256, s_256[0][0],
2121
18.9k
                              &s_256[1][0], dst);
2122
18.9k
            sr_y_2tap_32_avx2(src_ptr + src_stride + 32, coeffs_256,
2123
18.9k
                              s_256[0][1], &s_256[1][1], dst + 32);
2124
18.9k
            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride, coeffs_256, s_256[1][0],
2125
18.9k
                              &s_256[0][0], dst + dst_stride);
2126
18.9k
            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride + 32, coeffs_256,
2127
18.9k
                              s_256[1][1], &s_256[0][1], dst + dst_stride + 32);
2128
2129
18.9k
            src_ptr += 2 * src_stride;
2130
18.9k
            dst += 2 * dst_stride;
2131
18.9k
            y -= 2;
2132
18.9k
          } while (y);
2133
828
        } else {
2134
163
          __m256i s_256[2][4];
2135
2136
163
          assert(w == 128);
2137
2138
163
          s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
2139
163
          s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));
2140
163
          s_256[0][2] = _mm256_loadu_si256((__m256i *)(src_ptr + 2 * 32));
2141
163
          s_256[0][3] = _mm256_loadu_si256((__m256i *)(src_ptr + 3 * 32));
2142
2143
7.80k
          do {
2144
7.80k
            sr_y_2tap_32_avx2(src_ptr + src_stride, coeffs_256, s_256[0][0],
2145
7.80k
                              &s_256[1][0], dst);
2146
7.80k
            sr_y_2tap_32_avx2(src_ptr + src_stride + 1 * 32, coeffs_256,
2147
7.80k
                              s_256[0][1], &s_256[1][1], dst + 1 * 32);
2148
7.80k
            sr_y_2tap_32_avx2(src_ptr + src_stride + 2 * 32, coeffs_256,
2149
7.80k
                              s_256[0][2], &s_256[1][2], dst + 2 * 32);
2150
7.80k
            sr_y_2tap_32_avx2(src_ptr + src_stride + 3 * 32, coeffs_256,
2151
7.80k
                              s_256[0][3], &s_256[1][3], dst + 3 * 32);
2152
2153
7.80k
            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride, coeffs_256, s_256[1][0],
2154
7.80k
                              &s_256[0][0], dst + dst_stride);
2155
7.80k
            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride + 1 * 32, coeffs_256,
2156
7.80k
                              s_256[1][1], &s_256[0][1],
2157
7.80k
                              dst + dst_stride + 1 * 32);
2158
7.80k
            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride + 2 * 32, coeffs_256,
2159
7.80k
                              s_256[1][2], &s_256[0][2],
2160
7.80k
                              dst + dst_stride + 2 * 32);
2161
7.80k
            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride + 3 * 32, coeffs_256,
2162
7.80k
                              s_256[1][3], &s_256[0][3],
2163
7.80k
                              dst + dst_stride + 3 * 32);
2164
2165
7.80k
            src_ptr += 2 * src_stride;
2166
7.80k
            dst += 2 * dst_stride;
2167
7.80k
            y -= 2;
2168
7.80k
          } while (y);
2169
163
        }
2170
5.29k
      }
2171
22.8k
    } else {
2172
      // average to get half pel
2173
14.4k
      if (w <= 8) {
2174
10.7k
        if (w == 2) {
2175
1.31k
          __m128i s_16[2];
2176
2177
1.31k
          s_16[0] = _mm_cvtsi32_si128(*(int16_t *)src_ptr);
2178
2179
2.80k
          do {
2180
2.80k
            s_16[1] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + src_stride));
2181
2.80k
            const __m128i d0 = _mm_avg_epu8(s_16[0], s_16[1]);
2182
2.80k
            *(int16_t *)dst = (int16_t)_mm_cvtsi128_si32(d0);
2183
2.80k
            s_16[0] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 2 * src_stride));
2184
2.80k
            const __m128i d1 = _mm_avg_epu8(s_16[1], s_16[0]);
2185
2.80k
            *(int16_t *)(dst + dst_stride) = (int16_t)_mm_cvtsi128_si32(d1);
2186
2.80k
            src_ptr += 2 * src_stride;
2187
2.80k
            dst += 2 * dst_stride;
2188
2.80k
            y -= 2;
2189
2.80k
          } while (y);
2190
9.47k
        } else if (w == 4) {
2191
5.18k
          __m128i s_32[2];
2192
2193
5.18k
          s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr));
2194
2195
16.5k
          do {
2196
16.5k
            s_32[1] = _mm_cvtsi32_si128(loadu_int32(src_ptr + src_stride));
2197
16.5k
            const __m128i d0 = _mm_avg_epu8(s_32[0], s_32[1]);
2198
16.5k
            xx_storel_32(dst, d0);
2199
16.5k
            s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 2 * src_stride));
2200
16.5k
            const __m128i d1 = _mm_avg_epu8(s_32[1], s_32[0]);
2201
16.5k
            xx_storel_32(dst + dst_stride, d1);
2202
16.5k
            src_ptr += 2 * src_stride;
2203
16.5k
            dst += 2 * dst_stride;
2204
16.5k
            y -= 2;
2205
16.5k
          } while (y);
2206
5.18k
        } else {
2207
4.29k
          __m128i s_64[2];
2208
2209
4.29k
          assert(w == 8);
2210
2211
4.29k
          s_64[0] = _mm_loadl_epi64((__m128i *)src_ptr);
2212
2213
17.4k
          do {
2214
            // Note: Faster than binding to AVX2 registers.
2215
17.4k
            s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + src_stride));
2216
17.4k
            const __m128i d0 = _mm_avg_epu8(s_64[0], s_64[1]);
2217
17.4k
            _mm_storel_epi64((__m128i *)dst, d0);
2218
17.4k
            s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
2219
17.4k
            const __m128i d1 = _mm_avg_epu8(s_64[1], s_64[0]);
2220
17.4k
            _mm_storel_epi64((__m128i *)(dst + dst_stride), d1);
2221
17.4k
            src_ptr += 2 * src_stride;
2222
17.4k
            dst += 2 * dst_stride;
2223
17.4k
            y -= 2;
2224
17.4k
          } while (y);
2225
4.29k
        }
2226
10.7k
      } else if (w == 16) {
2227
2.25k
        __m128i s_128[2];
2228
2229
2.25k
        s_128[0] = _mm_loadu_si128((__m128i *)src_ptr);
2230
2231
13.8k
        do {
2232
13.8k
          s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + src_stride));
2233
13.8k
          const __m128i d0 = _mm_avg_epu8(s_128[0], s_128[1]);
2234
13.8k
          _mm_storeu_si128((__m128i *)dst, d0);
2235
13.8k
          s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
2236
13.8k
          const __m128i d1 = _mm_avg_epu8(s_128[1], s_128[0]);
2237
13.8k
          _mm_storeu_si128((__m128i *)(dst + dst_stride), d1);
2238
13.8k
          src_ptr += 2 * src_stride;
2239
13.8k
          dst += 2 * dst_stride;
2240
13.8k
          y -= 2;
2241
13.8k
        } while (y);
2242
2.25k
      } else if (w == 32) {
2243
884
        __m256i s_256[2];
2244
2245
884
        s_256[0] = _mm256_loadu_si256((__m256i *)src_ptr);
2246
2247
10.6k
        do {
2248
10.6k
          sr_y_2tap_32_avg_avx2(src_ptr + src_stride, s_256[0], &s_256[1], dst);
2249
10.6k
          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride, s_256[1], &s_256[0],
2250
10.6k
                                dst + dst_stride);
2251
10.6k
          src_ptr += 2 * src_stride;
2252
10.6k
          dst += 2 * dst_stride;
2253
10.6k
          y -= 2;
2254
10.6k
        } while (y);
2255
884
      } else if (w == 64) {
2256
344
        __m256i s_256[2][2];
2257
2258
344
        s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
2259
344
        s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));
2260
2261
8.46k
        do {
2262
8.46k
          sr_y_2tap_32_avg_avx2(src_ptr + src_stride, s_256[0][0], &s_256[1][0],
2263
8.46k
                                dst);
2264
8.46k
          sr_y_2tap_32_avg_avx2(src_ptr + src_stride + 32, s_256[0][1],
2265
8.46k
                                &s_256[1][1], dst + 32);
2266
2267
8.46k
          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride, s_256[1][0],
2268
8.46k
                                &s_256[0][0], dst + dst_stride);
2269
8.46k
          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride + 32, s_256[1][1],
2270
8.46k
                                &s_256[0][1], dst + dst_stride + 32);
2271
2272
8.46k
          src_ptr += 2 * src_stride;
2273
8.46k
          dst += 2 * dst_stride;
2274
8.46k
          y -= 2;
2275
8.46k
        } while (y);
2276
344
      } else {
2277
178
        __m256i s_256[2][4];
2278
2279
178
        assert(w == 128);
2280
2281
179
        s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
2282
179
        s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));
2283
179
        s_256[0][2] = _mm256_loadu_si256((__m256i *)(src_ptr + 2 * 32));
2284
179
        s_256[0][3] = _mm256_loadu_si256((__m256i *)(src_ptr + 3 * 32));
2285
2286
8.73k
        do {
2287
8.73k
          sr_y_2tap_32_avg_avx2(src_ptr + src_stride, s_256[0][0], &s_256[1][0],
2288
8.73k
                                dst);
2289
8.73k
          sr_y_2tap_32_avg_avx2(src_ptr + src_stride + 1 * 32, s_256[0][1],
2290
8.73k
                                &s_256[1][1], dst + 1 * 32);
2291
8.73k
          sr_y_2tap_32_avg_avx2(src_ptr + src_stride + 2 * 32, s_256[0][2],
2292
8.73k
                                &s_256[1][2], dst + 2 * 32);
2293
8.73k
          sr_y_2tap_32_avg_avx2(src_ptr + src_stride + 3 * 32, s_256[0][3],
2294
8.73k
                                &s_256[1][3], dst + 3 * 32);
2295
2296
8.73k
          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride, s_256[1][0],
2297
8.73k
                                &s_256[0][0], dst + dst_stride);
2298
8.73k
          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride + 1 * 32, s_256[1][1],
2299
8.73k
                                &s_256[0][1], dst + dst_stride + 1 * 32);
2300
8.73k
          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride + 2 * 32, s_256[1][2],
2301
8.73k
                                &s_256[0][2], dst + dst_stride + 2 * 32);
2302
8.73k
          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride + 3 * 32, s_256[1][3],
2303
8.73k
                                &s_256[0][3], dst + dst_stride + 3 * 32);
2304
2305
8.73k
          src_ptr += 2 * src_stride;
2306
8.73k
          dst += 2 * dst_stride;
2307
8.73k
          y -= 2;
2308
8.73k
        } while (y);
2309
179
      }
2310
14.4k
    }
2311
710k
  } else if (vert_tap == 4) {
2312
    // vert_filt as 4 tap
2313
340k
    const uint8_t *src_ptr = src - src_stride;
2314
2315
340k
    y = h;
2316
2317
340k
    if (w <= 4) {
2318
170k
      prepare_half_coeffs_4tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);
2319
2320
170k
      if (w == 2) {
2321
32.3k
        __m128i s_16[4], ss_128[2];
2322
2323
32.3k
        s_16[0] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 0 * src_stride));
2324
32.3k
        s_16[1] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 1 * src_stride));
2325
32.3k
        s_16[2] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 2 * src_stride));
2326
2327
32.3k
        const __m128i src01 = _mm_unpacklo_epi16(s_16[0], s_16[1]);
2328
32.3k
        const __m128i src12 = _mm_unpacklo_epi16(s_16[1], s_16[2]);
2329
2330
32.3k
        ss_128[0] = _mm_unpacklo_epi8(src01, src12);
2331
2332
51.2k
        do {
2333
51.2k
          src_ptr += 2 * src_stride;
2334
51.2k
          const __m128i res = y_convolve_4tap_2x2_ssse3(
2335
51.2k
              src_ptr, src_stride, coeffs_128, s_16, ss_128);
2336
51.2k
          const __m128i r = sr_y_round_sse2(res);
2337
51.2k
          pack_store_2x2_sse2(r, dst, dst_stride);
2338
2339
51.2k
          ss_128[0] = ss_128[1];
2340
51.2k
          dst += 2 * dst_stride;
2341
51.2k
          y -= 2;
2342
51.2k
        } while (y);
2343
138k
      } else {
2344
138k
        __m128i s_32[4], ss_128[2];
2345
2346
138k
        assert(w == 4);
2347
2348
138k
        s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 0 * src_stride));
2349
138k
        s_32[1] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 1 * src_stride));
2350
138k
        s_32[2] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 2 * src_stride));
2351
2352
138k
        const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
2353
138k
        const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
2354
2355
138k
        ss_128[0] = _mm_unpacklo_epi8(src01, src12);
2356
2357
281k
        do {
2358
281k
          src_ptr += 2 * src_stride;
2359
281k
          const __m128i res = y_convolve_4tap_4x2_ssse3(
2360
281k
              src_ptr, src_stride, coeffs_128, s_32, ss_128);
2361
281k
          const __m128i r = sr_y_round_sse2(res);
2362
281k
          pack_store_4x2_sse2(r, dst, dst_stride);
2363
2364
281k
          ss_128[0] = ss_128[1];
2365
281k
          dst += 2 * dst_stride;
2366
281k
          y -= 2;
2367
281k
        } while (y);
2368
138k
      }
2369
170k
    } else {
2370
169k
      prepare_half_coeffs_4tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
2371
2372
169k
      if (w == 8) {
2373
110k
        __m128i s_64[4];
2374
110k
        __m256i ss_256[2];
2375
2376
110k
        s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 0 * src_stride));
2377
110k
        s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + 1 * src_stride));
2378
110k
        s_64[2] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
2379
2380
        // Load lines a and b. Line a to lower 128, line b to upper 128
2381
110k
        const __m256i src01 = _mm256_setr_m128i(s_64[0], s_64[1]);
2382
110k
        const __m256i src12 = _mm256_setr_m128i(s_64[1], s_64[2]);
2383
2384
110k
        ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
2385
2386
228k
        do {
2387
228k
          src_ptr += 2 * src_stride;
2388
228k
          const __m256i res = y_convolve_4tap_8x2_avx2(
2389
228k
              src_ptr, src_stride, coeffs_256, s_64, ss_256);
2390
228k
          sr_y_round_store_8x2_avx2(res, dst, dst_stride);
2391
2392
228k
          ss_256[0] = ss_256[1];
2393
228k
          dst += 2 * dst_stride;
2394
228k
          y -= 2;
2395
228k
        } while (y);
2396
110k
      } else if (w == 16) {
2397
52.9k
        __m128i s_128[4];
2398
52.9k
        __m256i ss_256[4], r[2];
2399
2400
52.9k
        s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 0 * src_stride));
2401
52.9k
        s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + 1 * src_stride));
2402
52.9k
        s_128[2] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
2403
2404
        // Load lines a and b. Line a to lower 128, line b to upper 128
2405
52.9k
        const __m256i src01 = _mm256_setr_m128i(s_128[0], s_128[1]);
2406
52.9k
        const __m256i src12 = _mm256_setr_m128i(s_128[1], s_128[2]);
2407
2408
52.9k
        ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
2409
52.9k
        ss_256[2] = _mm256_unpackhi_epi8(src01, src12);
2410
2411
140k
        do {
2412
140k
          src_ptr += 2 * src_stride;
2413
140k
          y_convolve_4tap_16x2_avx2(src_ptr, src_stride, coeffs_256, s_128,
2414
140k
                                    ss_256, r);
2415
140k
          sr_y_round_store_16x2_avx2(r, dst, dst_stride);
2416
2417
140k
          ss_256[0] = ss_256[1];
2418
140k
          ss_256[2] = ss_256[3];
2419
140k
          dst += 2 * dst_stride;
2420
140k
          y -= 2;
2421
140k
        } while (y);
2422
52.9k
      } else if (w == 32) {
2423
        // AV1 standard won't have 32x4 case.
2424
        // This only favors some optimization feature which
2425
        // subsamples 32x8 to 32x4 and triggers 4-tap filter.
2426
2427
4.84k
        __m256i s_256[4], ss_256[4], tt_256[4], r[4];
2428
2429
4.84k
        s_256[0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * src_stride));
2430
4.84k
        s_256[1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * src_stride));
2431
4.84k
        s_256[2] = _mm256_loadu_si256((__m256i *)(src_ptr + 2 * src_stride));
2432
2433
4.84k
        ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
2434
4.84k
        ss_256[2] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
2435
2436
4.84k
        tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
2437
4.84k
        tt_256[2] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
2438
2439
38.8k
        do {
2440
38.8k
          src_ptr += 2 * src_stride;
2441
38.8k
          y_convolve_4tap_32x2_avx2(src_ptr, src_stride, coeffs_256, s_256,
2442
38.8k
                                    ss_256, tt_256, r);
2443
38.8k
          sr_y_round_store_32x2_avx2(r, dst, dst_stride);
2444
2445
38.8k
          ss_256[0] = ss_256[1];
2446
38.8k
          ss_256[2] = ss_256[3];
2447
2448
38.8k
          tt_256[0] = tt_256[1];
2449
38.8k
          tt_256[2] = tt_256[3];
2450
38.8k
          dst += 2 * dst_stride;
2451
38.8k
          y -= 2;
2452
38.8k
        } while (y);
2453
4.84k
      } else {
2454
1.47k
        assert(!(w % 32));
2455
2456
1.48k
        __m256i s_256[4], ss_256[4], tt_256[4], r[4];
2457
1.48k
        x = 0;
2458
3.54k
        do {
2459
3.54k
          const uint8_t *s = src_ptr + x;
2460
3.54k
          uint8_t *d = dst + x;
2461
3.54k
          s_256[0] = _mm256_loadu_si256((__m256i *)(s + 0 * src_stride));
2462
3.54k
          s_256[1] = _mm256_loadu_si256((__m256i *)(s + 1 * src_stride));
2463
3.54k
          s_256[2] = _mm256_loadu_si256((__m256i *)(s + 2 * src_stride));
2464
2465
3.54k
          ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
2466
3.54k
          ss_256[2] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
2467
2468
3.54k
          tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
2469
3.54k
          tt_256[2] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
2470
2471
3.54k
          y = h;
2472
134k
          do {
2473
134k
            s += 2 * src_stride;
2474
134k
            y_convolve_4tap_32x2_avx2(s, src_stride, coeffs_256, s_256, ss_256,
2475
134k
                                      tt_256, r);
2476
134k
            sr_y_round_store_32x2_avx2(r, d, dst_stride);
2477
2478
134k
            ss_256[0] = ss_256[1];
2479
134k
            ss_256[2] = ss_256[3];
2480
2481
134k
            tt_256[0] = tt_256[1];
2482
134k
            tt_256[2] = tt_256[3];
2483
134k
            d += 2 * dst_stride;
2484
134k
            y -= 2;
2485
134k
          } while (y);
2486
3.54k
          x += 32;
2487
3.54k
        } while (x < w);
2488
1.48k
      }
2489
169k
    }
2490
370k
  } else if (vert_tap == 6) {
2491
    // vert_filt as 6 tap
2492
346k
    const uint8_t *src_ptr = src - 2 * src_stride;
2493
2494
346k
    if (w <= 4) {
2495
109k
      prepare_half_coeffs_6tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);
2496
2497
109k
      y = h;
2498
2499
109k
      if (w == 2) {
2500
18.6k
        __m128i s_16[6], ss_128[3];
2501
2502
18.6k
        s_16[0] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 0 * src_stride));
2503
18.6k
        s_16[1] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 1 * src_stride));
2504
18.6k
        s_16[2] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 2 * src_stride));
2505
18.6k
        s_16[3] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 3 * src_stride));
2506
18.6k
        s_16[4] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 4 * src_stride));
2507
2508
18.6k
        const __m128i src01 = _mm_unpacklo_epi16(s_16[0], s_16[1]);
2509
18.6k
        const __m128i src12 = _mm_unpacklo_epi16(s_16[1], s_16[2]);
2510
18.6k
        const __m128i src23 = _mm_unpacklo_epi16(s_16[2], s_16[3]);
2511
18.6k
        const __m128i src34 = _mm_unpacklo_epi16(s_16[3], s_16[4]);
2512
2513
18.6k
        ss_128[0] = _mm_unpacklo_epi8(src01, src12);
2514
18.6k
        ss_128[1] = _mm_unpacklo_epi8(src23, src34);
2515
2516
74.4k
        do {
2517
74.4k
          src_ptr += 2 * src_stride;
2518
74.4k
          const __m128i res = y_convolve_6tap_2x2_ssse3(
2519
74.4k
              src_ptr, src_stride, coeffs_128, s_16, ss_128);
2520
74.4k
          const __m128i r = sr_y_round_sse2(res);
2521
74.4k
          pack_store_2x2_sse2(r, dst, dst_stride);
2522
2523
74.4k
          ss_128[0] = ss_128[1];
2524
74.4k
          ss_128[1] = ss_128[2];
2525
74.4k
          dst += 2 * dst_stride;
2526
74.4k
          y -= 2;
2527
74.4k
        } while (y);
2528
90.9k
      } else {
2529
90.9k
        __m128i s_32[6], ss_128[3];
2530
2531
90.9k
        assert(w == 4);
2532
2533
90.9k
        s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 0 * src_stride));
2534
90.9k
        s_32[1] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 1 * src_stride));
2535
90.9k
        s_32[2] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 2 * src_stride));
2536
90.9k
        s_32[3] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 3 * src_stride));
2537
90.9k
        s_32[4] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 4 * src_stride));
2538
2539
90.9k
        const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
2540
90.9k
        const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
2541
90.9k
        const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
2542
90.9k
        const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[4]);
2543
2544
90.9k
        ss_128[0] = _mm_unpacklo_epi8(src01, src12);
2545
90.9k
        ss_128[1] = _mm_unpacklo_epi8(src23, src34);
2546
2547
493k
        do {
2548
493k
          src_ptr += 2 * src_stride;
2549
493k
          const __m128i res = y_convolve_6tap_4x2_ssse3(
2550
493k
              src_ptr, src_stride, coeffs_128, s_32, ss_128);
2551
493k
          const __m128i r = sr_y_round_sse2(res);
2552
493k
          pack_store_4x2_sse2(r, dst, dst_stride);
2553
2554
493k
          ss_128[0] = ss_128[1];
2555
493k
          ss_128[1] = ss_128[2];
2556
493k
          dst += 2 * dst_stride;
2557
493k
          y -= 2;
2558
493k
        } while (y);
2559
90.9k
      }
2560
237k
    } else {
2561
237k
      prepare_half_coeffs_6tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
2562
2563
237k
      if (w == 8) {
2564
120k
        __m128i s_64[6];
2565
120k
        __m256i ss_256[3];
2566
2567
120k
        s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 0 * src_stride));
2568
120k
        s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + 1 * src_stride));
2569
120k
        s_64[2] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
2570
120k
        s_64[3] = _mm_loadl_epi64((__m128i *)(src_ptr + 3 * src_stride));
2571
120k
        s_64[4] = _mm_loadl_epi64((__m128i *)(src_ptr + 4 * src_stride));
2572
2573
        // Load lines a and b. Line a to lower 128, line b to upper 128
2574
120k
        const __m256i src01 = _mm256_setr_m128i(s_64[0], s_64[1]);
2575
120k
        const __m256i src12 = _mm256_setr_m128i(s_64[1], s_64[2]);
2576
120k
        const __m256i src23 = _mm256_setr_m128i(s_64[2], s_64[3]);
2577
120k
        const __m256i src34 = _mm256_setr_m128i(s_64[3], s_64[4]);
2578
2579
120k
        ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
2580
120k
        ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
2581
2582
120k
        y = h;
2583
690k
        do {
2584
690k
          src_ptr += 2 * src_stride;
2585
690k
          const __m256i res = y_convolve_6tap_8x2_avx2(
2586
690k
              src_ptr, src_stride, coeffs_256, s_64, ss_256);
2587
690k
          sr_y_round_store_8x2_avx2(res, dst, dst_stride);
2588
2589
690k
          ss_256[0] = ss_256[1];
2590
690k
          ss_256[1] = ss_256[2];
2591
690k
          dst += 2 * dst_stride;
2592
690k
          y -= 2;
2593
690k
        } while (y);
2594
120k
      } else if (w == 16) {
2595
81.7k
        __m128i s_128[6];
2596
81.7k
        __m256i ss_256[6], r[2];
2597
2598
81.7k
        s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 0 * src_stride));
2599
81.7k
        s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + 1 * src_stride));
2600
81.7k
        s_128[2] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
2601
81.7k
        s_128[3] = _mm_loadu_si128((__m128i *)(src_ptr + 3 * src_stride));
2602
81.7k
        s_128[4] = _mm_loadu_si128((__m128i *)(src_ptr + 4 * src_stride));
2603
2604
        // Load lines a and b. Line a to lower 128, line b to upper 128
2605
81.7k
        const __m256i src01 = _mm256_setr_m128i(s_128[0], s_128[1]);
2606
81.7k
        const __m256i src12 = _mm256_setr_m128i(s_128[1], s_128[2]);
2607
81.7k
        const __m256i src23 = _mm256_setr_m128i(s_128[2], s_128[3]);
2608
81.7k
        const __m256i src34 = _mm256_setr_m128i(s_128[3], s_128[4]);
2609
2610
81.7k
        ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
2611
81.7k
        ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
2612
2613
81.7k
        ss_256[3] = _mm256_unpackhi_epi8(src01, src12);
2614
81.7k
        ss_256[4] = _mm256_unpackhi_epi8(src23, src34);
2615
2616
81.7k
        y = h;
2617
602k
        do {
2618
602k
          src_ptr += 2 * src_stride;
2619
602k
          y_convolve_6tap_16x2_avx2(src_ptr, src_stride, coeffs_256, s_128,
2620
602k
                                    ss_256, r);
2621
602k
          sr_y_round_store_16x2_avx2(r, dst, dst_stride);
2622
2623
602k
          ss_256[0] = ss_256[1];
2624
602k
          ss_256[1] = ss_256[2];
2625
2626
602k
          ss_256[3] = ss_256[4];
2627
602k
          ss_256[4] = ss_256[5];
2628
602k
          dst += 2 * dst_stride;
2629
602k
          y -= 2;
2630
602k
        } while (y);
2631
81.7k
      } else {
2632
35.4k
        __m256i s_256[6], ss_256[6], tt_256[6], r[4];
2633
2634
35.4k
        assert(!(w % 32));
2635
2636
35.4k
        x = 0;
2637
42.0k
        do {
2638
42.0k
          const uint8_t *s = src_ptr + x;
2639
42.0k
          uint8_t *d = dst + x;
2640
2641
42.0k
          s_256[0] = _mm256_loadu_si256((__m256i *)(s + 0 * src_stride));
2642
42.0k
          s_256[1] = _mm256_loadu_si256((__m256i *)(s + 1 * src_stride));
2643
42.0k
          s_256[2] = _mm256_loadu_si256((__m256i *)(s + 2 * src_stride));
2644
42.0k
          s_256[3] = _mm256_loadu_si256((__m256i *)(s + 3 * src_stride));
2645
42.0k
          s_256[4] = _mm256_loadu_si256((__m256i *)(s + 4 * src_stride));
2646
2647
42.0k
          ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
2648
42.0k
          ss_256[1] = _mm256_unpacklo_epi8(s_256[2], s_256[3]);
2649
42.0k
          ss_256[3] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
2650
42.0k
          ss_256[4] = _mm256_unpackhi_epi8(s_256[2], s_256[3]);
2651
2652
42.0k
          tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
2653
42.0k
          tt_256[1] = _mm256_unpacklo_epi8(s_256[3], s_256[4]);
2654
42.0k
          tt_256[3] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
2655
42.0k
          tt_256[4] = _mm256_unpackhi_epi8(s_256[3], s_256[4]);
2656
2657
42.0k
          y = h;
2658
714k
          do {
2659
714k
            s += 2 * src_stride;
2660
714k
            y_convolve_6tap_32x2_avx2(s, src_stride, coeffs_256, s_256, ss_256,
2661
714k
                                      tt_256, r);
2662
714k
            sr_y_round_store_32x2_avx2(r, d, dst_stride);
2663
2664
714k
            ss_256[0] = ss_256[1];
2665
714k
            ss_256[1] = ss_256[2];
2666
714k
            ss_256[3] = ss_256[4];
2667
714k
            ss_256[4] = ss_256[5];
2668
2669
714k
            tt_256[0] = tt_256[1];
2670
714k
            tt_256[1] = tt_256[2];
2671
714k
            tt_256[3] = tt_256[4];
2672
714k
            tt_256[4] = tt_256[5];
2673
714k
            d += 2 * dst_stride;
2674
714k
            y -= 2;
2675
714k
          } while (y);
2676
2677
42.0k
          x += 32;
2678
42.0k
        } while (x < w);
2679
35.4k
      }
2680
237k
    }
2681
346k
  } else if (vert_tap == 8) {
2682
    // vert_filt as 8 tap
2683
23.2k
    const uint8_t *src_ptr = src - 3 * src_stride;
2684
2685
23.2k
    if (w <= 4) {
2686
7.32k
      prepare_half_coeffs_8tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);
2687
2688
7.32k
      y = h;
2689
2690
7.32k
      if (w == 2) {
2691
1.37k
        __m128i s_16[8], ss_128[4];
2692
2693
1.37k
        s_16[0] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 0 * src_stride));
2694
1.37k
        s_16[1] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 1 * src_stride));
2695
1.37k
        s_16[2] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 2 * src_stride));
2696
1.37k
        s_16[3] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 3 * src_stride));
2697
1.37k
        s_16[4] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 4 * src_stride));
2698
1.37k
        s_16[5] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 5 * src_stride));
2699
1.37k
        s_16[6] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 6 * src_stride));
2700
2701
1.37k
        const __m128i src01 = _mm_unpacklo_epi16(s_16[0], s_16[1]);
2702
1.37k
        const __m128i src12 = _mm_unpacklo_epi16(s_16[1], s_16[2]);
2703
1.37k
        const __m128i src23 = _mm_unpacklo_epi16(s_16[2], s_16[3]);
2704
1.37k
        const __m128i src34 = _mm_unpacklo_epi16(s_16[3], s_16[4]);
2705
1.37k
        const __m128i src45 = _mm_unpacklo_epi16(s_16[4], s_16[5]);
2706
1.37k
        const __m128i src56 = _mm_unpacklo_epi16(s_16[5], s_16[6]);
2707
2708
1.37k
        ss_128[0] = _mm_unpacklo_epi8(src01, src12);
2709
1.37k
        ss_128[1] = _mm_unpacklo_epi8(src23, src34);
2710
1.37k
        ss_128[2] = _mm_unpacklo_epi8(src45, src56);
2711
2712
5.50k
        do {
2713
5.50k
          const __m128i res = y_convolve_8tap_2x2_ssse3(
2714
5.50k
              src_ptr, src_stride, coeffs_128, s_16, ss_128);
2715
5.50k
          const __m128i r = sr_y_round_sse2(res);
2716
5.50k
          pack_store_2x2_sse2(r, dst, dst_stride);
2717
5.50k
          ss_128[0] = ss_128[1];
2718
5.50k
          ss_128[1] = ss_128[2];
2719
5.50k
          ss_128[2] = ss_128[3];
2720
5.50k
          src_ptr += 2 * src_stride;
2721
5.50k
          dst += 2 * dst_stride;
2722
5.50k
          y -= 2;
2723
5.50k
        } while (y);
2724
5.94k
      } else {
2725
5.94k
        __m128i s_32[8], ss_128[4];
2726
2727
5.94k
        assert(w == 4);
2728
2729
5.94k
        s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 0 * src_stride));
2730
5.94k
        s_32[1] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 1 * src_stride));
2731
5.94k
        s_32[2] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 2 * src_stride));
2732
5.94k
        s_32[3] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 3 * src_stride));
2733
5.94k
        s_32[4] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 4 * src_stride));
2734
5.94k
        s_32[5] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 5 * src_stride));
2735
5.94k
        s_32[6] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 6 * src_stride));
2736
2737
5.94k
        const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
2738
5.94k
        const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
2739
5.94k
        const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
2740
5.94k
        const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[4]);
2741
5.94k
        const __m128i src45 = _mm_unpacklo_epi32(s_32[4], s_32[5]);
2742
5.94k
        const __m128i src56 = _mm_unpacklo_epi32(s_32[5], s_32[6]);
2743
2744
5.94k
        ss_128[0] = _mm_unpacklo_epi8(src01, src12);
2745
5.94k
        ss_128[1] = _mm_unpacklo_epi8(src23, src34);
2746
5.94k
        ss_128[2] = _mm_unpacklo_epi8(src45, src56);
2747
2748
32.7k
        do {
2749
32.7k
          const __m128i res = y_convolve_8tap_4x2_ssse3(
2750
32.7k
              src_ptr, src_stride, coeffs_128, s_32, ss_128);
2751
32.7k
          const __m128i r = sr_y_round_sse2(res);
2752
32.7k
          pack_store_4x2_sse2(r, dst, dst_stride);
2753
32.7k
          ss_128[0] = ss_128[1];
2754
32.7k
          ss_128[1] = ss_128[2];
2755
32.7k
          ss_128[2] = ss_128[3];
2756
32.7k
          src_ptr += 2 * src_stride;
2757
32.7k
          dst += 2 * dst_stride;
2758
32.7k
          y -= 2;
2759
32.7k
        } while (y);
2760
5.94k
      }
2761
15.9k
    } else {
2762
15.9k
      prepare_half_coeffs_8tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
2763
2764
15.9k
      if (w == 8) {
2765
7.72k
        __m128i s_64[8];
2766
7.72k
        __m256i ss_256[4];
2767
2768
7.72k
        s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 0 * src_stride));
2769
7.72k
        s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + 1 * src_stride));
2770
7.72k
        s_64[2] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
2771
7.72k
        s_64[3] = _mm_loadl_epi64((__m128i *)(src_ptr + 3 * src_stride));
2772
7.72k
        s_64[4] = _mm_loadl_epi64((__m128i *)(src_ptr + 4 * src_stride));
2773
7.72k
        s_64[5] = _mm_loadl_epi64((__m128i *)(src_ptr + 5 * src_stride));
2774
7.72k
        s_64[6] = _mm_loadl_epi64((__m128i *)(src_ptr + 6 * src_stride));
2775
2776
        // Load lines a and b. Line a to lower 128, line b to upper 128
2777
7.72k
        const __m256i src01 = _mm256_setr_m128i(s_64[0], s_64[1]);
2778
7.72k
        const __m256i src12 = _mm256_setr_m128i(s_64[1], s_64[2]);
2779
7.72k
        const __m256i src23 = _mm256_setr_m128i(s_64[2], s_64[3]);
2780
7.72k
        const __m256i src34 = _mm256_setr_m128i(s_64[3], s_64[4]);
2781
7.72k
        const __m256i src45 = _mm256_setr_m128i(s_64[4], s_64[5]);
2782
7.72k
        const __m256i src56 = _mm256_setr_m128i(s_64[5], s_64[6]);
2783
2784
7.72k
        ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
2785
7.72k
        ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
2786
7.72k
        ss_256[2] = _mm256_unpacklo_epi8(src45, src56);
2787
2788
7.72k
        y = h;
2789
47.0k
        do {
2790
47.0k
          const __m256i res = y_convolve_8tap_8x2_avx2(
2791
47.0k
              src_ptr, src_stride, coeffs_256, s_64, ss_256);
2792
47.0k
          sr_y_round_store_8x2_avx2(res, dst, dst_stride);
2793
47.0k
          ss_256[0] = ss_256[1];
2794
47.0k
          ss_256[1] = ss_256[2];
2795
47.0k
          ss_256[2] = ss_256[3];
2796
47.0k
          src_ptr += 2 * src_stride;
2797
47.0k
          dst += 2 * dst_stride;
2798
47.0k
          y -= 2;
2799
47.0k
        } while (y);
2800
8.18k
      } else if (w == 16) {
2801
4.94k
        __m128i s_128[8];
2802
4.94k
        __m256i ss_256[8], r[2];
2803
2804
4.94k
        s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 0 * src_stride));
2805
4.94k
        s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + 1 * src_stride));
2806
4.94k
        s_128[2] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
2807
4.94k
        s_128[3] = _mm_loadu_si128((__m128i *)(src_ptr + 3 * src_stride));
2808
4.94k
        s_128[4] = _mm_loadu_si128((__m128i *)(src_ptr + 4 * src_stride));
2809
4.94k
        s_128[5] = _mm_loadu_si128((__m128i *)(src_ptr + 5 * src_stride));
2810
4.94k
        s_128[6] = _mm_loadu_si128((__m128i *)(src_ptr + 6 * src_stride));
2811
2812
        // Load lines a and b. Line a to lower 128, line b to upper 128
2813
4.94k
        const __m256i src01 = _mm256_setr_m128i(s_128[0], s_128[1]);
2814
4.94k
        const __m256i src12 = _mm256_setr_m128i(s_128[1], s_128[2]);
2815
4.94k
        const __m256i src23 = _mm256_setr_m128i(s_128[2], s_128[3]);
2816
4.94k
        const __m256i src34 = _mm256_setr_m128i(s_128[3], s_128[4]);
2817
4.94k
        const __m256i src45 = _mm256_setr_m128i(s_128[4], s_128[5]);
2818
4.94k
        const __m256i src56 = _mm256_setr_m128i(s_128[5], s_128[6]);
2819
2820
4.94k
        ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
2821
4.94k
        ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
2822
4.94k
        ss_256[2] = _mm256_unpacklo_epi8(src45, src56);
2823
2824
4.94k
        ss_256[4] = _mm256_unpackhi_epi8(src01, src12);
2825
4.94k
        ss_256[5] = _mm256_unpackhi_epi8(src23, src34);
2826
4.94k
        ss_256[6] = _mm256_unpackhi_epi8(src45, src56);
2827
2828
4.94k
        y = h;
2829
41.2k
        do {
2830
41.2k
          y_convolve_8tap_16x2_avx2(src_ptr, src_stride, coeffs_256, s_128,
2831
41.2k
                                    ss_256, r);
2832
41.2k
          sr_y_round_store_16x2_avx2(r, dst, dst_stride);
2833
2834
41.2k
          ss_256[0] = ss_256[1];
2835
41.2k
          ss_256[1] = ss_256[2];
2836
41.2k
          ss_256[2] = ss_256[3];
2837
2838
41.2k
          ss_256[4] = ss_256[5];
2839
41.2k
          ss_256[5] = ss_256[6];
2840
41.2k
          ss_256[6] = ss_256[7];
2841
41.2k
          src_ptr += 2 * src_stride;
2842
41.2k
          dst += 2 * dst_stride;
2843
41.2k
          y -= 2;
2844
41.2k
        } while (y);
2845
4.94k
      } else {
2846
3.24k
        __m256i s_256[8], ss_256[8], tt_256[8], r[4];
2847
2848
3.24k
        assert(!(w % 32));
2849
2850
3.24k
        x = 0;
2851
4.52k
        do {
2852
4.52k
          const uint8_t *s = src_ptr + x;
2853
4.52k
          uint8_t *d = dst + x;
2854
2855
4.52k
          s_256[0] = _mm256_loadu_si256((__m256i *)(s + 0 * src_stride));
2856
4.52k
          s_256[1] = _mm256_loadu_si256((__m256i *)(s + 1 * src_stride));
2857
4.52k
          s_256[2] = _mm256_loadu_si256((__m256i *)(s + 2 * src_stride));
2858
4.52k
          s_256[3] = _mm256_loadu_si256((__m256i *)(s + 3 * src_stride));
2859
4.52k
          s_256[4] = _mm256_loadu_si256((__m256i *)(s + 4 * src_stride));
2860
4.52k
          s_256[5] = _mm256_loadu_si256((__m256i *)(s + 5 * src_stride));
2861
4.52k
          s_256[6] = _mm256_loadu_si256((__m256i *)(s + 6 * src_stride));
2862
2863
4.52k
          ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
2864
4.52k
          ss_256[1] = _mm256_unpacklo_epi8(s_256[2], s_256[3]);
2865
4.52k
          ss_256[2] = _mm256_unpacklo_epi8(s_256[4], s_256[5]);
2866
4.52k
          ss_256[4] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
2867
4.52k
          ss_256[5] = _mm256_unpackhi_epi8(s_256[2], s_256[3]);
2868
4.52k
          ss_256[6] = _mm256_unpackhi_epi8(s_256[4], s_256[5]);
2869
2870
4.52k
          tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
2871
4.52k
          tt_256[1] = _mm256_unpacklo_epi8(s_256[3], s_256[4]);
2872
4.52k
          tt_256[2] = _mm256_unpacklo_epi8(s_256[5], s_256[6]);
2873
4.52k
          tt_256[4] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
2874
4.52k
          tt_256[5] = _mm256_unpackhi_epi8(s_256[3], s_256[4]);
2875
4.52k
          tt_256[6] = _mm256_unpackhi_epi8(s_256[5], s_256[6]);
2876
2877
4.52k
          y = h;
2878
107k
          do {
2879
107k
            y_convolve_8tap_32x2_avx2(s, src_stride, coeffs_256, s_256, ss_256,
2880
107k
                                      tt_256, r);
2881
107k
            sr_y_round_store_32x2_avx2(r, d, dst_stride);
2882
2883
107k
            ss_256[0] = ss_256[1];
2884
107k
            ss_256[1] = ss_256[2];
2885
107k
            ss_256[2] = ss_256[3];
2886
107k
            ss_256[4] = ss_256[5];
2887
107k
            ss_256[5] = ss_256[6];
2888
107k
            ss_256[6] = ss_256[7];
2889
2890
107k
            tt_256[0] = tt_256[1];
2891
107k
            tt_256[1] = tt_256[2];
2892
107k
            tt_256[2] = tt_256[3];
2893
107k
            tt_256[4] = tt_256[5];
2894
107k
            tt_256[5] = tt_256[6];
2895
107k
            tt_256[6] = tt_256[7];
2896
107k
            s += 2 * src_stride;
2897
107k
            d += 2 * dst_stride;
2898
107k
            y -= 2;
2899
107k
          } while (y);
2900
2901
4.52k
          x += 32;
2902
4.52k
        } while (x < w);
2903
3.24k
      }
2904
15.9k
    }
2905
23.2k
  }
2906
747k
}
Unexecuted instantiation: convolve_2d_avx2.c:av1_convolve_y_sr_specialized_avx2
convolve_avx2.c:av1_convolve_y_sr_specialized_avx2
Line
Count
Source
2008
747k
    const int32_t subpel_y_q4) {
2009
747k
  int32_t x, y;
2010
747k
  __m128i coeffs_128[4];
2011
747k
  __m256i coeffs_256[4];
2012
2013
747k
  int vert_tap = get_filter_tap(filter_params_y, subpel_y_q4);
2014
2015
747k
  if (vert_tap == 2) {
2016
    // vert_filt as 2 tap
2017
37.2k
    const uint8_t *src_ptr = src;
2018
2019
37.2k
    y = h;
2020
2021
37.2k
    if (subpel_y_q4 != 8) {
2022
22.8k
      if (w <= 8) {
2023
17.5k
        prepare_half_coeffs_2tap_ssse3(filter_params_y, subpel_y_q4,
2024
17.5k
                                       coeffs_128);
2025
2026
17.5k
        if (w == 2) {
2027
2.84k
          __m128i s_16[2];
2028
2029
2.84k
          s_16[0] = _mm_cvtsi32_si128(*(int16_t *)src_ptr);
2030
2031
5.59k
          do {
2032
5.59k
            const __m128i res = y_convolve_2tap_2x2_ssse3(src_ptr, src_stride,
2033
5.59k
                                                          coeffs_128, s_16);
2034
5.59k
            const __m128i r = sr_y_round_sse2(res);
2035
5.59k
            pack_store_2x2_sse2(r, dst, dst_stride);
2036
5.59k
            src_ptr += 2 * src_stride;
2037
5.59k
            dst += 2 * dst_stride;
2038
5.59k
            y -= 2;
2039
5.59k
          } while (y);
2040
14.6k
        } else if (w == 4) {
2041
8.80k
          __m128i s_32[2];
2042
2043
8.80k
          s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr));
2044
2045
26.0k
          do {
2046
26.0k
            const __m128i res = y_convolve_2tap_4x2_ssse3(src_ptr, src_stride,
2047
26.0k
                                                          coeffs_128, s_32);
2048
26.0k
            const __m128i r = sr_y_round_sse2(res);
2049
26.0k
            pack_store_4x2_sse2(r, dst, dst_stride);
2050
26.0k
            src_ptr += 2 * src_stride;
2051
26.0k
            dst += 2 * dst_stride;
2052
26.0k
            y -= 2;
2053
26.0k
          } while (y);
2054
8.80k
        } else {
2055
5.86k
          __m128i s_64[2], s_128[2];
2056
2057
5.86k
          assert(w == 8);
2058
2059
5.86k
          s_64[0] = _mm_loadl_epi64((__m128i *)src_ptr);
2060
2061
20.2k
          do {
2062
            // Note: Faster than binding to AVX2 registers.
2063
20.2k
            s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + src_stride));
2064
20.2k
            s_128[0] = _mm_unpacklo_epi64(s_64[0], s_64[1]);
2065
20.2k
            s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
2066
20.2k
            s_128[1] = _mm_unpacklo_epi64(s_64[1], s_64[0]);
2067
20.2k
            const __m128i ss0 = _mm_unpacklo_epi8(s_128[0], s_128[1]);
2068
20.2k
            const __m128i ss1 = _mm_unpackhi_epi8(s_128[0], s_128[1]);
2069
20.2k
            const __m128i res0 = convolve_2tap_ssse3(&ss0, coeffs_128);
2070
20.2k
            const __m128i res1 = convolve_2tap_ssse3(&ss1, coeffs_128);
2071
20.2k
            const __m128i r0 = sr_y_round_sse2(res0);
2072
20.2k
            const __m128i r1 = sr_y_round_sse2(res1);
2073
20.2k
            const __m128i d = _mm_packus_epi16(r0, r1);
2074
20.2k
            _mm_storel_epi64((__m128i *)dst, d);
2075
20.2k
            _mm_storeh_epi64((__m128i *)(dst + dst_stride), d);
2076
20.2k
            src_ptr += 2 * src_stride;
2077
20.2k
            dst += 2 * dst_stride;
2078
20.2k
            y -= 2;
2079
20.2k
          } while (y);
2080
5.86k
        }
2081
17.5k
      } else {
2082
5.29k
        prepare_half_coeffs_2tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
2083
2084
5.29k
        if (w == 16) {
2085
2.99k
          __m128i s_128[2];
2086
2087
2.99k
          s_128[0] = _mm_loadu_si128((__m128i *)src_ptr);
2088
2089
18.8k
          do {
2090
18.8k
            __m256i r[2];
2091
2092
18.8k
            y_convolve_2tap_16x2_avx2(src_ptr, src_stride, coeffs_256, s_128,
2093
18.8k
                                      r);
2094
18.8k
            sr_y_round_store_16x2_avx2(r, dst, dst_stride);
2095
18.8k
            src_ptr += 2 * src_stride;
2096
18.8k
            dst += 2 * dst_stride;
2097
18.8k
            y -= 2;
2098
18.8k
          } while (y);
2099
2.99k
        } else if (w == 32) {
2100
1.30k
          __m256i s_256[2];
2101
2102
1.30k
          s_256[0] = _mm256_loadu_si256((__m256i *)src_ptr);
2103
2104
15.4k
          do {
2105
15.4k
            sr_y_2tap_32_avx2(src_ptr + src_stride, coeffs_256, s_256[0],
2106
15.4k
                              &s_256[1], dst);
2107
15.4k
            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride, coeffs_256, s_256[1],
2108
15.4k
                              &s_256[0], dst + dst_stride);
2109
15.4k
            src_ptr += 2 * src_stride;
2110
15.4k
            dst += 2 * dst_stride;
2111
15.4k
            y -= 2;
2112
15.4k
          } while (y);
2113
1.30k
        } else if (w == 64) {
2114
828
          __m256i s_256[2][2];
2115
2116
828
          s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
2117
828
          s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));
2118
2119
18.9k
          do {
2120
18.9k
            sr_y_2tap_32_avx2(src_ptr + src_stride, coeffs_256, s_256[0][0],
2121
18.9k
                              &s_256[1][0], dst);
2122
18.9k
            sr_y_2tap_32_avx2(src_ptr + src_stride + 32, coeffs_256,
2123
18.9k
                              s_256[0][1], &s_256[1][1], dst + 32);
2124
18.9k
            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride, coeffs_256, s_256[1][0],
2125
18.9k
                              &s_256[0][0], dst + dst_stride);
2126
18.9k
            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride + 32, coeffs_256,
2127
18.9k
                              s_256[1][1], &s_256[0][1], dst + dst_stride + 32);
2128
2129
18.9k
            src_ptr += 2 * src_stride;
2130
18.9k
            dst += 2 * dst_stride;
2131
18.9k
            y -= 2;
2132
18.9k
          } while (y);
2133
828
        } else {
2134
163
          __m256i s_256[2][4];
2135
2136
163
          assert(w == 128);
2137
2138
163
          s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
2139
163
          s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));
2140
163
          s_256[0][2] = _mm256_loadu_si256((__m256i *)(src_ptr + 2 * 32));
2141
163
          s_256[0][3] = _mm256_loadu_si256((__m256i *)(src_ptr + 3 * 32));
2142
2143
7.80k
          do {
2144
7.80k
            sr_y_2tap_32_avx2(src_ptr + src_stride, coeffs_256, s_256[0][0],
2145
7.80k
                              &s_256[1][0], dst);
2146
7.80k
            sr_y_2tap_32_avx2(src_ptr + src_stride + 1 * 32, coeffs_256,
2147
7.80k
                              s_256[0][1], &s_256[1][1], dst + 1 * 32);
2148
7.80k
            sr_y_2tap_32_avx2(src_ptr + src_stride + 2 * 32, coeffs_256,
2149
7.80k
                              s_256[0][2], &s_256[1][2], dst + 2 * 32);
2150
7.80k
            sr_y_2tap_32_avx2(src_ptr + src_stride + 3 * 32, coeffs_256,
2151
7.80k
                              s_256[0][3], &s_256[1][3], dst + 3 * 32);
2152
2153
7.80k
            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride, coeffs_256, s_256[1][0],
2154
7.80k
                              &s_256[0][0], dst + dst_stride);
2155
7.80k
            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride + 1 * 32, coeffs_256,
2156
7.80k
                              s_256[1][1], &s_256[0][1],
2157
7.80k
                              dst + dst_stride + 1 * 32);
2158
7.80k
            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride + 2 * 32, coeffs_256,
2159
7.80k
                              s_256[1][2], &s_256[0][2],
2160
7.80k
                              dst + dst_stride + 2 * 32);
2161
7.80k
            sr_y_2tap_32_avx2(src_ptr + 2 * src_stride + 3 * 32, coeffs_256,
2162
7.80k
                              s_256[1][3], &s_256[0][3],
2163
7.80k
                              dst + dst_stride + 3 * 32);
2164
2165
7.80k
            src_ptr += 2 * src_stride;
2166
7.80k
            dst += 2 * dst_stride;
2167
7.80k
            y -= 2;
2168
7.80k
          } while (y);
2169
163
        }
2170
5.29k
      }
2171
22.8k
    } else {
2172
      // average to get half pel
2173
14.4k
      if (w <= 8) {
2174
10.7k
        if (w == 2) {
2175
1.31k
          __m128i s_16[2];
2176
2177
1.31k
          s_16[0] = _mm_cvtsi32_si128(*(int16_t *)src_ptr);
2178
2179
2.80k
          do {
2180
2.80k
            s_16[1] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + src_stride));
2181
2.80k
            const __m128i d0 = _mm_avg_epu8(s_16[0], s_16[1]);
2182
2.80k
            *(int16_t *)dst = (int16_t)_mm_cvtsi128_si32(d0);
2183
2.80k
            s_16[0] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 2 * src_stride));
2184
2.80k
            const __m128i d1 = _mm_avg_epu8(s_16[1], s_16[0]);
2185
2.80k
            *(int16_t *)(dst + dst_stride) = (int16_t)_mm_cvtsi128_si32(d1);
2186
2.80k
            src_ptr += 2 * src_stride;
2187
2.80k
            dst += 2 * dst_stride;
2188
2.80k
            y -= 2;
2189
2.80k
          } while (y);
2190
9.47k
        } else if (w == 4) {
2191
5.18k
          __m128i s_32[2];
2192
2193
5.18k
          s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr));
2194
2195
16.5k
          do {
2196
16.5k
            s_32[1] = _mm_cvtsi32_si128(loadu_int32(src_ptr + src_stride));
2197
16.5k
            const __m128i d0 = _mm_avg_epu8(s_32[0], s_32[1]);
2198
16.5k
            xx_storel_32(dst, d0);
2199
16.5k
            s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 2 * src_stride));
2200
16.5k
            const __m128i d1 = _mm_avg_epu8(s_32[1], s_32[0]);
2201
16.5k
            xx_storel_32(dst + dst_stride, d1);
2202
16.5k
            src_ptr += 2 * src_stride;
2203
16.5k
            dst += 2 * dst_stride;
2204
16.5k
            y -= 2;
2205
16.5k
          } while (y);
2206
5.18k
        } else {
2207
4.29k
          __m128i s_64[2];
2208
2209
4.29k
          assert(w == 8);
2210
2211
4.29k
          s_64[0] = _mm_loadl_epi64((__m128i *)src_ptr);
2212
2213
17.4k
          do {
2214
            // Note: Faster than binding to AVX2 registers.
2215
17.4k
            s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + src_stride));
2216
17.4k
            const __m128i d0 = _mm_avg_epu8(s_64[0], s_64[1]);
2217
17.4k
            _mm_storel_epi64((__m128i *)dst, d0);
2218
17.4k
            s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
2219
17.4k
            const __m128i d1 = _mm_avg_epu8(s_64[1], s_64[0]);
2220
17.4k
            _mm_storel_epi64((__m128i *)(dst + dst_stride), d1);
2221
17.4k
            src_ptr += 2 * src_stride;
2222
17.4k
            dst += 2 * dst_stride;
2223
17.4k
            y -= 2;
2224
17.4k
          } while (y);
2225
4.29k
        }
2226
10.7k
      } else if (w == 16) {
2227
2.25k
        __m128i s_128[2];
2228
2229
2.25k
        s_128[0] = _mm_loadu_si128((__m128i *)src_ptr);
2230
2231
13.8k
        do {
2232
13.8k
          s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + src_stride));
2233
13.8k
          const __m128i d0 = _mm_avg_epu8(s_128[0], s_128[1]);
2234
13.8k
          _mm_storeu_si128((__m128i *)dst, d0);
2235
13.8k
          s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
2236
13.8k
          const __m128i d1 = _mm_avg_epu8(s_128[1], s_128[0]);
2237
13.8k
          _mm_storeu_si128((__m128i *)(dst + dst_stride), d1);
2238
13.8k
          src_ptr += 2 * src_stride;
2239
13.8k
          dst += 2 * dst_stride;
2240
13.8k
          y -= 2;
2241
13.8k
        } while (y);
2242
2.25k
      } else if (w == 32) {
2243
884
        __m256i s_256[2];
2244
2245
884
        s_256[0] = _mm256_loadu_si256((__m256i *)src_ptr);
2246
2247
10.6k
        do {
2248
10.6k
          sr_y_2tap_32_avg_avx2(src_ptr + src_stride, s_256[0], &s_256[1], dst);
2249
10.6k
          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride, s_256[1], &s_256[0],
2250
10.6k
                                dst + dst_stride);
2251
10.6k
          src_ptr += 2 * src_stride;
2252
10.6k
          dst += 2 * dst_stride;
2253
10.6k
          y -= 2;
2254
10.6k
        } while (y);
2255
884
      } else if (w == 64) {
2256
344
        __m256i s_256[2][2];
2257
2258
344
        s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
2259
344
        s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));
2260
2261
8.46k
        do {
2262
8.46k
          sr_y_2tap_32_avg_avx2(src_ptr + src_stride, s_256[0][0], &s_256[1][0],
2263
8.46k
                                dst);
2264
8.46k
          sr_y_2tap_32_avg_avx2(src_ptr + src_stride + 32, s_256[0][1],
2265
8.46k
                                &s_256[1][1], dst + 32);
2266
2267
8.46k
          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride, s_256[1][0],
2268
8.46k
                                &s_256[0][0], dst + dst_stride);
2269
8.46k
          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride + 32, s_256[1][1],
2270
8.46k
                                &s_256[0][1], dst + dst_stride + 32);
2271
2272
8.46k
          src_ptr += 2 * src_stride;
2273
8.46k
          dst += 2 * dst_stride;
2274
8.46k
          y -= 2;
2275
8.46k
        } while (y);
2276
344
      } else {
2277
178
        __m256i s_256[2][4];
2278
2279
178
        assert(w == 128);
2280
2281
179
        s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
2282
179
        s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));
2283
179
        s_256[0][2] = _mm256_loadu_si256((__m256i *)(src_ptr + 2 * 32));
2284
179
        s_256[0][3] = _mm256_loadu_si256((__m256i *)(src_ptr + 3 * 32));
2285
2286
8.73k
        do {
2287
8.73k
          sr_y_2tap_32_avg_avx2(src_ptr + src_stride, s_256[0][0], &s_256[1][0],
2288
8.73k
                                dst);
2289
8.73k
          sr_y_2tap_32_avg_avx2(src_ptr + src_stride + 1 * 32, s_256[0][1],
2290
8.73k
                                &s_256[1][1], dst + 1 * 32);
2291
8.73k
          sr_y_2tap_32_avg_avx2(src_ptr + src_stride + 2 * 32, s_256[0][2],
2292
8.73k
                                &s_256[1][2], dst + 2 * 32);
2293
8.73k
          sr_y_2tap_32_avg_avx2(src_ptr + src_stride + 3 * 32, s_256[0][3],
2294
8.73k
                                &s_256[1][3], dst + 3 * 32);
2295
2296
8.73k
          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride, s_256[1][0],
2297
8.73k
                                &s_256[0][0], dst + dst_stride);
2298
8.73k
          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride + 1 * 32, s_256[1][1],
2299
8.73k
                                &s_256[0][1], dst + dst_stride + 1 * 32);
2300
8.73k
          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride + 2 * 32, s_256[1][2],
2301
8.73k
                                &s_256[0][2], dst + dst_stride + 2 * 32);
2302
8.73k
          sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride + 3 * 32, s_256[1][3],
2303
8.73k
                                &s_256[0][3], dst + dst_stride + 3 * 32);
2304
2305
8.73k
          src_ptr += 2 * src_stride;
2306
8.73k
          dst += 2 * dst_stride;
2307
8.73k
          y -= 2;
2308
8.73k
        } while (y);
2309
179
      }
2310
14.4k
    }
2311
710k
  } else if (vert_tap == 4) {
2312
    // vert_filt as 4 tap
2313
340k
    const uint8_t *src_ptr = src - src_stride;
2314
2315
340k
    y = h;
2316
2317
340k
    if (w <= 4) {
2318
170k
      prepare_half_coeffs_4tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);
2319
2320
170k
      if (w == 2) {
2321
32.3k
        __m128i s_16[4], ss_128[2];
2322
2323
32.3k
        s_16[0] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 0 * src_stride));
2324
32.3k
        s_16[1] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 1 * src_stride));
2325
32.3k
        s_16[2] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 2 * src_stride));
2326
2327
32.3k
        const __m128i src01 = _mm_unpacklo_epi16(s_16[0], s_16[1]);
2328
32.3k
        const __m128i src12 = _mm_unpacklo_epi16(s_16[1], s_16[2]);
2329
2330
32.3k
        ss_128[0] = _mm_unpacklo_epi8(src01, src12);
2331
2332
51.2k
        do {
2333
51.2k
          src_ptr += 2 * src_stride;
2334
51.2k
          const __m128i res = y_convolve_4tap_2x2_ssse3(
2335
51.2k
              src_ptr, src_stride, coeffs_128, s_16, ss_128);
2336
51.2k
          const __m128i r = sr_y_round_sse2(res);
2337
51.2k
          pack_store_2x2_sse2(r, dst, dst_stride);
2338
2339
51.2k
          ss_128[0] = ss_128[1];
2340
51.2k
          dst += 2 * dst_stride;
2341
51.2k
          y -= 2;
2342
51.2k
        } while (y);
2343
138k
      } else {
2344
138k
        __m128i s_32[4], ss_128[2];
2345
2346
138k
        assert(w == 4);
2347
2348
138k
        s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 0 * src_stride));
2349
138k
        s_32[1] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 1 * src_stride));
2350
138k
        s_32[2] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 2 * src_stride));
2351
2352
138k
        const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
2353
138k
        const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
2354
2355
138k
        ss_128[0] = _mm_unpacklo_epi8(src01, src12);
2356
2357
281k
        do {
2358
281k
          src_ptr += 2 * src_stride;
2359
281k
          const __m128i res = y_convolve_4tap_4x2_ssse3(
2360
281k
              src_ptr, src_stride, coeffs_128, s_32, ss_128);
2361
281k
          const __m128i r = sr_y_round_sse2(res);
2362
281k
          pack_store_4x2_sse2(r, dst, dst_stride);
2363
2364
281k
          ss_128[0] = ss_128[1];
2365
281k
          dst += 2 * dst_stride;
2366
281k
          y -= 2;
2367
281k
        } while (y);
2368
138k
      }
2369
170k
    } else {
2370
169k
      prepare_half_coeffs_4tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
2371
2372
169k
      if (w == 8) {
2373
110k
        __m128i s_64[4];
2374
110k
        __m256i ss_256[2];
2375
2376
110k
        s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 0 * src_stride));
2377
110k
        s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + 1 * src_stride));
2378
110k
        s_64[2] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
2379
2380
        // Load lines a and b. Line a to lower 128, line b to upper 128
2381
110k
        const __m256i src01 = _mm256_setr_m128i(s_64[0], s_64[1]);
2382
110k
        const __m256i src12 = _mm256_setr_m128i(s_64[1], s_64[2]);
2383
2384
110k
        ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
2385
2386
228k
        do {
2387
228k
          src_ptr += 2 * src_stride;
2388
228k
          const __m256i res = y_convolve_4tap_8x2_avx2(
2389
228k
              src_ptr, src_stride, coeffs_256, s_64, ss_256);
2390
228k
          sr_y_round_store_8x2_avx2(res, dst, dst_stride);
2391
2392
228k
          ss_256[0] = ss_256[1];
2393
228k
          dst += 2 * dst_stride;
2394
228k
          y -= 2;
2395
228k
        } while (y);
2396
110k
      } else if (w == 16) {
2397
52.9k
        __m128i s_128[4];
2398
52.9k
        __m256i ss_256[4], r[2];
2399
2400
52.9k
        s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 0 * src_stride));
2401
52.9k
        s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + 1 * src_stride));
2402
52.9k
        s_128[2] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
2403
2404
        // Load lines a and b. Line a to lower 128, line b to upper 128
2405
52.9k
        const __m256i src01 = _mm256_setr_m128i(s_128[0], s_128[1]);
2406
52.9k
        const __m256i src12 = _mm256_setr_m128i(s_128[1], s_128[2]);
2407
2408
52.9k
        ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
2409
52.9k
        ss_256[2] = _mm256_unpackhi_epi8(src01, src12);
2410
2411
140k
        do {
2412
140k
          src_ptr += 2 * src_stride;
2413
140k
          y_convolve_4tap_16x2_avx2(src_ptr, src_stride, coeffs_256, s_128,
2414
140k
                                    ss_256, r);
2415
140k
          sr_y_round_store_16x2_avx2(r, dst, dst_stride);
2416
2417
140k
          ss_256[0] = ss_256[1];
2418
140k
          ss_256[2] = ss_256[3];
2419
140k
          dst += 2 * dst_stride;
2420
140k
          y -= 2;
2421
140k
        } while (y);
2422
52.9k
      } else if (w == 32) {
2423
        // AV1 standard won't have 32x4 case.
2424
        // This only favors some optimization feature which
2425
        // subsamples 32x8 to 32x4 and triggers 4-tap filter.
2426
2427
4.84k
        __m256i s_256[4], ss_256[4], tt_256[4], r[4];
2428
2429
4.84k
        s_256[0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * src_stride));
2430
4.84k
        s_256[1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * src_stride));
2431
4.84k
        s_256[2] = _mm256_loadu_si256((__m256i *)(src_ptr + 2 * src_stride));
2432
2433
4.84k
        ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
2434
4.84k
        ss_256[2] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
2435
2436
4.84k
        tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
2437
4.84k
        tt_256[2] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
2438
2439
38.8k
        do {
2440
38.8k
          src_ptr += 2 * src_stride;
2441
38.8k
          y_convolve_4tap_32x2_avx2(src_ptr, src_stride, coeffs_256, s_256,
2442
38.8k
                                    ss_256, tt_256, r);
2443
38.8k
          sr_y_round_store_32x2_avx2(r, dst, dst_stride);
2444
2445
38.8k
          ss_256[0] = ss_256[1];
2446
38.8k
          ss_256[2] = ss_256[3];
2447
2448
38.8k
          tt_256[0] = tt_256[1];
2449
38.8k
          tt_256[2] = tt_256[3];
2450
38.8k
          dst += 2 * dst_stride;
2451
38.8k
          y -= 2;
2452
38.8k
        } while (y);
2453
4.84k
      } else {
2454
1.47k
        assert(!(w % 32));
2455
2456
1.48k
        __m256i s_256[4], ss_256[4], tt_256[4], r[4];
2457
1.48k
        x = 0;
2458
3.54k
        do {
2459
3.54k
          const uint8_t *s = src_ptr + x;
2460
3.54k
          uint8_t *d = dst + x;
2461
3.54k
          s_256[0] = _mm256_loadu_si256((__m256i *)(s + 0 * src_stride));
2462
3.54k
          s_256[1] = _mm256_loadu_si256((__m256i *)(s + 1 * src_stride));
2463
3.54k
          s_256[2] = _mm256_loadu_si256((__m256i *)(s + 2 * src_stride));
2464
2465
3.54k
          ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
2466
3.54k
          ss_256[2] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
2467
2468
3.54k
          tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
2469
3.54k
          tt_256[2] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
2470
2471
3.54k
          y = h;
2472
134k
          do {
2473
134k
            s += 2 * src_stride;
2474
134k
            y_convolve_4tap_32x2_avx2(s, src_stride, coeffs_256, s_256, ss_256,
2475
134k
                                      tt_256, r);
2476
134k
            sr_y_round_store_32x2_avx2(r, d, dst_stride);
2477
2478
134k
            ss_256[0] = ss_256[1];
2479
134k
            ss_256[2] = ss_256[3];
2480
2481
134k
            tt_256[0] = tt_256[1];
2482
134k
            tt_256[2] = tt_256[3];
2483
134k
            d += 2 * dst_stride;
2484
134k
            y -= 2;
2485
134k
          } while (y);
2486
3.54k
          x += 32;
2487
3.54k
        } while (x < w);
2488
1.48k
      }
2489
169k
    }
2490
370k
  } else if (vert_tap == 6) {
2491
    // vert_filt as 6 tap
2492
346k
    const uint8_t *src_ptr = src - 2 * src_stride;
2493
2494
346k
    if (w <= 4) {
2495
109k
      prepare_half_coeffs_6tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);
2496
2497
109k
      y = h;
2498
2499
109k
      if (w == 2) {
2500
18.6k
        __m128i s_16[6], ss_128[3];
2501
2502
18.6k
        s_16[0] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 0 * src_stride));
2503
18.6k
        s_16[1] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 1 * src_stride));
2504
18.6k
        s_16[2] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 2 * src_stride));
2505
18.6k
        s_16[3] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 3 * src_stride));
2506
18.6k
        s_16[4] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 4 * src_stride));
2507
2508
18.6k
        const __m128i src01 = _mm_unpacklo_epi16(s_16[0], s_16[1]);
2509
18.6k
        const __m128i src12 = _mm_unpacklo_epi16(s_16[1], s_16[2]);
2510
18.6k
        const __m128i src23 = _mm_unpacklo_epi16(s_16[2], s_16[3]);
2511
18.6k
        const __m128i src34 = _mm_unpacklo_epi16(s_16[3], s_16[4]);
2512
2513
18.6k
        ss_128[0] = _mm_unpacklo_epi8(src01, src12);
2514
18.6k
        ss_128[1] = _mm_unpacklo_epi8(src23, src34);
2515
2516
74.4k
        do {
2517
74.4k
          src_ptr += 2 * src_stride;
2518
74.4k
          const __m128i res = y_convolve_6tap_2x2_ssse3(
2519
74.4k
              src_ptr, src_stride, coeffs_128, s_16, ss_128);
2520
74.4k
          const __m128i r = sr_y_round_sse2(res);
2521
74.4k
          pack_store_2x2_sse2(r, dst, dst_stride);
2522
2523
74.4k
          ss_128[0] = ss_128[1];
2524
74.4k
          ss_128[1] = ss_128[2];
2525
74.4k
          dst += 2 * dst_stride;
2526
74.4k
          y -= 2;
2527
74.4k
        } while (y);
2528
90.9k
      } else {
2529
90.9k
        __m128i s_32[6], ss_128[3];
2530
2531
90.9k
        assert(w == 4);
2532
2533
90.9k
        s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 0 * src_stride));
2534
90.9k
        s_32[1] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 1 * src_stride));
2535
90.9k
        s_32[2] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 2 * src_stride));
2536
90.9k
        s_32[3] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 3 * src_stride));
2537
90.9k
        s_32[4] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 4 * src_stride));
2538
2539
90.9k
        const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
2540
90.9k
        const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
2541
90.9k
        const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
2542
90.9k
        const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[4]);
2543
2544
90.9k
        ss_128[0] = _mm_unpacklo_epi8(src01, src12);
2545
90.9k
        ss_128[1] = _mm_unpacklo_epi8(src23, src34);
2546
2547
493k
        do {
2548
493k
          src_ptr += 2 * src_stride;
2549
493k
          const __m128i res = y_convolve_6tap_4x2_ssse3(
2550
493k
              src_ptr, src_stride, coeffs_128, s_32, ss_128);
2551
493k
          const __m128i r = sr_y_round_sse2(res);
2552
493k
          pack_store_4x2_sse2(r, dst, dst_stride);
2553
2554
493k
          ss_128[0] = ss_128[1];
2555
493k
          ss_128[1] = ss_128[2];
2556
493k
          dst += 2 * dst_stride;
2557
493k
          y -= 2;
2558
493k
        } while (y);
2559
90.9k
      }
2560
237k
    } else {
2561
237k
      prepare_half_coeffs_6tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
2562
2563
237k
      if (w == 8) {
2564
120k
        __m128i s_64[6];
2565
120k
        __m256i ss_256[3];
2566
2567
120k
        s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 0 * src_stride));
2568
120k
        s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + 1 * src_stride));
2569
120k
        s_64[2] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
2570
120k
        s_64[3] = _mm_loadl_epi64((__m128i *)(src_ptr + 3 * src_stride));
2571
120k
        s_64[4] = _mm_loadl_epi64((__m128i *)(src_ptr + 4 * src_stride));
2572
2573
        // Load lines a and b. Line a to lower 128, line b to upper 128
2574
120k
        const __m256i src01 = _mm256_setr_m128i(s_64[0], s_64[1]);
2575
120k
        const __m256i src12 = _mm256_setr_m128i(s_64[1], s_64[2]);
2576
120k
        const __m256i src23 = _mm256_setr_m128i(s_64[2], s_64[3]);
2577
120k
        const __m256i src34 = _mm256_setr_m128i(s_64[3], s_64[4]);
2578
2579
120k
        ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
2580
120k
        ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
2581
2582
120k
        y = h;
2583
690k
        do {
2584
690k
          src_ptr += 2 * src_stride;
2585
690k
          const __m256i res = y_convolve_6tap_8x2_avx2(
2586
690k
              src_ptr, src_stride, coeffs_256, s_64, ss_256);
2587
690k
          sr_y_round_store_8x2_avx2(res, dst, dst_stride);
2588
2589
690k
          ss_256[0] = ss_256[1];
2590
690k
          ss_256[1] = ss_256[2];
2591
690k
          dst += 2 * dst_stride;
2592
690k
          y -= 2;
2593
690k
        } while (y);
2594
120k
      } else if (w == 16) {
2595
81.7k
        __m128i s_128[6];
2596
81.7k
        __m256i ss_256[6], r[2];
2597
2598
81.7k
        s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 0 * src_stride));
2599
81.7k
        s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + 1 * src_stride));
2600
81.7k
        s_128[2] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
2601
81.7k
        s_128[3] = _mm_loadu_si128((__m128i *)(src_ptr + 3 * src_stride));
2602
81.7k
        s_128[4] = _mm_loadu_si128((__m128i *)(src_ptr + 4 * src_stride));
2603
2604
        // Load lines a and b. Line a to lower 128, line b to upper 128
2605
81.7k
        const __m256i src01 = _mm256_setr_m128i(s_128[0], s_128[1]);
2606
81.7k
        const __m256i src12 = _mm256_setr_m128i(s_128[1], s_128[2]);
2607
81.7k
        const __m256i src23 = _mm256_setr_m128i(s_128[2], s_128[3]);
2608
81.7k
        const __m256i src34 = _mm256_setr_m128i(s_128[3], s_128[4]);
2609
2610
81.7k
        ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
2611
81.7k
        ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
2612
2613
81.7k
        ss_256[3] = _mm256_unpackhi_epi8(src01, src12);
2614
81.7k
        ss_256[4] = _mm256_unpackhi_epi8(src23, src34);
2615
2616
81.7k
        y = h;
2617
602k
        do {
2618
602k
          src_ptr += 2 * src_stride;
2619
602k
          y_convolve_6tap_16x2_avx2(src_ptr, src_stride, coeffs_256, s_128,
2620
602k
                                    ss_256, r);
2621
602k
          sr_y_round_store_16x2_avx2(r, dst, dst_stride);
2622
2623
602k
          ss_256[0] = ss_256[1];
2624
602k
          ss_256[1] = ss_256[2];
2625
2626
602k
          ss_256[3] = ss_256[4];
2627
602k
          ss_256[4] = ss_256[5];
2628
602k
          dst += 2 * dst_stride;
2629
602k
          y -= 2;
2630
602k
        } while (y);
2631
81.7k
      } else {
2632
35.4k
        __m256i s_256[6], ss_256[6], tt_256[6], r[4];
2633
2634
35.4k
        assert(!(w % 32));
2635
2636
35.4k
        x = 0;
2637
42.0k
        do {
2638
42.0k
          const uint8_t *s = src_ptr + x;
2639
42.0k
          uint8_t *d = dst + x;
2640
2641
42.0k
          s_256[0] = _mm256_loadu_si256((__m256i *)(s + 0 * src_stride));
2642
42.0k
          s_256[1] = _mm256_loadu_si256((__m256i *)(s + 1 * src_stride));
2643
42.0k
          s_256[2] = _mm256_loadu_si256((__m256i *)(s + 2 * src_stride));
2644
42.0k
          s_256[3] = _mm256_loadu_si256((__m256i *)(s + 3 * src_stride));
2645
42.0k
          s_256[4] = _mm256_loadu_si256((__m256i *)(s + 4 * src_stride));
2646
2647
42.0k
          ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
2648
42.0k
          ss_256[1] = _mm256_unpacklo_epi8(s_256[2], s_256[3]);
2649
42.0k
          ss_256[3] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
2650
42.0k
          ss_256[4] = _mm256_unpackhi_epi8(s_256[2], s_256[3]);
2651
2652
42.0k
          tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
2653
42.0k
          tt_256[1] = _mm256_unpacklo_epi8(s_256[3], s_256[4]);
2654
42.0k
          tt_256[3] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
2655
42.0k
          tt_256[4] = _mm256_unpackhi_epi8(s_256[3], s_256[4]);
2656
2657
42.0k
          y = h;
2658
714k
          do {
2659
714k
            s += 2 * src_stride;
2660
714k
            y_convolve_6tap_32x2_avx2(s, src_stride, coeffs_256, s_256, ss_256,
2661
714k
                                      tt_256, r);
2662
714k
            sr_y_round_store_32x2_avx2(r, d, dst_stride);
2663
2664
714k
            ss_256[0] = ss_256[1];
2665
714k
            ss_256[1] = ss_256[2];
2666
714k
            ss_256[3] = ss_256[4];
2667
714k
            ss_256[4] = ss_256[5];
2668
2669
714k
            tt_256[0] = tt_256[1];
2670
714k
            tt_256[1] = tt_256[2];
2671
714k
            tt_256[3] = tt_256[4];
2672
714k
            tt_256[4] = tt_256[5];
2673
714k
            d += 2 * dst_stride;
2674
714k
            y -= 2;
2675
714k
          } while (y);
2676
2677
42.0k
          x += 32;
2678
42.0k
        } while (x < w);
2679
35.4k
      }
2680
237k
    }
2681
346k
  } else if (vert_tap == 8) {
2682
    // vert_filt as 8 tap
2683
23.2k
    const uint8_t *src_ptr = src - 3 * src_stride;
2684
2685
23.2k
    if (w <= 4) {
2686
7.32k
      prepare_half_coeffs_8tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);
2687
2688
7.32k
      y = h;
2689
2690
7.32k
      if (w == 2) {
2691
1.37k
        __m128i s_16[8], ss_128[4];
2692
2693
1.37k
        s_16[0] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 0 * src_stride));
2694
1.37k
        s_16[1] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 1 * src_stride));
2695
1.37k
        s_16[2] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 2 * src_stride));
2696
1.37k
        s_16[3] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 3 * src_stride));
2697
1.37k
        s_16[4] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 4 * src_stride));
2698
1.37k
        s_16[5] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 5 * src_stride));
2699
1.37k
        s_16[6] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 6 * src_stride));
2700
2701
1.37k
        const __m128i src01 = _mm_unpacklo_epi16(s_16[0], s_16[1]);
2702
1.37k
        const __m128i src12 = _mm_unpacklo_epi16(s_16[1], s_16[2]);
2703
1.37k
        const __m128i src23 = _mm_unpacklo_epi16(s_16[2], s_16[3]);
2704
1.37k
        const __m128i src34 = _mm_unpacklo_epi16(s_16[3], s_16[4]);
2705
1.37k
        const __m128i src45 = _mm_unpacklo_epi16(s_16[4], s_16[5]);
2706
1.37k
        const __m128i src56 = _mm_unpacklo_epi16(s_16[5], s_16[6]);
2707
2708
1.37k
        ss_128[0] = _mm_unpacklo_epi8(src01, src12);
2709
1.37k
        ss_128[1] = _mm_unpacklo_epi8(src23, src34);
2710
1.37k
        ss_128[2] = _mm_unpacklo_epi8(src45, src56);
2711
2712
5.50k
        do {
2713
5.50k
          const __m128i res = y_convolve_8tap_2x2_ssse3(
2714
5.50k
              src_ptr, src_stride, coeffs_128, s_16, ss_128);
2715
5.50k
          const __m128i r = sr_y_round_sse2(res);
2716
5.50k
          pack_store_2x2_sse2(r, dst, dst_stride);
2717
5.50k
          ss_128[0] = ss_128[1];
2718
5.50k
          ss_128[1] = ss_128[2];
2719
5.50k
          ss_128[2] = ss_128[3];
2720
5.50k
          src_ptr += 2 * src_stride;
2721
5.50k
          dst += 2 * dst_stride;
2722
5.50k
          y -= 2;
2723
5.50k
        } while (y);
2724
5.94k
      } else {
2725
5.94k
        __m128i s_32[8], ss_128[4];
2726
2727
5.94k
        assert(w == 4);
2728
2729
5.94k
        s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 0 * src_stride));
2730
5.94k
        s_32[1] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 1 * src_stride));
2731
5.94k
        s_32[2] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 2 * src_stride));
2732
5.94k
        s_32[3] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 3 * src_stride));
2733
5.94k
        s_32[4] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 4 * src_stride));
2734
5.94k
        s_32[5] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 5 * src_stride));
2735
5.94k
        s_32[6] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 6 * src_stride));
2736
2737
5.94k
        const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
2738
5.94k
        const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
2739
5.94k
        const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
2740
5.94k
        const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[4]);
2741
5.94k
        const __m128i src45 = _mm_unpacklo_epi32(s_32[4], s_32[5]);
2742
5.94k
        const __m128i src56 = _mm_unpacklo_epi32(s_32[5], s_32[6]);
2743
2744
5.94k
        ss_128[0] = _mm_unpacklo_epi8(src01, src12);
2745
5.94k
        ss_128[1] = _mm_unpacklo_epi8(src23, src34);
2746
5.94k
        ss_128[2] = _mm_unpacklo_epi8(src45, src56);
2747
2748
32.7k
        do {
2749
32.7k
          const __m128i res = y_convolve_8tap_4x2_ssse3(
2750
32.7k
              src_ptr, src_stride, coeffs_128, s_32, ss_128);
2751
32.7k
          const __m128i r = sr_y_round_sse2(res);
2752
32.7k
          pack_store_4x2_sse2(r, dst, dst_stride);
2753
32.7k
          ss_128[0] = ss_128[1];
2754
32.7k
          ss_128[1] = ss_128[2];
2755
32.7k
          ss_128[2] = ss_128[3];
2756
32.7k
          src_ptr += 2 * src_stride;
2757
32.7k
          dst += 2 * dst_stride;
2758
32.7k
          y -= 2;
2759
32.7k
        } while (y);
2760
5.94k
      }
2761
15.9k
    } else {
2762
15.9k
      prepare_half_coeffs_8tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
2763
2764
15.9k
      if (w == 8) {
2765
7.72k
        __m128i s_64[8];
2766
7.72k
        __m256i ss_256[4];
2767
2768
7.72k
        s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 0 * src_stride));
2769
7.72k
        s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + 1 * src_stride));
2770
7.72k
        s_64[2] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
2771
7.72k
        s_64[3] = _mm_loadl_epi64((__m128i *)(src_ptr + 3 * src_stride));
2772
7.72k
        s_64[4] = _mm_loadl_epi64((__m128i *)(src_ptr + 4 * src_stride));
2773
7.72k
        s_64[5] = _mm_loadl_epi64((__m128i *)(src_ptr + 5 * src_stride));
2774
7.72k
        s_64[6] = _mm_loadl_epi64((__m128i *)(src_ptr + 6 * src_stride));
2775
2776
        // Load lines a and b. Line a to lower 128, line b to upper 128
2777
7.72k
        const __m256i src01 = _mm256_setr_m128i(s_64[0], s_64[1]);
2778
7.72k
        const __m256i src12 = _mm256_setr_m128i(s_64[1], s_64[2]);
2779
7.72k
        const __m256i src23 = _mm256_setr_m128i(s_64[2], s_64[3]);
2780
7.72k
        const __m256i src34 = _mm256_setr_m128i(s_64[3], s_64[4]);
2781
7.72k
        const __m256i src45 = _mm256_setr_m128i(s_64[4], s_64[5]);
2782
7.72k
        const __m256i src56 = _mm256_setr_m128i(s_64[5], s_64[6]);
2783
2784
7.72k
        ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
2785
7.72k
        ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
2786
7.72k
        ss_256[2] = _mm256_unpacklo_epi8(src45, src56);
2787
2788
7.72k
        y = h;
2789
47.0k
        do {
2790
47.0k
          const __m256i res = y_convolve_8tap_8x2_avx2(
2791
47.0k
              src_ptr, src_stride, coeffs_256, s_64, ss_256);
2792
47.0k
          sr_y_round_store_8x2_avx2(res, dst, dst_stride);
2793
47.0k
          ss_256[0] = ss_256[1];
2794
47.0k
          ss_256[1] = ss_256[2];
2795
47.0k
          ss_256[2] = ss_256[3];
2796
47.0k
          src_ptr += 2 * src_stride;
2797
47.0k
          dst += 2 * dst_stride;
2798
47.0k
          y -= 2;
2799
47.0k
        } while (y);
2800
8.18k
      } else if (w == 16) {
2801
4.94k
        __m128i s_128[8];
2802
4.94k
        __m256i ss_256[8], r[2];
2803
2804
4.94k
        s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 0 * src_stride));
2805
4.94k
        s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + 1 * src_stride));
2806
4.94k
        s_128[2] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
2807
4.94k
        s_128[3] = _mm_loadu_si128((__m128i *)(src_ptr + 3 * src_stride));
2808
4.94k
        s_128[4] = _mm_loadu_si128((__m128i *)(src_ptr + 4 * src_stride));
2809
4.94k
        s_128[5] = _mm_loadu_si128((__m128i *)(src_ptr + 5 * src_stride));
2810
4.94k
        s_128[6] = _mm_loadu_si128((__m128i *)(src_ptr + 6 * src_stride));
2811
2812
        // Load lines a and b. Line a to lower 128, line b to upper 128
2813
4.94k
        const __m256i src01 = _mm256_setr_m128i(s_128[0], s_128[1]);
2814
4.94k
        const __m256i src12 = _mm256_setr_m128i(s_128[1], s_128[2]);
2815
4.94k
        const __m256i src23 = _mm256_setr_m128i(s_128[2], s_128[3]);
2816
4.94k
        const __m256i src34 = _mm256_setr_m128i(s_128[3], s_128[4]);
2817
4.94k
        const __m256i src45 = _mm256_setr_m128i(s_128[4], s_128[5]);
2818
4.94k
        const __m256i src56 = _mm256_setr_m128i(s_128[5], s_128[6]);
2819
2820
4.94k
        ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
2821
4.94k
        ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
2822
4.94k
        ss_256[2] = _mm256_unpacklo_epi8(src45, src56);
2823
2824
4.94k
        ss_256[4] = _mm256_unpackhi_epi8(src01, src12);
2825
4.94k
        ss_256[5] = _mm256_unpackhi_epi8(src23, src34);
2826
4.94k
        ss_256[6] = _mm256_unpackhi_epi8(src45, src56);
2827
2828
4.94k
        y = h;
2829
41.2k
        do {
2830
41.2k
          y_convolve_8tap_16x2_avx2(src_ptr, src_stride, coeffs_256, s_128,
2831
41.2k
                                    ss_256, r);
2832
41.2k
          sr_y_round_store_16x2_avx2(r, dst, dst_stride);
2833
2834
41.2k
          ss_256[0] = ss_256[1];
2835
41.2k
          ss_256[1] = ss_256[2];
2836
41.2k
          ss_256[2] = ss_256[3];
2837
2838
41.2k
          ss_256[4] = ss_256[5];
2839
41.2k
          ss_256[5] = ss_256[6];
2840
41.2k
          ss_256[6] = ss_256[7];
2841
41.2k
          src_ptr += 2 * src_stride;
2842
41.2k
          dst += 2 * dst_stride;
2843
41.2k
          y -= 2;
2844
41.2k
        } while (y);
2845
4.94k
      } else {
2846
3.24k
        __m256i s_256[8], ss_256[8], tt_256[8], r[4];
2847
2848
3.24k
        assert(!(w % 32));
2849
2850
3.24k
        x = 0;
2851
4.52k
        do {
2852
4.52k
          const uint8_t *s = src_ptr + x;
2853
4.52k
          uint8_t *d = dst + x;
2854
2855
4.52k
          s_256[0] = _mm256_loadu_si256((__m256i *)(s + 0 * src_stride));
2856
4.52k
          s_256[1] = _mm256_loadu_si256((__m256i *)(s + 1 * src_stride));
2857
4.52k
          s_256[2] = _mm256_loadu_si256((__m256i *)(s + 2 * src_stride));
2858
4.52k
          s_256[3] = _mm256_loadu_si256((__m256i *)(s + 3 * src_stride));
2859
4.52k
          s_256[4] = _mm256_loadu_si256((__m256i *)(s + 4 * src_stride));
2860
4.52k
          s_256[5] = _mm256_loadu_si256((__m256i *)(s + 5 * src_stride));
2861
4.52k
          s_256[6] = _mm256_loadu_si256((__m256i *)(s + 6 * src_stride));
2862
2863
4.52k
          ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
2864
4.52k
          ss_256[1] = _mm256_unpacklo_epi8(s_256[2], s_256[3]);
2865
4.52k
          ss_256[2] = _mm256_unpacklo_epi8(s_256[4], s_256[5]);
2866
4.52k
          ss_256[4] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
2867
4.52k
          ss_256[5] = _mm256_unpackhi_epi8(s_256[2], s_256[3]);
2868
4.52k
          ss_256[6] = _mm256_unpackhi_epi8(s_256[4], s_256[5]);
2869
2870
4.52k
          tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
2871
4.52k
          tt_256[1] = _mm256_unpacklo_epi8(s_256[3], s_256[4]);
2872
4.52k
          tt_256[2] = _mm256_unpacklo_epi8(s_256[5], s_256[6]);
2873
4.52k
          tt_256[4] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
2874
4.52k
          tt_256[5] = _mm256_unpackhi_epi8(s_256[3], s_256[4]);
2875
4.52k
          tt_256[6] = _mm256_unpackhi_epi8(s_256[5], s_256[6]);
2876
2877
4.52k
          y = h;
2878
107k
          do {
2879
107k
            y_convolve_8tap_32x2_avx2(s, src_stride, coeffs_256, s_256, ss_256,
2880
107k
                                      tt_256, r);
2881
107k
            sr_y_round_store_32x2_avx2(r, d, dst_stride);
2882
2883
107k
            ss_256[0] = ss_256[1];
2884
107k
            ss_256[1] = ss_256[2];
2885
107k
            ss_256[2] = ss_256[3];
2886
107k
            ss_256[4] = ss_256[5];
2887
107k
            ss_256[5] = ss_256[6];
2888
107k
            ss_256[6] = ss_256[7];
2889
2890
107k
            tt_256[0] = tt_256[1];
2891
107k
            tt_256[1] = tt_256[2];
2892
107k
            tt_256[2] = tt_256[3];
2893
107k
            tt_256[4] = tt_256[5];
2894
107k
            tt_256[5] = tt_256[6];
2895
107k
            tt_256[6] = tt_256[7];
2896
107k
            s += 2 * src_stride;
2897
107k
            d += 2 * dst_stride;
2898
107k
            y -= 2;
2899
107k
          } while (y);
2900
2901
4.52k
          x += 32;
2902
4.52k
        } while (x < w);
2903
3.24k
      }
2904
15.9k
    }
2905
23.2k
  }
2906
747k
}
2907
2908
// Runs the 2-tap horizontal convolution helper on `src` with `coeffs` and
// hands the two resulting vectors to the round-and-store helper, which writes
// to `dst`. Callers in this file advance `src` and `dst` by 32 bytes between
// calls, i.e. one call covers one 32-pixel-wide segment of a single row.
static inline void sr_x_2tap_32_avx2(const uint8_t *const src,
2909
                                     const __m256i coeffs[1],
2910
232k
                                     uint8_t *const dst) {
2911
232k
  // Scratch for the convolution output; filled by the helper called next.
  __m256i r[2];
2912
2913
232k
  x_convolve_2tap_32_avx2(src, coeffs, r);
2914
232k
  sr_x_round_store_32_avx2(r, dst);
2915
232k
}
Unexecuted instantiation: convolve_2d_avx2.c:sr_x_2tap_32_avx2
convolve_avx2.c:sr_x_2tap_32_avx2
Line
Count
Source
2910
232k
                                     uint8_t *const dst) {
2911
232k
  __m256i r[2];
2912
2913
232k
  x_convolve_2tap_32_avx2(src, coeffs, r);
2914
232k
  sr_x_round_store_32_avx2(r, dst);
2915
232k
}
2916
2917
// Runs the 6-tap horizontal convolution helper on `src` with `coeffs` and the
// `filt` vectors, then hands the two resulting vectors to the round-and-store
// helper, which writes to `dst`. Callers in this file advance `src` and `dst`
// by 32 bytes between calls, i.e. one call covers one 32-pixel-wide segment
// of a single row.
static inline void sr_x_6tap_32_avx2(const uint8_t *const src,
2918
                                     const __m256i coeffs[3],
2919
                                     const __m256i filt[3],
2920
1.77M
                                     uint8_t *const dst) {
2921
1.77M
  // Scratch for the convolution output; filled by the helper called next.
  __m256i r[2];
2922
2923
1.77M
  x_convolve_6tap_32_avx2(src, coeffs, filt, r);
2924
1.77M
  sr_x_round_store_32_avx2(r, dst);
2925
1.77M
}
Unexecuted instantiation: convolve_2d_avx2.c:sr_x_6tap_32_avx2
convolve_avx2.c:sr_x_6tap_32_avx2
Line
Count
Source
2920
1.77M
                                     uint8_t *const dst) {
2921
1.77M
  __m256i r[2];
2922
2923
1.77M
  x_convolve_6tap_32_avx2(src, coeffs, filt, r);
2924
1.77M
  sr_x_round_store_32_avx2(r, dst);
2925
1.77M
}
2926
2927
// Runs the 8-tap horizontal convolution helper on `src` with `coeffs` and the
// `filt` vectors, then hands the two resulting vectors to the round-and-store
// helper, which writes to `dst`. Callers in this file advance `src` and `dst`
// by 32 bytes between calls, i.e. one call covers one 32-pixel-wide segment
// of a single row.
static AOM_FORCE_INLINE void sr_x_8tap_32_avx2(const uint8_t *const src,
2928
                                               const __m256i coeffs[4],
2929
                                               const __m256i filt[4],
2930
179k
                                               uint8_t *const dst) {
2931
179k
  // Scratch for the convolution output; filled by the helper called next.
  __m256i r[2];
2932
2933
179k
  x_convolve_8tap_32_avx2(src, coeffs, filt, r);
2934
179k
  sr_x_round_store_32_avx2(r, dst);
2935
179k
}
Unexecuted instantiation: convolve_2d_avx2.c:sr_x_8tap_32_avx2
convolve_avx2.c:sr_x_8tap_32_avx2
Line
Count
Source
2930
179k
                                               uint8_t *const dst) {
2931
179k
  __m256i r[2];
2932
2933
179k
  x_convolve_8tap_32_avx2(src, coeffs, filt, r);
2934
179k
  sr_x_round_store_32_avx2(r, dst);
2935
179k
}
2936
2937
// Horizontal-only convolution of a w x h block from `src` into the 8-bit
// buffer `dst`. The tap count returned by get_filter_tap() selects a 2-, 4-,
// 6- or 8-tap path, and inside each path the block width `w` selects a kernel:
//   - 2-tap: widths 2, 4, 8, 16, 32, 64 and 128. When subpel_x_q4 == 8 the
//     filter is replaced by averaging each pixel with its right neighbour.
//   - 4-tap: widths 2, 4, 8 and any multiple of 16.
//   - 6-/8-tap: widths 8, 16, 32 and 64; every other width reaches the final
//     `else`, which asserts w == 128.
// A tap count other than 2, 4, 6 or 8 falls through without writing `dst`.
// The do/while loops that step two rows at a time stop only when `y` hits
// exactly zero, so they need `h` to be positive and even; the one-row
// `--y` loops used for widths 32 and up need `h` > 0.
// `conv_params` is read only by the asserts at the top.
static AOM_FORCE_INLINE void av1_convolve_x_sr_specialized_avx2(
2938
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
2939
    int32_t w, int32_t h, const InterpFilterParams *filter_params_x,
2940
828k
    const int32_t subpel_x_q4, ConvolveParams *conv_params) {
2941
828k
  // Remaining-row counter shared by every do/while loop below.
  int32_t y = h;
2942
828k
  __m128i coeffs_128[4];
2943
828k
  __m256i coeffs_256[4];
2944
2945
828k
  assert(conv_params->round_0 == 3);
2946
828k
  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
2947
828k
         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
2948
828k
  // Keeps conv_params referenced when the asserts above compile to nothing.
  (void)conv_params;
2949
2950
828k
  const int horz_tap = get_filter_tap(filter_params_x, subpel_x_q4);
2951
2952
828k
  if (horz_tap == 2) {
2953
    // horz_filt as 2 tap
2954
44.7k
    const uint8_t *src_ptr = src;
2955
2956
44.7k
    if (subpel_x_q4 != 8) {
2957
28.3k
      if (w <= 8) {
2958
21.1k
        prepare_half_coeffs_2tap_ssse3(filter_params_x, subpel_x_q4,
2959
21.1k
                                       coeffs_128);
2960
2961
21.1k
        if (w == 2) {
2962
5.93k
          do {
2963
5.93k
            const __m128i res =
2964
5.93k
                x_convolve_2tap_2x2_sse4_1(src_ptr, src_stride, coeffs_128);
2965
5.93k
            const __m128i r = sr_x_round_sse2(res);
2966
5.93k
            pack_store_2x2_sse2(r, dst, dst_stride);
2967
5.93k
            src_ptr += 2 * src_stride;
2968
5.93k
            dst += 2 * dst_stride;
2969
5.93k
            y -= 2;
2970
5.93k
          } while (y);
2971
17.9k
        } else if (w == 4) {
2972
30.9k
          do {
2973
30.9k
            const __m128i res =
2974
30.9k
                x_convolve_2tap_4x2_ssse3(src_ptr, src_stride, coeffs_128);
2975
30.9k
            const __m128i r = sr_x_round_sse2(res);
2976
30.9k
            pack_store_4x2_sse2(r, dst, dst_stride);
2977
30.9k
            src_ptr += 2 * src_stride;
2978
30.9k
            dst += 2 * dst_stride;
2979
30.9k
            y -= 2;
2980
30.9k
          } while (y);
2981
9.75k
        } else {
2982
8.22k
          assert(w == 8);
2983
2984
31.1k
          do {
2985
31.1k
            __m128i res[2];
2986
2987
31.1k
            x_convolve_2tap_8x2_ssse3(src_ptr, src_stride, coeffs_128, res);
2988
31.1k
            res[0] = sr_x_round_sse2(res[0]);
2989
31.1k
            res[1] = sr_x_round_sse2(res[1]);
2990
31.1k
            // Pack both rows into one register: res[0] becomes the low 8
            // bytes, res[1] the high 8 bytes.
            const __m128i d = _mm_packus_epi16(res[0], res[1]);
2991
31.1k
            _mm_storel_epi64((__m128i *)dst, d);
2992
31.1k
            // _mm_storeh_epi64 is a project helper; presumably it stores the
            // high 8 bytes of d (the second row) -- confirm.
            _mm_storeh_epi64((__m128i *)(dst + dst_stride), d);
2993
2994
31.1k
            src_ptr += 2 * src_stride;
2995
31.1k
            dst += 2 * dst_stride;
2996
31.1k
            y -= 2;
2997
31.1k
          } while (y);
2998
8.22k
        }
2999
21.1k
      } else {
3000
7.14k
        prepare_half_coeffs_2tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);
3001
3002
7.14k
        if (w == 16) {
3003
25.2k
          do {
3004
25.2k
            __m256i r[2];
3005
3006
25.2k
            x_convolve_2tap_16x2_avx2(src_ptr, src_stride, coeffs_256, r);
3007
25.2k
            sr_x_round_store_16x2_avx2(r, dst, dst_stride);
3008
25.2k
            src_ptr += 2 * src_stride;
3009
25.2k
            dst += 2 * dst_stride;
3010
25.2k
            y -= 2;
3011
25.2k
          } while (y);
3012
4.35k
        } else if (w == 32) {
3013
38.1k
          // From here on one row is produced per iteration, in 32-byte
          // segments.
          do {
3014
38.1k
            sr_x_2tap_32_avx2(src_ptr, coeffs_256, dst);
3015
38.1k
            src_ptr += src_stride;
3016
38.1k
            dst += dst_stride;
3017
38.1k
          } while (--y);
3018
1.57k
        } else if (w == 64) {
3019
46.9k
          do {
3020
46.9k
            sr_x_2tap_32_avx2(src_ptr + 0 * 32, coeffs_256, dst + 0 * 32);
3021
46.9k
            sr_x_2tap_32_avx2(src_ptr + 1 * 32, coeffs_256, dst + 1 * 32);
3022
46.9k
            src_ptr += src_stride;
3023
46.9k
            dst += dst_stride;
3024
46.9k
          } while (--y);
3025
969
        } else {
3026
248
          assert(w == 128);
3027
3028
25.0k
          do {
3029
25.0k
            sr_x_2tap_32_avx2(src_ptr + 0 * 32, coeffs_256, dst + 0 * 32);
3030
25.0k
            sr_x_2tap_32_avx2(src_ptr + 1 * 32, coeffs_256, dst + 1 * 32);
3031
25.0k
            sr_x_2tap_32_avx2(src_ptr + 2 * 32, coeffs_256, dst + 2 * 32);
3032
25.0k
            sr_x_2tap_32_avx2(src_ptr + 3 * 32, coeffs_256, dst + 3 * 32);
3033
25.0k
            src_ptr += src_stride;
3034
25.0k
            dst += dst_stride;
3035
25.0k
          } while (--y);
3036
248
        }
3037
7.14k
      }
3038
28.3k
    } else {
3039
      // average to get half pel
3040
16.4k
      if (w == 2) {
3041
3.92k
        do {
3042
3.92k
          __m128i s_128;
3043
3044
3.92k
          s_128 = load_u8_4x2_sse4_1(src_ptr, src_stride);
3045
3.92k
          // s1 is s_128 shifted right by one byte, so the average below pairs
          // every byte with its right-hand neighbour.
          const __m128i s1 = _mm_srli_si128(s_128, 1);
3046
3.92k
          const __m128i d = _mm_avg_epu8(s_128, s1);
3047
3.92k
          // NOTE(review): the two stores below write through a uint16_t* cast
          // of the uint8_t* dst; nothing visible here guarantees 2-byte
          // alignment -- consider a memcpy-based store.
          *(uint16_t *)dst = (uint16_t)_mm_cvtsi128_si32(d);
3048
3.92k
          // 16-bit element 2 is bytes 4..5 of d; presumably that is where the
          // loader placed the second row -- confirm against
          // load_u8_4x2_sse4_1.
          *(uint16_t *)(dst + dst_stride) = _mm_extract_epi16(d, 2);
3049
3050
3.92k
          src_ptr += 2 * src_stride;
3051
3.92k
          dst += 2 * dst_stride;
3052
3.92k
          y -= 2;
3053
3.92k
        } while (y);
3054
14.5k
      } else if (w == 4) {
3055
17.5k
        do {
3056
17.5k
          __m128i s_128;
3057
3058
17.5k
          s_128 = load_u8_8x2_sse2(src_ptr, src_stride);
3059
17.5k
          const __m128i s1 = _mm_srli_si128(s_128, 1);
3060
17.5k
          const __m128i d = _mm_avg_epu8(s_128, s1);
3061
17.5k
          xx_storel_32(dst, d);
3062
17.5k
          // NOTE(review): store through an int32_t* cast of the uint8_t* dst;
          // alignment is not guaranteed by anything visible here. 32-bit
          // element 2 is bytes 8..11 of d.
          *(int32_t *)(dst + dst_stride) = _mm_extract_epi32(d, 2);
3063
3064
17.5k
          src_ptr += 2 * src_stride;
3065
17.5k
          dst += 2 * dst_stride;
3066
17.5k
          y -= 2;
3067
17.5k
        } while (y);
3068
8.34k
      } else if (w == 8) {
3069
16.4k
        do {
3070
16.4k
          // Full 16-byte loads per row; only the low 8 averaged bytes of each
          // row are stored below.
          const __m128i s00 = _mm_loadu_si128((__m128i *)src_ptr);
3071
16.4k
          const __m128i s10 =
3072
16.4k
              _mm_loadu_si128((__m128i *)(src_ptr + src_stride));
3073
16.4k
          const __m128i s01 = _mm_srli_si128(s00, 1);
3074
16.4k
          const __m128i s11 = _mm_srli_si128(s10, 1);
3075
16.4k
          const __m128i d0 = _mm_avg_epu8(s00, s01);
3076
16.4k
          const __m128i d1 = _mm_avg_epu8(s10, s11);
3077
16.4k
          _mm_storel_epi64((__m128i *)dst, d0);
3078
16.4k
          _mm_storel_epi64((__m128i *)(dst + dst_stride), d1);
3079
3080
16.4k
          src_ptr += 2 * src_stride;
3081
16.4k
          dst += 2 * dst_stride;
3082
16.4k
          y -= 2;
3083
16.4k
        } while (y);
3084
4.55k
      } else if (w == 16) {
3085
13.3k
        do {
3086
13.3k
          // The right-hand neighbours come from a second load at offset +1
          // rather than from a byte shift.
          const __m128i s00 = _mm_loadu_si128((__m128i *)src_ptr);
3087
13.3k
          const __m128i s01 = _mm_loadu_si128((__m128i *)(src_ptr + 1));
3088
13.3k
          const __m128i s10 =
3089
13.3k
              _mm_loadu_si128((__m128i *)(src_ptr + src_stride));
3090
13.3k
          const __m128i s11 =
3091
13.3k
              _mm_loadu_si128((__m128i *)(src_ptr + src_stride + 1));
3092
13.3k
          const __m128i d0 = _mm_avg_epu8(s00, s01);
3093
13.3k
          const __m128i d1 = _mm_avg_epu8(s10, s11);
3094
13.3k
          _mm_storeu_si128((__m128i *)dst, d0);
3095
13.3k
          _mm_storeu_si128((__m128i *)(dst + dst_stride), d1);
3096
3097
13.3k
          src_ptr += 2 * src_stride;
3098
13.3k
          dst += 2 * dst_stride;
3099
13.3k
          y -= 2;
3100
13.3k
        } while (y);
3101
2.25k
      } else if (w == 32) {
3102
28.7k
        do {
3103
28.7k
          sr_x_2tap_32_avg_avx2(src_ptr, dst);
3104
28.7k
          src_ptr += src_stride;
3105
28.7k
          dst += dst_stride;
3106
28.7k
        } while (--y);
3107
927
      } else if (w == 64) {
3108
25.1k
        do {
3109
25.1k
          sr_x_2tap_32_avg_avx2(src_ptr + 0 * 32, dst + 0 * 32);
3110
25.1k
          sr_x_2tap_32_avg_avx2(src_ptr + 1 * 32, dst + 1 * 32);
3111
25.1k
          src_ptr += src_stride;
3112
25.1k
          dst += dst_stride;
3113
25.1k
        } while (--y);
3114
468
      } else {
3115
140
        assert(w == 128);
3116
3117
14.0k
        do {
3118
14.0k
          sr_x_2tap_32_avg_avx2(src_ptr + 0 * 32, dst + 0 * 32);
3119
14.0k
          sr_x_2tap_32_avg_avx2(src_ptr + 1 * 32, dst + 1 * 32);
3120
14.0k
          sr_x_2tap_32_avg_avx2(src_ptr + 2 * 32, dst + 2 * 32);
3121
14.0k
          sr_x_2tap_32_avg_avx2(src_ptr + 3 * 32, dst + 3 * 32);
3122
14.0k
          src_ptr += src_stride;
3123
14.0k
          dst += dst_stride;
3124
14.0k
        } while (--y);
3125
140
      }
3126
16.4k
    }
3127
783k
  } else if (horz_tap == 4) {
3128
    // horz_filt as 4 tap
3129
339k
    // Reads start one pixel to the left of the output position.
    const uint8_t *src_ptr = src - 1;
3130
3131
339k
    prepare_half_coeffs_4tap_ssse3(filter_params_x, subpel_x_q4, coeffs_128);
3132
3133
339k
    if (w == 2) {
3134
173k
      do {
3135
173k
        const __m128i res =
3136
173k
            x_convolve_4tap_2x2_ssse3(src_ptr, src_stride, coeffs_128);
3137
173k
        const __m128i r = sr_x_round_sse2(res);
3138
173k
        pack_store_2x2_sse2(r, dst, dst_stride);
3139
173k
        src_ptr += 2 * src_stride;
3140
173k
        dst += 2 * dst_stride;
3141
173k
        y -= 2;
3142
173k
      } while (y);
3143
275k
    } else if (w == 4) {
3144
891k
      do {
3145
891k
        const __m128i res =
3146
891k
            x_convolve_4tap_4x2_ssse3(src_ptr, src_stride, coeffs_128);
3147
891k
        const __m128i r = sr_x_round_sse2(res);
3148
891k
        pack_store_4x2_sse2(r, dst, dst_stride);
3149
891k
        src_ptr += 2 * src_stride;
3150
891k
        dst += 2 * dst_stride;
3151
891k
        y -= 2;
3152
891k
      } while (y);
3153
250k
    } else if (w == 8) {
3154
      // TODO(chiyotsai@google.com): Reuse the old SIMD code here. Need to
3155
      // rewrite this for better performance later.
3156
15.0k
      __m256i filt_256[2];
3157
15.0k
      // This path uses prepare_coeffs_lowbd() rather than the half-coefficient
      // setup prepared above, and indexes rows with `i` instead of advancing
      // src_ptr/dst.
      prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs_256);
3158
3159
15.0k
      filt_256[0] = _mm256_loadu_si256((__m256i const *)filt1_global_avx2);
3160
15.0k
      filt_256[1] = _mm256_loadu_si256((__m256i const *)filt2_global_avx2);
3161
71.5k
      // Two rows per iteration: rows i and i + 1 are both read and written.
      for (int i = 0; i < h; i += 2) {
3162
56.5k
        // Selector 0x20 puts the low 128 bits of the first operand (row i) in
        // the low lane and the low 128 bits of the second (row i + 1) in the
        // high lane.
        const __m256i data = _mm256_permute2x128_si256(
3163
56.5k
            _mm256_castsi128_si256(
3164
56.5k
                _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))),
3165
56.5k
            _mm256_castsi128_si256(_mm_loadu_si128(
3166
56.5k
                (__m128i *)(&src_ptr[i * src_stride + src_stride]))),
3167
56.5k
            0x20);
3168
3169
56.5k
        // The coefficient array is passed starting at element 1.
        __m256i res_16b = convolve_lowbd_x_4tap(data, coeffs_256 + 1, filt_256);
3170
56.5k
        res_16b = sr_x_round_avx2(res_16b);
3171
3172
56.5k
        __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
3173
3174
56.5k
        // Low lane holds row i, high lane holds row i + 1.
        const __m128i res_0 = _mm256_castsi256_si128(res_8b);
3175
56.5k
        const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);
3176
3177
56.5k
        _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0);
3178
56.5k
        _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1);
3179
56.5k
      }
3180
15.0k
    } else {
3181
10.2k
      assert(!(w % 16));
3182
      // TODO(chiyotsai@google.com): Reuse the old SIMD code here. Need to
3183
      // rewrite this for better performance later.
3184
10.2k
      __m256i filt_256[2];
3185
10.2k
      prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs_256);
3186
10.2k
      filt_256[0] = _mm256_loadu_si256((__m256i const *)filt1_global_avx2);
3187
10.2k
      filt_256[1] = _mm256_loadu_si256((__m256i const *)filt2_global_avx2);
3188
3189
215k
      // One row per outer iteration, 16 output pixels per inner iteration.
      for (int i = 0; i < h; ++i) {
3190
764k
        for (int j = 0; j < w; j += 16) {
3191
          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17
3192
          // 18 19 20 21 22 23
3193
559k
          // The 256-bit load reads 32 bytes starting at column j; its upper
          // lane is then replaced by the 16 bytes starting at column j + 8.
          const __m256i data = _mm256_inserti128_si256(
3194
559k
              _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]),
3195
559k
              _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]),
3196
559k
              1);
3197
3198
559k
          __m256i res_16b =
3199
559k
              convolve_lowbd_x_4tap(data, coeffs_256 + 1, filt_256);
3200
559k
          res_16b = sr_x_round_avx2(res_16b);
3201
3202
          /* rounding code */
3203
          // 8 bit conversion and saturation to uint8
3204
559k
          __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
3205
3206
          // Store values into the destination buffer
3207
          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
3208
559k
          // 216 == 0xD8 selects 64-bit elements 0, 2, 1, 3, so the low 64
          // bits of each lane end up side by side in the low 128 bits.
          res_8b = _mm256_permute4x64_epi64(res_8b, 216);
3209
559k
          __m128i res = _mm256_castsi256_si128(res_8b);
3210
559k
          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
3211
559k
        }
3212
204k
      }
3213
10.2k
    }
3214
443k
  } else {
3215
443k
    __m256i filt_256[4];
3216
3217
443k
    // Entries 0..2 are loaded for both paths below; entry 3 is loaded only
    // inside the 8-tap branch.
    filt_256[0] = _mm256_loadu_si256((__m256i const *)filt1_global_avx2);
3218
443k
    filt_256[1] = _mm256_loadu_si256((__m256i const *)filt2_global_avx2);
3219
443k
    filt_256[2] = _mm256_loadu_si256((__m256i const *)filt3_global_avx2);
3220
3221
443k
    if (horz_tap == 6) {
3222
      // horz_filt as 6 tap
3223
427k
      // Reads start two pixels to the left of the output position.
      const uint8_t *src_ptr = src - 2;
3224
3225
427k
      prepare_half_coeffs_6tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);
3226
3227
427k
      if (w == 8) {
3228
961k
        do {
3229
961k
          const __m256i res = x_convolve_6tap_8x2_avx2(src_ptr, src_stride,
3230
961k
                                                       coeffs_256, filt_256);
3231
961k
          sr_x_round_store_8x2_avx2(res, dst, dst_stride);
3232
961k
          src_ptr += 2 * src_stride;
3233
961k
          dst += 2 * dst_stride;
3234
961k
          y -= 2;
3235
961k
        } while (y);
3236
237k
      } else if (w == 16) {
3237
788k
        do {
3238
788k
          __m256i r[2];
3239
3240
788k
          x_convolve_6tap_16x2_avx2(src_ptr, src_stride, coeffs_256, filt_256,
3241
788k
                                    r);
3242
788k
          sr_x_round_store_16x2_avx2(r, dst, dst_stride);
3243
788k
          src_ptr += 2 * src_stride;
3244
788k
          dst += 2 * dst_stride;
3245
788k
          y -= 2;
3246
788k
        } while (y);
3247
146k
      } else if (w == 32) {
3248
696k
        do {
3249
696k
          sr_x_6tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
3250
696k
          src_ptr += src_stride;
3251
696k
          dst += dst_stride;
3252
696k
        } while (--y);
3253
36.5k
      } else if (w == 64) {
3254
279k
        do {
3255
279k
          sr_x_6tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
3256
279k
          sr_x_6tap_32_avx2(src_ptr + 32, coeffs_256, filt_256, dst + 32);
3257
279k
          src_ptr += src_stride;
3258
279k
          dst += dst_stride;
3259
279k
        } while (--y);
3260
5.58k
      } else {
3261
1.08k
        assert(w == 128);
3262
3263
130k
        do {
3264
130k
          sr_x_6tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
3265
130k
          sr_x_6tap_32_avx2(src_ptr + 1 * 32, coeffs_256, filt_256,
3266
130k
                            dst + 1 * 32);
3267
130k
          sr_x_6tap_32_avx2(src_ptr + 2 * 32, coeffs_256, filt_256,
3268
130k
                            dst + 2 * 32);
3269
130k
          sr_x_6tap_32_avx2(src_ptr + 3 * 32, coeffs_256, filt_256,
3270
130k
                            dst + 3 * 32);
3271
130k
          src_ptr += src_stride;
3272
130k
          dst += dst_stride;
3273
130k
        } while (--y);
3274
1.10k
      }
3275
427k
    } else if (horz_tap == 8) {
3276
      // horz_filt as 8 tap
3277
15.9k
      // Reads start three pixels to the left of the output position.
      const uint8_t *src_ptr = src - 3;
3278
3279
15.9k
      filt_256[3] = _mm256_loadu_si256((__m256i const *)filt4_global_avx2);
3280
3281
15.9k
      prepare_half_coeffs_8tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);
3282
3283
15.9k
      if (w == 8) {
3284
36.2k
        do {
3285
36.2k
          const __m256i res = x_convolve_8tap_8x2_avx2(src_ptr, src_stride,
3286
36.2k
                                                       coeffs_256, filt_256);
3287
36.2k
          sr_x_round_store_8x2_avx2(res, dst, dst_stride);
3288
36.2k
          src_ptr += 2 * src_stride;
3289
36.2k
          dst += 2 * dst_stride;
3290
36.2k
          y -= 2;
3291
36.2k
        } while (y);
3292
8.42k
      } else if (w == 16) {
3293
29.3k
        do {
3294
29.3k
          __m256i r[2];
3295
3296
29.3k
          x_convolve_8tap_16x2_avx2(src_ptr, src_stride, coeffs_256, filt_256,
3297
29.3k
                                    r);
3298
29.3k
          sr_x_round_store_16x2_avx2(r, dst, dst_stride);
3299
29.3k
          src_ptr += 2 * src_stride;
3300
29.3k
          dst += 2 * dst_stride;
3301
29.3k
          y -= 2;
3302
29.3k
        } while (y);
3303
4.95k
      } else if (w == 32) {
3304
39.6k
        do {
3305
39.6k
          sr_x_8tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
3306
39.6k
          src_ptr += src_stride;
3307
39.6k
          dst += dst_stride;
3308
39.6k
        } while (--y);
3309
1.77k
      } else if (w == 64) {
3310
31.9k
        do {
3311
31.9k
          sr_x_8tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
3312
31.9k
          sr_x_8tap_32_avx2(src_ptr + 32, coeffs_256, filt_256, dst + 32);
3313
31.9k
          src_ptr += src_stride;
3314
31.9k
          dst += dst_stride;
3315
31.9k
        } while (--y);
3316
656
      } else {
3317
180
        assert(w == 128);
3318
3319
19.0k
        do {
3320
19.0k
          sr_x_8tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
3321
19.0k
          sr_x_8tap_32_avx2(src_ptr + 1 * 32, coeffs_256, filt_256,
3322
19.0k
                            dst + 1 * 32);
3323
19.0k
          sr_x_8tap_32_avx2(src_ptr + 2 * 32, coeffs_256, filt_256,
3324
19.0k
                            dst + 2 * 32);
3325
19.0k
          sr_x_8tap_32_avx2(src_ptr + 3 * 32, coeffs_256, filt_256,
3326
19.0k
                            dst + 3 * 32);
3327
19.0k
          src_ptr += src_stride;
3328
19.0k
          dst += dst_stride;
3329
19.0k
        } while (--y);
3330
180
      }
3331
15.9k
    }
3332
443k
  }
3333
828k
}
Unexecuted instantiation: convolve_2d_avx2.c:av1_convolve_x_sr_specialized_avx2
convolve_avx2.c:av1_convolve_x_sr_specialized_avx2
Line
Count
Source
2940
828k
    const int32_t subpel_x_q4, ConvolveParams *conv_params) {
2941
828k
  int32_t y = h;
2942
828k
  __m128i coeffs_128[4];
2943
828k
  __m256i coeffs_256[4];
2944
2945
828k
  assert(conv_params->round_0 == 3);
2946
828k
  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
2947
828k
         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
2948
828k
  (void)conv_params;
2949
2950
828k
  const int horz_tap = get_filter_tap(filter_params_x, subpel_x_q4);
2951
2952
828k
  if (horz_tap == 2) {
2953
    // horz_filt as 2 tap
2954
44.7k
    const uint8_t *src_ptr = src;
2955
2956
44.7k
    if (subpel_x_q4 != 8) {
2957
28.3k
      if (w <= 8) {
2958
21.1k
        prepare_half_coeffs_2tap_ssse3(filter_params_x, subpel_x_q4,
2959
21.1k
                                       coeffs_128);
2960
2961
21.1k
        if (w == 2) {
2962
5.93k
          do {
2963
5.93k
            const __m128i res =
2964
5.93k
                x_convolve_2tap_2x2_sse4_1(src_ptr, src_stride, coeffs_128);
2965
5.93k
            const __m128i r = sr_x_round_sse2(res);
2966
5.93k
            pack_store_2x2_sse2(r, dst, dst_stride);
2967
5.93k
            src_ptr += 2 * src_stride;
2968
5.93k
            dst += 2 * dst_stride;
2969
5.93k
            y -= 2;
2970
5.93k
          } while (y);
2971
17.9k
        } else if (w == 4) {
2972
30.9k
          do {
2973
30.9k
            const __m128i res =
2974
30.9k
                x_convolve_2tap_4x2_ssse3(src_ptr, src_stride, coeffs_128);
2975
30.9k
            const __m128i r = sr_x_round_sse2(res);
2976
30.9k
            pack_store_4x2_sse2(r, dst, dst_stride);
2977
30.9k
            src_ptr += 2 * src_stride;
2978
30.9k
            dst += 2 * dst_stride;
2979
30.9k
            y -= 2;
2980
30.9k
          } while (y);
2981
9.75k
        } else {
2982
8.22k
          assert(w == 8);
2983
2984
31.1k
          do {
2985
31.1k
            __m128i res[2];
2986
2987
31.1k
            x_convolve_2tap_8x2_ssse3(src_ptr, src_stride, coeffs_128, res);
2988
31.1k
            res[0] = sr_x_round_sse2(res[0]);
2989
31.1k
            res[1] = sr_x_round_sse2(res[1]);
2990
31.1k
            const __m128i d = _mm_packus_epi16(res[0], res[1]);
2991
31.1k
            _mm_storel_epi64((__m128i *)dst, d);
2992
31.1k
            _mm_storeh_epi64((__m128i *)(dst + dst_stride), d);
2993
2994
31.1k
            src_ptr += 2 * src_stride;
2995
31.1k
            dst += 2 * dst_stride;
2996
31.1k
            y -= 2;
2997
31.1k
          } while (y);
2998
8.22k
        }
2999
21.1k
      } else {
3000
7.14k
        prepare_half_coeffs_2tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);
3001
3002
7.14k
        if (w == 16) {
3003
25.2k
          do {
3004
25.2k
            __m256i r[2];
3005
3006
25.2k
            x_convolve_2tap_16x2_avx2(src_ptr, src_stride, coeffs_256, r);
3007
25.2k
            sr_x_round_store_16x2_avx2(r, dst, dst_stride);
3008
25.2k
            src_ptr += 2 * src_stride;
3009
25.2k
            dst += 2 * dst_stride;
3010
25.2k
            y -= 2;
3011
25.2k
          } while (y);
3012
4.35k
        } else if (w == 32) {
3013
38.1k
          do {
3014
38.1k
            sr_x_2tap_32_avx2(src_ptr, coeffs_256, dst);
3015
38.1k
            src_ptr += src_stride;
3016
38.1k
            dst += dst_stride;
3017
38.1k
          } while (--y);
3018
1.57k
        } else if (w == 64) {
3019
46.9k
          do {
3020
46.9k
            sr_x_2tap_32_avx2(src_ptr + 0 * 32, coeffs_256, dst + 0 * 32);
3021
46.9k
            sr_x_2tap_32_avx2(src_ptr + 1 * 32, coeffs_256, dst + 1 * 32);
3022
46.9k
            src_ptr += src_stride;
3023
46.9k
            dst += dst_stride;
3024
46.9k
          } while (--y);
3025
969
        } else {
3026
248
          assert(w == 128);
3027
3028
25.0k
          do {
3029
25.0k
            sr_x_2tap_32_avx2(src_ptr + 0 * 32, coeffs_256, dst + 0 * 32);
3030
25.0k
            sr_x_2tap_32_avx2(src_ptr + 1 * 32, coeffs_256, dst + 1 * 32);
3031
25.0k
            sr_x_2tap_32_avx2(src_ptr + 2 * 32, coeffs_256, dst + 2 * 32);
3032
25.0k
            sr_x_2tap_32_avx2(src_ptr + 3 * 32, coeffs_256, dst + 3 * 32);
3033
25.0k
            src_ptr += src_stride;
3034
25.0k
            dst += dst_stride;
3035
25.0k
          } while (--y);
3036
248
        }
3037
7.14k
      }
3038
28.3k
    } else {
3039
      // average to get half pel
3040
16.4k
      if (w == 2) {
3041
3.92k
        do {
3042
3.92k
          __m128i s_128;
3043
3044
3.92k
          s_128 = load_u8_4x2_sse4_1(src_ptr, src_stride);
3045
3.92k
          const __m128i s1 = _mm_srli_si128(s_128, 1);
3046
3.92k
          const __m128i d = _mm_avg_epu8(s_128, s1);
3047
3.92k
          *(uint16_t *)dst = (uint16_t)_mm_cvtsi128_si32(d);
3048
3.92k
          *(uint16_t *)(dst + dst_stride) = _mm_extract_epi16(d, 2);
3049
3050
3.92k
          src_ptr += 2 * src_stride;
3051
3.92k
          dst += 2 * dst_stride;
3052
3.92k
          y -= 2;
3053
3.92k
        } while (y);
3054
14.5k
      } else if (w == 4) {
3055
17.5k
        do {
3056
17.5k
          __m128i s_128;
3057
3058
17.5k
          s_128 = load_u8_8x2_sse2(src_ptr, src_stride);
3059
17.5k
          const __m128i s1 = _mm_srli_si128(s_128, 1);
3060
17.5k
          const __m128i d = _mm_avg_epu8(s_128, s1);
3061
17.5k
          xx_storel_32(dst, d);
3062
17.5k
          *(int32_t *)(dst + dst_stride) = _mm_extract_epi32(d, 2);
3063
3064
17.5k
          src_ptr += 2 * src_stride;
3065
17.5k
          dst += 2 * dst_stride;
3066
17.5k
          y -= 2;
3067
17.5k
        } while (y);
3068
8.34k
      } else if (w == 8) {
3069
16.4k
        do {
3070
16.4k
          const __m128i s00 = _mm_loadu_si128((__m128i *)src_ptr);
3071
16.4k
          const __m128i s10 =
3072
16.4k
              _mm_loadu_si128((__m128i *)(src_ptr + src_stride));
3073
16.4k
          const __m128i s01 = _mm_srli_si128(s00, 1);
3074
16.4k
          const __m128i s11 = _mm_srli_si128(s10, 1);
3075
16.4k
          const __m128i d0 = _mm_avg_epu8(s00, s01);
3076
16.4k
          const __m128i d1 = _mm_avg_epu8(s10, s11);
3077
16.4k
          _mm_storel_epi64((__m128i *)dst, d0);
3078
16.4k
          _mm_storel_epi64((__m128i *)(dst + dst_stride), d1);
3079
3080
16.4k
          src_ptr += 2 * src_stride;
3081
16.4k
          dst += 2 * dst_stride;
3082
16.4k
          y -= 2;
3083
16.4k
        } while (y);
3084
4.55k
      } else if (w == 16) {
3085
13.3k
        do {
3086
13.3k
          const __m128i s00 = _mm_loadu_si128((__m128i *)src_ptr);
3087
13.3k
          const __m128i s01 = _mm_loadu_si128((__m128i *)(src_ptr + 1));
3088
13.3k
          const __m128i s10 =
3089
13.3k
              _mm_loadu_si128((__m128i *)(src_ptr + src_stride));
3090
13.3k
          const __m128i s11 =
3091
13.3k
              _mm_loadu_si128((__m128i *)(src_ptr + src_stride + 1));
3092
13.3k
          const __m128i d0 = _mm_avg_epu8(s00, s01);
3093
13.3k
          const __m128i d1 = _mm_avg_epu8(s10, s11);
3094
13.3k
          _mm_storeu_si128((__m128i *)dst, d0);
3095
13.3k
          _mm_storeu_si128((__m128i *)(dst + dst_stride), d1);
3096
3097
13.3k
          src_ptr += 2 * src_stride;
3098
13.3k
          dst += 2 * dst_stride;
3099
13.3k
          y -= 2;
3100
13.3k
        } while (y);
3101
2.25k
      } else if (w == 32) {
3102
28.7k
        do {
3103
28.7k
          sr_x_2tap_32_avg_avx2(src_ptr, dst);
3104
28.7k
          src_ptr += src_stride;
3105
28.7k
          dst += dst_stride;
3106
28.7k
        } while (--y);
3107
927
      } else if (w == 64) {
3108
25.1k
        do {
3109
25.1k
          sr_x_2tap_32_avg_avx2(src_ptr + 0 * 32, dst + 0 * 32);
3110
25.1k
          sr_x_2tap_32_avg_avx2(src_ptr + 1 * 32, dst + 1 * 32);
3111
25.1k
          src_ptr += src_stride;
3112
25.1k
          dst += dst_stride;
3113
25.1k
        } while (--y);
3114
468
      } else {
3115
140
        assert(w == 128);
3116
3117
14.0k
        do {
3118
14.0k
          sr_x_2tap_32_avg_avx2(src_ptr + 0 * 32, dst + 0 * 32);
3119
14.0k
          sr_x_2tap_32_avg_avx2(src_ptr + 1 * 32, dst + 1 * 32);
3120
14.0k
          sr_x_2tap_32_avg_avx2(src_ptr + 2 * 32, dst + 2 * 32);
3121
14.0k
          sr_x_2tap_32_avg_avx2(src_ptr + 3 * 32, dst + 3 * 32);
3122
14.0k
          src_ptr += src_stride;
3123
14.0k
          dst += dst_stride;
3124
14.0k
        } while (--y);
3125
140
      }
3126
16.4k
    }
3127
783k
  } else if (horz_tap == 4) {
3128
    // horz_filt as 4 tap
3129
339k
    const uint8_t *src_ptr = src - 1;
3130
3131
339k
    prepare_half_coeffs_4tap_ssse3(filter_params_x, subpel_x_q4, coeffs_128);
3132
3133
339k
    if (w == 2) {
3134
173k
      do {
3135
173k
        const __m128i res =
3136
173k
            x_convolve_4tap_2x2_ssse3(src_ptr, src_stride, coeffs_128);
3137
173k
        const __m128i r = sr_x_round_sse2(res);
3138
173k
        pack_store_2x2_sse2(r, dst, dst_stride);
3139
173k
        src_ptr += 2 * src_stride;
3140
173k
        dst += 2 * dst_stride;
3141
173k
        y -= 2;
3142
173k
      } while (y);
3143
275k
    } else if (w == 4) {
3144
891k
      do {
3145
891k
        const __m128i res =
3146
891k
            x_convolve_4tap_4x2_ssse3(src_ptr, src_stride, coeffs_128);
3147
891k
        const __m128i r = sr_x_round_sse2(res);
3148
891k
        pack_store_4x2_sse2(r, dst, dst_stride);
3149
891k
        src_ptr += 2 * src_stride;
3150
891k
        dst += 2 * dst_stride;
3151
891k
        y -= 2;
3152
891k
      } while (y);
3153
250k
    } else if (w == 8) {
3154
      // TODO(chiyotsai@google.com): Reuse the old SIMD code here. Need to
3155
      // rewrite this for better performance later.
3156
15.0k
      __m256i filt_256[2];
3157
15.0k
      prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs_256);
3158
3159
15.0k
      filt_256[0] = _mm256_loadu_si256((__m256i const *)filt1_global_avx2);
3160
15.0k
      filt_256[1] = _mm256_loadu_si256((__m256i const *)filt2_global_avx2);
3161
71.5k
      for (int i = 0; i < h; i += 2) {
3162
56.5k
        const __m256i data = _mm256_permute2x128_si256(
3163
56.5k
            _mm256_castsi128_si256(
3164
56.5k
                _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))),
3165
56.5k
            _mm256_castsi128_si256(_mm_loadu_si128(
3166
56.5k
                (__m128i *)(&src_ptr[i * src_stride + src_stride]))),
3167
56.5k
            0x20);
3168
3169
56.5k
        __m256i res_16b = convolve_lowbd_x_4tap(data, coeffs_256 + 1, filt_256);
3170
56.5k
        res_16b = sr_x_round_avx2(res_16b);
3171
3172
56.5k
        __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
3173
3174
56.5k
        const __m128i res_0 = _mm256_castsi256_si128(res_8b);
3175
56.5k
        const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);
3176
3177
56.5k
        _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0);
3178
56.5k
        _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1);
3179
56.5k
      }
3180
15.0k
    } else {
3181
10.2k
      assert(!(w % 16));
3182
      // TODO(chiyotsai@google.com): Reuse the old SIMD code here. Need to
3183
      // rewrite this for better performance later.
3184
10.2k
      __m256i filt_256[2];
3185
10.2k
      prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs_256);
3186
10.2k
      filt_256[0] = _mm256_loadu_si256((__m256i const *)filt1_global_avx2);
3187
10.2k
      filt_256[1] = _mm256_loadu_si256((__m256i const *)filt2_global_avx2);
3188
3189
215k
      for (int i = 0; i < h; ++i) {
3190
764k
        for (int j = 0; j < w; j += 16) {
3191
          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17
3192
          // 18 19 20 21 22 23
3193
559k
          const __m256i data = _mm256_inserti128_si256(
3194
559k
              _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]),
3195
559k
              _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]),
3196
559k
              1);
3197
3198
559k
          __m256i res_16b =
3199
559k
              convolve_lowbd_x_4tap(data, coeffs_256 + 1, filt_256);
3200
559k
          res_16b = sr_x_round_avx2(res_16b);
3201
3202
          /* rounding code */
3203
          // 8 bit conversion and saturation to uint8
3204
559k
          __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
3205
3206
          // Store values into the destination buffer
3207
          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
3208
559k
          res_8b = _mm256_permute4x64_epi64(res_8b, 216);
3209
559k
          __m128i res = _mm256_castsi256_si128(res_8b);
3210
559k
          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
3211
559k
        }
3212
204k
      }
3213
10.2k
    }
3214
443k
  } else {
3215
443k
    __m256i filt_256[4];
3216
3217
443k
    filt_256[0] = _mm256_loadu_si256((__m256i const *)filt1_global_avx2);
3218
443k
    filt_256[1] = _mm256_loadu_si256((__m256i const *)filt2_global_avx2);
3219
443k
    filt_256[2] = _mm256_loadu_si256((__m256i const *)filt3_global_avx2);
3220
3221
443k
    if (horz_tap == 6) {
3222
      // horz_filt as 6 tap
3223
427k
      const uint8_t *src_ptr = src - 2;
3224
3225
427k
      prepare_half_coeffs_6tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);
3226
3227
427k
      if (w == 8) {
3228
961k
        do {
3229
961k
          const __m256i res = x_convolve_6tap_8x2_avx2(src_ptr, src_stride,
3230
961k
                                                       coeffs_256, filt_256);
3231
961k
          sr_x_round_store_8x2_avx2(res, dst, dst_stride);
3232
961k
          src_ptr += 2 * src_stride;
3233
961k
          dst += 2 * dst_stride;
3234
961k
          y -= 2;
3235
961k
        } while (y);
3236
237k
      } else if (w == 16) {
3237
788k
        do {
3238
788k
          __m256i r[2];
3239
3240
788k
          x_convolve_6tap_16x2_avx2(src_ptr, src_stride, coeffs_256, filt_256,
3241
788k
                                    r);
3242
788k
          sr_x_round_store_16x2_avx2(r, dst, dst_stride);
3243
788k
          src_ptr += 2 * src_stride;
3244
788k
          dst += 2 * dst_stride;
3245
788k
          y -= 2;
3246
788k
        } while (y);
3247
146k
      } else if (w == 32) {
3248
696k
        do {
3249
696k
          sr_x_6tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
3250
696k
          src_ptr += src_stride;
3251
696k
          dst += dst_stride;
3252
696k
        } while (--y);
3253
36.5k
      } else if (w == 64) {
3254
279k
        do {
3255
279k
          sr_x_6tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
3256
279k
          sr_x_6tap_32_avx2(src_ptr + 32, coeffs_256, filt_256, dst + 32);
3257
279k
          src_ptr += src_stride;
3258
279k
          dst += dst_stride;
3259
279k
        } while (--y);
3260
5.58k
      } else {
3261
1.08k
        assert(w == 128);
3262
3263
130k
        do {
3264
130k
          sr_x_6tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
3265
130k
          sr_x_6tap_32_avx2(src_ptr + 1 * 32, coeffs_256, filt_256,
3266
130k
                            dst + 1 * 32);
3267
130k
          sr_x_6tap_32_avx2(src_ptr + 2 * 32, coeffs_256, filt_256,
3268
130k
                            dst + 2 * 32);
3269
130k
          sr_x_6tap_32_avx2(src_ptr + 3 * 32, coeffs_256, filt_256,
3270
130k
                            dst + 3 * 32);
3271
130k
          src_ptr += src_stride;
3272
130k
          dst += dst_stride;
3273
130k
        } while (--y);
3274
1.10k
      }
3275
427k
    } else if (horz_tap == 8) {
3276
      // horz_filt as 8 tap
3277
15.9k
      const uint8_t *src_ptr = src - 3;
3278
3279
15.9k
      filt_256[3] = _mm256_loadu_si256((__m256i const *)filt4_global_avx2);
3280
3281
15.9k
      prepare_half_coeffs_8tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);
3282
3283
15.9k
      if (w == 8) {
3284
36.2k
        do {
3285
36.2k
          const __m256i res = x_convolve_8tap_8x2_avx2(src_ptr, src_stride,
3286
36.2k
                                                       coeffs_256, filt_256);
3287
36.2k
          sr_x_round_store_8x2_avx2(res, dst, dst_stride);
3288
36.2k
          src_ptr += 2 * src_stride;
3289
36.2k
          dst += 2 * dst_stride;
3290
36.2k
          y -= 2;
3291
36.2k
        } while (y);
3292
8.42k
      } else if (w == 16) {
3293
29.3k
        do {
3294
29.3k
          __m256i r[2];
3295
3296
29.3k
          x_convolve_8tap_16x2_avx2(src_ptr, src_stride, coeffs_256, filt_256,
3297
29.3k
                                    r);
3298
29.3k
          sr_x_round_store_16x2_avx2(r, dst, dst_stride);
3299
29.3k
          src_ptr += 2 * src_stride;
3300
29.3k
          dst += 2 * dst_stride;
3301
29.3k
          y -= 2;
3302
29.3k
        } while (y);
3303
4.95k
      } else if (w == 32) {
3304
39.6k
        do {
3305
39.6k
          sr_x_8tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
3306
39.6k
          src_ptr += src_stride;
3307
39.6k
          dst += dst_stride;
3308
39.6k
        } while (--y);
3309
1.77k
      } else if (w == 64) {
3310
31.9k
        do {
3311
31.9k
          sr_x_8tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
3312
31.9k
          sr_x_8tap_32_avx2(src_ptr + 32, coeffs_256, filt_256, dst + 32);
3313
31.9k
          src_ptr += src_stride;
3314
31.9k
          dst += dst_stride;
3315
31.9k
        } while (--y);
3316
656
      } else {
3317
180
        assert(w == 128);
3318
3319
19.0k
        do {
3320
19.0k
          sr_x_8tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
3321
19.0k
          sr_x_8tap_32_avx2(src_ptr + 1 * 32, coeffs_256, filt_256,
3322
19.0k
                            dst + 1 * 32);
3323
19.0k
          sr_x_8tap_32_avx2(src_ptr + 2 * 32, coeffs_256, filt_256,
3324
19.0k
                            dst + 2 * 32);
3325
19.0k
          sr_x_8tap_32_avx2(src_ptr + 3 * 32, coeffs_256, filt_256,
3326
19.0k
                            dst + 3 * 32);
3327
19.0k
          src_ptr += src_stride;
3328
19.0k
          dst += dst_stride;
3329
19.0k
        } while (--y);
3330
180
      }
3331
15.9k
    }
3332
443k
  }
3333
828k
}
3334
3335
#endif  // THIRD_PARTY_SVT_AV1_CONVOLVE_AVX2_H_